From e4a96663d53674f3612829a6cb421a942a516177 Mon Sep 17 00:00:00 2001
From: Floreal Cabanettes <floreal.cabanettes@inra.fr>
Date: Wed, 21 Jun 2017 11:36:34 +0200
Subject: [PATCH] Some improvements

---
 build_pop.py          | 44 +++++++++++++++++++++++++++++++++----------
 defaults.rules        |  5 +++++
 variants_simulator.py | 42 ++++++-----------------------------------
 3 files changed, 45 insertions(+), 46 deletions(-)
 create mode 100644 defaults.rules

diff --git a/build_pop.py b/build_pop.py
index cbf5930..f763d70 100755
--- a/build_pop.py
+++ b/build_pop.py
@@ -2,6 +2,7 @@
 
 import sys
 import os
+import shutil
 import random
 import argparse
 from collections import OrderedDict
@@ -11,6 +12,7 @@ import tempfile
 from lib.svrunner_utils import eprint
 from pysam import tabix_compress, tabix_index
 from variants_simulator import VariantsSimulator
+from exceptions import InputException
 
 
 def parse_args():
@@ -25,10 +27,12 @@ def parse_args():
     parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False)
     parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int)
     parser.add_argument("-o", "--output-directory", help="Output directory", default="res")
-    parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
-                        default=False)
-    parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
-                        default=False)
+    parser.add_argument("-e", "--erase-output", help="Delete output, if directory already exists", action='store_const',
+                        const=True, default=False)
+    parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const',
+                        const=True, default=False)
+    parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
+                        const=True, default=False)
     parser.add_argument("-p", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
     parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
     parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
@@ -285,11 +289,24 @@ def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, ins
                                  insert_len_mean, insert_len_sd, threads))
 
 
-def confirm(deletions: dict):
+def confirm(deletions: dict, variants: dict):
     nb_dels = 0
+    variants = sorted(variants["DEL"], key=lambda x: x["min"])
+    variants_counts = OrderedDict()
+    for variant in variants:
+        variants_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
     for deletes in deletions.values():
         nb_dels += len(deletes)
+        for delete in deletes:
+            v_len = delete["length"]
+            for variant in variants:
+                if variant["min"] < v_len < variant["max"]:
+                    variants_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
+                    break
     print("We generate {0} deletion variants.".format(nb_dels))
+    for v_range, v_count in variants_counts.items():
+        print("\t- Range {0}: {1}".format(v_range, v_count))
+    print("")
     return input("Continue [Y/n]? ") in ["y", "Y", ""]
 
 
@@ -303,8 +320,11 @@ def main():
     nb_inds = args.nb_inds
 
     if os.path.isdir(output_dir):
-        eprint("Error: output directory {0} already exists.".format(output_dir))
-        return 1
+        if args.erase_output:
+            shutil.rmtree(output_dir)
+        else:
+            eprint("Error: output directory {0} already exists.".format(output_dir))
+            return 1
     elif os.path.isfile(output_dir):
         eprint("Error: unable to create output directory {0}: file exists.".format(output_dir))
         return 1
@@ -329,9 +349,13 @@ def main():
     ################
 
     print("GENERATE RANDOM DELETIONS VARIANTS...\n")
-    sv_sim = VariantsSimulator(sv_list)
-    deletions = sv_sim.get_random_deletions(args.proba_del, args.reference)
-    if args.quiet or confirm(deletions):
+    try:
+        sv_sim = VariantsSimulator(sv_list)
+        deletions = sv_sim.get_random_deletions(args.proba_del, args.reference)
+    except InputException as e:
+        print(e)
+        return False
+    if args.quiet or confirm(deletions, sv_sim.variants):
         genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, args.force_polymorphism, nb_inds,
                                                       tmp_dir, prg_path)
         build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
diff --git a/defaults.rules b/defaults.rules
new file mode 100644
index 0000000..6c35423
--- /dev/null
+++ b/defaults.rules
@@ -0,0 +1,5 @@
+DEL 100   200    0.7
+DEL 200   500    0.2
+DEL 500   1000   0.07
+DEL 1000  2000   0.02
+DEL 2000  10000  0.01
\ No newline at end of file
diff --git a/variants_simulator.py b/variants_simulator.py
index 7d93446..0426ddc 100755
--- a/variants_simulator.py
+++ b/variants_simulator.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import os
 import sys
 import re
 import random
@@ -11,39 +12,9 @@ from Bio import SeqIO
 class VariantsSimulator:
 
     def __init__(self, sv_list=None):
-        if sv_list is not None:
-            self.variants = self.__load_variants(sv_list)
-        else:
-            self.variants = {
-                "DEL": [
-                    {
-                        "min": 100,
-                        "max": 200,
-                        "proba": 0.7,
-                        "proba_cumul": 0.7
-                    }, {
-                        "min": 200,
-                        "max": 500,
-                        "proba": 0.2,
-                        "proba_cumul": 0.9
-                    }, {
-                        "min": 500,
-                        "max": 1000,
-                        "proba": 0.07,
-                        "proba_cumul": 0.97
-                    }, {
-                        "min": 1000,
-                        "max": 2000,
-                        "proba": 0.02,
-                        "proba_cumul": 0.99
-                    }, {
-                        "min": 2000,
-                        "max": 10000,
-                        "proba": 0.01,
-                        "proba_cumul": 1.0
-                    }
-                ]
-            }
+        if sv_list is None:
+            sv_list = os.path.join(os.path.dirname(os.path.realpath(__file__)), "defaults.rules")
+        self.variants = self.__load_variants(sv_list)
         self.deletions = {}
 
     @staticmethod
@@ -52,7 +23,7 @@ class VariantsSimulator:
         allowed_sv = {"DEL", "INV", "DUP"}
         with open(sv_list, "r") as sv_list_f:
             for line in sv_list_f:
-                parts = re.split("\s", line.strip("\n"))
+                parts = re.split("\s+", line.strip("\n"))
                 type_sv = parts[0]
                 if type_sv not in allowed_sv:
                     raise InputException("Disallowed variant type: " + type_sv)
@@ -119,9 +90,8 @@ class VariantsSimulator:
                     freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5
                     if chrm not in deletions:
                         deletions[chrm] = []
-                    nb_total_str = str(nb_total)
                     deletions[chrm].append({
-                        "name": "DEL{0}_{1}_{2}".format("0" * (5-len(nb_total_str)) + nb_total_str, chrm, nb_on_ch),
+                        "name": "DEL{0}_{1}_{2}".format("%05d" % nb_total, chrm, nb_on_ch),
                         "start": i+1,
                         "end": i+1+del_size,
                         "length": del_size,
-- 
GitLab