From e4a96663d53674f3612829a6cb421a942a516177 Mon Sep 17 00:00:00 2001 From: Floreal Cabanettes <floreal.cabanettes@inra.fr> Date: Wed, 21 Jun 2017 11:36:34 +0200 Subject: [PATCH] Some improvements --- build_pop.py | 44 +++++++++++++++++++++++++++++++++---------- defaults.rules | 5 +++++ variants_simulator.py | 42 ++++++----------------------------------- 3 files changed, 45 insertions(+), 46 deletions(-) create mode 100644 defaults.rules diff --git a/build_pop.py b/build_pop.py index cbf5930..f763d70 100755 --- a/build_pop.py +++ b/build_pop.py @@ -2,6 +2,7 @@ import sys import os +import shutil import random import argparse from collections import OrderedDict @@ -11,6 +12,7 @@ import tempfile from lib.svrunner_utils import eprint from pysam import tabix_compress, tabix_index from variants_simulator import VariantsSimulator +from exceptions import InputException def parse_args(): @@ -25,10 +27,12 @@ def parse_args(): parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False) parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int) parser.add_argument("-o", "--output-directory", help="Output directory", default="res") - parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True, - default=False) - parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True, - default=False) + parser.add_argument("-e", "--erase-output", help="Delete output, if directory already exists", action='store_const', + const=True, default=False) + parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const', + const=True, default=False) + parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", + const=True, default=False) parser.add_argument("-p", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001) parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100) parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN", @@ -285,11 +289,24 @@ def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, ins insert_len_mean, insert_len_sd, threads)) -def confirm(deletions: dict): +def confirm(deletions: dict, variants: dict): nb_dels = 0 + variants = sorted(variants["DEL"], key=lambda x: x["min"]) + variants_counts = OrderedDict() + for variant in variants: + variants_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0 for deletes in deletions.values(): nb_dels += len(deletes) + for delete in deletes: + v_len = delete["length"] + for variant in variants: + if variant["min"] < v_len < variant["max"]: + variants_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 + break print("We generate {0} deletion variants.".format(nb_dels)) + for v_range, v_count in variants_counts.items(): + print("\t- Range {0}: {1}".format(v_range, v_count)) + print("") return input("Continue [Y/n]? ") in ["y", "Y", ""] @@ -303,8 +320,11 @@ def main(): nb_inds = args.nb_inds if os.path.isdir(output_dir): - eprint("Error: output directory {0} already exists.".format(output_dir)) - return 1 + if args.erase_output: + shutil.rmtree(output_dir) + else: + eprint("Error: output directory {0} already exists.".format(output_dir)) + return 1 elif os.path.isfile(output_dir): eprint("Error: unable to create output directory {0}: file exists.".format(output_dir)) return 1 @@ -329,9 +349,13 @@ def main(): ################ print("GENERATE RANDOM DELETIONS VARIANTS...\n") - sv_sim = VariantsSimulator(sv_list) - deletions = sv_sim.get_random_deletions(args.proba_del, args.reference) - if args.quiet or confirm(deletions): + try: + sv_sim = VariantsSimulator(sv_list) + deletions = sv_sim.get_random_deletions(args.proba_del, args.reference) + except InputException as e: + print(e) + return False + if args.quiet or confirm(deletions, sv_sim.variants): genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, args.force_polymorphism, nb_inds, tmp_dir, prg_path) build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir) diff --git a/defaults.rules b/defaults.rules new file mode 100644 index 0000000..6c35423 --- /dev/null +++ b/defaults.rules @@ -0,0 +1,5 @@ +DEL 100 200 0.7 +DEL 200 500 0.2 +DEL 500 1000 0.07 +DEL 1000 2000 0.02 +DEL 2000 10000 0.01 \ No newline at end of file diff --git a/variants_simulator.py b/variants_simulator.py index 7d93446..0426ddc 100755 --- a/variants_simulator.py +++ b/variants_simulator.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import os import sys import re import random @@ -11,39 +12,9 @@ from Bio import SeqIO class VariantsSimulator: def __init__(self, sv_list=None): - if sv_list is not None: - self.variants = self.__load_variants(sv_list) - else: - self.variants = { - "DEL": [ - { - "min": 100, - "max": 200, - "proba": 0.7, - "proba_cumul": 0.7 - }, { - "min": 200, - "max": 500, - "proba": 0.2, - "proba_cumul": 0.9 - }, { - "min": 500, - "max": 1000, - "proba": 0.07, - "proba_cumul": 0.97 - }, { - "min": 1000, - "max": 2000, - "proba": 0.02, - "proba_cumul": 0.99 - }, { - "min": 2000, - "max": 10000, - "proba": 0.01, - "proba_cumul": 1.0 - } - ] - } + if sv_list is None: + sv_list = os.path.join(os.path.dirname(os.path.realpath(__file__)), "defaults.rules") + self.variants = self.__load_variants(sv_list) self.deletions = {} @staticmethod @@ -52,7 +23,7 @@ class VariantsSimulator: allowed_sv = {"DEL", "INV", "DUP"} with open(sv_list, "r") as sv_list_f: for line in sv_list_f: - parts = re.split("\s", line.strip("\n")) + parts = re.split("\s+", line.strip("\n")) type_sv = parts[0] if type_sv not in allowed_sv: raise InputException("Disallowed variant type: " + type_sv) @@ -119,9 +90,8 @@ class VariantsSimulator: freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5 if chrm not in deletions: deletions[chrm] = [] - nb_total_str = str(nb_total) deletions[chrm].append({ - "name": "DEL{0}_{1}_{2}".format("0" * (5-len(nb_total_str)) + nb_total_str, chrm, nb_on_ch), + "name": "DEL{0}_{1}_{2}".format("%05d" % nb_total, chrm, nb_on_ch), "start": i+1, "end": i+1+del_size, "length": del_size, -- GitLab