diff --git a/build_pop.py b/build_pop.py index 9ea88d389684620777a4b4ddfd8165f226826248..520f5ded5ce216485f3d8fcd07187d205c6bc0d4 100755 --- a/build_pop.py +++ b/build_pop.py @@ -57,7 +57,7 @@ def parse_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int) parser.add_argument("-r", "--reference", help="Reference genome", required=True) - parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file") + parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file", required=False) parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False) parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int) parser.add_argument("-o", "--output-directory", help="Output directory", default="res") @@ -67,7 +67,8 @@ def parse_args(): const=True, default=False) parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True, default=False) - parser.add_argument("-p", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001) + parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001) + parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001) parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100) parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN", type=int, default=300) @@ -77,6 +78,8 @@ def parse_args(): const=True, default=False) parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=1, type=check_min_size) + parser.add_argument("-mi", "--min-inversions", help="Minimum of inversions to generate (>=1)", default=1, + type=check_min_size) parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int) parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals") @@ -85,6 +88,15 @@ def parse_args(): if args.nb_inds is None and args.genotypes is None: parser.error("--nb-inds parameter is required (except if --genotypes parameter is given)") + if args.proba_del > 1 or args.proba_del < 0: + parser.error("Probability of deletions must be between 0 and 1") + + if args.proba_inv > 1 or args.proba_inv < 0: + parser.error("Probability of inversions must be between 0 and 1") + + if args.proba_del + args.proba_inv > 1: + parser.error("Sum of inversions and deletions probabilities must be lesser than 1") + return args @@ -154,10 +166,12 @@ def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds): raise ExecException("ERROR: template file not found in program directory.") -def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path): +def build_genotypes_vcf_list(deletions: dict, inversions: dict, output_vcf, haploid, force_polymorphism, nb_inds, + tmp_dir, prg_path): """ Build VCF file describing genotypes for each individual (and the associated python dictionary) :param deletions: deletions description {dict} + :param inversions: inversions description {dict} :param output_vcf: output VCF file full path name {str} :param haploid: is haploid {bool} :param force_polymorphism: force polymorphism {bool} @@ -166,7 +180,7 @@ def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymor :param prg_path: program path {str} :return: dictionary of genotypes for each variant for each dictionary {OrderedDict} """ - genotypes_for_inds = OrderedDict() + genotypes_for_inds_DEL = OrderedDict() # {Â chr: {Â id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...} vcf_file = os.path.join(tmp_dir, "template.vcf") @@ -176,11 +190,13 @@ def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymor vcf_reader = vcf.Reader(filename=vcf_file) vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader) + + # Deletions: for chrm, deletes in deletions.items(): for delete in deletes: - if chrm not in genotypes_for_inds: - genotypes_for_inds[chrm] = {} - genotypes_for_inds[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []} + if chrm not in genotypes_for_inds_DEL: + genotypes_for_inds_DEL[chrm] = {} + genotypes_for_inds_DEL[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []} # Get genotypes: all_genotypes, genotypes = [], [] @@ -191,12 +207,37 @@ def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymor polymorph = len(set(all_genotypes)) > 1 else: all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"]) - genotypes_for_inds[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes] + genotypes_for_inds_DEL[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes] info = {"END": delete["end"], "AF": delete["freq"]} vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) vcf_writer.write_record(vcf_record) + + # Inversions: + genotypes_for_inds_INV = OrderedDict() + for chrm, the_inversions in inversions.items(): + for inversion in the_inversions: + if chrm not in genotypes_for_inds_INV: + genotypes_for_inds_INV[chrm] = {} + genotypes_for_inds_INV[chrm][inversion["name"]] = {"start": inversion["start"], "end": inversion["end"], + "genotypes": []} + # Get genotypes: + all_genotypes, genotypes = [], [] + if force_polymorphism: + polymorph = False + while not polymorph: + all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"]) + polymorph = len(set(all_genotypes)) > 1 + else: + all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"]) + genotypes_for_inds_INV[chrm][inversion["name"]]["genotypes"] = [x.split("/") for x in all_genotypes] + + info = {"END": inversion["end"], "AF": inversion["freq"]} + vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")], + ".", ".", info, "GT", [0], genotypes) + vcf_writer.write_record(vcf_record) + vcf_writer.close() tabix_compress(output_vcf, output_vcf + ".gz", True) @@ -204,29 +245,39 @@ def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymor os.remove(output_vcf) - return genotypes_for_inds + return genotypes_for_inds_DEL, genotypes_for_inds_INV def load_genotypes_from_file(genotypes_file): variants = VariantFile(genotypes_file) - genotypes_for_inds = {} + genotypes_for_inds_DEL = {} + genotypes_for_inds_INV = {} nb_inds = 0 for variant in variants: chrm = variant.chrom id_v = variant.id start = variant.start end = variant.stop + type_v = variant.alts[0][1:-1] samples = variant.samples gt = [list(map(str, samples[x]["GT"])) for x in samples] - if chrm not in genotypes_for_inds: - genotypes_for_inds[chrm] = {} - genotypes_for_inds[chrm][id_v] = { - "start": start, - "end": end, - "genotypes": gt - } - nb_inds = len(gt) - return nb_inds, genotypes_for_inds + genotypes_for_inds = None + if type_v == "DEL": + genotypes_for_inds = genotypes_for_inds_DEL + elif type_v == "INV": + genotypes_for_inds = genotypes_for_inds_INV + else: + print("ERROR: Variant type not supported: %s. Ignoring this line..." % type_v) + if genotypes_for_inds is not None: + if chrm not in genotypes_for_inds: + genotypes_for_inds[chrm] = {} + genotypes_for_inds[chrm][id_v] = { + "start": start, + "end": end, + "genotypes": gt + } + nb_inds = len(gt) + return nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV def _compute_keeped_genomic_regions(svs, svs_infos, haploid): @@ -268,9 +319,11 @@ def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_ :param last_nt: :return: """ + n = 0 for indiv, chrs_dips in regions.items(): id_chrs = list(chrs_dips.keys()) id_chrs = sorted(id_chrs) + c = 0 for id_chr in id_chrs: chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome logs_regions = [] # Store logs @@ -282,12 +335,12 @@ def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_ fasta = "" last_one = 0 for chr_region in chr_dip: - fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])] + fasta += fasta_orig_chr[n][c][int(chr_region[0]):int(chr_region[1])] logs_regions.append("\t".join(str(x) for x in chr_region)) last_one = int(chr_region[1]) if last_one < last_nt: logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) - fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt] + fasta += fasta_orig_chr[n][c][int(current_region_pointer[indiv][id_chr]):last_nt] SeqIO.write(fasta, output_handle, "fasta") except IOError: raise ExecException("ERROR: unable to write \"{0}\" file.". @@ -303,26 +356,55 @@ def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_ except IOError: raise ExecException("ERROR: unable to write log file: \"{0}\"". format(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"))) + c += 1 + n += 1 + +def _build_fastas_inversions(fasta_orig_chrm: dict, genotypes_for_inds: dict, nb_inds, haploid): + fastas = [] + for j in range(0, nb_inds): + fastas.append([fasta_orig_chrm] if haploid else [fasta_orig_chrm, fasta_orig_chrm]) + for inv_name, props in genotypes_for_inds.items(): + i = 0 + for genotype in props["genotypes"]: + for k in range(0, len(genotype)): + if genotype[k] == "1": + start = int(props["start"]) + end = int(props["end"]) + fastas[i][k] = fastas[i][k][:start] + fastas[i][k][start:end][::-1] + fastas[i][k][end:] + i += 1 + return fastas -def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir, printer): + +def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, nb_inds, + printer): printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") fasta_orig = SeqIO.index(reference, "fasta") - for chrm, svs_infos in genotypes_for_inds.items(): + # First, inversions (does not change coordinates): + printer.out("STARTING WITH INVERSIONS...") + fastas_by_chrm = {} + for chrm, svs_infos in genotypes_for_inds_DEL.items(): + printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm)) + svs = list(svs_infos.keys()) + fastas_by_chrm[chrm] = _build_fastas_inversions(fasta_orig[chrm], genotypes_for_inds_INV[chrm], nb_inds, haploid) + + printer.out("STARTING WITH DELETIONS...") + # Then, deletions: + for chrm, svs_infos in genotypes_for_inds_DEL.items(): printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm)) svs = list(svs_infos.keys()) - svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds)) + svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds_DEL)) - fasta_orig_chr = fasta_orig[chrm] - last_nt = len(fasta_orig_chr)-1 + fastas_orig_chr = fastas_by_chrm[chrm] + last_nt = len(fastas_orig_chr[0][0])-1 # Compute keeped genomic regions for each diploid chromosome: regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid) # Build FASTA of each diploid/haploid chromosome: - _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt) + _build_fastas(chrm, regions, current_region_pointer, output_dir, fastas_orig_chr, last_nt) def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd, @@ -349,37 +431,66 @@ def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, ins stderr=open(stderr, "a") if stderr is not None else None) -def confirm(deletions: dict, variants: dict, printer: Print): +def confirm(deletions: dict, inversions: dict, variants: dict, printer: Print): + # Deletions: nb_dels = 0 - variants = sorted(variants["DEL"], key=lambda x: x["min"]) - variants_counts = OrderedDict() - variants_counts_chr = {} - for variant in variants: - variants_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0 + variants_del = sorted(variants["DEL"], key=lambda x: x["min"]) + variants_del_counts = OrderedDict() + variants_del_counts_chr = {} + for variant in variants_del: + variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0 for chrm, deletes in deletions.items(): nb_dels_chr = len(deletes) nb_dels += nb_dels_chr - variants_counts_chr[chrm] = nb_dels_chr + variants_del_counts_chr[chrm] = nb_dels_chr for delete in deletes: v_len = delete["length"] - for variant in variants: - if variant["min"] < v_len < variant["max"]: - variants_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 + for variant in variants_del: + if variant["min"] < v_len <= variant["max"]: + variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 break printer.out("We generate {0} deletion variants.".format(nb_dels)) printer.out("Ranges:") - for v_range, v_count in variants_counts.items(): + for v_range, v_count in variants_del_counts.items(): printer.out("\t- Range {0}: {1}".format(v_range, v_count)) printer.out("Chromosomes:") - for chrm in sorted(list(variants_counts_chr.keys())): - printer.out("\t- {0}: {1}".format(chrm, variants_counts_chr[chrm])) + for chrm in sorted(list(variants_del_counts_chr.keys())): + printer.out("\t- {0}: {1}".format(chrm, variants_del_counts_chr[chrm])) printer.out("") + + # inversions: + nb_invs = 0 + variants_inv = sorted(variants["INV"], key=lambda x: x["min"]) + variants_inv_counts = OrderedDict() + variants_inv_counts_chr = {} + for variant in variants_inv: + variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0 + for chrm, the_inversions in inversions.items(): + nb_invs_chr = len(the_inversions) + nb_invs += nb_invs_chr + variants_inv_counts_chr[chrm] = nb_invs_chr + for inversion in the_inversions: + v_len = inversion["length"] + for variant in variants_inv: + if variant["min"] < v_len <= variant["max"]: + variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 + break + printer.out("We generate {0} inversion variants.".format(nb_invs)) + printer.out("Ranges:") + for v_range, v_count in variants_inv_counts.items(): + printer.out("\t- Range {0}: {1}".format(v_range, v_count)) + printer.out("Chromosomes:") + for chrm in sorted(list(variants_inv_counts_chr.keys())): + printer.out("\t- {0}: {1}".format(chrm, variants_inv_counts_chr[chrm])) + printer.out("") + + # Confirm: return input("Continue [Y/n]? ") in ["y", "Y", ""] -def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, force_polymorphism, - coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None, min_deletions=1, quiet=True, - stdout=None, stderr=None): +def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, haploid, + force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None, + min_deletions=1, min_inversions=1, quiet=True, stdout=None, stderr=None): printer = Print(stdout=stdout, stderr=stderr) @@ -416,25 +527,32 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p ################ deletions = None + inversions = None sv_sim = None if genotypes is None: printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n") try: nb_deletions = 0 + nb_inversions = 0 nb_try = 0 sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr) printer.out("Try to generate at least %s deletions..." % min_deletions) max_try = 10 while nb_deletions < min_deletions and nb_try < max_try: printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try)) - nb_deletions, deletions = sv_sim.get_random_deletions(proba_del, reference) + nb_deletions, deletions, nb_inversions, inversions = sv_sim.get_random_variants(proba_del, proba_inv, + reference) sv_sim.print_flush() nb_try += 1 if nb_deletions < min_deletions: printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase " "general probability to have a deletion." % min_deletions) return 1 + if nb_inversions < min_inversions: + printer.err("\nUnable to generate %s inversions. Try to reduce size of inversions or increase " + "general probability to have an inversion." % min_inversions) + return 1 printer.out("") except InputException as e: printer.err(e) @@ -445,19 +563,20 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p except Exception: printer.err(traceback.format_exc()) return 1 - if quiet or genotypes is not None or confirm(deletions, sv_sim.variants, printer): + if quiet or genotypes is not None or confirm(deletions, inversions, sv_sim.variants, printer): try: - genotypes_for_inds = None if genotypes is None: printer.out("GENERATE SUMMARY VCF FILE...\n") - genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, force_polymorphism, nb_inds, - tmp_dir, prg_path) + genotypes_for_inds_DEL, genotypes_for_inds_INV = \ + build_genotypes_vcf_list(deletions, inversions, output_vcf, haploid, force_polymorphism, nb_inds, + tmp_dir, prg_path) else: - nb_inds, genotypes_for_inds = load_genotypes_from_file(genotypes) + nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV = load_genotypes_from_file(genotypes) if nb_inds < 2: printer.err("nb-inds must be at least 2") return 1 - build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir, printer) + build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, + nb_inds, printer) printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n") generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd, prg_path, threads, stdout, stderr) @@ -483,6 +602,7 @@ def main(): nb_inds = args.nb_inds force_outputdir = args.force_outputdir proba_del = args.proba_del + proba_inv = args.proba_inv threads = args.threads force_polymorphism = args.force_polymorphism coverage = args.coverage @@ -491,11 +611,12 @@ def main(): insert_len_sd = args.insert_len_sd quiet = args.quiet min_deletions = args.min_deletions + min_inversions = args.min_inversions genotypes = args.genotypes - return init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, + return init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, haploid, force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes, - min_deletions, quiet) + min_deletions, min_inversions, quiet) if __name__ == '__main__': sys.exit(main()) diff --git a/defaults.rules b/defaults.rules index 6c35423d4f61026cabbc0882de320565a44f6558..b15ef3f7e33a0d68b17a668983ceb7e344b4040a 100644 --- a/defaults.rules +++ b/defaults.rules @@ -2,4 +2,9 @@ DEL 100 200 0.7 DEL 200 500 0.2 DEL 500 1000 0.07 DEL 1000 2000 0.02 -DEL 2000 10000 0.01 \ No newline at end of file +DEL 2000 10000 0.01 +INV 100 200 0.7 +INV 200 500 0.2 +INV 500 1000 0.07 +INV 1000 2000 0.02 +INV 2000 10000 0.01 diff --git a/variants_simulator.py b/variants_simulator.py index 70d794311cd57935b03913aac8ba20e6de761be3..0e5a6999ace74f8a156313ab9e7fd4d6173a07bd 100755 --- a/variants_simulator.py +++ b/variants_simulator.py @@ -33,6 +33,7 @@ class VariantsSimulator: if nstretches is not None: self.nstretches = self.__load_nstretches(nstretches) self.deletions = {} + self.invertions = {} self.threads = threads if threads > -1 else multiprocessing.cpu_count() self.stdout = stdout self.stderr = stderr @@ -80,7 +81,7 @@ class VariantsSimulator: def __load_variants(self, sv_list): svs = {} - allowed_sv = {"DEL", "INV", "DUP"} + allowed_sv = {"DEL", "INV"} with open(sv_list, "r") as sv_list_f: for line in sv_list_f: parts = re.split("\s+", line.strip("\n")) @@ -110,10 +111,10 @@ class VariantsSimulator: self.min_sizes[type_sv] = min(self.min_sizes[type_sv], min_s) # Only deletions are supported for now: - if "DEL" not in svs: - raise Exception("Error: only deletions are supported for now!") - elif len(svs) > 1: - self.print_err("Warn: only deletions are supported for now. Other variant types will be ignored.") + if "DEL" not in svs and "INV" not in svs: + raise Exception("Error: only deletions and inversions are supported for now!") + elif (("DEL" not in svs or "INV" not in svs) and len(svs) > 1) or (len(svs) > 2): + self.print_err("Warn: only deletions and inversions are supported for now. Other variant types will be ignored.") for type_sv in svs: svs[type_sv].sort(key=lambda x: -x["proba"]) @@ -143,6 +144,13 @@ class VariantsSimulator: stretches[chrm].sort(key=lambda x: x[0]) return stretches + def _get_inversion_size(self): + r = random.uniform(0, 1) + for sizes in self.variants["INV"]: + if r < sizes["proba_cumul"]: + break + return random.randint(sizes["min"], sizes["max"] + 1) + def _get_deletion_size(self): r = random.uniform(0, 1) for sizes in self.variants["DEL"]: @@ -168,7 +176,6 @@ class VariantsSimulator: intervals.sort(key=lambda x: x[0]-x[1]) return intervals[0] - def _get_far_from_nstretches(self, chrm, start, end, chr_len, pos=0): """ Get if a deletion is far from N stretches @@ -242,18 +249,21 @@ class VariantsSimulator: break return True, False, False, start, end, -1 - def _get_random_deletions_by_chr(self, chrm, chr_length, deletions, proba_deletion): + def _get_random_variants_by_chr(self, chrm, chr_length, deletions, inversions, proba_deletion, proba_inversion): self._print_out("Generating for chromosome %s..." % chrm) - nb_on_ch = 0 - nb_dels = 0 + nb_dels_on_ch = 0 + nb_invs_on_ch = 0 + nb_inv_on_ch = 0 + nb_inv = 0 i = 0 len_chr = chr_length[chrm] deletions[chrm] = [] + inversions[chrm] = [] + proba_inversion_cum = proba_deletion + proba_inversion while i < len_chr: r = random.uniform(0, 1) if r <= proba_deletion: - nb_dels += 1 - nb_on_ch += 1 + nb_dels_on_ch += 1 del_size = self._get_deletion_size() start = i + 1 end = i + 1 + del_size @@ -285,7 +295,7 @@ class VariantsSimulator: i = start freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5 deletions[chrm].append({ - "name": "DEL{0}_{1}".format(chrm, nb_on_ch), + "name": "DEL{0}_{1}".format(chrm, nb_dels_on_ch), "start": start, "end": end, "length": del_size, @@ -294,26 +304,72 @@ class VariantsSimulator: i += del_size else: self.print_err("N-stretch filter: removed", print_message_header) + elif r <= proba_inversion_cum: + nb_inv += 1 + nb_inv_on_ch += 1 + inv_size = self._get_inversion_size() + start = i + 1 + end = i + 1 + inv_size + print_message_header = "One inversion found ({0}:{1}-{2} ; size: {3})...".format(chrm, start, end, + inv_size) + pos = 0 + keep_inversion = True + has_resize = False + has_move = False + while keep_inversion and pos >= 0: + keep_inversion, with_resize, with_move, start, end, pos = self._get_far_from_nstretches(chrm, start, + end, len_chr, + pos) + if with_resize: + inv_size = end - start + has_resize = True + elif with_move: + has_move = True + if keep_inversion: + if has_resize: + self.print_warn("N-stretch filter: resized ({0}:{1}-{2}, new size:{3})". + format(chrm, start, end, inv_size), print_message_header) + elif has_move: + self.print_warn("N-stretch filter: moved ({0}:{1}-{2})".format(chrm, start, end), + print_message_header) + else: + self.print_ok("N-stretch filter: OK", print_message_header) + i = start + freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5 + inversions[chrm].append({ + "name": "INV{0}_{1}".format(chrm, nb_inv_on_ch), + "start": start, + "end": end, + "length": inv_size, + "freq": freq + }) + i += inv_size i += 1 - return len(deletions[chrm]), deletions, self.stdout_stash, self.stderr_stash + return len(deletions[chrm]), deletions, len(inversions[chrm]), inversions, self.stdout_stash, self.stderr_stash - def get_random_deletions(self, proba_deletion, reference_genome): + def get_random_variants(self, proba_deletion, proba_inversion, reference_genome): fasta_index = SeqIO.index(reference_genome, "fasta") deletions = {} + inversions = {} chr_length = {} for chrm in fasta_index: chr_length[str(chrm)] = len(fasta_index[chrm].seq) - results = Parallel(n_jobs=self.threads)(delayed(self._get_random_deletions_by_chr) - (str(chrm), chr_length, deletions, proba_deletion) + results = Parallel(n_jobs=self.threads)(delayed(self._get_random_variants_by_chr) + (str(chrm), chr_length, deletions, inversions, proba_deletion, + proba_inversion) for chrm in fasta_index) nb_dels = 0 + nb_invs = 0 for result in results: nb_dels += result[0] deletions.update(result[1]) - self.stdout_stash += result[2] - self.stderr_stash += result[3] + nb_invs += result[2] + inversions.update(result[3]) + self.stdout_stash += result[4] + self.stderr_stash += result[5] self.deletions = deletions - return nb_dels, deletions + self.invertions = inversions + return nb_dels, deletions, nb_invs, inversions def print_variants(self): self._print_out("SV\tCHR\tSTART\tEND\tLENGTH\tFREQ") @@ -332,7 +388,7 @@ def main(): args = parser.parse_args() try: simultator = VariantsSimulator(args.sv_list) - simultator.get_random_deletions(args.proba_del, args.reference) + simultator.get_random_variants(args.proba_del, args.reference) simultator.print_variants() except InputException as e: print(e)