diff --git a/build_pop.py b/build_pop.py index 4d02caa680a34b2ec9b0b10205f983cfd7b2cb1b..9ea88d389684620777a4b4ddfd8165f226826248 100755 --- a/build_pop.py +++ b/build_pop.py @@ -10,7 +10,7 @@ from collections import OrderedDict import vcf from Bio import SeqIO import tempfile -from pysam import tabix_compress, tabix_index +from pysam import tabix_compress, tabix_index, VariantFile from variants_simulator import VariantsSimulator from exceptions import InputException, ExecException import multiprocessing @@ -55,7 +55,7 @@ def parse_args(): """ parser = argparse.ArgumentParser(description='Generate simulated populations with SV', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", required=True, type=int) + parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int) parser.add_argument("-r", "--reference", help="Reference genome", required=True) parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file") parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False) @@ -78,8 +78,13 @@ def parse_args(): parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=1, type=check_min_size) parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int) + parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals") args = parser.parse_args() + + if args.nb_inds is None and args.genotypes is None: + parser.error("--nb-inds parameter is required (except if --genotypes parameter is given)") + return args @@ -202,6 +207,28 @@ def build_genotypes_vcf_list(deletions: dict, output_vcf, haploid, force_polymor return genotypes_for_inds +def load_genotypes_from_file(genotypes_file): + variants = VariantFile(genotypes_file) + genotypes_for_inds = {} + nb_inds = 0 + for variant in variants: + chrm = variant.chrom + id_v = variant.id + start = variant.start + end = variant.stop + samples = variant.samples + gt = [list(map(str, samples[x]["GT"])) for x in samples] + if chrm not in genotypes_for_inds: + genotypes_for_inds[chrm] = {} + genotypes_for_inds[chrm][id_v] = { + "start": start, + "end": end, + "genotypes": gt + } + nb_inds = len(gt) + return nb_inds, genotypes_for_inds + + def _compute_keeped_genomic_regions(svs, svs_infos, haploid): """ Get list of all regions keeped (not deleted) @@ -351,8 +378,8 @@ def confirm(deletions: dict, variants: dict, printer: Print): def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, force_polymorphism, - coverage, read_len, insert_len_mean, insert_len_sd, threads, min_deletions=1, quiet=True, stdout=None, - stderr=None): + coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None, min_deletions=1, quiet=True, + stdout=None, stderr=None): printer = Print(stdout=stdout, stderr=stderr) @@ -366,7 +393,7 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p printer.err("Error: unable to create output directory {0}: file exists.".format(output_dir)) return 1 - if nb_inds < 2: + if nb_inds is not None and nb_inds < 2: printer.err("nb-inds must be at least 2") return 1 @@ -388,37 +415,48 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p # Launch steps # ################ - printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n") - try: - nb_deletions = 0 - nb_try = 0 - sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr) - printer.out("Try to generate at least %s deletions..." % min_deletions) - max_try = 10 - while nb_deletions < min_deletions and nb_try < max_try: - printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try)) - nb_deletions, deletions = sv_sim.get_random_deletions(proba_del, reference) - sv_sim.print_flush() - nb_try += 1 - if nb_deletions < min_deletions: - printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase " - "general probability to have a deletion." % min_deletions) + deletions = None + sv_sim = None + + if genotypes is None: + printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n") + try: + nb_deletions = 0 + nb_try = 0 + sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr) + printer.out("Try to generate at least %s deletions..." % min_deletions) + max_try = 10 + while nb_deletions < min_deletions and nb_try < max_try: + printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try)) + nb_deletions, deletions = sv_sim.get_random_deletions(proba_del, reference) + sv_sim.print_flush() + nb_try += 1 + if nb_deletions < min_deletions: + printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase " + "general probability to have a deletion." % min_deletions) + return 1 + printer.out("") + except InputException as e: + printer.err(e) return 1 - printer.out("") - except InputException as e: - printer.err(e) - return 1 - except IOError as e: - printer.err(e) - return 1 - except Exception: - printer.err(traceback.format_exc()) - return 1 - if quiet or confirm(deletions, sv_sim.variants, printer): + except IOError as e: + printer.err(e) + return 1 + except Exception: + printer.err(traceback.format_exc()) + return 1 + if quiet or genotypes is not None or confirm(deletions, sv_sim.variants, printer): try: - printer.out("GENERATE SUMMARY VCF FILE...\n") - genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, force_polymorphism, nb_inds, - tmp_dir, prg_path) + genotypes_for_inds = None + if genotypes is None: + printer.out("GENERATE SUMMARY VCF FILE...\n") + genotypes_for_inds = build_genotypes_vcf_list(deletions, output_vcf, haploid, force_polymorphism, nb_inds, + tmp_dir, prg_path) + else: + nb_inds, genotypes_for_inds = load_genotypes_from_file(genotypes) + if nb_inds < 2: + printer.err("nb-inds must be at least 2") + return 1 build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir, printer) printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n") generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, @@ -453,9 +491,11 @@ def main(): insert_len_sd = args.insert_len_sd quiet = args.quiet min_deletions = args.min_deletions + genotypes = args.genotypes return init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, - force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, min_deletions, quiet) + force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes, + min_deletions, quiet) if __name__ == '__main__': sys.exit(main())