diff --git a/README b/README index 82a0d7ae0ee29387b1c01fd7b4503eae105ffa4d..56393949df878989654ad8224ce536cc88115a9f 100644 --- a/README +++ b/README @@ -61,6 +61,8 @@ If increment is not specified, it defaults to 1. 1 line by SV type, like above. +IMPORTANT: only deletion is implemented yet. + We use SVsim to generate positions of SVs. https://github.com/GregoryFaust/SVsim diff --git a/build_pop.py b/build_pop.py index b3b2f1ad51174b703e40f6096ed4973a311105fe..164daf58dfb719403bd651b544bb4f39fac81dae 100755 --- a/build_pop.py +++ b/build_pop.py @@ -14,6 +14,8 @@ parser.add_argument("--sv-list", help="File containing the SVs", required=True) parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int) parser.add_argument("--output-directory", help="Output directory (default: res)", default="res") parser.add_argument("--tmp-directory", help="Temporary directory (default: tmp)", default="tmp") +parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True, + default=False) args = parser.parse_args() @@ -31,9 +33,33 @@ if not os.path.isfile(reference + ".fai"): os.system("samtools faidx " + reference) +############# +# FUNCTIONS # +############# + + def allele(frequency): return 1 if random.uniform(0, 1) < frequency else 0 + +def get_genotypes_for_inds(): + all_genotypes = [] + genotypes_row = [] + for i in range(1, nb_inds + 1): + genotype = str(allele(freq)) + "/" + str(allele(freq)) + genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype)) + genotypes_row.append(genotype_data) + all_genotypes.append(genotype) + return all_genotypes, genotypes_row + + +def svsort(sv, chr): + """ + Function to sort regions + """ + return int(genotypes_for_inds[chr][sv]["start"]) + + prg_path = os.path.dirname(os.path.realpath(__file__)) if not os.path.isdir(tmp_dir): @@ -93,22 +119,16 @@ with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed: if parts[0] not in genotypes_for_inds: genotypes_for_inds[parts[0]] = {} genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []} - genotypes = [] - polymorph = False - while not polymorph: - unique_genotypes = set() - all_genotypes = [] - genotypes_tmp = [] - for i in range(1, nb_inds+1): - genotype = str(allele(freq)) + "/" + str(allele(freq)) - genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype)) - genotypes_tmp.append(genotype_data) - unique_genotypes.add(genotype) - all_genotypes.append(genotype) - polymorph = len(unique_genotypes) > 1 - if polymorph: - genotypes += genotypes_tmp - genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes] + + # Get genotypes: + if args.force_polymorphism: + polymorph = False + while not polymorph: + all_genotypes, genotypes = get_genotypes_for_inds() + polymorph = len(set(all_genotypes)) > 1 + else: + all_genotypes, genotypes = get_genotypes_for_inds() + genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes] info = {"END": int(parts[2]), "AF": freq} vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) @@ -123,13 +143,6 @@ with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed: print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") -def svsort(sv, chr): - """ - Function to sort regions - """ - return int(genotypes_for_inds[chr][sv]["start"]) - - fasta_orig = SeqIO.index(reference, "fasta") for chr, svs_infos in genotypes_for_inds.items():