diff --git a/build_pop.py b/build_pop.py index c6c62e2c9c0064337e24bc05074560b5e5e23531..7f5d3e20b47a84ad9d3ff4a99fc412b9da401433 100755 --- a/build_pop.py +++ b/build_pop.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import os -import shutil import random import argparse from collections import OrderedDict @@ -27,8 +26,9 @@ tmp_dir = args.tmp_directory if not os.path.isfile(reference + ".fai"): os.system("samtools faidx " + reference) -def allele(freq): - return 1 if random.uniform(0,1) < freq else 0 + +def allele(frequency): + return 1 if random.uniform(0, 1) < frequency else 0 prg_path = os.path.dirname(os.path.realpath(__file__)) @@ -48,7 +48,7 @@ os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv". ################################### -#Â 2. Build BED files of deletions # +# 2. Build BED files of deletions # ################################### with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed: @@ -60,20 +60,20 @@ with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed: ############################################################################### -#Â 3. Build VCF files containing genotypes for the given number of individuals # +# 3. Build VCF files containing genotypes for the given number of individuals # ############################################################################### -#Â VCF file is a result file, not used by this script +# VCF file is a result file, not used by this script print("GENERATE SUMMARY VCF FILE...\n") -genotypes_for_inds = OrderedDict() # {Â chr: id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...]}} +genotypes_for_inds = OrderedDict() +# {Â chr: {Â id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...} # Build VCF header: with open(os.path.join(prg_path, "template.vcf"), "r") as template: with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template: for line in template: if line[:6] == "#CHROM": - #line += "\t" + "\t".join(list((("INDIV_" + str(i)) for i in range(0, int(args.nb_inds))))) line = line.replace("\n", "") for i in range(0, int(args.nb_inds)): line += "\tINDIV_" + str(i+1) @@ -90,11 +90,22 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed: genotypes_for_inds[parts[0]] = {} genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []} genotypes = [] - for i in range(1, int(args.nb_inds)+1): - genotype = str(allele(freq)) + "/" + str(allele(freq)) - genotypes_for_inds[parts[0]][parts[3]]["genotypes"].append(genotype.split("/")) - genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype)) - genotypes.append(genotype_data) + polymorph = False + while not polymorph: + unique_genotypes = set() + all_genotypes = [] + genotypes_tmp = [] + for i in range(1, int(args.nb_inds)+1): + genotype = str(allele(freq)) + "/" + str(allele(freq)) + genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype)) + genotypes_tmp.append(genotype_data) + unique_genotypes.add(genotype) + all_genotypes.append(genotype) + polymorph = len(unique_genotypes) > 1 + if polymorph: + genotypes += genotypes_tmp + genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes] + info = {"END": int(parts[2]), "AF": freq} vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) vcf_writer.write_record(vcf_record) @@ -107,6 +118,7 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed: print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") + def svsort(sv, chr): """ Function to sort regions @@ -119,7 +131,7 @@ fasta_orig = SeqIO.index(reference, "fasta") for chr, svs_infos in genotypes_for_inds.items(): print("PROCESSING CHROMOSOME {0}...\n".format(chr)) - + svs = list(svs_infos.keys()) svs = sorted(svs, key=lambda x:svsort(x, chr)) @@ -149,10 +161,10 @@ for chr, svs_infos in genotypes_for_inds.items(): id_chrs = list(chrs_dips.keys()) id_chrs = sorted(id_chrs) for id_chr in id_chrs: - chr_dip = chrs_dips[id_chr] #Â SVs for each diploid chromosome + chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome logs_regions = [] # Store logs - #Â Build FASTA and store logs: + # Build FASTA and store logs: with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle: fasta = "" last_one = 0 @@ -173,7 +185,7 @@ for chr, svs_infos in genotypes_for_inds.items(): print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n") -#Â Generate reads for all individuals: +# Generate reads for all individuals: cmd = "{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz -I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}" for i in range(1, int(args.nb_inds)+1): prefix = os.path.join(output_dir, "INDIV_" + str(i))