Make sure there is always polymorph

b654d97e · Floreal Cabanettes · e8bcd0a4 · b654d97e
Commit b654d97e authored 8 years ago by Floreal Cabanettes
--- a/build_pop.py
+++ b/build_pop.py
 #!/usr/bin/env python3

 import os
-import shutil
 import random
 import argparse
 from collections import OrderedDict
@@ -27,8 +26,9 @@ tmp_dir = args.tmp_directory
 if not os.path.isfile(reference + ".fai"):
    os.system("samtools faidx " + reference)

-def allele(freq):
-    return 1 if random.uniform(0,1) < freq else 0
+
+def allele(frequency):
+    return 1 if random.uniform(0, 1) < frequency else 0

 prg_path = os.path.dirname(os.path.realpath(__file__))

@@ -48,7 +48,7 @@ os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".


 ###################################
-# 2. Build BED files of deletions #
+# 2. Build BED files of deletions #
 ###################################

 with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
@@ -60,20 +60,20 @@ with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:


 ###############################################################################
-# 3. Build VCF files containing genotypes for the given number of individuals #
+# 3. Build VCF files containing genotypes for the given number of individuals #
 ###############################################################################
-# VCF file is a result file, not used by this script
+# VCF file is a result file, not used by this script

 print("GENERATE SUMMARY VCF FILE...\n")

-genotypes_for_inds = OrderedDict() # { chr: id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...]}}
+genotypes_for_inds = OrderedDict()
+# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}

 # Build VCF header:
 with open(os.path.join(prg_path, "template.vcf"), "r") as template:
    with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template:
        for line in template:
            if line[:6] == "#CHROM":
-                #line += "\t" + "\t".join(list((("INDIV_" + str(i)) for i in range(0, int(args.nb_inds)))))
                line = line.replace("\n", "")
                for i in range(0, int(args.nb_inds)):
                    line += "\tINDIV_" + str(i+1)
@@ -90,11 +90,22 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:
            genotypes_for_inds[parts[0]] = {}
        genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
        genotypes = []
-        for i in range(1, int(args.nb_inds)+1):
-            genotype = str(allele(freq)) + "/" + str(allele(freq))
-            genotypes_for_inds[parts[0]][parts[3]]["genotypes"].append(genotype.split("/"))
-            genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
-            genotypes.append(genotype_data)
+        polymorph = False
+        while not polymorph:
+            unique_genotypes = set()
+            all_genotypes = []
+            genotypes_tmp = []
+            for i in range(1, int(args.nb_inds)+1):
+                genotype = str(allele(freq)) + "/" + str(allele(freq))
+                genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
+                genotypes_tmp.append(genotype_data)
+                unique_genotypes.add(genotype)
+                all_genotypes.append(genotype)
+            polymorph = len(unique_genotypes) > 1
+            if polymorph:
+                genotypes += genotypes_tmp
+                genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
+
        info = {"END": int(parts[2]), "AF": freq}
        vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
        vcf_writer.write_record(vcf_record)
@@ -107,6 +118,7 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:

 print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")

+
 def svsort(sv, chr):
    """
    Function to sort regions
@@ -119,7 +131,7 @@ fasta_orig = SeqIO.index(reference, "fasta")
 for chr, svs_infos in genotypes_for_inds.items():

    print("PROCESSING CHROMOSOME {0}...\n".format(chr))
-    
+
    svs = list(svs_infos.keys())
    svs = sorted(svs, key=lambda x:svsort(x, chr))

@@ -149,10 +161,10 @@ for chr, svs_infos in genotypes_for_inds.items():
        id_chrs = list(chrs_dips.keys())
        id_chrs = sorted(id_chrs)
        for id_chr in id_chrs:
-            chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
+            chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
            logs_regions = [] # Store logs

-            # Build FASTA and store logs:
+            # Build FASTA and store logs:
            with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle:
                fasta = ""
                last_one = 0
@@ -173,7 +185,7 @@ for chr, svs_infos in genotypes_for_inds.items():

 print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")

-# Generate reads for all individuals:
+# Generate reads for all individuals:
 cmd = "{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz -I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}"
 for i in range(1, int(args.nb_inds)+1):
    prefix = os.path.join(output_dir, "INDIV_" + str(i))