Skip to content
Snippets Groups Projects
Commit b654d97e authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Make sure there is always polymorph

parent e8bcd0a4
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
import os
import shutil
import random
import argparse
from collections import OrderedDict
......@@ -27,8 +26,9 @@ tmp_dir = args.tmp_directory
if not os.path.isfile(reference + ".fai"):
os.system("samtools faidx " + reference)
def allele(freq):
return 1 if random.uniform(0,1) < freq else 0
def allele(frequency):
return 1 if random.uniform(0, 1) < frequency else 0
prg_path = os.path.dirname(os.path.realpath(__file__))
......@@ -48,7 +48,7 @@ os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".
###################################
# 2. Build BED files of deletions #
# 2. Build BED files of deletions #
###################################
with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
......@@ -60,20 +60,20 @@ with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
###############################################################################
# 3. Build VCF files containing genotypes for the given number of individuals #
# 3. Build VCF files containing genotypes for the given number of individuals #
###############################################################################
# VCF file is a result file, not used by this script
# VCF file is a result file, not used by this script
print("GENERATE SUMMARY VCF FILE...\n")
genotypes_for_inds = OrderedDict() # { chr: id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...]}}
genotypes_for_inds = OrderedDict()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
# Build VCF header:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template:
for line in template:
if line[:6] == "#CHROM":
#line += "\t" + "\t".join(list((("INDIV_" + str(i)) for i in range(0, int(args.nb_inds)))))
line = line.replace("\n", "")
for i in range(0, int(args.nb_inds)):
line += "\tINDIV_" + str(i+1)
......@@ -90,11 +90,22 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:
genotypes_for_inds[parts[0]] = {}
genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
genotypes = []
for i in range(1, int(args.nb_inds)+1):
genotype = str(allele(freq)) + "/" + str(allele(freq))
genotypes_for_inds[parts[0]][parts[3]]["genotypes"].append(genotype.split("/"))
genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
genotypes.append(genotype_data)
polymorph = False
while not polymorph:
unique_genotypes = set()
all_genotypes = []
genotypes_tmp = []
for i in range(1, int(args.nb_inds)+1):
genotype = str(allele(freq)) + "/" + str(allele(freq))
genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
genotypes_tmp.append(genotype_data)
unique_genotypes.add(genotype)
all_genotypes.append(genotype)
polymorph = len(unique_genotypes) > 1
if polymorph:
genotypes += genotypes_tmp
genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
info = {"END": int(parts[2]), "AF": freq}
vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
vcf_writer.write_record(vcf_record)
......@@ -107,6 +118,7 @@ with open(os.path.join("tmp", "reference-sv.bed"), "r") as bed:
print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
def svsort(sv, chr):
"""
Function to sort regions
......@@ -119,7 +131,7 @@ fasta_orig = SeqIO.index(reference, "fasta")
for chr, svs_infos in genotypes_for_inds.items():
print("PROCESSING CHROMOSOME {0}...\n".format(chr))
svs = list(svs_infos.keys())
svs = sorted(svs, key=lambda x:svsort(x, chr))
......@@ -149,10 +161,10 @@ for chr, svs_infos in genotypes_for_inds.items():
id_chrs = list(chrs_dips.keys())
id_chrs = sorted(id_chrs)
for id_chr in id_chrs:
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
logs_regions = [] # Store logs
# Build FASTA and store logs:
# Build FASTA and store logs:
with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle:
fasta = ""
last_one = 0
......@@ -173,7 +185,7 @@ for chr, svs_infos in genotypes_for_inds.items():
print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
# Generate reads for all individuals:
# Generate reads for all individuals:
cmd = "{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz -I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}"
for i in range(1, int(args.nb_inds)+1):
prefix = os.path.join(output_dir, "INDIV_" + str(i))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment