Skip to content
Snippets Groups Projects
Commit fa876d5b authored by Floreal Cabanettes's avatar Floreal Cabanettes
Browse files

Refactoring of build_pop script

parent 8d6c7883
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
import sys
import os
import random
import argparse
......@@ -8,192 +9,220 @@ import vcf
from Bio import SeqIO
import tempfile
from lib.svrunner_utils import eprint
from pysam import tabix_compress, tabix_index
parser = argparse.ArgumentParser(description='Generate simulated populations with SV')
parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int)
parser.add_argument("--reference", help="Reference genome", required=True)
parser.add_argument("--sv-list", help="File containing the SVs", required=True)
parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int)
parser.add_argument("--output-directory", help="Output directory (default: res)", default="res")
parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
default=False)
parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
default=False)
parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
type=int, default=300)
parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
type=int, default=30)
args = parser.parse_args()
nb_inds = args.nb_inds
if nb_inds < 2:
raise Exception("nb-inds must be at least 2")
reference = args.reference
sv_list = args.sv_list
output_dir = args.output_directory
tmp_dir = tempfile.mkdtemp()
haploid = args.haploid
if not os.path.isfile(reference + ".fai"):
os.system("samtools faidx " + reference)
#############
# FUNCTIONS #
#############
def allele(frequency):
def parse_args():
"""
Parse script arguments
:return: argparse arguments object
"""
parser = argparse.ArgumentParser(description='Generate simulated populations with SV')
parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int)
parser.add_argument("--reference", help="Reference genome", required=True)
parser.add_argument("--sv-list", help="File containing the SVs", required=True)
parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int)
parser.add_argument("--output-directory", help="Output directory (default: res)", default="res")
parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
default=False)
parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
default=False)
parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
type=int, default=300)
parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
type=int, default=30)
args = parser.parse_args()
return args
def _allele(frequency):
"""
Get randomly an allele, given a frequency
:param frequency: frequency of the allele {float}
:return: allele randomly choosen {0 or 1}
"""
return 1 if random.uniform(0, 1) < frequency else 0
def get_genotypes_for_inds():
def _get_genotypes_for_inds(nb_inds, haploid, freq):
"""
Get genotypes for each individual
:param nb_inds: number of individuals {int}
:param haploid: is the genome hamploid {bool}
:param freq: frequency of the allele {float}
:return: list of genotypes, vcf data {list}
"""
all_genotypes = []
genotypes_row = []
for i in range(1, nb_inds + 1):
if not haploid:
genotype = str(allele(freq)) + "/" + str(allele(freq))
genotype = str(_allele(freq)) + "/" + str(_allele(freq))
else:
genotype = str(allele(freq))
genotype = str(_allele(freq))
genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
genotypes_row.append(genotype_data)
all_genotypes.append(genotype)
return all_genotypes, genotypes_row
def svsort(sv, chr):
def _svsort(sv, chrm, genotypes_for_inds):
"""
Function to sort regions
:param sv: the variant
:param chrm: chromosome {str}
:param genotypes_for_inds: dictionary of genotypes for each individual
"""
return int(genotypes_for_inds[chr][sv]["start"])
prg_path = os.path.dirname(os.path.realpath(__file__))
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
return int(genotypes_for_inds[chrm][sv]["start"])
###########################
# 1. Get random deletions #
###########################
print("GENERATE SV DELs...\n")
def get_random_deletions(sv_list, reference, tmp_dir, prg_path):
"""
Get random deletions
:param sv_list: file describing deletions (in SVsim format) {str}
:param reference: reference genome fasta file {str}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
"""
print("GENERATE SV DELs...\n")
os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep)))
os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep)))
###################################
# 2. Build BED files of deletions #
###################################
try:
with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
try:
with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe:
for line in bedpe:
freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5
parts = line.split("\t")
bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n")
except IOError:
eprint("ERROR: SVSim failed to generate variants.")
exit(1)
except IOError:
eprint("ERROR: Unable to generate BED file \"{0}\".".format(os.path.join(tmp_dir, "reference-sv.bed")))
exit(1)
def build_bed_deletions_file(bed_file, tmp_dir):
"""
Build BED file of deletions
:param bed_file: bed full path name {str}
:param tmp_dir: temporary directory {str}
"""
try:
with open(bed_file, "w") as bed:
try:
with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe:
for line in bedpe:
freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5
parts = line.split("\t")
bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n")
except IOError:
eprint("ERROR: SVSim failed to generate variants.")
exit(1)
except IOError:
eprint("ERROR: Unable to generate BED file \"{0}\".".format(os.path.join(tmp_dir, "reference-sv.bed")))
exit(1)
###############################################################################
# 3. Build VCF files containing genotypes for the given number of individuals #
###############################################################################
# VCF file is a result file, not used by this script
print("GENERATE SUMMARY VCF FILE...\n")
genotypes_for_inds = OrderedDict()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
# Build VCF header:
try:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
try:
with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template:
for line in template:
if line[:6] == "#CHROM":
line = line.replace("\n", "")
for i in range(0, nb_inds):
line += "\tINDIV_" + str(i+1)
line += "\n"
my_template.write(line)
except IOError:
eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
exit(1)
except IOError:
eprint("ERROR: template file not found in program directory.")
exit(1)
try:
with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed:
vcf_reader = vcf.Reader(filename=os.path.join(tmp_dir, 'template.vcf'))
output_vcf = os.path.join(output_dir, "genotypes.vcf")
vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
for line in bed:
parts = line.replace("\n", "").split("\t")
freq = float(parts[4])
if parts[0] not in genotypes_for_inds:
genotypes_for_inds[parts[0]] = {}
genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
# Get genotypes:
all_genotypes, genotypes = [], []
if args.force_polymorphism:
polymorph = False
while not polymorph:
all_genotypes, genotypes = get_genotypes_for_inds()
polymorph = len(set(all_genotypes)) > 1
else:
all_genotypes, genotypes = get_genotypes_for_inds()
genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
info = {"END": int(parts[2]), "AF": freq}
vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
vcf_writer.write_record(vcf_record)
vcf_writer.close()
# Bgzip + tabix:
os.system("bgzip -c " + output_vcf + " > " + output_vcf + ".gz")
os.unlink(output_vcf)
output_vcf += ".gz"
os.system("tabix -p vcf " + output_vcf)
except IOError:
eprint("ERROR: Unable to load bed file \"{0}\": file not found.".format(os.path.join(tmp_dir, "reference-sv.bed")))
exit(1)
def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
"""
Build header of the VCF file
:param vcf_file: vcf file full path name {str}
:param prg_path: program path {str}
:param tmp_dir: temporary directory {str}
:param nb_inds: number of individuals {int}
"""
try:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
try:
with open(vcf_file, "w") as my_template:
for line in template:
if line[:6] == "#CHROM":
line = line.replace("\n", "")
for i in range(0, nb_inds):
line += "\tINDIV_" + str(i+1)
line += "\n"
my_template.write(line)
except IOError:
eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
exit(1)
except IOError:
eprint("ERROR: template file not found in program directory.")
exit(1)
def build_genotypes_vcf_list(bed_file, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path):
"""
Build VCF file describing genotypes for each individual (and the associated python dictionary)
:param bed_file: bed file describing deletions {str}
:param output_vcf: output VCF file full path name {str}
:param haploid: is haploid {bool}
:param force_polymorphism: force polymorphism {bool}
:param output_dir: output directory {str}
:param nb_inds: number of individuals {int}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
:return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
"""
print("GENERATE SUMMARY VCF FILE...\n")
genotypes_for_inds = OrderedDict()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
vcf_file = os.path.join(tmp_dir, "template.vcf")
# Build VCF header:
_build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)
try:
with open(bed_file, "r") as bed:
vcf_reader = vcf.Reader(filename=vcf_file)
vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
for line in bed:
parts = line.replace("\n", "").split("\t")
freq = float(parts[4])
if parts[0] not in genotypes_for_inds:
genotypes_for_inds[parts[0]] = {}
genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
# Get genotypes:
all_genotypes, genotypes = [], []
if force_polymorphism:
polymorph = False
while not polymorph:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, freq)
polymorph = len(set(all_genotypes)) > 1
else:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, freq)
genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
info = {"END": int(parts[2]), "AF": freq}
vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
vcf_writer.write_record(vcf_record)
vcf_writer.close()
tabix_compress(output_vcf, output_vcf + ".gz", True)
tabix_index(output_vcf + ".gz", force=True, preset="vcf")
except IOError:
eprint("ERROR: Unable to load bed file \"{0}\": file not found.".format(bed_file))
exit(1)
return genotypes_for_inds
###############################################
# Build fasta chromosomes for each individual #
###############################################
print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
fasta_orig = SeqIO.index(reference, "fasta")
for chr, svs_infos in genotypes_for_inds.items():
print("PROCESSING CHROMOSOME {0}...\n".format(chr))
svs = list(svs_infos.keys())
svs = sorted(svs, key=lambda x:svsort(x, chr))
fasta_orig_chr = fasta_orig[chr]
last_nt = len(fasta_orig_chr)-1
# Compute keeped genomic regions for each diploid chromosome:
def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
"""
Get list of all regions keeped (not deleted)
:param svs: list of variants (deletions) {list}
:param svs_infos: infos of variants {dict}
:param haploid: is haploid {bool}
:return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
"""
regions = OrderedDict()
current_region_pointer = OrderedDict()
for svs_i in svs:
......@@ -203,7 +232,7 @@ for chr, svs_infos in genotypes_for_inds.items():
regions[i] = {}
current_region_pointer[i] = {}
for j in range(0, 2 if not haploid else 1): # For each chromosome of the diploid genome, or the chromosome
# of the haploid genome
# of the haploid genome
if j not in regions[i]:
regions[i][j] = []
current_region_pointer[i][j] = "0"
......@@ -211,18 +240,31 @@ for chr, svs_infos in genotypes_for_inds.items():
regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
current_region_pointer[i][j] = svs_infos[svs_i]["end"]
i += 1
return regions, current_region_pointer
# Build FASTA of each diploid/haploid chromosome:
def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
"""
Build fasta files
:param chrm:
:param regions:
:param current_region_pointer:
:param output_dir:
:param fasta_orig_chr:
:param last_nt:
:return:
"""
for indiv, chrs_dips in regions.items():
id_chrs = list(chrs_dips.keys())
id_chrs = sorted(id_chrs)
for id_chr in id_chrs:
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
logs_regions = [] # Store logs
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
logs_regions = [] # Store logs
# Build FASTA and store logs:
try:
with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle:
with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
"a") as output_handle:
fasta = ""
last_one = 0
for chr_region in chr_dip:
......@@ -235,39 +277,103 @@ for chr, svs_infos in genotypes_for_inds.items():
SeqIO.write(fasta, output_handle, "fasta")
except IOError:
eprint("ERROR: unable to write \"{0}\" file.".
format(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta")))
format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta")))
exit(1)
# Write logs
try:
with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"), "a") as log_handle:
log_handle.write(chr + "_" + str(id_chr) + "\t")
with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
log_handle.write(chrm + "_" + str(id_chr) + "\t")
log_handle.write("\n\t".join(logs_regions))
log_handle.write("\n")
except IOError:
eprint("ERROR: unable to write log file: \"{0}\"".
format(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log")))
print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
# Generate reads for all individuals:
if not haploid:
cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
"-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
"-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}")
else:
cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
"-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
"-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}")
for i in range(1, nb_inds+1):
prefix = os.path.join(output_dir, "INDIV_" + str(i))
chr0 = prefix + "_chr_0.fasta"
chr1 = prefix + "_chr_1.fasta"
format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log")))
def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir):
print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
fasta_orig = SeqIO.index(reference, "fasta")
for chrm, svs_infos in genotypes_for_inds.items():
print("PROCESSING CHROMOSOME {0}...\n".format(chrm))
svs = list(svs_infos.keys())
svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds))
fasta_orig_chr = fasta_orig[chrm]
last_nt = len(fasta_orig_chr)-1
# Compute keeped genomic regions for each diploid chromosome:
regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)
# Build FASTA of each diploid/haploid chromosome:
_build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt)
def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
prg_path):
print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
# Generate reads for all individuals:
if not haploid:
os.system(cmd.format(prefix, chr0, chr1, args.coverage, prg_path + os.path.sep, args.read_len,
args.insert_len_mean, args.insert_len_sd))
cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
"-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
"-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}")
else:
os.system(cmd.format(prefix, chr0, "", args.coverage, prg_path + os.path.sep, args.read_len,
args.insert_len_mean, args.insert_len_sd))
print("DONE!\n")
cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
"-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
"-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}")
for i in range(1, nb_inds+1):
prefix = os.path.join(output_dir, "INDIV_" + str(i))
chr0 = prefix + "_chr_0.fasta"
chr1 = prefix + "_chr_1.fasta"
if not haploid:
os.system(cmd.format(prefix, chr0, chr1, coverage, prg_path + os.path.sep, read_len,
insert_len_mean, insert_len_sd))
else:
os.system(cmd.format(prefix, chr0, "", coverage, prg_path + os.path.sep, read_len,
insert_len_mean, insert_len_sd))
def main():
args = parse_args()
reference = args.reference
sv_list = args.sv_list
output_dir = args.output_directory
tmp_dir = tempfile.mkdtemp()
haploid = args.haploid
nb_inds = args.nb_inds
if nb_inds < 2:
raise Exception("nb-inds must be at least 2")
if not os.path.isfile(reference + ".fai"):
os.system("samtools faidx " + reference)
prg_path = os.path.dirname(os.path.realpath(__file__))
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
################
# Launch steps #
################
get_random_deletions(sv_list, reference, tmp_dir, prg_path)
bed_file = os.path.join(tmp_dir, "reference-sv.bed")
build_bed_deletions_file(bed_file, tmp_dir)
output_vcf = os.path.join(output_dir, "genotypes.vcf")
genotypes_for_inds = build_genotypes_vcf_list(bed_file, output_vcf, haploid, args.force_polymorphism, nb_inds, tmp_dir, prg_path)
build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
generate_samples_fastq(haploid, nb_inds, output_dir, args.coverage, args.read_len, args.insert_len_mean,
args.insert_len_sd, prg_path)
print("DONE!\n")
if __name__ == '__main__':
try:
sys.exit(main())
except Exception as e:
print(e)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment