Newer
Older
import os
import random
import argparse
from collections import OrderedDict
import vcf
from Bio import SeqIO
Floreal Cabanettes
committed
import tempfile
from lib.svrunner_utils import eprint
from pysam import tabix_compress, tabix_index
def parse_args():
"""
Parse script arguments
:return: argparse arguments object
"""
parser = argparse.ArgumentParser(description='Generate simulated populations with SV')
parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int)
parser.add_argument("--reference", help="Reference genome", required=True)
parser.add_argument("--sv-list", help="File containing the SVs", required=True)
parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int)
parser.add_argument("--output-directory", help="Output directory (default: res)", default="res")
parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
default=False)
parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
default=False)
parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
type=int, default=300)
parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
type=int, default=30)
args = parser.parse_args()
return args
def _allele(frequency):
"""
Get randomly an allele, given a frequency
:param frequency: frequency of the allele {float}
:return: allele randomly choosen {0 or 1}
"""
return 1 if random.uniform(0, 1) < frequency else 0
def _get_genotypes_for_inds(nb_inds, haploid, freq):
"""
Get genotypes for each individual
:param nb_inds: number of individuals {int}
:param haploid: is the genome hamploid {bool}
:param freq: frequency of the allele {float}
:return: list of genotypes, vcf data {list}
"""
all_genotypes = []
genotypes_row = []
for i in range(1, nb_inds + 1):
genotype = str(_allele(freq)) + "/" + str(_allele(freq))
genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
genotypes_row.append(genotype_data)
all_genotypes.append(genotype)
return all_genotypes, genotypes_row
def _svsort(sv, chrm, genotypes_for_inds):
:param sv: the variant
:param chrm: chromosome {str}
:param genotypes_for_inds: dictionary of genotypes for each individual
return int(genotypes_for_inds[chrm][sv]["start"])
###########################
# 1. Get random deletions #
###########################
def get_random_deletions(sv_list, reference, tmp_dir, prg_path):
"""
Get random deletions
:param sv_list: file describing deletions (in SVsim format) {str}
:param reference: reference genome fasta file {str}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
"""
print("GENERATE SV DELs...\n")
os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep)))
# 2. Build BED files of deletions #
def build_bed_deletions_file(bed_file, tmp_dir):
"""
Build BED file of deletions
:param bed_file: bed full path name {str}
:param tmp_dir: temporary directory {str}
"""
try:
with open(bed_file, "w") as bed:
try:
with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe:
for line in bedpe:
freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5
parts = line.split("\t")
bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n")
except IOError:
eprint("ERROR: SVSim failed to generate variants.")
exit(1)
except IOError:
eprint("ERROR: Unable to generate BED file \"{0}\".".format(os.path.join(tmp_dir, "reference-sv.bed")))
exit(1)
###############################################################################
# 3. Build VCF files containing genotypes for the given number of individuals #
###############################################################################
# VCF file is a result file, not used by this script
def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
"""
Build header of the VCF file
:param vcf_file: vcf file full path name {str}
:param prg_path: program path {str}
:param tmp_dir: temporary directory {str}
:param nb_inds: number of individuals {int}
"""
try:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
try:
with open(vcf_file, "w") as my_template:
for line in template:
if line[:6] == "#CHROM":
line = line.replace("\n", "")
for i in range(0, nb_inds):
line += "\tINDIV_" + str(i+1)
line += "\n"
my_template.write(line)
except IOError:
eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
exit(1)
except IOError:
eprint("ERROR: template file not found in program directory.")
exit(1)
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def build_genotypes_vcf_list(bed_file, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path):
"""
Build VCF file describing genotypes for each individual (and the associated python dictionary)
:param bed_file: bed file describing deletions {str}
:param output_vcf: output VCF file full path name {str}
:param haploid: is haploid {bool}
:param force_polymorphism: force polymorphism {bool}
:param output_dir: output directory {str}
:param nb_inds: number of individuals {int}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
:return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
"""
print("GENERATE SUMMARY VCF FILE...\n")
genotypes_for_inds = OrderedDict()
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
vcf_file = os.path.join(tmp_dir, "template.vcf")
# Build VCF header:
_build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)
try:
with open(bed_file, "r") as bed:
vcf_reader = vcf.Reader(filename=vcf_file)
vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
for line in bed:
parts = line.replace("\n", "").split("\t")
freq = float(parts[4])
if parts[0] not in genotypes_for_inds:
genotypes_for_inds[parts[0]] = {}
genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
# Get genotypes:
all_genotypes, genotypes = [], []
if force_polymorphism:
polymorph = False
while not polymorph:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, freq)
polymorph = len(set(all_genotypes)) > 1
else:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, freq)
genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
info = {"END": int(parts[2]), "AF": freq}
vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
vcf_writer.write_record(vcf_record)
vcf_writer.close()
tabix_compress(output_vcf, output_vcf + ".gz", True)
tabix_index(output_vcf + ".gz", force=True, preset="vcf")
except IOError:
eprint("ERROR: Unable to load bed file \"{0}\": file not found.".format(bed_file))
exit(1)
return genotypes_for_inds
###############################################
# Build fasta chromosomes for each individual #
###############################################
def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
"""
Get list of all regions keeped (not deleted)
:param svs: list of variants (deletions) {list}
:param svs_infos: infos of variants {dict}
:param haploid: is haploid {bool}
:return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
"""
regions = OrderedDict()
current_region_pointer = OrderedDict()
for svs_i in svs:
i = 0
for genotypes in svs_infos[svs_i]["genotypes"]:
if i not in regions:
regions[i] = {}
current_region_pointer[i] = {}
for j in range(0, 2 if not haploid else 1): # For each chromosome of the diploid genome, or the chromosome
if j not in regions[i]:
regions[i][j] = []
current_region_pointer[i][j] = "0"
if svs_infos[svs_i]["genotypes"][i][j] == "1":
regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
current_region_pointer[i][j] = svs_infos[svs_i]["end"]
i += 1
return regions, current_region_pointer
def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
"""
Build fasta files
:param chrm:
:param regions:
:param current_region_pointer:
:param output_dir:
:param fasta_orig_chr:
:param last_nt:
:return:
"""
for indiv, chrs_dips in regions.items():
id_chrs = list(chrs_dips.keys())
id_chrs = sorted(id_chrs)
for id_chr in id_chrs:
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
logs_regions = [] # Store logs
Floreal Cabanettes
committed
try:
with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
"a") as output_handle:
Floreal Cabanettes
committed
fasta = ""
last_one = 0
for chr_region in chr_dip:
fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])]
logs_regions.append("\t".join(str(x) for x in chr_region))
last_one = int(chr_region[1])
if last_one < last_nt:
logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt]
SeqIO.write(fasta, output_handle, "fasta")
except IOError:
eprint("ERROR: unable to write \"{0}\" file.".
format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta")))
Floreal Cabanettes
committed
exit(1)
Floreal Cabanettes
committed
try:
with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
log_handle.write(chrm + "_" + str(id_chr) + "\t")
Floreal Cabanettes
committed
log_handle.write("\n\t".join(logs_regions))
log_handle.write("\n")
except IOError:
eprint("ERROR: unable to write log file: \"{0}\"".
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log")))
def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir):
print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
fasta_orig = SeqIO.index(reference, "fasta")
for chrm, svs_infos in genotypes_for_inds.items():
print("PROCESSING CHROMOSOME {0}...\n".format(chrm))
svs = list(svs_infos.keys())
svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds))
fasta_orig_chr = fasta_orig[chrm]
last_nt = len(fasta_orig_chr)-1
# Compute keeped genomic regions for each diploid chromosome:
regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)
# Build FASTA of each diploid/haploid chromosome:
_build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt)
def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
prg_path):
print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
# Generate reads for all individuals:
cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
"-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
"-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}")
cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
"-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
"-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}")
for i in range(1, nb_inds+1):
prefix = os.path.join(output_dir, "INDIV_" + str(i))
chr0 = prefix + "_chr_0.fasta"
chr1 = prefix + "_chr_1.fasta"
if not haploid:
os.system(cmd.format(prefix, chr0, chr1, coverage, prg_path + os.path.sep, read_len,
insert_len_mean, insert_len_sd))
else:
os.system(cmd.format(prefix, chr0, "", coverage, prg_path + os.path.sep, read_len,
insert_len_mean, insert_len_sd))
def main():
args = parse_args()
reference = args.reference
sv_list = args.sv_list
output_dir = args.output_directory
tmp_dir = tempfile.mkdtemp()
haploid = args.haploid
nb_inds = args.nb_inds
if os.path.isdir(output_dir):
eprint("Error: output directory {0} already exists.".format(output_dir))
return 1
elif os.path.isfile(output_dir):
eprint("Error: unable to create output directory {0}: file exists.".format(output_dir))
return 1
if nb_inds < 2:
raise Exception("nb-inds must be at least 2")
if not os.path.isfile(reference + ".fai"):
os.system("samtools faidx " + reference)
prg_path = os.path.dirname(os.path.realpath(__file__))
############################
# Define fixed files names #
############################
bed_file = os.path.join(tmp_dir, "reference-sv.bed")
output_vcf = os.path.join(output_dir, "genotypes.vcf")
################
# Launch steps #
################
get_random_deletions(sv_list, reference, tmp_dir, prg_path)
build_bed_deletions_file(bed_file, tmp_dir)
genotypes_for_inds = build_genotypes_vcf_list(bed_file, output_vcf, haploid, args.force_polymorphism, nb_inds, tmp_dir, prg_path)
build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
generate_samples_fastq(haploid, nb_inds, output_dir, args.coverage, args.read_len, args.insert_len_mean,
args.insert_len_sd, prg_path)
print("DONE!\n")
if __name__ == '__main__':