Newer
Older
from collections import OrderedDict
import vcf
from Bio import SeqIO
Floreal Cabanettes
committed
import tempfile
from pysam import tabix_compress, tabix_index, VariantFile
from lib.variants_simulator import VariantsSimulator
from lib.exceptions import InputException, ExecException
import multiprocessing
class Print:
def __init__(self, stdout=None, stderr=None):
self.stdout = stdout
self.stderr = stderr
if self.stdout is not None and os.path.isfile(self.stdout):
os.remove(self.stdout)
if self.stderr is not None and os.path.isfile(self.stderr):
os.remove(self.stderr)
def out(self, message):
if self.stdout is not None:
with open(self.stdout, "a") as stdout_f:
print(message, file=stdout_f)
else:
print(message, file=sys.stdout)
def err(self, message):
if self.stderr is not None:
with open(self.stderr, "a") as stderr_f:
print(message, file=stderr_f)
else:
print(message, file=sys.stderr)
def check_min_size(value):
ivalue = int(value)
if ivalue <= 1:
raise argparse.ArgumentTypeError("%s is an invalid size value (>= 1)" % value)
return ivalue
def parse_args():
"""
Parse script arguments
:return: argparse arguments object
"""
Floreal Cabanettes
committed
parser = argparse.ArgumentParser(description='Generate simulated populations with SV',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int)
parser.add_argument("-r", "--reference", help="Reference genome", required=True)
parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file", required=False)
parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False)
parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int)
parser.add_argument("-o", "--output-dir", help="Output directory", default="res")
parser.add_argument("-e", "--force-outputdir", help="Delete output directory before start, if already exists",
action='store_const', const=True, default=False)
parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const',
const=True, default=False)
parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const",
const=True, default=False)
parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001)
parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001)
parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
type=int, default=300)
parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
type=int, default=30)
Floreal Cabanettes
committed
parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const",
const=True, default=False)
parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=0,
type=check_min_size)
parser.add_argument("-mi", "--min-inversions", help="Minimum of inversions to generate (>=1)", default=0,
parser.add_argument("--max-try", help="Maximum of tries", default=10, type=check_min_size)
parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int)
parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals")
if args.nb_inds is None and args.genotypes is None:
parser.error("--nb-inds parameter is required (except if --genotypes parameter is given)")
if args.proba_del > 1 or args.proba_del < 0:
parser.error("Probability of deletions must be between 0 and 1")
if args.proba_inv > 1 or args.proba_inv < 0:
parser.error("Probability of inversions must be between 0 and 1")
if args.proba_del + args.proba_inv > 1:
parser.error("Sum of inversions and deletions probabilities must be lesser than 1")
return args
def _allele(frequency):
"""
Get randomly an allele, given a frequency
:param frequency: frequency of the allele {float}
:return: allele randomly choosen {0 or 1}
"""
return 1 if random.uniform(0, 1) < frequency else 0
def _get_genotypes_for_inds(nb_inds, haploid, freq):
"""
Get genotypes for each individual
:param nb_inds: number of individuals {int}
:param haploid: is the genome hamploid {bool}
:param freq: frequency of the allele {float}
:return: list of genotypes, vcf data {list}
"""
all_genotypes = []
genotypes_row = []
for i in range(1, nb_inds + 1):
genotype = str(_allele(freq)) + "/" + str(_allele(freq))
genotype_data = vcf.model._Call(None, "indiv" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
genotypes_row.append(genotype_data)
all_genotypes.append(genotype)
return all_genotypes, genotypes_row
def _svsort(sv, chrm, genotypes_for_inds):
"""
Function to sort regions
:param sv: the variant
:param chrm: chromosome {str}
:param genotypes_for_inds: dictionary of genotypes for each individual
return int(genotypes_for_inds[chrm][sv]["start"])
def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
"""
Build header of the VCF file
:param vcf_file: vcf file full path name {str}
:param prg_path: program path {str}
:param tmp_dir: temporary directory {str}
:param nb_inds: number of individuals {int}
"""
try:
with open(os.path.join(prg_path, "template.vcf"), "r") as template:
try:
with open(vcf_file, "w") as my_template:
for line in template:
if line[:6] == "#CHROM":
line = line.replace("\n", "")
for i in range(0, nb_inds):
line += "\n"
my_template.write(line)
except IOError:
raise ExecException("ERROR: unable to create template file \"{0}\" in temp dir.".
format(os.path.join(tmp_dir, "template.vcf")))
raise ExecException("ERROR: template file not found in program directory.")
def build_genotypes_vcf_list(deletions: dict, inversions: dict, output_vcf, haploid, force_polymorphism, nb_inds,
tmp_dir, prg_path):
"""
Build VCF file describing genotypes for each individual (and the associated python dictionary)
Floreal Cabanettes
committed
:param deletions: deletions description {dict}
:param inversions: inversions description {dict}
:param output_vcf: output VCF file full path name {str}
:param haploid: is haploid {bool}
:param force_polymorphism: force polymorphism {bool}
:param nb_inds: number of individuals {int}
:param tmp_dir: temporary directory {str}
:param prg_path: program path {str}
:return: dictionary of genotypes for each variant for each dictionary {OrderedDict}
"""
# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
vcf_file = os.path.join(tmp_dir, "template.vcf")
# Build VCF header:
_build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)
Floreal Cabanettes
committed
for chrm, deletes in deletions.items():
for delete in deletes:
if chrm not in genotypes_for_inds_DEL:
genotypes_for_inds_DEL[chrm] = {}
genotypes_for_inds_DEL[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []}
Floreal Cabanettes
committed
# Get genotypes:
all_genotypes, genotypes = [], []
if force_polymorphism:
polymorph = False
while not polymorph:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
polymorph = len(set(all_genotypes)) > 1
else:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"])
genotypes_for_inds_DEL[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]
Floreal Cabanettes
committed
info = {"END": delete["end"], "AF": delete["freq"]}
vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".",
".", info, "GT", [0], genotypes)
# Inversions:
genotypes_for_inds_INV = OrderedDict()
for chrm, the_inversions in inversions.items():
for inversion in the_inversions:
if chrm not in genotypes_for_inds_INV:
genotypes_for_inds_INV[chrm] = {}
genotypes_for_inds_INV[chrm][inversion["name"]] = {"start": inversion["start"], "end": inversion["end"],
"genotypes": []}
# Get genotypes:
all_genotypes, genotypes = [], []
if force_polymorphism:
polymorph = False
while not polymorph:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"])
polymorph = len(set(all_genotypes)) > 1
else:
all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"])
genotypes_for_inds_INV[chrm][inversion["name"]]["genotypes"] = [x.split("/") for x in all_genotypes]
info = {"END": inversion["end"], "AF": inversion["freq"]}
vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")],
".", ".", info, "GT", [0], genotypes)
records.sort(key=lambda r: (r.CHROM, r.start))
with open(output_vcf, "w") as o_vcf:
vcf_reader = vcf.Reader(filename=vcf_file)
vcf_writer = vcf.Writer(o_vcf, vcf_reader)
for record in records:
vcf_writer.write_record(record)
vcf_writer.close()
Floreal Cabanettes
committed
tabix_compress(output_vcf, output_vcf + ".gz", True)
tabix_index(output_vcf + ".gz", force=True, preset="vcf")
return genotypes_for_inds_DEL, genotypes_for_inds_INV
def load_genotypes_from_file(genotypes_file):
variants = VariantFile(genotypes_file)
genotypes_for_inds_DEL = {}
genotypes_for_inds_INV = {}
nb_inds = 0
for variant in variants:
chrm = variant.chrom
id_v = variant.id
start = variant.start
end = variant.stop
samples = variant.samples
gt = [list(map(str, samples[x]["GT"])) for x in samples]
genotypes_for_inds = None
if type_v == "DEL":
genotypes_for_inds = genotypes_for_inds_DEL
elif type_v == "INV":
genotypes_for_inds = genotypes_for_inds_INV
else:
print("ERROR: Variant type not supported: %s. Ignoring this line..." % type_v)
if genotypes_for_inds is not None:
if chrm not in genotypes_for_inds:
genotypes_for_inds[chrm] = {}
genotypes_for_inds[chrm][id_v] = {
"start": start,
"end": end,
"genotypes": gt
}
nb_inds = len(gt)
return nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV
def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
"""
Get list of all regions keeped (not deleted)
:param svs: list of variants (deletions) {list}
:param svs_infos: infos of variants {dict}
:param haploid: is haploid {bool}
:return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict}
"""
regions = OrderedDict()
current_region_pointer = OrderedDict()
for svs_i in svs:
i = 0
for genotypes in svs_infos[svs_i]["genotypes"]:
if i not in regions:
regions[i] = {}
current_region_pointer[i] = {}
for j in range(0, 2 if not haploid else 1): # For each chromosome of the diploid genome, or the chromosome
if j not in regions[i]:
regions[i][j] = []
current_region_pointer[i][j] = "0"
if svs_infos[svs_i]["genotypes"][i][j] == "1":
regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
current_region_pointer[i][j] = svs_infos[svs_i]["end"]
i += 1
return regions, current_region_pointer
def _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
"""
Build fasta files
:param chrm:
:param regions:
:param current_region_pointer:
:param output_dir:
:param fasta_orig_chr:
:param last_nt:
:return:
"""
for indiv, chrs_dips in regions.items():
id_chrs = list(chrs_dips.keys())
id_chrs = sorted(id_chrs)
chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
logs_regions = [] # Store logs
fasta = ""
last_one = 0
for chr_region in chr_dip:
fasta += fasta_orig_chr[n][c][int(chr_region[0]):int(chr_region[1])]
logs_regions.append("\t".join(str(x) for x in chr_region))
last_one = int(chr_region[1])
if last_one < last_nt:
logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)]))
fasta += fasta_orig_chr[n][c][int(current_region_pointer[indiv][id_chr]):last_nt]
fastas[n].append(fasta)
Floreal Cabanettes
committed
try:
with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
log_handle.write(chrm + "_" + str(id_chr) + "\t")
Floreal Cabanettes
committed
log_handle.write("\n\t".join(logs_regions))
log_handle.write("\n")
except IOError:
raise ExecException("ERROR: unable to write log file: \"{0}\"".
format(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log")))
def _build_fastas_inversions(fasta_orig_chrm: dict, genotypes_for_inds: dict, nb_inds, haploid):
fastas = []
for j in range(0, nb_inds):
fastas.append([fasta_orig_chrm] if haploid else [fasta_orig_chrm, fasta_orig_chrm])
for inv_name, props in genotypes_for_inds.items():
i = 0
for genotype in props["genotypes"]:
for k in range(0, len(genotype)):
if genotype[k] == "1":
start = int(props["start"])
end = int(props["end"])
o_id = fastas[i][k].id
o_name = fastas[i][k].name
o_description = fastas[i][k].description
fastas[i][k] = fastas[i][k][:start] + fastas[i][k][start:end].reverse_complement() + \
fastas[i][k].id = o_id
fastas[i][k].name = o_name
fastas[i][k].description = o_description
def _write_fastas(fastas, output_dir):
for indiv, chrs_dips in zip(range(0, len(fastas)), fastas):
for id_chr, fasta in zip(range(0, len(chrs_dips)), chrs_dips):
try:
with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
"a") as output_handle:
SeqIO.write(fasta, output_handle, "fasta")
except IOError:
raise ExecException("ERROR: unable to write \"{0}\" file.".
format(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) +
".fasta")))
def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, nb_inds,
printer):
printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
fasta_orig = SeqIO.index(reference, "fasta")
# First, inversions (does not change coordinates):
printer.out("STARTING WITH INVERSIONS...")
fastas_by_chrm = {}
for chrm, svs_infos in genotypes_for_inds_INV.items():
printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm))
svs = list(svs_infos.keys())
fastas_by_chrm[chrm] = _build_fastas_inversions(fasta_orig[chrm], genotypes_for_inds_INV[chrm], nb_inds, haploid)
printer.out("STARTING WITH DELETIONS...")
# Then, deletions:
for chrm, svs_infos in genotypes_for_inds_DEL.items():
printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm))
svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds_DEL))
if chrm in fastas_by_chrm:
fastas_orig_chr = fastas_by_chrm[chrm]
else: # Build data as _build_fastas_inversions does
fastas_orig_chr = []
for i in range(0, nb_inds):
fastas_orig_chr.append([fasta_orig[chrm]] if haploid else [fasta_orig[chrm], fasta_orig[chrm]])
# Compute keeped genomic regions for each diploid chromosome:
regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)
# Build FASTA of each diploid/haploid chromosome:
fastas_by_chrm[chrm] = _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fastas_orig_chr, last_nt)
printer.out("WRITING FASTAS...")
for chrm, fastas in fastas_by_chrm.items():
_write_fastas(fastas, output_dir)
printer.out("")
def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
prg_path, threads, stdout, stderr):
# Generate reads for all individuals:
cmd = [prg_path + "/pirs/pirs", "simulate", "-z", "-x", str(coverage), "-d", "-B",
prg_path + "/pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz", "-I",
prg_path + "/pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix", "-l", str(read_len), "-m",
str(insert_len_mean), "-v", str(insert_len_sd),
"-G", prg_path + "/pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat", "-t", str(threads), "-o"]
if haploid:
cmd.remove("-d")
cmd_full = cmd + [output_dir, "-s", "indiv%d" % i]
prefix_fasta = os.path.join(output_dir, "indiv" + str(i))
chr0 = prefix_fasta + "_chr_0.fasta"
chr1 = prefix_fasta + "_chr_1.fasta"
cmd_full += [chr0, chr1]
cmd_full.append(chr0)
subprocess.call(cmd_full, stdout=open(stdout, "a") if stdout is not None else None,
stderr=open(stderr, "a") if stderr is not None else None)
def confirm(deletions: dict, inversions: dict, variants: dict, printer: Print, ask=True):
Floreal Cabanettes
committed
nb_dels = 0
variants_del = sorted(variants["DEL"], key=lambda x: x["min"])
variants_del_counts = OrderedDict()
variants_del_counts_chr = {}
for variant in variants_del:
variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
for chrm, deletes in deletions.items():
nb_dels_chr = len(deletes)
nb_dels += nb_dels_chr
variants_del_counts_chr[chrm] = nb_dels_chr
for delete in deletes:
v_len = delete["length"]
for variant in variants_del:
if variant["min"] < v_len <= variant["max"]:
variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
printer.out("We generate {0} deletion variants.".format(nb_dels))
if nb_dels > 0:
printer.out("Ranges:")
for v_range, v_count in variants_del_counts.items():
printer.out("\t- Range {0}: {1}".format(v_range, v_count))
printer.out("Chromosomes:")
for chrm in sorted(list(variants_del_counts_chr.keys())):
printer.out("\t- {0}: {1}".format(chrm, variants_del_counts_chr[chrm]))
# inversions:
nb_invs = 0
variants_inv = sorted(variants["INV"], key=lambda x: x["min"])
variants_inv_counts = OrderedDict()
variants_inv_counts_chr = {}
for variant in variants_inv:
variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0
for chrm, the_inversions in inversions.items():
nb_invs_chr = len(the_inversions)
nb_invs += nb_invs_chr
variants_inv_counts_chr[chrm] = nb_invs_chr
for inversion in the_inversions:
v_len = inversion["length"]
for variant in variants_inv:
if variant["min"] < v_len <= variant["max"]:
variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1
break
printer.out("We generate {0} inversion variants.".format(nb_invs))
if nb_invs > 0:
printer.out("Ranges:")
for v_range, v_count in variants_inv_counts.items():
printer.out("\t- Range {0}: {1}".format(v_range, v_count))
printer.out("Chromosomes:")
for chrm in sorted(list(variants_inv_counts_chr.keys())):
printer.out("\t- {0}: {1}".format(chrm, variants_inv_counts_chr[chrm]))
if ask:
return input("Continue [Y/n]? ") in ["y", "Y", ""]
return True
Floreal Cabanettes
committed
def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, haploid,
force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None,
min_deletions=1, min_inversions=1, max_try=10, quiet=True, stdout=None, stderr=None):
printer = Print(stdout=stdout, stderr=stderr)
printer.err("Error: output directory {0} already exists.".format(output_dir))
printer.err("Error: unable to create output directory {0}: file exists.".format(output_dir))
if nb_inds is not None and nb_inds < 2:
printer.err("nb-inds must be at least 2")
return 1
if not os.path.isfile(reference + ".fai"):
os.system("samtools faidx " + reference)
tmp_dir = tempfile.mkdtemp()
prg_path = os.path.dirname(os.path.realpath(__file__))
############################
# Define fixed files names #
############################
output_vcf = os.path.join(output_dir, "genotypes.vcf")
################
# Launch steps #
################
deletions = None
sv_sim = None
if genotypes is None:
printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n")
try:
nb_deletions = -1 # To allow 0 deletions
nb_inversions = -1 # To allow 0 inversions
nb_try = 0
sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr)
printer.out("Try to generate at least %s deletions and %s inversions..." % (min_deletions, min_inversions))
max_del = 0
max_inv = 0
while (nb_deletions < min_deletions or nb_inversions < min_inversions) and nb_try < max_try:
printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try))
nb_deletions, deletions, nb_inversions, inversions = sv_sim.get_random_variants(proba_del, proba_inv,
reference)
max_del = max(max_del, nb_deletions)
max_inv = max(max_inv, nb_inversions)
sv_sim.print_flush()
nb_try += 1
if nb_deletions < min_deletions:
printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase "
"general probability to have a deletion." % min_deletions)
printer.out("\nMaximum of each event: %d deletions, %d inversions." % (max_del, max_inv))
if nb_inversions < min_inversions:
printer.err("\nUnable to generate %s inversions. Try to reduce size of inversions or increase "
"general probability to have an inversion." % min_inversions)
printer.out("\nMaximum of each event: %d deletions, %d inversions." % (max_del, max_inv))
printer.out("")
except InputException as e:
printer.err(e)
except IOError as e:
printer.err(e)
return 1
except Exception:
printer.err(traceback.format_exc())
return 1
printer_confirm = Print(stdout=os.path.join(output_dir, "confirm.txt"), stderr=stderr)
ok = confirm(deletions, inversions, sv_sim.variants, printer_confirm, False)
else:
ok = confirm(deletions, inversions, sv_sim.variants, printer_confirm)
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
if ok:
try:
if genotypes is None:
printer.out("GENERATE SUMMARY VCF FILE...\n")
genotypes_for_inds_DEL, genotypes_for_inds_INV = \
build_genotypes_vcf_list(deletions, inversions, output_vcf, haploid, force_polymorphism, nb_inds,
tmp_dir, prg_path)
else:
nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV = load_genotypes_from_file(genotypes)
if nb_inds < 2:
printer.err("nb-inds must be at least 2")
return 1
build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir,
nb_inds, printer)
printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean,
insert_len_sd, prg_path, threads, stdout, stderr)
printer.out("DONE!\n")
except ExecException as e:
printer.err(e)
return 1
except Exception:
printer.err(traceback.format_exc())
return 1
else:
printer.out("Aborted!\n")
def main():
args = parse_args()
init(**{k: v for k, v in vars(args).items() if k in init.__code__.co_varnames})