#!/usr/bin/env python3 import sys import os import shutil import random import argparse import traceback from collections import OrderedDict import vcf from Bio import SeqIO import tempfile from pysam import tabix_compress, tabix_index, VariantFile from lib.variants_simulator import VariantsSimulator from lib.exceptions import InputException, ExecException import multiprocessing import subprocess class Print: def __init__(self, stdout=None, stderr=None): self.stdout = stdout self.stderr = stderr if self.stdout is not None and os.path.isfile(self.stdout): os.remove(self.stdout) if self.stderr is not None and os.path.isfile(self.stderr): os.remove(self.stderr) def out(self, message): if self.stdout is not None: with open(self.stdout, "a") as stdout_f: print(message, file=stdout_f) else: print(message, file=sys.stdout) def err(self, message): if self.stderr is not None: with open(self.stderr, "a") as stderr_f: print(message, file=stderr_f) else: print(message, file=sys.stderr) def check_min_size(value): ivalue = int(value) if ivalue <= 1: raise argparse.ArgumentTypeError("%s is an invalid size value (>= 1)" % value) return ivalue def parse_args(): """ Parse script arguments :return: argparse arguments object """ parser = argparse.ArgumentParser(description='Generate simulated populations with SV', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", type=int) parser.add_argument("-r", "--reference", help="Reference genome", required=True) parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file", required=False) parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False) parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int) parser.add_argument("-o", "--output-dir", help="Output directory", default="res") parser.add_argument("-e", "--force-outputdir", help="Delete output directory before start, if already exists", action='store_const', const=True, default=False) parser.add_argument("-f", "--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True, default=False) parser.add_argument("-a", "--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True, default=False) parser.add_argument("-pd", "--proba-del", help="Probabilty to have a deletion", type=float, default=0.000001) parser.add_argument("-pi", "--proba-inv", help="Probablity to have an insertion", type=float, default=0.000001) parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100) parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN", type=int, default=300) parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)", type=int, default=30) parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const", const=True, default=False) parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=0, type=check_min_size) parser.add_argument("-mi", "--min-inversions", help="Minimum of inversions to generate (>=1)", default=0, type=check_min_size) parser.add_argument("--max-try", help="Maximum of tries", default=10, type=check_min_size) parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int) parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals") args = parser.parse_args() if args.nb_inds is None and args.genotypes is None: parser.error("--nb-inds parameter is required (except if --genotypes parameter is given)") if args.proba_del > 1 or args.proba_del < 0: parser.error("Probability of deletions must be between 0 and 1") if args.proba_inv > 1 or args.proba_inv < 0: parser.error("Probability of inversions must be between 0 and 1") if args.proba_del + args.proba_inv > 1: parser.error("Sum of inversions and deletions probabilities must be lesser than 1") return args def _allele(frequency): """ Get randomly an allele, given a frequency :param frequency: frequency of the allele {float} :return: allele randomly choosen {0 or 1} """ return 1 if random.uniform(0, 1) < frequency else 0 def _get_genotypes_for_inds(nb_inds, haploid, freq): """ Get genotypes for each individual :param nb_inds: number of individuals {int} :param haploid: is the genome hamploid {bool} :param freq: frequency of the allele {float} :return: list of genotypes, vcf data {list} """ all_genotypes = [] genotypes_row = [] for i in range(1, nb_inds + 1): if not haploid: genotype = str(_allele(freq)) + "/" + str(_allele(freq)) else: genotype = str(_allele(freq)) genotype_data = vcf.model._Call(None, "indiv" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype)) genotypes_row.append(genotype_data) all_genotypes.append(genotype) return all_genotypes, genotypes_row def _svsort(sv, chrm, genotypes_for_inds): """ Function to sort regions :param sv: the variant :param chrm: chromosome {str} :param genotypes_for_inds: dictionary of genotypes for each individual """ return int(genotypes_for_inds[chrm][sv]["start"]) def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds): """ Build header of the VCF file :param vcf_file: vcf file full path name {str} :param prg_path: program path {str} :param tmp_dir: temporary directory {str} :param nb_inds: number of individuals {int} """ try: with open(os.path.join(prg_path, "template.vcf"), "r") as template: try: with open(vcf_file, "w") as my_template: for line in template: if line[:6] == "#CHROM": line = line.replace("\n", "") for i in range(0, nb_inds): line += "\tindiv" + str(i+1) line += "\n" my_template.write(line) except IOError: raise ExecException("ERROR: unable to create template file \"{0}\" in temp dir.". format(os.path.join(tmp_dir, "template.vcf"))) except IOError: raise ExecException("ERROR: template file not found in program directory.") def build_genotypes_vcf_list(deletions: dict, inversions: dict, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path): """ Build VCF file describing genotypes for each individual (and the associated python dictionary) :param deletions: deletions description {dict} :param inversions: inversions description {dict} :param output_vcf: output VCF file full path name {str} :param haploid: is haploid {bool} :param force_polymorphism: force polymorphism {bool} :param nb_inds: number of individuals {int} :param tmp_dir: temporary directory {str} :param prg_path: program path {str} :return: dictionary of genotypes for each variant for each dictionary {OrderedDict} """ genotypes_for_inds_DEL = OrderedDict() # { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...} vcf_file = os.path.join(tmp_dir, "template.vcf") # Build VCF header: _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds) records = [] # Deletions: for chrm, deletes in deletions.items(): for delete in deletes: if chrm not in genotypes_for_inds_DEL: genotypes_for_inds_DEL[chrm] = {} genotypes_for_inds_DEL[chrm][delete["name"]] = {"start": delete["start"], "end": delete["end"], "genotypes": []} # Get genotypes: all_genotypes, genotypes = [], [] if force_polymorphism: polymorph = False while not polymorph: all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"]) polymorph = len(set(all_genotypes)) > 1 else: all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, delete["freq"]) genotypes_for_inds_DEL[chrm][delete["name"]]["genotypes"] = [x.split("/") for x in all_genotypes] info = {"END": delete["end"], "AF": delete["freq"]} vcf_record = vcf.model._Record(chrm, delete["start"], delete["name"], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) records.append(vcf_record) # Inversions: genotypes_for_inds_INV = OrderedDict() for chrm, the_inversions in inversions.items(): for inversion in the_inversions: if chrm not in genotypes_for_inds_INV: genotypes_for_inds_INV[chrm] = {} genotypes_for_inds_INV[chrm][inversion["name"]] = {"start": inversion["start"], "end": inversion["end"], "genotypes": []} # Get genotypes: all_genotypes, genotypes = [], [] if force_polymorphism: polymorph = False while not polymorph: all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"]) polymorph = len(set(all_genotypes)) > 1 else: all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, inversion["freq"]) genotypes_for_inds_INV[chrm][inversion["name"]]["genotypes"] = [x.split("/") for x in all_genotypes] info = {"END": inversion["end"], "AF": inversion["freq"]} vcf_record = vcf.model._Record(chrm, inversion["start"], inversion["name"], "N", [vcf.model._SV("INV")], ".", ".", info, "GT", [0], genotypes) records.append(vcf_record) records.sort(key=lambda r: (r.CHROM, r.start)) with open(output_vcf, "w") as o_vcf: vcf_reader = vcf.Reader(filename=vcf_file) vcf_writer = vcf.Writer(o_vcf, vcf_reader) for record in records: vcf_writer.write_record(record) vcf_writer.close() tabix_compress(output_vcf, output_vcf + ".gz", True) tabix_index(output_vcf + ".gz", force=True, preset="vcf") os.remove(output_vcf) return genotypes_for_inds_DEL, genotypes_for_inds_INV def load_genotypes_from_file(genotypes_file): variants = VariantFile(genotypes_file) genotypes_for_inds_DEL = {} genotypes_for_inds_INV = {} nb_inds = 0 for variant in variants: chrm = variant.chrom id_v = variant.id start = variant.start end = variant.stop type_v = variant.alts[0][1:-1] samples = variant.samples gt = [list(map(str, samples[x]["GT"])) for x in samples] genotypes_for_inds = None if type_v == "DEL": genotypes_for_inds = genotypes_for_inds_DEL elif type_v == "INV": genotypes_for_inds = genotypes_for_inds_INV else: print("ERROR: Variant type not supported: %s. Ignoring this line..." % type_v) if genotypes_for_inds is not None: if chrm not in genotypes_for_inds: genotypes_for_inds[chrm] = {} genotypes_for_inds[chrm][id_v] = { "start": start, "end": end, "genotypes": gt } nb_inds = len(gt) return nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV def _compute_keeped_genomic_regions(svs, svs_infos, haploid): """ Get list of all regions keeped (not deleted) :param svs: list of variants (deletions) {list} :param svs_infos: infos of variants {dict} :param haploid: is haploid {bool} :return: regions for each individuals {OrderedDict}, the last position for each region {OrderedDict} """ regions = OrderedDict() current_region_pointer = OrderedDict() for svs_i in svs: i = 0 for genotypes in svs_infos[svs_i]["genotypes"]: if i not in regions: regions[i] = {} current_region_pointer[i] = {} for j in range(0, 2 if not haploid else 1): # For each chromosome of the diploid genome, or the chromosome # of the haploid genome if j not in regions[i]: regions[i][j] = [] current_region_pointer[i][j] = "0" if svs_infos[svs_i]["genotypes"][i][j] == "1": regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]]) current_region_pointer[i][j] = svs_infos[svs_i]["end"] i += 1 return regions, current_region_pointer def _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt): """ Build fasta files :param chrm: :param regions: :param current_region_pointer: :param output_dir: :param fasta_orig_chr: :param last_nt: :return: """ n = 0 fastas = [] for indiv, chrs_dips in regions.items(): id_chrs = list(chrs_dips.keys()) id_chrs = sorted(id_chrs) c = 0 fastas.append([]) for id_chr in id_chrs: chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome logs_regions = [] # Store logs # Build FASTA and store logs: fasta = "" last_one = 0 for chr_region in chr_dip: fasta += fasta_orig_chr[n][c][int(chr_region[0]):int(chr_region[1])] logs_regions.append("\t".join(str(x) for x in chr_region)) last_one = int(chr_region[1]) if last_one < last_nt: logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) fasta += fasta_orig_chr[n][c][int(current_region_pointer[indiv][id_chr]):last_nt] fastas[n].append(fasta) # Write logs try: with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"), "a") as log_handle: log_handle.write(chrm + "_" + str(id_chr) + "\t") log_handle.write("\n\t".join(logs_regions)) log_handle.write("\n") except IOError: raise ExecException("ERROR: unable to write log file: \"{0}\"". format(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"))) c += 1 n += 1 return fastas def _build_fastas_inversions(fasta_orig_chrm: dict, genotypes_for_inds: dict, nb_inds, haploid): fastas = [] for j in range(0, nb_inds): fastas.append([fasta_orig_chrm] if haploid else [fasta_orig_chrm, fasta_orig_chrm]) for inv_name, props in genotypes_for_inds.items(): i = 0 for genotype in props["genotypes"]: for k in range(0, len(genotype)): if genotype[k] == "1": start = int(props["start"]) end = int(props["end"]) o_id = fastas[i][k].id o_name = fastas[i][k].name o_description = fastas[i][k].description fastas[i][k] = fastas[i][k][:start] + fastas[i][k][start:end].reverse_complement() + \ fastas[i][k][end:] fastas[i][k].id = o_id fastas[i][k].name = o_name fastas[i][k].description = o_description i += 1 return fastas def _write_fastas(fastas, output_dir): for indiv, chrs_dips in zip(range(0, len(fastas)), fastas): for id_chr, fasta in zip(range(0, len(chrs_dips)), chrs_dips): try: with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle: SeqIO.write(fasta, output_handle, "fasta") except IOError: raise ExecException("ERROR: unable to write \"{0}\" file.". format(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"))) def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, nb_inds, printer): printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") fasta_orig = SeqIO.index(reference, "fasta") # First, inversions (does not change coordinates): printer.out("STARTING WITH INVERSIONS...") fastas_by_chrm = {} for chrm, svs_infos in genotypes_for_inds_INV.items(): printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm)) svs = list(svs_infos.keys()) fastas_by_chrm[chrm] = _build_fastas_inversions(fasta_orig[chrm], genotypes_for_inds_INV[chrm], nb_inds, haploid) printer.out("STARTING WITH DELETIONS...") # Then, deletions: for chrm, svs_infos in genotypes_for_inds_DEL.items(): printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm)) svs = list(svs_infos.keys()) svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds_DEL)) if chrm in fastas_by_chrm: fastas_orig_chr = fastas_by_chrm[chrm] else: # Build data as _build_fastas_inversions does fastas_orig_chr = [] for i in range(0, nb_inds): fastas_orig_chr.append([fasta_orig[chrm]] if haploid else [fasta_orig[chrm], fasta_orig[chrm]]) last_nt = len(fastas_orig_chr[0][0])-1 # Compute keeped genomic regions for each diploid chromosome: regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid) # Build FASTA of each diploid/haploid chromosome: fastas_by_chrm[chrm] = _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fastas_orig_chr, last_nt) printer.out("WRITING FASTAS...") for chrm, fastas in fastas_by_chrm.items(): _write_fastas(fastas, output_dir) printer.out("") def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd, prg_path, threads, stdout, stderr): # Generate reads for all individuals: cmd = [prg_path + "/pirs/pirs", "simulate", "-z", "-x", str(coverage), "-d", "-B", prg_path + "/pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz", "-I", prg_path + "/pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix", "-l", str(read_len), "-m", str(insert_len_mean), "-v", str(insert_len_sd), "-G", prg_path + "/pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat", "-t", str(threads), "-o"] if haploid: cmd.remove("-d") for i in range(1, nb_inds+1): cmd_full = cmd + [output_dir, "-s", "indiv%d" % i] prefix_fasta = os.path.join(output_dir, "indiv" + str(i)) chr0 = prefix_fasta + "_chr_0.fasta" if not haploid: chr1 = prefix_fasta + "_chr_1.fasta" cmd_full += [chr0, chr1] else: cmd_full.append(chr0) subprocess.call(cmd_full, stdout=open(stdout, "a") if stdout is not None else None, stderr=open(stderr, "a") if stderr is not None else None) def confirm(deletions: dict, inversions: dict, variants: dict, printer: Print, ask=True): # Deletions: nb_dels = 0 variants_del = sorted(variants["DEL"], key=lambda x: x["min"]) variants_del_counts = OrderedDict() variants_del_counts_chr = {} for variant in variants_del: variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0 for chrm, deletes in deletions.items(): nb_dels_chr = len(deletes) nb_dels += nb_dels_chr variants_del_counts_chr[chrm] = nb_dels_chr for delete in deletes: v_len = delete["length"] for variant in variants_del: if variant["min"] < v_len <= variant["max"]: variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 break printer.out("We generate {0} deletion variants.".format(nb_dels)) if nb_dels > 0: printer.out("Ranges:") for v_range, v_count in variants_del_counts.items(): printer.out("\t- Range {0}: {1}".format(v_range, v_count)) printer.out("Chromosomes:") for chrm in sorted(list(variants_del_counts_chr.keys())): printer.out("\t- {0}: {1}".format(chrm, variants_del_counts_chr[chrm])) printer.out("") # inversions: nb_invs = 0 variants_inv = sorted(variants["INV"], key=lambda x: x["min"]) variants_inv_counts = OrderedDict() variants_inv_counts_chr = {} for variant in variants_inv: variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] = 0 for chrm, the_inversions in inversions.items(): nb_invs_chr = len(the_inversions) nb_invs += nb_invs_chr variants_inv_counts_chr[chrm] = nb_invs_chr for inversion in the_inversions: v_len = inversion["length"] for variant in variants_inv: if variant["min"] < v_len <= variant["max"]: variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 break printer.out("We generate {0} inversion variants.".format(nb_invs)) if nb_invs > 0: printer.out("Ranges:") for v_range, v_count in variants_inv_counts.items(): printer.out("\t- Range {0}: {1}".format(v_range, v_count)) printer.out("Chromosomes:") for chrm in sorted(list(variants_inv_counts_chr.keys())): printer.out("\t- {0}: {1}".format(chrm, variants_inv_counts_chr[chrm])) printer.out("") # Confirm: if ask: return input("Continue [Y/n]? ") in ["y", "Y", ""] return True def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, proba_inv, haploid, force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, genotypes=None, min_deletions=1, min_inversions=1, max_try=10, quiet=True, stdout=None, stderr=None): printer = Print(stdout=stdout, stderr=stderr) if os.path.isdir(output_dir): if force_outputdir: shutil.rmtree(output_dir) else: printer.err("Error: output directory {0} already exists.".format(output_dir)) return 1 elif os.path.isfile(output_dir): printer.err("Error: unable to create output directory {0}: file exists.".format(output_dir)) return 1 if nb_inds is not None and nb_inds < 2: printer.err("nb-inds must be at least 2") return 1 if not os.path.isfile(reference + ".fai"): os.system("samtools faidx " + reference) tmp_dir = tempfile.mkdtemp() prg_path = os.path.dirname(os.path.realpath(__file__)) os.mkdir(output_dir) ############################ # Define fixed files names # ############################ output_vcf = os.path.join(output_dir, "genotypes.vcf") ################ # Launch steps # ################ deletions = None inversions = None sv_sim = None if genotypes is None: printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n") try: nb_deletions = -1 # To allow 0 deletions nb_inversions = -1 # To allow 0 inversions nb_try = 0 sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr) printer.out("Try to generate at least %s deletions and %s inversions..." % (min_deletions, min_inversions)) max_del = 0 max_inv = 0 while (nb_deletions < min_deletions or nb_inversions < min_inversions) and nb_try < max_try: printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try)) nb_deletions, deletions, nb_inversions, inversions = sv_sim.get_random_variants(proba_del, proba_inv, reference) max_del = max(max_del, nb_deletions) max_inv = max(max_inv, nb_inversions) sv_sim.print_flush() nb_try += 1 if nb_deletions < min_deletions: printer.err("\nUnable to generate %s deletions. Try to reduce size of deletions or increase " "general probability to have a deletion." % min_deletions) printer.out("\nMaximum of each event: %d deletions, %d inversions." % (max_del, max_inv)) return 1 if nb_inversions < min_inversions: printer.err("\nUnable to generate %s inversions. Try to reduce size of inversions or increase " "general probability to have an inversion." % min_inversions) printer.out("\nMaximum of each event: %d deletions, %d inversions." % (max_del, max_inv)) return 1 printer.out("") except InputException as e: printer.err(e) return 1 except IOError as e: printer.err(e) return 1 except Exception: printer.err(traceback.format_exc()) return 1 printer_confirm = printer ok = True if genotypes is None: if quiet: printer_confirm = Print(stdout=os.path.join(output_dir, "confirm.txt"), stderr=stderr) ok = confirm(deletions, inversions, sv_sim.variants, printer_confirm, False) else: ok = confirm(deletions, inversions, sv_sim.variants, printer_confirm) if ok: try: if genotypes is None: printer.out("GENERATE SUMMARY VCF FILE...\n") genotypes_for_inds_DEL, genotypes_for_inds_INV = \ build_genotypes_vcf_list(deletions, inversions, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path) else: nb_inds, genotypes_for_inds_DEL, genotypes_for_inds_INV = load_genotypes_from_file(genotypes) if nb_inds < 2: printer.err("nb-inds must be at least 2") return 1 build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, nb_inds, printer) printer.out("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n") generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd, prg_path, threads, stdout, stderr) printer.out("DONE!\n") except ExecException as e: printer.err(e) return 1 except Exception: printer.err(traceback.format_exc()) return 1 else: printer.out("Aborted!\n") return 0 def main(): args = parse_args() init(**{k: v for k, v in vars(args).items() if k in init.__code__.co_varnames}) if __name__ == '__main__': sys.exit(main())