#!/usr/bin/env python3 import os import random import argparse from collections import OrderedDict import vcf from Bio import SeqIO parser = argparse.ArgumentParser(description='Generate simulated populations with SV') parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int) parser.add_argument("--reference", help="Reference genome", required=True) parser.add_argument("--sv-list", help="File containing the SVs", required=True) parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int) parser.add_argument("--output-directory", help="Output directory (default: res)", default="res") parser.add_argument("--tmp-directory", help="Temporary directory (default: tmp)", default="tmp") parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True, default=False) parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True, default=False) parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100) parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN", type=int, default=300) parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)", type=int, default=30) args = parser.parse_args() nb_inds = args.nb_inds if nb_inds < 2: raise Exception("nb-inds must be at least 2") reference = args.reference sv_list = args.sv_list output_dir = args.output_directory tmp_dir = args.tmp_directory haploid = args.haploid if not os.path.isfile(reference + ".fai"): os.system("samtools faidx " + reference) ############# # FUNCTIONS # ############# def allele(frequency): return 1 if random.uniform(0, 1) < frequency else 0 def get_genotypes_for_inds(): all_genotypes = [] genotypes_row = [] for i in range(1, nb_inds + 1): if not haploid: genotype = str(allele(freq)) + "/" + str(allele(freq)) else: genotype = str(allele(freq)) genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype)) genotypes_row.append(genotype_data) all_genotypes.append(genotype) return all_genotypes, genotypes_row def svsort(sv, chr): """ Function to sort regions """ return int(genotypes_for_inds[chr][sv]["start"]) prg_path = os.path.dirname(os.path.realpath(__file__)) if not os.path.isdir(tmp_dir): os.mkdir(tmp_dir) if not os.path.isdir(output_dir): os.mkdir(output_dir) ########################### # 1. Get random deletions # ########################### print("GENERATE SV DELs...\n") os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep))) ################################### # 2. Build BED files of deletions # ################################### with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed: with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe: for line in bedpe: freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5 parts = line.split("\t") bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n") ############################################################################### # 3. Build VCF files containing genotypes for the given number of individuals # ############################################################################### # VCF file is a result file, not used by this script print("GENERATE SUMMARY VCF FILE...\n") genotypes_for_inds = OrderedDict() # { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...} # Build VCF header: with open(os.path.join(prg_path, "template.vcf"), "r") as template: with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template: for line in template: if line[:6] == "#CHROM": line = line.replace("\n", "") for i in range(0, nb_inds): line += "\tINDIV_" + str(i+1) line += "\n" my_template.write(line) with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed: vcf_reader = vcf.Reader(filename=os.path.join(tmp_dir, 'template.vcf')) output_vcf = os.path.join(output_dir, "genotypes.vcf") vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader) for line in bed: parts = line.replace("\n", "").split("\t") freq = float(parts[4]) if parts[0] not in genotypes_for_inds: genotypes_for_inds[parts[0]] = {} genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []} # Get genotypes: all_genotypes, genotypes = [], [] if args.force_polymorphism: polymorph = False while not polymorph: all_genotypes, genotypes = get_genotypes_for_inds() polymorph = len(set(all_genotypes)) > 1 else: all_genotypes, genotypes = get_genotypes_for_inds() genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes] info = {"END": int(parts[2]), "AF": freq} vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) vcf_writer.write_record(vcf_record) vcf_writer.close() # Bgzip + tabix: os.system("bgzip -c " + output_vcf + " > " + output_vcf + ".gz") os.unlink(output_vcf) output_vcf += ".gz" os.system("tabix -p vcf " + output_vcf) ############################################### # Build fasta chromosomes for each individual # ############################################### print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") fasta_orig = SeqIO.index(reference, "fasta") for chr, svs_infos in genotypes_for_inds.items(): print("PROCESSING CHROMOSOME {0}...\n".format(chr)) svs = list(svs_infos.keys()) svs = sorted(svs, key=lambda x:svsort(x, chr)) fasta_orig_chr = fasta_orig[chr] last_nt = len(fasta_orig_chr)-1 # Compute keeped genomic regions for each diploid chromosome: regions = OrderedDict() current_region_pointer = OrderedDict() for svs_i in svs: i = 0 for genotypes in svs_infos[svs_i]["genotypes"]: if i not in regions: regions[i] = {} current_region_pointer[i] = {} for j in range(0, 2 if not haploid else 1): # For each chromosome of the diploid genome, or the chromosome # of the haploid genome if j not in regions[i]: regions[i][j] = [] current_region_pointer[i][j] = "0" if svs_infos[svs_i]["genotypes"][i][j] == "1": regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]]) current_region_pointer[i][j] = svs_infos[svs_i]["end"] i += 1 # Build FASTA of each diploid/haploid chromosome: for indiv, chrs_dips in regions.items(): id_chrs = list(chrs_dips.keys()) id_chrs = sorted(id_chrs) for id_chr in id_chrs: chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome logs_regions = [] # Store logs # Build FASTA and store logs: with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle: fasta = "" last_one = 0 for chr_region in chr_dip: fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])] logs_regions.append("\t".join(str(x) for x in chr_region)) last_one = int(chr_region[1]) if last_one < last_nt: logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt] SeqIO.write(fasta, output_handle, "fasta") # Write logs with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"), "a") as log_handle: log_handle.write(chr + "_" + str(id_chr) + "\t") log_handle.write("\n\t".join(logs_regions)) log_handle.write("\n") print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n") # Generate reads for all individuals: if not haploid: cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz " "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} " "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}") else: cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz " "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} " "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}") for i in range(1, nb_inds+1): prefix = os.path.join(output_dir, "INDIV_" + str(i)) chr0 = prefix + "_chr_0.fasta" chr1 = prefix + "_chr_1.fasta" if not haploid: os.system(cmd.format(prefix, chr0, chr1, args.coverage, prg_path + os.path.sep, args.read_len, args.insert_len_mean, args.insert_len_sd)) else: os.system(cmd.format(prefix, chr0, "", args.coverage, prg_path + os.path.sep, args.read_len, args.insert_len_mean, args.insert_len_sd)) print("DONE!\n")