diff --git a/build_pop.py b/build_pop.py
index d1105cb549a2da7ecc97c30bd0d39cbd26a33ddb..c0fb88bdd948cedfad2bcfea61804929c849d674 100755
--- a/build_pop.py
+++ b/build_pop.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import sys
 import os
 import random
 import argparse
@@ -8,192 +9,220 @@ import vcf
 from Bio import SeqIO
 import tempfile
 from lib.svrunner_utils import eprint
+from pysam import tabix_compress, tabix_index
 
 
-parser = argparse.ArgumentParser(description='Generate simulated populations with SV')
-parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int)
-parser.add_argument("--reference", help="Reference genome", required=True)
-parser.add_argument("--sv-list", help="File containing the SVs", required=True)
-parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int)
-parser.add_argument("--output-directory", help="Output directory (default: res)", default="res")
-parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
-                    default=False)
-parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
-                    default=False)
-parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
-parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
-                    type=int, default=300)
-parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
-                    type=int, default=30)
-args = parser.parse_args()
-
-nb_inds = args.nb_inds
-
-if nb_inds < 2:
-    raise Exception("nb-inds must be at least 2")
-
-reference = args.reference
-sv_list = args.sv_list
-output_dir = args.output_directory
-tmp_dir = tempfile.mkdtemp()
-haploid = args.haploid
-
-if not os.path.isfile(reference + ".fai"):
-    os.system("samtools faidx " + reference)
-
-
-#############
-# FUNCTIONS #
-#############
-
-
-def allele(frequency):
+def parse_args():
+    """
+    Parse script arguments
+    :return: argparse arguments object
+    """
+    parser = argparse.ArgumentParser(description='Generate simulated populations with SV')
+    parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int)
+    parser.add_argument("--reference", help="Reference genome", required=True)
+    parser.add_argument("--sv-list", help="File containing the SVs", required=True)
+    parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int)
+    parser.add_argument("--output-directory", help="Output directory (default: res)", default="res")
+    parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True,
+                        default=False)
+    parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True,
+                        default=False)
+    parser.add_argument("-l", "--read-len", help="Generate reads having a length of LEN", type=int, default=100)
+    parser.add_argument("-m", "--insert-len-mean", help="Generate inserts (fragments) having an average length of LEN",
+                        type=int, default=300)
+    parser.add_argument("-v", "--insert-len-sd", help="Set the standard deviation of the insert (fragment) length (%%)",
+                        type=int, default=30)
+
+    args = parser.parse_args()
+    return args
+
+
+def _allele(frequency):
+    """
+    Randomly choose an allele, given its frequency
+    :param frequency: frequency of the allele {float}
+    :return: allele randomly chosen {0 or 1}
+    """
     return 1 if random.uniform(0, 1) < frequency else 0
 
 
-def get_genotypes_for_inds():
+def _get_genotypes_for_inds(nb_inds, haploid, freq):
+    """
+    Get genotypes for each individual
+    :param nb_inds: number of individuals {int}
+    :param haploid: is the genome haploid {bool}
+    :param freq: frequency of the allele {float}
+    :return: list of genotypes, vcf data {list}
+    """
     all_genotypes = []
     genotypes_row = []
     for i in range(1, nb_inds + 1):
         if not haploid:
-            genotype = str(allele(freq)) + "/" + str(allele(freq))
+            genotype = str(_allele(freq)) + "/" + str(_allele(freq))
         else:
-            genotype = str(allele(freq))
+            genotype = str(_allele(freq))
         genotype_data = vcf.model._Call(None, "INDIV_" + str(i), vcf.model.make_calldata_tuple("GT")(GT=genotype))
         genotypes_row.append(genotype_data)
         all_genotypes.append(genotype)
     return all_genotypes, genotypes_row
 
 
-def svsort(sv, chr):
+def _svsort(sv, chrm, genotypes_for_inds):
     """
     Function to sort regions
+    :param sv: the variant
+    :param chrm: chromosome {str}
+    :param genotypes_for_inds: dictionary of genotypes for each individual
     """
-    return int(genotypes_for_inds[chr][sv]["start"])
-
-
-prg_path = os.path.dirname(os.path.realpath(__file__))
-
-if not os.path.isdir(output_dir):
-    os.mkdir(output_dir)
+    return int(genotypes_for_inds[chrm][sv]["start"])
 
 
 ###########################
 # 1. Get random deletions #
 ###########################
 
-print("GENERATE SV DELs...\n")
+def get_random_deletions(sv_list, reference, tmp_dir, prg_path):
+    """
+    Get random deletions
+    :param sv_list: file describing deletions (in SVsim format) {str}
+    :param reference: reference genome fasta file {str}
+    :param tmp_dir: temporary directory {str}
+    :param prg_path: program path {str}
+    """
+
+    print("GENERATE SV DELs...\n")
 
-os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep)))
+    os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv".format(sv_list, reference, tmp_dir, os.path.sep)))
 
 
 ###################################
 # 2. Build BED files of deletions #
 ###################################
 
-try:
-    with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed:
-        try:
-            with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe:
-                for line in bedpe:
-                    freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5
-                    parts = line.split("\t")
-                    bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n")
-        except IOError:
-            eprint("ERROR: SVSim failed to generate variants.")
-            exit(1)
-except IOError:
-    eprint("ERROR: Unable to generate BED file \"{0}\".".format(os.path.join(tmp_dir, "reference-sv.bed")))
-    exit(1)
+def build_bed_deletions_file(bed_file, tmp_dir):
+    """
+    Build BED file of deletions
+    :param bed_file: full path of the BED file to create {str}
+    :param tmp_dir: temporary directory {str}
+    """
+    try:
+        with open(bed_file, "w") as bed:
+            try:
+                with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe:
+                    for line in bedpe:
+                        freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5
+                        parts = line.split("\t")
+                        bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n")
+            except IOError:
+                eprint("ERROR: SVSim failed to generate variants.")
+                exit(1)
+    except IOError:
+        eprint("ERROR: Unable to generate BED file \"{0}\".".format(os.path.join(tmp_dir, "reference-sv.bed")))
+        exit(1)
+
 
 
 ###############################################################################
 # 3. Build VCF files containing genotypes for the given number of individuals #
 ###############################################################################
 
 # VCF file is a result file, not used by this script
 
-print("GENERATE SUMMARY VCF FILE...\n")
-
-genotypes_for_inds = OrderedDict()
-# { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
-
-# Build VCF header:
-try:
-    with open(os.path.join(prg_path, "template.vcf"), "r") as template:
-        try:
-            with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template:
-                for line in template:
-                    if line[:6] == "#CHROM":
-                        line = line.replace("\n", "")
-                        for i in range(0, nb_inds):
-                            line += "\tINDIV_" + str(i+1)
-                        line += "\n"
-                    my_template.write(line)
-        except IOError:
-            eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
-            exit(1)
-except IOError:
-    eprint("ERROR: template file not found in program directory.")
-    exit(1)
-
-try:
-    with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed:
-        vcf_reader = vcf.Reader(filename=os.path.join(tmp_dir, 'template.vcf'))
-        output_vcf = os.path.join(output_dir, "genotypes.vcf")
-        vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
-        for line in bed:
-            parts = line.replace("\n", "").split("\t")
-            freq = float(parts[4])
-            if parts[0] not in genotypes_for_inds:
-                genotypes_for_inds[parts[0]] = {}
-            genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
-
-            # Get genotypes:
-            all_genotypes, genotypes = [], []
-            if args.force_polymorphism:
-                polymorph = False
-                while not polymorph:
-                    all_genotypes, genotypes = get_genotypes_for_inds()
-                    polymorph = len(set(all_genotypes)) > 1
-            else:
-                all_genotypes, genotypes = get_genotypes_for_inds()
-            genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
-
-            info = {"END": int(parts[2]), "AF": freq}
-            vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
-            vcf_writer.write_record(vcf_record)
-        vcf_writer.close()
-
-        # Bgzip + tabix:
-        os.system("bgzip -c " + output_vcf + " > " + output_vcf + ".gz")
-        os.unlink(output_vcf)
-        output_vcf += ".gz"
-        os.system("tabix -p vcf " + output_vcf)
-except IOError:
-    eprint("ERROR: Unable to load bed file \"{0}\": file not found.".format(os.path.join(tmp_dir, "reference-sv.bed")))
-    exit(1)
+def _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds):
+    """
+    Build header of the VCF file
+    :param vcf_file: full path of the VCF file to create {str}
+    :param prg_path: program path {str}
+    :param tmp_dir: temporary directory {str}
+    :param nb_inds: number of individuals {int}
+    """
+    try:
+        with open(os.path.join(prg_path, "template.vcf"), "r") as template:
+            try:
+                with open(vcf_file, "w") as my_template:
+                    for line in template:
+                        if line[:6] == "#CHROM":
+                            line = line.replace("\n", "")
+                            for i in range(0, nb_inds):
+                                line += "\tINDIV_" + str(i+1)
+                            line += "\n"
+                        my_template.write(line)
+            except IOError:
+                eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf")))
+                exit(1)
+    except IOError:
+        eprint("ERROR: template file not found in program directory.")
+        exit(1)
+def build_genotypes_vcf_list(bed_file, output_vcf, haploid, force_polymorphism, nb_inds, tmp_dir, prg_path):
+    """
+    Build VCF file describing genotypes for each individual (and the associated python dictionary)
+    :param bed_file: bed file describing deletions {str}
+    :param output_vcf: output VCF file full path name {str}
+    :param haploid: is haploid {bool}
+    :param force_polymorphism: force polymorphism {bool}
+    :param output_dir: output directory {str}
+    :param nb_inds: number of individuals {int}
+    :param tmp_dir: temporary directory {str}
+    :param prg_path: program path {str}
+    :return: dictionary of genotypes for each variant of each individual {OrderedDict}
+    """
+    print("GENERATE SUMMARY VCF FILE...\n")
+    genotypes_for_inds = OrderedDict()
+    # { chr: { id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...}
+
+    vcf_file = os.path.join(tmp_dir, "template.vcf")
+
+    # Build VCF header:
+    _build_vcf_header(vcf_file, prg_path, tmp_dir, nb_inds)
+
+    try:
+        with open(bed_file, "r") as bed:
+            vcf_reader = vcf.Reader(filename=vcf_file)
+            vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader)
+            for line in bed:
+                parts = line.replace("\n", "").split("\t")
+                freq = float(parts[4])
+                if parts[0] not in genotypes_for_inds:
+                    genotypes_for_inds[parts[0]] = {}
+                genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []}
+
+                # Get genotypes:
+                all_genotypes, genotypes = [], []
+                if force_polymorphism:
+                    polymorph = False
+                    while not polymorph:
+                        all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, freq)
+                        polymorph = len(set(all_genotypes)) > 1
+                else:
+                    all_genotypes, genotypes = _get_genotypes_for_inds(nb_inds, haploid, freq)
+                genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes]
+
+                info = {"END": int(parts[2]), "AF": freq}
+                vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes)
+                vcf_writer.write_record(vcf_record)
+            vcf_writer.close()
+
+            tabix_compress(output_vcf, output_vcf + ".gz", True)
+            tabix_index(output_vcf + ".gz", force=True, preset="vcf")
+    except IOError:
+        eprint("ERROR: Unable to load bed file \"{0}\": file not found.".format(bed_file))
+        exit(1)
+
+    return genotypes_for_inds
+
 
 
 ###############################################
 # Build fasta chromosomes for each individual #
 ###############################################
-print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
-
-
-fasta_orig = SeqIO.index(reference, "fasta")
-
-for chr, svs_infos in genotypes_for_inds.items():
-
-    print("PROCESSING CHROMOSOME {0}...\n".format(chr))
-
-    svs = list(svs_infos.keys())
-    svs = sorted(svs, key=lambda x:svsort(x, chr))
-
-    fasta_orig_chr = fasta_orig[chr]
-    last_nt = len(fasta_orig_chr)-1
-
-    # Compute keeped genomic regions for each diploid chromosome:
+def _compute_keeped_genomic_regions(svs, svs_infos, haploid):
+    """
+    Get list of all regions kept (not deleted)
+    :param svs: list of variants (deletions) {list}
+    :param svs_infos: infos of variants {dict}
+    :param haploid: is haploid {bool}
+    :return: regions kept for each individual {OrderedDict}, the last position of each region {OrderedDict}
+    """
    regions = OrderedDict()
    current_region_pointer = OrderedDict()
    for svs_i in svs:
@@ -203,7 +232,7 @@ for chr, svs_infos in genotypes_for_inds.items():
                regions[i] = {}
                current_region_pointer[i] = {}
            for j in range(0, 2 if not haploid else 1):  # For each chromosome of the diploid genome, or the chromosome
-                                                         # of the haploid genome
+                # of the haploid genome
            if j not in regions[i]:
                regions[i][j] = []
                current_region_pointer[i][j] = "0"
@@ -211,18 +240,31 @@
            regions[i][j].append([current_region_pointer[i][j], svs_infos[svs_i]["start"]])
            current_region_pointer[i][j] = svs_infos[svs_i]["end"]
        i += 1
+    return regions, current_region_pointer
 
-    # Build FASTA of each diploid/haploid chromosome:
+
+def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt):
+    """
+    Build fasta files
+    :param chrm: chromosome name {str}
+    :param regions: regions kept for each individual {OrderedDict}
+    :param current_region_pointer: last position of each kept region {OrderedDict}
+    :param output_dir: output directory {str}
+    :param fasta_orig_chr: original sequence of the chromosome {SeqRecord}
+    :param last_nt: last position of the chromosome {int}
+    :return: None
+    """
    for indiv, chrs_dips in regions.items():
        id_chrs = list(chrs_dips.keys())
        id_chrs = sorted(id_chrs)
        for id_chr in id_chrs:
-            chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome
-            logs_regions = [] # Store logs
+            chr_dip = chrs_dips[id_chr]  # SVs for each diploid chromosome
+            logs_regions = []  # Store logs
 
            # Build FASTA and store logs:
            try:
-                with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle:
+                with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"),
+                          "a") as output_handle:
                    fasta = ""
                    last_one = 0
                    for chr_region in chr_dip:
@@ -235,39 +277,103 @@ for chr, svs_infos in genotypes_for_inds.items():
                    SeqIO.write(fasta, output_handle, "fasta")
            except IOError:
                eprint("ERROR: unable to write \"{0}\" file.".
-                       format(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta")))
+                       format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta")))
                exit(1)
 
            # Write logs
            try:
-                with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"), "a") as log_handle:
-                    log_handle.write(chr + "_" + str(id_chr) + "\t")
+                with open(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log"), "a") as log_handle:
+                    log_handle.write(chrm + "_" + str(id_chr) + "\t")
                    log_handle.write("\n\t".join(logs_regions))
                    log_handle.write("\n")
            except IOError:
                eprint("ERROR: unable to write log file: \"{0}\"".
-                       format(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log")))
-
-print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
-
-# Generate reads for all individuals:
-if not haploid:
-    cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
-              "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
-              "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}")
-else:
-    cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
-              "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
-              "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}")
-for i in range(1, nb_inds+1):
-    prefix = os.path.join(output_dir, "INDIV_" + str(i))
-    chr0 = prefix + "_chr_0.fasta"
-    chr1 = prefix + "_chr_1.fasta"
+                       format(os.path.join(output_dir, "INDIV_" + str(indiv + 1) + ".regions.log")))
+
+
+def build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir):
+    print("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n")
+    fasta_orig = SeqIO.index(reference, "fasta")
+
+    for chrm, svs_infos in genotypes_for_inds.items():
+
+        print("PROCESSING CHROMOSOME {0}...\n".format(chrm))
+
+        svs = list(svs_infos.keys())
+        svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds))
+
+        fasta_orig_chr = fasta_orig[chrm]
+        last_nt = len(fasta_orig_chr)-1
+
+        # Compute kept genomic regions for each diploid chromosome:
+        regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid)
+
+        # Build FASTA of each diploid/haploid chromosome:
+        _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt)
+
+
+def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd,
+                           prg_path):
+
+    print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")
+
+    # Generate reads for all individuals:
    if not haploid:
-        os.system(cmd.format(prefix, chr0, chr1, args.coverage, prg_path + os.path.sep, args.read_len,
-                             args.insert_len_mean, args.insert_len_sd))
+        cmd = str("{4}pirs/pirs simulate -z -x {3} -d -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
+                  "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
+                  "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1} {2}")
    else:
-        os.system(cmd.format(prefix, chr0, "", args.coverage, prg_path + os.path.sep, args.read_len,
-                             args.insert_len_mean, args.insert_len_sd))
-
-print("DONE!\n")
+        cmd = str("{4}pirs/pirs simulate -z -x {3} -B {4}pirs/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz "
+                  "-I {4}pirs/Profiles/InDel_Profiles/phixv2.InDel.matrix -l {5} -m {6} -v {7} "
+                  "-G {4}pirs/Profiles/GC-depth_Profiles/humNew.gcdep_100.dat -o {0} {1}")
+    for i in range(1, nb_inds+1):
+        prefix = os.path.join(output_dir, "INDIV_" + str(i))
+        chr0 = prefix + "_chr_0.fasta"
+        chr1 = prefix + "_chr_1.fasta"
+        if not haploid:
+            os.system(cmd.format(prefix, chr0, chr1, coverage, prg_path + os.path.sep, read_len,
+                                 insert_len_mean, insert_len_sd))
+        else:
+            os.system(cmd.format(prefix, chr0, "", coverage, prg_path + os.path.sep, read_len,
+                                 insert_len_mean, insert_len_sd))
+
+
+def main():
+    args = parse_args()
+    reference = args.reference
+    sv_list = args.sv_list
+    output_dir = args.output_directory
+    tmp_dir = tempfile.mkdtemp()
+    haploid = args.haploid
+    nb_inds = args.nb_inds
+
+    if nb_inds < 2:
+        raise Exception("nb-inds must be at least 2")
+
+    if not os.path.isfile(reference + ".fai"):
+        os.system("samtools faidx " + reference)
+
+    prg_path = os.path.dirname(os.path.realpath(__file__))
+
+    if not os.path.isdir(output_dir):
+        os.mkdir(output_dir)
+
+    ################
+    # Launch steps #
+    ################
+
+    get_random_deletions(sv_list, reference, tmp_dir, prg_path)
+    bed_file = os.path.join(tmp_dir, "reference-sv.bed")
+    build_bed_deletions_file(bed_file, tmp_dir)
+    output_vcf = os.path.join(output_dir, "genotypes.vcf")
+    genotypes_for_inds = build_genotypes_vcf_list(bed_file, output_vcf, haploid, args.force_polymorphism, nb_inds, tmp_dir, prg_path)
+    build_fastas_chromosomes(reference, genotypes_for_inds, haploid, output_dir)
+    generate_samples_fastq(haploid, nb_inds, output_dir, args.coverage, args.read_len, args.insert_len_mean,
+                           args.insert_len_sd, prg_path)
+    print("DONE!\n")
+
+if __name__ == '__main__':
+    try:
+        sys.exit(main())
+    except Exception as e:
+        sys.exit(str(e))
\ No newline at end of file