diff --git a/build_pop.py b/build_pop.py index 1528dda4d57f5f8739f8ae92ac143170dd3f57c2..d1105cb549a2da7ecc97c30bd0d39cbd26a33ddb 100755 --- a/build_pop.py +++ b/build_pop.py @@ -6,6 +6,8 @@ import argparse from collections import OrderedDict import vcf from Bio import SeqIO +import tempfile +from lib.svrunner_utils import eprint parser = argparse.ArgumentParser(description='Generate simulated populations with SV') parser.add_argument("--nb-inds", help="Number of individuals to generate", required=True, type=int) @@ -13,7 +15,6 @@ parser.add_argument("--reference", help="Reference genome", required=True) parser.add_argument("--sv-list", help="File containing the SVs", required=True) parser.add_argument("--coverage", help="Coverage of reads (default: 15)", default=15, type=int) parser.add_argument("--output-directory", help="Output directory (default: res)", default="res") -parser.add_argument("--tmp-directory", help="Temporary directory (default: tmp)", default="tmp") parser.add_argument("--force-polymorphism", help="Force polymorphism for each SV", action='store_const', const=True, default=False) parser.add_argument("--haploid", help="Make a haploid genome, instead of diploid one", action="store_const", const=True, @@ -34,7 +35,7 @@ if nb_inds < 2: reference = args.reference sv_list = args.sv_list output_dir = args.output_directory -tmp_dir = args.tmp_directory +tmp_dir = tempfile.mkdtemp() haploid = args.haploid if not os.path.isfile(reference + ".fai"): @@ -73,8 +74,6 @@ def svsort(sv, chr): prg_path = os.path.dirname(os.path.realpath(__file__)) -if not os.path.isdir(tmp_dir): - os.mkdir(tmp_dir) if not os.path.isdir(output_dir): os.mkdir(output_dir) @@ -92,13 +91,20 @@ os.system(os.path.join(prg_path, "SVsim -W -i {0} -r {1} -o {2}{3}reference-sv". # 2. Build BED files of deletions # ################################### -with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed: - with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe: - for line in bedpe: - freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5 - parts = line.split("\t") - bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n") - +try: + with open(os.path.join(tmp_dir, "reference-sv.bed"), "w") as bed: + try: + with open(os.path.join(tmp_dir, "reference-sv.bedpe"), "r") as bedpe: + for line in bedpe: + freq = 0.2 if random.uniform(0,1) < 0.5 else 0.5 + parts = line.split("\t") + bed.write("\t".join([parts[0], parts[2], parts[4], parts[6].replace("::", "_"), str(freq)]) + "\n") + except IOError: + eprint("ERROR: SVSim failed to generate variants.") + exit(1) +except IOError: + eprint("ERROR: Unable to generate BED file \"{0}\".".format(os.path.join(tmp_dir, "reference-sv.bed"))) + exit(1) ############################################################################### # 3. Build VCF files containing genotypes for the given number of individuals # @@ -111,48 +117,60 @@ genotypes_for_inds = OrderedDict() # {Â chr: {Â id_indiv: {start: #start, end: #end, genotypes: [[0,1],[1,1],...], ...}, # ...} # Build VCF header: -with open(os.path.join(prg_path, "template.vcf"), "r") as template: - with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template: - for line in template: - if line[:6] == "#CHROM": - line = line.replace("\n", "") - for i in range(0, nb_inds): - line += "\tINDIV_" + str(i+1) - line += "\n" - my_template.write(line) - -with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed: - vcf_reader = vcf.Reader(filename=os.path.join(tmp_dir, 'template.vcf')) - output_vcf = os.path.join(output_dir, "genotypes.vcf") - vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader) - for line in bed: - parts = line.replace("\n", "").split("\t") - freq = float(parts[4]) - if parts[0] not in genotypes_for_inds: - genotypes_for_inds[parts[0]] = {} - genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []} - - # Get genotypes: - all_genotypes, genotypes = [], [] - if args.force_polymorphism: - polymorph = False - while not polymorph: +try: + with open(os.path.join(prg_path, "template.vcf"), "r") as template: + try: + with open(os.path.join(tmp_dir, "template.vcf"), "w") as my_template: + for line in template: + if line[:6] == "#CHROM": + line = line.replace("\n", "") + for i in range(0, nb_inds): + line += "\tINDIV_" + str(i+1) + line += "\n" + my_template.write(line) + except IOError: + eprint("ERROR: unable to create template file \"{0}\" in temp dir.".format(os.path.join(tmp_dir, "template.vcf"))) + exit(1) +except IOError: + eprint("ERROR: template file not found in program directory.") + exit(1) + +try: + with open(os.path.join(tmp_dir, "reference-sv.bed"), "r") as bed: + vcf_reader = vcf.Reader(filename=os.path.join(tmp_dir, 'template.vcf')) + output_vcf = os.path.join(output_dir, "genotypes.vcf") + vcf_writer = vcf.Writer(open(output_vcf, "w"), vcf_reader) + for line in bed: + parts = line.replace("\n", "").split("\t") + freq = float(parts[4]) + if parts[0] not in genotypes_for_inds: + genotypes_for_inds[parts[0]] = {} + genotypes_for_inds[parts[0]][parts[3]] = {"start": int(parts[1]), "end": int(parts[2]), "genotypes": []} + + # Get genotypes: + all_genotypes, genotypes = [], [] + if args.force_polymorphism: + polymorph = False + while not polymorph: + all_genotypes, genotypes = get_genotypes_for_inds() + polymorph = len(set(all_genotypes)) > 1 + else: all_genotypes, genotypes = get_genotypes_for_inds() - polymorph = len(set(all_genotypes)) > 1 - else: - all_genotypes, genotypes = get_genotypes_for_inds() - genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes] - - info = {"END": int(parts[2]), "AF": freq} - vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) - vcf_writer.write_record(vcf_record) - vcf_writer.close() - - # Bgzip + tabix: - os.system("bgzip -c " + output_vcf + " > " + output_vcf + ".gz") - os.unlink(output_vcf) - output_vcf += ".gz" + genotypes_for_inds[parts[0]][parts[3]]["genotypes"] = [x.split("/") for x in all_genotypes] + + info = {"END": int(parts[2]), "AF": freq} + vcf_record = vcf.model._Record(parts[0], int(parts[1]), parts[3], "N", [vcf.model._SV("DEL")], ".", ".", info, "GT", [0], genotypes) + vcf_writer.write_record(vcf_record) + vcf_writer.close() + + # Bgzip + tabix: + os.system("bgzip -c " + output_vcf + " > " + output_vcf + ".gz") + os.unlink(output_vcf) + output_vcf += ".gz" os.system("tabix -p vcf " + output_vcf) +except IOError: + eprint("ERROR: Unable to load bed file \"{0}\": file not found.".format(os.path.join(tmp_dir, "reference-sv.bed"))) + exit(1) @@ -203,23 +221,32 @@ for chr, svs_infos in genotypes_for_inds.items(): logs_regions = [] # Store logs # Build FASTA and store logs: - with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle: - fasta = "" - last_one = 0 - for chr_region in chr_dip: - fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])] - logs_regions.append("\t".join(str(x) for x in chr_region)) - last_one = int(chr_region[1]) - if last_one < last_nt: - logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) - fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt] - SeqIO.write(fasta, output_handle, "fasta") + try: + with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"), "a") as output_handle: + fasta = "" + last_one = 0 + for chr_region in chr_dip: + fasta += fasta_orig_chr[int(chr_region[0]):int(chr_region[1])] + logs_regions.append("\t".join(str(x) for x in chr_region)) + last_one = int(chr_region[1]) + if last_one < last_nt: + logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) + fasta += fasta_orig_chr[int(current_region_pointer[indiv][id_chr]):last_nt] + SeqIO.write(fasta, output_handle, "fasta") + except IOError: + eprint("ERROR: unable to write \"{0}\" file.". + format(os.path.join(output_dir, "INDIV_" + str(indiv+1) + "_chr_" + str(id_chr) + ".fasta"))) + exit(1) # Write logs - with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"), "a") as log_handle: - log_handle.write(chr + "_" + str(id_chr) + "\t") - log_handle.write("\n\t".join(logs_regions)) - log_handle.write("\n") + try: + with open(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"), "a") as log_handle: + log_handle.write(chr + "_" + str(id_chr) + "\t") + log_handle.write("\n\t".join(logs_regions)) + log_handle.write("\n") + except IOError: + eprint("ERROR: unable to write log file: \"{0}\"". + format(os.path.join(output_dir, "INDIV_" + str(indiv+1) + ".regions.log"))) print("GENERATE RANDOM READS FOR EACH INDIVIDUAL FROM GENOME...\n")