diff --git a/build_pop.py b/build_pop.py index 8cb345c7146b2f0ce370c579997cf361452871af..364709523562da96c8a5190a733281e630eee5af 100755 --- a/build_pop.py +++ b/build_pop.py @@ -77,9 +77,9 @@ def parse_args(): type=int, default=30) parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead", action="store_const", const=True, default=False) - parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=1, + parser.add_argument("-md", "--min-deletions", help="Minimum of deletions to generate (>=1)", default=0, type=check_min_size) - parser.add_argument("-mi", "--min-inversions", help="Minimum of inversions to generate (>=1)", default=1, + parser.add_argument("-mi", "--min-inversions", help="Minimum of inversions to generate (>=1)", default=0, type=check_min_size) parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int) parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals") @@ -309,7 +309,7 @@ def _compute_keeped_genomic_regions(svs, svs_infos, haploid): return regions, current_region_pointer -def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt): +def _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fasta_orig_chr, last_nt): """ Build fasta files :param chrm: @@ -321,32 +321,28 @@ def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_ :return: """ n = 0 + fastas = [] for indiv, chrs_dips in regions.items(): id_chrs = list(chrs_dips.keys()) id_chrs = sorted(id_chrs) c = 0 + fastas.append([]) for id_chr in id_chrs: chr_dip = chrs_dips[id_chr] # SVs for each diploid chromosome logs_regions = [] # Store logs # Build FASTA and store logs: - try: - with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"), - "a") as output_handle: - fasta = "" - last_one = 0 - for chr_region in chr_dip: - fasta += fasta_orig_chr[n][c][int(chr_region[0]):int(chr_region[1])] - logs_regions.append("\t".join(str(x) for x in chr_region)) - last_one = int(chr_region[1]) - if last_one < last_nt: - logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) - fasta += fasta_orig_chr[n][c][int(current_region_pointer[indiv][id_chr]):last_nt] - SeqIO.write(fasta, output_handle, "fasta") - except IOError: - raise ExecException("ERROR: unable to write \"{0}\" file.". - format(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + - ".fasta"))) + + fasta = "" + last_one = 0 + for chr_region in chr_dip: + fasta += fasta_orig_chr[n][c][int(chr_region[0]):int(chr_region[1])] + logs_regions.append("\t".join(str(x) for x in chr_region)) + last_one = int(chr_region[1]) + if last_one < last_nt: + logs_regions.append("\t".join([str(current_region_pointer[indiv][id_chr]), str(last_nt)])) + fasta += fasta_orig_chr[n][c][int(current_region_pointer[indiv][id_chr]):last_nt] + fastas[n].append(fasta) # Write logs try: @@ -359,6 +355,7 @@ def _build_fastas(chrm, regions, current_region_pointer, output_dir, fasta_orig_ format(os.path.join(output_dir, "indiv" + str(indiv + 1) + ".regions.log"))) c += 1 n += 1 + return fastas def _build_fastas_inversions(fasta_orig_chrm: dict, genotypes_for_inds: dict, nb_inds, haploid): @@ -372,12 +369,31 @@ def _build_fastas_inversions(fasta_orig_chrm: dict, genotypes_for_inds: dict, nb if genotype[k] == "1": start = int(props["start"]) end = int(props["end"]) - fastas[i][k] = fastas[i][k][:start] + str(Seq(fastas[i][k][start:end]).reverse_complement()) + \ + o_id = fastas[i][k].id + o_name = fastas[i][k].name + o_description = fastas[i][k].description + fastas[i][k] = fastas[i][k][:start] + fastas[i][k][start:end].reverse_complement() + \ fastas[i][k][end:] + fastas[i][k].id = o_id + fastas[i][k].name = o_name + fastas[i][k].description = o_description i += 1 return fastas +def _write_fastas(fastas, output_dir): + for indiv, chrs_dips in zip(range(0, len(fastas)), fastas): + for id_chr, fasta in zip(range(0, len(chrs_dips)), chrs_dips): + try: + with open(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + ".fasta"), + "a") as output_handle: + SeqIO.write(fasta, output_handle, "fasta") + except IOError: + raise ExecException("ERROR: unable to write \"{0}\" file.". + format(os.path.join(output_dir, "indiv" + str(indiv + 1) + "_chr_" + str(id_chr) + + ".fasta"))) + + def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_inds_INV, haploid, output_dir, nb_inds, printer): printer.out("BUILD FASTA GENOME FOR EACH INDIVIDUAL...\n") @@ -386,7 +402,7 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in # First, inversions (does not change coordinates): printer.out("STARTING WITH INVERSIONS...") fastas_by_chrm = {} - for chrm, svs_infos in genotypes_for_inds_DEL.items(): + for chrm, svs_infos in genotypes_for_inds_INV.items(): printer.out("PROCESSING CHROMOSOME {0}...\n".format(chrm)) svs = list(svs_infos.keys()) fastas_by_chrm[chrm] = _build_fastas_inversions(fasta_orig[chrm], genotypes_for_inds_INV[chrm], nb_inds, haploid) @@ -399,14 +415,24 @@ def build_fastas_chromosomes(reference, genotypes_for_inds_DEL, genotypes_for_in svs = list(svs_infos.keys()) svs = sorted(svs, key=lambda x:_svsort(x, chrm, genotypes_for_inds_DEL)) - fastas_orig_chr = fastas_by_chrm[chrm] + if chrm in fastas_by_chrm: + fastas_orig_chr = fastas_by_chrm[chrm] + else: # Build data as _build_fastas_inversions does + fastas_orig_chr = [] + for i in range(0, nb_inds): + fastas_orig_chr.append([fasta_orig[chrm]] if haploid else [fasta_orig[chrm], fasta_orig[chrm]]) last_nt = len(fastas_orig_chr[0][0])-1 # Compute keeped genomic regions for each diploid chromosome: regions, current_region_pointer = _compute_keeped_genomic_regions(svs, svs_infos, haploid) # Build FASTA of each diploid/haploid chromosome: - _build_fastas(chrm, regions, current_region_pointer, output_dir, fastas_orig_chr, last_nt) + fastas_by_chrm[chrm] = _build_fastas_deletions(chrm, regions, current_region_pointer, output_dir, fastas_orig_chr, last_nt) + + printer.out("WRITING FASTAS...") + for chrm, fastas in fastas_by_chrm.items(): + _write_fastas(fastas, output_dir) + printer.out("") def generate_samples_fastq(haploid, nb_inds, output_dir, coverage, read_len, insert_len_mean, insert_len_sd, @@ -452,12 +478,13 @@ def confirm(deletions: dict, inversions: dict, variants: dict, printer: Print): variants_del_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 break printer.out("We generate {0} deletion variants.".format(nb_dels)) - printer.out("Ranges:") - for v_range, v_count in variants_del_counts.items(): - printer.out("\t- Range {0}: {1}".format(v_range, v_count)) - printer.out("Chromosomes:") - for chrm in sorted(list(variants_del_counts_chr.keys())): - printer.out("\t- {0}: {1}".format(chrm, variants_del_counts_chr[chrm])) + if nb_dels > 0: + printer.out("Ranges:") + for v_range, v_count in variants_del_counts.items(): + printer.out("\t- Range {0}: {1}".format(v_range, v_count)) + printer.out("Chromosomes:") + for chrm in sorted(list(variants_del_counts_chr.keys())): + printer.out("\t- {0}: {1}".format(chrm, variants_del_counts_chr[chrm])) printer.out("") # inversions: @@ -478,12 +505,13 @@ def confirm(deletions: dict, inversions: dict, variants: dict, printer: Print): variants_inv_counts["{0}-{1}".format(variant["min"], variant["max"])] += 1 break printer.out("We generate {0} inversion variants.".format(nb_invs)) - printer.out("Ranges:") - for v_range, v_count in variants_inv_counts.items(): - printer.out("\t- Range {0}: {1}".format(v_range, v_count)) - printer.out("Chromosomes:") - for chrm in sorted(list(variants_inv_counts_chr.keys())): - printer.out("\t- {0}: {1}".format(chrm, variants_inv_counts_chr[chrm])) + if nb_invs > 0: + printer.out("Ranges:") + for v_range, v_count in variants_inv_counts.items(): + printer.out("\t- Range {0}: {1}".format(v_range, v_count)) + printer.out("Chromosomes:") + for chrm in sorted(list(variants_inv_counts_chr.keys())): + printer.out("\t- {0}: {1}".format(chrm, variants_inv_counts_chr[chrm])) printer.out("") # Confirm: @@ -535,13 +563,13 @@ def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, p if genotypes is None: printer.out("GENERATE RANDOM DELETIONS VARIANTS...\n") try: - nb_deletions = 0 - nb_inversions = 0 + nb_deletions = -1 # To allow 0 deletions + nb_inversions = -1 # To allow 0 inversions nb_try = 0 sv_sim = VariantsSimulator(sv_list, nstretches, threads, stdout, stderr) - printer.out("Try to generate at least %s deletions..." % min_deletions) + printer.out("Try to generate at least %s deletions and %s inversions..." % (min_deletions, min_inversions)) max_try = 10 - while nb_deletions < min_deletions and nb_try < max_try: + while (nb_deletions < min_deletions or nb_inversions < min_inversions) and nb_try < max_try: printer.out("\nTry {0} / {1}...".format(nb_try + 1, max_try)) nb_deletions, deletions, nb_inversions, inversions = sv_sim.get_random_variants(proba_del, proba_inv, reference)