diff --git a/build_pop.py b/build_pop.py index 8cfb3bc6d7b36489a34d0d9dd5ef7991ac71813c..3dfbb664b54cd19aab890ed16ac77612bfba3c4a 100755 --- a/build_pop.py +++ b/build_pop.py @@ -25,6 +25,7 @@ def parse_args(): formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-n", "--nb-inds", help="Number of individuals to generate", required=True, type=int) parser.add_argument("-r", "--reference", help="Reference genome", required=True) + parser.add_argument("-ns", "--nstretches", help="N-stretches positions bed file") parser.add_argument("-s", "--sv-list", help="File containing the SVs", required=False) parser.add_argument("-c", "--coverage", help="Coverage of reads", default=15, type=int) parser.add_argument("-o", "--output-directory", help="Output directory", default="res") @@ -317,7 +318,7 @@ def confirm(deletions: dict, variants: dict): return input("Continue [Y/n]? ") in ["y", "Y", ""] -def init(output_dir, force_outputdir, sv_list, nb_inds, reference, proba_del, haploid, force_polymorphism, +def init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, quiet=True): if os.path.isdir(output_dir): if force_outputdir: @@ -352,7 +353,7 @@ def init(output_dir, force_outputdir, sv_list, nb_inds, reference, proba_del, ha print("GENERATE RANDOM DELETIONS VARIANTS...\n") try: - sv_sim = VariantsSimulator(sv_list, threads) + sv_sim = VariantsSimulator(sv_list, nstretches, threads) deletions = sv_sim.get_random_deletions(proba_del, reference) print("") except InputException as e: @@ -373,6 +374,7 @@ def main(): args = parse_args() reference = args.reference sv_list = args.sv_list + nstretches = args.nstretches output_dir = args.output_directory haploid = args.haploid nb_inds = args.nb_inds @@ -386,7 +388,7 @@ def main(): insert_len_sd = args.insert_len_sd quiet = args.quiet - init(output_dir, force_outputdir, sv_list, nb_inds, reference, proba_del, haploid, force_polymorphism, + init(output_dir, force_outputdir, sv_list, nstretches, nb_inds, reference, proba_del, haploid, force_polymorphism, coverage, read_len, insert_len_mean, insert_len_sd, threads, quiet) if __name__ == '__main__': diff --git a/variants_simulator.py b/variants_simulator.py index aa19cb44735876de7cc4f1df79e8659110b0f74c..27cc7a29411f48b681ad6e57c0dd520403b4fe25 100755 --- a/variants_simulator.py +++ b/variants_simulator.py @@ -12,17 +12,45 @@ from joblib import Parallel, delayed import multiprocessing -class VariantsSimulator: +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + - def __init__(self, sv_list=None, threads=-1): +class VariantsSimulator: + def __init__(self, sv_list: str = None, nstretches: str = None, threads: int = -1): if sv_list is None: sv_list = os.path.join(os.path.dirname(os.path.realpath(__file__)), "defaults.rules") + self.min_sizes = {} self.variants = self.__load_variants(sv_list) + self.nstretches = {} + if nstretches is not None: + self.nstretches = self.__load_nstretches(nstretches) self.deletions = {} self.threads = threads if threads > -1 else multiprocessing.cpu_count() @staticmethod - def __load_variants(sv_list): + def __print_color(color, message, header=None): + if header is not None: + header += "\n" + print(header + color + message + bcolors.ENDC) + + def print_err(self, message, header=None): + self.__print_color(bcolors.FAIL, message, header) + + def print_ok(self, message, header=None): + self.__print_color(bcolors.OKGREEN, message, header) + + def print_warn(self, message, header=None): + self.__print_color(bcolors.WARNING, message, header) + + def __load_variants(self, sv_list): svs = {} allowed_sv = {"DEL", "INV", "DUP"} with open(sv_list, "r") as sv_list_f: @@ -45,11 +73,13 @@ class VariantsSimulator: raise InputException("Error while reading variant: proba value must be float") if type_sv not in svs: svs[type_sv] = [] + self.min_sizes[type_sv] = min_s svs[type_sv].append({ "min": min_s, "max": max_s, "proba": proba }) + self.min_sizes[type_sv] = min(self.min_sizes[type_sv], min_s) # Only deletions are supported for now: if "DEL" not in svs: @@ -68,6 +98,23 @@ class VariantsSimulator: return svs + @staticmethod + def __load_nstretches(ns_file): + stretches = {} + with open(ns_file, 'r') as stretches_f: + for stretch in stretches_f: + parts = stretch.strip("\n").split("\t") + chrm = parts[0] + start = int(parts[1]) + end = int(parts[2]) + length = int(parts[4]) + if chrm not in stretches: + stretches[chrm] = [] + stretches[chrm].append([start, end, length]) + for chrm in stretches: + stretches[chrm].sort(key=lambda x: x[0]) + return stretches + def _get_deletion_size(self): r = random.uniform(0, 1) for sizes in self.variants["DEL"]: @@ -75,6 +122,98 @@ class VariantsSimulator: break return random.randint(sizes["min"], sizes["max"] + 1) + def __get_max_interval(self, start, end, chrm, pos): + my_stretches = [] + for c_nstretches in self.nstretches[chrm][pos:]: + if (start - 50 < c_nstretches[1] <= end + 50) or (start - 50 <= c_nstretches[0] < end + 50): + my_stretches.append([c_nstretches[0] - 50, c_nstretches[1] + 50]) + else: + break + intervals = [] + my_start = start + for my_stretch in my_stretches: + if my_stretch[0] > my_start: + intervals.append([my_start, my_stretch[0]]) + my_start = min(my_stretch[1], end) + if my_start < end: + intervals.append([my_start, end]) + intervals.sort(key=lambda x: x[0]-x[1]) + return intervals[0] + + + def _get_far_from_nstretches(self, chrm, start, end, chr_len, pos=0): + """ + Get if a deletion is far from N stretches + :param chrm: the chromosome + :param start: start position of the deletion + :param end: end position of the deletion + :param pos: position to start in stretches list + :return: three values: + - True/False: True if far from nstretches (after a resize if needed) + - True/False: True if deletion size has changed + - True/False: True if deletion position changed (at least start) + - Int: new start of the deletion + - Int: new end of the deletion + - Pos: pos to restart (-1 if no need to restart) + """ + if chrm in self.nstretches: + for c_nstretches in self.nstretches[chrm][pos:]: + pos += 1 + if c_nstretches[0] <= start < c_nstretches[1]: + # --------(n_start)------(start)----------(n_end)---------(end)----- + # or + # --------(n_start)------(start)----------(end)---------(n_end)----- + if end > c_nstretches[1] + 50: + # --------(n_start)------(start)----------(n_end)----{>50bp}----(end)----- + new_start = c_nstretches[1] + 50 + length = end - new_start + if length >= self.min_sizes["DEL"]: + return True, True, True, new_start, end, pos + # Else, deletion is too much close to the nstretch (or included inside), and/or can't be resized + return False, False, False, start, end, -1 + + elif start < c_nstretches[0] < end and start < c_nstretches[1] < end: + # ------------(start)--------(n_start)--------(n_end)-------(end)--- + # Find best interval: + new_start, new_end = self.__get_max_interval(start, end, chrm, pos-1) + length = new_end - new_start + if length > 0 and length >= self.min_sizes["DEL"]: + return True, True, True, new_start, new_end, -1 + # Else, deletion can't be rezised + return False, False, False, start, end, -1 + + elif start < c_nstretches[0] < end or c_nstretches[0] > end > c_nstretches[0] - 50: + # FIRST CASE: + # ------------(start)--------(n_start)--------(end)-------(n_end)--- + # SECOND CASE: + # ------------(start)--------(end)---{<50bp}---(n_start)------------ + new_end = c_nstretches[0] - 50 + length = new_end - start + if length > 0 and length >= self.min_sizes["DEL"]: + return True, True, True, start, new_end, pos + # Else, deletion can't be rezised + return False, False, False, start, end, -1 + + elif c_nstretches[1] < start < c_nstretches[1] + 50: + # ------------(n_end)---{<50bp}---(start)-------------------(end)--- + new_start = c_nstretches[1] + 50 + new_end = end + (new_start - start) + resize = False + move = True + if new_end > chr_len: + move = False + resize = True + new_end = chr_len + length = new_end - new_start + if length < 0 or length < self.min_sizes["DEL"]: + # Unable to move / resize + return False, False, False, start, end, -1 + return True, resize, move, new_start, new_end, pos + + elif c_nstretches[0] > end + 50: + break + return True, False, False, start, end, -1 + def _get_random_deletions_by_chr(self, chrm, chr_length, deletions, proba_deletion): print("Generating for chromosome %s..." % chrm) nb_on_ch = 0 @@ -88,15 +227,45 @@ class VariantsSimulator: nb_dels += 1 nb_on_ch += 1 del_size = self._get_deletion_size() - freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5 - deletions[chrm].append({ - "name": "DEL{0}_{1}".format(chrm, nb_on_ch), - "start": i + 1, - "end": i + 1 + del_size, - "length": del_size, - "freq": freq - }) - i += del_size + 1 + start = i + 1 + end = i + 1 + del_size + print_message_header = "One deletion found ({0}:{1}-{2} ; size: {3})...".format(chrm, start, end, + del_size) + pos = 0 + keep_deletion = True + has_resize = False + has_move = False + while keep_deletion and pos >= 0: + keep_deletion, with_resize, with_move, start, end, pos = self._get_far_from_nstretches(chrm, start, + end, len_chr, + pos) + if with_resize: + del_size = end - start + has_resize = True + elif with_move: + has_move = True + + if keep_deletion: + if has_resize: + self.print_warn("N-stretch filter: resized ({0}:{1}-{2}, new size:{3})". + format(chrm, start, end, del_size), print_message_header) + elif has_move: + self.print_warn("N-stretch filter: moved ({0}:{1}-{2})".format(chrm, start, end), + print_message_header) + else: + self.print_ok("N-stretch filter: OK", print_message_header) + i = start + freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5 + deletions[chrm].append({ + "name": "DEL{0}_{1}".format(chrm, nb_on_ch), + "start": start, + "end": end, + "length": del_size, + "freq": freq + }) + i += del_size + else: + self.print_err("N-stretch filter: removed", print_message_header) i += 1 return deletions @@ -136,5 +305,6 @@ def main(): except InputException as e: print(e) + if __name__ == '__main__': - sys.exit(main()) \ No newline at end of file + sys.exit(main())