diff --git a/build_pop.py b/build_pop.py
index f763d7066471e547256e239db45460572fed136e..4b7126b8ef7c6ae249229ef8af25956cf3ee3d8d 100755
--- a/build_pop.py
+++ b/build_pop.py
@@ -13,7 +13,7 @@ from lib.svrunner_utils import eprint
 from pysam import tabix_compress, tabix_index
 from variants_simulator import VariantsSimulator
 from exceptions import InputException
-
+import multiprocessing
 
 def parse_args():
     """
@@ -41,7 +41,7 @@ def parse_args():
                         type=int, default=30)
     parser.add_argument("-q", "--quiet", help="Don't ask anything, choose default answer instead",
                         action="store_const", const=True, default=False)
-    parser.add_argument("-t", "--threads", help="Number of threads", default=4)
+    parser.add_argument("-t", "--threads", help="Number of threads", default=multiprocessing.cpu_count(), type=int)
     args = parser.parse_args()
     return args
 
@@ -350,8 +350,9 @@ def main():
 
     print("GENERATE RANDOM DELETIONS VARIANTS...\n")
     try:
-        sv_sim = VariantsSimulator(sv_list)
+        sv_sim = VariantsSimulator(sv_list, args.threads)
         deletions = sv_sim.get_random_deletions(args.proba_del, args.reference)
+        print("")
     except InputException as e:
         print(e)
         return False
diff --git a/variants_simulator.py b/variants_simulator.py
index 0426ddce41e74cc275591fd02a235c250700ef5d..3cc87cf4eaaef1ac69827fb4c15c21d3c98b61ed 100755
--- a/variants_simulator.py
+++ b/variants_simulator.py
@@ -4,18 +4,26 @@ import os
 import sys
 import re
 import random
+
 from exceptions import InputException
 from lib.svrunner_utils import eprint
 from Bio import SeqIO
+from joblib import Parallel, delayed
+import multiprocessing
+
+
+def variantssimulator_get_random_deletions_vy_chr_parallel(*args):
+    return VariantsSimulator.get_random_deletions_by_chr(*args)
 
 
 class VariantsSimulator:
 
-    def __init__(self, sv_list=None):
+    def __init__(self, sv_list=None, threads=-1):
         if sv_list is None:
             sv_list = os.path.join(os.path.dirname(os.path.realpath(__file__)), "defaults.rules")
         self.variants = self.__load_variants(sv_list)
         self.deletions = {}
+        self.threads = threads if threads > -1 else multiprocessing.cpu_count()
 
     @staticmethod
     def __load_variants(sv_list):
@@ -64,41 +72,50 @@ class VariantsSimulator:
 
         return svs
 
-    def _get_deletion_size(self):
+    @staticmethod
+    def get_deletion_size(variants):
         r = random.uniform(0, 1)
-        for sizes in self.variants["DEL"]:
+        for sizes in variants["DEL"]:
             if r < sizes["proba_cumul"]:
                 break
         return random.randint(sizes["min"], sizes["max"] + 1)
 
+    @staticmethod
+    def get_random_deletions_by_chr(chrm, chr_length, deletions, proba_deletion, variants):
+        print("Generating for chromosome %s..."
+              % chrm)
+        nb_on_ch = 0
+        nb_dels = 0
+        i = 0
+        len_chr = chr_length[chrm]
+        deletions[chrm] = []
+        while i < len_chr:
+            r = random.uniform(0, 1)
+            if r <= proba_deletion:
+                nb_dels += 1
+                nb_on_ch += 1
+                del_size = VariantsSimulator.get_deletion_size(variants)
+                freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5
+                deletions[chrm].append({
+                    "name": "DEL{0}_{1}".format(chrm, nb_on_ch),
+                    "start": i + 1,
+                    "end": i + 1 + del_size,
+                    "length": del_size,
+                    "freq": freq
+                })
+                i += del_size + 1
+            i += 1
+        return deletions
+
     def get_random_deletions(self, proba_deletion, reference_genome):
         fasta_index = SeqIO.index(reference_genome, "fasta")
         deletions = {}
-        nb_total = 0
+        chr_length = {}
         for chrm in fasta_index:
-            nb_on_ch = 0
-            nb_dels = 0
-            i = 0
-            len_chr = len(fasta_index[chrm].seq)
-            while i < len_chr:
-                r = random.uniform(0, 1)
-                if r <= proba_deletion:
-                    nb_dels += 1
-                    nb_total += 1
-                    nb_on_ch += 1
-                    del_size = self._get_deletion_size()
-                    freq = 0.2 if random.uniform(0, 1) < 0.5 else 0.5
-                    if chrm not in deletions:
-                        deletions[chrm] = []
-                    deletions[chrm].append({
-                        "name": "DEL{0}_{1}_{2}".format("%05d" % nb_total, chrm, nb_on_ch),
-                        "start": i+1,
-                        "end": i+1+del_size,
-                        "length": del_size,
-                        "freq": freq
-                    })
-                    i += del_size + 1
-                i += 1
+            chr_length[str(chrm)] = len(fasta_index[chrm].seq)
+        results = Parallel(n_jobs=self.threads)(delayed(variantssimulator_get_random_deletions_vy_chr_parallel)
+                                                (str(chrm), chr_length, deletions, proba_deletion, self.variants) for chrm in fasta_index)
+        for result in results:
+            deletions.update(result)
         self.deletions = deletions
         return deletions
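
Note on the parallelisation pattern in the patch: `get_random_deletions()` now collects the chromosome lengths up front, fans the per-chromosome work out with `joblib.Parallel`/`delayed` through the module-level wrapper (presumably kept at module level so the delayed callable pickles cleanly for the worker processes), and merges the per-worker dicts afterwards. Below is a minimal, self-contained sketch of that fan-out-and-merge pattern under the same joblib API; the `work()` helper and the chromosome names/lengths are made up for illustration and are not part of the patch.

```python
from joblib import Parallel, delayed


def work(chrm, chr_length):
    # Stand-in for get_random_deletions_by_chr(): each worker receives its own
    # pickled copy of the arguments and returns a small per-chromosome dict.
    return {chrm: chr_length[chrm]}


chr_length = {"chr1": 1000, "chr2": 500}  # hypothetical chromosome lengths

# One delayed call per chromosome, executed across worker processes.
results = Parallel(n_jobs=2)(delayed(work)(c, chr_length) for c in chr_length)

# Merge the per-worker dicts, mirroring the deletions.update(result) loop above.
merged = {}
for result in results:
    merged.update(result)
print(merged)  # expected: {'chr1': 1000, 'chr2': 500}
```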