From afbac7cc794fbf1194dbf07628ba41c10d120500 Mon Sep 17 00:00:00 2001 From: Floreal Cabanettes <floreal.cabanettes@inra.fr> Date: Thu, 7 Jun 2018 11:17:43 +0200 Subject: [PATCH] Add detection workflow to the simulation pipeline --- cnvpipelines.py | 62 ++++++++++++++++++++++++--- snakecnv/align.snk | 2 +- snakecnv/{Snakefile => detection.snk} | 31 +++++++------- snakecnv/lib | 2 +- snakecnv/mergebatches.snk | 2 +- snakecnv/popsim.snk | 15 ++++++- snakecnv/referencebundle.snk | 4 +- 7 files changed, 91 insertions(+), 27 deletions(-) rename snakecnv/{Snakefile => detection.snk} (91%) diff --git a/cnvpipelines.py b/cnvpipelines.py index b0e8fc5..3869e55 100755 --- a/cnvpipelines.py +++ b/cnvpipelines.py @@ -209,7 +209,7 @@ class CnvPipeline: def _get_snakefile(self): try: if self.wf_type == "detection": - snakefile = "Snakefile" + snakefile = "detection.snk" elif self.wf_type == "refbundle": snakefile = "referencebundle.snk" elif self.wf_type == "align": @@ -567,7 +567,7 @@ class CnvPipeline: config = { "wdir": self.wdir, - "genome": reference, + "reference": reference, "tools": tools, # "sample_file": final_sample_file, "batches": batches, @@ -625,7 +625,7 @@ class CnvPipeline: config = { "wdir": self.wdir, - "genome": reference, + "reference": reference, "species": species, "readlength": read_length, "maxNstretches": max_n_stretches @@ -728,6 +728,11 @@ class CnvPipeline: self._fail(str(e)) def run_mergebatches(self, **kwargs): + """ + Run mergebatches workflow + :param kwargs: + :return: + """ sample_files = glob(os.path.join(self.wdir, "samples.list.*")) sample_files.sort(key=lambda x: int(x.rsplit(".", 1)[1])) sample_file_final = os.path.join(self.wdir, "samples.list") @@ -750,11 +755,21 @@ class CnvPipeline: @staticmethod def _link(source, dest): + """ + Link file if dest file does not exists + :param source: + :param dest: + """ if not os.path.exists(dest): os.symlink(source, dest) @staticmethod def _create_samples_fastq_simulaton(nb_inds, smple_file): + """ + Create samples file for align workflow of simulation pipeline + :param nb_inds: number of individuals generated for simulation + :param smple_file: path of the sample file + """ samples = {} for i in range(1, nb_inds+1): samples["indiv%d" % i] = { @@ -764,9 +779,20 @@ class CnvPipeline: with open(smple_file, "w") as samples_f: yaml.dump(samples, samples_f, default_flow_style=False) + @staticmethod + def _create_samples_detection_simulation(nb_inds, smple_file): + """ + Create samples file for detection workflow of simulation pipeline + :param nb_inds: number of individuals generated for simulation + :param smple_file: path of the sample file + """ + with open(smple_file, "w") as samples_f: + for i in range(1, nb_inds + 1): + samples_f.write("indiv%d\n" % i) + def run_simulation(self, nb_inds, reference, nstretches, sv_list, coverage, force_polymorphism, haploid, proba_del, proba_inv, read_len, insert_len_mean, insert_len_sd, min_deletions, - min_inversions, max_try, genotypes, force_wdir=False, **kwargs): + min_inversions, max_try, genotypes, tools, force_wdir=False, **kwargs): """ Run simulation :param nb_inds: @@ -785,6 +811,7 @@ class CnvPipeline: :param min_inversions: :param max_try: :param genotypes: + :param tools: :param kwargs: :return: """ @@ -804,6 +831,12 @@ class CnvPipeline: if not os.path.isfile(genotypes): raise ValueError("Genotypes file does not exists") genotypes = os.path.abspath(genotypes) + if proba_del < 0: + raise ValueError("proba_del must be positive or null") + if proba_inv < 0: + raise ValueError("proba_inv must be positive or null") + if proba_inv == 0 and proba_del == 0: + raise ValueError("At least one proba of variant (del or inv) must be positive") self._check_wdir(force_wdir) @@ -812,7 +845,15 @@ class CnvPipeline: # Create sample file for fastq files: samples_file_fq = os.path.join(self.wdir, "samples.yml") + samples_file_d = os.path.join(self.wdir, "samples.list") self._create_samples_fastq_simulaton(nb_inds, samples_file_fq) + self._create_samples_detection_simulation(nb_inds, samples_file_d) + + variant_types = [] + if proba_del > 0: + variant_types.append("DEL") + if proba_inv > 0: + variant_types.append("INV") config = { "wdir": self.wdir, @@ -829,7 +870,15 @@ class CnvPipeline: "min_deletions": min_deletions, "min_inversions": min_inversions, "max_try": max_try, - "sample_file_align": samples_file_fq + "sample_file_align": samples_file_fq, + "tools": tools, + "sample_file": samples_file_d, + "batches": -1, + "parallel_batches": -1, + "start_batch_nb": 1, + "chromosomes": "all", + "varianttypes": variant_types, + "no_detection_index": True } nstretches_final = None @@ -1111,7 +1160,8 @@ if __name__ == "__main__": type=check_min_size_0) simul_parser.add_argument("--max-try", help="Maximum of tries", default=10, type=check_min_size_0) simul_parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals") - + simul_parser.add_argument('-t', '--tools', type=str, required=True, help="Tools to launch, coma separated", + nargs="+", choices=TOOLS) add_run_and_rerun_options(simul_parser) simul_parser.add_argument('-w', '--wdir', type=str, required=True, help="Output folder where data will be stored") diff --git a/snakecnv/align.snk b/snakecnv/align.snk index eaba3ab..b0ce10a 100644 --- a/snakecnv/align.snk +++ b/snakecnv/align.snk @@ -64,7 +64,7 @@ rule bamindex: bai = "{folder}/{sample}.bam.bai", summary = "{folder}/{sample}.bam.summary" wildcard_constraints: - folder = "\w+" + folder = "(bams|bwa)" threads: 1 log: diff --git a/snakecnv/Snakefile b/snakecnv/detection.snk similarity index 91% rename from snakecnv/Snakefile rename to snakecnv/detection.snk index 92c02c7..f34a06c 100644 --- a/snakecnv/Snakefile +++ b/snakecnv/detection.snk @@ -60,9 +60,9 @@ def valid_chromosome(chrom, regexp_chrom): def get_filterbed(config): if "filterbed" in config and os.isfile(config['filterbed']): return config['filterbed'] - elif (config['genome'].endswith("fasta") - and os.path.exists(config['genome'][:-6] + ".Nstretch.bed")): - return config['genome'][:-6] + ".Nstretch.bed" + elif (config['reference'].endswith("fasta") + and os.path.exists(config['reference'][:-6] + ".Nstretch.bed")): + return config['reference'][:-6] + ".Nstretch.bed" else: return None @@ -81,7 +81,7 @@ message("Starting") FILTERBED = get_filterbed(config) EXCLUDEBED = get_excluded(config) -REFERENCE = config['genome'] +REFERENCE = config['reference'] WDIR = config['wdir'] workdir: WDIR @@ -119,7 +119,7 @@ for tool in tools: else: snakemake_utils.set_tool_svtypes(tool, varianttypes) -localrules: all, gendermap, gbamlist, bamlist +localrules: all_detection, gendermap, gbamlist, bamlist def set_all_inputs(wildcards): @@ -135,7 +135,7 @@ def set_all_inputs(wildcards): return inputs -rule all: +rule all_detection: input: set_all_inputs @@ -195,15 +195,16 @@ rule parsing: "-o {output.parseoutput} -t {wildcards.tool} -s {wildcards.svtype} " "1>{log.stdout} 2>{log.stderr}" -rule index: - input: - "data/bams/{sample}.bam" - output: - "data/bams/{sample}.bam.bai" - threads: - get_threads("index", 8) - shell: - "samtools index -@{threads} {input}" +if "no_detection_index" not in config or not config["no_detection_index"]: + rule index: + input: + "data/bams/{sample}.bam" + output: + "data/bams/{sample}.bam.bai" + threads: + get_threads("index", 8) + shell: + "samtools index -@{threads} {input}" rule excluded: input: diff --git a/snakecnv/lib b/snakecnv/lib index 765e872..80f0cb8 160000 --- a/snakecnv/lib +++ b/snakecnv/lib @@ -1 +1 @@ -Subproject commit 765e87245fcc733e09f9c3471bfa6fd42106f941 +Subproject commit 80f0cb833989fe6db3146f588713fd44c388603f diff --git a/snakecnv/mergebatches.snk b/snakecnv/mergebatches.snk index 366f61e..9855e09 100644 --- a/snakecnv/mergebatches.snk +++ b/snakecnv/mergebatches.snk @@ -75,7 +75,7 @@ message("Starting") WDIR = config['wdir'] workdir: WDIR -REFERENCE = config['genome'] +REFERENCE = config['reference'] samples = get_samples(config['sample_file'], -1)["batch001"] diff --git a/snakecnv/popsim.snk b/snakecnv/popsim.snk index 3a41aef..8f33f70 100644 --- a/snakecnv/popsim.snk +++ b/snakecnv/popsim.snk @@ -63,12 +63,14 @@ bp_outputs = buildpop_outputs() include: "tools/threads.snk" include: "align.snk" +include: "detection.snk" localrules: all_popsim rule all_popsim: input: - expand("bams/{sample}.bam", sample=["indiv%d" % i for i in range(1, NB_INDS+1)]) + set_all_inputs + #expand("bams/{sample}.bam", sample=["indiv%d" % i for i in range(1, NB_INDS+1)]) #*bp_outputs rule buildpop: @@ -121,3 +123,14 @@ rule buildpop: if params.haploid: command.append("-a") shell(" ".join(command) + " 1> %s 2> %s" % (log.stdout, log.stderr)) + +rule linkdatabams: + input: + bam="bams/{sample}.bam", + bai="bams/{sample}.bam.bai" + output: + bam="data/bams/{sample}.bam", + bai="data/bams/{sample}.bam.bai" + run: + os.symlink(os.path.abspath(input.bam), output.bam) + os.symlink(os.path.abspath(input.bai), output.bai) \ No newline at end of file diff --git a/snakecnv/referencebundle.snk b/snakecnv/referencebundle.snk index 0a997fa..4c5eaff 100644 --- a/snakecnv/referencebundle.snk +++ b/snakecnv/referencebundle.snk @@ -69,7 +69,7 @@ def getChroms(genome, chromfilter=True): include: "tools/threads.snk" -CHROMS = getChroms(config['genome']) +CHROMS = getChroms(config['reference']) rule final: @@ -86,7 +86,7 @@ rule final: rule get_fasta: input: - config['genome'] + config['reference'] output: PREFIX+".fasta" run: -- GitLab