From 9452d0d0b9fd5666feef85af07df494d9aa0ba6b Mon Sep 17 00:00:00 2001 From: Floreal Cabanettes <floreal.cabanettes@inra.fr> Date: Thu, 7 Jun 2018 16:12:37 +0200 Subject: [PATCH] Add refbundle workflow to the simulation pipeline --- cnvpipelines.py | 22 ++++++-- snakecnv/detection.snk | 2 +- snakecnv/popsim.snk | 1 + snakecnv/referencebundle.snk | 93 +++++++++++++++++----------------- snakecnv/tools/genomestrip.snk | 43 ++++++++++------ 5 files changed, 96 insertions(+), 65 deletions(-) diff --git a/cnvpipelines.py b/cnvpipelines.py index 3869e55..18bcedb 100755 --- a/cnvpipelines.py +++ b/cnvpipelines.py @@ -627,7 +627,7 @@ class CnvPipeline: "wdir": self.wdir, "reference": reference, "species": species, - "readlength": read_length, + "read_len": read_length, "maxNstretches": max_n_stretches } @@ -792,7 +792,7 @@ class CnvPipeline: def run_simulation(self, nb_inds, reference, nstretches, sv_list, coverage, force_polymorphism, haploid, proba_del, proba_inv, read_len, insert_len_mean, insert_len_sd, min_deletions, - min_inversions, max_try, genotypes, tools, force_wdir=False, **kwargs): + min_inversions, max_try, genotypes, tools, species, max_n_stretches, force_wdir=False, **kwargs): """ Run simulation :param nb_inds: @@ -813,6 +813,9 @@ class CnvPipeline: :param genotypes: :param tools: :param kwargs: + :param species: + :param max_n_stretches: + :param force_wdir: :return: """ try: @@ -837,6 +840,8 @@ class CnvPipeline: raise ValueError("proba_inv must be positive or null") if proba_inv == 0 and proba_del == 0: raise ValueError("At least one proba of variant (del or inv) must be positive") + if "genomestrip" in tools and species is None: + raise ValueError("Species parameter is required for genomestrip") self._check_wdir(force_wdir) @@ -878,9 +883,16 @@ class CnvPipeline: "start_batch_nb": 1, "chromosomes": "all", "varianttypes": variant_types, - "no_detection_index": True + "no_detection_index": True, + "build_refbundle": "genomestrip" in tools, + "species": species, + "maxNstretches": max_n_stretches } + if "genomestrip" in tools: + config["refbundle"] = os.path.join("refbundle", "reference.fasta") + config["rb_subdir"] = "refbundle" + nstretches_final = None if nstretches is not None: nstretches_final = os.path.join(self.wdir, os.path.basename(nstretches)) @@ -1162,6 +1174,10 @@ if __name__ == "__main__": simul_parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals") simul_parser.add_argument('-t', '--tools', type=str, required=True, help="Tools to launch, coma separated", nargs="+", choices=TOOLS) + simul_parser.add_argument('-sp', '--species', type=str, required=False, + help="[refbundle] Species name, according to the NCBI Taxonomy database") + simul_parser.add_argument('-mn', '--max-n-stretches', type=int, required=False, default=100, + help="[refbundle] Max size of nstretches to consider them as it") add_run_and_rerun_options(simul_parser) simul_parser.add_argument('-w', '--wdir', type=str, required=True, help="Output folder where data will be stored") diff --git a/snakecnv/detection.snk b/snakecnv/detection.snk index f34a06c..a4a1df7 100644 --- a/snakecnv/detection.snk +++ b/snakecnv/detection.snk @@ -228,7 +228,7 @@ rule excluded: " -t {threads}" " 1>{log.stdout} 2>{log.stderr}" -rule nstretches: +rule nstretches_d: input: reference = REFERENCE output: diff --git a/snakecnv/popsim.snk b/snakecnv/popsim.snk index 8f33f70..d23df53 100644 --- a/snakecnv/popsim.snk +++ b/snakecnv/popsim.snk @@ -63,6 +63,7 @@ bp_outputs = buildpop_outputs() include: "tools/threads.snk" include: "align.snk" +include: "referencebundle.snk" include: "detection.snk" localrules: all_popsim diff --git a/snakecnv/referencebundle.snk b/snakecnv/referencebundle.snk index 4c5eaff..0f375b6 100644 --- a/snakecnv/referencebundle.snk +++ b/snakecnv/referencebundle.snk @@ -26,6 +26,7 @@ shell.prefix("export PATH={root_dir}/bin:{sv_dir}/bwa:\"$PATH\"; export LD_LIBRA root_dir=ROOTPATH, sv_dir=SV_DIR)) PREFIX = "reference" +SUBDIR = config["rb_subdir"] + os.path.sep if "rb_subdir" in config else "" SUFF_TYPES = ("gcmask", "svmask", "lcmask") p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*') @@ -74,21 +75,21 @@ CHROMS = getChroms(config['reference']) rule final: input: - PREFIX+".fasta.fai", - PREFIX+".gcmask.fasta.fai", - PREFIX+".svmask.fasta.fai", - PREFIX+".lcmask.fasta.fai", - PREFIX+".rdmask.bed", - PREFIX+".dict", - PREFIX+".ploidymap.txt", - PREFIX+".gendermask.bed", - PREFIX+".Nstretch.bed" + SUBDIR + PREFIX+".fasta.fai", + SUBDIR + PREFIX+".gcmask.fasta.fai", + SUBDIR + PREFIX+".svmask.fasta.fai", + SUBDIR + PREFIX+".lcmask.fasta.fai", + SUBDIR + PREFIX+".rdmask.bed", + SUBDIR + PREFIX+".dict", + SUBDIR + PREFIX+".ploidymap.txt", + SUBDIR + PREFIX+".gendermask.bed", + SUBDIR + PREFIX+".Nstretch.bed" rule get_fasta: input: config['reference'] output: - PREFIX+".fasta" + SUBDIR + PREFIX+".fasta" run: from Bio import SeqIO with open(input[0], "rU") as handle, open(output[0], "w") as f: @@ -98,9 +99,9 @@ rule get_fasta: rule index: input: - "{type}.fasta" + SUBDIR + "{type}.fasta" output: - "{type}.fasta.fai" + SUBDIR + "{type}.fasta.fai" log: "logs/index/{type}.log" shell: @@ -108,20 +109,20 @@ rule index: rule length: input: - fasta = PREFIX+".fasta", - fai = PREFIX+".fasta.fai" + fasta = SUBDIR + PREFIX+".fasta", + fai = SUBDIR + PREFIX+".fasta.fai" output: - "reference.fasta.length" + SUBDIR + "reference.fasta.length" shell: "fastalength {input.fasta} > {output}" rule preparesvmask: input: - fasta = PREFIX+".fasta", - fai = PREFIX+".fasta.fai" + fasta = SUBDIR + PREFIX+".fasta", + fai = SUBDIR + PREFIX+".fasta.fai" output: - seq = "work/reference.fasta", - index = "work/reference.fasta.bwt" + seq = SUBDIR + "work/reference.fasta", + index = SUBDIR + "work/reference.fasta.bwt" log: "logs/preparesvmask.log" shell: @@ -132,20 +133,20 @@ rule preparesvmask: rule mergesvmask: input: - expand("work/svmask_{chrom}.fasta", chrom=CHROMS) + expand(SUBDIR + "work/svmask_{chrom}.fasta", chrom=CHROMS) output: - "reference.svmask.fasta" + SUBDIR + "reference.svmask.fasta" shell: "cat {input} > {output}" rule chromsvmask: input: - "work/reference.fasta.bwt", - fa = "work/reference.fasta" + SUBDIR + "work/reference.fasta.bwt", + fa = SUBDIR + "work/reference.fasta" output: - "work/svmask_{chrom}.fasta" + SUBDIR + "work/svmask_{chrom}.fasta" params: - rl = config['readlength'] + rl = config['read_len'] log: "logs/svmask/{chrom}.log" shell: @@ -155,11 +156,11 @@ rule chromsvmask: rule bed2fasta: input: - PREFIX+".fasta.fai", - ref = PREFIX+".fasta", - bed = "{type}.bed" + SUBDIR + PREFIX+".fasta.fai", + ref = SUBDIR + PREFIX+".fasta", + bed = SUBDIR + "{type}.bed" output: - "{type}.fasta" + SUBDIR + "{type}.fasta" wildcard_constraints: type = ".*(lc|sv|gc)mask" shell: @@ -171,10 +172,10 @@ rule bed2fasta: rule repeatmask: input: - fasta = "reference.fasta", - fai = PREFIX+".fasta.fai" + fasta = SUBDIR + "reference.fasta", + fai = SUBDIR + PREFIX+".fasta.fai" output: - "reference.fasta.out" + SUBDIR + "reference.fasta.out" params: sp = config['species'] threads: @@ -185,9 +186,9 @@ rule repeatmask: rule rdmask: input: - length = PREFIX+".fasta.length" + length = SUBDIR + PREFIX+".fasta.length" output: - bed = PREFIX+".rdmask.bed" + bed = SUBDIR + PREFIX+".rdmask.bed" run: with open(input.length) as f: with open(output.bed, "w") as fout: @@ -199,9 +200,9 @@ rule rdmask: rule lcmaskbed: input: - repeats = PREFIX+".fasta.out" + repeats = SUBDIR + PREFIX+".fasta.out" output: - bed = PREFIX+".lcmask.bed" + bed = SUBDIR + PREFIX+".lcmask.bed" run: repeats = [] with open(output.bed, "w") as fout: @@ -218,9 +219,9 @@ rule lcmaskbed: rule gcmaskbed: input: - repeats = "reference.fasta.out" + repeats = SUBDIR + "reference.fasta.out" output: - bed = "reference.gcmask.bed" + bed = SUBDIR + "reference.gcmask.bed" run: repeats = [] with open(output.bed, "w") as fout: @@ -237,10 +238,10 @@ rule gcmaskbed: rule dict: input: - fasta = PREFIX+".fasta", - fai = PREFIX+".fasta.fai" + fasta = SUBDIR + PREFIX+".fasta", + fai = SUBDIR + PREFIX+".fasta.fai" output: - PREFIX+".dict" + SUBDIR + PREFIX+".dict" shell: "picard CreateSequenceDictionary " " R= {input.fasta} " @@ -248,10 +249,10 @@ rule dict: rule nstretches: input: - fasta = PREFIX+".fasta", - fai = PREFIX+".fasta.fai" + fasta = SUBDIR + PREFIX+".fasta", + fai = SUBDIR + PREFIX+".fasta.fai" output: - PREFIX+".Nstretch.bed" + SUBDIR + PREFIX+".Nstretch.bed" params: nstrech = config['maxNstretches'] threads: @@ -262,12 +263,12 @@ rule nstretches: rule ploidy: output: - PREFIX+".ploidymap.txt" + SUBDIR + PREFIX+".ploidymap.txt" shell: " printf \"*\t*\t*\t*\t2\n\" > {output}" rule gendermask: output: - PREFIX+".gendermask.bed" + SUBDIR + PREFIX+".gendermask.bed" shell: " touch {output}" diff --git a/snakecnv/tools/genomestrip.snk b/snakecnv/tools/genomestrip.snk index 0f36712..8279cd4 100644 --- a/snakecnv/tools/genomestrip.snk +++ b/snakecnv/tools/genomestrip.snk @@ -11,21 +11,34 @@ rule gendermap: fout.write("\t".join([sample, "F"])+"\n") fout.close() -rule preprocess: - input: - unpack(snakemake_utils.get_inputs_bams), - bamlist = "{batch}/bamlist.list", - gendermap = "{batch}/genomestrip/gendermap.txt", - genome = config['refbundle'] if 'refbundle' in config else REFERENCE - output: - metadata = "{batch}/genomestrip/metadata/sample_gender.report.txt" - log: - stdout = "{batch}/logs/genomestrip/preprocess.o", - stderr = "{batch}/logs/genomestrip/preprocess.e" - shell: - "gstrip_preprocess.sh {input.bamlist} {input.gendermap}" - " {input.genome} {wildcards.batch}/genomestrip preprocess " - " 1>{log.stdout} 2>{log.stderr}" +if "genomestrip" in tools: + + PREFIX = os.path.splitext(config["refbundle"])[0] + + rule preprocess: + input: + PREFIX+".fasta.fai", + PREFIX+".gcmask.fasta.fai", + PREFIX+".svmask.fasta.fai", + PREFIX+".lcmask.fasta.fai", + PREFIX+".rdmask.bed", + PREFIX+".dict", + PREFIX+".ploidymap.txt", + PREFIX+".gendermask.bed", + PREFIX+".Nstretch.bed", + unpack(snakemake_utils.get_inputs_bams), + bamlist = "{batch}/bamlist.list", + gendermap = "{batch}/genomestrip/gendermap.txt", + genome = config['refbundle'] + output: + metadata = "{batch}/genomestrip/metadata/sample_gender.report.txt" + log: + stdout = "{batch}/logs/genomestrip/preprocess.o", + stderr = "{batch}/logs/genomestrip/preprocess.e" + shell: + "gstrip_preprocess.sh {input.bamlist} {input.gendermap}" + " {input.genome} {wildcards.batch}/genomestrip preprocess " + " 1>{log.stdout} 2>{log.stderr}" rule genomestrip: input: -- GitLab