From 9452d0d0b9fd5666feef85af07df494d9aa0ba6b Mon Sep 17 00:00:00 2001
From: Floreal Cabanettes <floreal.cabanettes@inra.fr>
Date: Thu, 7 Jun 2018 16:12:37 +0200
Subject: [PATCH] Add refbundle workflow to the simulation pipeline

---
 cnvpipelines.py                | 22 ++++++--
 snakecnv/detection.snk         |  2 +-
 snakecnv/popsim.snk            |  1 +
 snakecnv/referencebundle.snk   | 93 +++++++++++++++++-----------------
 snakecnv/tools/genomestrip.snk | 43 ++++++++++------
 5 files changed, 96 insertions(+), 65 deletions(-)

diff --git a/cnvpipelines.py b/cnvpipelines.py
index 3869e55..18bcedb 100755
--- a/cnvpipelines.py
+++ b/cnvpipelines.py
@@ -627,7 +627,7 @@ class CnvPipeline:
                 "wdir": self.wdir,
                 "reference": reference,
                 "species": species,
-                "readlength": read_length,
+                "read_len": read_length,
                 "maxNstretches": max_n_stretches
             }
 
@@ -792,7 +792,7 @@ class CnvPipeline:
 
     def run_simulation(self, nb_inds, reference, nstretches, sv_list, coverage, force_polymorphism,
                        haploid, proba_del, proba_inv, read_len, insert_len_mean, insert_len_sd, min_deletions,
-                       min_inversions, max_try, genotypes, tools, force_wdir=False, **kwargs):
+                       min_inversions, max_try, genotypes, tools, species, max_n_stretches, force_wdir=False, **kwargs):
         """
         Run simulation
         :param nb_inds:
@@ -813,6 +813,9 @@ class CnvPipeline:
         :param genotypes:
         :param tools:
         :param kwargs:
+        :param species:
+        :param max_n_stretches:
+        :param force_wdir:
         :return:
         """
         try:
@@ -837,6 +840,8 @@ class CnvPipeline:
                 raise ValueError("proba_inv must be positive or null")
             if proba_inv == 0 and proba_del == 0:
                 raise ValueError("At least one proba of variant (del or inv) must be positive")
+            if "genomestrip" in tools and species is None:
+                raise ValueError("Species parameter is required for genomestrip")
 
             self._check_wdir(force_wdir)
 
@@ -878,9 +883,16 @@ class CnvPipeline:
                 "start_batch_nb": 1,
                 "chromosomes": "all",
                 "varianttypes": variant_types,
-                "no_detection_index": True
+                "no_detection_index": True,
+                "build_refbundle": "genomestrip" in tools,
+                "species": species,
+                "maxNstretches": max_n_stretches
             }
 
+            if "genomestrip" in tools:
+                config["refbundle"] = os.path.join("refbundle", "reference.fasta")
+                config["rb_subdir"] = "refbundle"
+
             nstretches_final = None
             if nstretches is not None:
                 nstretches_final = os.path.join(self.wdir, os.path.basename(nstretches))
@@ -1162,6 +1174,10 @@ if __name__ == "__main__":
     simul_parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals")
     simul_parser.add_argument('-t', '--tools', type=str, required=True, help="Tools to launch, coma separated",
                                   nargs="+", choices=TOOLS)
+    simul_parser.add_argument('-sp', '--species', type=str, required=False,
+                                  help="[refbundle] Species name, according to the NCBI Taxonomy database")
+    simul_parser.add_argument('-mn', '--max-n-stretches', type=int, required=False, default=100,
+                                  help="[refbundle] Max size of nstretches to consider them as it")
     add_run_and_rerun_options(simul_parser)
     simul_parser.add_argument('-w', '--wdir', type=str, required=True,
                               help="Output folder where data will be stored")
diff --git a/snakecnv/detection.snk b/snakecnv/detection.snk
index f34a06c..a4a1df7 100644
--- a/snakecnv/detection.snk
+++ b/snakecnv/detection.snk
@@ -228,7 +228,7 @@ rule excluded:
         " -t {threads}"
         " 1>{log.stdout} 2>{log.stderr}"
 
-rule nstretches:
+rule nstretches_d:
     input:
         reference = REFERENCE
     output:
diff --git a/snakecnv/popsim.snk b/snakecnv/popsim.snk
index 8f33f70..d23df53 100644
--- a/snakecnv/popsim.snk
+++ b/snakecnv/popsim.snk
@@ -63,6 +63,7 @@ bp_outputs = buildpop_outputs()
 
 include: "tools/threads.snk"
 include: "align.snk"
+include: "referencebundle.snk"
 include: "detection.snk"
 
 localrules: all_popsim
diff --git a/snakecnv/referencebundle.snk b/snakecnv/referencebundle.snk
index 4c5eaff..0f375b6 100644
--- a/snakecnv/referencebundle.snk
+++ b/snakecnv/referencebundle.snk
@@ -26,6 +26,7 @@ shell.prefix("export PATH={root_dir}/bin:{sv_dir}/bwa:\"$PATH\"; export LD_LIBRA
     root_dir=ROOTPATH, sv_dir=SV_DIR))
 
 PREFIX = "reference"
+SUBDIR = config["rb_subdir"] + os.path.sep if "rb_subdir" in config else ""
 SUFF_TYPES = ("gcmask", "svmask", "lcmask")
 
 p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*')
@@ -74,21 +75,21 @@ CHROMS = getChroms(config['reference'])
 
 rule final:
     input:
-        PREFIX+".fasta.fai",
-        PREFIX+".gcmask.fasta.fai",
-        PREFIX+".svmask.fasta.fai",
-        PREFIX+".lcmask.fasta.fai",
-        PREFIX+".rdmask.bed",
-        PREFIX+".dict",
-        PREFIX+".ploidymap.txt",
-        PREFIX+".gendermask.bed",
-        PREFIX+".Nstretch.bed"
+        SUBDIR + PREFIX+".fasta.fai",
+        SUBDIR + PREFIX+".gcmask.fasta.fai",
+        SUBDIR + PREFIX+".svmask.fasta.fai",
+        SUBDIR + PREFIX+".lcmask.fasta.fai",
+        SUBDIR + PREFIX+".rdmask.bed",
+        SUBDIR + PREFIX+".dict",
+        SUBDIR + PREFIX+".ploidymap.txt",
+        SUBDIR + PREFIX+".gendermask.bed",
+        SUBDIR + PREFIX+".Nstretch.bed"
 
 rule get_fasta:
     input:
         config['reference']
     output:
-        PREFIX+".fasta"
+        SUBDIR + PREFIX+".fasta"
     run:
         from Bio import SeqIO
         with open(input[0], "rU") as handle, open(output[0], "w") as f:
@@ -98,9 +99,9 @@ rule get_fasta:
 
 rule index:
     input:
-        "{type}.fasta"
+        SUBDIR + "{type}.fasta"
     output:
-        "{type}.fasta.fai"
+        SUBDIR + "{type}.fasta.fai"
     log:
         "logs/index/{type}.log"
     shell:
@@ -108,20 +109,20 @@ rule index:
 
 rule length:
     input:
-        fasta = PREFIX+".fasta",
-        fai = PREFIX+".fasta.fai"
+        fasta = SUBDIR + PREFIX+".fasta",
+        fai = SUBDIR + PREFIX+".fasta.fai"
     output:
-        "reference.fasta.length"
+        SUBDIR + "reference.fasta.length"
     shell:
         "fastalength {input.fasta} > {output}"
 
 rule preparesvmask:
     input:
-        fasta = PREFIX+".fasta",
-        fai = PREFIX+".fasta.fai"
+        fasta = SUBDIR + PREFIX+".fasta",
+        fai = SUBDIR + PREFIX+".fasta.fai"
     output:
-        seq = "work/reference.fasta",
-        index = "work/reference.fasta.bwt"
+        seq = SUBDIR + "work/reference.fasta",
+        index = SUBDIR + "work/reference.fasta.bwt"
     log:
         "logs/preparesvmask.log"
     shell:
@@ -132,20 +133,20 @@ rule preparesvmask:
 
 rule mergesvmask:
     input:
-        expand("work/svmask_{chrom}.fasta", chrom=CHROMS)
+        expand(SUBDIR + "work/svmask_{chrom}.fasta", chrom=CHROMS)
     output:
-        "reference.svmask.fasta"
+        SUBDIR + "reference.svmask.fasta"
     shell:
         "cat {input} > {output}"
 
 rule chromsvmask:
     input:
-        "work/reference.fasta.bwt",
-        fa = "work/reference.fasta"
+        SUBDIR + "work/reference.fasta.bwt",
+        fa = SUBDIR + "work/reference.fasta"
     output:
-        "work/svmask_{chrom}.fasta"
+        SUBDIR + "work/svmask_{chrom}.fasta"
     params:
-        rl = config['readlength']
+        rl = config['read_len']
     log:
         "logs/svmask/{chrom}.log"
     shell:
@@ -155,11 +156,11 @@ rule chromsvmask:
 
 rule bed2fasta:
     input:
-        PREFIX+".fasta.fai",
-        ref = PREFIX+".fasta",
-        bed = "{type}.bed"
+        SUBDIR + PREFIX+".fasta.fai",
+        ref = SUBDIR + PREFIX+".fasta",
+        bed = SUBDIR + "{type}.bed"
     output:
-        "{type}.fasta"
+        SUBDIR + "{type}.fasta"
     wildcard_constraints:
         type = ".*(lc|sv|gc)mask"
     shell:
@@ -171,10 +172,10 @@ rule bed2fasta:
 
 rule repeatmask:
     input:
-        fasta = "reference.fasta",
-        fai = PREFIX+".fasta.fai"
+        fasta = SUBDIR + "reference.fasta",
+        fai = SUBDIR + PREFIX+".fasta.fai"
     output:
-        "reference.fasta.out"
+        SUBDIR + "reference.fasta.out"
     params:
         sp = config['species']
     threads:
@@ -185,9 +186,9 @@ rule repeatmask:
 
 rule rdmask:
     input:
-        length = PREFIX+".fasta.length"
+        length = SUBDIR + PREFIX+".fasta.length"
     output:
-        bed = PREFIX+".rdmask.bed"
+        bed = SUBDIR + PREFIX+".rdmask.bed"
     run:
         with open(input.length) as f:
             with open(output.bed, "w") as fout:
@@ -199,9 +200,9 @@ rule rdmask:
 
 rule lcmaskbed:
     input:
-        repeats = PREFIX+".fasta.out"
+        repeats = SUBDIR + PREFIX+".fasta.out"
     output:
-        bed = PREFIX+".lcmask.bed"
+        bed = SUBDIR + PREFIX+".lcmask.bed"
     run:
         repeats = []
         with open(output.bed, "w") as fout:
@@ -218,9 +219,9 @@ rule lcmaskbed:
 
 rule gcmaskbed:
     input:
-        repeats = "reference.fasta.out"
+        repeats = SUBDIR + "reference.fasta.out"
     output:
-        bed = "reference.gcmask.bed"
+        bed = SUBDIR + "reference.gcmask.bed"
     run:
         repeats = []
         with open(output.bed, "w") as fout:
@@ -237,10 +238,10 @@ rule gcmaskbed:
 
 rule dict:
     input:
-        fasta = PREFIX+".fasta",
-        fai = PREFIX+".fasta.fai"
+        fasta = SUBDIR + PREFIX+".fasta",
+        fai = SUBDIR + PREFIX+".fasta.fai"
     output:
-        PREFIX+".dict"
+        SUBDIR + PREFIX+".dict"
     shell:
         "picard CreateSequenceDictionary "
         " R= {input.fasta} "
@@ -248,10 +249,10 @@ rule dict:
 
 rule nstretches:
     input:
-        fasta = PREFIX+".fasta",
-        fai = PREFIX+".fasta.fai"
+        fasta = SUBDIR + PREFIX+".fasta",
+        fai = SUBDIR + PREFIX+".fasta.fai"
     output:
-        PREFIX+".Nstretch.bed"
+        SUBDIR + PREFIX+".Nstretch.bed"
     params:
         nstrech = config['maxNstretches']
     threads:
@@ -262,12 +263,12 @@ rule nstretches:
 
 rule ploidy:
     output:
-        PREFIX+".ploidymap.txt"
+        SUBDIR + PREFIX+".ploidymap.txt"
     shell:
         " printf \"*\t*\t*\t*\t2\n\" > {output}"
 
 rule gendermask:
     output:
-        PREFIX+".gendermask.bed"
+        SUBDIR + PREFIX+".gendermask.bed"
     shell:
         " touch {output}"
diff --git a/snakecnv/tools/genomestrip.snk b/snakecnv/tools/genomestrip.snk
index 0f36712..8279cd4 100644
--- a/snakecnv/tools/genomestrip.snk
+++ b/snakecnv/tools/genomestrip.snk
@@ -11,21 +11,34 @@ rule gendermap:
             fout.write("\t".join([sample, "F"])+"\n")
         fout.close()
 
-rule preprocess:
-    input:
-        unpack(snakemake_utils.get_inputs_bams),
-        bamlist = "{batch}/bamlist.list",
-        gendermap = "{batch}/genomestrip/gendermap.txt",
-        genome = config['refbundle'] if 'refbundle' in config else REFERENCE
-    output:
-        metadata = "{batch}/genomestrip/metadata/sample_gender.report.txt"
-    log:
-        stdout = "{batch}/logs/genomestrip/preprocess.o",
-        stderr = "{batch}/logs/genomestrip/preprocess.e"
-    shell:
-        "gstrip_preprocess.sh {input.bamlist} {input.gendermap}"
-        " {input.genome} {wildcards.batch}/genomestrip preprocess "
-        " 1>{log.stdout} 2>{log.stderr}"
+if "genomestrip" in tools:
+
+    PREFIX = os.path.splitext(config["refbundle"])[0]
+
+    rule preprocess:
+        input:
+            PREFIX+".fasta.fai",
+            PREFIX+".gcmask.fasta.fai",
+            PREFIX+".svmask.fasta.fai",
+            PREFIX+".lcmask.fasta.fai",
+            PREFIX+".rdmask.bed",
+            PREFIX+".dict",
+            PREFIX+".ploidymap.txt",
+            PREFIX+".gendermask.bed",
+            PREFIX+".Nstretch.bed",
+            unpack(snakemake_utils.get_inputs_bams),
+            bamlist = "{batch}/bamlist.list",
+            gendermap = "{batch}/genomestrip/gendermap.txt",
+            genome = config['refbundle']
+        output:
+            metadata = "{batch}/genomestrip/metadata/sample_gender.report.txt"
+        log:
+            stdout = "{batch}/logs/genomestrip/preprocess.o",
+            stderr = "{batch}/logs/genomestrip/preprocess.e"
+        shell:
+            "gstrip_preprocess.sh {input.bamlist} {input.gendermap}"
+            " {input.genome} {wildcards.batch}/genomestrip preprocess "
+            " 1>{log.stdout} 2>{log.stderr}"
 
 rule genomestrip:
     input:
-- 
GitLab