From afbac7cc794fbf1194dbf07628ba41c10d120500 Mon Sep 17 00:00:00 2001
From: Floreal Cabanettes <floreal.cabanettes@inra.fr>
Date: Thu, 7 Jun 2018 11:17:43 +0200
Subject: [PATCH] Add detection workflow to the simulation pipeline

---
 cnvpipelines.py                       | 62 ++++++++++++++++++++++++---
 snakecnv/align.snk                    |  2 +-
 snakecnv/{Snakefile => detection.snk} | 31 +++++++-------
 snakecnv/lib                          |  2 +-
 snakecnv/mergebatches.snk             |  2 +-
 snakecnv/popsim.snk                   | 15 ++++++-
 snakecnv/referencebundle.snk          |  4 +-
 7 files changed, 91 insertions(+), 27 deletions(-)
 rename snakecnv/{Snakefile => detection.snk} (91%)

diff --git a/cnvpipelines.py b/cnvpipelines.py
index b0e8fc5..3869e55 100755
--- a/cnvpipelines.py
+++ b/cnvpipelines.py
@@ -209,7 +209,7 @@ class CnvPipeline:
     def _get_snakefile(self):
         try:
             if self.wf_type == "detection":
-                snakefile = "Snakefile"
+                snakefile = "detection.snk"
             elif self.wf_type == "refbundle":
                 snakefile = "referencebundle.snk"
             elif self.wf_type == "align":
@@ -567,7 +567,7 @@ class CnvPipeline:
 
             config = {
                 "wdir": self.wdir,
-                "genome": reference,
+                "reference": reference,
                 "tools": tools,
                 # "sample_file": final_sample_file,
                 "batches": batches,
@@ -625,7 +625,7 @@ class CnvPipeline:
 
             config = {
                 "wdir": self.wdir,
-                "genome": reference,
+                "reference": reference,
                 "species": species,
                 "readlength": read_length,
                 "maxNstretches": max_n_stretches
@@ -728,6 +728,11 @@ class CnvPipeline:
             self._fail(str(e))
 
     def run_mergebatches(self, **kwargs):
+        """
+        Run mergebatches workflow
+        :param kwargs:
+        :return:
+        """
         sample_files = glob(os.path.join(self.wdir, "samples.list.*"))
         sample_files.sort(key=lambda x: int(x.rsplit(".", 1)[1]))
         sample_file_final = os.path.join(self.wdir, "samples.list")
@@ -750,11 +755,21 @@ class CnvPipeline:
 
     @staticmethod
     def _link(source, dest):
+        """
+        Link file if dest file does not exists
+        :param source:
+        :param dest:
+        """
         if not os.path.exists(dest):
             os.symlink(source, dest)
 
     @staticmethod
     def _create_samples_fastq_simulaton(nb_inds, smple_file):
+        """
+        Create samples file for align workflow of simulation pipeline
+        :param nb_inds: number of individuals generated for simulation
+        :param smple_file: path of the sample file
+        """
         samples = {}
         for i in range(1, nb_inds+1):
             samples["indiv%d" % i] = {
@@ -764,9 +779,20 @@ class CnvPipeline:
         with open(smple_file, "w") as samples_f:
             yaml.dump(samples, samples_f, default_flow_style=False)
 
+    @staticmethod
+    def _create_samples_detection_simulation(nb_inds, smple_file):
+        """
+        Create samples file for detection workflow of simulation pipeline
+        :param nb_inds: number of individuals generated for simulation
+        :param smple_file: path of the sample file
+        """
+        with open(smple_file, "w") as samples_f:
+            for i in range(1, nb_inds + 1):
+                samples_f.write("indiv%d\n" % i)
+
     def run_simulation(self, nb_inds, reference, nstretches, sv_list, coverage, force_polymorphism,
                        haploid, proba_del, proba_inv, read_len, insert_len_mean, insert_len_sd, min_deletions,
-                       min_inversions, max_try, genotypes, force_wdir=False, **kwargs):
+                       min_inversions, max_try, genotypes, tools, force_wdir=False, **kwargs):
         """
         Run simulation
         :param nb_inds:
@@ -785,6 +811,7 @@ class CnvPipeline:
         :param min_inversions:
         :param max_try:
         :param genotypes:
+        :param tools:
         :param kwargs:
         :return:
         """
@@ -804,6 +831,12 @@ class CnvPipeline:
                 if not os.path.isfile(genotypes):
                     raise ValueError("Genotypes file does not exists")
                 genotypes = os.path.abspath(genotypes)
+            if proba_del < 0:
+                raise ValueError("proba_del must be positive or null")
+            if proba_inv < 0:
+                raise ValueError("proba_inv must be positive or null")
+            if proba_inv == 0 and proba_del == 0:
+                raise ValueError("At least one proba of variant (del or inv) must be positive")
 
             self._check_wdir(force_wdir)
 
@@ -812,7 +845,15 @@ class CnvPipeline:
 
             # Create sample file for fastq files:
             samples_file_fq = os.path.join(self.wdir, "samples.yml")
+            samples_file_d = os.path.join(self.wdir, "samples.list")
             self._create_samples_fastq_simulaton(nb_inds, samples_file_fq)
+            self._create_samples_detection_simulation(nb_inds, samples_file_d)
+
+            variant_types = []
+            if proba_del > 0:
+                variant_types.append("DEL")
+            if proba_inv > 0:
+                variant_types.append("INV")
 
             config = {
                 "wdir": self.wdir,
@@ -829,7 +870,15 @@ class CnvPipeline:
                 "min_deletions": min_deletions,
                 "min_inversions": min_inversions,
                 "max_try": max_try,
-                "sample_file_align": samples_file_fq
+                "sample_file_align": samples_file_fq,
+                "tools": tools,
+                "sample_file": samples_file_d,
+                "batches": -1,
+                "parallel_batches": -1,
+                "start_batch_nb": 1,
+                "chromosomes": "all",
+                "varianttypes": variant_types,
+                "no_detection_index": True
             }
 
             nstretches_final = None
@@ -1111,7 +1160,8 @@ if __name__ == "__main__":
                               type=check_min_size_0)
     simul_parser.add_argument("--max-try", help="Maximum of tries", default=10, type=check_min_size_0)
     simul_parser.add_argument("-g", "--genotypes", help="Position of SV with genotypes of individuals")
-
+    simul_parser.add_argument('-t', '--tools', type=str, required=True, help="Tools to launch, coma separated",
+                                  nargs="+", choices=TOOLS)
     add_run_and_rerun_options(simul_parser)
     simul_parser.add_argument('-w', '--wdir', type=str, required=True,
                               help="Output folder where data will be stored")
diff --git a/snakecnv/align.snk b/snakecnv/align.snk
index eaba3ab..b0ce10a 100644
--- a/snakecnv/align.snk
+++ b/snakecnv/align.snk
@@ -64,7 +64,7 @@ rule bamindex:
         bai = "{folder}/{sample}.bam.bai",
         summary = "{folder}/{sample}.bam.summary"
     wildcard_constraints:
-        folder = "\w+"
+        folder = "(bams|bwa)"
     threads:
         1
     log:
diff --git a/snakecnv/Snakefile b/snakecnv/detection.snk
similarity index 91%
rename from snakecnv/Snakefile
rename to snakecnv/detection.snk
index 92c02c7..f34a06c 100644
--- a/snakecnv/Snakefile
+++ b/snakecnv/detection.snk
@@ -60,9 +60,9 @@ def valid_chromosome(chrom, regexp_chrom):
 def get_filterbed(config):
     if "filterbed" in config and os.isfile(config['filterbed']):
         return config['filterbed']
-    elif (config['genome'].endswith("fasta")
-            and os.path.exists(config['genome'][:-6] + ".Nstretch.bed")):
-        return config['genome'][:-6] + ".Nstretch.bed"
+    elif (config['reference'].endswith("fasta")
+            and os.path.exists(config['reference'][:-6] + ".Nstretch.bed")):
+        return config['reference'][:-6] + ".Nstretch.bed"
     else:
         return None
 
@@ -81,7 +81,7 @@ message("Starting")
 FILTERBED = get_filterbed(config)
 EXCLUDEBED = get_excluded(config)
 
-REFERENCE = config['genome']
+REFERENCE = config['reference']
 WDIR = config['wdir']
 workdir: WDIR
 
@@ -119,7 +119,7 @@ for tool in tools:
     else:
         snakemake_utils.set_tool_svtypes(tool, varianttypes)
 
-localrules: all, gendermap, gbamlist, bamlist
+localrules: all_detection, gendermap, gbamlist, bamlist
 
 
 def set_all_inputs(wildcards):
@@ -135,7 +135,7 @@ def set_all_inputs(wildcards):
     return inputs
 
 
-rule all:
+rule all_detection:
     input:
         set_all_inputs
 
@@ -195,15 +195,16 @@ rule parsing:
         "-o {output.parseoutput} -t {wildcards.tool} -s {wildcards.svtype} "
         "1>{log.stdout} 2>{log.stderr}"
 
-rule index:
-    input:
-        "data/bams/{sample}.bam"
-    output:
-        "data/bams/{sample}.bam.bai"
-    threads:
-        get_threads("index", 8)
-    shell:
-        "samtools index -@{threads} {input}"
+if "no_detection_index" not in config or not config["no_detection_index"]:
+    rule index:
+        input:
+            "data/bams/{sample}.bam"
+        output:
+            "data/bams/{sample}.bam.bai"
+        threads:
+            get_threads("index", 8)
+        shell:
+            "samtools index -@{threads} {input}"
 
 rule excluded:
     input:
diff --git a/snakecnv/lib b/snakecnv/lib
index 765e872..80f0cb8 160000
--- a/snakecnv/lib
+++ b/snakecnv/lib
@@ -1 +1 @@
-Subproject commit 765e87245fcc733e09f9c3471bfa6fd42106f941
+Subproject commit 80f0cb833989fe6db3146f588713fd44c388603f
diff --git a/snakecnv/mergebatches.snk b/snakecnv/mergebatches.snk
index 366f61e..9855e09 100644
--- a/snakecnv/mergebatches.snk
+++ b/snakecnv/mergebatches.snk
@@ -75,7 +75,7 @@ message("Starting")
 
 WDIR = config['wdir']
 workdir: WDIR
-REFERENCE = config['genome']
+REFERENCE = config['reference']
 
 samples = get_samples(config['sample_file'], -1)["batch001"]
 
diff --git a/snakecnv/popsim.snk b/snakecnv/popsim.snk
index 3a41aef..8f33f70 100644
--- a/snakecnv/popsim.snk
+++ b/snakecnv/popsim.snk
@@ -63,12 +63,14 @@ bp_outputs = buildpop_outputs()
 
 include: "tools/threads.snk"
 include: "align.snk"
+include: "detection.snk"
 
 localrules: all_popsim
 
 rule all_popsim:
     input:
-        expand("bams/{sample}.bam", sample=["indiv%d" % i for i in range(1, NB_INDS+1)])
+        set_all_inputs
+        #expand("bams/{sample}.bam", sample=["indiv%d" % i for i in range(1, NB_INDS+1)])
         #*bp_outputs
 
 rule buildpop:
@@ -121,3 +123,14 @@ rule buildpop:
         if params.haploid:
             command.append("-a")
         shell(" ".join(command) + " 1> %s 2> %s" % (log.stdout, log.stderr))
+
+rule linkdatabams:
+    input:
+        bam="bams/{sample}.bam",
+        bai="bams/{sample}.bam.bai"
+    output:
+        bam="data/bams/{sample}.bam",
+        bai="data/bams/{sample}.bam.bai"
+    run:
+        os.symlink(os.path.abspath(input.bam), output.bam)
+        os.symlink(os.path.abspath(input.bai), output.bai)
\ No newline at end of file
diff --git a/snakecnv/referencebundle.snk b/snakecnv/referencebundle.snk
index 0a997fa..4c5eaff 100644
--- a/snakecnv/referencebundle.snk
+++ b/snakecnv/referencebundle.snk
@@ -69,7 +69,7 @@ def getChroms(genome, chromfilter=True):
 include: "tools/threads.snk"
 
 
-CHROMS = getChroms(config['genome'])
+CHROMS = getChroms(config['reference'])
 
 
 rule final:
@@ -86,7 +86,7 @@ rule final:
 
 rule get_fasta:
     input:
-        config['genome']
+        config['reference']
     output:
         PREFIX+".fasta"
     run:
-- 
GitLab