Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • philippe.bardou/cnvpipelines
  • svdetection/cnvpipelines
2 results
Show changes
Commits on Source (100)
Showing
with 2856 additions and 3837 deletions
......@@ -5,3 +5,4 @@ application.properties
**/*.pyo
**/.snakemake
**/old/*
**/.ipynb_checkpoints
[submodule "snakecnv/lib"]
path = snakecnv/lib
url = git@forgemia.inra.fr:svdetection/svlib.git
branch=master
url = https://forgemia.inra.fr/svdetection/svlib.git
branch = master
[submodule "popsim"]
path = popsim
url = git@forgemia.inra.fr:svdetection/popsim.git
url = https://forgemia.inra.fr/svdetection/popsim.git
branch = master
......@@ -35,21 +35,22 @@ $ conda env create --file environment.yaml
#### Load new conda environment
```sh
source activate cnvpipeline
conda activate cnvpipeline
```
### Install for simulations
To do simulations, you need to compile pirs, which is included as submodule of your cnvpipeline installation. You must use a recent gcc compiler (for example on genologin, module load compiler/gcc-7.2.0).
Then cd in `cnvpipelines/popsim/pirs` and just make:
module load compiler/gcc-7.2.0 # for genologin users
make
## Configuration
Copy `application.properties.example` into `application.properties` and make appropriate changes.
Sections and parameters are described below.
Sections and parameters are described below (on the genologin server you can simply copy `application.properties.genologin` into `application.properties`).
### Global section
......@@ -71,12 +72,11 @@ This section must be filled only if you don't use local as batch system type (se
### Reference bundle section
* *repeatmasker_lib_path*: PATH to the RepBase Libraries folder for RepeatMasker, if needed by RepeatMasker installation
* *repeatmasker_lib_path*: PATH to the RepBase Libraries folder for RepeatMasker. This is not mandatory (see Install RepeatMasker Libraries in http://www.repeatmasker.org/RMDownload.html).
### Genotoul specific configuration
### Genotoul (genologin) specific configuration
* Use the delly module specified
* Ue the drmaa library specified
* Use the drmaa library specified
## Run
......@@ -153,7 +153,7 @@ With:
`tools`: list of tools, space separated. Choose among genomestrip, delly, lumpy and pindel.
##### Optional arguments
`-r rb`: the refbundle directory necessary when using genomestrip
`-b INT`: size of batches (default: -1, to always make only 1 batch)
`--chromosomes CHRS`: list of chromosomes to study, space separated. Regex accepted (using the [python syntax](https://docs.python.org/3/library/re.html#regular-expression-syntax)). Default: all valid chromosomes of the reference
`--force-all-chromosomes`: ignore filtering if `--chromosomes` is not set
......
[global]
# batch system type: local, slurm or sge
# for genologin use slurm
batch_system_type = slurm
# list of modules to load (space separated)
modules =
# path environment to prepend (":" separated)
paths =
# number of concurrent jobs to launch
jobs = 100
# svtoolkit home: genomestrip is now installed via conda
# sv_dir =
[cluster]
# Ignore these options for local batch system type
# submission mode: drmaa or cluster
submission_mode = drmaa
# cluster submission command (ignore for DRMAA submission mode)
submission_command =
# DRMAA lib (ignore for cluster submission mode)
# For genologin use /tools/libraries/slurm-drmaa/slurm-drmaa-1.0.7/lib/libdrmaa.so
drmaa = /tools/libraries/slurm-drmaa/slurm-drmaa-1.0.7/lib/libdrmaa.so
# native submission commands: keep default on most cases
native_submission_options = ###DEFAULT###
# cluster config file
config = ###PROGRAM###/cluster.yaml
# enable drmaa parallelisation through DRMAA for Genomestrip (may cause exceed of number of concurrent jobs launched)
# True: enabled, False: disabled
sv_parallel_drmaa = True
sv_parallel_native_specs = ###DEFAULT###
[refbundle]
# Path to the Libraries folder extracted from RepBase archive file
# Optional (will use default libraries instead) but highly recommended :-)
repeatmasker_lib_path = /save/faraut/Libraries
......@@ -8,11 +8,12 @@ delly:
time: "36:00:00"
lumpy:
mem: 12
mem: 24
time: "36:00:00"
pindel:
mem: 16
time: "24:00:00"
time: "36:00:00"
genomestrip:
mem: 8
......@@ -46,6 +47,11 @@ repeatmask:
mem: 8
h_vmem: 8
buildpop:
time: "48:00:00"
mem: 8
h_vmem: 8
bed2fasta:
mem: 16
h_vmem: 16
......
......@@ -19,7 +19,7 @@ from subprocess import run
TOOLS = ["lumpy", "delly", "pindel", "genomestrip"]
VTYPES = ["DEL", "INV", "DUP", "mCNV"]
MIN_CHR_SIZE = 50000
MIN_CHR_SIZE = 500000
MD5_TEST = {
"ERR470101_f_1.fq.gz": "2c30600e303c9bfb7b9538ec76055203",
......@@ -437,6 +437,7 @@ class CnvPipeline:
:param reason: if True, show reason of each executed rule
:type reason: bool
"""
if self.isrerun:
run(["touch", self.config_file, "-d", self.config_timestamp])
if mode == "rerun" and self.wf_type == "detection" and len(snake_args) == 0:
......@@ -445,27 +446,31 @@ class CnvPipeline:
del local_vars["snake_args"]
return self.run_or_rerun_detection(**local_vars)
# Fix snakemake bug: make reference files timestamp to the config.yaml timestamp value
if (mode == "rerun" or self.isrerun) and (self.wf_type == "simulation" or self.wf_type == "refbundle"):
reference = os.path.join(self.wdir, "reference.fasta")
fixed_date = datetime.fromtimestamp(os.stat(self.config_file).st_ctime).strftime("%Y-%m-%d %H:%M:%S")
if os.path.exists(reference):
run(["touch", reference, "-d", fixed_date])
fai = reference + ".fai"
if os.path.exists(fai):
run(["touch", fai, "-d", fixed_date])
if self.wf_type == "simulation":
reference = os.path.join(self.wdir, "refbundle", "reference.fasta")
if os.path.exists(reference):
run(["touch", reference, "-d", fixed_date])
fai = reference + ".fai"
if os.path.exists(fai):
run(["touch", fai, "-d", fixed_date])
# if (mode == "rerun" or self.isrerun) and (self.wf_type == "simulation" or self.wf_type == "refbundle"):
# #reference = os.path.join(self.wdir, "reference.fasta")
# reference = os.path.join(self.wdir, "reference_raw.fasta")
# fixed_date = datetime.fromtimestamp(os.stat(self.config_file).st_ctime).strftime("%Y-%m-%d %H:%M:%S")
# if os.path.exists(reference):
# run(["touch", "-h", reference, "-d", fixed_date])
# fai = reference + ".fai"
# if os.path.exists(fai):
# run(["touch", "-h", fai, "-d", fixed_date])
#
# if self.wf_type == "simulation":
# reference = os.path.join(self.wdir, "refbundle", "reference.fasta")
# if os.path.exists(reference):
# run(["touch", reference, "-d", fixed_date])
# fai = reference + ".fai"
# if os.path.exists(fai):
# run(["touch", fai, "-d", fixed_date])
if merge_batches:
if self.wf_type != "detection":
self._fail("Merge batches can only be done on a detection workflow")
self._wf_type = "mergebatches"
try:
if not os.path.exists(self.config_file):
raise FileNotFoundError("File not found: %s" % self.config_file)
......@@ -561,14 +566,12 @@ class CnvPipeline:
do_delete_after_dryrun = self.delete_wdir_after_dryrun
for n_block in blocks:
sample_file = os.path.join(self.wdir, "samples.list.%d" % n_block)
with open(sample_file, "r") as smple_file:
samples_name = [x.rstrip() for x in smple_file if x.rstrip() != ""]
block_done = os.path.join(self.wdir, ".done%d" % n_block)
if not os.path.exists(block_done):
print("Launching block of samples number %d..." % n_block, flush=True)
print("Samples:", flush=True)
......@@ -1143,7 +1146,7 @@ class CnvPipeline:
self._fail("Indexing reference fasta file failed")
return False
elif not os.path.exists(final_fai):
self._fail("Working directory is corrupted: reference.fasta.fai not found in the existing working "
self._fail("Working directory is corrupted: reference_raw.fasta.fai not found in the existing working "
"directory")
return False
......@@ -1314,20 +1317,21 @@ class CnvPipeline:
raise ValueError("Type of workflow not set")
else:
raise ValueError("Type of workflow not supported: %s" % self.wf_type)
print("Cleaned!")
return 0
except ValueError as e:
if self.debug:
traceback.print_exc()
self._fail(str(e))
print("Cleaned!")
def soft_clean(self, fake=False):
"""
Clean only lof files
Clean only log files
:param fake: if True, only show deleted files without removing them
"""
print("Cleaning...")
logs = glob(os.path.join(self.wdir, "**", "logs"), recursive=True)
for log in logs:
if os.path.isdir(log):
......@@ -1368,6 +1372,7 @@ if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Run cnv pipelines")
subparsers = parser.add_subparsers(metavar="command")
subparsers.required = True
def check_min_size_0(value):
ivalue = int(value)
......@@ -1576,7 +1581,10 @@ if __name__ == "__main__":
args = parser.parse_args()
try:
func = args.func
except AttributeError:
parser.error("too few arguments use -h for help")
if args.func == "run_detection" and args.refbundle is None and "genomestrip" in args.tools:
print("Error: --refbundle option is required for genomestrip")
......
......@@ -135,7 +135,7 @@ class AppConfigReader:
batch_type = self._get_batch_system_type()
if batch_type == "slurm":
return (" --mem-per-cpu={cluster.mem}000 --mincpus={threads} --time={cluster.time} -J {cluster.name}"
" -N 1=1")
" -N 1")
elif batch_type == "sge":
return " -l mem={cluster.mem}G -l h_vmem={cluster.mem}G -pe parallel_smp {threads} -N {cluster.name}"
return None
......
source diff could not be displayed: it is too large. Options to address this: view the blob.
name: cnvpipelinetest
name: cnvpipeline
channels:
- anaconda
- conda-forge
......@@ -20,6 +20,7 @@ dependencies:
- exonerate>=2.4.0
- genomestrip>=2.0
- joblib>=0.13.2
- jupyter>=1.0.0
- numpy>=1.13.3
- openjdk>=8.0.121
- pandas>=0.22.0
......@@ -31,10 +32,12 @@ dependencies:
- python>=3.6.2
- pyvcf>=0.6.8
- pyyaml>=3.12
- r-base>=3.5.1
- scipy>=1.0.1
- snakemake>=3.13.3
- termcolor>=1.1.0
- trf>=4.09
- upsetplot>=0.3.0
- vcftools>=0.1.16
- pip>=19.1.1
- pip:
......
chmod u+x snakecnv/bin/*.py
chmod u+x popsim/build_variant_pop.py
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Subproject commit 3e4317eaa8e300b6cb8332dfad36d9f3519d2c01
Subproject commit d608a847a09da7362057e8fd101cb66cff72a34d
%% Cell type:raw id: tags:
<script>
function code_toggle() {
if (code_shown){
$('div.input').hide('500');
$('#toggleButton').val('Show Code')
} else {
$('div.input').show('500');
$('#toggleButton').val('Hide Code')
}
code_shown = !code_shown
}
$( document ).ready(function(){
code_shown=false;
$('div.input').hide()
});
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
%% Cell type:code id: tags:
``` python
import sys
import glob
import os
import json
from collections import defaultdict,Counter
import math
import pandas as pd
import plotly as py
import numpy as np
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# vcf parser and associated utilities
# VariantFile is the pysam vcf file handler returning a pysam vcf record
from pysam import VariantFile
from pybedtools import BedTool, create_interval_from_list
# For the http request to jvenn
from IPython.display import display, HTML
py.offline.init_notebook_mode(connected=True)
from os import listdir
from os.path import isfile, join
```
%% Output
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
# The rule file specifying the simulation protocol (size of variants and associated proportion)
rules = pd.read_csv("rules.sim", sep='\s+',header=None, names=['type','min_size','max_size','proportion','cum'])
rules[['type','min_size','max_size','proportion']]
```
%% Output
type min_size max_size proportion
0 DEL 100 500 0.2
1 DEL 500 1000 0.2
2 DEL 1000 2000 0.3
3 DEL 2000 10000 0.2
4 DEL 10000 50000 0.1
%% Cell type:code id: tags:
``` python
```
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""This script annotates a vcf file"""
from svrunner_utils import eprint
from svreader.annotation import AnnotateReader, AnnotateWriter
from svreader.annotation import redundancy_annotator
from svreader.annotation import get_connected_duplicates
from svreader.annotation import variant_filtration, set_supporting_tools
from svreader.annotation import rename_variants
def filtering_by_support(variant_set, minsupp_reads=3):
    """
    Flag low-support variants with the LOWSUPPORT filter.

    A variant is flagged when its maximal number of supporting reads
    across samples (the MAX_SUPP_READS info field) is below
    ``minsupp_reads``.

    :param variant_set: iterable of variant wrappers exposing
        ``record.info`` and ``filter.add``
    :param minsupp_reads: minimum number of supporting reads required
        to escape the LOWSUPPORT flag (default 3)
    """
    for var in variant_set:
        if var.record.info["MAX_SUPP_READS"] < minsupp_reads:
            var.filter.add("LOWSUPPORT")
def annotate(inputfile, outputfile, genotyper):
    """Annotate and filter the candidate CNVs of a vcf file.

    Filtering follows these criteria:
      - non duplicate sites
      - variant sites
      - call rate > 0.8
      - at least one variant (homozygous or heterozygote) has a
        genotype quality > 20
      - the variant is not everywhere heterozygote or homozygote
        (use NONVARIANTSCORE in both cases)
      - sv with no overlap with CNVR filtered out by genomeSTRIP

    :param inputfile: path to the genotyped vcf input file
    :param outputfile: path to the filtered/annotated vcf output file
    :param genotyper: name of the genotyper tool (forwarded to the
        redundancy annotator)
    """
    # Reading the vcf file
    variant_set = []
    eprint(" Reading file %s" % (inputfile))
    reader = AnnotateReader(inputfile)
    for record in reader:
        variant_set.append(record)
    eprint("Working with " + str(len(variant_set)) + " records")

    # Per-variant annotations: normalize filter tags, attach supporting
    # info and compute the QUAL value
    for variant in variant_set:
        # PASS, and '.' become both PASS
        variant.unify_pass_filtertag()
        variant.add_supporting_infos()
        variant.set_qual()

    # Redundancy annotation
    redundancy_annotator(variant_set, reader, overlap_cutoff=0.5,
                         genotyper=genotyper)

    # Now genomeSTRIP inspired variant filtration
    # (only meaningful when the vcf actually carries sample genotypes)
    if reader.numSamples() > 0:
        variant_filtration(variant_set, reader)

    # Now supporting info filtering
    filtering_by_support(variant_set)

    # Constructing connected component of duplicates
    get_connected_duplicates(variant_set)
    set_supporting_tools(variant_set)

    # Rename sv
    rename_variants(variant_set)

    # Write records sorted by start coordinate
    vcf_fout = AnnotateWriter(outputfile, reader)
    for record in sorted(variant_set, key=lambda k: k.start):
        vcf_fout.write(record)
    vcf_fout.close()
if __name__ == "__main__":
    import argparse

    # Command-line interface: input vcf, output vcf and genotyper tool
    # are all mandatory.
    cli = argparse.ArgumentParser(
        prog="filter.py",
        description="Filter by applying flags to variants")
    cli.add_argument("-i", "--input-vcf", required=True,
                     help="Genotypes vcf input file")
    cli.add_argument("-o", "--output-vcf", required=True,
                     help="Filtered vcf output file")
    cli.add_argument("-g", "--genotyper", required=True,
                     help="Genotyper tool")
    options = cli.parse_args()

    annotate(inputfile=options.input_vcf,
             outputfile=options.output_vcf,
             genotyper=options.genotyper)
This diff is collapsed.
../../popsim/build_results.py
\ No newline at end of file
This diff is collapsed.
../../popsim/build_variant_pop.py
\ No newline at end of file