Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • philippe.bardou/cnvpipelines
  • svdetection/cnvpipelines
2 results
Show changes
Commits on Source (100)
Showing
with 2856 additions and 3837 deletions
......@@ -5,3 +5,4 @@ application.properties
**/*.pyo
**/.snakemake
**/old/*
**/.ipynb_checkpoints
[submodule "snakecnv/lib"]
path = snakecnv/lib
url = git@forgemia.inra.fr:svdetection/svlib.git
branch=master
url = https://forgemia.inra.fr/svdetection/svlib.git
branch = master
[submodule "popsim"]
path = popsim
url = git@forgemia.inra.fr:svdetection/popsim.git
url = https://forgemia.inra.fr/svdetection/popsim.git
branch = master
......@@ -35,21 +35,22 @@ $ conda env create --file environment.yaml
#### Load new conda environment
```sh
source activate cnvpipeline
conda activate cnvpipeline
```
### Install for simulations
To do simulations, you need to compile pirs, which is included as submodule of your cnvpipeline installation. You must use a recent gcc compiler (for example on genologin, module load compiler/gcc-7.2.0).
Then cd in `cnvpipelines/popsim/pirs` and just make:
module load compiler/gcc-7.2.0 # for genologin users
make
## Configuration
Copy `application.properties.example` into `application.properties` and make appropriate changes.
Sections and parameters are described below.
Sections and parameters are described below (on the genologin server you can simply copy `application.properties.genologin` into `application.properties`).
### Global section
......@@ -71,12 +72,11 @@ This section must be filled only if you don't use local as batch system type (se
### Reference bundle section
* *repeatmasker_lib_path*: PATH to the RepBase Libraries folder for RepeatMasker, if needed by RepeatMasker installation
* *repeatmasker_lib_path*: PATH to the RepBase Libraries folder for RepeatMasker. This is not mandatory (see Install RepeatMasker Libraries in http://www.repeatmasker.org/RMDownload.html).
### Genotoul specific configuration
### Genotoul (genologin) specific configuration
* Use the delly module specified
* Ue the drmaa library specified
* Use the drmaa library specified
## Run
......@@ -153,7 +153,7 @@ With:
`tools`: list of tools, space separated. Choose among genomestrip, delly, lumpy and pindel.
##### Optional arguments
`-r rb`: the refbundle directory necessary when using genomestrip
`-b INT`: size of batches (default: -1, to always make only 1 batch)
`--chromosomes CHRS`: list of chromosomes to study, space separated. Regex accepted (using the [python syntax](https://docs.python.org/3/library/re.html#regular-expression-syntax)). Default: all valid chromosomes of the reference
`--force-all-chromosomes`: ignore filtering if `--chromosomes` is not set
......
[global]
# batch system type: local, slurm or sge
# for genologin use slurm
batch_system_type = slurm
# list of modules to load (space separated)
modules =
# path environment to prepend (":" separated)
paths =
# number of concurrent jobs to launch
jobs = 100
# svtoolkit home: genomestrip is now installed via conda
# sv_dir =
[cluster]
# Ignore these options for local batch system type
# submission mode: drmaa or cluster
submission_mode = drmaa
# cluster submission command (ignore for DRMAA submission mode)
submission_command =
# DRMAA lib (ignore for cluster submission mode)
# For genologin use /tools/libraries/slurm-drmaa/slurm-drmaa-1.0.7/lib/libdrmaa.so
drmaa = /tools/libraries/slurm-drmaa/slurm-drmaa-1.0.7/lib/libdrmaa.so
# native submission commands: keep default on most cases
native_submission_options = ###DEFAULT###
# cluster config file
config = ###PROGRAM###/cluster.yaml
# enable drmaa parallelisation through DRMAA for Genomestrip (may cause exceed of number of concurrent jobs launched)
# True: enabled, False: disabled
sv_parallel_drmaa = True
sv_parallel_native_specs = ###DEFAULT###
[refbundle]
# Path to the Libraries folder extracted from RepBase archive file
# Optional (will use default libraries instead) but highly recommended :-)
repeatmasker_lib_path = /save/faraut/Libraries
......@@ -8,11 +8,12 @@ delly:
time: "36:00:00"
lumpy:
mem: 12
mem: 24
time: "36:00:00"
pindel:
mem: 16
time: "24:00:00"
time: "36:00:00"
genomestrip:
mem: 8
......@@ -46,6 +47,11 @@ repeatmask:
mem: 8
h_vmem: 8
buildpop:
time: "48:00:00"
mem: 8
h_vmem: 8
bed2fasta:
mem: 16
h_vmem: 16
......
......@@ -19,7 +19,7 @@ from subprocess import run
TOOLS = ["lumpy", "delly", "pindel", "genomestrip"]
VTYPES = ["DEL", "INV", "DUP", "mCNV"]
MIN_CHR_SIZE = 50000
MIN_CHR_SIZE = 500000
MD5_TEST = {
"ERR470101_f_1.fq.gz": "2c30600e303c9bfb7b9538ec76055203",
......@@ -437,6 +437,7 @@ class CnvPipeline:
:param reason: if True, show reason of each executed rule
:type reason: bool
"""
if self.isrerun:
run(["touch", self.config_file, "-d", self.config_timestamp])
if mode == "rerun" and self.wf_type == "detection" and len(snake_args) == 0:
......@@ -445,27 +446,31 @@ class CnvPipeline:
del local_vars["snake_args"]
return self.run_or_rerun_detection(**local_vars)
# Fix snakemake bug: make reference files timestamp to the config.yaml timestamp value
if (mode == "rerun" or self.isrerun) and (self.wf_type == "simulation" or self.wf_type == "refbundle"):
reference = os.path.join(self.wdir, "reference.fasta")
fixed_date = datetime.fromtimestamp(os.stat(self.config_file).st_ctime).strftime("%Y-%m-%d %H:%M:%S")
if os.path.exists(reference):
run(["touch", reference, "-d", fixed_date])
fai = reference + ".fai"
if os.path.exists(fai):
run(["touch", fai, "-d", fixed_date])
if self.wf_type == "simulation":
reference = os.path.join(self.wdir, "refbundle", "reference.fasta")
if os.path.exists(reference):
run(["touch", reference, "-d", fixed_date])
fai = reference + ".fai"
if os.path.exists(fai):
run(["touch", fai, "-d", fixed_date])
# if (mode == "rerun" or self.isrerun) and (self.wf_type == "simulation" or self.wf_type == "refbundle"):
# #reference = os.path.join(self.wdir, "reference.fasta")
# reference = os.path.join(self.wdir, "reference_raw.fasta")
# fixed_date = datetime.fromtimestamp(os.stat(self.config_file).st_ctime).strftime("%Y-%m-%d %H:%M:%S")
# if os.path.exists(reference):
# run(["touch", "-h", reference, "-d", fixed_date])
# fai = reference + ".fai"
# if os.path.exists(fai):
# run(["touch", "-h", fai, "-d", fixed_date])
#
# if self.wf_type == "simulation":
# reference = os.path.join(self.wdir, "refbundle", "reference.fasta")
# if os.path.exists(reference):
# run(["touch", reference, "-d", fixed_date])
# fai = reference + ".fai"
# if os.path.exists(fai):
# run(["touch", fai, "-d", fixed_date])
if merge_batches:
if self.wf_type != "detection":
self._fail("Merge batches can only be done on a detection workflow")
self._wf_type = "mergebatches"
try:
if not os.path.exists(self.config_file):
raise FileNotFoundError("File not found: %s" % self.config_file)
......@@ -561,14 +566,12 @@ class CnvPipeline:
do_delete_after_dryrun = self.delete_wdir_after_dryrun
for n_block in blocks:
sample_file = os.path.join(self.wdir, "samples.list.%d" % n_block)
with open(sample_file, "r") as smple_file:
samples_name = [x.rstrip() for x in smple_file if x.rstrip() != ""]
block_done = os.path.join(self.wdir, ".done%d" % n_block)
if not os.path.exists(block_done):
print("Launching block of samples number %d..." % n_block, flush=True)
print("Samples:", flush=True)
......@@ -1143,7 +1146,7 @@ class CnvPipeline:
self._fail("Indexing reference fasta file failed")
return False
elif not os.path.exists(final_fai):
self._fail("Working directory is corrupted: reference.fasta.fai not found in the existing working "
self._fail("Working directory is corrupted: reference_raw.fasta.fai not found in the existing working "
"directory")
return False
......@@ -1314,20 +1317,21 @@ class CnvPipeline:
raise ValueError("Type of workflow not set")
else:
raise ValueError("Type of workflow not supported: %s" % self.wf_type)
print("Cleaned!")
return 0
except ValueError as e:
if self.debug:
traceback.print_exc()
self._fail(str(e))
print("Cleaned!")
def soft_clean(self, fake=False):
"""
Clean only lof files
Clean only log files
:param fake: if True, only show deleted files without removing them
"""
print("Cleaning...")
logs = glob(os.path.join(self.wdir, "**", "logs"), recursive=True)
for log in logs:
if os.path.isdir(log):
......@@ -1368,6 +1372,7 @@ if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Run cnv pipelines")
subparsers = parser.add_subparsers(metavar="command")
subparsers.required = True
def check_min_size_0(value):
ivalue = int(value)
......@@ -1576,7 +1581,10 @@ if __name__ == "__main__":
args = parser.parse_args()
try:
func = args.func
except AttributeError:
parser.error("too few arguments use -h for help")
if args.func == "run_detection" and args.refbundle is None and "genomestrip" in args.tools:
print("Error: --refbundle option is required for genomestrip")
......
......@@ -135,7 +135,7 @@ class AppConfigReader:
batch_type = self._get_batch_system_type()
if batch_type == "slurm":
return (" --mem-per-cpu={cluster.mem}000 --mincpus={threads} --time={cluster.time} -J {cluster.name}"
" -N 1=1")
" -N 1")
elif batch_type == "sge":
return " -l mem={cluster.mem}G -l h_vmem={cluster.mem}G -pe parallel_smp {threads} -N {cluster.name}"
return None
......
source diff could not be displayed: it is too large. Options to address this: view the blob.
name: cnvpipelinetest
name: cnvpipeline
channels:
- anaconda
- conda-forge
......@@ -20,6 +20,7 @@ dependencies:
- exonerate>=2.4.0
- genomestrip>=2.0
- joblib>=0.13.2
- jupyter>=1.0.0
- numpy>=1.13.3
- openjdk>=8.0.121
- pandas>=0.22.0
......@@ -31,10 +32,12 @@ dependencies:
- python>=3.6.2
- pyvcf>=0.6.8
- pyyaml>=3.12
- r-base>=3.5.1
- scipy>=1.0.1
- snakemake>=3.13.3
- termcolor>=1.1.0
- trf>=4.09
- upsetplot>=0.3.0
- vcftools>=0.1.16
- pip>=19.1.1
- pip:
......
chmod u+x snakecnv/bin/*.py
chmod u+x popsim/build_variant_pop.py
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Subproject commit 3e4317eaa8e300b6cb8332dfad36d9f3519d2c01
Subproject commit d608a847a09da7362057e8fd101cb66cff72a34d
%% Cell type:raw id: tags:
<script>
function code_toggle() {
if (code_shown){
$('div.input').hide('500');
$('#toggleButton').val('Show Code')
} else {
$('div.input').show('500');
$('#toggleButton').val('Hide Code')
}
code_shown = !code_shown
}
$( document ).ready(function(){
code_shown=false;
$('div.input').hide()
});
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
%% Cell type:code id: tags:
``` python
import sys
import glob
import os
import json
from collections import defaultdict,Counter
import math
import pandas as pd
import plotly as py
import numpy as np
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
# vcf parser and associated utilities
# VariantFile is the pysam vcf file handler returning a pysam vcf record
from pysam import VariantFile
from pybedtools import BedTool, create_interval_from_list
# For the http request to jvenn
from IPython.display import display, HTML
py.offline.init_notebook_mode(connected=True)
from os import listdir
from os.path import isfile, join
```
%% Output
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
# The rule file specifying the simulation protocol (size of variants and associated proportion)
rules = pd.read_csv("rules.sim", sep='\s+',header=None, names=['type','min_size','max_size','proportion','cum'])
rules[['type','min_size','max_size','proportion']]
```
%% Output
type min_size max_size proportion
0 DEL 100 500 0.2
1 DEL 500 1000 0.2
2 DEL 1000 2000 0.3
3 DEL 2000 10000 0.2
4 DEL 10000 50000 0.1
%% Cell type:code id: tags:
``` python
```
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""This script annotates a vcf file"""
from svrunner_utils import eprint
from svreader.annotation import AnnotateReader, AnnotateWriter
from svreader.annotation import redundancy_annotator
from svreader.annotation import get_connected_duplicates
from svreader.annotation import variant_filtration, set_supporting_tools
from svreader.annotation import rename_variants
def filtering_by_support(variant_set, minsupp_reads=3):
    """
    Flag low-support variants with the LOWSUPPORT filter.

    A variant is flagged when its maximal number of supporting reads
    across samples (the MAX_SUPP_READS info field) is below
    ``minsupp_reads``.

    :param variant_set: iterable of variant wrappers exposing
        ``record.info`` and ``filter.add``
    :param minsupp_reads: minimum number of supporting reads required
        to escape the LOWSUPPORT flag (default 3)
    """
    for var in variant_set:
        if var.record.info["MAX_SUPP_READS"] < minsupp_reads:
            var.filter.add("LOWSUPPORT")
def annotate(inputfile, outputfile, genotyper):
    """Annotate and filter the candidate CNVs of a vcf file.

    Filtering follows these criteria:
      - non duplicate sites
      - variant sites
      - call rate > 0.8
      - at least one variant (homozygous or heterozygote) has a
        genotype quality > 20
      - the variant is not everywhere heterozygote or homozygote
        (use NONVARIANTSCORE in both cases)
      - sv with no overlap with CNVR filtered out by genomeSTRIP

    :param inputfile: path to the genotyped vcf input file
    :param outputfile: path to the filtered/annotated vcf output file
    :param genotyper: name of the genotyper tool (forwarded to the
        redundancy annotator)
    """
    # Reading the vcf file
    variant_set = []
    eprint(" Reading file %s" % (inputfile))
    reader = AnnotateReader(inputfile)
    for record in reader:
        variant_set.append(record)
    eprint("Working with " + str(len(variant_set)) + " records")

    # Per-variant annotations: normalize filter tags, attach supporting
    # info and compute the QUAL value
    for variant in variant_set:
        # PASS, and '.' become both PASS
        variant.unify_pass_filtertag()
        variant.add_supporting_infos()
        variant.set_qual()

    # Redundancy annotation
    redundancy_annotator(variant_set, reader, overlap_cutoff=0.5,
                         genotyper=genotyper)

    # Now genomeSTRIP inspired variant filtration
    # (only meaningful when the vcf actually carries sample genotypes)
    if reader.numSamples() > 0:
        variant_filtration(variant_set, reader)

    # Now supporting info filtering
    filtering_by_support(variant_set)

    # Constructing connected component of duplicates
    get_connected_duplicates(variant_set)
    set_supporting_tools(variant_set)

    # Rename sv
    rename_variants(variant_set)

    # Write records sorted by start coordinate
    vcf_fout = AnnotateWriter(outputfile, reader)
    for record in sorted(variant_set, key=lambda k: k.start):
        vcf_fout.write(record)
    vcf_fout.close()
if __name__ == "__main__":
    import argparse

    # Command-line interface: input vcf, output vcf and genotyper tool
    # are all mandatory.
    cli = argparse.ArgumentParser(
        prog="filter.py",
        description="Filter by applying flags to variants")
    cli.add_argument("-i", "--input-vcf", required=True,
                     help="Genotypes vcf input file")
    cli.add_argument("-o", "--output-vcf", required=True,
                     help="Filtered vcf output file")
    cli.add_argument("-g", "--genotyper", required=True,
                     help="Genotyper tool")
    options = cli.parse_args()

    annotate(inputfile=options.input_vcf,
             outputfile=options.output_vcf,
             genotyper=options.genotyper)
This diff is collapsed.
../../popsim/build_results.py
\ No newline at end of file
This diff is collapsed.
../../popsim/build_variant_pop.py
\ No newline at end of file