Skip to content
Snippets Groups Projects
Commit eceb3c6b authored by Thomas Faraut's avatar Thomas Faraut
Browse files

added parsing merging and genotyping

parent 3c092b4a
No related branches found
No related tags found
No related merge requests found
Showing
with 590 additions and 89 deletions
......@@ -51,7 +51,7 @@ def valid_chromosome(chrom, regexp_chrom):
p = re.compile(regexp_chrom)
return p.match(chrom) is not None
ROOTPATH = "/home/faraut/dynawork/CapriSV/pipeline/capricnv/snakewf/playground/cnvdetection"
ROOTPATH = "/home/faraut/dynawork/CapriSV/pipeline/cnvpipelines/snakecnv"
shell.prefix("export PATH=%s/bin:%s/soft/lumpy/bin:\"$PATH\";" % (ROOTPATH,ROOTPATH))
......@@ -79,6 +79,30 @@ rule all:
chrom=chromosomes,
tool=tools)
rule merging:
input:
expand("{tool}/{{chrom}}/{{svtype}}/{tool}_{{chrom}}_{{svtype}}_parsed.vcf.gz",
tool=tools)
output:
"merge/{chrom}/{svtype}/merge_{chrom}_{svtype}.vcf.gz"
params:
outdir = "merge",
reference = REFERENCE
script:
"bin/merge.py"
rule parsing:
input:
unpack(getToolOuputFile),
reference = REFERENCE
output:
touch("{tool}/{chrom}/{svtype}/parsing.done"),
parseoutput = "{tool}/{chrom}/{svtype}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
script:
# Parse for the specific tool, the
# specified variants and the specified chrom
"bin/parse.py"
rule extract:
input:
bam = "data/bams/{sample}.bam"
......@@ -89,15 +113,5 @@ rule extract:
" -r {wildcards.chrom}"
" -o data/bams/{wildcards.chrom}"
rule parsing:
input:
getToolOuputFile
output: touch("{tool}/{chrom}/{svtype}/parsing.done")
shell:
"echo {input}"
# script:
# Parse for the specific tool, the
# specified variants and the specified chrom
# rundetectionscript.py
include: "tools/modules.snk"
#!/bin/bash
# If you adapt this script for your own use, you will need to set these two variables based on your environment.
# SV_DIR is the installation directory for SVToolkit - it must be an exported environment variable.
# SV_TMPDIR is a directory for writing temp files, which may be large if you have a large data set.
export SV_DIR=/home/faraut/save/Softwares/svtoolkit
echo "---"
echo "Running CNVPipeline.sh $1 $2 $3 $4 $5"
echo "---"
myname=`basename "$0"`
function usage() {
echo "
Program: geneomstrip cnvpipeline wrapper
usage: $myname <bamlist> <genome> <metadata> <gendermap> <outdir> <outprefix> <intervallist>
positional args:
bamlist a file specifying the bamfiles
genome the geome fasta file
metadata the metada directory created by preprocess
gendermap file containing the declared gender for each sample.
outdir the output dir
outprefix prefix of the output file
intervallist Interval(s) or .list for which intervals to process (
see
http://software.broadinstitute.org/software/genomestrip/org_broadinstitute_sv_qscript_CNVDiscoveryPipeline.html
for a complete description
"
}
bamlist=$1
genome=$2
metadata=$3
gendermap=$4
outdir=$5
outprefix=$6
intervallist=$7
if [[ -z ${bamlist} || -z ${genome} || -z ${metadata} || -z ${gendermap} || -z ${outdir} || -z ${outprefix} || -z ${intervallist} ]]
then
usage
echo "At least one argument is missing"
exit 1
fi
if [ ! -d "${metadata}" ] ; then
echo "Directory ${metadata} absent ... exiting"
exit 1
fi
inputFile=$bamlist
reference_prefix=${genome%.fasta}
runDir=${outdir}/${outprefix}_run
mkdir -p ${runDir}/logs || exit 1
# For SVAltAlign, you must use the version of bwa compatible with Genome STRiP.
export PATH=${SV_DIR}/bwa:${PATH}
export LD_LIBRARY_PATH=${SV_DIR}/bwa:${LD_LIBRARY_PATH}
export LD_LIBRARY_PATH=/SGE/ogs/lib/linux-x64:$LD_LIBRARY_PATH
mx="-Xmx4g"
classpath="${SV_DIR}/lib/SVToolkit.jar:${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar:${SV_DIR}/lib/gatk/Queue.jar"
echo "-- Run CNVPipeline -- "
LC_ALL=C java -cp ${classpath} ${mx} \
org.broadinstitute.gatk.queue.QCommandLine \
-S ${SV_DIR}/qscript/discovery/cnv/CNVDiscoveryPipeline.q \
-S ${SV_DIR}/qscript/SVQScript.q \
-cp ${classpath} \
-gatk ${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar \
-configFile ${SV_DIR}/conf/genstrip_parameters.txt \
--disableJobReport \
-jobProject Capri \
-jobRunner Drmaa \
-gatkJobRunner Drmaa \
-jobQueue workq \
-jobNative '-l mem=16G -l h_vmem=16G' \
-R ${reference_prefix}.fasta \
-genderMaskBedFile ${reference_prefix}.gendermask.bed \
-genomeMaskFile ${reference_prefix}.svmask.fasta \
-genderMapFile ${gendermap} \
-ploidyMapFile ${reference_prefix}.ploidymap.txt \
-md ${metadata} \
-runDirectory ${runDir} \
-jobLogDir ${runDir}/logs \
-I ${inputFile} \
-intervalList $intervallist \
-tilingWindowSize 5000 \
-tilingWindowOverlap 2500 \
-maximumReferenceGapLength 2500 \
-boundaryPrecision 200 \
-minimumRefinedLength 2500 \
-run \
|| exit 1
sites=${runDir}/results/gs_cnv.genotypes.vcf.gz
genotypes=${outdir}/${outprefix}.vcf.gz
# Run genotyping on the discovered sites.
LC_ALL=C java -Xmx4g -cp ${classpath} ${mx} \
org.broadinstitute.sv.apps.GenerateHaploidCNVGenotypes \
-R ${reference_prefix}.fasta \
-ploidyMapFile ${reference_prefix}.ploidymap.txt \
-genderMapFile ${gendermap} \
-vcf ${sites} \
-O ${genotypes} \
-estimateAlleleFrequencies true \
-genotypeLikelihoodThreshold 0.001 || exit 1
echo "Script CNVPipeline completed successfully at "`date +%Y-%m-%d`
......@@ -86,7 +86,7 @@ def main(args):
"wb",
header=new_header
) as outf:
for read in inf.fetch(chrom, start, end):
for read in inf.fetch(contig=chrom, start=start, stop=end):
if read.mate_is_unmapped:
read.reference_id = 0
read.next_reference_id = 0
......
#!/bin/bash
# If you adapt this script for your own use, you will need to set these two variables based on your environment.
# SV_DIR is the installation directory for SVToolkit - it must be an exported environment variable.
# SV_TMPDIR is a directory for writing temp files, which may be large if you have a large data set.
export SV_DIR=/usr/local/bioinfo/src/genomeSTRIP/svtoolkit
echo "---"
echo "Running gstrip_genotyper.sh $1 $2 $3 $4 $5 $6 $7 $8 $9"
echo "---"
myname=`basename "$0"`
function usage() {
echo "
Program: SVGenotyper wrapper
usage: $myname <bamlist> <chrom> <genome> <metadata> <rundir> <gendermap> <sites> genotypes> [<ncpus>]
positional args:
bamlist a file with the bamfiles
chrom specific chromosome
genome the genome fasta file (the genomestrip reference genome fasta file)
metadata directory with genomestrip metada (generated by SVpreprocess)
rundir directory for auwiliarry data for variant filtering
gendermap the gendermap file for genomestrip
sites vcf file with sv to genotype
genotypes vcf file with genotypes (output file)
optional args:
cpus the number of cpus [1]
"
}
bamlist=$1
chrom=$2
genome=$3
metadata=$4
rundir=$5
gendermap=$6
sites=$7
genotypes=$8
ncpus=${9:-1}
if [[ -z "$bamlist" || -z "$chrom" || -z "$genome" || -z "$metadata" || -z "$rundir" || -z "$gendermap" || -z "$sites" || -z "$genotypes" ]]
then
usage
echo "At least one argument is missing"
exit 1
fi
inputFile=$bamlist
reference_prefix=${genome%.fasta}
if [[ ${bamlist: -5} != ".list" ]]; then
echo "bamlist file must have the extension .list ... exiting"
exit
fi
# Now testing the existence of directories and files
if [[ ! -d ${metadata} ]]; then
echo "Directory metadata ${metadata} is missing ... exiting"
exit 1
fi
if [[ ! -s ${sites} ]]; then
echo "${sites} shoud be an existing non empty file ... exiting"
exit 1
fi
if [[ ! -s ${gendermap} ]]; then
echo "${gendermap} shoud be an existing non empty file ... exiting"
exit 1
fi
runDir=${rundir}
SV_TMPDIR=${runDir}/tmp
mkdir -p ${runDir}/logs || exit 1
mkdir -p ${SV_TMPDIR}
# For SVAltAlign, you must use the version of bwa compatible with Genome STRiP.
export PATH=${SV_DIR}/bwa:${PATH}
export LD_LIBRARY_PATH=${SV_DIR}/bwa:/SGE/ogs/lib/linux-x64:${LD_LIBRARY_PATH}
mx="-Xmx4g"
classpath="${SV_DIR}/lib/SVToolkit.jar:${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar:${SV_DIR}/lib/gatk/Queue.jar"
# Run genotyping on the discovered sites.
LC_ALL=C java -cp ${classpath} ${mx} \
org.broadinstitute.gatk.queue.QCommandLine \
-S ${SV_DIR}/qscript/SVGenotyper.q \
-S ${SV_DIR}/qscript/SVQScript.q \
-gatk ${SV_DIR}/lib/gatk/GenomeAnalysisTK.jar \
--disableJobReport \
-cp ${classpath} \
-jobProject Capri \
-jobRunner Drmaa \
-gatkJobRunner Drmaa \
-jobQueue workq \
-jobNative "-l mem=24G -l h_vmem=24G -pe parallel_smp $ncpus" \
-parallelJobs $ncpus \
-configFile ${SV_DIR}/conf/genstrip_parameters.txt \
-tempDir ${SV_TMPDIR} \
-R ${reference_prefix}.fasta \
-genderMaskBedFile ${reference_prefix}.gendermask.bed \
-genomeMaskFile ${reference_prefix}.svmask.fasta \
-genderMapFile ${gendermap} \
-runDirectory ${runDir} \
-md ${metadata} \
-disableGATKTraversal \
-jobLogDir ${runDir}/logs \
-L ${chrom} \
-I ${inputFile} \
-vcf ${sites} \
-O ${genotypes} \
-run \
|| exit 1
#bgzip -f ${genotypes}
#tabix -f -p vcf ${genotypes}.gz
#mv ${genotypes} ${SV_TMPDIR}/patchgenomestripvcf
#if [[ ${genotypes: -4} != ".gz" ]]; then
# bcftools view -O z ${SV_TMPDIR}/patchgenomestripvcf > ${genotypes}
#else
# bcftools view ${SV_TMPDIR}/patchgenomestripvcf > ${genotypes}
#fi
rm -fr ${SV_TMPDIR}
echo "Script SVGenotyping completed successfully at "`date +%c`
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage:
Snakemake script
"""
import os
import tempfile
import shutil
from subprocess import call
from svrunner_utils import addContigInfosTovcf
def concatenate(vcf_files, outputfile, reference, mergedir, chrom, svtype):
tempdir = tempfile.mkdtemp(dir=mergedir)
vcfFileslist = tempdir + "/" + "vcffile.list"
with open(vcfFileslist, "w") as outfh:
for gfile in vcf_files:
vcf_sample_dropped_file = tempdir + "/" + os.path.basename(gfile)
cmd = "bcftools view --drop-genotypes " + gfile + \
" | vcf-sort -c | bgzip -c >" + vcf_sample_dropped_file
cmd += " && tabix -p vcf " + vcf_sample_dropped_file
call(cmd, shell=True)
outfh.write(vcf_sample_dropped_file + "\n")
temp_outputfile = tempdir + "/" + "merge.vcf"
cmd = "bcftools concat -f " + vcfFileslist + \
" -a | vcf-sort -c > " + temp_outputfile
call(cmd, shell=True)
# drop format info from header
temp_outputfile_newheader = tempdir + "/" + "merge_newheader.vcf"
cmd = "bcftools view -h " + temp_outputfile + \
" | grep -v '^##FORMAT' > " + tempdir + "/newheader ;"
cmd += " bcftools reheader -h " + tempdir + "/newheader " + \
temp_outputfile + " > " + temp_outputfile_newheader
call(cmd, shell=True)
# Correct contig infos here using bcftools annotate
addContigInfosTovcf(temp_outputfile_newheader, outputfile, reference)
shutil.rmtree(tempdir)
def merging(inputfiles, outputfile, reference, outdir, chrom, svtype):
print("\n".join(["\t".join(inputfiles), outputfile, outdir, chrom, svtype]))
concatenate(inputfiles, outputfile, outdir, reference,
chrom, svtype)
merging(snakemake.input,
snakemake.output[0],
snakemake.params['outdir'],
snakemake.params['reference'],
snakemake.wildcards['svtype'],
snakemake.wildcards['svtype']
)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage:
Snakemake script
"""
import os
from pysam import Fastafile, tabix_index
from svrunner_utils import eprint
from svrunner_utils import get_contigs, sorting, eprint
from svreader.genomestrip import GenomeSTRIPReader, GenomeSTRIPWriter
from svreader.delly import DellyReader, DellyWriter
from svreader.lumpy import LumpyReader, LumpyWriter
from svreader.pindel import PindelReader, PindelWriter
tool_to_reader = {"pindel": PindelReader, "delly": DellyReader,
"lumpy": LumpyReader, "genomestrip": GenomeSTRIPReader}
tool_to_writer = {"pindel": PindelWriter, "delly": DellyWriter,
"lumpy": LumpyWriter, "genomestrip": GenomeSTRIPWriter}
def convert_svtool_to_vcf(reference, inputfile, outputfile,
toolname, svtype,
minlen=100, maxlen=5000000,
max_overlap=0.4, doFilter=False,
filterbed=None):
# The reference handle
reference_handle = Fastafile(reference) if reference else None
reference_contigs = get_contigs(reference)
SVReader = tool_to_reader[toolname](inputfile,
reference_handle=reference_handle,
svs_to_report=[svtype])
SVWriter = tool_to_writer[toolname](outputfile,
reference_contigs,
SVReader)
# The records
eprint("Now reading the records")
records = []
for record in SVReader:
records.append(record)
eprint("%d records" % (len(records)))
# Merging records (necessary for delly inversions)
records = SVReader.bnd_merge(svtype, records)
# Now filtering the records
if doFilter and 0:
eprint("Now filtering")
records = SVReader.filtering(records, max_overlap, minlen, maxlen,
filterbed)
# Sorting the vcf records
records = sorting(records, reference_contigs)
eprint("Now writing the records")
for record in records:
SVWriter.write(record)
SVWriter.close()
eprint("%d records" % (len(records)))
tabix_index(outputfile, force=True, preset='vcf')
def parsing(reference, tooloutput, parseoutput, tool, svtype):
print("\n".join([reference, tooloutput, parseoutput, tool, svtype]))
convert_svtool_to_vcf(reference, tooloutput, parseoutput,
tool, svtype)
parsing(snakemake.input['reference'],
snakemake.input['tooloutput'],
snakemake.output['parseoutput'],
snakemake.wildcards['tool'],
snakemake.wildcards['svtype'])
#!/bin/bash
# see http://redsymbol.net/articles/unofficial-bash-strict-mode/
set -euo pipefail
echo "---"
echo "Running svtyper.sh $1 $2 $3"
echo "Starting "`date +%Y-%m-%d`" at "`date +%H:%M:%S`
echo "---"
svtyper=svtyper
root=/home/faraut/work/CapriSV/src/SVPipeline
export PATH=$root/bin:$PATH
PYTHONPATH=""
myname=`basename "$0"`
function usage() {
echo "
Program: lumpy wrapper
usage: $myname [options] <bamlist> <input> <outprefix>
positional args:
bamlist a file with the bamfiles
input the input vcf file (gzipped or not)
outprefix prefix of the output file
ncpus the number of cpus, optional [1]
"
}
get_bams( ) {
bams=""
for bamfile in $@; do
bams=$bams$bamfile","
done
bams=${bams%?}
echo $bams
}
function join_by {
local IFS="$1";
shift;
echo "$*"; }
bamlist=$1
input=$2
outprefix=$3
ncpus=${4:-1} # If parameter is unset or null, the expansion of word is substituted (see https://www.gnu.org/software/bash/manual/bashref.html#Shell-Parameter-Expansion)
if [[ -z ${bamlist} || -z ${input} || -z ${outprefix} ]]
then
usage
echo "At least one argument is missing"
exit 1
fi
mapfile -t bamarray < ${bamlist}
bams=`join_by ',' "${bamarray[@]}"`
origdir=`pwd`
tempdir=`mktemp -d -p .`
cd $tempdir
echo "Working in $tempdir"
# genotype the Structural variants (and parallelize the job)
zless ${input} | head -n 10000 | grep "^#" > header
zless ${input} | grep -v "^#" | split -l 100 - temp_
for i in temp_*;
do cat header $i > $i.raw.vcf
rm -f $i;done
ls *.raw.vcf | parallel --no-notice -P ${ncpus} ${svtyper} -i {} -B ${bams} '>' {.}.output
rm -f lumpy_temp_*.raw.vcf
vcf-concat *.raw.output | vcf-sort > ${outprefix}.vcf
bgzip -f ${outprefix}.vcf
tabix -f -p vcf ${outprefix}.vcf.gz
cd $origdir
rm -fr $tempdir
echo "Script lumpy.sh completed successfully "`date +%Y-%m-%d`" at "`date +%H:%M:%S`
......@@ -116,7 +116,7 @@ class GenomeSTRIPRecord(SVRecord):
super(GenomeSTRIPRecord, self).__init__(genomestrip_name,
record.chrom,
record.pos,
record.info['END'],
record.stop,
record.id,
ref=record.ref,
qual=record.qual,
......
......@@ -68,7 +68,7 @@ class LumpyRecord(SVRecord):
super(LumpyRecord, self).__init__(lumpy_name,
record.chrom,
record.pos,
record.info['END'],
record.stop,
record.id,
ref=record.ref,
qual=record.qual,
......@@ -107,7 +107,7 @@ class LumpyRecord(SVRecord):
else:
return 0
def variantSample(self, sample):
def variantSampleOld(self, sample):
if (max(self.sv[sample]["AS"]) > 0 or
max(self.sv[sample]["AP"]) or
max(self.sv[sample]["ASC"]) > 0):
......@@ -115,6 +115,13 @@ class LumpyRecord(SVRecord):
else:
return False
def variantSample(self, sample):
if self.sv[sample]["SU"] > 0:
return True
else:
return False
def variantSamples(self):
variant_samples = []
for sample in self.sv:
......
import os
import sys
from pysam import Fastafile, tabix_index
from svrunner_utils import get_contigs, sorting, eprint
from svreader.genomestrip import GenomeSTRIPReader, GenomeSTRIPWriter
from svreader.delly import DellyReader, DellyWriter
from svreader.lumpy import LumpyReader, LumpyWriter
from svreader.pindel import PindelReader, PindelWriter
tool_to_reader = {"pindel": PindelReader, "delly": DellyReader,
"lumpy": LumpyReader, "genomestrip": GenomeSTRIPReader}
tool_to_writer = {"pindel": PindelWriter, "delly": DellyWriter,
"lumpy": LumpyWriter, "genomestrip": GenomeSTRIPWriter}
def get_tool_prefix(tool, chrom, svtype):
common_prefix = "_".join([tool, chrom])
......@@ -24,13 +11,13 @@ def get_tool_prefix(tool, chrom, svtype):
tool_suffix = get_lumpy_suffix(svtype)
elif tool == "delly":
tool_suffix = get_delly_suffix(svtype)
else:
tool_suffix = svtype + "_" + ".vcf.gz"
return "_".join([common_prefix, tool_suffix])
elif tool == "genomestrip":
tool_suffix = get_genomestrip_suffix(svtype)
return common_prefix + tool_suffix
def get_delly_suffix(svtype):
return svtype + "_" + ".bcf"
return "_" + svtype + ".bcf"
def get_lumpy_suffix(svtype):
......@@ -53,57 +40,11 @@ def get_pindel_suffix(svtype):
if svtype not in SV_TYPE_TO_PINDEL:
raise KeyError('Unsupported variant type')
pindel_type = SV_TYPE_TO_PINDEL[svtype]
outfile = pindel_type + ".gz"
outfile = "_" + pindel_type + ".gz"
return outfile
def getToolOuputFile(wildcards):
tool_prefix = get_tool_prefix(wildcards.tool,
wildcards.chrom, wildcards.svtype)
return os.path.join(wildcards.tool, wildcards.chrom, tool_prefix)
def convert_svtool_to_vcf(self, inputfile, outputfile,
toolname, svtype, params,
minlen=100, maxlen=5000000,
max_overlap=0.4, doFilter=False):
reference = self._Param('reference')
# The reference handle
reference_handle = Fastafile(reference) if reference else None
reference_contigs = get_contigs(reference)
SVReader = tool_to_reader[toolname](inputfile,
reference_handle=reference_handle,
svs_to_report=[svtype])
SVWriter = tool_to_writer[toolname](outputfile,
reference_contigs,
SVReader)
# The records
eprint("Now reading the records")
records = []
for record in SVReader:
records.append(record)
eprint("%d records" % (len(records)))
# Merging records (necessary for delly inversions)
records = SVReader.bnd_merge(svtype, records)
# Now filtering the records
if doFilter:
eprint("Now filtering")
records = SVReader.filtering(records, max_overlap, minlen, maxlen,
params["filterBed"])
# Sorting the vcf records
records = sorting(records, reference_contigs)
eprint("Now writing the records")
for record in records:
SVWriter.write(record)
SVWriter.close()
eprint("%d records" % (len(records)))
tabix_index(outputfile, force=True, preset='vcf')
return {'tooloutput': os.path.join(wildcards.tool, wildcards.chrom, tool_prefix)}
LUMPY_HOME=/home/faraut/dynawork/CapriSV/pipeline/capricnv/snakewf/playground/cnvdetection/soft/lumpy
LUMPY_HOME=/home/faraut/dynawork/CapriSV/pipeline/cnvpipelines/snakecnv/soft/lumpy/lumpy
LUMPY=$LUMPY_HOME/bin/lumpy
SAMBLASTER=samblaster
......
def intervallist(reference, chrom, output):
chrom_length = getChromLength(reference, chrom)
chrlistFh = open(output, "w")
chrlistFh.write(chrom + ":" + str(1) + "-" + str(chrom_length) + "\n")
chrlistFh.close()
rule intervallist:
input:
reference = REFERENCE
output:
intervallist = "cnvpipeline/{chrom}/{chrom}.list"
run:
intervallist(input.reference, wildcards.chrom, output.intervallist)
rule cnvpipeline:
input:
bams = expand("data/bams/{{chrom}}/{sample}.bam", sample=samples),
bamlist = "genomestrip/bamlist.list",
gendermap = "genomestrip/gendermap.txt",
metadatadone = "genomestrip/metadata/sample_gender.report.txt",
genome = REFERENCE
params:
metadata = "genomestrip/metadata"
output:
touch("genomestrip/{chrom}/done")
threads: 2
shell:
"genomestrip.sh {input.bamlist} {wildcards.chrom} {input.genome}"
" {params.metadata} {input.gendermap} genomestrip/{wildcards.chrom}"
" genomestrip_{wildcards.chrom}"
......@@ -5,18 +5,18 @@ rule delly:
genome = REFERENCE
output:
touch("delly/{chrom}/done"),
expand("delly/{{chrom}}/cnv_{{chrom}}_{svtype}.bcf", svtype=['INV', 'DEL', 'DUP']),
expand("delly/{{chrom}}/cnv_{{chrom}}_{svtype}_raw.bcf", svtype=['INV', 'DEL', 'DUP'])
expand("delly/{{chrom}}/delly_{{chrom}}_{svtype}.bcf", svtype=['INV', 'DEL', 'DUP']),
expand("delly/{{chrom}}/delly_{{chrom}}_{svtype}_raw.bcf", svtype=['INV', 'DEL', 'DUP'])
threads: 2
shell:
"""
for svtype in INV DEL DUP;
do
delly call -g {input.genome} -t ${{svtype}} \
-o delly/{wildcards.chrom}/cnv_{wildcards.chrom}_${{svtype}}_raw.bcf \
-o delly/{wildcards.chrom}/delly_{wildcards.chrom}_${{svtype}}_raw.bcf \
{input.bams}
delly filter -f germline -m 50 -g {input.genome} -t ${{svtype}} \
-o delly/{wildcards.chrom}/cnv_{wildcards.chrom}_${{svtype}}.bcf \
delly/{wildcards.chrom}/cnv_{wildcards.chrom}_${{svtype}}_raw.bcf
-o delly/{wildcards.chrom}/delly_{wildcards.chrom}_${{svtype}}.bcf \
delly/{wildcards.chrom}/delly_{wildcards.chrom}_${{svtype}}_raw.bcf
done
"""
......@@ -40,12 +40,14 @@ rule genomestrip:
bams = expand("data/bams/{{chrom}}/{sample}.bam", sample=samples),
bamlist = "genomestrip/bamlist.list",
gendermap = "genomestrip/gendermap.txt",
metadata = "genomestrip/metadata",
metadatadone = "genomestrip/metadata/sample_gender.report.txt",
genome = REFERENCE
params:
metadata = "genomestrip/metadata"
output:
touch("genomestrip/{chrom}/done")
threads: 2
shell:
"genomestrip.sh {input.bamlist} {wildcards.chrom} {input.genome}"
" {input.metadata} {input.gendermap} genomestrip/{wildcards.chrom}"
" {params.metadata} {input.gendermap} genomestrip/{wildcards.chrom}"
" genomestrip_{wildcards.chrom}"
rule svtyping:
input:
sites = "merge/{chrom}/DEL/merge_{chrom}_DEL.vcf.gz",
bamlist = "genomestrip/bamlist.list",
genome = REFERENCE
output:
vcf = "genotypes/{chrom}/DEL/svtyper_{chrom}_DEL_genotypes.vcf.gz"
params:
outputprefix = "genotypes/{chrom}/DEL/svtyper_{chrom}_DEL_genotypes"
threads: 4
shell:
"svtyper.sh {input.bamlist} {input.sites} {params.outputprefix}"
" {threads}"
rule genotyping:
input:
sites = "merge/{chrom}/DEL/merge_{chrom}_DEL.vcf.gz",
bamlist = "genomestrip/bamlist.list",
gendermap = "genomestrip/gendermap.txt",
metadatadone = "genomestrip/metadata/sample_gender.report.txt",
genome = REFERENCE
params:
metadata = "genomestrip/metadata"
output:
vcf = "genotypes/{chrom}/DEL/genomestrip_{chrom}_DEL_genotypes.vcf.gz"
threads: 2
shell:
"gstrip_genotyper.sh {input.bamlist} {wildcards.chrom} {input.genome}"
" {params.metadata} genomestrip/{wildcards.chrom} {input.gendermap} "
" {input.sites} {output.vcf}"
......@@ -16,6 +16,7 @@ rule exclude:
printf "%s\t%s\t%s" {wildcards.chrom} "0" "1" > {output.excluded}
fi
"""
rule lumpy:
input:
bams = expand("data/bams/{{chrom}}/{sample}.bam", sample=samples),
......
......@@ -4,3 +4,5 @@ include: "delly.snk"
include: "lumpy.snk"
include: "pindel.snk"
include: "genomestrip.snk"
include: "genotyping.snk"
include: "cnvpipeline.snk"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment