import os
import sys
import glob
from os.path import join, isfile
from itertools import chain

from functions import get_samples
ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")
from svsnakemake_utils import SnakemakeUtils
size_batches = int(config.get("batches", 1))
# TODO: raise the error only if genomestrip (or genomestrip genotype) is used
if "SV_DIR" not in os.environ:
    # Assumption: fail early, since genomestrip needs SV_DIR; the original
    # error handling was not preserved in the source.
    raise ValueError("SV_DIR environment variable is not set (required by genomestrip)")


def message(mes):
    sys.stderr.write("|--- " + mes + "\n")
def get_first_bam(samples):
return "data/bams/%s.bam" % ([x for x in samples.values()][0][0])
def get_filterbed(config):
    if "filterbed" in config and isfile(config['filterbed']):
        return config['filterbed']
    elif (config['reference'].endswith("fasta")
          and os.path.exists(config['reference'][:-6] + ".Nstretch.bed")):
        return config['reference'][:-6] + ".Nstretch.bed"
    else:
        return None
def get_excluded(config):
    if "excludebed" in config and isfile(config['excludebed']):
        return config['excludebed']
    else:
        return ""
def get_all_samples(wdir):
files = glob.glob(wdir + "/samples.list.*")
samples = []
for f in files:
with open(f) as fin:
for line in fin:
samples.append(line.rstrip())
return samples
def get_chr_batches(ref_file, chr, chr_batchsize=5000000, overlap=50000):
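    """
    Split chromosome `chr` into (start, end) windows of chr_batchsize bp that
    overlap by `overlap` bp; the final window absorbs any short remainder and
    ends at len_chr + 1.

    Worked example (chromosome length assumed, not from the source): a 15 Mb
    chromosome with the defaults yields
    [(1, 5000000), (4950001, 9950000), (9900001, 15000001)].
    """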
    # Chromosome length, read from the fasta index (reconstructed: this
    # assumes a .fai file exists alongside ref_file).
    len_chr = None
    with open(ref_file + ".fai") as fai:
        for line in fai:
            fields = line.split("\t")
            if fields[0] == chr:
                len_chr = int(fields[1])
                break
    # Make ranges:
    groups = []
    start = 1
    end = min(start + chr_batchsize - 1, len_chr)
while (len_chr - end + 1 >= chr_batchsize - overlap) or (len_chr + 1 == end):
groups.append((start, end))
if len_chr + 1 == end:
break
start = end - overlap + 1
end = min(start + chr_batchsize - 1, len_chr + 1)
last_group = (start, len_chr + 1)
if last_group not in groups:
groups.append(last_group)
return groups
shell.prefix("export PATH=%s/bin:\"$PATH\";" % ROOTPATH)
FILTERBED = get_filterbed(config)
EXCLUDEBED = get_excluded(config)
REFERENCE = config['reference']
WDIR = config['wdir']
workdir: WDIR
EMPTY_TEMPLATE = os.path.join(ROOTPATH, "lib", "svreader", "resources", "template.vcf")
# list of all samples
all_samples = get_all_samples(WDIR)
# list of samples as a dictionary with batch as keys
samples = get_samples(config['sample_file'], size_batches,
                      config.get('start_batch_nb', 1))
tools = config['tools']
chromosomes = config["chromosomes"]
# fall back to the raw reference when the prepared one does not exist yet
ref_chr = REFERENCE
if not os.path.exists(REFERENCE) and os.path.exists("reference_raw.fasta"):
    ref_chr = "reference_raw.fasta"
# overlapping calling windows for every chromosome
chr_batches = {}
for chromosome in chromosomes:
    chr_batches[chromosome] = get_chr_batches(ref_chr,
                                              chromosome,
                                              chr_batchsize=5000000,
                                              overlap=50000)
# list of variants to be detected
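# (deletions, inversions, duplications and multi-allelic copy-number variants)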
varianttypes = ['DEL', 'INV', 'DUP', 'mCNV']
if "varianttypes" in config:
varianttypes = config["varianttypes"]
snakemake_utils = SnakemakeUtils(tools, samples, chr_batches)
include: "tools/threads.snk"
# Record, for each tool, the variant types it produces:
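# A tool's included .snk file may define a `<tool>_sv` list (e.g. a
# hypothetical `delly_sv = ["DEL", "INV"]`) to restrict it to a subset of
# variant types; tools without one default to the full varianttypes list.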
for tool in tools:
sv_key = tool + "_sv"
if sv_key in globals():
snakemake_utils.set_tool_svtypes(tool, globals()[sv_key])
else:
snakemake_utils.set_tool_svtypes(tool, varianttypes)
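
# bookkeeping rules cheap enough to run on the submission host instead of
# being dispatched as cluster jobs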
localrules: all_detection, gendermap, gbamlist, bamlist
def set_all_inputs(wildcards):
    inputs = []
    inputs += expand("{batch}/cnvpipeline/{chrom}/cnvpipeline_{chrom}.vcf.gz",
                     chrom=chromosomes, batch=samples.keys())
    if len(varianttypes) > 1 or "mCNV" not in varianttypes:
        inputs += expand("{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
                         chrom=chromosomes,
                         svtype=[x for x in varianttypes if x != "mCNV"],
                         batch=samples.keys())
    return inputs
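
# aggregate target collecting every final per-batch output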
rule all_detection:
input:
set_all_inputs
rule filtering:
    input:
        genotypes = "{batch}/genotypes/{svtype}/svtyper_{chrom}_{svtype}_genotypes.vcf.gz"
    output:
        filtered = "{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz"
    params:
        genotyper = "svtyper"
    log:
        stdout = "{batch}/logs/filter/{chrom}_{svtype}.o",
        stderr = "{batch}/logs/filter/{chrom}_{svtype}.e"
    threads:
        1
    shell:
        "filter.py -i {input.genotypes} -o {output.filtered} -g {params.genotyper} "
        "1>{log.stdout} 2>{log.stderr}"
# Merge the per-tool call sets for one chromosome and variant type.
# (rule header reconstructed; the original rule name was lost)
rule merging:
    input:
        unpack(snakemake_utils.get_tools_for_svtype)
    output:
        "{batch}/merge/{svtype}/merge_{chrom}_{svtype}.vcf.gz"
    params:
        reference = REFERENCE
    log:
        stdout = "{batch}/logs/merge/{chrom}_{svtype}.o",
        stderr = "{batch}/logs/merge/{chrom}_{svtype}.e"
    threads:
        1
    shell:
        "merge.py -o {output} -r {params.reference} -i {input} "
        "1>{log.stdout} 2>{log.stderr}"
# Normalise each tool's raw output into a standard VCF.
# (rule header, the reference input and the final log redirection were lost
# and are reconstructed here)
rule parsing:
    input:
        unpack(snakemake_utils.getToolOuputFile),
        reference = REFERENCE,
        gaps = "exclusion/gaps.bed",
        fai = REFERENCE + ".fai"
    output:
        parseoutput = "{batch}/parse/{svtype}/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
    log:
        stdout = "{batch}/logs/{tool}/parse_{chrom}_{svtype}.o",
        stderr = "{batch}/logs/{tool}/parse_{chrom}_{svtype}.e"
    threads:
        1
    shell:
        "parse.py -r {input.reference} -i {input.tooloutput} -g {input.gaps} "
        "-o {output.parseoutput} -t {wildcards.tool} -s {wildcards.svtype} "
        "1>{log.stdout} 2>{log.stderr}"
if "no_detection_index" not in config or not config["no_detection_index"]:
rule index:
input:
"data/bams/{sample}.bam"
output:
"data/bams/{sample}.bam.bai"
threads:
get_threads("index", 8)
shell:
"samtools index -@{threads} {input}"
rule mergeexcluded:
input:
expand("exclusion/excluded_{sample}.bed", sample=all_samples),
output:
"exclusion/excluded.bed"
log:
stdout = "logs/exclusion/exclusion.o",
stderr = "logs/exclusion/exclusion.e"
shell:
"cat {input} | bedtools sort | bedtools merge > {output}"
bam = "data/bams/{sample}.bam",
index = "data/bams/{sample}.bam.bai",
reference = REFERENCE,
fai = REFERENCE + ".fai"
chromosomes = ",".join(chromosomes)
stdout = "logs/exclusion/exclusion_{sample}.o",
stderr = "logs/exclusion/exclusion_{sample}.e"
"detect_regions_to_exclude.py -b {input.bam}"
" -f {input.reference}"
" -c {params.chromosomes}"
" -o {output}"
" -t {threads}"
" 1>{log.stdout} 2>{log.stderr}"
rule nstretches_d:
    input:
        reference = REFERENCE,
        fai = REFERENCE + ".fai"
    output:
        gaps = "exclusion/gaps.bed"
    params:
        nstretch = config.get('maxNstretches', 100)
    threads:
        get_threads("excluded", 8)
    log:
        stdout = "logs/exclusion/nstretch.o",
        stderr = "logs/exclusion/nstretch.e"
    shell:
        "findNstretches.py -f {input.reference} -o {output} -t {threads} -m {params.nstretch}"
        " 1>{log.stdout} 2>{log.stderr}"
rule bamlist:
input:
unpack(snakemake_utils.get_inputs_bams)
output:
bamlist = "{batch}/bamlist.list"
    run:
        with open(output.bamlist, "w") as fout:
            for sample in samples[wildcards.batch]:
                fout.write("data/bams/%s.bam\n" % sample)