detection.snk



import sys
import re
import os
import pysam

from os.path import join, isfile
from itertools import chain
import glob

from functions import get_samples

ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")

from svsnakemake_utils import SnakemakeUtils

size_batches = int(config["batches"] if "batches" in config else 1)

# TODO: raise the error only if genomestrip (or genomestrip genotype) is used)
if not "SV_DIR" in os.environ:
    print("SV_DIR environment variable must be filled!")
    exit(1)


def message(mes):
    sys.stderr.write("|--- " + mes + "\n")


def get_first_bam(samples):
    return "data/bams/%s.bam" % ([x for x in samples.values()][0][0])


def get_filterbed(config):
    if "filterbed" in config and os.isfile(config['filterbed']):
        return config['filterbed']
    elif (config['reference'].endswith("fasta")
            and os.path.exists(config['reference'][:-6] + ".Nstretch.bed")):
        return config['reference'][:-6] + ".Nstretch.bed"
    else:
        return None


def get_excluded(config):
    if "excludebed" in config and os.isfile(config['excludebed']):
        return config['excludebed']
    else:
        return ""


def get_all_samples(wdir):
    files = glob.glob(wdir + "/samples.list.*")
    samples = []
    for f in files:
        with open(f) as fin:
            for line in fin:
                samples.append(line.rstrip())
    return samples


def get_chr_batches(ref_file, chr, chr_batchsize=5000000, overlap=50000):

    fa = pysam.FastaFile(ref_file)
    len_chr = len(fa.fetch(chr))

    # Make ranges:
    groups = []
    start = 1
    end = min(start + chr_batchsize - 1,  len_chr)
    while (len_chr - end + 1 >= chr_batchsize - overlap) or (len_chr + 1 == end):
        groups.append((start, end))
        if len_chr + 1 == end:
            break
        start = end - overlap + 1
        end = min(start + chr_batchsize - 1, len_chr + 1)
    last_group = (start, len_chr + 1)
    if last_group not in groups:
        groups.append(last_group)

    return groups


shell.prefix("export PATH=%s/bin:\"$PATH\";" % ROOTPATH)

message("Starting")

FILTERBED = get_filterbed(config)
EXCLUDEBED = get_excluded(config)

REFERENCE = config['reference']
WDIR = config['wdir']
workdir: WDIR

EMPTY_TEMPLATE = os.path.join(ROOTPATH, "lib", "svreader", "resources", "template.vcf")

config['read_length'] = 100

# list of all samples
all_samples = get_all_samples(WDIR)

#  list of samples as a dictionnary with batch as keys
samples = get_samples(config['sample_file'], size_batches, config['start_batch_nb'] if 'start_batch_nb' in config else 1)

# list of tools
tools = config['tools']
# list of chromosomes
chromosomes = config["chromosomes"]
chr_batches = {}
ref_chr = "reference_raw.fasta" if not os.path.exists(REFERENCE) and os.path.exists("reference_raw.fasta") else REFERENCE
for chromosome in chromosomes:
    chr_batches[chromosome] = get_chr_batches(ref_chr,
                                              chromosome,
                                              chr_batchsize=5000000,
                                              overlap=50000)

# list of variants to be detected
varianttypes = ['DEL', 'INV', 'DUP', 'mCNV']

if "varianttypes" in config:
    varianttypes = config["varianttypes"]

snakemake_utils = SnakemakeUtils(tools, samples, chr_batches)

include: "tools/threads.snk"
include: "tools/modules.snk"

# Fill, for each tool, the produces variant types:
for tool in tools:
    sv_key = tool + "_sv"
    if sv_key in globals():
        snakemake_utils.set_tool_svtypes(tool, globals()[sv_key])
    else:
        snakemake_utils.set_tool_svtypes(tool, varianttypes)

localrules: all_detection, gendermap, gbamlist, bamlist


def set_all_inputs(wildcards):
    inputs = []
    if "mCNV" in varianttypes:
        inputs += expand("{batch}/cnvpipeline/{chrom}/cnvpipeline_{chrom}.vcf.gz",
                          chrom=chromosomes, batch=samples.keys())
    if len(varianttypes) > 1 or "mCNV" not in varianttypes:
        inputs += expand("{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
                          chrom=chromosomes,
                          svtype=[x for x in varianttypes if x != "mCNV"],
                          batch=samples.keys())
    return inputs


rule all_detection:
    input:
        set_all_inputs

rule filtering:
    input:
        genotypes = "{batch}/genotypes/{svtype}/svtyper_{chrom}_{svtype}_genotypes.vcf.gz"
    output:
        filtered = "{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz"
    wildcard_constraints:
        batch = "\w+"
    params:
        genotyper = "svtyper"
    log:
        stdout = "{batch}/logs/filter/{chrom}_{svtype}.o",
        stderr = "{batch}/logs/filter/{chrom}_{svtype}.e"
    threads:
        1
    shell:
        "filter.py -i {input.genotypes} -o {output.filtered} -g {params.genotyper} "
        "1>{log.stdout} 2>{log.stderr}"

rule merging:
    input:
        unpack(snakemake_utils.get_tools_for_svtype)
    output:
        "{batch}/merge/{svtype}/merge_{chrom}_{svtype}.vcf.gz"
    wildcard_constraints:
        batch = "\w+"
    params:
        reference = REFERENCE
    log:
        stdout = "{batch}/logs/merge/{chrom}_{svtype}.o",
        stderr = "{batch}/logs/merge/{chrom}_{svtype}.e"
    threads:
        1
    shell:
        "merge.py -o {output} -r {params.reference} -i {input} "
        "1>{log.stdout} 2>{log.stderr}"

rule parsing:
    input:
        unpack(snakemake_utils.getToolOuputFile),
        gaps = "exclusion/gaps.bed",
        reference = REFERENCE,
        fai = REFERENCE + ".fai"
    output:
        parseoutput = "{batch}/parse/{svtype}/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
    wildcard_constraints:
            batch = "\w+",
            tool = "\w+"
    log:
        stdout = "{batch}/logs/{tool}/parse_{chrom}_{svtype}.o",
        stderr = "{batch}/logs/{tool}/parse_{chrom}_{svtype}.e"
    threads:
        1
    shell:
        "parse.py -r {input.reference} -i {input.tooloutput} -g {input.gaps} "
        "-o {output.parseoutput} -t {wildcards.tool} -s {wildcards.svtype} "
        "-b {wildcards.batch} "
        "1>{log.stdout} 2>{log.stderr}"

if "no_detection_index" not in config or not config["no_detection_index"]:
    rule index:
        input:
            "data/bams/{sample}.bam"
        output:
            "data/bams/{sample}.bam.bai"
        threads:
            get_threads("index", 8)
        shell:
            "samtools index -@{threads} {input}"

rule mergeexcluded:
    input:
        expand("exclusion/excluded_{sample}.bed", sample=all_samples),
    output:
        "exclusion/excluded.bed"
    log:
        stdout = "logs/exclusion/exclusion.o",
        stderr = "logs/exclusion/exclusion.e"
    shell:
        "cat {input} | bedtools sort | bedtools merge > {output}"


rule excluded:
    input:
        bam = "data/bams/{sample}.bam",
        index = "data/bams/{sample}.bam.bai",
        reference = REFERENCE,
        fai = REFERENCE + ".fai"
    output:
        "exclusion/excluded_{sample}.bed"
    params:
        chromosomes = ",".join(chromosomes)
    threads:
        get_threads("excluded", 4)
    log:
        stdout = "logs/exclusion/exclusion_{sample}.o",
        stderr = "logs/exclusion/exclusion_{sample}.e"
    shell:
        "detect_regions_to_exclude.py -b {input.bam}"
        " -f {input.reference}"
        " -c {params.chromosomes}"
        " -o {output}"
        " -t {threads}"
        " 1>{log.stdout} 2>{log.stderr}"

rule nstretches_d:
    input:
        reference = REFERENCE,
        fai = REFERENCE + ".fai"
    output:
        gaps = "exclusion/gaps.bed"
    params:
        nstrech = config['maxNstretches'] if 'maxNstretches' in config else 100
    threads:
        get_threads("excluded", 8)
    log:
        stdout = "logs/exclusion/nstretch.o",
        stderr = "logs/exclusion/nstretch.e"
    shell:
        "findNstretches.py -f {input.reference} -o {output} -t {threads} -m {params.nstrech}"
        " 1>{log.stdout} 2>{log.stderr}"

rule bamlist:
    input:
        unpack(snakemake_utils.get_inputs_bams)
    output:
        bamlist = "{batch}/bamlist.list"
    wildcard_constraints:
        batch = "\w+"
    run:
        fout = open(output.bamlist, "w")
        for sample in samples[wildcards.batch]:
            bamfile = "data/bams/%s.bam" % sample
            fout.write(bamfile + "\n")
        fout.close()