Skip to content
Snippets Groups Projects
detection.snk 8.35 KiB
Newer Older
import sys
import re
import os
import pysam

from os.path import join, isfile
from itertools import chain
import glob

from functions import get_samples
ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")
from svsnakemake_utils import SnakemakeUtils
Floreal Cabanettes's avatar
Floreal Cabanettes committed
size_batches = int(config["batches"] if "batches" in config else 1)
# TODO: raise the error only if genomestrip (or genomestrip genotype) is used)
if not "SV_DIR" in os.environ:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    print("SV_DIR environment variable must be filled!")

def message(mes):
    sys.stderr.write("|--- " + mes + "\n")


def get_first_bam(samples):
    return "data/bams/%s.bam" % ([x for x in samples.values()][0][0])


Thomas Faraut's avatar
Thomas Faraut committed
def get_filterbed(config):
    if "filterbed" in config and os.isfile(config['filterbed']):
        return config['filterbed']
    elif (config['reference'].endswith("fasta")
            and os.path.exists(config['reference'][:-6] + ".Nstretch.bed")):
        return config['reference'][:-6] + ".Nstretch.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    else:
        return None


def get_excluded(config):
    if "excludebed" in config and os.isfile(config['excludebed']):
        return config['excludebed']
    else:
        return ""

def get_all_samples(wdir):
    files = glob.glob(wdir + "/samples.list.*")
    samples = []
    for f in files:
        with open(f) as fin:
            for line in fin:
                samples.append(line.rstrip())
    return samples


def get_chr_batches(ref_file, chr, chr_batchsize=5000000, overlap=50000):
    fa = pysam.FastaFile(ref_file)
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    len_chr = len(fa.fetch(chr))

    # Make ranges:
    groups = []
    start = 1
    end = min(start + chr_batchsize - 1,  len_chr)
    while (len_chr - end + 1 >= chr_batchsize - overlap) or (len_chr + 1 == end):
        groups.append((start, end))
        if len_chr + 1 == end:
            break
        start = end - overlap + 1
        end = min(start + chr_batchsize - 1, len_chr + 1)
    last_group = (start, len_chr + 1)
    if last_group not in groups:
        groups.append(last_group)

    return groups

shell.prefix("export PATH=%s/bin:\"$PATH\";" % ROOTPATH)
Thomas Faraut's avatar
Thomas Faraut committed
FILTERBED = get_filterbed(config)
EXCLUDEBED = get_excluded(config)

REFERENCE = config['reference']
WDIR = config['wdir']
workdir: WDIR

EMPTY_TEMPLATE = os.path.join(ROOTPATH, "lib", "svreader", "resources", "template.vcf")

Thomas Faraut's avatar
Thomas Faraut committed
config['read_length'] = 100

# list of all samples
all_samples = get_all_samples(WDIR)

#  list of samples as a dictionnary with batch as keys
samples = get_samples(config['sample_file'], size_batches, config['start_batch_nb'] if 'start_batch_nb' in config else 1)
# list of chromosomes
chromosomes = config["chromosomes"]
chr_batches = {}
Floreal Cabanettes's avatar
Floreal Cabanettes committed
ref_chr = "reference_raw.fasta" if not os.path.exists(REFERENCE) and os.path.exists("reference_raw.fasta") else REFERENCE
for chromosome in chromosomes:
    chr_batches[chromosome] = get_chr_batches(ref_chr,
                                              chromosome,
                                              chr_batchsize=5000000,
                                              overlap=50000)
# list of variants to be detected
varianttypes = ['DEL', 'INV', 'DUP', 'mCNV']

if "varianttypes" in config:
    varianttypes = config["varianttypes"]
snakemake_utils = SnakemakeUtils(tools, samples, chr_batches)
include: "tools/threads.snk"
include: "tools/modules.snk"
# Fill, for each tool, the produces variant types:
for tool in tools:
    sv_key = tool + "_sv"
    if sv_key in globals():
        snakemake_utils.set_tool_svtypes(tool, globals()[sv_key])
    else:
        snakemake_utils.set_tool_svtypes(tool, varianttypes)

localrules: all_detection, gendermap, gbamlist, bamlist
def set_all_inputs(wildcards):
    inputs = []
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    if "mCNV" in varianttypes:
        inputs += expand("{batch}/cnvpipeline/{chrom}/cnvpipeline_{chrom}.vcf.gz",
                          chrom=chromosomes, batch=samples.keys())
    if len(varianttypes) > 1 or "mCNV" not in varianttypes:
        inputs += expand("{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz",
                          svtype=[x for x in varianttypes if x != "mCNV"],
                          batch=samples.keys())
rule all_detection:
    input:
        set_all_inputs

        genotypes = "{batch}/genotypes/{svtype}/svtyper_{chrom}_{svtype}_genotypes.vcf.gz"
        filtered = "{batch}/filtered/{svtype}/svtyper_{chrom}_{svtype}_genotypes_filtered.vcf.gz"
    wildcard_constraints:
        batch = "\w+"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    log:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        stdout = "{batch}/logs/filter/{chrom}_{svtype}.o",
        stderr = "{batch}/logs/filter/{chrom}_{svtype}.e"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        "filter.py -i {input.genotypes} -o {output.filtered} -g {params.genotyper} "
        "1>{log.stdout} 2>{log.stderr}"
rule merging:
    input:
        unpack(snakemake_utils.get_tools_for_svtype)
        "{batch}/merge/{svtype}/merge_{chrom}_{svtype}.vcf.gz"
    wildcard_constraints:
        batch = "\w+"
    params:
        reference = REFERENCE
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    log:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        stdout = "{batch}/logs/merge/{chrom}_{svtype}.o",
        stderr = "{batch}/logs/merge/{chrom}_{svtype}.e"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        "merge.py -o {output} -r {params.reference} -i {input} "
        "1>{log.stdout} 2>{log.stderr}"

rule parsing:
    input:
        unpack(snakemake_utils.getToolOuputFile),
        gaps = "exclusion/gaps.bed",
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        reference = REFERENCE,
        fai = REFERENCE + ".fai"
Thomas Faraut's avatar
Thomas Faraut committed
        parseoutput = "{batch}/parse/{svtype}/{tool}/{tool}_{chrom}_{svtype}_parsed.vcf.gz"
    wildcard_constraints:
            batch = "\w+",
            tool = "\w+"
Floreal Cabanettes's avatar
Floreal Cabanettes committed
    log:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        stdout = "{batch}/logs/{tool}/parse_{chrom}_{svtype}.o",
        stderr = "{batch}/logs/{tool}/parse_{chrom}_{svtype}.e"
        "parse.py -r {input.reference} -i {input.tooloutput} -g {input.gaps} "
        "-o {output.parseoutput} -t {wildcards.tool} -s {wildcards.svtype} "
        "-b {wildcards.batch} "
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        "1>{log.stdout} 2>{log.stderr}"
if "no_detection_index" not in config or not config["no_detection_index"]:
    rule index:
        input:
            "data/bams/{sample}.bam"
        output:
            "data/bams/{sample}.bam.bai"
        threads:
            get_threads("index", 8)
        shell:
            "samtools index -@{threads} {input}"
rule mergeexcluded:
    input:
        expand("exclusion/excluded_{sample}.bed", sample=all_samples),
    output:
        "exclusion/excluded.bed"
    log:
        stdout = "logs/exclusion/exclusion.o",
        stderr = "logs/exclusion/exclusion.e"
    shell:
        "cat {input} | bedtools sort | bedtools merge > {output}"


        bam = "data/bams/{sample}.bam",
        index = "data/bams/{sample}.bam.bai",
        reference = REFERENCE,
        fai = REFERENCE + ".fai"
        "exclusion/excluded_{sample}.bed"
        chromosomes = ",".join(chromosomes)
        get_threads("excluded", 4)
        stdout = "logs/exclusion/exclusion_{sample}.o",
        stderr = "logs/exclusion/exclusion_{sample}.e"
        "detect_regions_to_exclude.py -b {input.bam}"
        " -f {input.reference}"
        " -c {params.chromosomes}"
        " -o {output}"
        " -t {threads}"
        " 1>{log.stdout} 2>{log.stderr}"

        reference = REFERENCE,
        fai = REFERENCE + ".fai"
    output:
        gaps = "exclusion/gaps.bed"
    params:
        nstrech = config['maxNstretches'] if 'maxNstretches' in config else 100
    log:
        stdout = "logs/exclusion/nstretch.o",
        stderr = "logs/exclusion/nstretch.e"
        "findNstretches.py -f {input.reference} -o {output} -t {threads} -m {params.nstrech}"
        " 1>{log.stdout} 2>{log.stderr}"

rule bamlist:
    input:
        unpack(snakemake_utils.get_inputs_bams)
    output:
        bamlist = "{batch}/bamlist.list"
    wildcard_constraints:
        batch = "\w+"
    run:
        fout = open(output.bamlist, "w")
        for sample in samples[wildcards.batch]:
            bamfile = "data/bams/%s.bam" % sample
            fout.write(bamfile + "\n")
        fout.close()