import os
import re
import sys

from pyfaidx import Fasta
# from natsort import natsorted
ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")
if "SV_DIR" not in os.environ:
raise Exception("SV_DIR environment variable must be filled!")
CLASSPATH = SV_DIR + "/lib/SVToolkit.jar" + ":" + SV_DIR + "/lib/gatk/GenomeAnalysisTK.jar"
workdir: WDIR
# Always set the environment
SUBDIR = config["rb_subdir"] + os.path.sep if "rb_subdir" in config else ""
SUFF_TYPES = ("gcmask", "svmask", "lcmask")
p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*')
p_repeats_all = re.compile('.*(Satellite|Simple_repeat|LINE|SINE|LTR).*')
CHROMS = config['chromosomes']
shell.prefix("export PATH=%s/bin:\"$PATH\"; export PERL5LIB=%s; " % (ROOTPATH, PERL5LIB))
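
# Final targets of the reference bundle: indexed (masked) fastas, read-depth mask, dictionary, ploidy map, gender mask and N-stretch BED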
SUBDIR + PREFIX+".fasta.fai",
SUBDIR + PREFIX+".gcmask.fasta.fai",
SUBDIR + PREFIX+".svmask.fasta.fai",
SUBDIR + PREFIX+".lcmask.fasta.fai",
SUBDIR + PREFIX+".rdmask.bed",
SUBDIR + PREFIX+".dict",
SUBDIR + PREFIX+".ploidymap.txt",
SUBDIR + PREFIX+".gendermask.bed",
SUBDIR + PREFIX+".Nstretch.bed"
fasta=SUBDIR + PREFIX+"_raw.fasta",
fai=SUBDIR + PREFIX+"_raw.fasta.fai"
output:
SUBDIR + PREFIX+".fasta"
params:
chromosomes=CHROMS
shell:
"samtools faidx {input.fasta} {params.chromosomes} > {output}"
# run:
# from Bio import SeqIO
# with open(input[0], "rU") as handle, open(output[0], "w") as f:
# for seq in SeqIO.parse(handle, "fasta"):
# if seq.id in CHROMS:
# SeqIO.write([seq], f, "fasta")
# os.remove(input[0])
# i_fai = input[0] + ".fai"
# if os.path.exists(i_fai):
# os.remove(i_fai)
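
# Build the .fai index for each generated fasta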
SUBDIR + "{type}.fasta"
SUBDIR + "{type}.fasta.fai"
rule fastalength:  # name assumed
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + "reference.fasta.length"
    shell:
        "fastalength {input.fasta} > {output}"
rule bwaindex:  # name assumed
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        seq = SUBDIR + "work/reference.fasta",
        index = SUBDIR + "work/reference.fasta.bwt"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        "ln {input.fasta} {output.seq}; "
        # Checking genome size for bwa index
        #"genomesize=`cat {output.seq}.fai | awk '{ totsize = totsize +$0}END{ print totsize}'`; "
"bwa index -a bwtsw {output.seq} 2> {log} || bwa index -a is {output.seq} 2> {log}"
expand(SUBDIR + "work/svmask_{chrom}.fasta", chrom=CHROMS)
SUBDIR + "reference.svmask.fasta"
shell:
"cat {input} > {output}"
rule chromsvmask:
    input:
        SUBDIR + "work/reference.fasta.bwt",
        fa = SUBDIR + "work/reference.fasta"
    output:
        SUBDIR + "work/svmask_{chrom}.fasta"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        "java -cp {CLASSPATH} -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask "
        " -R {input.fa} -O {params.subdir}work/svmask_{wildcards.chrom}.fasta -readLength {params.rl}"
        " -sequence {wildcards.chrom} 2> {log} "
rule bed2fasta:
    input:
        SUBDIR + PREFIX+".fasta.fai",
        ref = SUBDIR + PREFIX+".fasta",
        bed = SUBDIR + "{type}.bed"
    output:
        SUBDIR + "{type}.fasta"
    wildcard_constraints:
        type = ".*(lc|sv|gc)mask"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        " java -cp {CLASSPATH} -Xmx4g "
        " org.broadinstitute.sv.apps.BedFileToGenomeMask "
        " -I {input.bed} "
        " -R {input.ref} "
        " -O {output}"
rule repeatmask:
    input:
        fasta = SUBDIR + "reference.fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + "reference.fasta.out"
    threads:
        get_threads("repeatmask", 12)
    shell:
        """
        export PERL5LIB={params.perl5lib}
        RepeatMasker -pa {threads} -species {params.sp} -xsmall {input.fasta} > {input.fasta}.repeatmasker.log
        """
rule rdmask:  # name assumed
    input:
        length = SUBDIR + PREFIX+".fasta.length"
    output:
        bed = SUBDIR + PREFIX+".rdmask.bed"
    run:
        with open(input.length) as f, open(output.bed, "w") as fout:
            fout.write("\t".join(["CHROM", "START", "END"])+"\n")
            for line in f:
                line = line.rstrip()
                fields = line.split()
                fout.write("\t".join([fields[1], "0", str(int(fields[0])-1)]) + "\n")
rule lcmask:  # name assumed
    input:
        repeats = SUBDIR + PREFIX+".fasta.out"
    output:
        bed = SUBDIR + PREFIX+".lcmask.bed"
    run:
        repeats = []
        with open(input.repeats) as f:
            for line in f:
                if p_repeats.match(line):
                    fields = line.rstrip().split()
                    # append chrom start and end
                    repeats.append([fields[4], fields[5], fields[6]])
        sorted_repeats = sorted(repeats,
                                key=lambda x: (x[0], int(x[1]), int(x[2])))
        # Write to file
        import csv
        with open(output.bed, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerows(sorted_repeats)
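
# GC mask: every repeat class matched by p_repeats_all, written as a sorted BED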
repeats = SUBDIR + "reference.fasta.out"
bed = SUBDIR + "reference.gcmask.bed"
with open(input.repeats) as f:
for line in f:
if p_repeats_all.match(line):
fields = line.rstrip().split()
repeats.append([fields[4], fields[5], fields[6]])
sorted_repeats = sorted(repeats,
key=lambda x: (x[0], int(x[1]), int(x[2])))
# Write to file
import csv
with open(output.bed, 'w') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerows(sorted_repeats)
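
# Picard sequence dictionary for the reference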
rule dictionary:  # name assumed
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + PREFIX+".dict"
    shell:
        "picard CreateSequenceDictionary "
        " R= {input.fasta} "
        " O= {output} "
rule nstretches:
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + PREFIX+".Nstretch.bed"
    params:
        nstrech = config['maxNstretches']
    threads:
        1
    shell:
        "findNstretches.py -f {input.fasta} -m {params.nstrech} "
        "-o {output} -t {threads}"
SUBDIR + PREFIX+".ploidymap.txt"
shell:
" printf \"*\t*\t*\t*\t2\n\" > {output}"
rule gendermask:
    output:
        SUBDIR + PREFIX+".gendermask.bed"