Skip to content
Snippets Groups Projects
referencebundle.snk 7.7 KiB
Newer Older
Thomas Faraut's avatar
Thomas Faraut committed
import sys
import re
Thomas Faraut's avatar
Thomas Faraut committed

from os.path import join

from pyfaidx import Fasta
# from natsort import natsorted

Thomas Faraut's avatar
Thomas Faraut committed
include: "tools/utils.snk"

Floreal Cabanettes's avatar
Floreal Cabanettes committed
ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")

if "SV_DIR" not in os.environ:
    raise Exception("SV_DIR environment variable must be filled!")

SV_DIR = os.environ['SV_DIR']
CLASSPATH = SV_DIR + "/lib/SVToolkit.jar" + ":" + SV_DIR + "/lib/gatk/GenomeAnalysisTK.jar"
WDIR = config['wdir']
Thomas Faraut's avatar
Thomas Faraut committed

workdir: WDIR

# Always set the environment

PREFIX = "reference"
SUBDIR = config["rb_subdir"] + os.path.sep if "rb_subdir" in config else ""
Thomas Faraut's avatar
Thomas Faraut committed
SUFF_TYPES = ("gcmask", "svmask", "lcmask")

p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*')
p_repeats_all = re.compile('.*(Satellite|Simple_repeat|Line|SINE|LTR).*')

CHROMS = config['chromosomes']
Thomas Faraut's avatar
Thomas Faraut committed
PERL5LIB = get_perl5lib()
Thomas Faraut's avatar
Thomas Faraut committed
shell.prefix("export PATH=%s/bin:\"$PATH\"; export PERL5LIB=%s; " % (ROOTPATH, PERL5LIB))
Thomas Faraut's avatar
Thomas Faraut committed
rule final:
    input:
        SUBDIR + PREFIX+".fasta.fai",
        SUBDIR + PREFIX+".gcmask.fasta.fai",
        SUBDIR + PREFIX+".svmask.fasta.fai",
        SUBDIR + PREFIX+".lcmask.fasta.fai",
        SUBDIR + PREFIX+".rdmask.bed",
        SUBDIR + PREFIX+".dict",
        SUBDIR + PREFIX+".ploidymap.txt",
        SUBDIR + PREFIX+".gendermask.bed",
        SUBDIR + PREFIX+".Nstretch.bed"
rule get_fasta:
    input:
Thomas Faraut's avatar
Thomas Faraut committed
        fasta=SUBDIR + PREFIX+"_raw.fasta",
        fai=SUBDIR + PREFIX+"_raw.fasta.fai"
    output:
        SUBDIR + PREFIX+".fasta"
Thomas Faraut's avatar
Thomas Faraut committed
    params:
        chromosomes=CHROMS
    shell:
        "samtools faidx {input.fasta} {params.chromosomes} > {output}"
    # run:
    #     from Bio import SeqIO
    #     with open(input[0], "rU") as handle, open(output[0], "w") as f:
    #         for seq in SeqIO.parse(handle, "fasta"):
    #             if seq.id in CHROMS:
    #                 SeqIO.write([seq], f, "fasta")
    #     os.remove(input[0])
    #     i_fai = input[0] + ".fai"
    #     if os.path.exists(i_fai):
    #         os.remove(i_fai)
Thomas Faraut's avatar
Thomas Faraut committed
rule index:
    input:
        SUBDIR + "{type}.fasta"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        SUBDIR + "{type}.fasta.fai"
Thomas Faraut's avatar
Thomas Faraut committed
    log:
Thomas Faraut's avatar
Thomas Faraut committed
        SUBDIR + "logs/index/{type}.log"
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        "samtools faidx {input} &> {log}"
Thomas Faraut's avatar
Thomas Faraut committed

rule length:
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        SUBDIR + "reference.fasta.length"
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        "fastalength {input.fasta} > {output}"
Thomas Faraut's avatar
Thomas Faraut committed

rule preparesvmask:
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        seq = SUBDIR + "work/reference.fasta",
        index = SUBDIR + "work/reference.fasta.bwt"
Thomas Faraut's avatar
Thomas Faraut committed
    log:
        "logs/preparesvmask.log"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        "ln {input.fasta} {output.seq}; "
Thomas Faraut's avatar
Thomas Faraut committed
        "samtools faidx {output.seq}; "
Thomas Faraut's avatar
Thomas Faraut committed
        # Checking genome size for bwa index
        #"genomesize=`cat {output.seq}.fai | awk '{ totsize = totsize +$0}END{ print totsize}'`; "
        "bwa index -a bwtsw {output.seq} 2> {log} || bwa index -a is {output.seq} 2> {log}"
Thomas Faraut's avatar
Thomas Faraut committed

rule mergesvmask:
    input:
        expand(SUBDIR + "work/svmask_{chrom}.fasta", chrom=CHROMS)
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        SUBDIR + "reference.svmask.fasta"
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        "cat {input} > {output}"

rule chromsvmask:
    input:
        SUBDIR + "work/reference.fasta.bwt",
        fa = SUBDIR + "work/reference.fasta"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        SUBDIR + "work/svmask_{chrom}.fasta"
Thomas Faraut's avatar
Thomas Faraut committed
    params:
Floreal Cabanettes's avatar
Floreal Cabanettes committed
        rl = config['read_len'],
        subdir = SUBDIR
Thomas Faraut's avatar
Thomas Faraut committed
    log:
        "logs/svmask/{chrom}.log"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
Thomas Faraut's avatar
Thomas Faraut committed
        "java -cp {CLASSPATH} -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask "
        "  -R {input.fa} -O {params.subdir}work/svmask_{wildcards.chrom}.fasta -readLength {params.rl}"
Thomas Faraut's avatar
Thomas Faraut committed
        " -sequence {wildcards.chrom} 2> {log} "

rule bed2fasta:
    input:
        SUBDIR + PREFIX+".fasta.fai",
        ref = SUBDIR + PREFIX+".fasta",
        bed = SUBDIR + "{type}.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
Thomas Faraut's avatar
Thomas Faraut committed
    wildcard_constraints:
        type = ".*(lc|sv|gc)mask"
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
Thomas Faraut's avatar
Thomas Faraut committed
        " java -cp {CLASSPATH} -Xmx4g "
        "     org.broadinstitute.sv.apps.BedFileToGenomeMask "
        "    -I {input.bed} "
        "    -R {input.ref}  "
        "    -O {output}"

rule repeatmask:
    input:
        fasta = SUBDIR + "reference.fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        SUBDIR + "reference.fasta.out"
Thomas Faraut's avatar
Thomas Faraut committed
        sp = config['species'],
        perl5lib = PERL5LIB
    threads:
        get_threads("repeatmask", 12)
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
Thomas Faraut's avatar
Thomas Faraut committed
        """
        export PERL5LIB={params.perl5lib}
        RepeatMasker -pa {threads} -species {params.sp} -xsmall {input.fasta}
         > {input.fasta}.repeatmasker.log
        """
Thomas Faraut's avatar
Thomas Faraut committed

rule rdmask:
    input:
        length = SUBDIR + PREFIX+".fasta.length"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        bed = SUBDIR + PREFIX+".rdmask.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    run:
        with open(input.length) as f:
            with open(output.bed, "w") as fout:
                fout.write("\t".join(["CHROM", "START", "END"])+"\n")
Thomas Faraut's avatar
Thomas Faraut committed
                for line in f:
                    line = line.rstrip()
                    fields = line.split()
                    fout.write("\t".join([fields[1], "0", str(int(fields[0])-1)]) + "\n")
Thomas Faraut's avatar
Thomas Faraut committed

rule lcmaskbed:
    input:
        repeats = SUBDIR + PREFIX+".fasta.out"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        bed = SUBDIR + PREFIX+".lcmask.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    run:
        repeats = []
        with open(input.repeats) as f:
            for line in f:
                if p_repeats.match(line):
                    fields = line.rstrip().split()
                    # append chrom start and end
                    repeats.append([fields[4], fields[5], fields[6]])
        sorted_repeats = sorted(repeats,
                                key=lambda x: (x[0], int(x[1]), int(x[2])))
        # Write to file
        import csv
        with open(output.bed, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerows(sorted_repeats)
Thomas Faraut's avatar
Thomas Faraut committed

rule gcmaskbed:
    input:
        repeats = SUBDIR + "reference.fasta.out"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        bed = SUBDIR + "reference.gcmask.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    run:
        repeats = []
        with open(input.repeats) as f:
            for line in f:
                if p_repeats_all.match(line):
                    fields = line.rstrip().split()
                    repeats.append([fields[4], fields[5], fields[6]])
        sorted_repeats = sorted(repeats,
                                key=lambda x: (x[0], int(x[1]), int(x[2])))

        # Write to file
        import csv
        with open(output.bed, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerows(sorted_repeats)

Thomas Faraut's avatar
Thomas Faraut committed

rule dict:
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        "picard CreateSequenceDictionary "
        " R= {input.fasta} "
Thomas Faraut's avatar
Thomas Faraut committed
        " O= {output} "

rule nstretches:
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
Thomas Faraut's avatar
Thomas Faraut committed
    output:
        SUBDIR + PREFIX+".Nstretch.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    params:
        nstrech = config['maxNstretches']
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        "findNstretches.py -f {input.fasta} -m {params.nstrech} "
        "-o {output} -t {threads}"
Thomas Faraut's avatar
Thomas Faraut committed

rule ploidy:
    output:
        SUBDIR + PREFIX+".ploidymap.txt"
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        " printf \"*\t*\t*\t*\t2\n\" > {output}"

rule gendermask:
    output:
        SUBDIR + PREFIX+".gendermask.bed"
Thomas Faraut's avatar
Thomas Faraut committed
    shell:
        " touch {output}"