import os
import re
import sys

from pyfaidx import Fasta
# from natsort import natsorted
ROOTPATH = os.path.dirname(workflow.snakefile)
sys.path.insert(0, os.path.join(ROOTPATH, "lib"))
os.environ["PYTHONPATH"] = os.path.join(ROOTPATH, "lib")
if "SV_DIR" not in os.environ:
raise Exception("SV_DIR environment variable must be filled!")
CLASSPATH = SV_DIR + "/lib/SVToolkit.jar" + ":" + SV_DIR + "/lib/gatk/GenomeAnalysisTK.jar"
workdir: WDIR
# Always set the environment
SUBDIR = config["rb_subdir"] + os.path.sep if "rb_subdir" in config else ""
SUFF_TYPES = ("gcmask", "svmask", "lcmask")
p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*')
p_repeats_all = re.compile('.*(Satellite|Simple_repeat|LINE|SINE|LTR).*')
CHROMS = config['chromosomes']
shell.prefix("export PATH=%s/bin:\"$PATH\"; export PERL5LIB=%s; " % (ROOTPATH, PERL5LIB))
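
# Final targets of the reference bundle: indexed (masked) fastas, read-depth mask, dictionary, ploidy map, gender mask and N-stretch BED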
SUBDIR + PREFIX+".fasta.fai",
SUBDIR + PREFIX+".gcmask.fasta.fai",
SUBDIR + PREFIX+".svmask.fasta.fai",
SUBDIR + PREFIX+".lcmask.fasta.fai",
SUBDIR + PREFIX+".rdmask.bed",
SUBDIR + PREFIX+".dict",
SUBDIR + PREFIX+".ploidymap.txt",
SUBDIR + PREFIX+".gendermask.bed",
SUBDIR + PREFIX+".Nstretch.bed"
fasta=SUBDIR + PREFIX+"_raw.fasta",
fai=SUBDIR + PREFIX+"_raw.fasta.fai"
output:
SUBDIR + PREFIX+".fasta"
params:
chromosomes=CHROMS
shell:
"samtools faidx {input.fasta} {params.chromosomes} > {output}"
# run:
# from Bio import SeqIO
# with open(input[0], "rU") as handle, open(output[0], "w") as f:
# for seq in SeqIO.parse(handle, "fasta"):
# if seq.id in CHROMS:
# SeqIO.write([seq], f, "fasta")
# os.remove(input[0])
# i_fai = input[0] + ".fai"
# if os.path.exists(i_fai):
# os.remove(i_fai)
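
# Build the .fai index for each generated fasta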
SUBDIR + "{type}.fasta"
SUBDIR + "{type}.fasta.fai"
rule fastalength:  # name assumed
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + "reference.fasta.length"
    shell:
        "fastalength {input.fasta} > {output}"
rule bwaindex:  # name assumed
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        seq = SUBDIR + "work/reference.fasta",
        index = SUBDIR + "work/reference.fasta.bwt"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        "ln {input.fasta} {output.seq}; "
        # Checking genome size for bwa index
        #"genomesize=`cat {output.seq}.fai | awk '{ totsize = totsize +$0}END{ print totsize}'`; "
"bwa index -a bwtsw {output.seq} 2> {log} || bwa index -a is {output.seq} 2> {log}"
expand(SUBDIR + "work/svmask_{chrom}.fasta", chrom=CHROMS)
SUBDIR + "reference.svmask.fasta"
shell:
"cat {input} > {output}"
rule chromsvmask:
    input:
        SUBDIR + "work/reference.fasta.bwt",
        fa = SUBDIR + "work/reference.fasta"
    output:
        SUBDIR + "work/svmask_{chrom}.fasta"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        "java -cp {CLASSPATH} -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask "
        " -R {input.fa} -O {params.subdir}work/svmask_{wildcards.chrom}.fasta -readLength {params.rl}"
        " -sequence {wildcards.chrom} 2> {log} "
rule bed2fasta:
    input:
        SUBDIR + PREFIX+".fasta.fai",
        ref = SUBDIR + PREFIX+".fasta",
        bed = SUBDIR + "{type}.bed"
    output:
        SUBDIR + "{type}.fasta"
    wildcard_constraints:
        type = ".*(lc|sv|gc)mask"
    shell:
        "export PATH=$SV_DIR/bwa:\"$PATH\"; export LD_LIBRARY_PATH=$SV_DIR/bwa:\"$LD_LIBRARY_PATH\"; "
        " java -cp {CLASSPATH} -Xmx4g "
        " org.broadinstitute.sv.apps.BedFileToGenomeMask "
        " -I {input.bed} "
        " -R {input.ref} "
        " -O {output}"
rule repeatmask:
    input:
        fasta = SUBDIR + "reference.fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + "reference.fasta.out"
    threads:
        get_threads("repeatmask", 12)
    shell:
        """
        export PERL5LIB={params.perl5lib}
        RepeatMasker -pa {threads} -species {params.sp} -xsmall {input.fasta} > {input.fasta}.repeatmasker.log
        """
rule rdmask:  # name assumed
    input:
        length = SUBDIR + PREFIX+".fasta.length"
    output:
        bed = SUBDIR + PREFIX+".rdmask.bed"
    run:
        with open(input.length) as f, open(output.bed, "w") as fout:
            fout.write("\t".join(["CHROM", "START", "END"])+"\n")
            for line in f:
                line = line.rstrip()
                fields = line.split()
                fout.write("\t".join([fields[1], "0", str(int(fields[0])-1)]) + "\n")
rule lcmask:  # name assumed
    input:
        repeats = SUBDIR + PREFIX+".fasta.out"
    output:
        bed = SUBDIR + PREFIX+".lcmask.bed"
    run:
        repeats = []
        with open(input.repeats) as f:
            for line in f:
                if p_repeats.match(line):
                    fields = line.rstrip().split()
                    # append chrom start and end
                    repeats.append([fields[4], fields[5], fields[6]])
        sorted_repeats = sorted(repeats,
                                key=lambda x: (x[0], int(x[1]), int(x[2])))
        # Write to file
        import csv
        with open(output.bed, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerows(sorted_repeats)
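
# GC mask: every repeat class matched by p_repeats_all, written as a sorted BED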
repeats = SUBDIR + "reference.fasta.out"
bed = SUBDIR + "reference.gcmask.bed"
with open(input.repeats) as f:
for line in f:
if p_repeats_all.match(line):
fields = line.rstrip().split()
repeats.append([fields[4], fields[5], fields[6]])
sorted_repeats = sorted(repeats,
key=lambda x: (x[0], int(x[1]), int(x[2])))
# Write to file
import csv
with open(output.bed, 'w') as f:
writer = csv.writer(f, delimiter='\t')
writer.writerows(sorted_repeats)
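
# Picard sequence dictionary for the reference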
rule dictionary:  # name assumed
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + PREFIX+".dict"
    shell:
        "picard CreateSequenceDictionary "
        " R= {input.fasta} "
        " O= {output} "
rule nstretches:
    input:
        fasta = SUBDIR + PREFIX+".fasta",
        fai = SUBDIR + PREFIX+".fasta.fai"
    output:
        SUBDIR + PREFIX+".Nstretch.bed"
    params:
        nstrech = config['maxNstretches']
    threads:
        1
    shell:
        "findNstretches.py -f {input.fasta} -m {params.nstrech} "
        "-o {output} -t {threads}"
SUBDIR + PREFIX+".ploidymap.txt"
shell:
" printf \"*\t*\t*\t*\t2\n\" > {output}"
rule gendermask:
    output:
        SUBDIR + PREFIX+".gendermask.bed"