Skip to content
Snippets Groups Projects
Commit 9d1429e5 authored by Thomas Faraut's avatar Thomas Faraut
Browse files

adding reference bundle

parent 8f90454b
No related branches found
No related tags found
No related merge requests found
##-----------------------------------------------##
## A set of functions
##-----------------------------------------------##
import sys
import re
from pyfaidx import Fasta
from os.path import join
configfile: "referencebundle.yaml"
## Global variables
##--------------------------------------
SV_DIR = "/home/faraut/save/Softwares/svtoolkit"
CLASSPATH = SV_DIR + "/lib/SVToolkit.jar" + ":" + SV_DIR +"/lib/gatk/GenomeAnalysisTK.jar"
ROOT_DIR = config['rootdir']
WDIR = config['workdir']
workdir: WDIR
PicardJar="/usr/local/bioinfo/src/picard-tools/current/picard.jar"
# Always set the environment
shell.prefix("export PATH=%s/bin:%s/scripts:%s/bwa:\"$PATH\"; export LD_LIBRARY_PATH=%s/bwa:/SGE/ogs/lib/linux-x64:\"$LD_LIBRARY_PATH\"; " % (ROOT_DIR, ROOT_DIR, SV_DIR, SV_DIR))
PREFIX="reference"
SUFF_TYPES = ("gcmask", "svmask", "lcmask")
p_repeats = re.compile('.*(Satellite|Simple_repeat|Low_complexity).*')
def message(mes):
sys.stderr.write("|--- " + mes + "\n")
def chromNum(chrom):
m = re.search(r"[cC]hr(?P<num>[\dXY]+)",chrom)
if m is not None:
if m.group('num') == 'X':
num = 100
elif m.group('num') == 'Y':
num = 101
else:
num = int(m.group('num'))
return num
else:
return chrom
def getChroms(genome):
ref = Fasta(genome)
return sorted(ref.keys(),key=lambda x:chromNum(x))
CHROMS = getChroms(config['genome'])
rule final:
input:
PREFIX+".fasta.fai",
PREFIX+".gcmask.fasta.fai",
PREFIX+".svmask.fasta.fai",
PREFIX+".lcmask.fasta.fai",
PREFIX+".rdmask.bed",
PREFIX+".dict",
PREFIX+".ploidymap.txt",
PREFIX+".gendermask.bed",
PREFIX+".Nstretch.bed",
"chroms/splitted.done"
rule get_fasta:
input:
config['genome']
output:
PREFIX+".fasta"
shell:
'cp {input} {output}'
rule index:
input:
"{type}.fasta"
output:
"{type}.fasta.fai"
log:
"logs/index/{type}.log"
shell:
"java -cp {CLASSPATH} -Xmx4g "
" org.broadinstitute.sv.apps.IndexFastaFile "
" -I {input} "
" -O {output} 2> {log}"
rule length:
input:
"reference.fasta"
output:
"reference.fasta.length"
shell:
"fastalength {input} > {output}"
rule preparesvmask:
input:
PREFIX+".fasta"
output:
seq="work/reference.fasta",
index="work/reference.fasta.bwt"
log:
"logs/preparesvmask.log"
shell:
"ln {input} {output.seq}; "
"samtools faidx {output.seq}; "
"which bwa ; "
"bwa index -a bwtsw {output.seq} 2> {log}"
rule mergesvmask:
input:
expand("work/svmask_{chrom}.fasta",chrom=CHROMS)
output:
"reference.svmask.fasta"
shell:
"cat {input} > {output}"
rule chromsvmask:
input:
"work/reference.fasta.bwt",
fa="work/reference.fasta"
output:
"work/svmask_{chrom}.fasta"
params:
rl=config['readlength']
log:
"logs/svmask/{chrom}.log"
shell:
"java -cp {CLASSPATH} -Xmx4g org.broadinstitute.sv.apps.ComputeGenomeMask "
" -R {input.fa} -O work/svmask_{wildcards.chrom}.fasta -readLength {params.rl} "
" -sequence {wildcards.chrom} 2> {log} "
rule bed2fasta:
input:
PREFIX+".fasta.fai",
ref=PREFIX+".fasta",
bed="{type}.bed"
output:
"{type}.fasta"
wildcard_constraints:
type=".*(lc|sv|gc)mask"
shell:
" java -cp {CLASSPATH} -Xmx4g "
" org.broadinstitute.sv.apps.BedFileToGenomeMask "
" -I {input.bed} "
" -R {input.ref} "
" -O {output}"
rule repeatmask:
input:
"reference.fasta"
output:
"reference.fasta.out"
params:
sp=config['species']
threads: 12
shell:
"RepeatMasker -pa {threads} -species {params.sp} -xsmall {input} > {input}.repeatmasker.log"
rule rdmask:
input:
length=PREFIX+".fasta.length"
output:
bed=PREFIX+".rdmask.bed"
run:
with open(input.length) as f:
with open(output.bed,"w") as fout:
fout.write("\t".join(["CHROM","START","END"])+"\n")
for line in f:
line = line.rstrip()
fields = line.split()
fout.write("\t".join([fields[1],"0",str(int(fields[0])-1)])+"\n")
rule lcmaskbed:
input:
repeats=PREFIX+".fasta.out"
output:
bed=PREFIX+".lcmask.bed"
run:
repeats = []
with open(output.bed,"w") as fout:
with open(input.repeats) as f:
for line in f:
if p_repeats.match(line):
fields = line.rstrip().split()
repeats.append({'chrom':fields[4],'start':fields[5],'end':fields[6]})
for r in sorted(repeats,key=lambda x:(chromNum(x['chrom']),int(x['start']))):
fout.write("\t".join([r['chrom'],r['start'],r['end']])+"\n")
rule gcmaskbed:
input:
repeats="reference.fasta.out"
output:
bed="reference.gcmask.bed"
run:
repeats = []
with open(output.bed,"w") as fout:
with open(input.repeats) as f:
for line in f:
if p_repeats.match(line):
fields = line.rstrip().split()
repeats.append({'chrom':fields[4],'start':fields[5],'end':fields[6]})
for r in sorted(repeats,key=lambda x:(chromNum(x['chrom']),int(x['start']))):
fout.write("\t".join([r['chrom'],r['start'],r['end']])+"\n")
rule dict:
input:
PREFIX+".fasta"
output:
PREFIX+".dict"
shell:
"java -jar {PicardJar} CreateSequenceDictionary "
" R= {input} "
" O= {output} "
rule nstretches:
input:
PREFIX+".fasta"
output:
PREFIX+".Nstretch.bed"
params:
nstrech =config['maxNstretches']
shell:
"findNstretches.py -f {input} -m {params.nstrech} > {output}"
rule splitchroms:
input:
PREFIX+".fasta"
output:
"chroms/splitted.done"
shell:
"faSplit byname {input} chroms/ ; "
"for f in `ls chroms/*.fa`; do "
" samtools faidx $f ; "
"done ;"
"touch chroms/splitted.done"
rule ploidy:
output:
PREFIX+".ploidymap.txt"
shell:
" printf \"*\t*\t*\t*\t2\n\" > {output}"
rule gendermask:
output:
PREFIX+".gendermask.bed"
shell:
" touch {output}"
rootdir: "/home/faraut/dynawork/CapriSV/SVPipeline"
readlength: 100
species: "goat"
maxNstretches: 100
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment