Skip to content
Snippets Groups Projects
Commit ae5e9fe3 authored by Helene Rimbert's avatar Helene Rimbert
Browse files

cleaning code with temporary files

parent 45c1f80b
No related branches found
No related tags found
No related merge requests found
......@@ -30,7 +30,7 @@ rule grepGffFeature:
rule splitGffPerChrom:
message: "Split Gff Features per chromosome: current is {wildcards.chrom}"
input: config['results']+"/1.features.bed"
output: config['results']+"/1.features/{chrom}.bed"
output: temp(config['results']+"/1.features/{chrom}.bed")
log: config['results']+"/1.features/{chrom}.fgrep.log"
params: "chr{chrom}"
shell:
......
#!/bin/bash
##SBATCH --array=0-7
##SBATCH -n1
##SBATCH --mail-user=helene.rimbert@inrae.fr
##SBATCH --mail-type=ALL
# for use with slurm
#chroms=($(echo TraesCS{1..7} TraesCSU))
#chrom=${chroms[$SLURM_ARRAY_TASK_ID]}
# get the list of query and target file for gmap
inputDir=$1
blastdb=$2
......@@ -16,21 +6,18 @@ gmapdb=$3
gmapdbdir=$4
outputRescueGff=$5
outputRescueWholeGenomeGff=$6
wholegenomeFasta=$7
querylist=($(find $inputDir -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))
numquery=${#querylist[@]}
numtarget=${#targetlist[@]}
#blastdb='/banks/blast/IWGSCv1.1_repr_mrna'
echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"
echo "Check for genes with invilid isbp pairs"
fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt
for i in "${querylist[@]}";
do
rootdir=`dirname $i`
......@@ -40,19 +27,7 @@ do
echo " gene is $geneid"
echo " datadir is $rootdir"
echo "1. creating gmap db for dataset $rootdir"
echo "================================================"
gmap_build -D $rootdir -d query $rootdir'/query.fasta' &
pid=$!
if [ -f "$rootdir/GMAPRESCUE" ]
then
gmap_build -D $rootdir -d target $rootdir'/target.fasta' &
pid2=$!
fi
wait $pid $pid2
echo "2. extract all mrnas"
echo "1. extract all mrnas"
echo "================================================"
# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
# puis on ne garde que les mrna du gène en cours de traitement,
......@@ -60,7 +35,7 @@ do
blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
echo "3. run gmap on target and query"
echo "2. run gmap on target and query"
echo "================================================"
targetindex='target'
targetindexdir=$rootdir
......@@ -70,48 +45,49 @@ do
# check if we use as target the sub sequence or the entire genome
if [ -f "$rootdir/UNMAPPED" ]
then
echo " Gene $geneid has status UNMAPPED: mapping on the whole genome target"
echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
targetindex=$gmapdb
targetindexdir=$gmapdbdir
targetoutputGff=$rootdir'/gmap.wholeGenome.gff3'
gmapexe='gmapl'
eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.wholeGenome.gff3'
else
echo "running gmap on target"
eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3'
fi
eval $gmapexe -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' > $targetoutputGff &
eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3'
echo "3. indexing the query and target genomic"
echo "================================================"
samtools faidx $rootdir'/query.fasta' &
pid=$!
eval gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' &
samtools faidx $rootdir'/target.fasta' &
pid2=$!
wait $pid $pid2
rm -rf $rootdir'/query' $rootdir'/target'
echo "4. indexing the query and target genomic"
echo "4. running gffread to extract the pep sequence"
echo "================================================"
samtools faidx $rootdir'/query.fasta'
samtools faidx $rootdir'/target.fasta'
if [ -f "$rootdir/GMAPRESCUE" ]
if [ -f "$rootdir/UNMAPPED" ]
then
echo "5. running gffread to extract the pep sequence"
echo "================================================"
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $rootdir'/gmap.wholeGenome.gff3'
else
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
fi
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
echo "6. running diff to check if the pep sequences are different or not"
diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
if [ "$diffresult" != "" ]
then
echo "RESULTS pep of gene $geneid has changed"
echo $diffresults > $rootdir/CDS_CHANGED
else
echo "RESULTS pep of gene $geneid stays the same"
touch $rootdir/CDS_OK
fi
echo "5. running diff to check if the pep sequences are different or not"
echo "================================================"
diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
if [ "$diffresult" != "" ]
then
echo "RESULTS pep of gene $geneid has changed"
echo $diffresults > $rootdir/CDS_CHANGED
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
else
echo " rescue done on the whole genome: we do not check for pep sequences"
touch $rootdir/CDS_UNKNOWN
echo "RESULTS pep of gene $geneid stays the same"
touch $rootdir/CDS_OK
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
fi
done
......
......@@ -464,8 +464,10 @@ def checkPerfectHit(blatresult, workingDir, maxhit):
def runBlat(db,query,blatResult, outFormat):
cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
#cmd='blastn -outfmt "6 std qcovs slen sframe" -perc_identity 95 -qcov_hsp_perc 80 -query '+query+' -subject '+db
print(' blat command to run: %s'% str(cmd))
os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
#os.system('blastn -outfmt "6 std qcovs slen sframe" -perc_identity 95 -qcov_hsp_perc 80 -query {} -subject {} -out {}'.format(query, db, blatResult))
return blatResult
def getFastaSeq(fasta,chrom,start,stop):
......
......@@ -25,8 +25,8 @@ mapq: 30
mismatches: 0
##### OUTPUT directory
results: 'results_200615'
finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200617'
results: 'results_2008'
finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200819'
# this file contains two columns: the first is the chromosome name as it appears in the genome.fasta of the new reference,
# and the second the chromosome name as it will appear in the new gene Names
chromMapID: 'data/chromosomeMappingID.csv'
......@@ -35,4 +35,4 @@ chromMapID: 'data/chromosomeMappingID.csv'
# used in rule renameGeneIds (rules/geneAnchoring.smk)
gff_prefix: 'TraesCS'
gff_version: '03G'
gff_source: 'IWGSC_v2.1'
\ No newline at end of file
gff_source: 'IWGSC_v2.1'
rule selectMappedISBP:
message: " Select only mapped ISBPS on new refseq"
input: mapped=config['results']+'/1.filteredISBPs.sorted.bed', original=config['isbpBed']
output: config['results']+'/2.mappedISBPs_coordsOnQuery.bed'
log: config['results']+'/2.mappedISBPs_coordsOnQuery.log'
input: mapped=config['results']+'/1.filteredISBPs/sorted.bed', original=config['isbpBed']
output: temp(config['results']+'/2.mappedISBPs/coordsOnQuery.bed')
log: config['results']+'/2.mappedISBPs/coordsOnQuery.log'
shell:
"""
cut -f 4 {input.mapped}|fgrep -wf - {input.original} |sort -k1,1 -k2,2n 1> {output} 2> {log}
......@@ -10,9 +10,9 @@ rule selectMappedISBP:
rule keepMappedOnSameChrom:
message: " Select Mapped ISBPs on same chromosome (or on unknown chromosome)"
input: isbpOnQuery=config['results']+'/2.mappedISBPs_coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs.sorted.bed"
output: config['results']+'/2.isbpMappedOnSameChrom.bed'
log: config['results']+'/2.isbpMappedOnSameChrom.log'
input: isbpOnQuery=config['results']+'/2.mappedISBPs/coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs/sorted.bed"
output: temp(config['results']+'/2.mappedISBPs/OnSameChrom.bed')
log: config['results']+'/2.mappedISBPs/OnSameChrom.log'
shell:
"""
join -1 4 -2 4 <(sort -k4,4 {input.isbpOnQuery}) <(sort -k4,4 {input.isbpOnTarget})|sed "s/ /\t/g"|awk "{{if (tolower(\$2) == tolower(\$5)) {{print \$2,\$3,\$4,\$1}} }}" |sort -k1,1 -k2,2n |sed "s/ /\t/g"> {output}
......@@ -22,9 +22,9 @@ rule keepMappedOnSameChrom:
rule upstreamClosest:
message: " Collect closest marker upstream of genes"
input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.isbpMappedOnSameChrom.bed'
output: temp(config['results']+'/2.closestbedUpstream.txt')
log: config['results']+"/2.closestbedUpstream.log"
input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed'
output: temp(config['results']+'/2.closestbed/Upstream.txt')
log: config['results']+"/2.closestbed/Upstream.log"
shell:
"""
bedtools closest -k 5 -id -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log}
......@@ -32,9 +32,9 @@ rule upstreamClosest:
rule downstreamClosest:
message: " Collect Downstream marker downstream of genes"
input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.isbpMappedOnSameChrom.bed'
output: temp(config['results']+'/2.downstreamClosest.txt')
log: config['results']+"/2.downstreamClosest.log"
input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed'
output: temp(config['results']+'/2.closestbed/downstream.txt')
log: config['results']+"/2.closestbed/downstream.log"
shell:
"""
bedtools closest -k 5 -iu -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log}
......@@ -43,13 +43,13 @@ rule downstreamClosest:
rule splitPerChrom:
message: "Split data per chromosome"
input:
upstream=config['results']+'/2.closestbedUpstream.txt',
downstream=config['results']+'/2.downstreamClosest.txt'
upstream=config['results']+'/2.closestbed/Upstream.txt',
downstream=config['results']+'/2.closestbed/downstream.txt'
output:
splitUp=config['results']+'/2.closestbed_split/{chrom}.upstream.txt',
splitDown=config['results']+'/2.closestbed_split/{chrom}.downstream.txt'
temp(splitUp=config['results']+'/2.closestbed/{chrom}/upstream.txt'),
splitDown=config['results']+'/2.closestbed/{chrom}/downstream.txt'
params: chromPref='TraesCS{chrom}'
log: config['results']+'/2.closestbed_split/{chrom}.split.log'
log: config['results']+'/2.closestbed/{chrom}/split.log'
shell:
"""
fgrep {params.chromPref} {input.upstream} 1> {output.splitUp} 2> {log}
......
......@@ -2,13 +2,13 @@ rule mapHomologousRegions:
message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}"
input:
#closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
marker5prime=config['results']+'/2.closestbed_split/{chrom}.upstream.txt',
marker3prime=config['results']+'/2.closestbed_split/{chrom}.downstream.txt',
marker5prime=config['results']+'/2.closestbed/{chrom}/upstream.txt',
marker3prime=config['results']+'/2.closestbed/{chrom}/downstream.txt',
refQuery=config['queryFasta'],
faiQuery=config['queryFasta']+'.fai',
refTarget=config['targetFasta'],
faiTarget=config['targetFasta']+'.fai',
markersOnTarget=config['results']+"/1.filteredISBPs.sorted.{chrom}.bed"
markersOnTarget=config['results']+"/1.filteredISBPs/{chrom}/sorted.bed"
output:
allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv'
......@@ -38,7 +38,7 @@ rule recalcBlatMapped:
rule gtCleanBlatGff:
message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}"
input: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3'
output: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3'
output: temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3')
log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.log'
shell:
"""
......@@ -47,21 +47,21 @@ rule gtCleanBlatGff:
rule gmapRescue:
message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}"
input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', gmapindex=config['results']+"/target_gmapindex"
input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', wholeGenomeFasta=config['targetFasta']
output: txt=config['results']+'/4.gmapRescue/{chrom}.txt',
targetgff=config['results']+'/4.gmapRescue/{chrom}.target.gff3',
wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3'
params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results']
targetgff=temp(config['results']+'/4.gmapRescue/{chrom}.target.gff3'),
wholegenomegff=temp(config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3')
params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results'], gmapindex="target_gmapindex"
log: config['results']+'/4.gmapRescue/{chrom}.log'
shell:
"""
bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {input.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} 1> {output.txt} 2> {log}
bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {params.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} {input.wholeGenomeFasta} 1> {output.txt} 2> {log}
"""
rule recalcGmapRescue:
message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}"
input: config['results']+'/4.gmapRescue/{chrom}.target.gff3',
output: config['results']+'/4.recalcGmap/{chrom}/recalc.gff3'
output: temp(config['results']+'/4.recalcGmap/{chrom}/recalc.gff3')
params: anchoringDir=config['results']+'/3.mapping/{chrom}',
bindir=os.getcwd()
log: config['results']+'/4.recalcGmap/{chrom}/recalc.log'
......@@ -74,8 +74,8 @@ rule saveGmapWG:
message: "Save the GMAP results of genes mapped on Whole Genome"
input: gff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3',
map=config['chromMapID']
output: gff=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3',
gfferror=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3'
output: gff=temp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'),
gfferror=tmp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3')
log: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.log'
shell:
"""
......@@ -89,7 +89,7 @@ rule mergeFinalGff3:
input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3',
rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3',
wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'
output: annot=config['results']+'/5.FINAL/{chrom}/annotation.gff3'
output: annot=temp(config['results']+'/5.FINAL/{chrom}/annotation.gff3')
log: annot=config['results']+'/5.FINAL/{chrom}/annotation.log'
shell:
"""
......@@ -99,7 +99,7 @@ rule checkMissing:
message: ""
input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3',
ref=config['results']+"/1.features/{chrom}.bed"
output: config['results']+'/5.FINAL/{chrom}/missing.txt'
output: temp(config['results']+'/5.FINAL/{chrom}/missing.txt')
log: config['results']+'/5.FINAL/{chrom}/missing.log'
shell:
"""
......
......@@ -16,21 +16,21 @@ rule bam2bed:
rule sortBed:
message: "Sort ISBPs bed file"
input: config['results']+"/1.filteredISBPs.bed"
output: config['results']+"/1.filteredISBPs.sorted.bed"
output: temp(config['results']+"/1.filteredISBPs/sorted.bed")
log: config['results']+'/1.sortBed.log'
shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}"
rule dumpISBPsID:
message: "Dump ISBPs IDs"
input: config['results']+"/1.filteredISBPs.sorted.bed"
input: config['results']+"/1.filteredISBPs/sorted.bed"
output: temp(config['results']+"/1.filteredISBPs.ids")
shell: " cut -f 4 {input} > {output}"
rule splitISBP:
message: "Split isbps per chromosome"
input: config['results']+"/1.filteredISBPs.sorted.bed"
output: config['results']+"/1.filteredISBPs.sorted.{chrom}.bed"
log: config['results']+"/1.filteredISBPs.splitPerChrom{chrom}.log"
output: temp(config['results']+"/1.filteredISBPs/{chrom}/sorted.bed")
log: config['results']+"/1.filteredISBPs/{chrom}/sorted.log"
params: 'Chr{chrom}'
shell:
"""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment