cleaning code with temporary files

ae5e9fe3 · Helene Rimbert · 45c1f80b · ae5e9fe3 · ae5e9fe3 · ae5e9fe3
Commit ae5e9fe3 authored 4 years ago by Helene Rimbert
--- a/Snakefile
+++ b/Snakefile
@@ -30,7 +30,7 @@ rule grepGffFeature:
 rule splitGffPerChrom:
 	message: "Split Gff Features per chromosome: current is {wildcards.chrom}"
 	input: config['results']+"/1.features.bed"
-	output: config['results']+"/1.features/{chrom}.bed"
+	output: temp(config['results']+"/1.features/{chrom}.bed")
 	log: config['results']+"/1.features/{chrom}.fgrep.log"
 	params: "chr{chrom}"
 	shell:

--- a/bin/gmapRescue.sh
+++ b/bin/gmapRescue.sh
 #!/bin/bash
-##SBATCH --array=0-7
-##SBATCH -n1
-##SBATCH --mail-user=helene.rimbert@inrae.fr
-##SBATCH --mail-type=ALL
-
-# for use with slurm
-#chroms=($(echo TraesCS{1..7} TraesCSU))
-#chrom=${chroms[$SLURM_ARRAY_TASK_ID]}
-
-# get the list of query and target file for gmap

 inputDir=$1
 blastdb=$2
@@ -16,21 +6,18 @@ gmapdb=$3
 gmapdbdir=$4
 outputRescueGff=$5
 outputRescueWholeGenomeGff=$6
+wholegenomeFasta=$7

 querylist=($(find  $inputDir  -type f -name 'query.fasta'))
 targetlist=($(find $inputDir -type f -name 'target.fasta'))

 numquery=${#querylist[@]}
 numtarget=${#targetlist[@]}
-#blastdb='/banks/blast/IWGSCv1.1_repr_mrna'

 echo "STARTING AT `date` FOR DATASET $inputDir"
 echo "found $numquery query fasta file"
 echo "found $numtarget target fasta file"

-echo "Check for genes with invilid isbp pairs"
-fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt
-
 for i in "${querylist[@]}";
 do
 	rootdir=`dirname $i`
@@ -40,19 +27,7 @@ do
 	echo " gene is $geneid"
 	echo " datadir is $rootdir"

-	echo "1. creating gmap db for dataset $rootdir"
-	echo "================================================"
-	gmap_build -D $rootdir -d query $rootdir'/query.fasta' &
-	pid=$!
-
-	if [ -f "$rootdir/GMAPRESCUE" ]
-	then
-		gmap_build -D $rootdir -d target $rootdir'/target.fasta' &
-		pid2=$!
-	fi
-	wait $pid $pid2
-
-	echo "2. extract all mrnas"
+	echo "1. extract all mrnas"
 	echo "================================================"
 	# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
 	# puis on ne garde que les mrna du gène en cours de traitement,
@@ -60,7 +35,7 @@ do
 	blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
 		

-	echo "3. run gmap on target and query"
+	echo "2. run gmap on target and query"
 	echo "================================================"
 	targetindex='target'
 	targetindexdir=$rootdir
@@ -70,48 +45,49 @@ do
 	# check if we use as target the sub sequence or the entire genome
 	if [ -f "$rootdir/UNMAPPED" ]
 	then
-		echo " Gene $geneid has status UNMAPPED: mapping on the whole genome target"
+		echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
 		targetindex=$gmapdb
 		targetindexdir=$gmapdbdir
 		targetoutputGff=$rootdir'/gmap.wholeGenome.gff3'
 		gmapexe='gmapl'
+		eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.wholeGenome.gff3' 
+	else
+		echo "running gmap on target"
+		eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3'
 	fi
-	eval $gmapexe -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' > $targetoutputGff &
+	eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' 
+
+	echo "3. indexing the query and target genomic"
+	echo "================================================"
+	samtools faidx $rootdir'/query.fasta' &
 	pid=$!
-	eval gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' &
+	samtools faidx $rootdir'/target.fasta' &
 	pid2=$!
 	wait $pid $pid2

-	rm -rf $rootdir'/query' $rootdir'/target'
-
-	echo "4. indexing the query and target genomic"
+	echo "4. running gffread to extract the pep sequence"
 	echo "================================================"
-	samtools faidx $rootdir'/query.fasta'
-	samtools faidx $rootdir'/target.fasta'
-
-	
-
-	if [ -f "$rootdir/GMAPRESCUE" ]
+	if [ -f "$rootdir/UNMAPPED" ]
 	then
-		echo "5. running gffread to extract the pep sequence"
-		echo "================================================"
+		gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $rootdir'/gmap.wholeGenome.gff3'
+	else
 		gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
-		gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
+	fi
+	gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
 		
-		echo "6. running diff to check if the pep sequences are different or not"
-		diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
-		if [ "$diffresult" != "" ]
-		then
-			echo "RESULTS pep of gene $geneid has changed"
-			echo $diffresults > $rootdir/CDS_CHANGED
-		else
-			echo "RESULTS pep of gene $geneid stays the same"
-			touch $rootdir/CDS_OK
-		fi
+	echo "5. running diff to check if the pep sequences are different or not"
+	echo "================================================"
+	diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
+	if [ "$diffresult" != "" ]
+	then
+		echo "RESULTS pep of gene $geneid has changed"
+		echo $diffresults > $rootdir/CDS_CHANGED
+		awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'

 	else
-		echo " rescue done on the whole genome: we do not check for pep sequences"
-		touch $rootdir/CDS_UNKNOWN
+		echo "RESULTS pep of gene $geneid stays the same"
+		touch $rootdir/CDS_OK
+		awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
 	fi
 	
 done

--- a/bin/microMappingPipeline.py
+++ b/bin/microMappingPipeline.py
@@ -464,8 +464,10 @@ def checkPerfectHit(blatresult, workingDir, maxhit):

 def runBlat(db,query,blatResult, outFormat):
 	cmd='blat -noHead -extendThroughN -out='+outFormat+' '+db+' '+query+' '+blatResult
+	#cmd='blastn -outfmt "6 std qcovs slen sframe" -perc_identity 95 -qcov_hsp_perc 80 -query '+query+' -subject '+db
 	print(' blat command to run: %s'% str(cmd))
 	os.system('blat -noHead -extendThroughN -out={} {} {} {}'.format(outFormat,db,query,blatResult))
+	#os.system('blastn -outfmt "6 std qcovs slen sframe" -perc_identity 95 -qcov_hsp_perc 80 -query {} -subject {} -out {}'.format(query, db, blatResult))
 	return blatResult

 def getFastaSeq(fasta,chrom,start,stop):

--- a/config.yaml
+++ b/config.yaml
@@ -25,8 +25,8 @@ mapq: 30
 mismatches: 0

 ##### OUTPUT directory
-results: 'results_200615'
-finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200617'
+results: 'results_2008'
+finalPrefix: 'IWGSC_refseqv2.1_annotv2.0_200819'
 # this file contains two columns: the first is the chromosome name as it appears in the genome.fasta of the new reference,
 # and the second the chromosome name as it will appear in the new gene Names
 chromMapID: 'data/chromosomeMappingID.csv'
@@ -35,4 +35,4 @@ chromMapID: 'data/chromosomeMappingID.csv'
 # used in rule renameGeneIds (rules/geneAnchoring.smk)
 gff_prefix: 'TraesCS'
 gff_version: '03G'
-gff_source: 'IWGSC_v2.1'
\ No newline at end of file
+gff_source: 'IWGSC_v2.1'
--- a/rules/bedtoolsClosest.smk
+++ b/rules/bedtoolsClosest.smk
 rule selectMappedISBP:
 	message: " Select only mapped ISBPS on new refseq"
-	input: mapped=config['results']+'/1.filteredISBPs.sorted.bed', original=config['isbpBed']
-	output: config['results']+'/2.mappedISBPs_coordsOnQuery.bed'
-	log: config['results']+'/2.mappedISBPs_coordsOnQuery.log'
+	input: mapped=config['results']+'/1.filteredISBPs/sorted.bed', original=config['isbpBed']
+	output: temp(config['results']+'/2.mappedISBPs/coordsOnQuery.bed')
+	log: config['results']+'/2.mappedISBPs/coordsOnQuery.log'
 	shell:
 		"""
 		cut -f 4 {input.mapped}|fgrep -wf - {input.original} |sort -k1,1 -k2,2n 1> {output} 2> {log}
@@ -10,9 +10,9 @@ rule selectMappedISBP:

 rule keepMappedOnSameChrom:
 	message: " Select Mapped ISBPs on same chromosome (or on unknown chromosome)"
-	input: isbpOnQuery=config['results']+'/2.mappedISBPs_coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs.sorted.bed"
-	output: config['results']+'/2.isbpMappedOnSameChrom.bed'
-	log: config['results']+'/2.isbpMappedOnSameChrom.log'
+	input: isbpOnQuery=config['results']+'/2.mappedISBPs/coordsOnQuery.bed', isbpOnTarget=config['results']+"/1.filteredISBPs/sorted.bed"
+	output: temp(config['results']+'/2.mappedISBPs/OnSameChrom.bed')
+	log: config['results']+'/2.mappedISBPs/OnSameChrom.log'
 	shell:
 		"""
 		join -1 4 -2 4 <(sort -k4,4 {input.isbpOnQuery}) <(sort -k4,4 {input.isbpOnTarget})|sed "s/ /\t/g"|awk "{{if (tolower(\$2) == tolower(\$5)) {{print \$2,\$3,\$4,\$1}} }}" |sort -k1,1 -k2,2n |sed "s/ /\t/g"> {output}
@@ -22,9 +22,9 @@ rule keepMappedOnSameChrom:

 rule upstreamClosest:
 	message: " Collect closest marker upstream of genes"
-	input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.isbpMappedOnSameChrom.bed'
-	output: temp(config['results']+'/2.closestbedUpstream.txt')
-	log: config['results']+"/2.closestbedUpstream.log"
+	input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed'
+	output: temp(config['results']+'/2.closestbed/Upstream.txt')
+	log: config['results']+"/2.closestbed/Upstream.log"
 	shell:
 		"""
 		bedtools closest -k 5 -id -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log}
@@ -32,9 +32,9 @@ rule upstreamClosest:

 rule downstreamClosest:
 	message: " Collect Downstream marker downstream of genes"
-	input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.isbpMappedOnSameChrom.bed'
-	output: temp(config['results']+'/2.downstreamClosest.txt')
-	log: config['results']+"/2.downstreamClosest.log"
+	input: annot=config['results']+"/1.features.bed", markers=config['results']+'/2.mappedISBPs/OnSameChrom.bed'
+	output: temp(config['results']+'/2.closestbed/downstream.txt')
+	log: config['results']+"/2.closestbed/downstream.log"
 	shell:
 		"""
 		bedtools closest -k 5 -iu -D ref -io -a {input.annot} -b {input.markers} 1> {output} 2> {log}
@@ -43,13 +43,13 @@ rule downstreamClosest:
 rule splitPerChrom:
 	message: "Split data per chromosome"
 	input: 
-		upstream=config['results']+'/2.closestbedUpstream.txt',
-		downstream=config['results']+'/2.downstreamClosest.txt'
+		upstream=config['results']+'/2.closestbed/Upstream.txt',
+		downstream=config['results']+'/2.closestbed/downstream.txt'
 	output:
-		splitUp=config['results']+'/2.closestbed_split/{chrom}.upstream.txt',
-		splitDown=config['results']+'/2.closestbed_split/{chrom}.downstream.txt'
+		temp(splitUp=config['results']+'/2.closestbed/{chrom}/upstream.txt'),
+		splitDown=config['results']+'/2.closestbed/{chrom}/downstream.txt'
 	params: chromPref='TraesCS{chrom}'
-	log: config['results']+'/2.closestbed_split/{chrom}.split.log'
+	log: config['results']+'/2.closestbed/{chrom}/split.log'
 	shell:
 		"""
 		fgrep  {params.chromPref} {input.upstream} 1> {output.splitUp} 2> {log}

--- a/rules/geneAnchoring.smk
+++ b/rules/geneAnchoring.smk
@@ -2,13 +2,13 @@ rule mapHomologousRegions:
 	message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}"
 	input: 
 		#closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
-		marker5prime=config['results']+'/2.closestbed_split/{chrom}.upstream.txt',
-		marker3prime=config['results']+'/2.closestbed_split/{chrom}.downstream.txt',
+		marker5prime=config['results']+'/2.closestbed/{chrom}/upstream.txt',
+		marker3prime=config['results']+'/2.closestbed/{chrom}/downstream.txt',
 		refQuery=config['queryFasta'],
 		faiQuery=config['queryFasta']+'.fai',
 		refTarget=config['targetFasta'],
 		faiTarget=config['targetFasta']+'.fai',
-		markersOnTarget=config['results']+"/1.filteredISBPs.sorted.{chrom}.bed"
+		markersOnTarget=config['results']+"/1.filteredISBPs/{chrom}/sorted.bed"
 	output: 
 		allblat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
 		summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv'
@@ -38,7 +38,7 @@ rule recalcBlatMapped:
 rule gtCleanBlatGff:
 	message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}"
 	input: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3'
-	output: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3'
+	output: temp(config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3')
 	log: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.log'
 	shell: 
 		"""
@@ -47,21 +47,21 @@ rule gtCleanBlatGff:

 rule gmapRescue:
 	message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}"
-	input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', gmapindex=config['results']+"/target_gmapindex"
+	input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv', wholeGenomeFasta=config['targetFasta']
 	output: txt=config['results']+'/4.gmapRescue/{chrom}.txt',
-		targetgff=config['results']+'/4.gmapRescue/{chrom}.target.gff3',
-		wholegenomegff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3'
-	params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results']
+		targetgff=temp(config['results']+'/4.gmapRescue/{chrom}.target.gff3'),
+		wholegenomegff=temp(config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3')
+	params: blastdb=config['blastdb'], anchoringDir=config['results']+'/3.mapping/{chrom}/', gmapIndexDir=config['results'], gmapindex="target_gmapindex"
 	log: config['results']+'/4.gmapRescue/{chrom}.log'
 	shell:
 		"""
-		bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {input.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} 1> {output.txt} 2> {log}
+		bin/gmapRescue.sh {params.anchoringDir} {params.blastdb} {params.gmapindex} {params.gmapIndexDir} {output.targetgff} {output.wholegenomegff} {input.wholeGenomeFasta} 1> {output.txt} 2> {log}
 		"""

 rule recalcGmapRescue:
 	message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}"
 	input: config['results']+'/4.gmapRescue/{chrom}.target.gff3',
-	output: config['results']+'/4.recalcGmap/{chrom}/recalc.gff3'
+	output: temp(config['results']+'/4.recalcGmap/{chrom}/recalc.gff3')
 	params: anchoringDir=config['results']+'/3.mapping/{chrom}',
 		bindir=os.getcwd()
 	log: config['results']+'/4.recalcGmap/{chrom}/recalc.log'
@@ -74,8 +74,8 @@ rule saveGmapWG:
 	message: "Save the GMAP results of genes mapped on Whole Genome"
 	input: gff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3',
 		map=config['chromMapID']
-	output: gff=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3',
-		gfferror=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3'
+	output: gff=temp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'),
+		gfferror=tmp(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.gff3')
 	log: config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.log'
 	shell:
 		"""
@@ -89,7 +89,7 @@ rule mergeFinalGff3:
 	input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3',
 		rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3', 
 		wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3'
-	output: annot=config['results']+'/5.FINAL/{chrom}/annotation.gff3'
+	output: annot=temp(config['results']+'/5.FINAL/{chrom}/annotation.gff3')
 	log: annot=config['results']+'/5.FINAL/{chrom}/annotation.log'
 	shell:
 		"""
@@ -99,7 +99,7 @@ rule checkMissing:
 	message: ""
 	input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3',
 		ref=config['results']+"/1.features/{chrom}.bed"
-	output: config['results']+'/5.FINAL/{chrom}/missing.txt'
+	output: temp(config['results']+'/5.FINAL/{chrom}/missing.txt')
 	log: config['results']+'/5.FINAL/{chrom}/missing.log'
 	shell:
 		"""

--- a/rules/preprocessISBP.smk
+++ b/rules/preprocessISBP.smk
@@ -16,21 +16,21 @@ rule bam2bed:
 rule sortBed:
 	message: "Sort ISBPs bed file"
 	input: config['results']+"/1.filteredISBPs.bed"
-	output: config['results']+"/1.filteredISBPs.sorted.bed"
+	output: temp(config['results']+"/1.filteredISBPs/sorted.bed")
 	log: config['results']+'/1.sortBed.log'
 	shell: "sort -k1,1 -k2,2n {input} 1> {output} 2> {log}"

 rule dumpISBPsID:
 	message: "Dump ISBPs IDs"
-	input: config['results']+"/1.filteredISBPs.sorted.bed"
+	input: config['results']+"/1.filteredISBPs/sorted.bed"
 	output: temp(config['results']+"/1.filteredISBPs.ids")
 	shell: " cut -f 4 {input} > {output}"

 rule splitISBP:
 	message: "Split isbps per chromosome"
 	input: config['results']+"/1.filteredISBPs.sorted.bed"
-	output: config['results']+"/1.filteredISBPs.sorted.{chrom}.bed"
-	log: config['results']+"/1.filteredISBPs.splitPerChrom{chrom}.log"
+	output: temp(config['results']+"/1.filteredISBPs/{chrom}/sorted.bed")
+	log: config['results']+"/1.filteredISBPs/{chrom}/sorted.log"
 	params: 'Chr{chrom}'
 	shell:
 		"""