gmapRescue.sh

#!/bin/bash

inputDir=$1
blastdb=$2
gmapdb=$3
gmapdbdir=$4
outputRescueGff=$5
outputRescueWholeGenomeGff=$6
wholegenomeFasta=$7

querylist=($(find  $inputDir  -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))

numquery=${#querylist[@]}
numtarget=${#targetlist[@]}

echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"

for i in "${querylist[@]}";
do
	rootdir=`dirname $i`
	# extract the name of the gene
	geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
	echo "================================================"
	echo " gene is $geneid"
	echo " datadir is $rootdir"

	echo "1. extract all mrnas"
	# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
	# puis on ne garde que les mrna du gène en cours de traitement,
	# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
	blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
		

	echo "2. run gmap on target and query"
	targetindex='target'
	targetindexdir=$rootdir
	targetgff=$rootdir'/gmap.target.gff3'
	gmapexe='gmap'

	# check if we use as target the sub sequence or the entire genome
	if [ -f "$rootdir/UNMAPPED" ]
	then
		echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
		targetindex=$gmapdb
		targetindexdir=$gmapdbdir
		targetgff=$rootdir'/gmap.wholeGenome.gff3'
		gmapexe='gmapl'
		eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff 
	else
		echo "running gmap on target"
		eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff
	fi
	eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.query.gff3'

	echo "3. indexing the query and target genomic"
	samtools faidx $rootdir'/query.fasta' &
	pid=$!
	samtools faidx $rootdir'/target.fasta' &
	pid2=$!
	wait $pid $pid2

	echo "4. running gffread to extract the pep sequence"
	if [ -f "$rootdir/UNMAPPED" ]
	then
		gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $targetgff 
	else
		gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $targetgff && rm $rootdir'/target.fasta'
	fi
	gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' && rm $rootdir'/query.fasta'
		
	echo "5. running diff to check if the pep sequences are different or not"
	diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
	if [ "$diffresult" != "" ]
	then
		echo "RESULTS pep of gene $geneid has changed"
		echo $diffresults > $rootdir/CDS_CHANGED
		awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff

	else
		echo "RESULTS pep of gene $geneid stays the same"
		touch $rootdir/CDS_OK
		awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff
	fi
	
done

### concat all the gff files created when mapping done on whole genome
# for genes mapped on subsequence
echo "Now concat all GFF of gmap on target"
gffList=($(find $inputDir -type f -name "gmap.target.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log

# for genes mapped on whole genome
echo "Now concat all GFF of gmap on whole genome"
gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log

echo "ENDED AT `date`"