#!/bin/bash inputDir=$1 blastdb=$2 gmapdb=$3 gmapdbdir=$4 outputRescueGff=$5 outputRescueWholeGenomeGff=$6 wholegenomeFasta=$7 querylist=($(find $inputDir -type f -name 'query.fasta')) targetlist=($(find $inputDir -type f -name 'target.fasta')) numquery=${#querylist[@]} numtarget=${#targetlist[@]} echo "STARTING AT `date` FOR DATASET $inputDir" echo "found $numquery query fasta file" echo "found $numtarget target fasta file" for i in "${querylist[@]}"; do rootdir=`dirname $i` # extract the name of the gene geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/') echo "================================================" echo " gene is $geneid" echo " datadir is $rootdir" echo "1. extract all mrnas" # on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'), # puis on ne garde que les mrna du gène en cours de traitement, # et on refait un appel sur la banque blast pour extraire toutes les sequences mrna blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta' echo "2. run gmap on target and query" targetindex='target' targetindexdir=$rootdir targetgff=$rootdir'/gmap.target.gff3' gmapexe='gmap' # check if we use as target the sub sequence or the entire genome if [ -f "$rootdir/UNMAPPED" ] then echo " Gene $geneid has status UNMAPPED: mapping on the whole genome" targetindex=$gmapdb targetindexdir=$gmapdbdir targetgff=$rootdir'/gmap.wholeGenome.gff3' gmapexe='gmapl' eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff else echo "running gmap on target" eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff fi eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.query.gff3' echo "3. indexing the query and target genomic" samtools faidx $rootdir'/query.fasta' & pid=$! samtools faidx $rootdir'/target.fasta' & pid2=$! wait $pid $pid2 echo "4. running gffread to extract the pep sequence" if [ -f "$rootdir/UNMAPPED" ] then gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $targetgff else gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $targetgff && rm $rootdir'/target.fasta' fi gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' && rm $rootdir'/query.fasta' echo "5. running diff to check if the pep sequences are different or not" diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa') if [ "$diffresult" != "" ] then echo "RESULTS pep of gene $geneid has changed" echo $diffresults > $rootdir/CDS_CHANGED awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff else echo "RESULTS pep of gene $geneid stays the same" touch $rootdir/CDS_OK awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff fi done ### concat all the gff files created when mapping done on whole genome # for genes mapped on subsequence echo "Now concat all GFF of gmap on target" gffList=($(find $inputDir -type f -name "gmap.target.gff3" )) gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log # for genes mapped on whole genome echo "Now concat all GFF of gmap on whole genome" gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" )) gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log echo "ENDED AT `date`"