Newer
Older
gmapdb=$3
gmapdbdir=$4
outputRescueGff=$5
outputRescueWholeGenomeGff=$6
querylist=($(find $inputDir -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))
numquery=${#querylist[@]}
numtarget=${#targetlist[@]}
echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"
for i in "${querylist[@]}";
do
rootdir=`dirname $i`
# extract the name of the gene
geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
echo "================================================"
echo " gene is $geneid"
echo "================================================"
# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
# puis on ne garde que les mrna du gène en cours de traitement,
# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
echo "2. run gmap on target and query"
echo "================================================"
targetindex='target'
targetindexdir=$rootdir
gmapexe='gmap'
# check if we use as target the sub sequence or the entire genome
if [ -f "$rootdir/UNMAPPED" ]
then
echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
targetindex=$gmapdb
targetindexdir=$gmapdbdir
gmapexe='gmapl'
eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.wholeGenome.gff3'
else
echo "running gmap on target"
eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3'
eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3'
echo "3. indexing the query and target genomic"
echo "================================================"
samtools faidx $rootdir'/query.fasta' &
samtools faidx $rootdir'/target.fasta' &
echo "4. running gffread to extract the pep sequence"
echo "================================================"
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $rootdir'/gmap.wholeGenome.gff3'
else
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
fi
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
echo "5. running diff to check if the pep sequences are different or not"
echo "================================================"
diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
if [ "$diffresult" != "" ]
then
echo "RESULTS pep of gene $geneid has changed"
echo $diffresults > $rootdir/CDS_CHANGED
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
echo "RESULTS pep of gene $geneid stays the same"
touch $rootdir/CDS_OK
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
### concat all the gff files created when mapping done on whole genome
# for genes mapped on subsequence
gffList=($(find $inputDir -type f -name "gmap.target.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log
# for genes mapped on whole genome
gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log