Newer
Older
gmapdb=$3
gmapdbdir=$4
outputRescueGff=$5
outputRescueWholeGenomeGff=$6
querylist=($(find $inputDir -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))
numquery=${#querylist[@]}
numtarget=${#targetlist[@]}
echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"
for i in "${querylist[@]}";
do
rootdir=`dirname $i`
# extract the name of the gene
geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
echo "================================================"
echo " gene is $geneid"
# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
# puis on ne garde que les mrna du gène en cours de traitement,
# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
echo "2. run gmap on target and query"
targetgff=$rootdir'/gmap.target.gff3'
gmapexe='gmap'
# check if we use as target the sub sequence or the entire genome
if [ -f "$rootdir/UNMAPPED" ]
then
echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
targetgff=$rootdir'/gmap.wholeGenome.gff3'
eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff
else
echo "running gmap on target"
eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff
eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.query.gff3'
echo "3. indexing the query and target genomic"
samtools faidx $rootdir'/query.fasta' &
samtools faidx $rootdir'/target.fasta' &
echo "4. running gffread to extract the pep sequence"
if [ -f "$rootdir/UNMAPPED" ]
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $targetgff
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $targetgff && rm $rootdir'/target.fasta'
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' && rm $rootdir'/query.fasta'
echo "5. running diff to check if the pep sequences are different or not"
diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
if [ "$diffresult" != "" ]
then
echo "RESULTS pep of gene $geneid has changed"
echo $diffresults > $rootdir/CDS_CHANGED
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff
echo "RESULTS pep of gene $geneid stays the same"
touch $rootdir/CDS_OK
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff
### concat all the gff files created when mapping done on whole genome
# for genes mapped on subsequence
echo "Now concat all GFF of gmap on target"
gffList=($(find $inputDir -type f -name "gmap.target.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log
echo "Now concat all GFF of gmap on whole genome"
gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log