#!/bin/bash
##SBATCH --array=0-7
##SBATCH -n1
##SBATCH --mail-user=helene.rimbert@inrae.fr
##SBATCH --mail-type=ALL

# for use with slurm
#chroms=($(echo TraesCS{1..7} TraesCSU))
#chrom=${chroms[$SLURM_ARRAY_TASK_ID]}

# get the list of query and target file for gmap

inputDir=$1
blastdb=$2
gmapdb=$3
gmapdbdir=$4
outputRescueGff=$5
outputRescueWholeGenomeGff=$6

querylist=($(find  $inputDir  -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))

numquery=${#querylist[@]}
numtarget=${#targetlist[@]}
#blastdb='/banks/blast/IWGSCv1.1_repr_mrna'

echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"

echo "Check for genes with invilid isbp pairs"
fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt

for i in "${querylist[@]}";
do
	rootdir=`dirname $i`
	# extract the name of the gene
	geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
	echo "================================================"
	echo " gene is $geneid"
	echo " datadir is $rootdir"

	echo "1. creating gmap db for dataset $rootdir"
	echo "================================================"
	gmap_build -D $rootdir -d query $rootdir'/query.fasta' &
	pid=$!

	if [ -f "$rootdir/GMAPRESCUE" ]
	then
		gmap_build -D $rootdir -d target $rootdir'/target.fasta' &
		pid2=$!
	fi
	wait $pid $pid2

	echo "2. extract all mrnas"
	echo "================================================"
	# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
	# puis on ne garde que les mrna du gène en cours de traitement,
	# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
	blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
		

	echo "3. run gmap on target and query"
	echo "================================================"
	targetindex='target'
	targetindexdir=$rootdir
	targetoutputGff=$rootdir'/gmap.target.gff3'
	gmapexe='gmap'

	# check if we use as target the sub sequence or the entire genome
	if [ -f "$rootdir/UNMAPPED" ]
	then
		echo " Gene $geneid has status UNMAPPED: mapping on the whole genome target"
		targetindex=$gmapdb
		targetindexdir=$gmapdbdir
		targetoutputGff=$rootdir'/gmap.wholeGenome.gff3'
		gmapexe='gmapl'
	fi
	eval $gmapexe -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' > $targetoutputGff &
	pid=$!
	eval gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' &
	pid2=$!
	wait $pid $pid2

	rm -rf $rootdir'/query' $rootdir'/target'

	echo "4. indexing the query and target genomic"
	echo "================================================"
	samtools faidx $rootdir'/query.fasta'
	samtools faidx $rootdir'/target.fasta'

	

	if [ -f "$rootdir/GMAPRESCUE" ]
	then
		echo "5. running gffread to extract the pep sequence"
		echo "================================================"
		gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
		gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
		
		echo "6. running diff to check if the pep sequences are different or not"
		diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
		if [ "$diffresult" != "" ]
		then
			echo "RESULTS pep of gene $geneid has changed"
			echo $diffresults > $rootdir/CDS_CHANGED
		else
			echo "RESULTS pep of gene $geneid stays the same"
			touch $rootdir/CDS_OK
		fi

	else
		echo " rescue done on the whole genome: we do not check for pep sequences"
		touch $rootdir/CDS_UNKNOWN
	fi
	
done

### concat all the gff files created when mapping done on whole genome
# for genes mapped on subsequence
gffList=($(find $inputDir -type f -name "gmap.target.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log

# for genes mapped on whole genome
gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log

echo "ENDED AT `date`"