Skip to content
Snippets Groups Projects
gmapRescue.sh 3.4 KiB
Newer Older
#!/bin/bash
##SBATCH --array=0-7
##SBATCH -n1
##SBATCH --mail-user=helene.rimbert@inrae.fr
##SBATCH --mail-type=ALL

# for use with slurm
#chroms=($(echo TraesCS{1..7} TraesCSU))
#chrom=${chroms[$SLURM_ARRAY_TASK_ID]}

# get the list of query and target file for gmap

inputDir=$1
blastdb=$2

querylist=($(find  $inputDir  -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))

numquery=${#querylist[@]}
numtarget=${#targetlist[@]}
#blastdb='/banks/blast/IWGSCv1.1_repr_mrna'

echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"

echo "Check for genes with invilid isbp pairs"
fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt

for i in "${querylist[@]}";
do
	rootdir=`dirname $i`
	# extract the name of the gene
        geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
	echo "\n\n================================================"
        echo " gene is $geneid"
	echo " datadir is $rootdir"

	echo "1. creating gmap db for dataset $rootdir"

	if [ ! -d "$rootdir/query" ]
	then
		gmap_build -D $rootdir -d query $rootdir'/query.fasta' &
		pid=$!
		gmap_build -D $rootdir -d target $rootdir'/target.fasta' &
		pid2=$!
		wait $pid $pid2
	fi


	if [ ! -s "$rootdir/query.mrna.fasta" ]
	then
		# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
		# puis on ne garde que les mrna du gène en cours de traitement,
		# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
		blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
		
	fi

	echo "3. run gmap on target and query"
	if [ ! -s "$rootdir/gmap.target.gff3" ]
	then 
		gmap -n 1 -d target -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3' &
		pid=$!
		gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' &
		pid2=$!
		blastn -query $rootdir'/query.fasta' -subject $rootdir'/target.fasta' -outfmt 6 -out $rootdir'/blast_m8.tab' &
		pid3=$!
		wait $pid $pid2 $pid3
	fi
	rm -rf $rootdir'/query' $rootdir'/target'

	echo "4. indexing the query and target genomic"
	if [ ! -s " $rootdir/query.fasta.fai" ]
	then
		samtools faidx $rootdir'/query.fasta'
		samtools faidx $rootdir'/target.fasta'
	else
		echo " index already done"
	fi

	echo "5. running gffread to extract the pep sequence"

	if [ ! -s " $rootdir/target.pep.faa" ]
	then
		gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
		gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
	else
		echo "pep extraction already done"
	fi

	echo "6. running diff to check if the pep sequences are different or not"
	diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')

	if [ ! -s "$rootdir/blat.txt" ]
	then
		echo "RESULTS no blat results here: no extraction to make"
		touch $rootdir/UNMAPPED
	else
		if [ "$diffresult" != "" ]
		then
			echo "RESULTS pep of gene $geneid has changed"
			echo $diffresults > $rootdir/CDS_CHANGED

		else
			echo "RESULTS pep of gene $geneid stays the same"
			touch $rootdir/CDS_OK
		fi
	fi

done

echo "ENDED AT `date`"