#!/bin/bash ##SBATCH --array=0-7 ##SBATCH -n1 ##SBATCH --mail-user=helene.rimbert@inrae.fr ##SBATCH --mail-type=ALL # for use with slurm #chroms=($(echo TraesCS{1..7} TraesCSU)) #chrom=${chroms[$SLURM_ARRAY_TASK_ID]} # get the list of query and target file for gmap inputDir=$1 blastdb=$2 gmapdb=$3 gmapdbdir=$4 outputRescueGff=$5 outputRescueWholeGenomeGff=$6 querylist=($(find $inputDir -type f -name 'query.fasta')) targetlist=($(find $inputDir -type f -name 'target.fasta')) numquery=${#querylist[@]} numtarget=${#targetlist[@]} #blastdb='/banks/blast/IWGSCv1.1_repr_mrna' echo "STARTING AT `date` FOR DATASET $inputDir" echo "found $numquery query fasta file" echo "found $numtarget target fasta file" echo "Check for genes with invilid isbp pairs" fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt for i in "${querylist[@]}"; do rootdir=`dirname $i` # extract the name of the gene geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/') echo "================================================" echo " gene is $geneid" echo " datadir is $rootdir" echo "1. creating gmap db for dataset $rootdir" echo "================================================" gmap_build -D $rootdir -d query $rootdir'/query.fasta' & pid=$! if [ -f "$rootdir/GMAPRESCUE" ] then gmap_build -D $rootdir -d target $rootdir'/target.fasta' & pid2=$! fi wait $pid $pid2 echo "2. extract all mrnas" echo "================================================" # on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'), # puis on ne garde que les mrna du gène en cours de traitement, # et on refait un appel sur la banque blast pour extraire toutes les sequences mrna blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta' echo "3. run gmap on target and query" echo "================================================" targetindex='target' targetindexdir=$rootdir targetoutputGff=$rootdir'/gmap.target.gff3' gmapexe='gmap' # check if we use as target the sub sequence or the entire genome if [ -f "$rootdir/UNMAPPED" ] then echo " Gene $geneid has status UNMAPPED: mapping on the whole genome target" targetindex=$gmapdb targetindexdir=$gmapdbdir targetoutputGff=$rootdir'/gmap.wholeGenome.gff3' gmapexe='gmapl' fi eval $gmapexe -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' > $targetoutputGff & pid=$! eval gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' & pid2=$! wait $pid $pid2 rm -rf $rootdir'/query' $rootdir'/target' echo "4. indexing the query and target genomic" echo "================================================" samtools faidx $rootdir'/query.fasta' samtools faidx $rootdir'/target.fasta' if [ -f "$rootdir/GMAPRESCUE" ] then echo "5. running gffread to extract the pep sequence" echo "================================================" gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3' gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' echo "6. running diff to check if the pep sequences are different or not" diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa') if [ "$diffresult" != "" ] then echo "RESULTS pep of gene $geneid has changed" echo $diffresults > $rootdir/CDS_CHANGED else echo "RESULTS pep of gene $geneid stays the same" touch $rootdir/CDS_OK fi else echo " rescue done on the whole genome: we do not check for pep sequences" touch $rootdir/CDS_UNKNOWN fi done ### concat all the gff files created when mapping done on whole genome # for genes mapped on subsequence gffList=($(find $inputDir -type f -name "gmap.target.gff3" )) gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log # for genes mapped on whole genome gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" )) gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log echo "ENDED AT `date`"