#!/bin/bash ##SBATCH --array=0-7 ##SBATCH -n1 ##SBATCH --mail-user=helene.rimbert@inrae.fr ##SBATCH --mail-type=ALL # for use with slurm #chroms=($(echo TraesCS{1..7} TraesCSU)) #chrom=${chroms[$SLURM_ARRAY_TASK_ID]} # get the list of query and target file for gmap inputDir=$1 blastdb=$2 querylist=($(find $inputDir -type f -name 'query.fasta')) targetlist=($(find $inputDir -type f -name 'target.fasta')) numquery=${#querylist[@]} numtarget=${#targetlist[@]} #blastdb='/banks/blast/IWGSCv1.1_repr_mrna' echo "STARTING AT `date` FOR DATASET $inputDir" echo "found $numquery query fasta file" echo "found $numtarget target fasta file" echo "Check for genes with invilid isbp pairs" fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt for i in "${querylist[@]}"; do rootdir=`dirname $i` # extract the name of the gene geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/') echo "\n\n================================================" echo " gene is $geneid" echo " datadir is $rootdir" echo "1. creating gmap db for dataset $rootdir" if [ ! -d "$rootdir/query" ] then gmap_build -D $rootdir -d query $rootdir'/query.fasta' & pid=$! gmap_build -D $rootdir -d target $rootdir'/target.fasta' & pid2=$! wait $pid $pid2 fi echo "2. extract all mrnas" if [ ! -s "$rootdir/query.mrna.fasta" ] then # on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'), # puis on ne garde que les mrna du gène en cours de traitement, # et on refait un appel sur la banque blast pour extraire toutes les sequences mrna blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta' fi echo "3. run gmap on target and query" if [ ! -s "$rootdir/gmap.target.gff3" ] then gmap -n 1 -d target -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3' & pid=$! gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' & pid2=$! blastn -query $rootdir'/query.fasta' -subject $rootdir'/target.fasta' -outfmt 6 -out $rootdir'/blast_m8.tab' & pid3=$! wait $pid $pid2 $pid3 fi rm -rf $rootdir'/query' $rootdir'/target' echo "4. indexing the query and target genomic" if [ ! -s " $rootdir/query.fasta.fai" ] then samtools faidx $rootdir'/query.fasta' samtools faidx $rootdir'/target.fasta' else echo " index already done" fi echo "5. running gffread to extract the pep sequence" if [ ! -s " $rootdir/target.pep.faa" ] then gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3' gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' else echo "pep extraction already done" fi echo "6. running diff to check if the pep sequences are different or not" diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa') if [ ! -s "$rootdir/blat.txt" ] then echo "RESULTS no blat results here: no extraction to make" touch $rootdir/UNMAPPED else if [ "$diffresult" != "" ] then echo "RESULTS pep of gene $geneid has changed" echo $diffresults > $rootdir/CDS_CHANGED else echo "RESULTS pep of gene $geneid stays the same" touch $rootdir/CDS_OK fi fi done echo "ENDED AT `date`"