Newer
Older
#!/bin/bash
##SBATCH --array=0-7
##SBATCH -n1
##SBATCH --mail-user=helene.rimbert@inrae.fr
##SBATCH --mail-type=ALL
# for use with slurm
#chroms=($(echo TraesCS{1..7} TraesCSU))
#chrom=${chroms[$SLURM_ARRAY_TASK_ID]}
# get the list of query and target file for gmap
inputDir=$1
blastdb=$2
querylist=($(find $inputDir -type f -name 'query.fasta'))
targetlist=($(find $inputDir -type f -name 'target.fasta'))
numquery=${#querylist[@]}
numtarget=${#targetlist[@]}
#blastdb='/banks/blast/IWGSCv1.1_repr_mrna'
echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query fasta file"
echo "found $numtarget target fasta file"
echo "Check for genes with invilid isbp pairs"
fgrep -h 'Could not identify a pair of marker to use to anchor' *.out |cut -d ' ' -f 15|cut -d '.' -f 1|sort > 3_mapping_wrongISBP_pair.txt
for i in "${querylist[@]}";
do
rootdir=`dirname $i`
# extract the name of the gene
geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
echo "\n\n================================================"
echo " gene is $geneid"
echo " datadir is $rootdir"
echo "1. creating gmap db for dataset $rootdir"
if [ ! -d "$rootdir/query" ]
then
gmap_build -D $rootdir -d query $rootdir'/query.fasta' &
pid=$!
gmap_build -D $rootdir -d target $rootdir'/target.fasta' &
pid2=$!
wait $pid $pid2
fi

Helene Rimbert
committed
echo "2. extract all mrnas"
if [ ! -s "$rootdir/query.mrna.fasta" ]
then

Helene Rimbert
committed
# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
# puis on ne garde que les mrna du gène en cours de traitement,
# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
blastdbcmd -db $blastdb -entry all -outfmt %i |fgrep -w $geneid |blastdbcmd -db $blastdb -entry_batch - > $rootdir'/query.mrna.fasta'
fi
echo "3. run gmap on target and query"
if [ ! -s "$rootdir/gmap.target.gff3" ]
then
gmap -n 1 -d target -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3' &
gmap -n 1 -d query -D $rootdir -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3' &
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
pid2=$!
blastn -query $rootdir'/query.fasta' -subject $rootdir'/target.fasta' -outfmt 6 -out $rootdir'/blast_m8.tab' &
pid3=$!
wait $pid $pid2 $pid3
fi
rm -rf $rootdir'/query' $rootdir'/target'
echo "4. indexing the query and target genomic"
if [ ! -s " $rootdir/query.fasta.fai" ]
then
samtools faidx $rootdir'/query.fasta'
samtools faidx $rootdir'/target.fasta'
else
echo " index already done"
fi
echo "5. running gffread to extract the pep sequence"
if [ ! -s " $rootdir/target.pep.faa" ]
then
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
else
echo "pep extraction already done"
fi
echo "6. running diff to check if the pep sequences are different or not"
diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
if [ ! -s "$rootdir/blat.txt" ]
then
echo "RESULTS no blat results here: no extraction to make"
touch $rootdir/UNMAPPED
else
if [ "$diffresult" != "" ]
then
echo "RESULTS pep of gene $geneid has changed"
echo $diffresults > $rootdir/CDS_CHANGED
else
echo "RESULTS pep of gene $geneid stays the same"
touch $rootdir/CDS_OK
fi
fi