Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
##SBATCH --array=0-7
##SBATCH -n1
##SBATCH --mail-user=helene.rimbert@inrae.fr
##SBATCH --mail-type=ALL
# for use with slurm
#chroms=($(echo TraesCS{1..7} TraesCSU))
#chrom=${chroms[$SLURM_ARRAY_TASK_ID]}
# get the list of query and target file for gmap
inputDir=$1
blastdb=$2
querylist=($(find $inputDir -type f -name 'gmap.target.gff3'))
numquery=${#querylist[@]}
echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query file"
for i in "${querylist[@]}";
do
rootdir=`dirname $i`
# extract the name of the gene
geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
echo "\n\n================================================"
echo " gene is $geneid"
echo " datadir is $rootdir"
# extract the target chrom, start and stop
chrom=$(fgrep '>' $rootdir'/target.fasta'|sed "s/>//"|cut -d ' ' -f1|cut -d '_' -f2)
start=$(fgrep '>' $rootdir'/target.fasta'|sed "s/>//"|cut -d ' ' -f1|cut -d '_' -f3)
stop=$(fgrep '>' $rootdir'/target.fasta'|sed "s/>//"|cut -d ' ' -f1|cut -d '_' -f4)
echo " Target coords: $chrom from $start - $stop"
# check the status of the CDS: OK or CHANGED
mappingStatus='UNMAPPED'
if [ -f "$rootdir/CDS_OK" ]
then
echo " CDS has not changed between target and reference"
mappingStatus='CDS_OK'
elif [ -f "$rootdir/CDS_CHANGED" ]
then
echo " WARNING : CDS has changed between target and reference"
mappingStatus='CDS_CHANGED'
else
echo " Assume gene is unmapped"
echo " need to run gmap on entire genome"
mappingStatus='UNMAPPED'
#touch $rootdir/UNMAPPED
continue
fi
echo " 1. convert gmap coordinates on new reference"
eval $TRANSFERTANNOT_PATH'/bin/convertGmapCoords.py' -G $rootdir'/gmap.target.gff3' -s $start -e $stop -c $mappingStatus -o $rootdir'/recalc.gff3' -r $chrom -g $geneid -m GMAP_on_target &> $rootdir'/recalc.log'
echo " 2. sort gff"
eval gt gff3 -sort -retainids -tidy $rootdir'/recalc.gff3' 1> $rootdir'/recalc_clean.gff3'
done
echo "ENDED AT `date`"