Skip to content
Snippets Groups Projects
recalcGmapRescue.sh 2.3 KiB
Newer Older
#!/bin/bash
##SBATCH --array=0-7
##SBATCH -n1
##SBATCH --mail-user=helene.rimbert@inrae.fr
##SBATCH --mail-type=ALL

inputDir=$1
Helene Rimbert's avatar
Helene Rimbert committed
bindir=$4

export TRANSFERTANNOT_PATH=$bindir

querylist=($(find  $inputDir  -type f -name 'gmap.target.gff3'))

numquery=${#querylist[@]}

echo "STARTING AT `date` FOR DATASET $inputDir"
echo "found $numquery query file"

Helene Rimbert's avatar
Helene Rimbert committed
for inputgff in "${querylist[@]}";
Helene Rimbert's avatar
Helene Rimbert committed
	rootdir=`dirname $inputgff`
	# extract the name of the gene
    geneid=$(cat $rootdir'/query.fasta'|fgrep '>'|perl -ne 'print $1 if /^\>(TraesCS.+)\ coords/')
	echo "\n\n================================================"
    echo " gene is $geneid"
	echo " datadir is $rootdir"

    # extract the target chrom, start and stop
    chrom=$(fgrep '>' $rootdir'/target.fasta'|sed "s/>//"|cut -d ' ' -f1|cut -d '_' -f2)
    start=$(fgrep '>' $rootdir'/target.fasta'|sed "s/>//"|cut -d ' ' -f1|cut -d '_' -f3)
    stop=$(fgrep '>' $rootdir'/target.fasta'|sed "s/>//"|cut -d ' ' -f1|cut -d '_' -f4)
    echo " Target coords: $chrom from $start - $stop"

Helene Rimbert's avatar
Helene Rimbert committed
    # check the status of the CDS: OK or CHANGED or UNKNOWN
    mappingStatus='UNMAPPED'
Helene Rimbert's avatar
Helene Rimbert committed
    cdsStatus='CDS_OK'
    if [ -f "$rootdir/CDS_OK" ]
    then
        echo " CDS has not changed between target and reference"
Helene Rimbert's avatar
Helene Rimbert committed
        cdsStatus='CDS_OK'

    elif [ -f "$rootdir/CDS_CHANGED" ]
    then
        echo " WARNING : CDS has changed between target and reference"
Helene Rimbert's avatar
Helene Rimbert committed
        cdsStatus='CDS_CHANGED'

    elif [ -f "$rootdir/CDS_UNKNOWN" ]
    then
        echo " WARNING : we do not know if the CDS has changed or not"
        cdsStatus='CDS_UNKNOWN'
Helene Rimbert's avatar
Helene Rimbert committed
        echo " Assume gene is UNKNOWN"
        echo " need to run gmap on entire genome"
Helene Rimbert's avatar
Helene Rimbert committed
        cdsStatus='CDS_UNKNOWN'
        continue
    fi

    echo " 1. convert gmap coordinates on new reference"
Helene Rimbert's avatar
Helene Rimbert committed
    eval $TRANSFERTANNOT_PATH'/bin/convertGmapCoords.py' -G $inputgff -s $start -e $stop -c $cdsStatus -o $rootdir'/recalc.gff3' -r $chrom -g $geneid -m 'GMAP_ON_TARGET' &> $rootdir'/recalc.log'
	
    echo " 2. sort gff"
    eval gt gff3 -sort -retainids -tidy $rootdir'/recalc.gff3' 1> $rootdir'/recalc_clean.gff3'

done

# merge All recalc_clean.gff3 files
gfflist=($(find  $inputDir  -type f -name 'recalc_clean.gff3'))
gt gff3 -sort -tidy -retainids "${gfflist[@]}" 1> $finalGFF 2> $finalGFF.gt.log

echo "ENDED AT `date`"