Skip to content
Snippets Groups Projects
Commit a6b15d58 authored by Helene Rimbert's avatar Helene Rimbert
Browse files

add fasta removing to save disk space

parent a4418dc5
No related branches found
No related tags found
No related merge requests found
......@@ -28,7 +28,6 @@ do
echo " datadir is $rootdir"
echo "1. extract all mrnas"
echo "================================================"
# on dump d'abord tous les IDs de la banque blast ('-outfmt %i)'),
# puis on ne garde que les mrna du gène en cours de traitement,
# et on refait un appel sur la banque blast pour extraire toutes les sequences mrna
......@@ -36,9 +35,9 @@ do
echo "2. run gmap on target and query"
echo "================================================"
targetindex='target'
targetindexdir=$rootdir
targetgff=$rootdir'/gmap.target.gff3'
gmapexe='gmap'
# check if we use as target the sub sequence or the entire genome
......@@ -47,16 +46,16 @@ do
echo " Gene $geneid has status UNMAPPED: mapping on the whole genome"
targetindex=$gmapdb
targetindexdir=$gmapdbdir
targetgff=$rootdir'/gmap.wholeGenome.gff3'
gmapexe='gmapl'
eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.wholeGenome.gff3'
eval $gmapexe --min-identity 0.90 --min-trimmed-coverage 0.80 --ordered -n 1 -d $targetindex -D $targetindexdir -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff
else
echo "running gmap on target"
eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.target.gff3'
eval gmap --ordered -n 1 -g $rootdir'/target.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $targetgff
fi
eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' > $rootdir'/gmap.query.gff3'
eval gmap --ordered -n 1 -g $rootdir'/query.fasta' -f 2 $rootdir'/query.mrna.fasta' 1> $rootdir'/gmap.query.gff3'
echo "3. indexing the query and target genomic"
echo "================================================"
samtools faidx $rootdir'/query.fasta' &
pid=$!
samtools faidx $rootdir'/target.fasta' &
......@@ -64,38 +63,38 @@ do
wait $pid $pid2
echo "4. running gffread to extract the pep sequence"
echo "================================================"
if [ -f "$rootdir/UNMAPPED" ]
then
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $rootdir'/gmap.wholeGenome.gff3'
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $wholegenomeFasta $targetgff
else
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $rootdir'/gmap.target.gff3'
gffread -x $rootdir'/target.cds.fasta' -y $rootdir'/target.pep.faa' -g $rootdir'/target.fasta' $targetgff && rm $rootdir'/target.fasta'
fi
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3'
gffread -x $rootdir'/query.cds.fasta' -y $rootdir'/query.pep.faa' -g $rootdir'/query.fasta' $rootdir'/gmap.query.gff3' && rm $rootdir'/query.fasta'
echo "5. running diff to check if the pep sequences are different or not"
echo "================================================"
diffresult=$(diff $rootdir'/target.pep.faa' $rootdir'/query.pep.faa')
if [ "$diffresult" != "" ]
then
echo "RESULTS pep of gene $geneid has changed"
echo $diffresults > $rootdir/CDS_CHANGED
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_CHANGED"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff
else
echo "RESULTS pep of gene $geneid stays the same"
touch $rootdir/CDS_OK
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $rootdir'/gmap.wholeGenome.gff3' > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $rootdir'/gmap.wholeGenome.gff3'
awk 'BEGIN{OFS="\t"}{if ($3 ~ /gene/){print $0";cds=CDS_OK"} else {print $0}}' $targetgff > $rootdir/'tmp.gff3' && mv $rootdir/'tmp.gff3' $targetgff
fi
done
### concat all the gff files created when mapping done on whole genome
# for genes mapped on subsequence
echo "Now concat all GFF of gmap on target"
gffList=($(find $inputDir -type f -name "gmap.target.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueGff 2> $outputRescueGff.log
# for genes mapped on whole genome
echo "Now concat all GFF of gmap on whole genome"
gffList=($(find $inputDir -type f -name "gmap.wholeGenome.gff3" ))
gt gff3 -sort -tidy -retainids "${gffList[@]}" 1> $outputRescueWholeGenomeGff 2> $outputRescueWholeGenomeGff.log
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment