Commit 7a45d38d authored by sallet's avatar sallet
Browse files

Merge branch 'feature/ImproveAltEst' into develop

parents cd422d77 c11ceed0
-------------------------------------------------------------------------------
v4.2c released May, 2021
v4.3 released July, 2021
-------------------------------------------------------------------------------
Use -k flag or AltEst.Reference to load a reference annotation and predict splicing variants. Improve performance to perform variant prediction.
- Use -k flag or AltEst.Reference to load a reference annotation and predict the splicing variants.
- Improve performance to perform the variant prediction.
- In the AltEst plugin, add the parameter 'AltEst.IncompatibilityExonBorderMatchThreshold'. EuGene will perform a
variant search only for the AltEst where one of the exons shows at one of its borders a difference of at least
x nucleotides with a reference gene.
- If AltEst is activated, create two files: one for the initial prediction (.gff3) and one also including the variants
(.variants.gff3)
-------------------------------------------------------------------------------
v4.2b released Sep. 2020
......
......@@ -4,10 +4,18 @@ Copyright (C) 2020, EuGene Team <eugene-help@groupes.renater.fr>
See the end for copying conditions.
Please send eugene bug reports to eugene-help@groupes.renater.fr
Version 4.2c
Version 4.3
* If AltEst plugin is activate, EuGene creates two files: one for the initial prediction
(.gff3) and one including the variant prediction (.variants.gff3)
* The -k option (equivalent to AltEst.reference=1) allows to predict splicing variants
from a reference annotation.
* New parameter 'AltEst.IncompatibilityExonBorderMatchThreshold'. EuGene will perform a
variant search only for the AltEst where one of the exons shows at one of its borders
a difference of at least x nucleotides with a reference gene.
* Improve performance of splicing variant prediction
* Allow to predict only variant from a reference annotation
Version 4.2b
......@@ -48,7 +56,7 @@ Version 2.0
-------------------------------------------------------
Copying information:
Copyright (C) 2011, EuGene Team <eugene-help@lists.mulcyber.toulouse.inra.fr>
Copyright (C) 2011, EuGene Team <eugene-help@groupes.renater.fr>
Permission is granted to anyone to make or distribute verbatim copies
of this document as received, in any medium, provided that the
......
......@@ -2,6 +2,8 @@ Procedure to create a new Eugene release.
0) Prerequise: 'EUGENEDIR' environment variable has to be defined.
1) Describe the release news in ChangeLog file.
1) To update the git eugene directory, move to $EUGENEDIR and
type `git pull'.
......@@ -15,12 +17,12 @@ Procedure to create a new Eugene release.
4) To check the new release is OK, compile and launch tests:
make clean
./reconf
./configure
./configure --prefix=$EUGENEDIR
make
make check
5) Commit modified files:
git commit cfg/eugene.par configure.ac Makefile.in
git commit cfg/eugene.par configure.ac Makefile.in ChangeLog
git push
6) Create the new release:
......@@ -28,4 +30,4 @@ Procedure to create a new Eugene release.
7) Type `make dist' to create a new tarball file in the
$EUGENEDIR directory, named for instance eugene-4.2b.tar.gz.
Put the tarball in the CVS repository. (Section 'Files')
Put the tarball in the Download section of eugene.toulouse.inrae.fr
#################################################################
###################### GENERAL PARAMETERS #######################
#################################################################
EuGene.version 4.2c
EuGene.version 4.3
EuGene.organism Arabidopsis
EuGene.mode Eukaryote # Eukaryote or Prokaryote
EuGene.sloppy 0
......@@ -66,6 +66,7 @@ Output.webdir LOCAL # or URL: http://www.inra.fr/bia/T/EuGene/
#
##### Alternative splicing predictor control
AltEst.use 0
AltEst.IncompatibilityExonBorderMatchThreshold 50
AltEst.ExonBorderMatchThreshold 0
AltEst.Penalty 1000
AltEst.includedEstFilter 1
......
#################################################################
###################### GENERAL PARAMETERS #######################
#################################################################
EuGene.version 4.2c
EuGene.version 4.3
EuGene.organism Arabidopsis
EuGene.mode Eukaryote # Eukaryote or Prokaryote
EuGene.sloppy 0
......@@ -66,12 +66,13 @@ Output.webdir LOCAL # or URL: http://www.inra.fr/bia/T/EuGene/
#
##### Alternative splicing predictor control
AltEst.use 0
AltEst.IncompatibilityExonBorderMatchThreshold 100
AltEst.ExonBorderMatchThreshold 0
AltEst.Penalty 1000
AltEst.includedEstFilter 1
AltEst.compatibleEstFilter 1
AltEst.unsplicedEstFilter 1
AltEst.extremeLengthFilter 1
AltEst.compatibleEstFilter 0
AltEst.unsplicedEstFilter 0
AltEst.extremeLengthFilter 0
AltEst.maxEstLength 10000
AltEst.minEstLength 100
AltEst.maxIn 10000
......@@ -79,9 +80,9 @@ AltEst.minIn 40
AltEst.maxEx 10000
AltEst.minEx 3
AltEst.exonucleasicLength 10
AltEst.RepredictMargin 16000
AltEst.RepredictMargin 1000
AltEst.altEstDisplay 0
AltEst.strandSpecific 0
AltEst.strandSpecific 1
AltEst.verbose 0
#AltEst.reference /path/of/gff3/reference/annotation
AltEst.format GFF3
......
#################################################################
###################### GENERAL PARAMETERS #######################
#################################################################
EuGene.version 4.2c
EuGene.version 4.3
EuGene.organism Meliloti
EuGene.mode Prokaryote # Eukaryote or Prokaryote or Porkaryote2
EuGene.sloppy 0
......
......@@ -2773,7 +2773,7 @@ fi
# Define the identity of the package.
PACKAGE=eugene
VERSION=4.2c
VERSION=4.3
cat >>confdefs.h <<_ACEOF
......
......@@ -21,7 +21,7 @@
AC_INIT(, , [EuGene Team eugene@ossau.toulouse.inra.fr])
AC_CONFIG_AUX_DIR(config)
AM_CONFIG_HEADER(config.h)
AM_INIT_AUTOMAKE(eugene, 4.2c)
AM_INIT_AUTOMAKE(eugene, 4.3)
# Checks for programs.
AC_PROG_CXX
......
......@@ -331,6 +331,129 @@ bool OneAltEst :: CompatibleWith(Prediction *pred)
return false;
}
Gene* OneAltEst :: GetUncompatibleGene(Prediction *pred, int margin)
{
int idxf, idxe=0;
Gene *g;
bool VERBOSE = false;
if (VERBOSE)
{
fprintf(stdout, "Recherche un gène incompatible pour l'AltEst suivante :\n ") ;
this->Print();
}
//locate overlapping gene
if (PAR.getI("AltEst.strandSpecific"))
g = pred->FindGene(start,end, strand);
else
g = pred->FindGene(start,end);
// no overlapping gene
if (g == NULL)
{
if (VERBOSE) fprintf(stdout, "Pas de gène chevauchat l'altEst (1) ! \n ") ;
return NULL;
}
else
{
// study g to be sure it is incompatible
// check first exon start is in transcribed matured region
int nbFeature = g->nbFea();
for (idxf = 0; idxf < nbFeature; idxf++)
{
if ((g->vFea[idxf]->start-1 <= vi_ExonStart[idxe]) &&
(g->vFea[idxf]->end-1 >= vi_ExonStart[idxe]))
{
if (! g->vFea[idxf]->IsTranscribedAndUnspliced() &&
abs((g->vFea[idxf]->end-1) - vi_ExonStart[idxe]) >= margin)
{
if (VERBOSE) {
fprintf(stdout, "Gene incompatible trouve : EST debute dans intron ! \n ") ;
g->Print();
}
return g;
}
else break;
}
}
idxf++;
bool firstOk = false;
// test des frontieres suivantes
while ((idxe < exonsNumber-1) && (idxf < nbFeature))
{
if (!firstOk && (abs ((vi_ExonEnd[idxe]+1) - (g->vFea[idxf]->start-1)) <= margin))
firstOk = true;
// Test if the left border of an intron of the gen is incoherent with the EST
if (!firstOk && ( ! g->vFea[idxf]->IsTranscribedAndUnspliced()) ) // IG or intron: broken
{
if (VERBOSE) {
fprintf(stdout, "Gene incompatible trouve: AltEst incoherente avec un intron du gene de reference (left position) ! \n ") ;
g->Print();
}
return g;
}
// Test if the right border of an intron of the gene is incoehrent with the EST
if ( (abs(vi_ExonStart[idxe+1] - g->vFea[idxf]->end) > margin) &&
!g->vFea[idxf]->IsTranscribedAndUnspliced() ) // IF or intron
{
if (VERBOSE) {
fprintf(stdout, "Gene incompatible trouve: AltEst incoherente avec un intron du gene de reference (right position) ! \n ") ;
g->Print();
}
return g;
}
if (firstOk && (abs(vi_ExonStart[idxe+1] - g->vFea[idxf]->end) <= margin))
{
if ( !g->vFea[idxf]->IsTranscribedAndUnspliced() ) // IF or intron
{
idxe++;
idxf++;
firstOk = false;
continue;
}
}
idxf++;
}
// test de la fin
for (; idxf < nbFeature; idxf++)
{
if ((g->vFea[idxf]->start-1 <= vi_ExonEnd[idxe]) &&
(g->vFea[idxf]->end-1 >= vi_ExonEnd[idxe]))
{
//return (g->vFea[idxf]->IsTranscribedAndUnspliced());
if (!g->vFea[idxf]->IsTranscribedAndUnspliced() && abs( vi_ExonEnd[idxe] - (g->vFea[idxf]->start-1)) >= margin )
{
if (VERBOSE) {
fprintf(stdout, "Gene incompatible trouve (4) ! \n ") ;
g->Print();
}
return g;
}
else
{
if (VERBOSE) {
fprintf(stdout, "Pas de gene incompatible ! \n ") ;
}
return NULL;
}
}
}
}
//return false;
if (VERBOSE) {
fprintf(stdout, "je suis a la fin. Je retourne le gene trouve..\n");
}
return g;
}
// --------------------------------------
// Penalize according to the EST
// --------------------------------------
......@@ -697,8 +820,6 @@ void AltEst :: Compare(int &nbIncomp, int &nbNoevidence, int &nbIncluded)
// WARNING : voae_AltEst[0] is the special INIT (counted in totalAltEstNumber)
keptAltEstNumber = totalAltEstNumber;
int debug = 1;
int nbComparison = 0;
int strandSpecific = PAR.getI("AltEst.strandSpecific");
if (compatibleEstFilter || includedEstFilter)
......@@ -714,10 +835,6 @@ void AltEst :: Compare(int &nbIncomp, int &nbNoevidence, int &nbIncluded)
// all the next Est have a higher position than the current one
if (voae_AltEst[j].GetStart() > voae_AltEst[i].GetEnd()) break;
if (voae_AltEst[j].IsToRemove()) continue;
/*if (debug) {
fprintf(stderr, "\n COMP EST i %d - %d / EST j %d - %d", voae_AltEst[i].GetStart(), voae_AltEst[i].GetEnd(), voae_AltEst[j].GetStart(), voae_AltEst[j].GetEnd());
nbComparison++;
}*/
if (voae_AltEst[i].IsInconsistentWith(&voae_AltEst[j]))
{
......@@ -740,19 +857,8 @@ void AltEst :: Compare(int &nbIncomp, int &nbNoevidence, int &nbIncluded)
}
}
}
/*if (debug) {
fprintf(stderr, "\n COMP EST i %d - %d / EST j %d - %d END", voae_AltEst[i].GetStart(), voae_AltEst[i].GetEnd(), voae_AltEst[j].GetStart(), voae_AltEst[j].GetEnd());
}*/
}
}
/*if (debug)
{
fprintf(stderr,"\nNombre de comparaisons effectues : %d\n", nbComparison);
}*/
if (compatibleEstFilter)
{
......
......@@ -64,6 +64,7 @@ class OneAltEst
int minEstLen, int maxEstLen);
bool IsInconsistentWith(OneAltEst*);
bool CompatibleWith(Prediction *pred);
Gene* GetUncompatibleGene(Prediction *pred, int margin=0);
void Print ();
inline void UpdateBoundaries() { start = vi_ExonStart[0]; end = vi_ExonEnd[vi_ExonEnd.size()-1]; };
inline char* GetId() { return id; };
......
......@@ -81,9 +81,9 @@ void BackPoint :: Clean ()
StartPos = 0;
Cost = 0.0;
Additional = 0.0;
delete Next;
delete Prev;
delete Origin;
//if (Next != NULL) delete Next;
//if (Prev != NULL) delete Prev;
//if (Prev != NULL) delete Origin;
Next = Prev = this;
Origin = NULL;
Status = 0;
......@@ -151,6 +151,7 @@ void Track :: Zap()
// ----------------------------------------------------------------
void Track :: Clean()
{
Zap();
Path.Clean();
Optimal = 0.0;
OptPos = 0;
......@@ -210,8 +211,9 @@ void Track :: Mark(int pos)
void Track :: Sweep(int pos) {
BackPoint *It = Path.Next;
BackPoint *NextIt;
while ( (It != &Path)){// && (It->StartPos >= pos)) {
NextIt = It->Next;
if (!It->IsMarked()) {
It->Next->Additional += It->Additional;
It->Next->Prev = It->Prev;
......@@ -219,8 +221,7 @@ void Track :: Sweep(int pos) {
delete It;
NumBPCollect++;
}
It = It->Next;
It = NextIt;
}
}
// ----------------------------------------------------------------
......
......@@ -41,6 +41,7 @@ const int FASTA_Len = 50;
const int MAX_LINE = 300;
const int MAX_GFF_LINE = 2048;
const double NINFINITY = log(0.0);
const int MIN_PERCENTAGE_OVERLAP_ALTEST = 80;
// Les Hits EST
const unsigned char HitForward = 0x1; // 00000001
......
......@@ -156,11 +156,11 @@ DAG :: ~DAG ()
// ----------------------------------------------------------------
// Clean the activeTracks
// NOT USED
// ----------------------------------------------------------------
void DAG :: Clean()
{
pred = NULL;
for (int i = 0; i < activeTracks.size(); i ++) LBP[activeTracks[i]].Clean();
DAG::NormalizingPath = 0.0;
}
......
......@@ -218,6 +218,7 @@ Prediction* Predict (DNASeq* TheSeq, MasterSensor* MSensor)
{
return Predict(TheSeq,0,TheSeq->SeqLen-1,MSensor);
}
// -------------------------------------------------------------------------
// Compute alternative Predictions based on EST
// -------------------------------------------------------------------------
......@@ -274,102 +275,7 @@ Prediction* AltPredict (DNASeq* TheSeq, int From, int To, MasterSensor* MSensor,
}
// // -------------------------------------------------------------------------
// // Compute alternative Predictions based on EST
// // NOT USED
// // -------------------------------------------------------------------------
// std::vector<Prediction*> AllAltPredict (DNASeq* TheSeq, int fromPos, int toPos, MasterSensor* MSensor,
// AltEst *AltEstDB, Prediction *pred)
// {
// int ExonBorderMatchThreshold = PAR.getI("AltEst.ExonBorderMatchThreshold");
// int RepredictMargin = PAR.getI("AltEst.RepredictMargin");
// int GCVerbose = PAR.getI("EuGene.VerboseGC");
// int GCLatency = PAR.getI("EuGene.GCLatency");
// DATA Data;
// int Forward = 1;//PAR.getI("Sense");
// int Dir = (Forward ? 1 : -1);
//
//
// int localFrom,localTo;
// std::vector <Prediction*> vPred;
// Prediction* AltPred;
// Gene* baseGene;
// int newGene = 0; // if a splice variant has no base gene, it is a "new" gene. counter needed for gene number
//
// // Active the appropriated tracks according to the eugene mode Prokaryote or Eukaryote
// InitActiveTracks(0);
// // Create a dag which would be reinitialze for each altEst
// DAG* Dag = new DAG(TheSeq, PAR);
// Dag->LoadDistLength();
//
// for (int altidx = 0; altidx < AltEstDB->totalAltEstNumber; altidx++)
// {
// if (AltEstDB->voae_AltEst[altidx].IsToRemove()) continue;
// if (AltEstDB->voae_AltEst[altidx].CompatibleWith(pred)) continue;
//
// localFrom = Max(fromPos, AltEstDB->voae_AltEst[altidx].GetStart()-RepredictMargin);
// localTo = Min(toPos, AltEstDB->voae_AltEst[altidx].GetEnd()+RepredictMargin);
// // DynaProg end at the lastpos + 1 to account for final signals.
// int FirstNuc = (Forward ? localFrom : localTo+1);
// int LastNuc = (Forward ? localTo+1 : localFrom);
//
// Dag->Init(FirstNuc-Dir, LastNuc+Dir);
// Dag->WeightThePrior();
//
// for (int nuc = FirstNuc; nuc != LastNuc+Dir; nuc += Dir)
// {
// // recuperation des infos
// MSensor->GetInfoAt(TheSeq, nuc, &Data);
// AltEstDB->Penalize(altidx,nuc,&Data);
// if (Forward)
// Dag->ShortestPathAlgoForward(nuc,Data);
// else
// Dag->ShortestPathAlgoBackward(nuc,Data);
// if (nuc && (nuc % GCLatency == 0)) Dag->MarkAndSweep(nuc,GCVerbose,GCLatency);
// }
// Dag->WeightThePrior();
// Dag->BuildPrediction(localFrom, localTo, Forward);
// Dag->pred->TrimAndUpdate(TheSeq);
// AltPred = Dag->pred;
//
// if (AltPred)
// {
// if ( (AltPred->vGene[0]->cdsStart == -1) || (AltPred->vGene[0]->cdsEnd == -1))
// {
// Dag->Clean();
// continue;
// }
// // Delete the gene of the alt prediction which doesn't overlap the EST
// AltPred->DeleteOutOfRange(AltEstDB->voae_AltEst[altidx].GetStart(),AltEstDB->voae_AltEst[altidx].GetEnd(), AltEstDB->voae_AltEst[altidx].GetStrand());
// // If genes overlapping the EST was found and if the prediction is original
// if ( (AltPred->nbGene > 0) && (AltPred->IsOriginal(pred,vPred,ExonBorderMatchThreshold)) )
// {
// fprintf(stderr,"Optimal path length = %.4f\n",- AltPred->optimalPath);
// baseGene = pred->FindGene(AltPred->vGene[0]->trStart,AltPred->vGene[0]->trEnd, AltPred->vGene[0]->GetStrand());
// if (baseGene)
// {
// baseGene->hasvariant++;
// AltPred->vGene[0]->isvariant = true;
// AltPred->vGene[0]->hasvariant = baseGene->hasvariant;
// AltPred->vGene[0]->geneNumber = baseGene->geneNumber;
// baseGene->tuStart = ( baseGene->tuStart ) ? Min(baseGene->tuStart,AltPred->vGene[0]->trStart)
// : Min(baseGene->trStart,AltPred->vGene[0]->trStart);
// baseGene->tuEnd = ( baseGene->tuEnd ) ? Max(baseGene->tuEnd,AltPred->vGene[0]->trEnd)
// : Max(baseGene->trEnd,AltPred->vGene[0]->trEnd);
// }
// else
// {
// fprintf(stderr,"New gene predicted by alternative spliced gene prediction.\n");
// AltPred->vGene[0]->geneNumber = pred->nbGene + newGene++;
// }
// vPred.push_back(AltPred);
// }
// }
// Dag->Clean();
// }
// return vPred;
// }
//
// -------------------------------------------------------------------------
// Read a fasta file
......@@ -477,6 +383,7 @@ int main (int argc, char * argv [])
char grname[FILENAME_MAX+1];
char miname[FILENAME_MAX+1];
int graph;
bool debugAltest = true;
fprintf(stderr,"-------------------------------------"
"--------------------------------\n");
......@@ -497,6 +404,10 @@ int main (int argc, char * argv [])
int sequence;
for (sequence = optind; sequence < argc ; sequence++)
{
time_t t1, t2, t3, t4, t5, t6, t7;
time(&t1); // start analyse the sequence
PAR.set("fstname", argv[sequence]);
// --------------------------------------------------------------------
......@@ -570,22 +481,22 @@ int main (int argc, char * argv [])
// --------------------------------------------------------------------
MS = new MasterSensor();
MS->InitMaster(TheSeq);
time(&t2); // end of Init the sensor
// --------------------------------------------------------------------
// Predict: 1st main prediction
// --------------------------------------------------------------------
if (PAR.count("AltEst.reference") > 0)
{
fprintf(stderr, "Alt mode\n");
fprintf(stderr, "Variant prediction...\n");
char reffile[FILENAME_MAX+1];
strcpy(reffile, PAR.getC("AltEst.reference"));
pred = new Prediction(reffile, TheSeq);
//pred->Print();
time(&t3); // end of create the Prediction object from the reference GFF3
}
else
{
char* mode = PAR.getC ("EuGene.mode");
if (!strcmp(mode, "Prokaryote2") || !strcmp(mode, "Eukaryote2") )
......@@ -599,10 +510,8 @@ int main (int argc, char * argv [])
else
{
pred = Predict(TheSeq, fromPos, toPos, MS);
//pred->Print();
fprintf(stderr,"Optimal path length = %.4f\n",- pred->optimalPath);
}
//pred->Print();
}
// --------------------------------------------------------------------
......@@ -611,7 +520,8 @@ int main (int argc, char * argv [])
if (graph)
pred->PlotPred();
if ( ! PAR.getI("AltEst.use") )
// Print the prediction in the classical mode or in the AltEst mode (without ref prediction preloading)
if ( ! PAR.getI("AltEst.use") || (PAR.getI("AltEst.use") && PAR.count("AltEst.reference") <= 0 ))
pred->Print(TheSeq, MS);
strcpy(miname, prefixName);
......@@ -628,32 +538,44 @@ int main (int argc, char * argv [])
ClosePNG();
fprintf(stderr, "done\n");
}
time(&t4); // end of MS PostAnalyse
// --------------------------------------------------------------------
// Load Alternative EST data (if any)
// --------------------------------------------------------------------
if (PAR.getI("AltEst.use"))
{
AltEst *AltEstDB = new AltEst(TheSeq);
time(&t5); // end of AltEst build
std::vector <Prediction*> vPred;
int ExonBorderMatchThreshold = PAR.getI("AltEst.ExonBorderMatchThreshold");
int RepredictMargin = PAR.getI("AltEst.RepredictMargin");
int ExonBorderMatchThreshold2 = PAR.getI("AltEst.IncompatibilityExonBorderMatchThreshold");
int newGene = 0; // if a splice variant has no base gene, it is a "new" gene. counter needed for gene number
printf ("ExonBorderMatchThreshold2 %i\n", ExonBorderMatchThreshold2);
Prediction* AltPred;
Gene* baseGene;
Gene* uncompatibleGene;
int localFrom,localTo, altLocalFrom,altLocalTo ;
int NumberLaunchedPred = 0;
for (int altidx = 0; altidx < AltEstDB->totalAltEstNumber; altidx++)
{
if (AltEstDB->voae_AltEst[altidx].IsToRemove()) continue;
int localFrom,localTo;
localFrom = Max(fromPos, AltEstDB->voae_AltEst[altidx].GetStart()-RepredictMargin);
localTo = Min(toPos, AltEstDB->voae_AltEst[altidx].GetEnd()+RepredictMargin);
// Look for an overlapping with an inconsistent gene
uncompatibleGene = AltEstDB->voae_AltEst[altidx].GetUncompatibleGene(pred, ExonBorderMatchThreshold2);
// skip if not reference gene found
if (uncompatibleGene == NULL) continue;
NumberLaunchedPred++;
/