From a7057bfd68e1e01d726c803f5d4e860e3c0162c0 Mon Sep 17 00:00:00 2001
From: Olivier Langella <olivier.langella@u-psud.fr>
Date: Tue, 19 Jun 2018 17:09:16 +0200
Subject: [PATCH] WIP: better pep xml support

---
 src/input/pepxmlsaxhandler.cpp | 117 ++++++++++++++++++++++++++-------
 src/input/pepxmlsaxhandler.h   |   3 +
 src/utils/types.h              |   6 +-
 3 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/src/input/pepxmlsaxhandler.cpp b/src/input/pepxmlsaxhandler.cpp
index 05090d158..809b68575 100644
--- a/src/input/pepxmlsaxhandler.cpp
+++ b/src/input/pepxmlsaxhandler.cpp
@@ -120,7 +120,10 @@ PepXmlSaxHandler::startElement(const QString &namespaceURI,
         {
           is_ok = startElement_mod_aminoacid_mass(attributes);
         }
-
+      else if(qName == "modification_info")
+        {
+          is_ok = startElement_modification_info(attributes);
+        }
       _current_text.clear();
     }
   catch(pappso::PappsoException exception_pappso)
@@ -211,10 +214,29 @@ PepXmlSaxHandler::startElement_msms_pipeline_analysis(QXmlAttributes attributes)
 bool
 PepXmlSaxHandler::startElement_msms_run_summary(QXmlAttributes attributes)
 {
-  bool is_ok          = true;
-  QString mz_datafile = QString("%1%2")
-                          .arg(attributes.value("base_name"))
-                          .arg(attributes.value("raw_data"));
+  bool is_ok = true;
+  // Path of the run announced by an earlier msms_run_summary (empty if none).
+  const QString old_file  = _current_complete_msrun_file_path;
+  const QString base_name = attributes.value("base_name");
+  _current_complete_msrun_file_path =
+    QString("%1%2").arg(base_name).arg(attributes.value("raw_data"));
+  const QString sample_name =
+    QFileInfo(_current_complete_msrun_file_path).baseName();
+
+  // A second run is rejected only when its base name differs from the first.
+  if(!old_file.isEmpty() && (sample_name != QFileInfo(old_file).baseName()))
+    {
+      throw pappso::PappsoException(
+        QObject::tr("ERROR reading pepxml file :\nX!TandemPipeline does not "
+                    "support identification source files containing results "
+                    "from multiple MS runs (%1 != %2)")
+          .arg(old_file)
+          .arg(_current_complete_msrun_file_path));
+    }
+  // Record the run: the file name is base_name as given, the sample name is
+  // the QFileInfo base name of base_name + raw_data.
+  _sp_msrun.get()->setFilename(base_name);
+  _sp_msrun.get()->setSampleName(sample_name);
   return is_ok;
 }
 
@@ -259,7 +281,12 @@ PepXmlSaxHandler::startElement_search_summary(QXmlAttributes attributes)
 bool
 PepXmlSaxHandler::startElement_spectrum_query(QXmlAttributes attributes)
 {
-  bool is_ok              = true;
+  bool is_ok           = true;
+  QString spectrum_ref = attributes.value("spectrum");
+  if(_current_complete_msrun_file_path.isEmpty())
+    {
+      _sp_msrun.get()->setFilename(QFileInfo(spectrum_ref).baseName());
+    }
   unsigned int start_scan = attributes.value("start_scan").toUInt();
   unsigned int end_scan   = attributes.value("end_scan").toUInt();
   if(start_scan != end_scan)
@@ -272,6 +299,7 @@ PepXmlSaxHandler::startElement_spectrum_query(QXmlAttributes attributes)
           .arg(start_scan)
           .arg(end_scan));
     }
+  _scan           = start_scan;
   _current_charge = attributes.value("assumed_charge").toUInt();
   if(attributes.value("retention_time_sec").isEmpty())
     {
@@ -297,7 +325,7 @@ PepXmlSaxHandler::startElement_spectrum_query(QXmlAttributes attributes)
 
 //<alternative_protein protein="sp|P46784|RS10B_YEAST" protein_descr="40S
 //       ribosomal protein S10-B OS=Saccharomyces cerevisiae (strain ATCC 204508
-//                   \
+//                                  \
 //S288c) GN=RPS10B PE=1 SV=1" num_tol_term="2" peptide_prev_aa="K"
 // peptide_next_aa="N"/>
 bool
@@ -482,37 +510,78 @@ PepXmlSaxHandler::startElement_search_score(QXmlAttributes attributes)
       if(name == "expect")
         {
           _p_peptide_evidence->setEvalue(valueStr.simplified().toDouble());
-          if(_p_peptide_evidence->getIdentificationEngine() ==
-             IdentificationEngine::OMSSA)
-            {
-              _p_peptide_evidence->setParam(PeptideEvidenceParam::omssa_evalue,
-                                            valueStr.simplified().toDouble());
-            }
         }
       else if(name == "EValue")
         {
           _p_peptide_evidence->setEvalue(valueStr.simplified().toDouble());
         }
-      // <search_score name="hyperscore" value="232"/>
-      else if(name == "hyperscore")
-        {
-          _p_peptide_evidence->setParam(
-            PeptideEvidenceParam::tandem_hyperscore,
-            QVariant(attributes.value("hyperscore").toDouble()));
-        }
-      else if(name == "pvalue")
+
+      IdentificationEngine identification_engine =
+        _p_peptide_evidence->getIdentificationEngine();
+      if(identification_engine == IdentificationEngine::OMSSA)
         {
-          if(_p_peptide_evidence->getIdentificationEngine() ==
-             IdentificationEngine::OMSSA)
+          if(name == "pvalue")
             {
               _p_peptide_evidence->setParam(PeptideEvidenceParam::omssa_pvalue,
                                             valueStr.simplified().toDouble());
             }
+          else if(name == "expect")
+            {
+              _p_peptide_evidence->setParam(PeptideEvidenceParam::omssa_evalue,
+                                            valueStr.simplified().toDouble());
+            }
+        }
+      else if(identification_engine == IdentificationEngine::XTandem)
+        {
+          if(name == "hyperscore")
+            {
+              _p_peptide_evidence->setParam(
+                PeptideEvidenceParam::tandem_hyperscore,
+                QVariant(valueStr.simplified().toDouble()));
+            }
+        }
+      else if(identification_engine == IdentificationEngine::MSGFplus)
+        {
+          if(name == "raw")
+            {
+              _p_peptide_evidence->setParam(
+                PeptideEvidenceParam::msgfplus_raw,
+                QVariant(valueStr.simplified().toDouble()));
+            }
+          else if(name == "SpecEValue")
+            {
+              _p_peptide_evidence->setParam(
+                PeptideEvidenceParam::msgfplus_SpecEValue,
+                QVariant(valueStr.simplified().toDouble()));
+            }
+          else if(name == "EValue")
+            {
+              _p_peptide_evidence->setParam(
+                PeptideEvidenceParam::msgfplus_EValue,
+                QVariant(valueStr.simplified().toDouble()));
+            }
         }
     }
   return is_ok;
 }
 
+
+// <modification_info mod_nterm_mass="43.018389" modified_peptide="SQRDCR">
+// NOTE(review): the attribute is used as-is as a monoisotopic mass delta;
+// confirm pepXML stores a delta here, not the mass of the modified terminus.
+bool
+PepXmlSaxHandler::startElement_modification_info(QXmlAttributes attributes)
+{
+  const QString nterm_mass = attributes.value("mod_nterm_mass");
+  if(nterm_mass.isEmpty())
+    {
+      return true; // no mod_nterm_mass attribute: nothing to add
+    }
+  pappso::AaModificationP nterm_modif =
+    Utils::guessAaModificationPbyMonoisotopicMassDelta(nterm_mass.toDouble());
+  _current_peptide_sp.get()->addAaModification(nterm_modif, 0);
+  return true;
+}
 // <modification_info modified_peptide="SQRDCR"> <mod_aminoacid_mass
 // position="5" mass="160.030649"/> </modification_info>
 bool
@@ -521,7 +590,7 @@ PepXmlSaxHandler::startElement_mod_aminoacid_mass(QXmlAttributes attributes)
   bool is_ok            = true;
   double mass           = attributes.value("mass").toDouble();
   unsigned int position = attributes.value("position").toUInt() - 1;
-  const pappso::Aa &aa  = _current_peptide_sp.get()->getConstAa(position);
+  const pappso::Aa aa(_current_peptide_sp.get()->getSequence()[position].toLatin1());
   double mass_modif     = mass - aa.getMass();
 
   pappso::AaModificationP modif =
diff --git a/src/input/pepxmlsaxhandler.h b/src/input/pepxmlsaxhandler.h
index ed38bc289..40a44ecc0 100644
--- a/src/input/pepxmlsaxhandler.h
+++ b/src/input/pepxmlsaxhandler.h
@@ -76,6 +76,8 @@ class PepXmlSaxHandler : public QXmlDefaultHandler
   bool startElement_interprophet_result(QXmlAttributes attributes);
   bool startElement_search_score(QXmlAttributes attributes);
   bool startElement_mod_aminoacid_mass(QXmlAttributes attributes);
+  bool startElement_modification_info(QXmlAttributes attributes);
+  
   bool endElement_search_hit();
   bool endElement_modification_info();
 
@@ -100,6 +102,7 @@ class PepXmlSaxHandler : public QXmlDefaultHandler
   QString _current_group_type;
   QString _current_note_label;
   QString _current_note_type;
+  QString _current_complete_msrun_file_path;
   unsigned int _scan;
   unsigned int _current_charge;
   pappso::pappso_double _current_retention_time;
diff --git a/src/utils/types.h b/src/utils/types.h
index 9c7ed47c5..df9423e49 100644
--- a/src/utils/types.h
+++ b/src/utils/types.h
@@ -75,7 +75,11 @@ enum class PeptideEvidenceParam : std::int8_t
   peptide_inter_prophet_probability = 5, ///< no PSI MS description
   omssa_evalue = 6, ///< MS:1001328  "OMSSA E-value." [PSI:PI]
   omssa_pvalue = 7, ///< MS:1001329  "OMSSA p-value." [PSI:PI]
-  
+  msgfplus_raw = 8, ///< MS:1002049  "MS-GF raw score." [PSI:PI]
+  msgfplus_denovo = 9, ///< MS:1002050  "MS-GF de novo score." [PSI:PI]
+  msgfplus_energy = 10, ///< MS:1002051  "MS-GF energy score." [PSI:PI]
+  msgfplus_SpecEValue = 11, ///< MS:1002052  "MS-GF spectral E-value." [PSI:PI]
+  msgfplus_EValue = 12, ///< MS:1002053  "MS-GF E-value." [PSI:PI]
 
 };
 
-- 
GitLab