From a4ae5d554a09ef724be2702b876ff6da571529b8 Mon Sep 17 00:00:00 2001
From: Olivier Langella <olivier.langella@u-psud.fr>
Date: Mon, 5 Mar 2018 22:11:23 +0100
Subject: [PATCH] WIP: Mascot parser

---
 src/input/mascot/mascotdatparser.cpp | 248 ++++++++++++++++++++-------
 src/input/mascot/mascotdatparser.h   |  17 ++
 src/utils/types.h                    |   3 +-
 3 files changed, 203 insertions(+), 65 deletions(-)

diff --git a/src/input/mascot/mascotdatparser.cpp b/src/input/mascot/mascotdatparser.cpp
index 1ef92308c..fc06c24f2 100644
--- a/src/input/mascot/mascotdatparser.cpp
+++ b/src/input/mascot/mascotdatparser.cpp
@@ -33,6 +33,7 @@
 #include <pappsomspp/pappsoexception.h>
 #include "../../core/peptidextp.h"
 #include "../../core/proteinmatch.h"
+#include "../../core/peptideevidence.h"
 
 MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_identification_group,
                                  IdentificationDataSource * p_identification_data_source) {
@@ -63,11 +64,23 @@ void MascotDatParser::parse(QIODevice * in_stream) {
                 parseHeaderLine( mime_parser.getCurrentTextStream().readLine());
             }
         }
+        else if (mime_parser.getCurrentFileName() == "summary") {
+            while(!mime_parser.getCurrentTextStream().atEnd()) {
+                parseSummaryLine( mime_parser.getCurrentTextStream().readLine());
+            }
+        }
         else if (mime_parser.getCurrentFileName() == "peptides") {
             while(!mime_parser.getCurrentTextStream().atEnd()) {
                 parsePeptidesLine( mime_parser.getCurrentTextStream().readLine());
             }
         }
+        else if (mime_parser.getCurrentFileName().startsWith("query")) {
+            _current_query_index = mime_parser.getCurrentFileName().mid(5).toUInt();
+            while(!mime_parser.getCurrentTextStream().atEnd()) {
+                parseQueryLine( mime_parser.getCurrentTextStream().readLine());
+            }
+            saveQuery();
+        }
     }
 
     mime_parser.close();
@@ -122,6 +135,7 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
             qDebug() << "queries=" <<  header_list[2];
             _number_of_queries = header_list[2].toUInt();
             _query_peptide_results.resize(_number_of_queries);
+            _summary_list.resize(_number_of_queries);
         }
 //min_peaks_for_homology=6
 //max_hits=50
@@ -149,70 +163,6 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
     }
 }
 
-void MascotDatParser::saveAndClearPeptide() {
-    qDebug() << "MascotDatParser::saveAndClearPeptide begin";
-    if (_current_peptide.query_index > 0) {
-        // save
-        _query_peptide_results[_current_peptide.query_index-1].push_back(_current_peptide);
-        //parse and save
-
-        QString peptide_str =_current_peptide.peptide_string_list.at(4);
-        if (!_current_peptide.subst.isEmpty()) {
-            //q856_p9_subst=1,X,W
-            //q24379_p2_subst=1,B,D,8,B,D
-            QStringList subst_list = _current_peptide.subst.split(",");
-            for (unsigned int i=0; i < subst_list.size(); i+=3) {
-                peptide_str = peptide_str.replace(subst_list.at(0+i).toInt()-1,1,subst_list.at(2+i));
-            }
-        }
-        PeptideXtpSp peptide_sp;
-        peptide_sp = PeptideXtp(peptide_str).makePeptideXtpSp();
-        peptide_sp = _p_project->getPeptideStore().getInstance(peptide_sp);
-
-        if (_current_peptide.protein_string_list.size() != _current_peptide.fasta_file_list.size()) {
-            throw pappso::PappsoException(QObject::tr("ERROR (_current_peptide.protein_string_list.size() != _current_peptide.fasta_file_list.size()) %1").arg(_current_peptide.protein_string_list.join(",\"")));
-        }
-
-        foreach (const QString &str, _current_peptide.protein_string_list) {
-            //sp|O95006|OR2F2_HUMAN":0:299:303:1
-            int position =  str.indexOf("\"", 0);
-            QString accession = str.mid(0, position);
-            qDebug() << "accession=" <<  accession;
-            QStringList position_list = str.mid(position+2).split(":");
-            if (position_list.size() != 4) {
-                throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(str));
-            }
-            unsigned int start = position_list.at(1).toUInt();
-            unsigned int stop = position_list.at(2).toUInt();
-
-            ProteinXtp protein;
-            protein.setAccession(accession);
-
-            ProteinMatch * p_protein_match = _p_identification_group->getProteinMatchInstance(protein.getAccession());
-            if (p_protein_match == nullptr) {
-                throw pappso::PappsoException(QObject::tr("ERROR (p_protein_match == nullptr) %1").arg(str));
-            }
-
-            ProteinXtpSp sp_xtp_protein = protein.makeProteinXtpSp();
-            p_protein_match->setProteinXtpSp(_p_project->getProteinStore().getInstance(sp_xtp_protein));
-            p_protein_match->setChecked(true);
-            
-            PeptideMatch peptide_match;
-            peptide_match.setStart(start);
-            //peptide_match.setPeptideEvidenceSp();
-            
-            p_protein_match->addPeptideMatch(peptide_match);
-        }
-    }
-
-    //new peptide query clear
-    _current_peptide.peptide_string_list.clear();
-    _current_peptide.fasta_file_list.clear();
-    _current_peptide.query_index = 0;
-    _current_peptide.subst = "";
-    qDebug() << "MascotDatParser::saveAndClearPeptide end";
-}
-
 void MascotDatParser::parsePeptidesLine(const QString & peptide_line) {
     try {
         if (_regexp_header_line.exactMatch(peptide_line)) {
@@ -304,3 +254,173 @@ void MascotDatParser::parsePeptidesLine(const QString & peptide_line) {
         throw pappso::PappsoException(_error_str);
     }
 }
+
+
+void MascotDatParser::parseQueryLine(const QString & query_line) {
+    try {
+        if (_regexp_header_line.exactMatch(query_line)) {
+            QStringList header_list = _regexp_header_line.capturedTexts();
+            QString index = header_list[1];
+            QString value = header_list[2];
+
+
+            //title=FULL%20ISSLGSVGAGIVAVKK%20N22213%20%20%20QEP1_SpikeIn_230914_1_3ng_270914%2e35282%2e35282%2e2
+            if (index == "title") {
+                _current_query.title = value;
+            }
+            //rtinseconds=5703.84
+            else if (index == "rtinseconds") {
+                _current_query.rt = value.toDouble();
+            }
+            //index=44035
+            else if (index == "index") {
+                _current_query.query_index = value.toUInt();
+            }
+            //charge=2+
+            else if (index == "charge") {
+                _current_query.charge = value.toUInt();
+            }
+        }
+        /*
+        mass_min=129.102051
+        mass_max=1198.751099
+        int_min=2327
+        int_max=6.845e+005
+        num_vals=44
+        num_used1=-1
+        Ions1=129.102051:1.111e+005,275.207306:9.008e+004,374.275299:9.717e+004,514.825623:1.929e+004,599.878906:1.472e+005,714.431519:1.356e+004,815.466003:9643,924.570435:4428,1028.645630:1.257e+005,1085.665527:6.845e+005,1198.751099:1.858e+005,147.112656:7.954e+004,261.159271:4.327e+004,357.248169:2.192e+004,506.312775:3319,543.336548:1.083e+005,655.376465:9786,1011.625977:1.747e+004,1068.641602:9.804e+004,1181.725342:2.712e+004,130.086014:3.602e+004,257.197388:1.455e+004,357.212097:3960,591.370056:2.454e+004,972.547058:3819,1069.647583:4.457e+004,201.123077:1.566e+004,299.171783:1.421e+004,534.825562:1.036e+004,1087.671631:2.374e+004,173.128418:1.512e+004,244.129089:1.052e+004,590.873291:8065,1067.656006:2.235e+004,228.133957:1.35e+004,258.180115:3194,534.333374:3635,1096.635376:1.549e+004,200.139084:1.026e+004,535.328552:3006,1070.637085:4813,183.112442:9925,131.080994:4806,211.108139:2327
+
+         */
+    }
+
+    catch (pappso::PappsoException exception_pappso) {
+        _error_str = QObject::tr("ERROR in MascotDatParser::parseQueryLine %1, PAPPSO exception:\n%2").arg(query_line).arg(exception_pappso.qwhat());
+        qDebug() << _error_str;
+        throw pappso::PappsoException(_error_str);
+    }
+    catch (std::exception exception_std) {
+        _error_str = QObject::tr("ERROR in MascotDatParser::parseQueryLine %1, std exception:\n%2").arg(query_line).arg(exception_std.what());
+        qDebug() << _error_str;
+        throw pappso::PappsoException(_error_str);
+    }
+}
+
+
+void MascotDatParser::parseSummaryLine(const QString & summary_line) {
+
+    if (_regexp_header_line.exactMatch(summary_line)) {
+        QStringList header_list = _regexp_header_line.capturedTexts();
+        QString index = header_list[1];
+        QString value = header_list[2];
+
+
+        //qmass1=598.300206
+        if (index.startsWith("qmass")) {
+            unsigned int query_index = index.mid(5).toUInt();
+            //_current_query.title = value;
+            _summary_list[query_index-1].exp_mass=value.toDouble();
+        }
+        //qexp1=300.157379,2+
+//qintensity1=2054822.6250
+//qmatch1=73
+        else if (index.startsWith("qmatch")) {
+            unsigned int query_index = index.mid(6).toUInt();
+            //_current_query.title = value;
+            _summary_list[query_index-1].match=value.toUInt();
+        }
+//qplughole1=14.820890
+        else if (index.startsWith("qplughole")) {
+            unsigned int query_index = index.mid(9).toUInt();
+            //_current_query.title = value;
+            _summary_list[query_index-1].plug_hole=value.toDouble();
+        }
+
+    }
+}
+
+
+void MascotDatParser::saveAndClearPeptide() {
+    qDebug() << "MascotDatParser::saveAndClearPeptide begin";
+    if (_current_peptide.query_index > 0) {
+        // save
+        _query_peptide_results[_current_peptide.query_index-1].push_back(_current_peptide);
+    }
+
+    //new peptide query clear
+    _current_peptide.peptide_string_list.clear();
+    _current_peptide.fasta_file_list.clear();
+    _current_peptide.query_index = 0;
+    _current_peptide.subst = "";
+    qDebug() << "MascotDatParser::saveAndClearPeptide end";
+}
+
+
+void MascotDatParser::saveQuery() {
+    qDebug() << "MascotDatParser::saveQuery begin";
+    if (_current_query.query_index > 0) {
+        std::vector< PeptideLine> peptide_list = _query_peptide_results[_current_query.query_index-1];
+
+        PeptideEvidence peptide_evidence(_p_identification_data_source->getMsRunSp().get(),_current_query.query_index);
+        peptide_evidence.setCharge(_current_query.charge);
+        peptide_evidence.setChecked(true);
+        peptide_evidence.setExperimentalMass(_summary_list[_current_query.query_index-1].exp_mass);
+        peptide_evidence.setRetentionTime(_current_query.rt);
+
+        peptide_evidence.setParam(PeptideEvidenceParam::mascot_macth_score,  QVariant(_summary_list[_current_query.query_index-1].match));
+
+        peptide_evidence.setIdentificationDataSource( _p_identification_data_source);
+        //parse and save
+        for(PeptideLine & peptide_line:peptide_list) {
+
+            QString peptide_str =peptide_line.peptide_string_list.at(4);
+            if (!peptide_line.subst.isEmpty()) {
+                //q856_p9_subst=1,X,W
+                //q24379_p2_subst=1,B,D,8,B,D
+                QStringList subst_list = peptide_line.subst.split(",");
+                for (unsigned int i=0; i < subst_list.size(); i+=3) {
+                    peptide_str = peptide_str.replace(subst_list.at(0+i).toInt()-1,1,subst_list.at(2+i));
+                }
+            }
+            PeptideXtpSp peptide_sp;
+            peptide_sp = PeptideXtp(peptide_str).makePeptideXtpSp();
+            peptide_sp = _p_project->getPeptideStore().getInstance(peptide_sp);
+
+            if (peptide_line.protein_string_list.size() != peptide_line.fasta_file_list.size()) {
+                throw pappso::PappsoException(QObject::tr("ERROR (peptide_line.protein_string_list.size() != peptide_line.fasta_file_list.size()) %1").arg(peptide_line.protein_string_list.join(",\"")));
+            }
+
+            foreach (const QString &str, peptide_line.protein_string_list) {
+                //sp|O95006|OR2F2_HUMAN":0:299:303:1
+                int position =  str.indexOf("\"", 0);
+                QString accession = str.mid(0, position);
+                qDebug() << "accession=" <<  accession;
+                QStringList position_list = str.mid(position+2).split(":");
+                if (position_list.size() != 4) {
+                    throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(str));
+                }
+                unsigned int start = position_list.at(1).toUInt();
+                unsigned int stop = position_list.at(2).toUInt();
+
+                ProteinXtp protein;
+                protein.setAccession(accession);
+
+                ProteinMatch * p_protein_match = _p_identification_group->getProteinMatchInstance(protein.getAccession());
+                if (p_protein_match == nullptr) {
+                    throw pappso::PappsoException(QObject::tr("ERROR (p_protein_match == nullptr) %1").arg(str));
+                }
+
+                ProteinXtpSp sp_xtp_protein = protein.makeProteinXtpSp();
+                p_protein_match->setProteinXtpSp(_p_project->getProteinStore().getInstance(sp_xtp_protein));
+                p_protein_match->setChecked(true);
+
+                PeptideMatch peptide_match;
+                peptide_match.setStart(start);
+                peptide_match.setPeptideEvidenceSp(_p_identification_data_source->getPeptideEvidenceStore().getInstance(&peptide_evidence));
+
+                p_protein_match->addPeptideMatch(peptide_match);
+            }
+        }
+    }
+
+    qDebug() << "MascotDatParser::saveQuery end";
+}
diff --git a/src/input/mascot/mascotdatparser.h b/src/input/mascot/mascotdatparser.h
index 23f387754..d4ef5fd69 100644
--- a/src/input/mascot/mascotdatparser.h
+++ b/src/input/mascot/mascotdatparser.h
@@ -43,6 +43,9 @@ private:
     void parseProteinLine(const QString & protein_line);
     void parseHeaderLine(const QString & protein_line);
     void parsePeptidesLine(const QString & peptide_line);
+    void parseQueryLine(const QString & query_line);
+    void parseSummaryLine(const QString & summary_line);
+    void saveQuery();
     void saveAndClearPeptide();
     
     struct PeptideLine {
@@ -53,6 +56,17 @@ private:
         QStringList protein_string_list;
         std::vector<FastaFileSp> fasta_file_list;
     };
+    struct QueryLine {
+        unsigned int query_index=0;
+        unsigned int charge=0;
+        pappso::pappso_double rt=0;
+        QString title;
+    };
+    struct SummaryLine {
+        unsigned int match=0;
+        pappso::pappso_double exp_mass=0;
+        pappso::pappso_double plug_hole=0;
+    };
 private:
     Project * _p_project;
     IdentificationGroup * _p_identification_group;
@@ -67,11 +81,14 @@ private:
     QRegExp   _regexp_header_line;
     unsigned int _number_of_queries=0;
     unsigned int _number_of_residues=0;
+    unsigned int _current_query_index=0;
     QString _error_str;
     
     PeptideLine _current_peptide;
+    QueryLine _current_query;
     
     std::vector<std::vector<PeptideLine>> _query_peptide_results;
+    std::vector<SummaryLine> _summary_list;
     
     
 };
diff --git a/src/utils/types.h b/src/utils/types.h
index c228b07d8..26bc2fd44 100644
--- a/src/utils/types.h
+++ b/src/utils/types.h
@@ -54,7 +54,8 @@ enum class IdentificationEngine: std::int8_t {
  *
  */
 enum class PeptideEvidenceParam: std::int8_t {
-    tandem_hyperscore=0 ///< X!Tandem hyperscore
+    tandem_hyperscore=0, ///< X!Tandem hyperscore
+    mascot_macth_score=1 ///< MASCOT match score
 };
 
 /** \def IdentificationEngineParam identification engine parameters
-- 
GitLab