From a4ae5d554a09ef724be2702b876ff6da571529b8 Mon Sep 17 00:00:00 2001
From: Olivier Langella <olivier.langella@u-psud.fr>
Date: Mon, 5 Mar 2018 22:11:23 +0100
Subject: [PATCH] WIP: Mascot parser

---
 src/input/mascot/mascotdatparser.cpp | 306 ++++++++++++++++++++-------
 src/input/mascot/mascotdatparser.h   |  17 ++
 src/utils/types.h                    |   3 +-
 3 files changed, 261 insertions(+), 65 deletions(-)

diff --git a/src/input/mascot/mascotdatparser.cpp b/src/input/mascot/mascotdatparser.cpp
index 1ef92308c..fc06c24f2 100644
--- a/src/input/mascot/mascotdatparser.cpp
+++ b/src/input/mascot/mascotdatparser.cpp
@@ -33,6 +33,7 @@
 #include <pappsomspp/pappsoexception.h>
 #include "../../core/peptidextp.h"
 #include "../../core/proteinmatch.h"
+#include "../../core/peptideevidence.h"
 
 
 MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_identification_group, IdentificationDataSource * p_identification_data_source) {
@@ -63,11 +64,23 @@ void MascotDatParser::parse(QIODevice * in_stream) {
                 parseHeaderLine( mime_parser.getCurrentTextStream().readLine());
             }
         }
+        else if (mime_parser.getCurrentFileName() == "summary") {
+            while(!mime_parser.getCurrentTextStream().atEnd()) {
+                parseSummaryLine( mime_parser.getCurrentTextStream().readLine());
+            }
+        }
         else if (mime_parser.getCurrentFileName() == "peptides") {
             while(!mime_parser.getCurrentTextStream().atEnd()) {
                 parsePeptidesLine( mime_parser.getCurrentTextStream().readLine());
             }
         }
+        else if (mime_parser.getCurrentFileName().startsWith("query")) {
+            _current_query_index = mime_parser.getCurrentFileName().mid(5).toUInt();
+            while(!mime_parser.getCurrentTextStream().atEnd()) {
+                parseQueryLine( mime_parser.getCurrentTextStream().readLine());
+            }
+            saveQuery();
+        }
     }
 
     mime_parser.close();
@@ -122,6 +135,7 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
             qDebug() << "queries=" << header_list[2];
             _number_of_queries = header_list[2].toUInt();
             _query_peptide_results.resize(_number_of_queries);
+            _summary_list.resize(_number_of_queries);
         }
         //min_peaks_for_homology=6
         //max_hits=50
@@ -149,70 +163,6 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
     }
 }
 
-void MascotDatParser::saveAndClearPeptide() {
-    qDebug() << "MascotDatParser::saveAndClearPeptide begin";
-    if (_current_peptide.query_index > 0) {
-        // save
-        _query_peptide_results[_current_peptide.query_index-1].push_back(_current_peptide);
-        //parse and save
-
-        QString peptide_str =_current_peptide.peptide_string_list.at(4);
-        if (!_current_peptide.subst.isEmpty()) {
-            //q856_p9_subst=1,X,W
-            //q24379_p2_subst=1,B,D,8,B,D
-            QStringList subst_list = _current_peptide.subst.split(",");
-            for (unsigned int i=0; i < subst_list.size(); i+=3) {
-                peptide_str = peptide_str.replace(subst_list.at(0+i).toInt()-1,1,subst_list.at(2+i));
-            }
-        }
-        PeptideXtpSp peptide_sp;
-        peptide_sp = PeptideXtp(peptide_str).makePeptideXtpSp();
-        peptide_sp = _p_project->getPeptideStore().getInstance(peptide_sp);
-
-        if (_current_peptide.protein_string_list.size() != _current_peptide.fasta_file_list.size()) {
-            throw pappso::PappsoException(QObject::tr("ERROR (_current_peptide.protein_string_list.size() != _current_peptide.fasta_file_list.size()) %1").arg(_current_peptide.protein_string_list.join(",\"")));
-        }
-
-        foreach (const QString &str, _current_peptide.protein_string_list) {
-            //sp|O95006|OR2F2_HUMAN":0:299:303:1
-            int position = str.indexOf("\"", 0);
-            QString accession = str.mid(0, position);
-            qDebug() << "accession=" << accession;
-            QStringList position_list = str.mid(position+2).split(":");
-            if (position_list.size() != 4) {
-                throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(str));
-            }
-            unsigned int start = position_list.at(1).toUInt();
-            unsigned int stop = position_list.at(2).toUInt();
-
-            ProteinXtp protein;
-            protein.setAccession(accession);
-
-            ProteinMatch * p_protein_match = _p_identification_group->getProteinMatchInstance(protein.getAccession());
-            if (p_protein_match == nullptr) {
-                throw pappso::PappsoException(QObject::tr("ERROR (p_protein_match == nullptr) %1").arg(str));
-            }
-
-            ProteinXtpSp sp_xtp_protein = protein.makeProteinXtpSp();
-            p_protein_match->setProteinXtpSp(_p_project->getProteinStore().getInstance(sp_xtp_protein));
-            p_protein_match->setChecked(true);
-
-            PeptideMatch peptide_match;
-            peptide_match.setStart(start);
-            //peptide_match.setPeptideEvidenceSp();
-
-            p_protein_match->addPeptideMatch(peptide_match);
-        }
-    }
-
-    //new peptide query clear
-    _current_peptide.peptide_string_list.clear();
-    _current_peptide.fasta_file_list.clear();
-    _current_peptide.query_index = 0;
-    _current_peptide.subst = "";
-    qDebug() << "MascotDatParser::saveAndClearPeptide end";
-}
-
 void MascotDatParser::parsePeptidesLine(const QString & peptide_line) {
     try {
         if (_regexp_header_line.exactMatch(peptide_line)) {
@@ -304,3 +254,231 @@ void MascotDatParser::parsePeptidesLine(const QString & peptide_line) {
         throw pappso::PappsoException(_error_str);
     }
 }
+
+
+/** \brief parse one "key=value" line of a "queryN" section
+ *
+ * The title, rtinseconds, index and charge keys are stored in _current_query,
+ * any other line is ignored.
+ *
+ * \param query_line raw line read from the current query section
+ * \throws pappso::PappsoException wrapping any exception caught while parsing
+ */
+void MascotDatParser::parseQueryLine(const QString & query_line) {
+    try {
+        if (_regexp_header_line.exactMatch(query_line)) {
+            QStringList header_list = _regexp_header_line.capturedTexts();
+            QString index = header_list[1];
+            QString value = header_list[2];
+
+
+            //title=FULL%20ISSLGSVGAGIVAVKK%20N22213%20%20%20QEP1_SpikeIn_230914_1_3ng_270914%2e35282%2e35282%2e2
+            if (index == "title") {
+                _current_query.title = value;
+            }
+            //rtinseconds=5703.84
+            else if (index == "rtinseconds") {
+                _current_query.rt = value.toDouble();
+            }
+            //index=44035
+            else if (index == "index") {
+                _current_query.query_index = value.toUInt();
+            }
+            //charge=2+
+            else if (index == "charge") {
+                // the sign follows the digits ("2+"): QString::toUInt() fails on
+                // the whole string and returns 0, so drop the sign first
+                QString charge_digits = value.trimmed();
+                if (charge_digits.endsWith("+") || charge_digits.endsWith("-")) {
+                    charge_digits.chop(1);
+                }
+                _current_query.charge = charge_digits.toUInt();
+            }
+        }
+        /*
+        mass_min=129.102051
+        mass_max=1198.751099
+        int_min=2327
+        int_max=6.845e+005
+        num_vals=44
+        num_used1=-1
+        Ions1=129.102051:1.111e+005,275.207306:9.008e+004,374.275299:9.717e+004,514.825623:1.929e+004,599.878906:1.472e+005,714.431519:1.356e+004,815.466003:9643,924.570435:4428,1028.645630:1.257e+005,1085.665527:6.845e+005,1198.751099:1.858e+005,147.112656:7.954e+004,261.159271:4.327e+004,357.248169:2.192e+004,506.312775:3319,543.336548:1.083e+005,655.376465:9786,1011.625977:1.747e+004,1068.641602:9.804e+004,1181.725342:2.712e+004,130.086014:3.602e+004,257.197388:1.455e+004,357.212097:3960,591.370056:2.454e+004,972.547058:3819,1069.647583:4.457e+004,201.123077:1.566e+004,299.171783:1.421e+004,534.825562:1.036e+004,1087.671631:2.374e+004,173.128418:1.512e+004,244.129089:1.052e+004,590.873291:8065,1067.656006:2.235e+004,228.133957:1.35e+004,258.180115:3194,534.333374:3635,1096.635376:1.549e+004,200.139084:1.026e+004,535.328552:3006,1070.637085:4813,183.112442:9925,131.080994:4806,211.108139:2327
+
+        */
+    }
+    // catch by const reference: no copy and no slicing of the exception
+    catch (const pappso::PappsoException & exception_pappso) {
+        _error_str = QObject::tr("ERROR in MascotDatParser::parseQueryLine %1, PAPPSO exception:\n%2").arg(query_line).arg(exception_pappso.qwhat());
+        qDebug() << _error_str;
+        throw pappso::PappsoException(_error_str);
+    }
+    catch (const std::exception & exception_std) {
+        _error_str = QObject::tr("ERROR in MascotDatParser::parseQueryLine %1, std exception:\n%2").arg(query_line).arg(exception_std.what());
+        qDebug() << _error_str;
+        throw pappso::PappsoException(_error_str);
+    }
+}
+
+
+/** \brief parse one "key=value" line of the "summary" section
+ *
+ * Stores the qmassN, qmatchN and qplugholeN values in the _summary_list entry
+ * of query N (N starts at 1), any other line is ignored.
+ *
+ * \param summary_line raw line read from the summary section
+ * \throws pappso::PappsoException if N is 0, is not a number or is greater than
+ * the size of _summary_list
+ */
+void MascotDatParser::parseSummaryLine(const QString & summary_line) {
+
+    if (_regexp_header_line.exactMatch(summary_line)) {
+        QStringList header_list = _regexp_header_line.capturedTexts();
+        QString index = header_list[1];
+        QString value = header_list[2];
+
+        // checked access: an unchecked "query_index-1" wraps around when the
+        // number is 0 (or unparsable) and overruns when it exceeds the size
+        auto summary_of = [this, &summary_line](const QString & query_number) -> SummaryLine & {
+            unsigned int query_index = query_number.toUInt();
+            if ((query_index == 0) || (query_index > _summary_list.size())) {
+                throw pappso::PappsoException(QObject::tr("ERROR in MascotDatParser::parseSummaryLine: query number out of range in %1").arg(summary_line));
+            }
+            return _summary_list[query_index-1];
+        };
+
+        //qmass1=598.300206
+        if (index.startsWith("qmass")) {
+            summary_of(index.mid(5)).exp_mass=value.toDouble();
+        }
+        //qexp1=300.157379,2+
+        //qintensity1=2054822.6250
+        //qmatch1=73
+        else if (index.startsWith("qmatch")) {
+            summary_of(index.mid(6)).match=value.toUInt();
+        }
+        //qplughole1=14.820890
+        else if (index.startsWith("qplughole")) {
+            summary_of(index.mid(9)).plug_hole=value.toDouble();
+        }
+
+    }
+}
+
+
+/** \brief store _current_peptide in the slot of its query, then reset it
+ *
+ * Nothing is stored while _current_peptide.query_index is 0.
+ *
+ * \throws pappso::PappsoException if query_index is greater than the number
+ * of query slots
+ */
+void MascotDatParser::saveAndClearPeptide() {
+    qDebug() << "MascotDatParser::saveAndClearPeptide begin";
+    if (_current_peptide.query_index > 0) {
+        if (_current_peptide.query_index > _query_peptide_results.size()) {
+            throw pappso::PappsoException(QObject::tr("ERROR in MascotDatParser::saveAndClearPeptide: query index %1 out of range").arg(_current_peptide.query_index));
+        }
+        // save
+        _query_peptide_results[_current_peptide.query_index-1].push_back(_current_peptide);
+    }
+
+    //new peptide query clear
+    // NOTE(review): protein_string_list is not cleared here, confirm it is reassigned for each peptide
+    _current_peptide.peptide_string_list.clear();
+    _current_peptide.fasta_file_list.clear();
+    _current_peptide.query_index = 0;
+    _current_peptide.subst = "";
+    qDebug() << "MascotDatParser::saveAndClearPeptide end";
+}
+
+
+/** \brief build the peptide evidence of the query section just parsed
+ *
+ * Combines _current_query with the summary entry and the peptide lines of
+ * query number _current_query_index, adds one PeptideMatch for each protein
+ * reference of each peptide line, then resets _current_query.
+ *
+ * \throws pappso::PappsoException if the query number is out of range or the
+ * peptide/protein data is malformed
+ */
+void MascotDatParser::saveQuery() {
+    qDebug() << "MascotDatParser::saveQuery begin";
+    // NOTE(review): the per query vectors are filled with the N of "qN_..." and
+    // "qmassN", which is what the "queryN" section name gives in
+    // _current_query_index; the "index=" value (_current_query.query_index)
+    // was used here before and looks like another numbering, please confirm
+    if (_current_query_index > 0) {
+        if ((_current_query_index > _query_peptide_results.size()) || (_current_query_index > _summary_list.size())) {
+            throw pappso::PappsoException(QObject::tr("ERROR in MascotDatParser::saveQuery: query %1 out of range").arg(_current_query_index));
+        }
+        const std::vector<PeptideLine> & peptide_list = _query_peptide_results[_current_query_index-1];
+        const SummaryLine & summary = _summary_list[_current_query_index-1];
+
+        PeptideEvidence peptide_evidence(_p_identification_data_source->getMsRunSp().get(),_current_query.query_index);
+        peptide_evidence.setCharge(_current_query.charge);
+        peptide_evidence.setChecked(true);
+        peptide_evidence.setExperimentalMass(summary.exp_mass);
+        peptide_evidence.setRetentionTime(_current_query.rt);
+
+        peptide_evidence.setParam(PeptideEvidenceParam::mascot_macth_score, QVariant(summary.match));
+
+        peptide_evidence.setIdentificationDataSource( _p_identification_data_source);
+        //parse and save
+        for (const PeptideLine & peptide_line : peptide_list) {
+            if (peptide_line.peptide_string_list.size() < 5) {
+                throw pappso::PappsoException(QObject::tr("ERROR in MascotDatParser::saveQuery: truncated peptide line in query %1").arg(_current_query_index));
+            }
+
+            QString peptide_str = peptide_line.peptide_string_list.at(4);
+            if (!peptide_line.subst.isEmpty()) {
+                //q856_p9_subst=1,X,W
+                //q24379_p2_subst=1,B,D,8,B,D
+                QStringList subst_list = peptide_line.subst.split(",");
+                // "i+2 <" keeps at(2+i) in range when the list is truncated
+                for (int i=0; i+2 < subst_list.size(); i+=3) {
+                    peptide_str.replace(subst_list.at(i).toInt()-1,1,subst_list.at(2+i));
+                }
+            }
+            PeptideXtpSp peptide_sp = PeptideXtp(peptide_str).makePeptideXtpSp();
+            peptide_sp = _p_project->getPeptideStore().getInstance(peptide_sp);
+
+            if (peptide_line.protein_string_list.size() != peptide_line.fasta_file_list.size()) {
+                throw pappso::PappsoException(QObject::tr("ERROR (peptide_line.protein_string_list.size() != peptide_line.fasta_file_list.size()) %1").arg(peptide_line.protein_string_list.join(",\"")));
+            }
+
+            foreach (const QString &str, peptide_line.protein_string_list) {
+                //sp|O95006|OR2F2_HUMAN":0:299:303:1
+                int position = str.indexOf("\"", 0);
+                QString accession = str.mid(0, position);
+                qDebug() << "accession=" << accession;
+                QStringList position_list = str.mid(position+2).split(":");
+                if (position_list.size() != 4) {
+                    throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(str));
+                }
+                unsigned int start = position_list.at(1).toUInt();
+
+                ProteinXtp protein;
+                protein.setAccession(accession);
+
+                ProteinMatch * p_protein_match = _p_identification_group->getProteinMatchInstance(protein.getAccession());
+                if (p_protein_match == nullptr) {
+                    throw pappso::PappsoException(QObject::tr("ERROR (p_protein_match == nullptr) %1").arg(str));
+                }
+
+                ProteinXtpSp sp_xtp_protein = protein.makeProteinXtpSp();
+                p_protein_match->setProteinXtpSp(_p_project->getProteinStore().getInstance(sp_xtp_protein));
+                p_protein_match->setChecked(true);
+
+                PeptideMatch peptide_match;
+                peptide_match.setStart(start);
+                peptide_match.setPeptideEvidenceSp(_p_identification_data_source->getPeptideEvidenceStore().getInstance(&peptide_evidence));
+
+                p_protein_match->addPeptideMatch(peptide_match);
+            }
+        }
+    }
+
+    // start the next query section from a clean state
+    _current_query = QueryLine();
+    qDebug() << "MascotDatParser::saveQuery end";
+}
diff --git a/src/input/mascot/mascotdatparser.h b/src/input/mascot/mascotdatparser.h
index 23f387754..d4ef5fd69 100644
--- a/src/input/mascot/mascotdatparser.h
+++ b/src/input/mascot/mascotdatparser.h
@@ -43,6 +43,9 @@ private:
     void parseProteinLine(const QString & protein_line);
     void parseHeaderLine(const QString & protein_line);
     void parsePeptidesLine(const QString & peptide_line);
+    void parseQueryLine(const QString & query_line);
+    void parseSummaryLine(const QString & summary_line);
+    void saveQuery();
     void saveAndClearPeptide();
 
     struct PeptideLine {
@@ -53,6 +56,17 @@ private:
         QStringList protein_string_list;
         std::vector<FastaFileSp> fasta_file_list;
     };
+    struct QueryLine {
+        unsigned int query_index=0;
+        unsigned int charge=0;
+        pappso::pappso_double rt=0;
+        QString title;
+    };
+    struct SummaryLine {
+        unsigned int match=0;
+        pappso::pappso_double exp_mass=0;
+        pappso::pappso_double plug_hole=0;
+    };
 private:
     Project * _p_project;
     IdentificationGroup * _p_identification_group;
@@ -67,11 +81,14 @@ private:
     QRegExp _regexp_header_line;
     unsigned int _number_of_queries=0;
     unsigned int _number_of_residues=0;
+    unsigned int _current_query_index=0;
 
     QString _error_str;
 
     PeptideLine _current_peptide;
+    QueryLine _current_query;
     std::vector<std::vector<PeptideLine>> _query_peptide_results;
+    std::vector<SummaryLine> _summary_list;
 
 
 };
diff --git a/src/utils/types.h b/src/utils/types.h
index c228b07d8..26bc2fd44 100644
--- a/src/utils/types.h
+++ b/src/utils/types.h
@@ -54,7 +54,8 @@ enum class IdentificationEngine: std::int8_t {
  *
  */
 enum class PeptideEvidenceParam: std::int8_t {
-    tandem_hyperscore=0 ///< X!Tandem hyperscore
+    tandem_hyperscore=0, ///< X!Tandem hyperscore
+    mascot_macth_score=1 ///< MASCOT match score
 };
 
 /** \def IdentificationEngineParam identification engine parameters
-- 
GitLab