/**
 * \file /input/mascot/mascotdatparser.h
 * \date 17/2/2018
 * \author Olivier Langella
 * \brief MASCOT dat file parser
 */

/*******************************************************************************
 * Copyright (c) 2018 Olivier Langella <olivier.langella@u-psud.fr>.
 *
 * This file is part of XTPcpp.
 *
 *     XTPcpp is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     XTPcpp is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 *
 *     You should have received a copy of the GNU General Public License
 *     along with XTPcpp.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contributors:
 *     Olivier Langella <olivier.langella@u-psud.fr> - initial API and
 *implementation
 ******************************************************************************/

#include "mascotdatparser.h"
#include "mimeparser.h"
#include <cmath>
#include <QDebug>
#include <pappsomspp/pappsoexception.h>
#include "../../core/peptidextp.h"
#include "../../core/proteinmatch.h"
#include "../../core/peptideevidence.h"

/**
 * \brief Builds a parser that stores its results in the given project objects.
 *
 * \param p_project project whose protein, peptide and FASTA stores are used
 * \param p_identification_group group that receives the protein matches
 * \param p_identification_data_source data source describing this result file
 */
MascotDatParser::MascotDatParser(
  Project *p_project,
  IdentificationGroup *p_identification_group,
  IdentificationDataSource *p_identification_data_source)
{
  _p_project                    = p_project;
  _p_identification_group       = p_identification_group;
  _p_identification_data_source = p_identification_data_source;

  // parseSummaryLine() reads this flag for the plain "summary" section, i.e.
  // before any decoy section had a chance to set it.
  _is_decoy_section = false;

  _regexp_header_line.setPattern("^([A-Z,a-z,0-9,_]+)=(.*)$");
  _regexp_parse_scan.setPattern(".*scan=([0-9]+).*");
}

MascotDatParser::~MascotDatParser()
{
}

/**
 * \brief Reads a MASCOT dat stream, one MIME part after the other.
 *
 * Every part is dispatched on its file name to the matching line parser.
 * Peptide sections are flushed with saveAndClearPeptide() when they end and
 * every "queryN" section is closed with saveQuery().
 *
 * \param in_stream device containing the MIME encoded dat file
 */
void
MascotDatParser::parse(QIODevice *in_stream)
{
  qDebug() << "MascotDatParser::parse begin";
  MimeParser mime_parser(in_stream);
  mime_parser.open();

  // Feeds every remaining line of the current MIME part to one line parser.
  using LineHandler  = void (MascotDatParser::*)(const QString &);
  auto parse_section = [this, &mime_parser](LineHandler handler) {
    while(!mime_parser.getCurrentTextStream().atEnd())
    {
      (this->*handler)(mime_parser.getCurrentTextStream().readLine());
    }
  };

  for(bool more = mime_parser.goToFirstFile(); more;
      more      = mime_parser.goToNextFile())
  {
    const QString file_name = mime_parser.getCurrentFileName();
    qDebug() << "MascotDatParser::parse mimetype="
             << mime_parser.getCurrentMimeType() << " filename=" << file_name;

    if(file_name == "proteins")
    {
      parse_section(&MascotDatParser::parseProteinLine);
    }
    else if(file_name == "parameters")
    {
      parse_section(&MascotDatParser::parseParametersLine);
    }
    else if(file_name == "header")
    {
      parse_section(&MascotDatParser::parseHeaderLine);
    }
    else if(file_name == "masses")
    {
      parse_section(&MascotDatParser::parseMassesLine);
    }
    else if(file_name == "summary")
    {
      parse_section(&MascotDatParser::parseSummaryLine);
    }
    else if(file_name == "peptides")
    {
      parse_section(&MascotDatParser::parsePeptidesLine);
      saveAndClearPeptide();
    }
    else if(file_name == "decoy_summary")
    {
      _is_decoy_section = true;
      parse_section(&MascotDatParser::parseSummaryLine);
      _is_decoy_section = false;
    }
    else if(file_name == "decoy_peptides")
    {
      _is_decoy_section = true;
      parse_section(&MascotDatParser::parsePeptidesLine);
      saveAndClearPeptide();
      _is_decoy_section = false;
    }
    else if(file_name.startsWith("query"))
    {
      _current_query.query_index = file_name.mid(5).toULong();
      parse_section(&MascotDatParser::parseQueryLine);
      saveQuery();
    }
  }
  mime_parser.close();
  qDebug() << "MascotDatParser::parse end";
}

/**
 * \brief Parses one line of the "masses" section.
 *
 * Only "deltaN", "FixedModN" and "FixedModResiduesN" keys are interpreted.
 * Keys whose numeric suffix cannot be parsed (for instance a key that merely
 * starts with "FixedMod") are ignored instead of indexing the modification
 * lists out of bounds.
 *
 * \param masses_line raw text line
 */
void
MascotDatParser::parseMassesLine(const QString &masses_line)
{
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
           << masses_line;
  if(_regexp_header_line.exactMatch(masses_line))
  {
    const QStringList header_list = _regexp_header_line.capturedTexts();
    const QString key             = header_list[1];
    const QString value           = header_list[2];

    // C_term=17.002740
    // N_term=1.007825
    // delta1=15.994915,Oxidation (M)
    if(key.startsWith("delta"))
    {
      bool index_ok        = false;
      const unsigned index = key.mid(5).toUInt(&index_ok);
      if(index_ok && (index > 0))
      {
        // grow the list up to the announced 1-based index
        while(static_cast<std::size_t>(_delta_modification_list.size()) <
              index)
        {
          _delta_modification_list.push_back(MascotModification());
        }
        const QStringList delta_mod_list = value.split(",");
        const pappso::pappso_double mass = delta_mod_list.at(0).toDouble();
        // value() yields an empty name when the line carries a mass only
        if(delta_mod_list.value(1) == "Oxidation (M)")
        {
          _delta_modification_list[index - 1].modification =
            pappso::AaModification::getInstance("MOD:00719");
          _delta_modification_list[index - 1].residue = 'M';
        }
        else
        {
          _delta_modification_list[index - 1].modification =
            pappso::AaModification::getInstanceCustomizedMod(mass);
        }
      }
      else
      {
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__
                 << " ignoring masses line " << masses_line;
      }
    }
    // FixedModResidues1=C
    // (tested before "FixedMod" because it shares that prefix)
    else if(key.startsWith("FixedModResidues"))
    {
      bool index_ok        = false;
      const unsigned index = key.mid(16).toUInt(&index_ok);
      qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
               << index;
      if(index_ok && (index > 0) &&
         (index <=
          static_cast<std::size_t>(_fixed_modification_list.size())) &&
         !value.isEmpty())
      {
        _fixed_modification_list[index - 1].residue = value.at(0);
      }
      else
      {
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__
                 << " ignoring masses line " << masses_line;
      }
    }
    // NeutralLoss1=0.000000
    // NeutralLoss1_master=63.998285
    // FixedMod1=57.021464,Carbamidomethyl (C)
    else if(key.startsWith("FixedMod"))
    {
      bool index_ok        = false;
      const unsigned index = key.mid(8).toUInt(&index_ok);
      if(index_ok && (index > 0))
      {
        while(static_cast<std::size_t>(_fixed_modification_list.size()) <
              index)
        {
          _fixed_modification_list.push_back(MascotModification());
        }
        const QStringList fixed_mod_list = value.split(",");
        const pappso::pappso_double mass = fixed_mod_list.at(0).toDouble();
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
                 << value;
        if(fixed_mod_list.value(1) == "Carbamidomethyl (C)")
        {
          _fixed_modification_list[index - 1].modification =
            pappso::AaModification::getInstance("MOD:00397");
          _fixed_modification_list[index - 1].residue = 'C';
        }
        else
        {
          _fixed_modification_list[index - 1].modification =
            pappso::AaModification::getInstanceCustomizedMod(mass);
        }
      }
      else
      {
        // e.g. a "FixedMod..." key without a plain numeric suffix
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__
                 << " ignoring masses line " << masses_line;
      }
    }
  }
  else
  {
    qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__
             << " QREGEXP does not work on " << masses_line;
  }
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
           << _fixed_modification_list.size();
}

/**
 * \brief Parses one line of the "proteins" section and registers the protein
 * in the project protein store.
 *
 * Two layouts are recognised: one prefixed with the FASTA file number
 * (NN::"accession"=mass,"description") and one without prefix, which is
 * attached to the first FASTA file.
 *
 * \param protein_line raw text line
 * \throws pappso::PappsoException if the line refers to a FASTA file that was
 * not declared in the header section
 */
void
MascotDatParser::parseProteinLine(const QString &protein_line)
{
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
           << protein_line;

  // 02::"tr|A0A0D9SF80|A0A0D9SF80_HUMAN"=55120.88,"General transcription factor
  // II-I repeat domain-containing protein 2A OS=Homo sapiens GN=GTF2IRD2B PE=4
  // SV=1"
  QRegExp regexp_protein("^(.*)::\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
  QRegExp regexp_proteinb("^\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");

  std::size_t fasta_number = 0; // 1-based, as written in the dat file
  QString accession;
  QString description;
  if(regexp_protein.exactMatch(protein_line))
  {
    const QStringList protein_list = regexp_protein.capturedTexts();
    fasta_number                   = protein_list[1].toUInt();
    accession                      = protein_list[2];
    description                    = protein_list[4];
  }
  else if(regexp_proteinb.exactMatch(protein_line))
  {
    const QStringList protein_list = regexp_proteinb.capturedTexts();
    fasta_number                   = 1;
    accession                      = protein_list[1];
    description                    = protein_list[3];
  }
  else
  {
    qDebug() << "MascotDatParser::parseProteinLine error " << protein_line;
    return;
  }

  if((fasta_number == 0) ||
     (fasta_number > static_cast<std::size_t>(_fasta_file_list.size())))
  {
    throw pappso::PappsoException(
      QObject::tr("ERROR in MascotDatParser::parseProteinLine : no FASTA "
                  "file number %1 for protein line %2")
        .arg(fasta_number)
        .arg(protein_line));
  }

  FastaFileSp fasta_file_sp = _fasta_file_list[fasta_number - 1];
  _current_protein.setAccession(accession);
  _current_protein.setDescription(description);
  _current_protein.setFastaFileP(fasta_file_sp.get());
  // the returned instance is not needed here, the call registers the protein
  _p_project->getProteinStore().getInstance(
    _current_protein.makeProteinXtpSp());
}

/**
 * \brief Parses one line of the "header" section.
 *
 * Reads the total residue count, the number of queries (which sizes the
 * per-query result vectors), the engine version and the FASTA file list.
 *
 * \param header_line raw text line
 */
void
MascotDatParser::parseHeaderLine(const QString &header_line)
{
  if(_regexp_header_line.exactMatch(header_line))
  {
    const QStringList header_list = _regexp_header_line.capturedTexts();
    const QString key             = header_list[1];
    const QString value           = header_list[2];

    // sequences=73998
    // sequences_after_tax=73998
    // residues=24900901
    // Exact match: "residues1", "residues2"... are per database counts and
    // must not overwrite the total.
    if(key == "residues")
    {
      qDebug() << "residues=" << value;
      _number_of_residues = value.toUInt();
    }
    // distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
    // decoy_type=1
    // distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
    // exec_time=484
    // date=1517587671
    // time=17:07:51
    // queries=54084
    else if(key.startsWith("queries"))
    {
      qDebug() << "queries=" << value;
      _number_of_queries = value.toUInt();
      _query_peptide_results.resize(_number_of_queries);
      _summary_list.resize(_number_of_queries);
      _decoy_query_peptide_results.resize(_number_of_queries);
      _decoy_summary_list.resize(_number_of_queries);
    }
    // min_peaks_for_homology=6
    // max_hits=50
    // version=2.5.0
    else if(key == "version")
    {
      _p_identification_data_source->setIdentificationEngineVersion(value);
    }
    // fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
    else if(key.startsWith("fastafile"))
    {
      qDebug() << "fastafile=" << value;
      _fasta_file_list.push_back(
        _p_project->getFastaFileStore().getInstance(FastaFile(value)));
      _p_identification_data_source->addFastaFile(_fasta_file_list.back());
    }
    // release=ECOLI_INRA_1.fasta
    // sequences1=4305
    // sequences_after_tax1=4305
    // residues1=1356026
    // fastafile2=C:/inetpub/mascot/sequence/HUMAN_INRA/current/HUMAN_INRA_1.fasta
    // release2=HUMAN_INRA_1.fasta
    // sequences2=69693
    // sequences_after_tax2=69693
    // residues2=23544875
    // taskid=151758718701
  }
}

/**
 * \brief Parses one line of the "parameters" section.
 *
 * Only the FILE key is used: it gives the MS run file name and, from its base
 * name, the sample name.
 *
 * \param header_line raw text line
 */
void
MascotDatParser::parseParametersLine(const QString &header_line)
{
  if(_regexp_header_line.exactMatch(header_line))
  {
    const QStringList header_list = _regexp_header_line.capturedTexts();
    /*
     * LICENSE=licence du logiciel
     MP=
     NM=
     COM=WP4 Batch4 inj2 paramSTR QEKAC160601_02.raw.mgf
     IATOL= IA2TOL= IASTOL= IBTOL= IB2TOL= IBSTOL= IYTOL= IY2TOL= IYSTOL=
     SEG= SEGT= SEGTU= LTOL=
     TOL=5
     TOLU=ppm
     ITH=
     ITOL=70
     ITOLU=mmu
     PFA=1
     DB=Contaminants_WP4_D
     DB2=S_cerevisiae_D
     DB3=UPS1UPS2_D
     MODS=Carbamidomethyl (C)
     MASS=Monoisotopic
     CLE=Trypsin/P
     FILE=F:\MSData\Batch4 Qex+TOUL Injection2\QEKAC160601_02.raw.mgf
     PEAK= QUE= TWO=
     SEARCH=MIS
     USERNAME=AMH
     USEREMAIL=
     CHARGE=2+ and 3+
     INTERMEDIATE=
     REPORT= AUTO
     OVERVIEW=
     FORMAT=Mascot generic
     FORMVER=1.01
     FRAG=
     IT_MODS=Acetyl (Protein N-term),Oxidation (M)
     USER00= USER01= USER02= USER03= USER04= USER05= USER06= USER07= USER08=
     USER09= USER10= USER11= USER12=
     PRECURSOR=
     TAXONOMY=All entries
     ACCESSION= REPTYPE= SUBCLUSTER= ICAT=
     INSTRUMENT=ESI FTMS HCD
     ERRORTOLERANT= FRAMES= CUTOUT=
     USERID=0
     QUANTITATION= DECOY= PEP_ISOTOPE_ERROR= MULTI_SITE_MODS= DATAURL=
     RULES=1,2,4,5,6,7,8,9,10,13,14,15,17,18
     INTERNALS=0.0,700.0
     */
    if(header_list[1] == "FILE")
    {
      // FILE=F:\MSData\Batch4 Qex+TOUL Injection2\QEKAC160601_02.raw.mgf
      QString file_path = header_list[2];
      _p_identification_data_source->getMsRunSp().get()->setFilename(
        file_path);
      QFileInfo fileinfo(file_path);
      if(fileinfo.fileName() == file_path)
      {
        // no directory part was recognised: the path probably uses Windows
        // separators, retry with forward slashes
        fileinfo.setFile(file_path.replace("\\", "/"));
      }
      _p_identification_data_source->getMsRunSp().get()->setSampleName(
        fileinfo.baseName());
    }
  }
}

/**
 * \brief Parses one line of the "peptides" or "decoy_peptides" section.
 *
 * Lines are keyed qN_pM[_suffix]. All the lines sharing the same (N, M) pair
 * are accumulated in _current_peptide, which is flushed with
 * saveAndClearPeptide() as soon as the pair changes.
 *
 * \param peptide_line raw text line
 * \throws pappso::PappsoException wrapping any error raised while parsing
 */
void
MascotDatParser::parsePeptidesLine(const QString &peptide_line)
{
  try
  {
    if(_regexp_header_line.exactMatch(peptide_line))
    {
      const QStringList header_list = _regexp_header_line.capturedTexts();
      const QString index           = header_list[1];
      QString value                 = header_list[2];
      const QStringList index_list  = index.split("_");
      if(index_list.size() < 2)
      {
        // not a qN_pM key
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__
                 << " ignoring peptides line " << peptide_line;
        return;
      }
      const unsigned int query_index_number =
        index_list[0].mid(1).toUInt();
      const std::size_t current_peptide_index =
        index_list[1].mid(1).toUInt();

      // Flush whenever the (query, peptide) pair changes. This must also be
      // done for "qN_pM=-1" lines, otherwise the pending peptide would later
      // be saved under the index of that empty query.
      if((query_index_number != _current_peptide.query_index) ||
         (current_peptide_index != _current_peptide.peptide_index))
      {
        saveAndClearPeptide();
      }
      _current_peptide.query_index   = query_index_number;
      _current_peptide.peptide_index = current_peptide_index;

      if(index_list.size() == 4)
      {
        if((index_list[2] == "primary") && (index_list[3] == "nl"))
        {
          // q24878_p3_primary_nl=000000020000000
          // throw pappso::PappsoException(
          //  QObject::tr("primary_nl is not taken into account"));
        }
      }
      else if(index_list.size() == 3)
      {
        if(index_list[2] == "db")
        {
          // q1_p1_db=02 : two digits per matched protein
          while(value.size() > 0)
          {
            const QString fasta_str = value.mid(0, 2);
            const int fasta_number  = fasta_str.toInt();
            if((fasta_number < 1) ||
               (static_cast<std::size_t>(fasta_number) >
                static_cast<std::size_t>(_fasta_file_list.size())))
            {
              throw pappso::PappsoException(
                QObject::tr("no FASTA file number %1").arg(fasta_str));
            }
            _current_peptide.fasta_file_list.push_back(
              _fasta_file_list[fasta_number - 1]);
            value = value.mid(2);
          }
        }
        // q856_p9_subst=1,X,W
        else if(index_list[2] == "subst")
        {
          _current_peptide.subst = value;
        }
      }
      else if(index_list.size() == 2)
      {
        if(value == "-1")
        {
          // no result for this query : nothing has to be saved for it
          _current_peptide.query_index = 0;
        }
        else
        {
          // q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
          const int position           = value.indexOf(";\"", 0);
          const QString peptide_string = value.mid(0, position);
          _current_peptide.peptide_string_list = peptide_string.split(",");
          const QString protein_string = value.mid(position + 2);
          //"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
          _current_peptide.protein_string_list =
            protein_string.split(",\"");
        }
      }
      /*
      q1_p1_db=02
      q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
      q1_p1_terms=K,L
      q1_p2_db=02
      q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1
      q1_p2_terms=R,-
      q2_p1_db=02
      q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2
      q2_p1_terms=K,K
      q2_p2_db=0202
      q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1
      q2_p2_terms=R,V:R,V
      q2_p3_db=02
      q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2
      q2_p3_terms=K,K
      q2_p4_db=0202
      q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
      q2_p4_terms=K,F:K,F
      */
      /*
      q856_p9_db=0202
      q856_p9=0,685.427521,-0.000117,3,XLLVR,12,0000000,13.68,0000002000000000000,0,0;"tr|V9GY00|V9GY00_HUMAN":0:1:5:1,"tr|H7C3C3|H7C3C3_HUMAN":0:1:5:1
      q856_p9_terms=-,L:-,V
      q856_p9_subst=1,X,W
      */
    }
  }
  // caught by reference so that the dynamic type (and message) is kept
  catch(pappso::PappsoException &exception_pappso)
  {
    _error_str = QObject::tr(
                   "ERROR in MascotDatParser::parsePeptidesLine "
                   "%1, PAPPSO exception:\n%2 near q%3_p%4")
                   .arg(peptide_line)
                   .arg(exception_pappso.qwhat())
                   .arg(_current_peptide.query_index)
                   .arg(_current_peptide.peptide_index);
    qDebug() << _error_str;
    throw pappso::PappsoException(_error_str);
  }
  catch(const std::exception &exception_std)
  {
    _error_str = QObject::tr(
                   "ERROR in MascotDatParser::parsePeptidesLine %1, std "
                   "exception:\n%2 near q%3_p%4")
                   .arg(peptide_line)
                   .arg(exception_std.what())
                   .arg(_current_peptide.query_index)
                   .arg(_current_peptide.peptide_index);
    qDebug() << _error_str;
    throw pappso::PappsoException(_error_str);
  }
}

/**
 * \brief Parses one line of a "queryN" section into _current_query.
 *
 * Handles the title (from which a scan number is extracted when it contains
 * "scan=N"), the retention time, the spectrum index and the charge.
 *
 * \param query_line raw text line
 * \throws pappso::PappsoException wrapping any error raised while parsing
 */
void
MascotDatParser::parseQueryLine(const QString &query_line)
{
  try
  {
    if(_regexp_header_line.exactMatch(query_line))
    {
      const QStringList header_list = _regexp_header_line.capturedTexts();
      const QString index           = header_list[1];
      const QString value           = header_list[2];
      // title=FULL%20ISSLGSVGAGIVAVKK%20N22213%20%20%20QEP1_SpikeIn_230914_1_3ng_270914%2e35282%2e35282%2e2
      // title=controllerType%3d0%20controllerNumber%3d1%20scan%3d43355
      if(index == "title")
      {
        _current_query.parsed_scan_number = 0;
        _current_query.title = QUrl::fromPercentEncoding(value.toLatin1());
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
                 << _current_query.title;
        if(_regexp_parse_scan.exactMatch(_current_query.title))
        {
          _current_query.parsed_scan_number =
            _regexp_parse_scan.capturedTexts()[1].toULong();
        }
        qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
                 << _current_query.parsed_scan_number;
      }
      // rtinseconds=5703.84
      else if(index == "rtinseconds")
      {
        _current_query.rt = value.toDouble();
      }
      // index=44035
      else if(index == "index") // it is not the query index
      {
        _current_query.index = value.toUInt();
        if(_current_query.parsed_scan_number == 0)
        {
          // no scan number found in the title : fall back on the index
          _current_query.parsed_scan_number = _current_query.index;
        }
      }
      // charge=2+
      else if(index == "charge")
      {
        // drop the trailing sign character
        _current_query.charge = value.mid(0, value.size() - 1).toUInt();
      }
    }
    /*
    mass_min=129.102051
    mass_max=1198.751099
    int_min=2327
    int_max=6.845e+005
    num_vals=44
    num_used1=-1
    Ions1=129.102051:1.111e+005,275.207306:9.008e+004,374.275299:9.717e+004,...
    */
  }
  catch(pappso::PappsoException &exception_pappso)
  {
    _error_str = QObject::tr(
                   "ERROR in MascotDatParser::parseQueryLine %1, PAPPSO "
                   "exception:\n%2")
                   .arg(query_line)
                   .arg(exception_pappso.qwhat());
    qDebug() << _error_str;
    throw pappso::PappsoException(_error_str);
  }
  catch(const std::exception &exception_std)
  {
    _error_str =
      QObject::tr(
        "ERROR in MascotDatParser::parseQueryLine %1, std exception:\n%2")
        .arg(query_line)
        .arg(exception_std.what());
    qDebug() << _error_str;
    throw pappso::PappsoException(_error_str);
  }
}

/**
 * \brief Parses one line of the "summary" or "decoy_summary" section.
 *
 * Stores qmassN, qmatchN and qplugholeN in the summary entry of query N (the
 * decoy list is used while _is_decoy_section is set).
 *
 * \param summary_line raw text line
 * \throws pappso::PappsoException if N is outside the number of queries
 * announced in the header section
 */
void
MascotDatParser::parseSummaryLine(const QString &summary_line)
{
  std::vector<SummaryLine> *p_summary_list = &_summary_list;
  if(_is_decoy_section)
  {
    p_summary_list = &_decoy_summary_list;
  }

  if(!_regexp_header_line.exactMatch(summary_line))
  {
    return;
  }
  const QStringList header_list = _regexp_header_line.capturedTexts();
  const QString index           = header_list[1];
  const QString value           = header_list[2];

  // Returns the summary entry designated by the numeric suffix of the key, or
  // nullptr when that suffix is not a query number.
  auto find_summary = [&](int prefix_length) -> SummaryLine * {
    bool index_ok                  = false;
    const unsigned int query_index = index.mid(prefix_length).toUInt(&index_ok);
    if(!index_ok || (query_index == 0))
    {
      qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__
               << " ignoring summary line " << summary_line;
      return nullptr;
    }
    if(query_index > p_summary_list->size())
    {
      throw pappso::PappsoException(
        QObject::tr("ERROR in MascotDatParser::parseSummaryLine %1 : query "
                    "index %2 is greater than the number of queries (%3)")
          .arg(summary_line)
          .arg(query_index)
          .arg(p_summary_list->size()));
    }
    return &(*p_summary_list)[query_index - 1];
  };

  // qmass1=598.300206
  if(index.startsWith("qmass"))
  {
    qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
             << index;
    if(SummaryLine *p_summary = find_summary(5))
    {
      p_summary->exp_mass = value.toDouble();
    }
  }
  // qexp1=300.157379,2+
  // qintensity1=2054822.6250
  // qmatch1=73
  else if(index.startsWith("qmatch"))
  {
    if(SummaryLine *p_summary = find_summary(6))
    {
      p_summary->match = value.toDouble();
    }
  }
  // qplughole1=14.820890
  else if(index.startsWith("qplughole"))
  {
    if(SummaryLine *p_summary = find_summary(9))
    {
      p_summary->plug_hole = value.toDouble();
    }
  }
}

/**
 * \brief Stores the peptide being accumulated under its query and resets the
 * accumulator.
 *
 * Nothing is stored when no query is attached to the accumulator
 * (query_index == 0).
 *
 * \throws pappso::PappsoException if the query index is outside the number of
 * queries announced in the header section
 */
void
MascotDatParser::saveAndClearPeptide()
{
  if(_current_peptide.query_index > 0)
  {
    auto &target_results =
      _is_decoy_section ? _decoy_query_peptide_results : _query_peptide_results;
    if(static_cast<std::size_t>(_current_peptide.query_index) >
       static_cast<std::size_t>(target_results.size()))
    {
      throw pappso::PappsoException(
        QObject::tr("ERROR in MascotDatParser::saveAndClearPeptide : query "
                    "index %1 is greater than the number of queries (%2)")
          .arg(_current_peptide.query_index)
          .arg(target_results.size()));
    }
    target_results[_current_peptide.query_index - 1].push_back(
      _current_peptide);
  }

  // new peptide query clear
  _current_peptide.peptide_string_list.clear();
  _current_peptide.protein_string_list.clear();
  _current_peptide.fasta_file_list.clear();
  _current_peptide.query_index = 0;
  _current_peptide.subst       = "";
}

/**
 * \brief Converts the target and decoy peptides collected for the current
 * query into peptide evidences.
 *
 * \throws pappso::PappsoException if the query index is out of range or if
 * savePeptideList() fails
 */
void
MascotDatParser::saveQuery()
{
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__;
  if(_current_query.query_index > 0)
  {
    try
    {
      const std::size_t query_position = _current_query.query_index - 1;
      if((query_position >=
          static_cast<std::size_t>(_query_peptide_results.size())) ||
         (query_position >=
          static_cast<std::size_t>(_decoy_query_peptide_results.size())))
      {
        throw pappso::PappsoException(
          QObject::tr("query index is greater than the number of queries "
                      "(%1)")
            .arg(_query_peptide_results.size()));
      }
      // the stored vectors are used in place, no copy is needed
      savePeptideList(_query_peptide_results[query_position], false);
      savePeptideList(_decoy_query_peptide_results[query_position], true);
    }
    catch(pappso::PappsoException &error)
    {
      throw pappso::PappsoException(
        QObject::tr("ERROR saving MASCOT query%1 :\n %2")
          .arg(_current_query.query_index)
          .arg(error.qwhat()));
    }
  }
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__;
}

/**
 * \brief Builds peptide evidences and protein matches from parsed peptide
 * lines of the current query.
 *
 * \param peptide_list peptide lines collected for the current query
 * \param is_decoy true when the lines come from the decoy search; accessions
 * are then suffixed with "|reversed" and proteins are flagged as decoy
 * \throws pappso::PappsoException wrapping any parsing or storing error
 */
void
MascotDatParser::savePeptideList(std::vector<PeptideLine> &peptide_list,
                                 bool is_decoy)
{
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
           << _current_query.query_index;
  try
  {
    // settings shared by every peptide evidence of this query
    PeptideEvidence peptide_evidence_generic(
      _p_identification_data_source->getMsRunSp().get(),
      _current_query.parsed_scan_number);
    peptide_evidence_generic.setCharge(_current_query.charge);
    peptide_evidence_generic.setChecked(true);
    // at() : an out of range query index is reported instead of being UB
    peptide_evidence_generic.setExperimentalMass(
      _summary_list.at(_current_query.query_index - 1).exp_mass);
    peptide_evidence_generic.setRetentionTime(_current_query.rt);
    peptide_evidence_generic.setIdentificationDataSource(
      _p_identification_data_source);

    // parse and save
    for(PeptideLine &peptide_line : peptide_list)
    {
      if(peptide_line.peptide_string_list.size() == 0)
      {
        // no results
        continue;
      }
      PeptideEvidence peptide_evidence(peptide_evidence_generic);
      /*
       * Fields of a peptide line :
       0 missed cleavages
       1 peptide Mr
       2 delta mass
       3 number of ions matched
       4 sequence
       5 peaks used from ions 1
       6 variable modifications string
       7 ions score
       8 ion series
       9 peaks used from ions 2
       10 peaks used from ions 3
       */
      if(peptide_line.peptide_string_list.size() < 8)
      {
        throw pappso::PappsoException(
          QObject::tr("unable to parse ion score in peptide line q%1_p%2")
            .arg(peptide_line.query_index)
            .arg(peptide_line.peptide_index));
      }
      const pappso::pappso_double ion_score =
        peptide_line.peptide_string_list.at(7).toDouble();
      peptide_evidence.setParam(PeptideEvidenceParam::mascot_score,
                                QVariant(ion_score));

      SummaryLine &summary =
        is_decoy ? _decoy_summary_list.at(_current_query.query_index - 1)
                 : _summary_list.at(_current_query.query_index - 1);
      peptide_evidence.setEvalue(getEvalue(ion_score, summary, 0.05));
      peptide_evidence.setParam(
        PeptideEvidenceParam::mascot_expectation_value,
        QVariant(peptide_evidence.getEvalue()));

      QString peptide_str = peptide_line.peptide_string_list.at(4);
      if(!peptide_line.subst.isEmpty())
      {
        // q856_p9_subst=1,X,W
        // q24379_p2_subst=1,B,D,8,B,D
        // triplets : 1-based position, ambiguous residue, substitute
        const QStringList subst_list = peptide_line.subst.split(",");
        if((subst_list.size() % 3) != 0)
        {
          throw pappso::PappsoException(
            QObject::tr("unable to parse substitution %1 in peptide line "
                        "q%2_p%3")
              .arg(peptide_line.subst)
              .arg(peptide_line.query_index)
              .arg(peptide_line.peptide_index));
        }
        for(int i = 0; i + 2 < subst_list.size(); i += 3)
        {
          peptide_str = peptide_str.replace(
            subst_list.at(i).toInt() - 1, 1, subst_list.at(i + 2));
        }
      }

      PeptideXtpSp peptide_sp = PeptideXtp(peptide_str).makePeptideXtpSp();
      // variable modifications :
      setVariableModifications(peptide_sp,
                               peptide_line.peptide_string_list.at(6));
      // fixed modifications :
      const unsigned int peptide_size =
        static_cast<unsigned int>(peptide_str.size());
      for(unsigned int i = 0; i < peptide_size; i++)
      {
        for(const MascotModification &mascot_modif : _fixed_modification_list)
        {
          if(peptide_str.at(i) == mascot_modif.residue)
          {
            peptide_sp.get()->addAaModification(mascot_modif.modification,
                                                i);
          }
        }
      }

      peptide_evidence.setPeptideXtpSp(
        _p_project->getPeptideStore().getInstance(peptide_sp));

      // const alias : iterating must not detach the Qt container
      const QStringList &protein_strings = peptide_line.protein_string_list;
      unsigned int i                     = 0;
      for(const QString &str : protein_strings)
      {
        // sp|O95006|OR2F2_HUMAN":0:299:303:1
        const int position = str.indexOf("\"", 0);
        QString accession  = str.mid(0, position);
        if(is_decoy)
        {
          accession = QString("%1|reversed").arg(accession);
        }
        const QStringList position_list = str.mid(position + 2).split(":");
        if(position_list.size() != 4)
        {
          throw pappso::PappsoException(
            QObject::tr("ERROR position_list.size() != 4 %1").arg(str));
        }
        // the dat file gives a 1-based start position
        const unsigned int start = position_list.at(1).toUInt() - 1;

        ProteinXtp protein;
        protein.setAccession(accession);
        if(peptide_line.fasta_file_list.size() == 0)
        {
          // no qN_pM_db line : only valid with a single FASTA file
          if(_fasta_file_list.size() == 1)
          {
            protein.setFastaFileP(_fasta_file_list[0].get());
          }
          else
          {
            throw pappso::PappsoException(
              QObject::tr("ERROR no FASTA file"));
          }
        }
        else if(i < peptide_line.fasta_file_list.size())
        {
          protein.setFastaFileP(peptide_line.fasta_file_list[i].get());
        }
        else
        {
          throw pappso::PappsoException(
            QObject::tr("ERROR no corresponding FASTA file %1").arg(i));
        }
        if(is_decoy)
        {
          protein.setIsDecoy(true);
        }

        ProteinMatch *p_protein_match =
          _p_identification_group->getProteinMatchInstance(accession);
        if(p_protein_match == nullptr)
        {
          throw pappso::PappsoException(
            QObject::tr("ERROR (p_protein_match == nullptr) %1").arg(str));
        }
        ProteinXtpSp sp_xtp_protein = protein.makeProteinXtpSp();
        p_protein_match->setProteinXtpSp(
          _p_project->getProteinStore().getInstance(sp_xtp_protein));
        p_protein_match->setChecked(true);

        PeptideMatch peptide_match;
        peptide_match.setStart(start);
        peptide_match.setPeptideEvidenceSp(
          _p_identification_data_source->getPeptideEvidenceStore()
            .getInstance(&peptide_evidence));
        p_protein_match->addPeptideMatch(peptide_match);
        i++;
      }
    }
  }
  catch(pappso::PappsoException &exception_pappso)
  {
    _error_str = QObject::tr(
                   "ERROR in MascotDatParser::savePeptideList "
                   " PAPPSO exception:\n%1")
                   .arg(exception_pappso.qwhat());
    qDebug() << _error_str;
    throw pappso::PappsoException(_error_str);
  }
  catch(const std::exception &exception_std)
  {
    _error_str = QObject::tr(
                   "ERROR in MascotDatParser::savePeptideList std "
                   "exception:\n%1")
                   .arg(exception_std.what());
    qDebug() << _error_str;
    throw pappso::PappsoException(_error_str);
  }
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__;
}

/**
 * \brief Computes an expectation value from an ion score.
 *
 * identity threshold = 10 * log10(match / (confidence_interval * 20))
 * evalue = confidence_interval * 10^((identity threshold - ion_score) / 10)
 *
 * \param ion_score ion score of the peptide
 * \param summary_line summary entry of the query (its "match" value is used)
 * \param confidence_interval confidence level, callers in this file pass 0.05
 * \return the expectation value
 */
pappso::pappso_double
MascotDatParser::getEvalue(pappso::pappso_double ion_score,
                           SummaryLine &summary_line,
                           pappso::pappso_double confidence_interval) const
{
  const pappso::pappso_double identity_threshold =
    10.0 * std::log10(summary_line.match / (confidence_interval * 20.0));
  return confidence_interval *
         std::pow(10.0, (identity_threshold - ion_score) / 10.0);
}

/**
 * \brief Computes an expectation value from the experimental homology
 * threshold (qplughole value of the dat file).
 *
 * evalue = confidence_interval / 10^((ion_score - plug_hole) / 10)
 *
 * \param ion_score ion score of the peptide
 * \param summary_line summary entry of the query (its "plug_hole" value is
 * used)
 * \param confidence_interval confidence level
 * \return the expectation value
 */
pappso::pappso_double
MascotDatParser::getEvalueExperimental(
  pappso::pappso_double ion_score,
  SummaryLine &summary_line,
  pappso::pappso_double confidence_interval) const
{
  const pappso::pappso_double exponent =
    (ion_score - summary_line.plug_hole) / 10.0;
  return confidence_interval / std::pow(10.0, exponent);
}

/**
 * \brief Applies the variable modifications encoded in a MASCOT modification
 * string to a peptide.
 *
 * Each character is read as a decimal digit: 0 means no modification, any
 * other digit is the 1-based number of a "deltaN" modification of the masses
 * section. Character 0 is applied on the first residue, character i (i > 0)
 * on residue i - 1.
 *
 * \param peptide_sp peptide to modify
 * \param variable_modifications_string modification string of the peptide
 * \throws pappso::PappsoException if a digit refers to an unknown
 * modification or if the modification cannot be added to the peptide
 */
void
MascotDatParser::setVariableModifications(
  PeptideXtpSp &peptide_sp, QString variable_modifications_string)
{
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
           << variable_modifications_string;

  const unsigned int string_size =
    static_cast<unsigned int>(variable_modifications_string.size());
  for(unsigned int i = 0; i < string_size; i++)
  {
    const unsigned int modification_number =
      variable_modifications_string.mid(i, 1).toUInt();
    qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__ << " "
             << modification_number << " " << variable_modifications_string;
    if(modification_number == 0)
    {
      continue;
    }
    if(modification_number >
       static_cast<std::size_t>(_delta_modification_list.size()))
    {
      throw pappso::PappsoException(
        QObject::tr("ERROR unknown variable modification number %1 in %2")
          .arg(modification_number)
          .arg(variable_modifications_string));
    }
    const MascotModification &mascot_modification =
      _delta_modification_list[modification_number - 1];

    // NOTE(review): the last character presumably describes the C-terminus;
    // it maps to position == peptide length here - confirm how
    // addAaModification handles that position.
    const unsigned int position = (i > 0) ? (i - 1) : 0;
    try
    {
      peptide_sp.get()->addAaModification(mascot_modification.modification,
                                          position);
    }
    catch(pappso::PappsoException &error)
    {
      throw pappso::PappsoException(
        QObject::tr("ERROR adding modification %1 to peptide %2 at "
                    "position %3 :\n %4")
          .arg(mascot_modification.modification->getAccession())
          .arg(peptide_sp.get()->toAbsoluteString())
          .arg(i)
          .arg(error.qwhat()));
    }
  }
  qDebug() << __FILE__ << " " << __FUNCTION__ << " " << __LINE__;
}