diff --git a/src/input/mascot/mascotdatparser.cpp b/src/input/mascot/mascotdatparser.cpp index 5d92c3b479d5a6696e083cdaeaa96329e837cc05..f49bb8ac45d908cb0adc387adf52651aeb93d1c7 100644 --- a/src/input/mascot/mascotdatparser.cpp +++ b/src/input/mascot/mascotdatparser.cpp @@ -30,6 +30,8 @@ #include "mascotdatparser.h" #include "mimeparser.h" #include <QDebug> +#include <pappsomspp/pappsoexception.h> +#include <pappsomspp/peptide/peptide.h> MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_identification_group, IdentificationDataSource * p_identification_data_source) { @@ -37,6 +39,8 @@ MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_id _p_identification_group = p_identification_group; _p_identification_data_source = p_identification_data_source; + _regexp_header_line.setPattern("^([a-z,0-9,_]+)=(.*)$"); + } MascotDatParser::~MascotDatParser() { } @@ -97,12 +101,15 @@ void MascotDatParser::parseProteinLine(const QString & protein_line) { } } void MascotDatParser::parseHeaderLine(const QString & header_line) { - QRegExp regexp_header_line("^([a-z,0-9,_]+)=(.*)$"); - if (regexp_header_line.exactMatch(header_line)) { - QStringList header_list = regexp_header_line.capturedTexts(); + if (_regexp_header_line.exactMatch(header_line)) { + QStringList header_list = _regexp_header_line.capturedTexts(); //sequences=73998 //sequences_after_tax=73998 //residues=24900901 + if (header_list[1].startsWith("residues")) { + qDebug() << "queries=" << header_list[2]; + _number_of_residues = header_list[2].toUInt(); + } //distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1 //decoy_type=1 //distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1 @@ -110,10 +117,14 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) { //date=1517587671 //time=17:07:51 //queries=54084 + else if (header_list[1].startsWith("queries")) { + qDebug() << "queries=" << header_list[2]; + _number_of_queries = header_list[2].toUInt(); + } //min_peaks_for_homology=6 //max_hits=50 //version=2.5.0 - if (header_list[1] == "version") { + else if (header_list[1] == "version") { _p_identification_data_source->setIdentificationEngineVersion(header_list[2]); } //fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta @@ -137,46 +148,104 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) { } void MascotDatParser::parsePeptidesLine(const QString & peptide_line) { - QRegExp regexp_header_line("^([a-z,0-9,_]+)=(.*)$"); - if (regexp_header_line.exactMatch(peptide_line)) { - QStringList header_list = regexp_header_line.capturedTexts(); - QString index = header_list[1]; - QString value = header_list[2]; - QStringList index_list = index.split("_"); - if (index_list.size() == 3) { - if (index_list[2] == "db") { - _peptides_fasta_file_list.clear(); - while(value.size() > 0) { - _peptides_fasta_file_list.push_back( _fasta_file_list[value.left(2).toInt()-1]); - value = value.mid(2); + try { + if (_regexp_header_line.exactMatch(peptide_line)) { + QStringList header_list = _regexp_header_line.capturedTexts(); + QString index = header_list[1]; + QString value = header_list[2]; + QStringList index_list = index.split("_"); + if (index_list.size() == 3) { + if (index_list[2] == "db") { + //q1_p1_db=02 + _peptides_fasta_file_list.clear(); + while (value.size() > 0) { + QString fasta_str = value.mid(0,2); + _peptides_fasta_file_list.push_back(_fasta_file_list.at(fasta_str.toInt()-1)); + value = value.mid(2); + } } } - } - else if (index_list.size() == 2) { + else if (index_list.size() == 2) { + if (value == "-1") { + //no result for this query + } + else { + + QString query_index = index_list[0]; + QString peptide_index = index_list[1]; + //q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1 + int position = value.indexOf(";\"", 0); + QString peptide_string = value.mid(0, position); + qDebug() << "peptide_string=" << peptide_string; + + QStringList peptide_string_list = peptide_string.split(","); + pappso::Peptide peptide(peptide_string_list.at(4)); + + - QString query = index_list[0]; - QString peptide = index_list[1]; + QString protein_string = value.mid(position+2); + qDebug() << "protein_string=" << protein_string; + //"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2 + QStringList protein_string_list = protein_string.split(",\""); + if (protein_string_list.size() != _peptides_fasta_file_list.size()) { + throw pappso::PappsoException(QObject::tr("ERROR (protein_string_list.size() != _peptides_fasta_file_list.size()) %1").arg(value)); + } + foreach (const QString &str, protein_string_list) { + //sp|O95006|OR2F2_HUMAN":0:299:303:1 + int position = str.indexOf("\"", 0); + QString accession = str.mid(0, position); + qDebug() << "accession=" << accession; + QStringList position_list = str.mid(position+2).split(":"); + if (position_list.size() != 4) { + throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(value)); + } + unsigned int start = position_list.at(1).toUInt(); + unsigned int stop = position_list.at(2).toUInt(); + } + } + + + } + /* + q1_p1_db=02 + q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1 + q1_p1_terms=K,L + q1_p2_db=02 + q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1 + q1_p2_terms=R,- + q2_p1_db=02 + q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2 + q2_p1_terms=K,K + q2_p2_db=0202 + q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1 + q2_p2_terms=R,V:R,V + q2_p3_db=02 + q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2 + q2_p3_terms=K,K + q2_p4_db=0202 + q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2 + q2_p4_terms=K,F:K,F + */ + + + /* + q856_p9_db=0202 + q856_p9=0,685.427521,-0.000117,3,XLLVR,12,0000000,13.68,0000002000000000000,0,0;"tr|V9GY00|V9GY00_HUMAN":0:1:5:1,"tr|H7C3C3|H7C3C3_HUMAN":0:1:5:1 + q856_p9_terms=-,L:-,V + q856_p9_subst=1,X,W + */ } - /* - q1_p1_db=02 - q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1 - q1_p1_terms=K,L - q1_p2_db=02 - q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1 - q1_p2_terms=R,- - q2_p1_db=02 - q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2 - q2_p1_terms=K,K - q2_p2_db=0202 - q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1 - q2_p2_terms=R,V:R,V - q2_p3_db=02 - q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2 - q2_p3_terms=K,K - q2_p4_db=0202 - q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2 - q2_p4_terms=K,F:K,F - */ + } + + catch (pappso::PappsoException exception_pappso) { + _error_str = QObject::tr("ERROR in MascotDatParser::parsePeptidesLine %1, PAPPSO exception:\n%2").arg(peptide_line).arg(exception_pappso.qwhat()); + qDebug() << _error_str; + throw pappso::PappsoException(_error_str); + } + catch (std::exception exception_std) { + _error_str = QObject::tr("ERROR in MascotDatParser::parsePeptidesLine %1, std exception:\n%2").arg(peptide_line).arg(exception_std.what()); + qDebug() << _error_str; + throw pappso::PappsoException(_error_str); } } diff --git a/src/input/mascot/mascotdatparser.h b/src/input/mascot/mascotdatparser.h index d28551c665025266f14d440266e26c758abffe3a..f15aa1ff5382385c11d44f57ae716cc4510d6f6d 100644 --- a/src/input/mascot/mascotdatparser.h +++ b/src/input/mascot/mascotdatparser.h @@ -47,12 +47,18 @@ private: Project * _p_project; IdentificationGroup * _p_identification_group; IdentificationDataSource * _p_identification_data_source; - - + + ProteinXtp _current_protein; std::vector<FastaFileSp> _fasta_file_list; - + std::vector<FastaFileSp> _peptides_fasta_file_list; + + QRegExp _regexp_header_line; + unsigned int _number_of_queries=0; + unsigned int _number_of_residues=0; + QString _error_str; + }; #endif // MASCOTDATPARSER_H