diff --git a/src/input/mascot/mascotdatparser.cpp b/src/input/mascot/mascotdatparser.cpp
index 724d913403a8ee86d03729c662a5dcef68c7531c..798ff53e23b5d734aef25d2d6f2f4a834aae8137 100644
--- a/src/input/mascot/mascotdatparser.cpp
+++ b/src/input/mascot/mascotdatparser.cpp
@@ -53,24 +53,97 @@ void MascotDatParser::parse(QIODevice * in_stream) {
                 parseProteinLine( mime_parser.getCurrentTextStream().readLine());
             }
         }
+        else if (mime_parser.getCurrentFileName() == "header") {
+            while(!mime_parser.getCurrentTextStream().atEnd()) {
+                parseHeaderLine( mime_parser.getCurrentTextStream().readLine());
+            }
+        }
     }
     mime_parser.close();
     qDebug() << "MascotDatParser::parse end";
 }
 
+// Parses one protein line. Two layouts are accepted:
+//   <prefix>::"<accession>"=<number>,"<description>"
+//   "<accession>"=<number>,"<description>"
+// The accession and description are stored in _current_protein, and the
+// protein built from it is registered in the project's protein store.
+// The numeric field is captured but not used here. Lines matching neither
+// layout are only reported through qDebug().
 void MascotDatParser::parseProteinLine(const QString & protein_line) {
+    ProteinXtpSp sp_xtp_protein;
     //02::"tr|A0A0D9SF80|A0A0D9SF80_HUMAN"=55120.88,"General transcription factor II-I repeat domain-containing protein 2A OS=Homo sapiens GN=GTF2IRD2B PE=4 SV=1"
     QRegExp regexp_protein("^(.*)::\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
     if (regexp_protein.exactMatch(protein_line)) {
         QStringList protein_list = regexp_protein.capturedTexts();
+        _current_protein.setAccession(protein_list[2]);
+        _current_protein.setDescription(protein_list[4]);
+
+        sp_xtp_protein = _current_protein.makeProteinXtpSp();
+        sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
     }
     else {
         QRegExp regexp_proteinb("^\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
         if (regexp_proteinb.exactMatch(protein_line)) {
             QStringList protein_list = regexp_proteinb.capturedTexts();
+
+            _current_protein.setAccession(protein_list[1]);
+            _current_protein.setDescription(protein_list[3]);
+
+            sp_xtp_protein = _current_protein.makeProteinXtpSp();
+            sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
         }
         else {
             qDebug() << "MascotDatParser::parseProteinLine error " << protein_line;
         }
     }
 }
+
+// Parses one "key=value" line of the "header" section.
+// - "version": the value is passed to the identification data source as the
+//   identification engine version.
+// - any key starting with "fastafile": the value is turned into a FastaFile,
+//   registered in the project's FASTA file store, remembered in
+//   _fasta_file_list and attached to the identification data source.
+// Every other key, and any line that is not of the form key=value, is ignored.
+void MascotDatParser::parseHeaderLine(const QString & header_line) {
+    // key: lowercase letters, digits and underscores; value: rest of the line
+    QRegExp regexp_header_line("^([a-z0-9_]+)=(.*)$");
+    if (regexp_header_line.exactMatch(header_line)) {
+        QStringList header_list = regexp_header_line.capturedTexts();
+//sequences=73998
+//sequences_after_tax=73998
+//residues=24900901
+//distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
+//decoy_type=1
+//distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
+//exec_time=484
+//date=1517587671
+//time=17:07:51
+//queries=54084
+//min_peaks_for_homology=6
+//max_hits=50
+//version=2.5.0
+        if (header_list[1] == "version") {
+            _p_identification_data_source->setIdentificationEngineVersion(header_list[2]);
+        }
+//fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
+        // startsWith() also accepts the numbered keys (e.g. fastafile2 below)
+        else if (header_list[1].startsWith("fastafile")) {
+            qDebug() << "fastafile=" << header_list[2];
+            _fasta_file_list.push_back( _p_project->getFastaFileStore().getInstance(FastaFile(header_list[2])));
+
+            _p_identification_data_source->addFastaFile(_fasta_file_list.back());
+        }
+//release=ECOLI_INRA_1.fasta
+//sequences1=4305
+//sequences_after_tax1=4305
+//residues1=1356026
+//fastafile2=C:/inetpub/mascot/sequence/HUMAN_INRA/current/HUMAN_INRA_1.fasta
+//release2=HUMAN_INRA_1.fasta
+//sequences2=69693
+//sequences_after_tax2=69693
+//residues2=23544875
+//taskid=151758718701
+    }
+}
diff --git a/src/input/mascot/mascotdatparser.h b/src/input/mascot/mascotdatparser.h
index 880ba7ba0b92b2f2f7a8a706a27c48d4e62fee0d..aba84f8f562abe3036ae57a31745e2301e02f5d2 100644
--- a/src/input/mascot/mascotdatparser.h
+++ b/src/input/mascot/mascotdatparser.h
@@ -41,10 +41,18 @@ public:
     void parse(QIODevice * in_stream);
 private:
     void parseProteinLine(const QString & protein_line);
+    // Handles one "key=value" line of the "header" section.
+    void parseHeaderLine(const QString & header_line);
 private:
     Project * _p_project;
     IdentificationGroup * _p_identification_group;
     IdentificationDataSource * _p_identification_data_source;
+
+
+    // Protein object that parseProteinLine() fills in for each parsed line.
+    ProteinXtp _current_protein;
+    // FASTA files collected by parseHeaderLine(), in the order encountered.
+    std::vector<FastaFileSp> _fasta_file_list;
 };
 
 #endif // MASCOTDATPARSER_H