/**
 * \file /input/mascot/mascotdatparser.h
 * \date 17/2/2018
 * \author Olivier Langella
 * \brief MASCOT dat file parser
 */

/*******************************************************************************
 * Copyright (c) 2018 Olivier Langella <olivier.langella@u-psud.fr>.
 *
 * This file is part of XTPcpp.
 *
 *     XTPcpp is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     XTPcpp is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 *
 *     You should have received a copy of the GNU General Public License
 *     along with XTPcpp.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contributors:
 *     Olivier Langella <olivier.langella@u-psud.fr> - initial API and implementation
 ******************************************************************************/

#include "mascotdatparser.h"
#include "mimeparser.h"
#include <QDebug>


/**
 * \brief Store the project objects this parser works with.
 *
 * The three pointers are only copied into members here; nothing is
 * dereferenced in the constructor.
 *
 * \param p_project project whose protein and FASTA file stores are used
 *        while parsing
 * \param p_identification_group identification group kept by the parser
 *        (not used by the code in this file)
 * \param p_identification_data_source data source that receives the engine
 *        version and the FASTA files read from the header section
 */
MascotDatParser::MascotDatParser(Project * p_project,
                                 IdentificationGroup * p_identification_group,
                                 IdentificationDataSource * p_identification_data_source)
{
    _p_project = p_project;
    _p_identification_group = p_identification_group;
    _p_identification_data_source = p_identification_data_source;
}

/** \brief Nothing to release explicitly. */
MascotDatParser::~MascotDatParser()
{
}

/**
 * \brief Walk every MIME part of a MASCOT dat stream and parse the known ones.
 *
 * Parts named "proteins" are fed line by line to parseProteinLine(), parts
 * named "header" to parseHeaderLine(). Any other part is only logged.
 *
 * \param in_stream device the MimeParser reads from
 */
void MascotDatParser::parse(QIODevice * in_stream)
{
    qDebug() << "MascotDatParser::parse begin";

    MimeParser mime_parser(in_stream);
    mime_parser.open();

    bool has_file = mime_parser.goToFirstFile();
    while(has_file)
    {
        // NOTE(review): assumes getCurrentFileName() is a side-effect-free
        // accessor, so reading it once per part is enough.
        const auto file_name = mime_parser.getCurrentFileName();
        qDebug() << "MascotDatParser::parse mimetype="
                 << mime_parser.getCurrentMimeType()
                 << " filename=" << file_name;

        if(file_name == "proteins")
        {
            // NOTE(review): assumes getCurrentTextStream() hands back the same
            // stream for the whole part.
            auto && text_stream = mime_parser.getCurrentTextStream();
            while(!text_stream.atEnd())
            {
                parseProteinLine(text_stream.readLine());
            }
        }
        else if(file_name == "header")
        {
            auto && text_stream = mime_parser.getCurrentTextStream();
            while(!text_stream.atEnd())
            {
                parseHeaderLine(text_stream.readLine());
            }
        }

        has_file = mime_parser.goToNextFile();
    }

    mime_parser.close();
    qDebug() << "MascotDatParser::parse end";
}

/**
 * \brief Parse one line of the "proteins" section and register the protein.
 *
 * Two layouts are recognised: one with a leading `<prefix>::` before the
 * quoted accession, and one without it. On a match the accession and
 * description are written into _current_protein, a ProteinXtpSp is built from
 * it and passed through the project's protein store. Lines matching neither
 * layout are reported with qDebug() and otherwise ignored.
 *
 * \param protein_line one raw text line of the "proteins" section
 */
void MascotDatParser::parseProteinLine(const QString & protein_line)
{
    QString accession;
    QString description;

    //02::"tr|A0A0D9SF80|A0A0D9SF80_HUMAN"=55120.88,"General transcription factor II-I repeat domain-containing protein 2A OS=Homo sapiens GN=GTF2IRD2B PE=4 SV=1"
    QRegExp prefixed_pattern("^(.*)::\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
    if(prefixed_pattern.exactMatch(protein_line))
    {
        const QStringList captures = prefixed_pattern.capturedTexts();
        accession = captures[2];
        description = captures[4];
    }
    else
    {
        // same layout, without the leading "<prefix>::"
        QRegExp plain_pattern("^\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
        if(!plain_pattern.exactMatch(protein_line))
        {
            qDebug() << "MascotDatParser::parseProteinLine error " << protein_line;
            return;
        }
        const QStringList captures = plain_pattern.capturedTexts();
        accession = captures[1];
        description = captures[3];
    }

    _current_protein.setAccession(accession);
    _current_protein.setDescription(description);

    // The value returned by the store is kept only in this local and is not
    // used further in this function.
    ProteinXtpSp stored_protein = _current_protein.makeProteinXtpSp();
    stored_protein = _p_project->getProteinStore().getInstance(stored_protein);
}

/**
 * \brief Parse one `key=value` line of the "header" section.
 *
 * Only two kinds of keys are acted upon:
 *  - `version`: forwarded as identification engine version to the data source
 *  - any key starting with `fastafile`: the value is turned into a FastaFile,
 *    passed through the project's FASTA file store, appended to
 *    _fasta_file_list and attached to the data source
 *
 * Lines that do not match the `key=value` pattern, and all other keys, are
 * silently skipped.
 *
 * \param header_line one raw text line of the "header" section
 */
void MascotDatParser::parseHeaderLine(const QString & header_line)
{
    QRegExp key_value_pattern("^([a-z,0-9,_]+)=(.*)$");
    if(!key_value_pattern.exactMatch(header_line))
    {
        return;
    }

    const QStringList captures = key_value_pattern.capturedTexts();
    QString key = captures[1];
    QString value = captures[2];

    //sequences=73998
    //sequences_after_tax=73998
    //residues=24900901
    //distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
    //decoy_type=1
    //distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
    //exec_time=484
    //date=1517587671
    //time=17:07:51
    //queries=54084
    //min_peaks_for_homology=6
    //max_hits=50
    //version=2.5.0
    if(key == "version")
    {
        _p_identification_data_source->setIdentificationEngineVersion(value);
        return;
    }

    //fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
    if(key.startsWith("fastafile"))
    {
        qDebug() << "fastafile=" << value;
        _fasta_file_list.push_back(
            _p_project->getFastaFileStore().getInstance(FastaFile(value)));
        _p_identification_data_source->addFastaFile(_fasta_file_list.back());
    }
    //release=ECOLI_INRA_1.fasta
    //sequences1=4305
    //sequences_after_tax1=4305
    //residues1=1356026
    //fastafile2=C:/inetpub/mascot/sequence/HUMAN_INRA/current/HUMAN_INRA_1.fasta
    //release2=HUMAN_INRA_1.fasta
    //sequences2=69693
    //sequences_after_tax2=69693
    //residues2=23544875
    //taskid=151758718701
}