Skip to content
Snippets Groups Projects
mascotdatparser.cpp 5.31 KiB
Newer Older
Langella Olivier's avatar
Langella Olivier committed
/**
 * \file /input/mascot/mascotdatparser.cpp
 * \date 17/2/2018
 * \author Olivier Langella
 * \brief MASCOT dat file parser
 */

/*******************************************************************************
* Copyright (c) 2018 Olivier Langella <olivier.langella@u-psud.fr>.
*
* This file is part of XTPcpp.
*
*     XTPcpp is free software: you can redistribute it and/or modify
*     it under the terms of the GNU General Public License as published by
*     the Free Software Foundation, either version 3 of the License, or
*     (at your option) any later version.
*
*     XTPcpp is distributed in the hope that it will be useful,
*     but WITHOUT ANY WARRANTY; without even the implied warranty of
*     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*     GNU General Public License for more details.
*
*     You should have received a copy of the GNU General Public License
*     along with XTPcpp.  If not, see <http://www.gnu.org/licenses/>.
*
* Contributors:
*     Olivier Langella <olivier.langella@u-psud.fr> - initial API and implementation
******************************************************************************/

#include "mascotdatparser.h"
#include "mimeparser.h"
#include <QDebug>

/**
 * \brief Builds a parser bound to the objects it will fill while parsing.
 *
 * The three pointers are stored unchanged; no other work is done here.
 *
 * \param p_project project whose protein store and FASTA file store are
 *        used by the line parsers
 * \param p_identification_group identification group, stored for later use
 * \param p_identification_data_source data source that receives the engine
 *        version and the FASTA files found in the header section
 */
MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_identification_group,
                                 IdentificationDataSource * p_identification_data_source)
{
    this->_p_project = p_project;
    this->_p_identification_group = p_identification_group;
    this->_p_identification_data_source = p_identification_data_source;
}
/**
 * \brief Destructor with an empty body: the stored pointers are not released here.
 */
MascotDatParser::~MascotDatParser()
{
}
/**
 * \brief Walks through every part of a MASCOT dat stream and parses the
 *        sections handled by this class.
 *
 * The device is wrapped in a MimeParser. Each part is logged, then:
 *  - lines of the part named "proteins" are given to parseProteinLine();
 *  - lines of the part named "header" are given to parseHeaderLine();
 *  - any other part is skipped.
 *
 * \param in_stream device handed to the MimeParser
 */
void MascotDatParser::parse(QIODevice * in_stream) {
    qDebug() << "MascotDatParser::parse begin";
    MimeParser mime_parser(in_stream);
    mime_parser.open();

    bool has_part = mime_parser.goToFirstFile();
    while (has_part) {
        qDebug() << "MascotDatParser::parse mimetype=" << mime_parser.getCurrentMimeType() << " filename=" << mime_parser.getCurrentFileName();

        if (mime_parser.getCurrentFileName() == "proteins") {
            // one protein record per line
            while (!mime_parser.getCurrentTextStream().atEnd()) {
                const QString protein_line = mime_parser.getCurrentTextStream().readLine();
                parseProteinLine(protein_line);
            }
        }
        else if (mime_parser.getCurrentFileName() == "header") {
            // one key=value pair per line
            while (!mime_parser.getCurrentTextStream().atEnd()) {
                const QString header_line = mime_parser.getCurrentTextStream().readLine();
                parseHeaderLine(header_line);
            }
        }

        // move on to the following part, if any
        has_part = mime_parser.goToNextFile();
    }

    mime_parser.close();
    qDebug() << "MascotDatParser::parse end";
}
void MascotDatParser::parseProteinLine(const QString & protein_line) {
Langella Olivier's avatar
Langella Olivier committed
    ProteinXtpSp sp_xtp_protein;
Langella Olivier's avatar
Langella Olivier committed
    //02::"tr|A0A0D9SF80|A0A0D9SF80_HUMAN"=55120.88,"General transcription factor II-I repeat domain-containing protein 2A OS=Homo sapiens GN=GTF2IRD2B PE=4 SV=1"
    QRegExp   regexp_protein("^(.*)::\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
    if (regexp_protein.exactMatch(protein_line)) {
        QStringList protein_list = regexp_protein.capturedTexts();
Langella Olivier's avatar
Langella Olivier committed
        _current_protein.setAccession(protein_list[2]);
        _current_protein.setDescription(protein_list[4]);

        sp_xtp_protein = _current_protein.makeProteinXtpSp();
        sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
Langella Olivier's avatar
Langella Olivier committed
    }
    else {
        QRegExp   regexp_proteinb("^\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
        if (regexp_proteinb.exactMatch(protein_line)) {
            QStringList protein_list = regexp_proteinb.capturedTexts();
Langella Olivier's avatar
Langella Olivier committed

            _current_protein.setAccession(protein_list[1]);
            _current_protein.setDescription(protein_list[3]);

            sp_xtp_protein = _current_protein.makeProteinXtpSp();
            sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
Langella Olivier's avatar
Langella Olivier committed
        }
        else {
            qDebug() << "MascotDatParser::parseProteinLine error " << protein_line;
        }
    }
}
Langella Olivier's avatar
Langella Olivier committed
void MascotDatParser::parseHeaderLine(const QString & header_line) {
    QRegExp   regexp_header_line("^([a-z,0-9,_]+)=(.*)$");
    if (regexp_header_line.exactMatch(header_line)) {
        QStringList header_list = regexp_header_line.capturedTexts();
//sequences=73998
//sequences_after_tax=73998
//residues=24900901
//distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
//decoy_type=1
//distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
//exec_time=484
//date=1517587671
//time=17:07:51
//queries=54084
//min_peaks_for_homology=6
//max_hits=50
//version=2.5.0
        if (header_list[1] == "version") {
            _p_identification_data_source->setIdentificationEngineVersion(header_list[2]);
        }
//fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
        else if (header_list[1].startsWith("fastafile")) {
            qDebug() << "fastafile=" <<  header_list[2];
            _fasta_file_list.push_back( _p_project->getFastaFileStore().getInstance(FastaFile(header_list[2])));

            _p_identification_data_source->addFastaFile(_fasta_file_list.back());
        }
//release=ECOLI_INRA_1.fasta
//sequences1=4305
//sequences_after_tax1=4305
//residues1=1356026
//fastafile2=C:/inetpub/mascot/sequence/HUMAN_INRA/current/HUMAN_INRA_1.fasta
//release2=HUMAN_INRA_1.fasta
//sequences2=69693
//sequences_after_tax2=69693
//residues2=23544875
//taskid=151758718701
    }
}