/**
 * \file /input/mascot/mascotdatparser.cpp
 * \date 17/2/2018
 * \author Olivier Langella
 * \brief MASCOT dat file parser
 */

/*******************************************************************************
* Copyright (c) 2018 Olivier Langella <olivier.langella@u-psud.fr>.
*
* This file is part of XTPcpp.
*
*     XTPcpp is free software: you can redistribute it and/or modify
*     it under the terms of the GNU General Public License as published by
*     the Free Software Foundation, either version 3 of the License, or
*     (at your option) any later version.
*
*     XTPcpp is distributed in the hope that it will be useful,
*     but WITHOUT ANY WARRANTY; without even the implied warranty of
*     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*     GNU General Public License for more details.
*
*     You should have received a copy of the GNU General Public License
*     along with XTPcpp.  If not, see <http://www.gnu.org/licenses/>.
*
* Contributors:
*     Olivier Langella <olivier.langella@u-psud.fr> - initial API and implementation
******************************************************************************/

#include "mascotdatparser.h"
#include "mimeparser.h"
#include <QDebug>
#include <pappsomspp/pappsoexception.h>
#include "../../core/peptidextp.h"
#include "../../core/proteinmatch.h"

/**
 * \brief Builds a parser bound to the project objects that receive the parsed data.
 *
 * The three pointers are stored as given (no copy is made) and are dereferenced
 * later by the line parsers of this class.
 *
 * \param p_project project whose protein, peptide and FASTA file stores are used
 * \param p_identification_group identification group queried for protein matches
 * \param p_identification_data_source data source that receives the engine version
 *        and the FASTA files found in the header section
 */
MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_identification_group,
                                 IdentificationDataSource * p_identification_data_source) {
    _p_project = p_project;
    _p_identification_group = p_identification_group;
    _p_identification_data_source = p_identification_data_source;

    // Generic "key=value" line: group 1 is the key, group 2 the raw value.
    // NOTE(review): the commas inside the character class are literal, so ','
    // is accepted as a key character -- confirm whether this is intended.
    _regexp_header_line.setPattern("^([a-z,0-9,_]+)=(.*)$");
}
/**
 * \brief Destructor: performs no explicit cleanup; the pointers received by the
 *        constructor are not deleted here.
 */
MascotDatParser::~MascotDatParser() = default;
/**
 * \brief Reads a MASCOT dat stream and dispatches each MIME part to its line parser.
 *
 * The parts named "proteins", "header" and "peptides" are read line by line and
 * handed to parseProteinLine(), parseHeaderLine() and parsePeptidesLine()
 * respectively; every other part is skipped.
 *
 * \param in_stream device handed to MimeParser, which exposes the dat content part by part
 */
void MascotDatParser::parse(QIODevice * in_stream) {
    qDebug() << "MascotDatParser::parse begin";
    MimeParser mime_parser(in_stream);
    mime_parser.open();

    // NOTE(review): close() below is not reached when a line parser throws;
    // confirm that MimeParser's destructor releases what open() acquired.
    for(bool more=mime_parser.goToFirstFile(); more; more=mime_parser.goToNextFile()) {
        qDebug() << "MascotDatParser::parse mimetype=" << mime_parser.getCurrentMimeType() << " filename=" << mime_parser.getCurrentFileName();

        // the part name selects the line parser; read it once per part
        const auto section_name = mime_parser.getCurrentFileName();

        if (section_name == "proteins") {
            while(!mime_parser.getCurrentTextStream().atEnd()) {
                parseProteinLine( mime_parser.getCurrentTextStream().readLine());
            }
        }
        else if (section_name == "header") {
            while(!mime_parser.getCurrentTextStream().atEnd()) {
                parseHeaderLine( mime_parser.getCurrentTextStream().readLine());
            }
        }
        else if (section_name == "peptides") {
            while(!mime_parser.getCurrentTextStream().atEnd()) {
                parsePeptidesLine( mime_parser.getCurrentTextStream().readLine());
            }
            // parsePeptidesLine() only stores a peptide hit when the "_db" line
            // of the NEXT hit is read, so the last hit of the section is still
            // pending here: flush it, otherwise it is silently lost.
            saveAndClearPeptide();
        }
    }

    mime_parser.close();
    qDebug() << "MascotDatParser::parse end";
}
void MascotDatParser::parseProteinLine(const QString & protein_line) {
Langella Olivier's avatar
Langella Olivier committed
    ProteinXtpSp sp_xtp_protein;
Langella Olivier's avatar
Langella Olivier committed
    //02::"tr|A0A0D9SF80|A0A0D9SF80_HUMAN"=55120.88,"General transcription factor II-I repeat domain-containing protein 2A OS=Homo sapiens GN=GTF2IRD2B PE=4 SV=1"
    QRegExp   regexp_protein("^(.*)::\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
    if (regexp_protein.exactMatch(protein_line)) {
        QStringList protein_list = regexp_protein.capturedTexts();
Langella Olivier's avatar
Langella Olivier committed
        _current_protein.setAccession(protein_list[2]);
        _current_protein.setDescription(protein_list[4]);

        sp_xtp_protein = _current_protein.makeProteinXtpSp();
        sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
Langella Olivier's avatar
Langella Olivier committed
    }
    else {
        QRegExp   regexp_proteinb("^\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
        if (regexp_proteinb.exactMatch(protein_line)) {
            QStringList protein_list = regexp_proteinb.capturedTexts();
Langella Olivier's avatar
Langella Olivier committed

            _current_protein.setAccession(protein_list[1]);
            _current_protein.setDescription(protein_list[3]);

            sp_xtp_protein = _current_protein.makeProteinXtpSp();
            sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
Langella Olivier's avatar
Langella Olivier committed
        }
        else {
            qDebug() << "MascotDatParser::parseProteinLine error " << protein_line;
        }
    }
}
Langella Olivier's avatar
Langella Olivier committed
void MascotDatParser::parseHeaderLine(const QString & header_line) {
Langella Olivier's avatar
Langella Olivier committed
    if (_regexp_header_line.exactMatch(header_line)) {
        QStringList header_list = _regexp_header_line.capturedTexts();
Langella Olivier's avatar
Langella Olivier committed
//sequences=73998
//sequences_after_tax=73998
//residues=24900901
Langella Olivier's avatar
Langella Olivier committed
        if (header_list[1].startsWith("residues")) {
            qDebug() << "queries=" <<  header_list[2];
            _number_of_residues = header_list[2].toUInt();
        }
Langella Olivier's avatar
Langella Olivier committed
//distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
//decoy_type=1
//distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
//exec_time=484
//date=1517587671
//time=17:07:51
//queries=54084
Langella Olivier's avatar
Langella Olivier committed
        else if (header_list[1].startsWith("queries")) {
            qDebug() << "queries=" <<  header_list[2];
            _number_of_queries = header_list[2].toUInt();
Langella Olivier's avatar
Langella Olivier committed
            _query_peptide_results.resize(_number_of_queries);
Langella Olivier's avatar
Langella Olivier committed
//min_peaks_for_homology=6
//max_hits=50
//version=2.5.0
Langella Olivier's avatar
Langella Olivier committed
        else if (header_list[1] == "version") {
Langella Olivier's avatar
Langella Olivier committed
            _p_identification_data_source->setIdentificationEngineVersion(header_list[2]);
        }
//fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
        else if (header_list[1].startsWith("fastafile")) {
            qDebug() << "fastafile=" <<  header_list[2];
            _fasta_file_list.push_back( _p_project->getFastaFileStore().getInstance(FastaFile(header_list[2])));

            _p_identification_data_source->addFastaFile(_fasta_file_list.back());
        }
//release=ECOLI_INRA_1.fasta
//sequences1=4305
//sequences_after_tax1=4305
//residues1=1356026
//fastafile2=C:/inetpub/mascot/sequence/HUMAN_INRA/current/HUMAN_INRA_1.fasta
//release2=HUMAN_INRA_1.fasta
//sequences2=69693
//sequences_after_tax2=69693
//residues2=23544875
//taskid=151758718701
    }
}
Langella Olivier's avatar
Langella Olivier committed
void MascotDatParser::saveAndClearPeptide() {
    qDebug() << "MascotDatParser::saveAndClearPeptide begin";
    if (_current_peptide.query_index > 0) {
Langella Olivier's avatar
Langella Olivier committed
        // save
        _query_peptide_results[_current_peptide.query_index-1].push_back(_current_peptide);
Langella Olivier's avatar
Langella Olivier committed
        //parse and save

Langella Olivier's avatar
Langella Olivier committed
        QString peptide_str =_current_peptide.peptide_string_list.at(4);
        if (!_current_peptide.subst.isEmpty()) {
            //q856_p9_subst=1,X,W
            //q24379_p2_subst=1,B,D,8,B,D
            QStringList subst_list = _current_peptide.subst.split(",");
            for (unsigned int i=0; i < subst_list.size(); i+=3) {
                peptide_str = peptide_str.replace(subst_list.at(0+i).toInt()-1,1,subst_list.at(2+i));
            }
        }
Langella Olivier's avatar
Langella Olivier committed
        PeptideXtpSp peptide_sp;
        peptide_sp = PeptideXtp(peptide_str).makePeptideXtpSp();
        peptide_sp = _p_project->getPeptideStore().getInstance(peptide_sp);
Langella Olivier's avatar
Langella Olivier committed

        if (_current_peptide.protein_string_list.size() != _current_peptide.fasta_file_list.size()) {
            throw pappso::PappsoException(QObject::tr("ERROR (_current_peptide.protein_string_list.size() != _current_peptide.fasta_file_list.size()) %1").arg(_current_peptide.protein_string_list.join(",\"")));
        }

        foreach (const QString &str, _current_peptide.protein_string_list) {
            //sp|O95006|OR2F2_HUMAN":0:299:303:1
            int position =  str.indexOf("\"", 0);
            QString accession = str.mid(0, position);
            qDebug() << "accession=" <<  accession;
            QStringList position_list = str.mid(position+2).split(":");
            if (position_list.size() != 4) {
                throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(str));
            }
            unsigned int start = position_list.at(1).toUInt();
            unsigned int stop = position_list.at(2).toUInt();
Langella Olivier's avatar
Langella Olivier committed

            ProteinXtp protein;
            protein.setAccession(accession);

            ProteinMatch * p_protein_match = _p_identification_group->getProteinMatchInstance(protein.getAccession());
            if (p_protein_match == nullptr) {
                throw pappso::PappsoException(QObject::tr("ERROR (p_protein_match == nullptr) %1").arg(str));
            }

            ProteinXtpSp sp_xtp_protein = protein.makeProteinXtpSp();
            p_protein_match->setProteinXtpSp(_p_project->getProteinStore().getInstance(sp_xtp_protein));
            p_protein_match->setChecked(true);
            
            PeptideMatch peptide_match;
            peptide_match.setStart(start);
            //peptide_match.setPeptideEvidenceSp();
            
            p_protein_match->addPeptideMatch(peptide_match);
Langella Olivier's avatar
Langella Olivier committed
        }
    }

    //new peptide query clear
    _current_peptide.peptide_string_list.clear();
    _current_peptide.fasta_file_list.clear();
    _current_peptide.query_index = 0;
    _current_peptide.subst = "";
    qDebug() << "MascotDatParser::saveAndClearPeptide end";
}

Langella Olivier's avatar
Langella Olivier committed
void MascotDatParser::parsePeptidesLine(const QString & peptide_line) {
Langella Olivier's avatar
Langella Olivier committed
    try {
        if (_regexp_header_line.exactMatch(peptide_line)) {
            QStringList header_list = _regexp_header_line.capturedTexts();
            QString index = header_list[1];
            QString value = header_list[2];
            QStringList index_list = index.split("_");
            if (index_list.size() == 3) {
                if (index_list[2] == "db") {
Langella Olivier's avatar
Langella Olivier committed
                    saveAndClearPeptide();
Langella Olivier's avatar
Langella Olivier committed
                    //q1_p1_db=02
                    while (value.size() > 0) {
                        QString fasta_str = value.mid(0,2);
Langella Olivier's avatar
Langella Olivier committed
                        _current_peptide.fasta_file_list.push_back(_fasta_file_list.at(fasta_str.toInt()-1));
Langella Olivier's avatar
Langella Olivier committed
                        value = value.mid(2);
                    }
Langella Olivier's avatar
Langella Olivier committed
                }
Langella Olivier's avatar
Langella Olivier committed
                //q856_p9_subst=1,X,W
                else if (index_list[2] == "subst") {
                    _current_peptide.subst = value;
                }
Langella Olivier's avatar
Langella Olivier committed
            }
Langella Olivier's avatar
Langella Olivier committed
            else if (index_list.size() == 2) {
                if (value == "-1") {
                    //no result for this query
                }
                else {

                    QString query_index = index_list[0];
Langella Olivier's avatar
Langella Olivier committed
                    _current_peptide.query_index = query_index.mid(1).toUInt();
Langella Olivier's avatar
Langella Olivier committed
                    QString peptide_index = index_list[1];
Langella Olivier's avatar
Langella Olivier committed
                    _current_peptide.peptide_index = peptide_index.mid(1).toUInt();
Langella Olivier's avatar
Langella Olivier committed
                    //q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
                    int position =  value.indexOf(";\"", 0);
                    QString peptide_string = value.mid(0, position);
                    qDebug() << "peptide_string=" <<  peptide_string;

Langella Olivier's avatar
Langella Olivier committed
                    _current_peptide.peptide_string_list = peptide_string.split(",");
Langella Olivier's avatar
Langella Olivier committed
                    QString protein_string = value.mid(position+2);
                    qDebug() << "protein_string=" <<  protein_string;
                    //"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
Langella Olivier's avatar
Langella Olivier committed
                    _current_peptide.protein_string_list = protein_string.split(",\"");
Langella Olivier's avatar
Langella Olivier committed
                }


            }
            /*
            q1_p1_db=02
            q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
            q1_p1_terms=K,L
            q1_p2_db=02
            q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1
            q1_p2_terms=R,-
            q2_p1_db=02
            q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2
            q2_p1_terms=K,K
            q2_p2_db=0202
            q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1
            q2_p2_terms=R,V:R,V
            q2_p3_db=02
            q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2
            q2_p3_terms=K,K
            q2_p4_db=0202
            q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
            q2_p4_terms=K,F:K,F
            */


            /*
            q856_p9_db=0202
            q856_p9=0,685.427521,-0.000117,3,XLLVR,12,0000000,13.68,0000002000000000000,0,0;"tr|V9GY00|V9GY00_HUMAN":0:1:5:1,"tr|H7C3C3|H7C3C3_HUMAN":0:1:5:1
            q856_p9_terms=-,L:-,V
            q856_p9_subst=1,X,W
            */
Langella Olivier's avatar
Langella Olivier committed
        }
Langella Olivier's avatar
Langella Olivier committed
    }

    catch (pappso::PappsoException exception_pappso) {
        _error_str = QObject::tr("ERROR in MascotDatParser::parsePeptidesLine %1, PAPPSO exception:\n%2").arg(peptide_line).arg(exception_pappso.qwhat());
        qDebug() << _error_str;
        throw pappso::PappsoException(_error_str);
    }
    catch (std::exception exception_std) {
        _error_str = QObject::tr("ERROR in MascotDatParser::parsePeptidesLine %1, std exception:\n%2").arg(peptide_line).arg(exception_std.what());
        qDebug() << _error_str;
        throw pappso::PappsoException(_error_str);