Skip to content
Snippets Groups Projects
Commit 3581ca04 authored by Langella Olivier's avatar Langella Olivier
Browse files

WIP: Mascot parser

parent 19ab80b7
No related branches found
No related tags found
No related merge requests found
......@@ -30,6 +30,8 @@
#include "mascotdatparser.h"
#include "mimeparser.h"
#include <QDebug>
#include <pappsomspp/pappsoexception.h>
#include <pappsomspp/peptide/peptide.h>
MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_identification_group,
IdentificationDataSource * p_identification_data_source) {
......@@ -37,6 +39,8 @@ MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_id
_p_identification_group = p_identification_group;
_p_identification_data_source = p_identification_data_source;
_regexp_header_line.setPattern("^([a-z,0-9,_]+)=(.*)$");
}
// Destructor: performs no explicit cleanup of its own, so the
// compiler-generated member-wise destruction is all that runs.
MascotDatParser::~MascotDatParser() = default;
......@@ -97,12 +101,15 @@ void MascotDatParser::parseProteinLine(const QString & protein_line) {
}
}
void MascotDatParser::parseHeaderLine(const QString & header_line) {
QRegExp regexp_header_line("^([a-z,0-9,_]+)=(.*)$");
if (regexp_header_line.exactMatch(header_line)) {
QStringList header_list = regexp_header_line.capturedTexts();
if (_regexp_header_line.exactMatch(header_line)) {
QStringList header_list = _regexp_header_line.capturedTexts();
//sequences=73998
//sequences_after_tax=73998
//residues=24900901
if (header_list[1].startsWith("residues")) {
qDebug() << "queries=" << header_list[2];
_number_of_residues = header_list[2].toUInt();
}
//distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
//decoy_type=1
//distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
......@@ -110,10 +117,14 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
//date=1517587671
//time=17:07:51
//queries=54084
else if (header_list[1].startsWith("queries")) {
qDebug() << "queries=" << header_list[2];
_number_of_queries = header_list[2].toUInt();
}
//min_peaks_for_homology=6
//max_hits=50
//version=2.5.0
if (header_list[1] == "version") {
else if (header_list[1] == "version") {
_p_identification_data_source->setIdentificationEngineVersion(header_list[2]);
}
//fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
......@@ -137,46 +148,104 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
}
void MascotDatParser::parsePeptidesLine(const QString & peptide_line) {
QRegExp regexp_header_line("^([a-z,0-9,_]+)=(.*)$");
if (regexp_header_line.exactMatch(peptide_line)) {
QStringList header_list = regexp_header_line.capturedTexts();
QString index = header_list[1];
QString value = header_list[2];
QStringList index_list = index.split("_");
if (index_list.size() == 3) {
if (index_list[2] == "db") {
_peptides_fasta_file_list.clear();
while(value.size() > 0) {
_peptides_fasta_file_list.push_back( _fasta_file_list[value.left(2).toInt()-1]);
value = value.mid(2);
try {
if (_regexp_header_line.exactMatch(peptide_line)) {
QStringList header_list = _regexp_header_line.capturedTexts();
QString index = header_list[1];
QString value = header_list[2];
QStringList index_list = index.split("_");
if (index_list.size() == 3) {
if (index_list[2] == "db") {
//q1_p1_db=02
_peptides_fasta_file_list.clear();
while (value.size() > 0) {
QString fasta_str = value.mid(0,2);
_peptides_fasta_file_list.push_back(_fasta_file_list.at(fasta_str.toInt()-1));
value = value.mid(2);
}
}
}
}
else if (index_list.size() == 2) {
else if (index_list.size() == 2) {
if (value == "-1") {
//no result for this query
}
else {
QString query_index = index_list[0];
QString peptide_index = index_list[1];
//q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
int position = value.indexOf(";\"", 0);
QString peptide_string = value.mid(0, position);
qDebug() << "peptide_string=" << peptide_string;
QStringList peptide_string_list = peptide_string.split(",");
pappso::Peptide peptide(peptide_string_list.at(4));
QString query = index_list[0];
QString peptide = index_list[1];
QString protein_string = value.mid(position+2);
qDebug() << "protein_string=" << protein_string;
//"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
QStringList protein_string_list = protein_string.split(",\"");
if (protein_string_list.size() != _peptides_fasta_file_list.size()) {
throw pappso::PappsoException(QObject::tr("ERROR (protein_string_list.size() != _peptides_fasta_file_list.size()) %1").arg(value));
}
foreach (const QString &str, protein_string_list) {
//sp|O95006|OR2F2_HUMAN":0:299:303:1
int position = str.indexOf("\"", 0);
QString accession = str.mid(0, position);
qDebug() << "accession=" << accession;
QStringList position_list = str.mid(position+2).split(":");
if (position_list.size() != 4) {
throw pappso::PappsoException(QObject::tr("ERROR position_list.size() != 4 %1").arg(value));
}
unsigned int start = position_list.at(1).toUInt();
unsigned int stop = position_list.at(2).toUInt();
}
}
}
/*
q1_p1_db=02
q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
q1_p1_terms=K,L
q1_p2_db=02
q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1
q1_p2_terms=R,-
q2_p1_db=02
q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2
q2_p1_terms=K,K
q2_p2_db=0202
q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1
q2_p2_terms=R,V:R,V
q2_p3_db=02
q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2
q2_p3_terms=K,K
q2_p4_db=0202
q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
q2_p4_terms=K,F:K,F
*/
/*
q856_p9_db=0202
q856_p9=0,685.427521,-0.000117,3,XLLVR,12,0000000,13.68,0000002000000000000,0,0;"tr|V9GY00|V9GY00_HUMAN":0:1:5:1,"tr|H7C3C3|H7C3C3_HUMAN":0:1:5:1
q856_p9_terms=-,L:-,V
q856_p9_subst=1,X,W
*/
}
/*
q1_p1_db=02
q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
q1_p1_terms=K,L
q1_p2_db=02
q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1
q1_p2_terms=R,-
q2_p1_db=02
q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2
q2_p1_terms=K,K
q2_p2_db=0202
q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1
q2_p2_terms=R,V:R,V
q2_p3_db=02
q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2
q2_p3_terms=K,K
q2_p4_db=0202
q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
q2_p4_terms=K,F:K,F
*/
}
catch (pappso::PappsoException exception_pappso) {
_error_str = QObject::tr("ERROR in MascotDatParser::parsePeptidesLine %1, PAPPSO exception:\n%2").arg(peptide_line).arg(exception_pappso.qwhat());
qDebug() << _error_str;
throw pappso::PappsoException(_error_str);
}
catch (std::exception exception_std) {
_error_str = QObject::tr("ERROR in MascotDatParser::parsePeptidesLine %1, std exception:\n%2").arg(peptide_line).arg(exception_std.what());
qDebug() << _error_str;
throw pappso::PappsoException(_error_str);
}
}
......@@ -47,12 +47,18 @@ private:
Project * _p_project;
IdentificationGroup * _p_identification_group;
IdentificationDataSource * _p_identification_data_source;
ProteinXtp _current_protein;
std::vector<FastaFileSp> _fasta_file_list;
std::vector<FastaFileSp> _peptides_fasta_file_list;
QRegExp _regexp_header_line;
unsigned int _number_of_queries=0;
unsigned int _number_of_residues=0;
QString _error_str;
};
#endif // MASCOTDATPARSER_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment