Skip to content
Snippets Groups Projects
Commit c2619415 authored by Langella Olivier's avatar Langella Olivier
Browse files

better mascot parser

parent 7ff50657
No related branches found
No related tags found
No related merge requests found
......@@ -53,24 +53,80 @@ void MascotDatParser::parse(QIODevice * in_stream) {
parseProteinLine( mime_parser.getCurrentTextStream().readLine());
}
}
else if (mime_parser.getCurrentFileName() == "header") {
while(!mime_parser.getCurrentTextStream().atEnd()) {
parseHeaderLine( mime_parser.getCurrentTextStream().readLine());
}
}
}
mime_parser.close();
qDebug() << "MascotDatParser::parse end";
}
void MascotDatParser::parseProteinLine(const QString & protein_line) {
ProteinXtpSp sp_xtp_protein;
//02::"tr|A0A0D9SF80|A0A0D9SF80_HUMAN"=55120.88,"General transcription factor II-I repeat domain-containing protein 2A OS=Homo sapiens GN=GTF2IRD2B PE=4 SV=1"
QRegExp regexp_protein("^(.*)::\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
if (regexp_protein.exactMatch(protein_line)) {
QStringList protein_list = regexp_protein.capturedTexts();
_current_protein.setAccession(protein_list[2]);
_current_protein.setDescription(protein_list[4]);
sp_xtp_protein = _current_protein.makeProteinXtpSp();
sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
}
else {
QRegExp regexp_proteinb("^\"(.*)\"=([0-9]+\\.[0-9]+),\"(.*)\"$");
if (regexp_proteinb.exactMatch(protein_line)) {
QStringList protein_list = regexp_proteinb.capturedTexts();
_current_protein.setAccession(protein_list[1]);
_current_protein.setDescription(protein_list[3]);
sp_xtp_protein = _current_protein.makeProteinXtpSp();
sp_xtp_protein = _p_project->getProteinStore().getInstance(sp_xtp_protein);
}
else {
qDebug() << "MascotDatParser::parseProteinLine error " << protein_line;
}
}
}
void MascotDatParser::parseHeaderLine(const QString & header_line) {
QRegExp regexp_header_line("^([a-z,0-9,_]+)=(.*)$");
if (regexp_header_line.exactMatch(header_line)) {
QStringList header_list = regexp_header_line.capturedTexts();
//sequences=73998
//sequences_after_tax=73998
//residues=24900901
//distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
//decoy_type=1
//distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
//exec_time=484
//date=1517587671
//time=17:07:51
//queries=54084
//min_peaks_for_homology=6
//max_hits=50
//version=2.5.0
if (header_list[1] == "version") {
_p_identification_data_source->setIdentificationEngineVersion(header_list[2]);
}
//fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
else if (header_list[1].startsWith("fastafile")) {
qDebug() << "fastafile=" << header_list[2];
_fasta_file_list.push_back( _p_project->getFastaFileStore().getInstance(FastaFile(header_list[2])));
_p_identification_data_source->addFastaFile(_fasta_file_list.back());
}
//release=ECOLI_INRA_1.fasta
//sequences1=4305
//sequences_after_tax1=4305
//residues1=1356026
//fastafile2=C:/inetpub/mascot/sequence/HUMAN_INRA/current/HUMAN_INRA_1.fasta
//release2=HUMAN_INRA_1.fasta
//sequences2=69693
//sequences_after_tax2=69693
//residues2=23544875
//taskid=151758718701
}
}
......@@ -41,10 +41,15 @@ public:
void parse(QIODevice * in_stream);
private:
// Interprets one protein line ("accession"=mass,"description", optionally
// preceded by a prefix and "::"): stores accession and description in
// _current_protein and hands the resulting protein to the project's protein store.
void parseProteinLine(const QString & protein_line);
// Interprets one "key=value" line of the "header" section: records the engine
// version ("version") and the FASTA files (keys starting with "fastafile").
void parseHeaderLine(const QString & header_line);
private:
Project * _p_project;
IdentificationGroup * _p_identification_group;
IdentificationDataSource * _p_identification_data_source;
ProteinXtp _current_protein;
std::vector<FastaFileSp> _fasta_file_list;
};
#endif // MASCOTDATPARSER_H
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment