Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
xtpcpp
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
PAPPSO
xtpcpp
Commits
3581ca04
Commit
3581ca04
authored
7 years ago
by
Langella Olivier
Browse files
Options
Downloads
Patches
Plain Diff
WIP: Mascot parser
parent
19ab80b7
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/input/mascot/mascotdatparser.cpp
+109
-40
109 additions, 40 deletions
src/input/mascot/mascotdatparser.cpp
src/input/mascot/mascotdatparser.h
+9
-3
9 additions, 3 deletions
src/input/mascot/mascotdatparser.h
with
118 additions
and
43 deletions
src/input/mascot/mascotdatparser.cpp
+
109
−
40
View file @
3581ca04
...
...
@@ -30,6 +30,8 @@
#include
"mascotdatparser.h"
#include
"mimeparser.h"
#include
<QDebug>
#include
<pappsomspp/pappsoexception.h>
#include
<pappsomspp/peptide/peptide.h>
MascotDatParser
::
MascotDatParser
(
Project
*
p_project
,
IdentificationGroup
*
p_identification_group
,
IdentificationDataSource
*
p_identification_data_source
)
{
...
...
@@ -37,6 +39,8 @@ MascotDatParser::MascotDatParser(Project * p_project, IdentificationGroup * p_id
_p_identification_group
=
p_identification_group
;
_p_identification_data_source
=
p_identification_data_source
;
_regexp_header_line
.
setPattern
(
"^([a-z,0-9,_]+)=(.*)$"
);
}
// Destructor: the body is empty, no explicit cleanup is performed here.
MascotDatParser
::~
MascotDatParser
()
{
}
...
...
@@ -97,12 +101,15 @@ void MascotDatParser::parseProteinLine(const QString & protein_line) {
}
}
void
MascotDatParser
::
parseHeaderLine
(
const
QString
&
header_line
)
{
QRegExp
regexp_header_line
(
"^([a-z,0-9,_]+)=(.*)$"
);
if
(
regexp_header_line
.
exactMatch
(
header_line
))
{
QStringList
header_list
=
regexp_header_line
.
capturedTexts
();
if
(
_regexp_header_line
.
exactMatch
(
header_line
))
{
QStringList
header_list
=
_regexp_header_line
.
capturedTexts
();
//sequences=73998
//sequences_after_tax=73998
//residues=24900901
if
(
header_list
[
1
].
startsWith
(
"residues"
))
{
qDebug
()
<<
"queries="
<<
header_list
[
2
];
_number_of_residues
=
header_list
[
2
].
toUInt
();
}
//distribution=64847,0,1,576,2254,1934,1417,984,683,419,238,192,113,89,74,46,23,32,28,21,11,3,2,3,3,2,1,1,0,0,0,0,0,0,1
//decoy_type=1
//distribution_decoy=65162,0,1,552,2259,1891,1422,957,601,376,192,159,126,73,56,64,26,27,22,14,6,4,1,1,3,2,1
...
...
@@ -110,10 +117,14 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
//date=1517587671
//time=17:07:51
//queries=54084
else
if
(
header_list
[
1
].
startsWith
(
"queries"
))
{
qDebug
()
<<
"queries="
<<
header_list
[
2
];
_number_of_queries
=
header_list
[
2
].
toUInt
();
}
//min_peaks_for_homology=6
//max_hits=50
//version=2.5.0
if
(
header_list
[
1
]
==
"version"
)
{
else
if
(
header_list
[
1
]
==
"version"
)
{
_p_identification_data_source
->
setIdentificationEngineVersion
(
header_list
[
2
]);
}
//fastafile=C:/inetpub/mascot/sequence/ECOLI_INRA/current/ECOLI_INRA_1.fasta
...
...
@@ -137,46 +148,104 @@ void MascotDatParser::parseHeaderLine(const QString & header_line) {
}
void
MascotDatParser
::
parsePeptidesLine
(
const
QString
&
peptide_line
)
{
QRegExp
regexp_header_line
(
"^([a-z,0-9,_]+)=(.*)$"
);
if
(
regexp_header_line
.
exactMatch
(
peptide_line
))
{
QStringList
header_list
=
regexp_header_line
.
capturedTexts
();
QString
index
=
header_list
[
1
];
QString
value
=
header_list
[
2
];
QStringList
index_list
=
index
.
split
(
"_"
);
if
(
index_list
.
size
()
==
3
)
{
if
(
index_list
[
2
]
==
"db"
)
{
_peptides_fasta_file_list
.
clear
();
while
(
value
.
size
()
>
0
)
{
_peptides_fasta_file_list
.
push_back
(
_fasta_file_list
[
value
.
left
(
2
).
toInt
()
-
1
]);
value
=
value
.
mid
(
2
);
try
{
if
(
_regexp_header_line
.
exactMatch
(
peptide_line
))
{
QStringList
header_list
=
_regexp_header_line
.
capturedTexts
();
QString
index
=
header_list
[
1
];
QString
value
=
header_list
[
2
];
QStringList
index_list
=
index
.
split
(
"_"
);
if
(
index_list
.
size
()
==
3
)
{
if
(
index_list
[
2
]
==
"db"
)
{
//q1_p1_db=02
_peptides_fasta_file_list
.
clear
();
while
(
value
.
size
()
>
0
)
{
QString
fasta_str
=
value
.
mid
(
0
,
2
);
_peptides_fasta_file_list
.
push_back
(
_fasta_file_list
.
at
(
fasta_str
.
toInt
()
-
1
));
value
=
value
.
mid
(
2
);
}
}
}
}
else
if
(
index_list
.
size
()
==
2
)
{
else
if
(
index_list
.
size
()
==
2
)
{
if
(
value
==
"-1"
)
{
//no result for this query
}
else
{
QString
query_index
=
index_list
[
0
];
QString
peptide_index
=
index_list
[
1
];
//q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
int
position
=
value
.
indexOf
(
";
\"
"
,
0
);
QString
peptide_string
=
value
.
mid
(
0
,
position
);
qDebug
()
<<
"peptide_string="
<<
peptide_string
;
QStringList
peptide_string_list
=
peptide_string
.
split
(
","
);
pappso
::
Peptide
peptide
(
peptide_string_list
.
at
(
4
));
QString
query
=
index_list
[
0
];
QString
peptide
=
index_list
[
1
];
QString
protein_string
=
value
.
mid
(
position
+
2
);
qDebug
()
<<
"protein_string="
<<
protein_string
;
//"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
QStringList
protein_string_list
=
protein_string
.
split
(
",
\"
"
);
if
(
protein_string_list
.
size
()
!=
_peptides_fasta_file_list
.
size
())
{
throw
pappso
::
PappsoException
(
QObject
::
tr
(
"ERROR (protein_string_list.size() != _peptides_fasta_file_list.size()) %1"
).
arg
(
value
));
}
foreach
(
const
QString
&
str
,
protein_string_list
)
{
//sp|O95006|OR2F2_HUMAN":0:299:303:1
int
position
=
str
.
indexOf
(
"
\"
"
,
0
);
QString
accession
=
str
.
mid
(
0
,
position
);
qDebug
()
<<
"accession="
<<
accession
;
QStringList
position_list
=
str
.
mid
(
position
+
2
).
split
(
":"
);
if
(
position_list
.
size
()
!=
4
)
{
throw
pappso
::
PappsoException
(
QObject
::
tr
(
"ERROR position_list.size() != 4 %1"
).
arg
(
value
));
}
unsigned
int
start
=
position_list
.
at
(
1
).
toUInt
();
unsigned
int
stop
=
position_list
.
at
(
2
).
toUInt
();
}
}
}
/*
q1_p1_db=02
q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
q1_p1_terms=K,L
q1_p2_db=02
q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1
q1_p2_terms=R,-
q2_p1_db=02
q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2
q2_p1_terms=K,K
q2_p2_db=0202
q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1
q2_p2_terms=R,V:R,V
q2_p3_db=02
q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2
q2_p3_terms=K,K
q2_p4_db=0202
q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
q2_p4_terms=K,F:K,F
*/
/*
q856_p9_db=0202
q856_p9=0,685.427521,-0.000117,3,XLLVR,12,0000000,13.68,0000002000000000000,0,0;"tr|V9GY00|V9GY00_HUMAN":0:1:5:1,"tr|H7C3C3|H7C3C3_HUMAN":0:1:5:1
q856_p9_terms=-,L:-,V
q856_p9_subst=1,X,W
*/
}
/*
q1_p1_db=02
q1_p1=0,597.302322,0.997884,2,GAWHK,9,0000000,7.97,0000012000000000000,0,0;"sp|O95006|OR2F2_HUMAN":0:299:303:1
q1_p1_terms=K,L
q1_p2_db=02
q1_p2=1,598.296219,0.003987,2,KEEPP,11,0000000,1.32,0002000000000000000,0,0;"tr|E9PNM8|E9PNM8_HUMAN":0:134:138:1
q1_p2_terms=R,-
q2_p1_db=02
q2_p1=1,598.380234,-0.000316,3,KAGVPK,13,00000000,17.13,0002011000000000000,0,0;"tr|H7C1P9|H7C1P9_HUMAN":0:945:950:2
q2_p1_terms=K,K
q2_p2_db=0202
q2_p2=1,598.380234,-0.000316,2,KQPVK,9,0000000,9.54,0002011000000000000,0,0;"sp|P33527|MRP1_HUMAN":0:270:274:1,"tr|I3L4X2|I3L4X2_HUMAN":0:169:173:1
q2_p2_terms=R,V:R,V
q2_p3_db=02
q2_p3=1,598.380234,-0.000316,2,KAVPGK,13,00000000,7.03,0002001000000000000,0,0;"sp|Q13061|TRDN_HUMAN":0:440:445:2
q2_p3_terms=K,K
q2_p4_db=0202
q2_p4=1,598.380234,-0.000316,2,IPGGKK,14,00000000,1.26,0002001000000000000,0,0;"sp|Q9Y2I7|FYV1_HUMAN":0:670:675:2,"tr|E9PDH4|E9PDH4_HUMAN":0:614:619:2
q2_p4_terms=K,F:K,F
*/
}
catch
(
pappso
::
PappsoException
exception_pappso
)
{
_error_str
=
QObject
::
tr
(
"ERROR in MascotDatParser::parsePeptidesLine %1, PAPPSO exception:
\n
%2"
).
arg
(
peptide_line
).
arg
(
exception_pappso
.
qwhat
());
qDebug
()
<<
_error_str
;
throw
pappso
::
PappsoException
(
_error_str
);
}
catch
(
std
::
exception
exception_std
)
{
_error_str
=
QObject
::
tr
(
"ERROR in MascotDatParser::parsePeptidesLine %1, std exception:
\n
%2"
).
arg
(
peptide_line
).
arg
(
exception_std
.
what
());
qDebug
()
<<
_error_str
;
throw
pappso
::
PappsoException
(
_error_str
);
}
}
This diff is collapsed.
Click to expand it.
src/input/mascot/mascotdatparser.h
+
9
−
3
View file @
3581ca04
...
...
@@ -47,12 +47,18 @@ private:
Project
*
_p_project
;
IdentificationGroup
*
_p_identification_group
;
IdentificationDataSource
*
_p_identification_data_source
;
ProteinXtp
_current_protein
;
std
::
vector
<
FastaFileSp
>
_fasta_file_list
;
std
::
vector
<
FastaFileSp
>
_peptides_fasta_file_list
;
QRegExp
_regexp_header_line
;
unsigned
int
_number_of_queries
=
0
;
unsigned
int
_number_of_residues
=
0
;
QString
_error_str
;
};
#endif // MASCOTDATPARSER_H
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment