pepParser.h 3.67 KB
Newer Older
Edlira Nano's avatar
Edlira Nano committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
*
* MassChroQ: Mass Chromatogram Quantification software.
* Copyright (C) 2010 Olivier Langella, Edlira Nano, Benoit Valot, Michel Zivy.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*
*/
20
21
22
23
24
25
26
/**
 * \file pepParser.h
 * \date December 03, 2010
 * \author Edlira Nano
 */

#ifndef PEP_PARSER_H_
27
#define PEP_PARSER_H_ 1
28

29
30
#include <QFileInfo>
#include <QTextStream>
31
32

class MassChroqDomEngine;
33

34
35
/**
 * \class PepParser
 * \brief Parser of a peptide identification text file.
 *
 * The peptide text file is organised into rows and columns. Columns
 * are separated by a tabulation, a comma ',' or a semi-colon ';'. Once a separation
 * character is used in the header, every separator in the file has to be the same,
 * otherwise an error is thrown. The text file must contain the following header :
 * scan[sep]sequence[sep]mh[sep]z[sep]protein
 * and an optional 'mods' 6th column can be added.
 * 'sequence' is the sequence of a peptide,
 * 'mh' is its mass + mass of an H+ ion set to 1.007825 in MassChroQ,
 * 'protein' is the description of a single protein the peptide is identified in,
 * 'scan' is the scan number of the spectrum in which the peptide is observed
 * in the mzXML/mzML file,
 * 'z' is its charge,
 * 'mods' is an optional column containing free text and/or numbers that the user chooses.
 * If the same peptide is identified in more than one protein, one
 * line per protein must be filled (same peptide, scan, mh ...)
 */

54

55
56
57
58
class PepParser {
  
 public : 
  
59
  /// Constructor that takes the name of the peptide file to parse  
60
  PepParser(const QString & filename);
61
62
63
  
  virtual ~PepParser();
  
64
65
  /// main method doing all the parsing and passing proteins and
  /// peptides information to the dom reader/writer class
Edlira Nano's avatar
Edlira Nano committed
66
  void parse(MassChroqDomEngine * m_engine);
67
68

 protected :
Edlira Nano's avatar
Edlira Nano committed
69
 
70
  /// method that takes a line and parses it
71
72
  void processLine(QString & line);

73
  /// verifies if the given line is a header line or not
74
75
  const bool isHeader(const QString & header_line);

76
77
  /// get the separator for the peptide file using header line;
  /// the other lines will be expected to use the same separator.
78
79
  const QString getSeparator(const QString & header_line);

80
  /// parses a peptide sequence 
Edlira Nano's avatar
Edlira Nano committed
81
  const QString processSequence(QString & seq_token);
82

83
84
  /// parses a peptide mh
  const QString processMh(QString & mh_token);
85

86
  /// parses a protein description field field
87
  const QString processProtOrMods(QString & prot_token);
88

89
  /// parses a scan number field
Edlira Nano's avatar
Edlira Nano committed
90
  const int processScan(QString & scan_token);
91

92
  /// parses a peptide charge  field
Edlira Nano's avatar
Edlira Nano committed
93
  const int processCharge(QString & charge_token);
94
 
95
96
 private :
  
97
98
99
100
  const QString whatSeparator() const;

  /// peptide's text file information
  QFile * _pep_file;
101
  
102
  /// peptide's text file input stream
103
104
  QTextStream * _pep_txt_stream;

105
  /// separator used for this file
106
107
  QString _sep;
  
108
  /// line number being currently parsed
109
  unsigned int _line_number;
110
111
  
  /// current parsed data (in the current line)
112
113
  QString _current_sequence;
  QString _current_prot_desc;
114
  QString _current_mh;
115
116
  int _current_scan;
  int _current_z;
117
  QString _current_mods;
118
  
119
   
120
121
};
#endif /* PEP_PARSER_H_ */