WIP: reading X!Tandem files with the XML stream reader seems to work fine

349d8838 · Langella Olivier · fdd9cadb · 349d8838 · 349d8838
Commit 349d8838 authored 2 years ago by Langella Olivier
--- a/src/input/tandem/tandemreader.cpp
+++ b/src/input/tandem/tandemreader.cpp
@@ -209,11 +209,12 @@ TandemReader::readGroup()
                              while(m_qxmlStreamReader.readNext() ==
                                    QXmlStreamReader::TokenType::Characters)
                                {
-                                  protein_sequence +=
-                                    m_qxmlStreamReader.Characters;
+                                  qDebug() << m_qxmlStreamReader.text();
+                                  protein_sequence += m_qxmlStreamReader.text();
                                }
                              protein_sequence =
                                protein_sequence.simplified().replace(" ", "");
+                              qDebug() << protein_sequence;
                              if(!protein_sequence.isEmpty())
                                {
                                  //._sequence.replace(QRegExp("\\*"),
@@ -254,10 +255,17 @@ TandemReader::readGroup()
                    }
                }

-              //m_qxmlStreamReader.skipCurrentElement();
+              // m_qxmlStreamReader.skipCurrentElement();
+            }
+          else if(tandem_group.type == "parameters")
+            {
+
+              readGroupTypeParameters(tandem_group);
            }
          else
            {
+              m_qxmlStreamReader.raiseError(
+                QObject::tr("Not an X!Tandem input file (no bioml)"));
              m_qxmlStreamReader.skipCurrentElement();
            }
        }
@@ -520,3 +528,298 @@ TandemReader::readDomain(TandemReader::TandemGroup &tandem_group)
      delete p_peptide_evidence;
    }
 }
+/// Reads the <note> children of a <group type="parameters"> element and copies
+/// the values of the recognized labels into mp_identificationDataSource.
+void
+TandemReader::readGroupTypeParameters(TandemGroup &tandem_group)
+
+{
+  // tandem_group is only used for the debug trace of its label just below
+  qDebug() << tandem_group.label;
+  while(m_qxmlStreamReader.readNextStartElement())
+    {
+      // each child is expected to be a <note>; its label selects the handling
+      qDebug() << m_qxmlStreamReader.name();
+      if(m_qxmlStreamReader.name() == "note")
+        {
+          QString type =
+            m_qxmlStreamReader.attributes().value("type").toString();
+          QString label =
+            m_qxmlStreamReader.attributes().value("label").toString();
+          QString text = m_qxmlStreamReader.readElementText();
+          // NOTE(review): `type` is read above but never used in this function
+          //<group label="input parameters" type="parameters">
+          //<note type="input" label="list path, default
+          // parameters">/gorgone/pappso/tmp/temp_condor_job8533994640337729751189420695540169/QExactive_analysis_FDR_nosemi.xml</note>
+          if(label == "list path, default parameters")
+            {
+              mp_identificationDataSource->setIdentificationEngineParam(
+                IdentificationEngineParam::tandem_param, text);
+            }
+          /*
+          <note type="input" label="list path, taxonomy
+          information">/gorgone/pappso/tmp/temp_condor_job8533994640337729751189420695540169/database.xml</note>
+          <note type="input" label="output, histogram column width">30</note>
+          <note type="input" label="output, histograms">yes</note>
+          <note type="input" label="output, maximum valid expectation
+          value">0.05</note> <note type="input" label="output, maximum valid
+          protein expectation value">0.05</note> <note type="input"
+          label="output, one sequence copy">yes</note> <note type="input"
+          label="output, parameters">yes</note> <note type="input"
+          label="output,
+          path">/gorgone/pappso/formation/TD/xml_tandem/20120906_balliau_extract_1_A02_urzb-1.xml</note>
+          <note type="input" label="output, path hashing">no</note>
+          <note type="input" label="output, performance">yes</note>
+          <note type="input" label="output, proteins">yes</note>
+          <note type="input" label="output, results">valid</note>
+          <note type="input" label="output, sequences">yes</note>
+          <note type="input" label="output, sort results by">spectrum</note>
+          <note type="input" label="output, spectra">yes</note>
+          <note type="input" label="output, xsl path">tandem-style.xsl</note>
+          <note type="input" label="protein, C-terminal residue modification
+          mass">0.0</note> <note type="input" label="protein, N-terminal residue
+          modification mass">0.0</note> <note type="input" label="protein,
+          cleavage C-terminal mass change">+17.00305</note> <note type="input"
+          label="protein, cleavage N-terminal mass change">+1.00794</note> <note
+          type="input" label="protein, cleavage semi">no</note> <note
+          type="input" label="protein, cleavage site">[RK]|{P}</note> <note
+          type="input" label="protein, modified residue mass file"></note> <note
+          type="input" label="protein, quick acetyl">yes</note> <note
+          type="input" label="protein, quick pyrolidone">yes</note> <note
+          type="input" label="protein, stP bias">yes</note> <note type="input"
+          label="protein, taxon">usedefined</note> <note type="input"
+          label="refine">yes</note> <note type="input" label="refine, cleavage
+          semi">no</note> <note type="input" label="refine, maximum valid
+          expectation value">0.01</note> <note type="input" label="refine,
+          modification mass">57.02146@C</note> <note type="input" label="refine,
+          modification mass 1"></note> <note type="input" label="refine, point
+          mutations">no</note> <note type="input" label="refine, potential
+          C-terminus modifications"></note> <note type="input" label="refine,
+          potential N-terminus modifications">+42.01056@[</note> <note
+          type="input" label="refine, potential modification
+          mass">15.99491@M</note> <note type="input" label="refine, potential
+          modification mass 1"></note> <note type="input" label="refine,
+          potential modification motif"></note> <note type="input"
+          label="refine, potential modification motif 1"></note> <note
+          type="input" label="refine, spectrum synthesis">yes</note> <note
+          type="input" label="refine, unanticipated cleavage">no</note> <note
+          type="input" label="refine, use potential modifications for full
+          refinement">yes</note> <note type="input" label="residue, modification
+          mass">57.02146@C</note> <note type="input" label="residue,
+          modification mass 1"></note> <note type="input" label="residue,
+          potential modification mass">15.99491@M</note> <note type="input"
+          label="residue, potential modification motif"></note> <note
+          type="input" label="scoring, a ions">no</note> <note type="input"
+          label="scoring, b ions">yes</note> <note type="input" label="scoring,
+          c ions">no</note> <note type="input" label="scoring, cyclic
+          permutation">yes</note> <note type="input" label="scoring, include
+          reverse">yes</note> <note type="input" label="scoring, maximum missed
+          cleavage sites">1</note> <note type="input" label="scoring, minimum
+          ion count">4</note> <note type="input" label="scoring, x
+          ions">no</note> <note type="input" label="scoring, y ions">yes</note>
+          <note type="input" label="scoring, z ions">no</note>
+          <note type="input" label="spectrum, dynamic range">100.0</note>
+          <note type="input" label="spectrum, fragment mass
+          type">monoisotopic</note> <note type="input" label="spectrum, fragment
+          monoisotopic mass error">0.02</note> <note type="input"
+          label="spectrum, fragment monoisotopic mass error
+          units">Daltons</note> <note type="input" label="spectrum, maximum
+          parent charge">4</note> <note type="input" label="spectrum, minimum
+          fragment mz">150.0</note> <note type="input" label="spectrum, minimum
+          parent m+h">500.0</note> <note type="input" label="spectrum, minimum
+          peaks">15</note> <note type="input" label="spectrum, neutral loss
+          mass">18.01057</note> <note type="input" label="spectrum, neutral loss
+          window">0.02</note> <note type="input" label="spectrum, parent
+          monoisotopic mass error minus">10</note> <note type="input"
+          label="spectrum, parent monoisotopic mass error plus">10</note> <note
+          type="input" label="spectrum, parent monoisotopic mass error
+          units">ppm</note> <note type="input" label="spectrum, parent
+          monoisotopic mass isotope error">yes</note>
+          */
+          //<note type="input" label="spectrum,
+          // path">/gorgone/pappso/formation/TD/mzXML/20120906_balliau_extract_1_A02_urzb-1.mzXML</note>
+
+          if(label == "spectrum, path")
+            {
+              //_sp_msrun.get()->setFileName(_current_text);
+              // already set by tandem info parser
+            }
+
+          /*
+          <note type="input" label="spectrum, sequence batch size">1000</note>
+          <note type="input" label="spectrum, threads">1</note>
+          <note type="input" label="spectrum, total peaks">100</note>
+          <note type="input" label="spectrum, use contrast angle">no</note>
+          <note type="input" label="spectrum, use neutral loss
+          window">yes</note> <note type="input" label="spectrum, use noise
+          suppression">yes</note>
+          </group>
+
+          */
+
+          //<group label="unused input parameters"  type="parameters">
+
+          /*
+            <note type="input" label="protein, use minimal
+          annotations">yes</note> <note type="input" label="refine, modification
+          mass 2"></note> <note type="input" label="refine, potential
+          modification mass 2"></note> <note type="input" label="refine,
+          potential modification motif 2"></note> <note type="input"
+          label="residue, modification mass 2"></note> <note type="input"
+          label="residue, potential modification mass 1"></note> <note
+          type="input" label="residue, potential modification mass 2"></note>
+          <note type="input" label="residue, potential modification motif
+          1"></note> <note type="input" label="residue, potential modification
+          motif 2"></note> <note type="input" label="scoring, pluggable
+          scoring">no</note>
+          </group>
+          */
+
+          //<group label="performance parameters" type="parameters">
+
+          //<note label="list path, sequence source
+          //#1">/gorgone/pappso/formation/TD/Database/Genome_Z_mays_5a.fasta</note>
+          //<note label="list path, sequence source
+          //#2">/gorgone/pappso/formation/TD/Database/contaminants_standarts.fasta</note>
+          if(label.startsWith("list path, sequence source #"))
+            {
+              mp_identificationDataSource->addFastaFile(
+                mp_project->getFastaFileStore().getInstance(FastaFile(text)));
+            }
+
+          /*
+          <note label="list path, sequence source description #1">no
+          description</note> <note label="list path, sequence source description
+          #2">no description</note> <note label="modelling, duplicate peptide
+          ids">6019</note> <note label="modelling, duplicate
+          proteins">19735</note> <note label="modelling, estimated false
+          positives">18</note> <note label="modelling, reversed sequence false
+          positives">20</note> <note label="modelling, spectrum noise
+          suppression ratio">0.00</note>
+          */
+          //<note label="modelling, total peptides used">96618641</note>
+          if(label == "modelling, total peptides used")
+            {
+              mp_identificationDataSource->setIdentificationEngineStatistics(
+                IdentificationEngineStatistics::total_peptide_used,
+                text.toUInt());
+            }
+
+          //<note label="modelling, total proteins used">273656</note>
+          if(label == "modelling, total proteins used")
+            {
+              mp_identificationDataSource->setIdentificationEngineStatistics(
+                IdentificationEngineStatistics::total_proteins_used,
+                text.toUInt());
+            }
+          //<note label="modelling, total spectra assigned">7464</note>
+          if(label == "modelling, total spectra assigned")
+            {
+              mp_identificationDataSource->setIdentificationEngineStatistics(
+                IdentificationEngineStatistics::total_spectra_assigned,
+                text.toUInt());
+            }
+          //<note label="modelling, total spectra used">12199</note>
+          if(label == "modelling, total spectra used")
+            {
+
+              qDebug() << label;
+              mp_identificationDataSource->setIdentificationEngineStatistics(
+                IdentificationEngineStatistics::total_spectra_used,
+                text.toUInt());
+            }
+          //<note label="modelling, total unique assigned">6260</note>
+          if(label == "modelling, total unique assigned")
+            {
+
+              qDebug() << label;
+              mp_identificationDataSource->setIdentificationEngineStatistics(
+                IdentificationEngineStatistics::total_unique_assigned,
+                text.toUInt());
+            }
+          qDebug() << label;
+          if(label == "spectrum, timstof MS2 centroid parameters")
+            {
+              qDebug() << label;
+              mp_identificationDataSource->setTimstofMs2CentroidParameters(
+                text);
+              // file names ending in ".tdf" or ".d" are treated as TimsTOF runs
+              if((msp_msrun.get()->getFileName().endsWith(".tdf")) ||
+                 (msp_msrun.get()->getFileName().endsWith(".d")))
+                {
+                  // TimsTOF tandem result file: scan numbers are in fact
+                  // spectrum indexes, so flag that on the peptide evidence store
+                  mp_identificationDataSource->getPeptideEvidenceStore()
+                    .ensureSpectrumIndexRef();
+                }
+            }
+          if(label == "output, spectrum index")
+            {
+              // TODO in pappsomspp : change tandem output to explicitly tell if
+              // we are dealing with spectrum index
+              qDebug() << label;
+
+              if(text == "true")
+                { // this MUST be spectrum index instead of scan numbers
+                  mp_identificationDataSource->getPeptideEvidenceStore()
+                    .ensureSpectrumIndexRef();
+                }
+            }
+          if(label == "spectrum, timstof MS2 filters")
+            {
+              qDebug() << label;
+              // the run file name decides whether this is a TimsTOF result;
+              // unlike the centroid-parameters case, the note text itself is
+              // not stored here
+              if((msp_msrun.get()->getFileName().endsWith(".tdf")) ||
+                 (msp_msrun.get()->getFileName().endsWith(".d")))
+                {
+                  // TimsTOF tandem result file: scan numbers are in fact
+                  // spectrum indexes, so flag that on the peptide evidence store
+                  mp_identificationDataSource->getPeptideEvidenceStore()
+                    .ensureSpectrumIndexRef();
+                }
+            }
+          //<note label="process, start time">2013:12:20:16:47:19</note>
+          // only the text between parentheses is kept as the engine version
+          //<note label="process, version">X! Tandem Sledgehammer
+          //(2013.09.01.1)</note>
+          if(label == "process, version")
+            {
+              QRegExp rx("\\((.*)\\)");
+              if(rx.indexIn(text, 0) != -1)
+                {
+                  mp_identificationDataSource->setIdentificationEngineVersion(
+                    rx.cap(1));
+                }
+              qDebug() << mp_identificationDataSource
+                            ->getIdentificationEngineVersion();
+            }
+          /*
+          <note label="quality values">243 476 437 382 384 417 399 416 346 387
+          390 382 321 355 311 283 253 272 251 228</note> <note label="refining,
+          # input models">4893</note> <note label="refining, # input
+          spectra">5520</note> <note label="refining, # partial
+          cleavage">326</note> <note label="refining, # point
+          mutations">0</note> <note label="refining, # potential
+          C-terminii">0</note> <note label="refining, # potential
+          N-terminii">392</note> <note label="refining, # unanticipated
+          cleavage">0</note> <note label="timing, initial modelling total
+          (sec)">170.96</note> <note label="timing, initial modelling/spectrum
+          (sec)">0.0140</note> <note label="timing, load sequence models
+          (sec)">0.33</note> <note label="timing, refinement/spectrum
+          (sec)">0.0141</note>
+          </group>
+          */
+        }
+      // any child other than <note>: raiseError(), then skipCurrentElement()
+      // NOTE(review): message mentions bioml, but this is a group child - confirm
+      else
+        {
+          m_qxmlStreamReader.raiseError(
+            QObject::tr("Not an X!Tandem input "
+                        "file (no bioml)"));
+          m_qxmlStreamReader.skipCurrentElement();
+        }
+    }
+}
--- a/src/input/tandem/tandemreader.h
+++ b/src/input/tandem/tandemreader.h
@@ -74,6 +74,7 @@ class TandemReader : public pappso::XmlStreamReaderInterface
    PeptideMatch peptideMatch;
  };
  void readDomain(TandemGroup &tandem_group);
+  void readGroupTypeParameters(TandemGroup &tandem_group);

  private:
  pappso::UiMonitorInterface *mp_monitor;