% xtandem_pipeline.tex

\documentclass[10pt,a4paper]{article}
\usepackage[utf8x]{inputenc}
\usepackage{ucs}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage[colorlinks=true,urlcolor=blue,linkcolor=black]{hyperref}
\usepackage{graphicx}
\usepackage{fancyhdr}
\usepackage{geometry}

\newcommand{\X}{\textbf{X!Tandem pipeline}}

%\usepackage{enumitem}
%\setdescription{labelsep=\textwidth}

\author{Benoit Valot\\
\texttt{valot@moulon.inra.fr}\\
PAPPSO - \url{http://pappso.inra.fr/}\\
\includegraphics[width=1cm]{images/pappso.pdf}
}
\title{$\X$\\Automated analyses, filtering and export of X!Tandem MS/MS results}
\date{29 October 2010}

%Modification des entetes et pied de page + marges
\geometry{top=3cm, bottom=3cm, left=2cm, right=2cm}
%\pagestyle{headings}
\pagestyle{fancy}
%\fancyhead{}
\fancyfoot{}
\rfoot{\thepage}
\lfoot{\includegraphics[width=1cm]{images/pappso.pdf}}


\begin{document}
\maketitle


\begin{abstract}
\href{http://www.thegpm.org/tandem/index.html}{X!Tandem} is an open-source software performing peptide/protein identification from MS/MS mass spectra. X!Tandem is fast and accurate, but the Global Proteome Machine (\href{http://www.thegpm.org/}{GPM}) is relatively limited regarding the processing of identification results. 
$\X$ is an alternative to the installation of the GPM on local servers. 

\paragraph*{}
$\X$ performs database searching and matching on a list of MS/MS runs in one shot, using a list of easily user-selected parameters and databases.

\paragraph*{}
$\X$ also performs filtering of data according to statistical values at peptide and protein levels. The results are stored into TSV (Tab Separated Values) files. Moreover, redundancy in the protein databases is fully filtered as follows:
\begin{itemize}
\item proteins identified without specific peptides compared to others are eliminated;
\item proteins identified with the same pool of peptides are assembled;
\item proteins are grouped by function (identified with at least one common peptide), and the specific peptides for each sub-group of proteins are indicated.
\end{itemize}
\end{abstract}

\tableofcontents

\pagebreak 

\section{Installation}

\subsection{License}
\paragraph*{}
Copyright (C) 2010  Valot Benoit\\
$\X$ program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.\\
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the \href{http://www.gnu.org/licenses/gpl.html}{GNU General Public License} for more details.

\subsection{Requirements}
$\X$ works on all platforms (Linux, Windows and Mac). Java 1.6 must be installed (it can be downloaded
\href{https://cds.sun.com/is-bin/INTERSHOP.enfinity/WFS/CDS-CDS_Developer-Site/en_US/-/USD/ViewProductDetail-Start?ProductRef=jre-6u14-oth-JPR@CDS-CDS_Developer}{here}).

\subsection{Third-party software for Windows}
\begin{enumerate}
\item Download the \href{http://pappso.inra.fr/downloads/xtandem_pipeline.zip}{$\X$ archive} and unzip it.
\item Create a folder named ``Benperl/'' directly in the C:/ directory.
\item Move the folder ``Xtandem'' from the archive to the new folder ``C:/Benperl/''.
\end{enumerate}
You can also download the executable from the \href{ftp://ftp.thegpm.org/projects/tandem/binaries/}{GPM site} (32 and 64 bits).

\subsection{Third-party software for Linux}
\subsubsection*{Ubuntu}
\begin{itemize}
\item Add this \href{https://launchpad.net/~olivier-langella/+archive/ppa}{repository}. 
\item Install the \textit{xtandem-tornado} package.
\end{itemize}
\subsubsection*{Other distributions}
\begin{itemize}
\item Download the \href{ftp://ftp.thegpm.org/projects/tandem/source/}{sources} and follow the compilation instructions.
\end{itemize}

\subsection{Third-party software for Mac}
Download the executable from the \href{ftp://ftp.thegpm.org/projects/tandem/binaries/}{GPM site}.

\begin{figure}[!ht]
\centering \includegraphics[scale=0.5]{images/tandem_principal.png}
\caption{Principal window}
\label{principal}
\end{figure}

\subsection{Start X!Tandem pipeline}
\paragraph*{}
To run \X, simply :
\begin{itemize}
\item Open X!Tandem pipeline by using this \href{http://pappso.inra.fr/documents/bioinformatique/xtandem_parser.jnlp}{link}
\item Wait for the program to execute
\item The principal window will appear (Fig~\ref{principal})
\end{itemize}

\subsection{Configuration}
\begin{itemize}
\item Open the menu X!Tandem $\rightarrow$ Configuration (Fig~\ref{configuration}).
\item Define the path to the X!Tandem executable
\item Choose the folder where to store the X!Tandem parameters
\item Choose the folder where the MS/MS data, the protein databases and the X!Tandem results are stored
\end{itemize}

\begin{figure}[!ht]
\centering \includegraphics[scale=0.5]{images/tandem_configuration.png}
\caption{Configuration window}
\label{configuration}
\end{figure}

\pagebreak 

\section{X!Tandem analysis}
\paragraph*{}
$\X$ allows you to analyze peak-list files by searching a list of protein databases using the X!Tandem software.
Three successive graphical boxes help you select first the mzXML files or other peak-lists, then the protein databases and finally the folder where the results will be stored. The databases must be protein databases; X!Tandem does not work on DNA databases.

\subsection{Parameters}
\label{parameter}
\paragraph*{}
To perform database searching, you must create or edit a model XML file (stored in the xtandem models folder). Open the menu X!Tandem $\rightarrow$ Parameters (Fig~\ref{xtandem_parameter}).

\begin{figure}[!h]
\centering \includegraphics[scale=0.4]{images/tandem_parameter.png}
\caption{Parameter window}
\label{xtandem_parameter}
\end{figure}

\paragraph*{}
To take full advantage of your computer, specify the number of CPUs to use in the model: spectrum $\rightarrow$ threads.

\subsection{Running analysis}
\paragraph*{}
To perform analysis, start the menu X!Tandem $\rightarrow$ Analysis.
\begin{enumerate}
\item Select the peak-list files to be analyzed (see Section~\ref{peak})
\item Select the database files to be searched (see Section~\ref{database})
\item Select the folder where to store the result files
\item Select the searching parameters model (see Section~\ref{parameter})
\end{enumerate}

\subsection{Peak-lists}
\label{peak}
\paragraph*{}
X!Tandem works with open peak-list files like mzXML, mgf, mzData, mzML or pkl files.

\subsection{Databases}
\label{database}
\paragraph*{}
X!Tandem software uses only protein databases in fasta format. It doesn't work with EST\footnote{Expressed Sequence Tag} sequences. You can transform your database using our application \textit{Protein database manager}, available \href{http://pappso.inra.fr/bioinformatique.html}{here}, or you can directly run it \href{http://pappso.inra.fr/documents/bioinformatique/database_manager.jnlp}{here}.

\pagebreak

\section{Processing the results}
\paragraph*{Warning:}
To process results, $\X$ needs to have X!Tandem result files (.xml). The names of the files are used as \textbf{sample names}.

\subsection{Three modes of analysis}
\label{mode}
\paragraph*{}
You can filter the MS/MS identification results and export them in three different modes (menu Processing):
\begin{description}
\item[Individual mode] \hfill  \\ Each MS/MS result file is processed individually.\\
You cannot perform comparison by using this process.
\item[Combined mode] \hfill  \\ The MS/MS result files are combined in one result file, and this file is filtered / exported.\\
This mode is useful to compare different results.
\item[Phosphopeptide mode] \hfill  \\ Same as the combined mode analysis, except that only phosphopeptides are retained and the results are organized so as to help validate phosphosites.
\end{description} 

\paragraph*{}
In all modes, you have to :
\begin{enumerate}
\item Select the XML result files
\item Define the filter parameters (see Section~\ref{filtering})
\item Define the name of the result file to export
\item Define the export parameters (see Section~\ref{exporting})
\end{enumerate}

\subsection{Filter parameters}
\label{filtering}
The filter window (Fig~\ref{filter}) defines the automated filtering process parameters :
\small
\begin{description}
\item[Peptide E-value] \hfill \\Defines the E-value threshold below which a peptide is considered valid.
\item[Peptide number] \hfill \\Defines the number of valid unique\footnote{Unique peptides are defined as peptides with different sequences. This excludes peptides with different modifications.} peptides necessary to validate a protein.
\item[Protein E-value] \hfill \\Defines the E-value threshold below which a protein is considered valid.
\begin{itemize}
\item The protein E-value is the product of its valid unique peptide E-values and it is different from the protein E-values determined by X!Tandem. 
\item The values are expressed in log(E-value).
\end{itemize}
\item[Sum to all] \hfill \\Defines how protein filter is performed when MS/MS results are combined :
\begin{description}
\item[No] To validate a protein, the 2 parameters (peptide number and protein E-value) must be valid in at least one result.
Interesting if one wants to compare 2DLC-MS/MS results, where peptides from a protein are in the same LC-MS/MS run.
\item[Yes] To validate a protein, the 2 parameters (peptide number and protein E-value) must be valid in the sum of all results.
Interesting if one wants to compare SDS-PAGE-LC-MS/MS results, where peptides from a protein are split in different LC-MS/MS runs.
\end{description}
\item[Phosphopeptide] \hfill \\Keep only peptides containing phosphorylated residue modifications.
All other peptides are invalidated.
\item[Contaminants] \hfill \\When you perform an analysis using different fasta databases, you can remove the result from one database by selecting this database.
Interesting because it allows you to always include the same contaminant proteins during the database search, and because it removes the contaminant proteins from the results.
\item[Add results] \hfill \\At this stage, you can add other MS/MS result files to the analysis. If two files have the same name, they are combined in one result file. Interesting if one wants to combine X!Tandem results of the same LC-MS/MS run using different modification parameters or protein databases.
\end{description}
\normalsize
\begin{figure}[!ht]
\centering \includegraphics[scale=0.5]{images/tandem_filter.png}
\caption{Filter window}
\label{filter}
\end{figure}

\pagebreak 

\subsection{Export parameters}
\label{exporting}
The export window (Fig~\ref{export}) shows the different types of available exports :
\small
\begin{description}
\item[Default] \hfill \\Creates TSV files containing identification results for proteins (*protein.txt) and peptides (*peptide.txt). When you perform a combined analysis, a *compar.txt file is created that contains the results of comparison between samples.
\item[Fasta] \hfill \\Creates a fasta file for valid proteins.
\item[PepNovo] \hfill \\Creates an XML file containing the peptide results to be removed for an automated \textit{De Novo} interpretation in sequence using our \href{http://pappso.inra.fr/documents/bioinformatique/DeNovo_pipeline.pdf}{DeNovo pipeline}.
\item[FDR] \hfill \\Creates a tabulated file containing the number of valid peptides for the different peptide E-values in each database. Allows you to determine the E-value threshold below which the FDR is acceptable.\\
\label{fdr}\textbf{Warning}: Use very permissive filter parameters for the peptide (0.1) and protein ($-1$) E-values, and set the number of unique peptides to validate a protein to 1.
\item[Protic] \hfill \\Creates a PROTICdb compatible XML file, so you can store results in the \href{http://pappso.inra.fr/bioinformatique.html}{PROTICdb} proteomic database.
\item[MassChroQ] \hfill \\Creates a MassChroQ compatible XML file, so you can perform quantitative analysis using our home-made software \textbf{MassChroQ} (to be released soon).
\end{description}
\normalsize
\begin{figure}[!ht]
\centering \includegraphics[scale=0.5]{images/tandem_export.png}
\caption{Export window}
\label{export}
\end{figure}

\pagebreak

\section{Exporting the results}
\subsection{Files *protein.txt}
The identified proteins are represented by sample (individual mode) or for all samples (combined/phosphopeptide modes) (Fig~\ref{prot}). Proteins are grouped by function.
\small
\begin{description}
\item[Group] Group to which the protein belongs. All the proteins in a group have at least one peptide in common.
\item[Sub-group] Sub-group to which the protein belongs. All the proteins in a sub-group are identified with the same valid peptides.
\item[Description] Protein description as it appears in the header of the fasta file.
\item[log(E value)] Protein E-value expressed in log.
\begin{itemize}
\item Statistical value representing the number of times this protein would be identified randomly.
\item Calculated as the product of unique peptide E-values in the sample.
\end{itemize}
\item[Coverage] \% of protein coverage.
\item[MW] Molecular weight of the protein expressed in kDa.
\item[Spectra] Total number of MS/MS spectra identified for the protein
\item[Specifics] Number of MS/MS spectra that are specific to the protein, compared to the other proteins of the same group (individual and phosphopeptide mode, see~\ref{mode}).
\item[Specific uniques] Number of unique peptide sequences specific to the protein, compared to other proteins of the same group (combined mode, see~\ref{mode}).
\item[Uniques] Number of unique peptide sequences identified for the protein.
\item[PAI] Protein Abundance Index\label{pai} :
\begin{itemize}
\item PAI estimates the relative abundance of the protein.
\item PAI is calculated as the number of identified spectra divided by the number of theoretical peptides\footnote{Theoretical peptides correspond to the peptides resulting from the theoretical digestion of the protein sequence by trypsin and that are visible in mass spectrometry ($800<MH<2500$)} of the protein.
\end{itemize}
\item[Redundancy] Number of proteins identified with the same pool of spectra. When there is redundancy, the above described parameters are shown only for the first protein of the subgroup (arbitrarily chosen). Only the description of the other members of the subgroup is shown.
\item[Position] Position(s) of the phosphosite in the protein. This value is only reported in phosphopeptide mode (see~\ref{mode}).
\end{description}
\normalsize

\begin{figure}[!ht]
\centering \includegraphics[width=1.0\textwidth]{images/tandem_prot.pdf}
\caption{Protein results}
\label{prot}
\end{figure}

\subsection{Files *peptide.txt}
Identified peptides are grouped by group (Fig~\ref{pep}). One line corresponds to one MS/MS spectrum identifying one peptide that can be present in one or more proteins.
\small
\begin{description}
\item[Group] Group of the proteins containing this peptide.
\item[Description] Protein description if the peptide is specific to this protein.
\item[Sample] Name of the MS/MS run file.
\item[Scan] Scan number of the MS/MS run analysis.
\item[Rt] Retention time of the peptide.
\item[Sequence] Sequence of the peptide.
\item[Modifs] Modifications on the peptide.%
\footnote{For example, M2:+15.99 means that the mass of the second amino acid, which is a methionine, is increased by 15.99. This mass increase indicates that the peptide is oxidized.}
\item[Valid] Indicates whether the peptide was validated by the filter parameters or not.
\item[Used] Number of protein sub-groups in which the peptide is present.
\item[on a total of] Total number of protein sub-groups in the group.\\
\textit{Note:} If the peptide is specific, only \texttt{-} is shown.
\item[Sub-groups] Protein sub-groups where the peptide is present.
\item[E-value] Peptide E-value.
\begin{itemize}
\item Statistical value representing the number of times this peptide would be identified randomly.
\item Calculated by X!Tandem with an empirical model.
\end{itemize}
\item[Charge] Charge level of the precursor.
\item[MH+ Obs] Monoisotopic observed mass for the peptide + one proton (MH$^{+}$)
\item[MH+ Theo]Monoisotopic calculated mass for the peptide + one proton (MH$^{+}$)
\item[DeltaMH+] Error in the precursor mass between observed and theoretical data (Da)
\item[Delta-ppm] Error in the precursor mass between observed and theoretical data (ppm)
\item[Position] Position(s) of the phosphosite in the protein. This value is only reported in phosphopeptide mode (see~\ref{mode}).
\end{description}
\normalsize

\begin{figure}[!ht]
\centering \includegraphics[width=1.0\textwidth]{images/tandem_peptide.pdf}
\caption{Peptide results}
\label{pep}
\end{figure}

\subsection{Files *compar.txt}
All identified proteins are represented in a list: one protein per row, and one sample per column (Fig~\ref{compar}).
The list of proteins is repeated 4 times, corresponding to the 4 parameters that are used to compare samples (see Type for details).
\small
\begin{description}
\item[Group] Protein group. Groups roughly correspond to the different functions.
\item[Sub-group] Protein sub-group. All the proteins of a sub-group are identified with the same valid peptides.
\item[Description] Protein description extracted from the fasta file.
\item[MW] Molecular weight of the protein (kDa).
\item[log(E value)] The log of protein's E-value.
\begin{itemize}
\item Statistical value representing the number of times this protein would be identified randomly.
\item Calculated as the product of unique peptide E-values in all samples.
\end{itemize}
\item[Type] The item that is compared between samples.
\begin{description}
\item[Spectra] Number of MS/MS spectra identified for the protein.
\item[Specifics] Number of specific MS/MS spectra identified for the protein compared to the other proteins belonging to the same group.
\item[Uniques] Number of unique peptide sequences identified for this protein.
\item[PAI] Protein Abundance Index (see Section~\ref{pai}).
\end{description}
\item[Position] Position(s) of the phosphosite in the protein. This value is only reported in phosphopeptide mode (see~\ref{mode}).
\end{description}
\normalsize

\begin{figure}[!ht]
\centering \includegraphics[width=1.0\textwidth]{images/tandem_comparaison.pdf}
\caption{Comparison results}
\label{compar}
\end{figure}

\subsection{Files *fdr.txt}
This result file indicates the number of peptides with an E-value less than the E-value indicated in the first column (Fig~\ref{fdr2}). You just have to divide the number of peptides in the reverse or decoy database by the number of peptides in the normal database to obtain the false discovery rate at each E-value level.\\
This method has 2 limitations :
\begin{itemize}
\item normal and reverse databases must be saved in different fasta files;
\item the filter parameters must be permissive (see the FDR export option in Section~\ref{fdr}).
\end{itemize}

\begin{figure}[!h]
\centering \includegraphics[scale=0.5]{images/tandem_fdr.png}
\caption{FDR results}
\label{fdr2}
\end{figure}


\end{document}