pipeline_intestinal_microbiota.R 12 KB
Newer Older
1
2
#_______________________________________________________________________
#
3
# Pipeline to analyze metaproteomics of MICROBIAL GUT
4
5
6
7
8
9
10
11
12
13
# Data are generated from High-throughput Liquid Chromatography 
# Mass Spectrometry instruments 
# 
#_______________________________________________________________________

# ------------- Set working directory --------------


setwd("~/metaprotr")

14

15
16
17
# ------------- Install the required packages -------------


18
19
20
21
# Before to start, install the following packages :
# ad4
# dendextend
# ggforce
22
# ggrepel
23
24
25
26
27
28
29
30
# reshape2
# stringr
# tidyverse

# ex. 
# install.packages("ade4")
# install.packages("dendextend")
# install.packages("ggforce")
31
# install.packages("ggrepel")
32
33
34
# install.packages("reshape2")
# install.packages("stringr")
# install.packages("tidyverse")
35
36
37
38
39
40
41
42


# ------------- Load required libraries -------------


library("metaprotr")
library("ade4")
library("dendextend")
43
library("dplyr")
44
library("ggforce")
45
library("ggrepel")
46
47
48
library("reshape2")
library("stringr")
library("tidyverse")
49

50
# The following warnings could appear, you can continue the analysis
51
52
53
# ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
# x dplyr::filter() masks stats::filter()
# x dplyr::lag()    masks stats::lag()
54
55


56
# ------------- Load Mass Spectrometry data and database with taxonomical levels -------------
57
58
59


# DB containing the taxonomic classification for each protein
60
61
# The Integrated non-redundant Gene Catalog 9.9 with taxonomic annotation can be downloaded from
# https://zenodo.org/record/3997093#.X0UYI6Zb_mE
62

63
meta99_full_taxo <- read.csv2("full_taxonomy_MetaHIT99.tsv", header = T, sep = "\t")
64

65
66
# Characters indicating the localisation (including directories) of files
# check that the items from the column 'msrunfile' (in metadata) are identical to the samples names in 'peptides_file'
67

68
69
70
proteins_file <- "data_directory/protein_list_test_condor_22janv.txt"
peptides_file <- "data_directory/peptide_counting_test_condor_22janv.txt"
metadata_file <- "data_directory/metadata.csv"
71
72
73
74
75
76
77
78
79
80

# Combine the files containing proteins names, peptides abundance in spectral counts and metadata

metaproteome <- load_protspeps(proteins_file, peptides_file, metadata_file)
dim(metaproteome$peptides_proteins) # verification


# ------------- Integrate taxonomic database into metaproteome_object -------------


81
metaproteome <- add_taxonomy(metaproteome, meta99_full_taxo) 
82
83
84
85
86
87
88
head(metaproteome$peptides_proteins) # verification


# ------------- Get spectral counts from specific peptides  -------------


# Spectral counts can be arranged by peptides, subgroups or groups
89
90
# SC_peps <- getsc_specific(metaproteome, "sc_specific_peptides") 
# SC_groups <- getsc_specific(metaproteome, "sc_groups")
91
92
93

# For this pipepline the analysis will be carried out from subgroups, also referred as "metaproteins"

94
SC_subgroups <- getsc_specific(metaproteome, "sc_subgroups") 
95
96
97
98
99
100


# ------------- Inspection of spectral data  -------------


# 1. The function below allows to observe the number of elements (subgroups)
101
# shared by the samples in the experiment
102
103
104
105
106
107
108
109
110

inspect_sample_elements(SC_subgroups)

# 2. Portrays the abundance intensities of the items (peptides, subgroups, 
# groups or taxonomic elements) in the conditions or samples
# To define the second argument of the function, you can inspect the 
# names of columns from metadata

colnames(SC_subgroups$metadata)
111
# [1] "SC_name"  "msrunfile"  "SampleID"  "Order_injection"  "Condition"
112

113
114
plot_intensities(SC_subgroups, "SC_name", "Spectral counts of subgroups")
plot_intensities(SC_subgroups, "Condition", "Spectral counts of subgroups")
115

116
117
# 3. The commands below show the clustering of samples after a Principal Components Analysis (PCA). 
# This allows to observe the overall grouping of the samples. 
118
119
120
# To define the second argument you can inspect the names of columns from metadata

colnames(SC_subgroups$metadata)
121
# [1] "SC_name"  "msrunfile"  "SampleID"  "Order_injection"  "Condition" 
122
123

plot_pca(SC_subgroups, "Condition", c(1, 2))
124
plot_pca(SC_subgroups, "Condition", c(2, 3))
125
# In this example, we observe that the 3 first components explain the major part of the variance
126
127
128
129
130


# ------------- Non supervised Clustering -------------


131
# To visualize the similarities between the different samples, an unsupervised clustering analysis can be informative
132
133
# First, you need to set a the name of ONE column from metadata
# The colors of the dendogram will be set in function of the different levels in the slected column  
134

135
str(SC_subgroups$metadata)
136

137
# 'data.frame':	11 obs. of  4 variables:
138
# $ SC_name        : chr  
139
140
141
# $ msrunfile      : chr  
# $ SampleID       : chr  
# $ Condition      : chr  
142

143
144
145
146
147
# The function will ask for the colors for a given sample or condition
# Provide the colors in the suitable format:
# ex. red, yellow, darkorange, blue
# ex. #440154FF, #FDE725FF, #404788FF, #55C667FF, #B8DE29FF

148
plot_dendocluster(SC_subgroups, "Condition", "msrunfile", "graph_dendogram_by_subgroups")
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173


# ------------- Reformating  -------------


# After data inspection, if some samples, conditions or elements (peptide, 
# proteins, subgroups, groups, etc.) should be removed there are some tools
#for reformating data

?remove_element
?select_element
?filter_text

# For instance, this metaproteome analysis requires to separate 
# human and microbial proteins.

SC_sgp_human <- filter_text(SC_subgroups, "Description", "HUMAN", "keep")
SC_sgp_microbial <- filter_text(SC_subgroups, "Description", "HUMAN", "discard")


# ------------- Comparison between two samples or conditions -------------


# The following lines allow to compare globally two conditions or two samples 

174
plot_intensities_ratio(SC_sgp_microbial, "Condition", c("CTRL", "S"))
175
176
177

# To verify that replicates are similar in a given condition

178
plot_intensities_ratio(SC_sgp_microbial, "SampleID", c("Q1_batch1_prot", "Q2_batch1_prot"))
179
180
181
182


# ------------- Venn diagrams  -------------

183
# Option 1
184

185
venn_Q1_Q2 <- plot_venn(SC_sgp_microbial, "SampleID", c("Q1_batch1_prot", "Q2_batch1_prot"))
186
187

# Export of the elements of each part of the venn diagram on the working directory
188
export_vennlists(venn_Q1_Q2)
189

190
# Option 2
191

192
# Venn plots can also be performed by comparing three conditions
193

194
195
196
197
SC_sgp_microbial$metadata$SampleID
# [1] "STD_env"            "CTRL"               "Q1+FW1_batch1_prot" "Q2+FW2_batch1_prot"
# [5] "Q3+FW3_batch1_prot" "Q1_batch1_prot"     "Q2_batch1_prot"     "Q3_batch1_prot"    
# [9] "FW1_batch1_prot"    "FW2_batch1_prot"    "FW3_batch1_prot" 
198

199
200
201
# Set the three conditions
list_conditions <- c("Q1_batch1_prot", "Q2_batch1_prot", "Q3_batch1_prot")
venn_Q1_Q2_Q3 <- plot_venn(SC_sgp_microbial, "SampleID", list_conditions)
202

203
204
205
# Export of "venn_object" lists in a subdirectory
dir.create("list_venn_Q1_Q2_Q3")
export_vennlists(venn_Q1_Q2_Q3, "list_venn_Q1_Q2_Q3")
206
207
208
209
210
211
212
213
214


# ------------- Information about the taxonomy levels in all samples of an "spectral_count_object" -------------


# ~7 min depending on the experiment size
plot_fulltaxo(SC_sgp_microbial) 


215
# ------------- Calculates spectral counts (MICROBIAL origin for instance) per taxonomic level -------------
216
217
218


# All in ~ 16 mins
219
220
221
222
223
224
225
SCsgp_superkingdom <- crumble_taxonomy(SC_sgp_microbial, "superkingdom")
SCsgp_phylum <- crumble_taxonomy(SC_sgp_microbial, "phylum")
SCsgp_class <- crumble_taxonomy(SC_sgp_microbial, "class")
SCsgp_order <- crumble_taxonomy(SC_sgp_microbial, "order")
SCsgp_family <- crumble_taxonomy(SC_sgp_microbial, "family")
SCsgp_genus <- crumble_taxonomy(SC_sgp_microbial, "genus")
SCsgp_species <- crumble_taxonomy(SC_sgp_microbial, "species")
226
227
228
229
230


# ------------- Plot stacked bars of spectral_count_objects with defined taxonomy  ------------- 


231
# The lines below represent the distribution of spectral counts per taxonomic level (ex. SCsgp_superkingdom, SCsgp_phylum, SCsgp_class, etc.)
232

233
plot_stackedtaxo(SCsgp_family, "SC_name", "percent")
234

235
236
# Third argument is optional and can be adjusted to improve visual 
# representaion by increasing/decreasing the elements that pass the "filter_percent" argument
237

238
plot_stackedtaxo(SCsgp_family, "SC_name", "percent", 2)
239

240
# Other columns from metadata can also be selected (ex. conditions, order of injection, etc.)
241

242
plot_stackedtaxo(SCsgp_family, "Condition", "percent", 2)
243

244
# Bar plot can be also represented as spectral counts by setting the second argument as "numbers"
245

246
plot_stackedtaxo(SCsgp_family, "SampleID", "numbers", 2)
247
248
249
250
251


# ------------- Plot pie a chart of a sample or condition from spectral_count_objects with defined taxonomy  ------------- 


252
plot_pietaxo(SCsgp_genus, "Condition", "CTRL")
253

254
255
# Third argument is optional and can be adjusted to improve visual representaion
# by increasing/decreasing the elements that pass the "filter_percent" argument
256

257
plot_pietaxo(SCsgp_species, "Condition", "STD", 0.8)
258
259


260
# ------------- Principal Components Analysis (PCA) from taxonomic entities -------------
261
262


263
264
265
plot_pca(SCsgp_species, "Condition", c(1, 2))
plot_pca(SCsgp_genus, "Condition", c(1, 2))
plot_pca(SCsgp_family, "Condition", c(1, 2))
266
267
268
269
270
271
272


# ------------- Differential taxonomic elements (over / under represented) ------------- 


# To display the most divergent elements between two conditions or samples 
# The differential elements are selected by a log2(condition / reference) > 3
273
# this is equivalent to a 8 fold-change between two conditions
274

275
identify_differences(SCsgp_species, "Condition", c("CTRL", "S"))
276

277
identify_differences(SCsgp_genus, "Condition", c("CTRL", "S"))
278

279
identify_differences(SCsgp_family, "SampleID", c("Q1_batch1_prot", "Q2_batch1_prot"))
280
281
282

# If required, the third argument can be adjusted to identify elements with lower log2(fold change)

283
identify_differences(SCsgp_genus, "Condition", c("CTRL", "S"), 1.5)
284
285
286


# ------------- Functional Annotation from KEGG database -------------
287

288

289
# Import the KEGG annotations and the MetaHit99 database
290
291
292
293
# The IGC database with KEGG functionnal annotation can be downloaded from
# https://zenodo.org/record/3997093#.X0UYI6Zb_mE

kegg_db <- read.csv2("KEGG89_IGC_hs99.table", header = T, sep = "\t")
294

295
# The following objects were loaded at the beginning of the pipeline, they are required for functional annotation
296

297
meta99_full_taxo <- read.csv2("full_taxonomy_MetaHIT99.tsv", header = T, sep = "\t")
298
299
300
proteins_file <- "data_directory/protein_list_test_condor_22janv.txt"
peptides_file <- "data_directory/peptide_counting_test_condor_22janv.txt"
metadata_file <- "data_directory/metadata.csv"
301
metaproteome <- load_protspeps(proteins_file, peptides_file, metadata_file)
302
303
304

# Assignment of functional annotation per taxonomic level

305
306
SCsgp_species_annot <- add_kegg(
  SCsgp_species, 
307
  kegg_db, meta99_full_taxo, metaproteome, proteins_file, peptides_file, 
308
309
310
311
312
  text_to_filter = "HUMAN"
)

SCsgp_genus_annot <- add_kegg(
  SCsgp_genus, 
313
  kegg_db, meta99_full_taxo, metaproteome, proteins_file, peptides_file, 
314
315
316
317
318
  text_to_filter = "HUMAN"
)

SCsgp_family_annot <- add_kegg(
  SCsgp_family, 
319
  kegg_db, meta99_full_taxo, metaproteome, proteins_file, peptides_file, 
320
321
322
323
324
  text_to_filter = "HUMAN"
)

SCsgp_phylum_annot <- add_kegg(
  SCsgp_phylum, 
325
  kegg_db, meta99_full_taxo, metaproteome, proteins_file, peptides_file, 
326
327
328
329
330
  text_to_filter = "HUMAN"
)

SCsgp_superkingdom_annot <- add_kegg(
  SCsgp_superkingdom, 
331
  kegg_db, meta99_full_taxo, metaproteome, proteins_file, peptides_file, 
332
333
  text_to_filter = "HUMAN"
)
334

335

336
337
338
339
340
341
# ------------- Export of KO terms to draw metabolic pathways in iPATH3 (https://pathways.embl.de/) -------------


export_ipath3(SCsgp_phylum_annot, "all", "SampleID", "CTRL", "#840AA3")

taxonomic_levels <- c("Bacteroidetes", "Actinobacteria", "Proteobacteria")
342
export_ipath3(SCsgp_phylum_annot, "selection", "Condition", "S", "#28c1df", taxonomic_levels)
343
344

taxonomic_levels <- c("Eubacteriaceae")
345
export_ipath3(SCsgp_family_annot, "selection", "Condition", "S", "#097737", taxonomic_levels)
346
347
348
349
350


# ------------- Export spectral counts expressed in percentages, considering the sample, for multi-omics integration -------------


351
export_robject(SCsgp_species_annot, "spectral_percent", "RDATA")
352

353
export_robject(SCsgp_family_annot, "spectral_percent", "RDATA")
354

355
export_robject(SCsgp_phylum_annot, "spectral_percent", "RDATA")
356