Commit 063d9a19 authored by Celine Noirot's avatar Celine Noirot
Browse files

Udate Gaelle R script

parent 241d03ba
################################### MA plots ###################################
##################### Differential Analysis Expression #########################
################################################################################
### ---------------------Packages
......@@ -6,10 +6,13 @@
packages <- c("optparse", # to read arguments from a command line
"ggplot2", # nice plots
"gridExtra", # multiple plots on one page
"grid", # multiple plots on one page for MAplots
"RColorBrewer", # color in plots
"mixOmics", # for heatmap plot
"reshape2",
"RJSONIO") # reshape grouped data
"reshape2", # reshape grouped data
"knitr", # create a pdf file with images
"markdown", # create a pdf file with images
"RJSONIO") # work with JSON file
for(package in packages){
# if package is installed locally, load
......@@ -35,7 +38,6 @@ for(package in bioPackages){
### ---------------------Parameters
#############
......@@ -54,10 +56,10 @@ option_list = list(
help = "folder path where results are stored",
metavar = "character"),
make_option(c("--pool1"), type = "character", default = NULL,
help = "library name in pool 1 separeted by ', '",
help = "library name in pool 1 separeted by ','",
metavar = "character"),
make_option(c("--pool2"), type = "character", default = NULL,
help = "library name in pool 2 separeted by ', '",
help = "library name in pool 2 separeted by ','",
metavar = "character"),
make_option(c("--filter"), type = "logical", default = TRUE,
help = "if TRUE low expressed genes are removed [default=%default]",
......@@ -103,9 +105,9 @@ opt = parse_args(opt_parser);
#opt=list()
#opt$file <- XXX # for example "C:/My documents/raw_count.csv"
#opt$norm <- XXX # for example "C:/My documents/RLE_info.csv"
#opt$out <- XXX # for example"C:/My documents/results"
#opt$pool1 <- "lib1, lib2"
#opt$pool2 <- "lib3, lib4"
#opt$out <- XXX # for example "C:/My documents/results"
#opt$pool1 <- XXX # for example "lib1, lib2"
#opt$pool2 <- XXX # for example "lib3, lib4"
#opt$alpha <- 0.05
#opt$filter <- TRUE
#opt$correct <- "BH"
......@@ -122,7 +124,7 @@ if (is.null(opt$file) || is.null(opt$norm) || is.null(opt$out) ||
}
#multiple testing correction
if (!(opt$correct %in% c("BH", "BY", "bonferroni"))) {
if (!(opt$correct %in% c("BH", "bonferroni"))) {
print_help(opt_parser)
stop("you can't use this correction to adjust p-values.\n", call. = FALSE)
}
......@@ -164,6 +166,7 @@ if (nrow(pool1) < 1 || nrow(pool2) < 1) {
sample_info <- rbind(pool1, pool2)
PC_before <- PC_before[, colnames(PC_before) %in% as.character(sample_info$lib_name)]
PC_before <- PC_before[, match(colnames(PC_before), sample_info$lib_name)]
norm_fact <- norm_fact[as.character(norm_fact$sample.name) %in%
as.character(sample_info$lib_name), ]
......@@ -182,6 +185,11 @@ PC_after_log <- log2(cpm(dge_after) + 1)
##############################################
### ---------------------Plots : normalization
##############################################
#save images in a pdf file
pdf(file.path(opt$out, "de_images.pdf"), title = "Plots from differential analysis",
width = 12, height = 10)
PC_before_log_long <- melt(PC_before_log)
colnames(PC_before_log_long) <- c("name", "lib", "log2CPM")
PC_before_log_long <- merge(PC_before_log_long, sample_info, by.x = "lib",
......@@ -195,33 +203,33 @@ PC_after_log_long <- merge(PC_after_log_long, sample_info, by.x = "lib",
#### boxplot
PC_boxplot <- function(dat, tit, filename) {
ggplot(dat, aes(x=lib, y=log2CPM)) +
p <- ggplot(dat, aes(x=lib, y=log2CPM)) +
geom_boxplot(aes(fill = factor(conditions))) + theme_bw() + xlab(label = "") +
ylab(label = expression(log[2]*"(counts + 1)")) + ggtitle(tit) +
scale_fill_manual(name = "Conditions", values = c("steelblue3", "springgreen3"))
ggsave(filename = normalizePath(file.path(opt$out, filename),
mustWork = FALSE), width = 11, height = 7)
print(p)
}
PC_boxplot(PC_before_log_long, "Boxplot on raw pseudo-counts",
"rawcount_boxplot.svg")
"rawcount_boxplot.png")
PC_boxplot(PC_after_log_long, "Boxplot on pseudo-counts after normalization",
"normcount_boxplot.svg")
"normcount_boxplot.png")
#### MDS
if (nrow(pool1) > 1 && nrow(pool2) > 1) {
PC_MDS <- function(dat, tit, filename) {
dev.new()
mds <- data.frame(plotMDS(dat)$cmdscale.out)
dev.off()
mds$lib.name <- rownames(mds)
ggplot(mds, aes(x = X1, y = X2, label = lib.name,
p <- ggplot(mds, aes(x = X1, y = X2, label = lib.name,
color = as.factor(sample_info$conditions))) +
geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() +
xlab(label = "Dimension 1") + ylab(label = "Dimension 2") +
ggtitle(tit) + scale_color_manual(name = "Conditions",
values = c("steelblue3", "springgreen3"))
ggsave(filename = normalizePath(file.path(opt$out, paste0(filename,".svg")),
mustWork = FALSE), width = 11, height = 7)
print(p)
write.table(mds, file = normalizePath(file.path(opt$out,
paste0(filename,".csv")),
......@@ -238,21 +246,20 @@ if (nrow(pool1) > 1 && nrow(pool2) > 1) {
#### density
PC_density <- function(dat, tit, filename) {
ggplot(dat, aes(x=log2CPM, group = lib)) +
p <- ggplot(dat, aes(x=log2CPM, group = lib)) +
geom_density(aes(colour = factor(conditions))) +
theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) +
ylab(label = "Density") + ggtitle(tit) +
guides(color=guide_legend(title="Conditions")) +
scale_color_manual(name = "Conditions",
values = c("steelblue3", "springgreen3"))
ggsave(filename = normalizePath(file.path(opt$out, filename),
mustWork = FALSE), width = 8, height = 6)
print(p)
}
PC_density(PC_before_log_long, "Density plot on raw pseudo-counts",
"rawcount_density.svg")
"rawcount_density.png")
PC_density(PC_after_log_long, "Density plot on pseudo-counts after normalization",
"normcount_density.svg")
"normcount_density.png")
#save data used for boxplots and density plots
......@@ -270,6 +277,8 @@ write.table(PC_after_log_long, file = normalizePath(file.path(opt$out,
#### MAplots
MAplot_name <- character()
pos <- 1
if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) {
ifelse(!dir.exists(file.path(opt$out, "MAplots")),
dir.create(file.path(opt$out, "MAplots")), FALSE)
......@@ -278,12 +287,13 @@ if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) {
avalues <- (lib1 + lib2)/2
mvalues <- (lib1 - lib2)
MA <- data.frame(avalues = avalues, mvalues = mvalues)
ggplot(MA, aes(x=avalues, y=mvalues)) +
geom_point(size=3) + theme_bw() + ggtitle(tit) + xlab("A") + ylab("M") +
geom_hline(yintercept = 0, col = "red")
ggsave(filename = normalizePath(file.path(opt$out, "MAplots", filename),
mustWork = FALSE), width = 8, height = 6)
return(list(avalues,mvalues))
p <- ggplot(MA, aes(x=avalues, y=mvalues)) +
geom_point(size=1) + theme_bw() + ggtitle(tit) + xlab("A") + ylab("M") +
geom_hline(yintercept = 0, col = "red") +
theme(text = element_text(size=5))
#print(p)
return(list(avalues,mvalues,p))
}
a_before <- data.frame(name=rownames(PC_before_log))
......@@ -292,43 +302,62 @@ if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) {
m_after <- data.frame(name=rownames(PC_after_log))
for(cond in unique(sample_info$conditions)) {
I <- sum(sample_info$conditions == cond)-1
J <- sum(sample_info$conditions == cond)
grid.newpage()
for(i in 1:(sum(sample_info$conditions == cond)-1)) {
for(j in (i+1):sum(sample_info$conditions == cond)) {
# Create layout : nrow = 2, ncol = 2
pushViewport(viewport(layout = grid.layout(I, I*2)))
pos_graph <- 1
for(i in 1:I) {
pos_graph <- i*2 - 1
for(j in (i+1):J) {
col_name <- paste0("cond",cond, "_",
sample_info$lib_name[sample_info$conditions == cond][i],
"vs",
sample_info$lib_name[sample_info$conditions == cond][j])
bef <- PC_MAplot(PC_before_log[, sample_info$conditions == cond][, i],
PC_before_log[, sample_info$conditions == cond][, j],
paste0("MA plot on raw pseudo-counts: ",
paste0("MA plot on raw pseudo-counts: \n",
sample_info$lib_name[sample_info$conditions == cond][i],
" vs. ",
sample_info$lib_name[sample_info$conditions == cond][j]),
paste0("raw_MAplot_", cond, "_", i, "vs", j, ".svg"))
paste0("raw_MAplot_", cond, "_", i, "vs", j, ".png"))
MAplot_name[pos] <- paste0("raw_MAplot_", cond, "_", i, "vs", j, ".png")
pos <- pos + 1
a_before <- cbind(a_before, bef[[1]])
m_before <- cbind(m_before, bef[[2]])
colnames(a_before)[ncol(a_before)] <- col_name
colnames(m_before)[ncol(m_before)] <- col_name
print(bef[[3]], vp=viewport(layout.pos.row = i, layout.pos.col = pos_graph))
aft <- PC_MAplot(PC_after_log[, sample_info$conditions == cond][, i],
PC_after_log[, sample_info$conditions == cond][, j],
paste0("MA plot on normalized pseudo-counts: ",
paste0("MA plot on normalized pseudo-counts: \n",
sample_info$lib_name[sample_info$conditions == cond][i],
" vs. ",
sample_info$lib_name[sample_info$conditions == cond][j]),
paste0("norm_MAplot_", cond, "_", i, "vs", j, ".svg"))
paste0("norm_MAplot_", cond, "_", i, "vs", j, ".png"))
MAplot_name[pos] <- paste0("norm_MAplot_", cond, "_", i, "vs", j, ".png")
pos <- pos + 1
a_after <- cbind(a_after, plot_name=aft[[1]])
m_after <- cbind(m_after, plot_name=aft[[2]])
colnames(a_after)[ncol(a_after)] <- col_name
colnames(m_after)[ncol(m_after)] <- col_name
print(aft[[3]], vp=viewport(layout.pos.row = i, layout.pos.col = pos_graph+1))
pos_graph <- pos_graph + 2
}
}
}
write.table(a_before, file = normalizePath(file.path(opt$out, "MAplots",
......@@ -501,16 +530,10 @@ if (opt$filter) {
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
svg(normalizePath(file.path(opt$out, "histograms.svg"),
mustWork = FALSE), width = 12, height = 7)
grid.arrange(h1, h2, h3, ncol=3)
dev.off()
grid.arrange(h1, h2, h3, ncol=3)
} else {
svg(normalizePath(file.path(opt$out, "histograms.svg"),
mustWork = FALSE), width = 12, height = 7)
grid.arrange(h1, h2, ncol=2)
dev.off()
}
......@@ -518,12 +541,12 @@ if (opt$filter) {
res$DEG <- ifelse(res$FDR < opt$alpha, "Yes", "No")
if (nrow(pool1) > 1 | nrow(pool2) > 1) {
ggplot(res, aes(x=logCPM, y=logFC, group=DEG)) +
p <- ggplot(res, aes(x=logCPM, y=logFC, group=DEG)) +
geom_point(aes(color=as.factor(DEG)), size=3)+
theme_bw() + scale_color_manual(values=c('#000000','#CC0000')) +
guides(color=guide_legend(title="DEG ?")) +
ggtitle("MA plot on normalized pseudo-counts")
print(p)
write.table(cbind(A = res$logCPM, M = res$logFC, DEG = res$DEG),
file = normalizePath(file.path(opt$out, "DE_MAplot.csv"),
mustWork = FALSE), sep = "\t",
......@@ -531,21 +554,18 @@ if (nrow(pool1) > 1 | nrow(pool2) > 1) {
} else {
avalues <- (PC_after_log[,1] + PC_after_log[,2])/2
mvalues <- (PC_after_log[,1] - PC_after_log[,2])
ggplot(data.frame(PC_after_log), aes(x=avalues, y=mvalues)) +
p <- ggplot(data.frame(PC_after_log), aes(x=avalues, y=mvalues)) +
geom_point(aes(color=as.factor(res$DEG)), size=3)+
theme_bw() + scale_color_manual(values=c('#000000','#CC0000')) +
guides(color=guide_legend(title="DEG ?")) +
ggtitle("MA plot on normalized pseudo-counts") +
xlab("A") + ylab("M")
print(p)
write.table(cbind(A = avalues, M = mvalues, DEG = res$DEG),
file = normalizePath(file.path(opt$out, "DE_MAplot.csv"),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
}
ggsave(filename = normalizePath(file.path(opt$out, "DE_MAplot.svg"),
mustWork = FALSE), width = 8, height = 6)
#### heatmap and PCA plot
......@@ -556,11 +576,8 @@ if (nrow(pool1) > 1 && nrow(pool2) > 1 && nrow(diff) > 0) {
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1]
svg(normalizePath(file.path(opt$out, "DE_heatmap.svg"),
mustWork = FALSE))
cim(t(selY), color=cimColor, symkey=FALSE,
main = "Heatmap on differentialy expressed genes")
dev.off()
write.table(t(selY),
file = normalizePath(file.path(opt$out, "DE_heatmap.csv"),
......@@ -574,8 +591,4 @@ if (nrow(pool1) > 1 && nrow(pool2) > 1 && nrow(diff) > 0) {
}
dev.off()
############################# Enrichment analysis ##############################
################################################################################
### ---------------------Packages
#common packages
packages <- c("optparse", # to read arguments from a command line
"ggplot2", # nice plots
"RJSONIO") # work with JSON file
for(package in packages){
# if package is installed locally, load
if (!(package %in% rownames(installed.packages()))) {
install.packages(package)
}
do.call('library', list(package))
}
#Bioconductor package
if (!("topGO" %in% rownames(installed.packages()))) {
source("http://www.bioconductor.org/biocLite.R")
biocLite("topGO")
}
library(topGO) # GO enrichment
### ---------------------Parameters
#############
###### Parameter initialization when the script is launched from command line
############
option_list = list(
make_option(c("-f", "--files"), type = "character", default = NULL,
help = "geneID2GO files: list of tabulated matrix with the correspondence
gene to GO separated by ','", metavar = "character"),
make_option(c("-n", "--names"), type = "character", default = NULL,
help = "names of the samples (same order than files)", metavar = "character"),
make_option(c("--fileFormat"), type = "character", default = "topGO",
help = "two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
\n gene_id2 GO_3,...) or twoColumns (only 2 columns
#gene_id GO_id)", metavar = "character"),
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
metavar = "character")
);
opt_parser = OptionParser(usage="Perform an enrichment analysis for
Gene Ontology (GO)", option_list = option_list);
opt = parse_args(opt_parser);
##############
## Parameter initialization when the script is launched from R or Rstudio
##############
#opt=list()
#opt$files <- XXX # for example c("C:/My documents/raw_count.csv",...)
#opt$names <- XXX # for example c("sample1",...)
#opt$fileFormat <- "topGO" # "topGO" or "twoColumns"
#opt$out <- XXX # for example "C:/My documents/results"
# table with genes and GO, genes of interest, test and algorithm are needed to
#continue
if (is.null(opt$files)) {
print_help(opt_parser)
stop("Tables with genes and GO missing.\n",
call. = FALSE)
}
#Create output folder if it doesn't exist
ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE)
#####################################################
### ---------------------Input data and preprocessing
#####################################################
## read tables with all genes
l <- unlist(strsplit(opt$files, ','))
opt$names <- unlist(strsplit(opt$names, ','))
geneID2GO <- list()
if (opt$fileFormat == "topGO"){
for(i in seq_along(l)){
geneID2GO[[i]] <- readMappings(normalizePath(l[i]), sep = "\t", IDsep = ",")
}
} else {
for(i in seq_along(l)){
go <- read.table(normalizePath(l[i]), header = TRUE, sep = "\t",
comment.char="")
# obtains a list of GO by genes for all genes (mandatory for topGO package)
geneID2GO[[i]] <- list()
for (i in unique(go[, 1])){
gene <- go[go[,1] == i,]
listGO <- vector()
for (j in 1:nrow(gene)){
listGO <- c(listGO, as.character(gene[j, 2]))
}
geneID2GO[[i]] <- c(geneID2GO[[i]], list(listGO))
}
names(geneID2GO[[i]]) <- unique(go[, 1])
}
}
geneList <- list()
for(i in seq_along(geneID2GO)){
geneList[[i]] <- factor(rep(1, length(geneID2GO[[i]])))
levels(geneList[[i]]) <- c(levels(geneList[[i]]),0)
names(geneList[[i]]) <- names(geneID2GO[[i]])
}
######################### Functions
########## get name/definition of a GO (hidden function in topGO)
getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
multipLines = FALSE) {
qTerms <- paste(paste("'", whichTerms, "'", sep = ""), collapse = ",")
retVal <- dbGetQuery(GO_dbconn(),
paste("SELECT term, go_id FROM go_term WHERE ontology IN",
"('", ontology, "') AND go_id IN (",
qTerms, ");", sep = ""))
termsNames <- retVal$term
names(termsNames) <- retVal$go_id
if(!multipLines)
shortNames <- paste(substr(termsNames, 1, numChar),
ifelse(nchar(termsNames) > numChar, '...', ''), sep = '')
else
shortNames <- sapply(termsNames,
function(x) {
a <- strwrap(x, numChar)
return(paste(a, sep = "", collapse = "\\\n"))
})
names(shortNames) <- names(termsNames)
return(shortNames[whichTerms])
}
########## find GO at levels 2 for ONE ontology
nbLevels2 <- function(geneListI, ontology, geneID2GO){
myGOdata <- NULL
try(myGOdata <- new("topGOdata", description="My project", ontology=ontology,
allGenes=geneListI, annot = annFUN.gene2GO, gene2GO = geneID2GO),
silent = TRUE)
count <- 0
if (!is.null(myGOdata)){
# calculate levels for each GO in our data and his ancestors
levels <- buildLevels(myGOdata@graph)
# select only GO at level 2
goLevels2 <- mget(ls(levels$nodes2level), envir=levels$nodes2level)==2
goLevels2 <- names(goLevels2)[goLevels2]
# count genes in each GO
count <- NULL
for (i in goLevels2){
count <- c(count,countGenesInTerm(myGOdata, i))
}
}
return(count)
}
########## obtain percent of total genes for each level 2 GO
percentOfGene <- function(geneList, ontology, DefOntology){
prop <- list()
count <- NULL
#create a table with GO names and proportion of genes
for(i in seq_along(geneList)){
prop[[i]] <- nbLevels2(geneList[[i]], ontology, geneID2GO[[i]])/length(geneList[[i]])
count <- rbind(count,data.frame(name=getTermsDefinition(names(prop[[i]]), ontology,
numChar = 40),
nbGene=as.vector(prop[[i]]), sample=opt$names[i],
ontology=DefOntology))
}
# table with all GO
all <- data.frame(name = rep(unique(count$name), length(names)),
nbGene = 0, sample = rep(opt$names,
each = length(unique(count$name))),
ontology=DefOntology)
#only GO-name not in count
not_in_count <- all[!(paste0(all$name,".", all$sample) %in%
paste0(count$name,".", count$sample)),]
count <- rbind(count, not_in_count)
return(count)
}
######################### Barplot
countBP <- percentOfGene(geneList, "BP", "Biological Process")
countMF <- percentOfGene(geneList, "MF", "Molecular Function")
countCC <- percentOfGene(geneList, "CC", "Cellular Component")
count <- rbind(countBP, countMF, countCC)
ggplot(count, aes(x=as.factor(name), y=nbGene, fill=sample)) +
geom_bar(stat="identity", position=position_dodge()) +
facet_grid( ~ ontology, scales="free_x") +
theme(axis.text.x = element_text(angle = 70, vjust = 1, hjust=1, colour = "black")) +
scale_fill_discrete(name = "") + ylab(paste0("Percent of ", opt$target)) +
xlab("GO term")
ggsave(filename = normalizePath(file.path(opt$out, "barplot.svg"),
mustWork = FALSE), width = 11, height = 7)
write.table(count, file = normalizePath(file.path(opt$out,
paste0("barplot.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
......@@ -26,7 +26,7 @@ library(edgeR) # RNAseq data normalization
#############
###### Parameters' initialization when the script is launched from command line
###### Parameter initialization when the script is launched from command line
############
option_list = list(
make_option(c("-f", "--file"), type = "character", default = NULL,
......@@ -43,7 +43,7 @@ opt_parser = OptionParser(usage="Imports data, converts them into a DGEList obje
opt = parse_args(opt_parser);
##############
## Parameters' initialization when the script is launched from R or Rstudio
## Parameter initialization when the script is launched from R or Rstudio
#file: tabulated raw count matrix with library name as header
#(e.g. gene_id lib1 lib2)
#out: folder path where results are stored
......@@ -179,15 +179,18 @@ normalization <- function(dge, method) {
mustWork = FALSE), width = 8, height = 6)
#MDS
MDS_PC <- data.frame(plotMDS(PC_after_log)$cmdscale.out)
if (ncol(PC_before) > 2){
MDS_PC <- data.frame(plotMDS(PC_after_log)$cmdscale.out)
ggplot(MDS_PC, aes(x = X1, y = X2, label = rownames(MDS_PC))) +
geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() +
xlab(label = "Dimension 1") + ylab(label = "Dimension 2") +
ggtitle(paste0("PCA plot on pseudo-counts after normalization (method = ",
tit, ")"))
ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_MDS.svg")),
mustWork = FALSE), width = 11, height = 7)
}
ggplot(MDS_PC, aes(x = X1, y = X2, label = rownames(MDS_PC))) +
geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() +
xlab(label = "Dimension 1") + ylab(label = "Dimension 2") +
ggtitle(paste0("PCA plot on pseudo-counts after normalization (method = ",
tit, ")"))
ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_MDS.svg")),