Commit e80d0f64 authored by Celine Noirot's avatar Celine Noirot
Browse files

Normalization and Expression differential.

parent 92fbc971
This diff is collapsed.
################################ Normalization #################################
################################################################################
### ---------------------Packages
#common packages
packages <- c("optparse", # to read arguments from a command line
"ggplot2", # nice plots
"reshape2") # reshape grouped data
for(package in packages){
# if package is installed locally, load
if (!(package %in% rownames(installed.packages()))) {
install.packages(package)
}
do.call('library', list(package))
}
#Bioconductor package
if (!("edgeR" %in% rownames(installed.packages()))) {
source("http://www.bioconductor.org/biocLite.R")
biocLite("edgeR")
}
library(edgeR) # RNAseq data normalization
### ---------------------Parameters
#############
###### Parameters' initialization when the script is launched from command line
############
option_list = list(
make_option(c("-f", "--file"), type = "character", default = NULL,
help = "tabulated raw count matrix with library name as header (e.g.
#gene_id lib1 lib2)", metavar = "character"),
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
metavar = "character")
);
opt_parser = OptionParser(usage="Imports data, converts them into a DGEList object
(edgeR), does all normalization methods and produces
diagnostic plots.", option_list = option_list);
opt = parse_args(opt_parser);
##############
## Parameters' initialization when the script is launched from R or Rstudio
#file: tabulated raw count matrix with library name as header
#(e.g. gene_id lib1 lib2)
#out: folder path where results are stored
##############
#opt=list()
#opt$file=XXX # for example "C:/My documents/raw_count.csv"
#opt$out=XXX # for example"C:/My documents/results"
#Directory and count table file name are needed to continue
if (is.null(opt$file) | is.null(opt$out)) {
print_help(opt_parser)
stop("count table file name or output directory missing.\n", call. = FALSE)
}
#Create output folder if it didn't exist
ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE)
#####################################################
### ---------------------Input data and preprocessing
#####################################################
#Raw count
raw_count_table <- read.table(normalizePath(opt$file), header = TRUE,
row.names = 1, comment.char="")
#DGEList object
dge <- DGEList(raw_count_table, remove.zeros = TRUE)
#--Pseudo counts before normalization
PC_before <- dge$counts
PC_before_log <- log2(dge$counts + 1)
##############-Plots before normalization
PC_before_log_long <- melt(PC_before_log)
colnames(PC_before_log_long) <- c("name", "lib", "log2CPM")
## boxplot
ggplot(PC_before_log_long, aes(x=lib, y=log2CPM)) +
geom_boxplot(fill='gray', color="black") + theme_bw() + xlab(label = "") +
ylab(label = expression(log[2]*"(counts + 1)")) +
ggtitle("Boxplot on raw pseudo-counts")
ggsave(filename = normalizePath(file.path(opt$out, "rawcount_boxplot.svg"),
mustWork = FALSE), width = 11, height = 7)
## MDS
MDS_PC <- data.frame(plotMDS(PC_before_log)$cmdscale.out)
ggplot(MDS_PC, aes(x = X1, y = X2, label = rownames(MDS_PC))) +
geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() +
xlab(label = "Dimension 1") + ylab(label = "Dimension 2") +
ggtitle("PCA plot on raw pseudo-counts")
ggsave(filename = normalizePath(file.path(opt$out, "rawcount_MDS.svg"),
mustWork = FALSE), width = 11, height = 7)
## density
ggplot(PC_before_log_long, aes(x=log2CPM, colour=lib)) + geom_density() +
theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) +
ylab(label = "Density") + ggtitle("Density plot on raw pseudo-counts") +
guides(color=guide_legend(title="Library"))
ggsave(filename = normalizePath(file.path(opt$out, "rawcount_density.svg"),
mustWork = FALSE), width = 8, height = 6)
#save data used for plots
write.table(PC_before_log_long, file = normalizePath(file.path(opt$out,
"raw_boxplot_density.csv"),
mustWork = FALSE), sep = "\t",
row.names = TRUE, quote = FALSE)
write.table(MDS_PC, file = normalizePath(file.path(opt$out, "raw_MDS.csv"),
mustWork = FALSE), sep = "\t",
row.names = TRUE, quote = FALSE)
#######################################
### --------------------- Normalization
#######################################
normalization <- function(dge, method) {
#norm factor
dgeNorm <- calcNormFactors(dge, method = method)
#Pseudo-counts
PC_after <- cpm(dgeNorm)
PC_after_log <- log2(PC_after + 1)
#Normalized data
PC_after <- cbind("#name" = rownames(PC_after), PC_after)
sample_info <- data.frame(sample.name=rownames(dgeNorm$samples),
dgeNorm$samples[,c("lib.size","norm.factors")])
write.table(PC_after,
file = normalizePath(file.path(opt$out,
paste0(method, "_pseudocounts.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
write.table(sample_info,
file = normalizePath(file.path(opt$out, paste0(method, "_info.txt")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
##### Plots
PC_after_log_long <- melt(PC_after_log)
colnames(PC_after_log_long) <- c("name", "lib", "log2CPM")
#title
tit <- method
if (method == "upperquartile") {
tit <- "upper-quartile"
}
#boxplot
ggplot(PC_after_log_long, aes(x=lib, y=log2CPM)) +
geom_boxplot(fill='gray', color="black") + theme_bw() + xlab(label = "") +
ylab(label = expression(log[2]*"(counts + 1)")) +
ggtitle(paste0("Boxplot on pseudo-counts after normalization (method = ",
tit, ")"))
ggsave(filename = normalizePath(file.path(opt$out, paste0(method,
"_boxplot.svg")),
mustWork = FALSE), width = 8, height = 6)
#MDS
MDS_PC <- data.frame(plotMDS(PC_after_log)$cmdscale.out)
ggplot(MDS_PC, aes(x = X1, y = X2, label = rownames(MDS_PC))) +
geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() +
xlab(label = "Dimension 1") + ylab(label = "Dimension 2") +
ggtitle(paste0("PCA plot on pseudo-counts after normalization (method = ",
tit, ")"))
ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_MDS.svg")),
mustWork = FALSE), width = 11, height = 7)
#density
ggplot(PC_after_log_long, aes(x=log2CPM, colour=lib)) + geom_density() +
theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) +
ylab(label = "Density") +
ggtitle(paste0("Density plot on pseudo-counts after normalization (method = ",
tit, ")")) +
guides(color=guide_legend(title="Library"))
ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_density.svg")),
mustWork = FALSE), width = 8, height = 6)
#save data used for plots
write.table(PC_after_log_long, file = normalizePath(file.path(opt$out,
paste0(method, "_boxplot_density.csv")),
mustWork = FALSE), sep = "\t",
row.names = TRUE, quote = FALSE)
write.table(MDS_PC, file = normalizePath(file.path(opt$out,
paste0(method, "_MDS.csv")),
mustWork = FALSE), sep = "\t",
row.names = TRUE, quote = FALSE)
}
normalization(dge, "upperquartile")
normalization(dge, "RLE")
normalization(dge, "TMM")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment