Commit 000aeca3 authored by Celine Noirot's avatar Celine Noirot
Browse files

Debug new parameters and add package svglite

parent 80056632
This diff is collapsed.
......@@ -4,7 +4,7 @@
##DATE = "03/2016"
##AUTHORS = "Gaëlle Lefort"
##KEYWORDS = "RNAseq gene ontology barplot"
##DESCRIPTION = "GOEnrichment.R carries out a barplot to compare gene ontology between multiple set of files."
##DESCRIPTION = "GOEnrichment.R carries out a barplot to compare gene ontology between multiple set of files."
############################# Enrichment analysis ##############################
......@@ -14,7 +14,8 @@
#common packages
packages <- c("optparse", # to read arguments from a command line
"ggplot2", # nice plots
"RJSONIO") # work with JSON file
"RJSONIO", # work with JSON file
"DBI")
for(package in packages){
# if package is installed locally, load
......@@ -22,7 +23,7 @@ for(package in packages){
install.packages(package)
}
do.call('library', list(package))
}
}
#Bioconductor package
if (!("topGO" %in% rownames(installed.packages()))) {
......@@ -38,19 +39,19 @@ library(topGO) # GO enrichment
###### Parameter initialization when the script is launched from command line
############
option_list = list(
make_option(c("-f", "--files"), type = "character", default = NULL,
help = "geneID2GO files: list of tabulated matrix with the correspondence
make_option(c("-f", "--files"), type = "character", default = NULL,
help = "geneID2GO files: list of tabulated matrix with the correspondence
gene to GO separated by ','", metavar = "character"),
make_option(c("-n", "--names"), type = "character", default = NULL,
make_option(c("-n", "--names"), type = "character", default = NULL,
help = "names of the samples (same order than files)", metavar = "character"),
make_option(c("--fileFormat"), type = "character", default = "topGO",
help = "two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
make_option(c("--fileFormat"), type = "character", default = "topGO",
help = "two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
\n gene_id2 GO_3,...) or twoColumns (only 2 columns
#gene_id GO_id)", metavar = "character"),
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
metavar = "character")
);
);
opt_parser = OptionParser(usage="Carries out a barplot to compare gene ontology between multiple set of files", option_list = option_list);
opt = parse_args(opt_parser);
......@@ -66,11 +67,11 @@ opt = parse_args(opt_parser);
#opt$out <- XXX # for example "C:/My documents/results"
# table with genes and GO, genes of interest, test and algorithm are needed to
# table with genes and GO, genes of interest, test and algorithm are needed to
#continue
if (is.null(opt$files)) {
print_help(opt_parser)
stop("Tables with genes and GO missing.\n",
stop("Tables with genes and GO missing.\n",
call. = FALSE)
}
......@@ -83,7 +84,7 @@ ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE)
### ---------------------Input data and preprocessing
#####################################################
## read tables with all genes
l <- unlist(strsplit(opt$files, ','))
l <- unlist(strsplit(opt$files, ','))
opt$names <- unlist(strsplit(opt$names, ','))
......@@ -92,50 +93,50 @@ if (opt$fileFormat == "topGO"){
for(i in seq_along(l)){
geneID2GO[[i]] <- readMappings(normalizePath(l[i]), sep = "\t", IDsep = ",")
}
} else {
for(i in seq_along(l)){
go <- read.table(normalizePath(l[i]), header = TRUE, sep = "\t",
go <- read.table(normalizePath(l[i]), header = TRUE, sep = "\t",
comment.char="")
# obtains a list of GO by genes for all genes (mandatory for topGO package)
geneID2GO[[i]] <- list()
for (i in unique(go[, 1])){
gene <- go[go[,1] == i,]
for (x in unique(go[, 1])){
gene <- go[go[,1] == x,]
listGO <- vector()
for (j in 1:nrow(gene)){
listGO <- c(listGO, as.character(gene[j, 2]))
}
geneID2GO[[i]] <- c(geneID2GO[[i]], list(listGO))
}
names(geneID2GO[[i]]) <- unique(go[, 1])
}
}
}
geneList <- list()
for(i in seq_along(geneID2GO)){
geneList[[i]] <- factor(rep(1, length(geneID2GO[[i]])))
levels(geneList[[i]]) <- c(levels(geneList[[i]]),0)
names(geneList[[i]]) <- names(geneID2GO[[i]])
names(geneList[[i]]) <- names(geneID2GO[[i]])
}
######################### Functions
########## get name/definition of a GO (hidden function in topGO)
getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
multipLines = FALSE) {
qTerms <- paste(paste("'", whichTerms, "'", sep = ""), collapse = ",")
retVal <- dbGetQuery(GO_dbconn(),
retVal <- dbGetQuery(GO_dbconn(),
paste("SELECT term, go_id FROM go_term WHERE ontology IN",
"('", ontology, "') AND go_id IN (",
"('", ontology, "') AND go_id IN (",
qTerms, ");", sep = ""))
termsNames <- retVal$term
names(termsNames) <- retVal$go_id
if(!multipLines)
if(!multipLines)
shortNames <- paste(substr(termsNames, 1, numChar),
ifelse(nchar(termsNames) > numChar, '...', ''), sep = '')
else
......@@ -144,7 +145,7 @@ getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
a <- strwrap(x, numChar)
return(paste(a, sep = "", collapse = "\\\n"))
})
names(shortNames) <- names(termsNames)
return(shortNames[whichTerms])
}
......@@ -152,27 +153,27 @@ getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
########## find GO at levels 2 for ONE ontology
nbLevels2 <- function(geneListI, ontology, geneID2GO){
myGOdata <- NULL
try(myGOdata <- new("topGOdata", description="My project", ontology=ontology,
try(myGOdata <- new("topGOdata", description="My project", ontology=ontology,
allGenes=geneListI, annot = annFUN.gene2GO, gene2GO = geneID2GO),
silent = TRUE)
count <- 0
if (!is.null(myGOdata)){
# calculate levels for each GO in our data and his ancestors
levels <- buildLevels(myGOdata@graph)
# select only GO at level 2
goLevels2 <- mget(ls(levels$nodes2level), envir=levels$nodes2level)==2
goLevels2 <- names(goLevels2)[goLevels2]
# count genes in each GO
count <- NULL
for (i in goLevels2){
count <- c(count,countGenesInTerm(myGOdata, i))
}
}
return(count)
}
......@@ -181,29 +182,29 @@ nbLevels2 <- function(geneListI, ontology, geneID2GO){
percentOfGene <- function(geneList, ontology, DefOntology){
prop <- list()
count <- NULL
#create a table with GO names and proportion of genes
for(i in seq_along(geneList)){
prop[[i]] <- nbLevels2(geneList[[i]], ontology, geneID2GO[[i]])/length(geneList[[i]])
count <- rbind(count,data.frame(name=getTermsDefinition(names(prop[[i]]), ontology,
numChar = 40),
nbGene=as.vector(prop[[i]]), sample=opt$names[i],
count <- rbind(count,data.frame(name=getTermsDefinition(names(prop[[i]]), ontology,
numChar = 40),
nbGene=as.vector(prop[[i]]), sample=opt$names[i],
ontology=DefOntology))
}
# table with all GO
all <- data.frame(name = rep(unique(count$name), length(names)),
nbGene = 0, sample = rep(opt$names,
all <- data.frame(name = rep(unique(count$name), length(names)),
nbGene = 0, sample = rep(opt$names,
each = length(unique(count$name))),
ontology=DefOntology)
#only GO-name not in count
not_in_count <- all[!(paste0(all$name,".", all$sample) %in%
not_in_count <- all[!(paste0(all$name,".", all$sample) %in%
paste0(count$name,".", count$sample)),]
count <- rbind(count, not_in_count)
return(count)
}
......@@ -221,15 +222,12 @@ ggplot(count, aes(x=as.factor(name), y=nbGene, fill=sample)) +
geom_bar(stat="identity", position=position_dodge()) +
facet_grid( ~ ontology, scales="free_x") +
theme(axis.text.x = element_text(angle = 70, vjust = 1, hjust=1, colour = "black")) +
scale_fill_discrete(name = "") + ylab(paste0("Percent of ", opt$target)) +
scale_fill_discrete(name = "") + ylab(paste0("Percent of ", opt$target)) +
xlab("GO term")
ggsave(filename = normalizePath(file.path(opt$out, "barplot.svg"),
ggsave(filename = normalizePath(file.path(opt$out, "barplot.svg"),
mustWork = FALSE), width = 11, height = 7)
write.table(count, file = normalizePath(file.path(opt$out,
paste0("barplot.csv")),
mustWork = FALSE), sep = "\t",
write.table(count, file = normalizePath(file.path(opt$out,
paste0("barplot.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
......@@ -4,7 +4,7 @@
##DATE = "03/2016"
##AUTHORS = "Gaëlle Lefort"
##KEYWORDS = "RNAseq GO enrichment"
##DESCRIPTION = "GOEnrichment.R perform an enrichment analysis for Gene Ontology (GO)."
##DESCRIPTION = "GOEnrichment.R perform an enrichment analysis for Gene Ontology (GO)."
############################# Enrichment analysis ##############################
......@@ -14,7 +14,8 @@
#common packages
packages <- c("optparse", # to read arguments from a command line
"ggplot2", # nice plots
"RJSONIO") # work with JSON file
"RJSONIO", # work with JSON file
"DBI")
for(package in packages){
# if package is installed locally, load
......@@ -22,7 +23,7 @@ for(package in packages){
install.packages(package)
}
do.call('library', list(package))
}
}
#Bioconductor package
if (!("topGO" %in% rownames(installed.packages()))) {
......@@ -38,43 +39,43 @@ library(topGO) # GO enrichment
###### Parameter initialization when the script is launched from command line
############
option_list = list(
make_option(c("-f", "--file"), type = "character", default = NULL,
help = "geneID2GO file: tabulated matrix with the correspondence
make_option(c("-f", "--file"), type = "character", default = NULL,
help = "geneID2GO file: tabulated matrix with the correspondence
gene to GO", metavar = "character"),
make_option(c("--fileFormat"), type = "character", default = "topGO",
help = "two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
make_option(c("--fileFormat"), type = "character", default = "topGO",
help = "two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
\n gene_id2 GO_3,...) or twoColumns (only 2 columns
#gene_id GO_id)", metavar = "character"),
make_option(c("-i", "--interest"), type = "character", default = NULL,
help = "matrix with id of genes of interest in the first column
(all these genes must be in the geneID2GO file)",
make_option(c("-i", "--interest"), type = "character", default = NULL,
help = "matrix with id of genes of interest in the first column
(all these genes must be in the geneID2GO file)",
metavar = "character"),
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
metavar = "character"),
make_option(c("-t", "--test"), type = "character", default = NULL,
help = "test used to find significant GO ('fisher' or 'KS'
make_option(c("-t", "--test"), type = "character", default = NULL,
help = "test used to find significant GO ('fisher' or 'KS'
Kolmogorov-Smirnov)", metavar = "character"),
make_option(c("-a", "--algorithm"), type = "character", default = NULL,
help = "algorithm used to correct p-value ('classic', 'elim'
make_option(c("-a", "--algorithm"), type = "character", default = NULL,
help = "algorithm used to correct p-value ('classic', 'elim'
or 'weight')", metavar = "character"),
make_option(c("--paramAlgo"), type = "character", default = NULL,
make_option(c("--paramAlgo"), type = "character", default = NULL,
help = "- for 'classic' algorithm: method used to correct
p-values ('bonferroni' or 'BH') [default=BH]
- for 'elim' algorithm: value used to remove genes from
significant GO [default=0.01]
- for 'weight' algorithm: weighted function used
('ratio' or 'log') [default=ratio] ",
('ratio' or 'log') [default=ratio] ",
metavar = "character"),
make_option(c("--alpha"), type = "character", default = 0.05,
help = "significance level of the tests [default=%default]",
make_option(c("--alpha"), type = "character", default = 0.05,
help = "significance level of the tests [default=%default]",
metavar = "character"),
make_option(c("--target"), type = "character", default = "gene",
help = "type of data [default=%default]",
make_option(c("--target"), type = "character", default = "gene",
help = "type of data [default=%default]",
metavar = "character")
);
);
opt_parser = OptionParser(usage="Perform an enrichment analysis for
opt_parser = OptionParser(usage="Perform an enrichment analysis for
Gene Ontology (GO)", option_list = option_list);
opt = parse_args(opt_parser);
......@@ -94,12 +95,12 @@ opt = parse_args(opt_parser);
#opt$target <- "contigs" # for example contigs, gene...
# table with genes and GO, genes of interest, test and algorithm are needed to
# table with genes and GO, genes of interest, test and algorithm are needed to
#continue
if (is.null(opt$file) || is.null(opt$interest) || is.null(opt$test) ||
if (is.null(opt$file) || is.null(opt$interest) || is.null(opt$test) ||
is.null(opt$algorithm)) {
print_help(opt_parser)
stop("Table with genes and GO, genes of interest, test or algorithm missing.\n",
stop("Table with genes and GO, genes of interest, test or algorithm missing.\n",
call. = FALSE)
}
......@@ -109,7 +110,7 @@ ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE)
#Stop the script if files doesn't exist
if (!file.exists(normalizePath(opt$file))) {
print_help(opt_parser)
stop("table with genes and GO doesn't exist.\n",
stop("table with genes and GO doesn't exist.\n",
call. = FALSE)
}
......@@ -124,12 +125,12 @@ if(is.null(opt$paramAlgo)){
}
if(opt$algorithm == "classic" & !(opt$paramAlgo == "bonferroni" | opt$paramAlgo == "BH")){
stop("paramAlgo need to be equals to 'bonferroni' or 'BH'\n",
stop("paramAlgo need to be equals to 'bonferroni' or 'BH'\n",
call. = FALSE)
}
if(opt$algorithm == "classic" & !(opt$paramAlgo == "bonferroni" | opt$paramAlgo == "BH")){
stop("paramAlgo need to be equals to 'bonferroni' or 'BH'\n",
stop("paramAlgo need to be equals to 'bonferroni' or 'BH'\n",
call. = FALSE)
}
......@@ -137,17 +138,17 @@ if(opt$algorithm == "classic" & !(opt$paramAlgo == "bonferroni" | opt$paramAlgo
if(opt$algorithm == "elim"){
opt$paramAlgo <- as.numeric(opt$paramAlgo)
if(is.na(opt$paramAlgo)){
stop("paramAlgo need to be a number\n",
stop("paramAlgo need to be a number\n",
call. = FALSE)
}
if(opt$paramAlgo > 1){
stop("paramAlgo need to be inferior to 1\n",
stop("paramAlgo need to be inferior to 1\n",
call. = FALSE)
}
}
if(opt$algorithm == "weight" & !(opt$paramAlgo == "ratio" | opt$paramAlgo == "log")){
stop("paramAlgo need to be equals to 'ratio' or 'log'\n",
stop("paramAlgo need to be equals to 'ratio' or 'log'\n",
call. = FALSE)
}
......@@ -159,7 +160,7 @@ if(opt$algorithm == "weight" & !(opt$paramAlgo == "ratio" | opt$paramAlgo == "lo
if (opt$fileFormat == "topGO"){
geneID2GO <- readMappings(normalizePath(opt$file), sep = "\t", IDsep = ",")
} else {
go <- read.table(normalizePath(opt$file), header = TRUE, sep = "\t",
go <- read.table(normalizePath(opt$file), header = TRUE, sep = "\t",
comment.char="")
# obtains a list of GO by genes for all genes (mandatory for topGO package)
geneID2GO <- list()
......@@ -171,19 +172,19 @@ if (opt$fileFormat == "topGO"){
}
geneID2GO <- c(geneID2GO, list(listGO))
}
names(geneID2GO) <- unique(go[, 1])
}
## read table with genes of interest
genesOfInterest <- read.table(normalizePath(opt$interest), header = TRUE, sep = "\t",
genesOfInterest <- read.table(normalizePath(opt$interest), header = TRUE, sep = "\t",
comment.char="")
genesOfInterest <- genesOfInterest[,1]
#names of all genes
geneUniverse <- names(geneID2GO)
geneUniverse <- names(geneID2GO)
# =1 if gene is a gene of interest
geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
......@@ -196,19 +197,19 @@ notInAnalysis <- genesOfInterest[!(genesOfInterest %in% geneUniverse)]
######################### Functions
########## get name/definition of a GO (hidden function in topGO)
getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
multipLines = FALSE) {
qTerms <- paste(paste("'", whichTerms, "'", sep = ""), collapse = ",")
retVal <- dbGetQuery(GO_dbconn(),
retVal <- dbGetQuery(GO_dbconn(),
paste("SELECT term, go_id FROM go_term WHERE ontology IN",
"('", ontology, "') AND go_id IN (",
"('", ontology, "') AND go_id IN (",
qTerms, ");", sep = ""))
termsNames <- retVal$term
names(termsNames) <- retVal$go_id
if(!multipLines)
if(!multipLines)
shortNames <- paste(substr(termsNames, 1, numChar),
ifelse(nchar(termsNames) > numChar, '...', ''), sep = '')
else
......@@ -217,7 +218,7 @@ getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
a <- strwrap(x, numChar)
return(paste(a, sep = "", collapse = "\\\n"))
})
names(shortNames) <- names(termsNames)
return(shortNames[whichTerms])
}
......@@ -225,27 +226,27 @@ getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
########## find GO at levels 2 for ONE ontology
nbLevels2 <- function(geneList, ontology){
myGOdata <- NULL
try(myGOdata <- new("topGOdata", description="My project", ontology=ontology,
try(myGOdata <- new("topGOdata", description="My project", ontology=ontology,
allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO),
silent = TRUE)
count <- 0
if (!is.null(myGOdata)){
# calculate levels for each GO in our data and his ancestors
levels <- buildLevels(myGOdata@graph)
# select only GO at level 2
goLevels2 <- mget(ls(levels$nodes2level), envir=levels$nodes2level)==2
goLevels2 <- names(goLevels2)[goLevels2]
# count genes in each GO
count <- NULL
for (i in goLevels2){
count <- c(count,countGenesInTerm(myGOdata, i))
}
}
return(count)
}
......@@ -254,43 +255,43 @@ nbLevels2 <- function(geneList, ontology){
percentOfGene <- function(geneList, ontology, DefOntology){
## Not genes of interest
All <- nbLevels2(geneList, ontology)/length(geneList)
## genes of interest
geneListGoI <- geneList[geneList==1]
GoI <- nbLevels2(geneListGoI, ontology)/length(geneListGoI)
## percent of gene by GO
count <- data.frame(name=getTermsDefinition(names(All), ontology,
numChar = 40),
nbGene=as.vector(All), GoI="All genes",
count <- data.frame(name=getTermsDefinition(names(All), ontology,
numChar = 40),
nbGene=as.vector(All), GoI="All genes",
ontology=DefOntology)
if (GoI != 0){
count <- rbind(count,
data.frame(name=getTermsDefinition(names(GoI), ontology,
numChar = 40),
nbGene=as.vector(GoI), GoI="Genes of interest",
data.frame(name=getTermsDefinition(names(GoI), ontology,
numChar = 40),
nbGene=as.vector(GoI), GoI="Genes of interest",
ontology=DefOntology))
}
## to obtain the same list of GO between GoI and not GoI, we add line with a percent
## of genes equal to 0
if(length(count$name[!(count$name[count$GoI == "All genes"] %in%
if(length(count$name[!(count$name[count$GoI == "All genes"] %in%
count$name[count$GoI == "Genes of interest"])]) > 0){
count<-rbind(count,
data.frame(name=count$name[!(count$name[count$GoI == "All genes"]
%in% count$name[count$GoI == "Genes of interest"])],
nbGene=0, GoI="Genes of interest", ontology=DefOntology))
}
if(length(count$name[!(count$name[count$GoI == "Genes of interest"] %in%
if(length(count$name[!(count$name[count$GoI == "Genes of interest"] %in%
count$name[count$GoI == "All genes"])]) > 0){
count<-rbind(count,
data.frame(name=count$name[!(count$name[count$GoI == "Genes of interest"]
data.frame(name=count$name[!(count$name[count$GoI == "Genes of interest"]
%in% count$name[count$GoI == "All genes"])],
nbGene=0, GoI="All genes", ontology=DefOntology))
}
return(count)
}
......@@ -309,14 +310,14 @@ ggplot(count, aes(x=as.factor(name), y=nbGene, fill=as.factor(GoI))) +
geom_bar(stat="identity", position=position_dodge()) +
facet_grid( ~ ontology, scales="free_x") +
theme(axis.text.x = element_text(angle = 70, vjust = 1, hjust=1, colour = "black")) +
scale_fill_discrete(name = "") + ylab(paste0("Percent of ", opt$target)) +
scale_fill_discrete(name = "") + ylab(paste0("Percent of ", opt$target)) +
xlab("GO term")
ggsave(filename = normalizePath(file.path(opt$out, "barplot.svg"),
ggsave(filename = normalizePath(file.path(opt$out, "barplot.svg"),
mustWork = FALSE), width = 11, height = 7)
write.table(count, file = normalizePath(file.path(opt$out,
paste0("barplot.csv")),
mustWork = FALSE), sep = "\t",
write.table(count, file = normalizePath(file.path(opt$out,
paste0("barplot.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
......@@ -326,56 +327,56 @@ write.table(count, file = normalizePath(file.path(opt$out,
GOgeneSig <- NULL
for(ontology in c("BP", "MF", "CC")){
## GO object
myGOdata <- new("topGOdata", description="My project", ontology=ontology,
myGOdata <- new("topGOdata", description="My project", ontology=ontology,
allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
## Test and p-value correction if algorithm = 'classic'
resultTest <- NULL
if (opt$algorithm == "classic"){
resultTest <- runTest(myGOdata, algorithm = opt$algorithm, statistic = opt$test)
resultTest@score <- p.adjust(resultTest@score, method = opt$paramAlgo)
} else if (opt$algorithm == "elim"){
resultTest <- runTest(myGOdata, algorithm = opt$algorithm, statistic = opt$test,
resultTest <- runTest(myGOdata, algorithm = opt$algorithm, statistic = opt$test,