Commit 6907d0b4 authored by Celine Noirot's avatar Celine Noirot
Browse files

Add GoEnrichment

parent 063d9a19
############################# Enrichment analysis ##############################
################################################################################
### ---------------------Packages
#common packages
packages <- c("optparse", # to read arguments from a command line
"ggplot2", # nice plots
"RJSONIO") # work with JSON file
for(package in packages){
# if package is installed locally, load
if (!(package %in% rownames(installed.packages()))) {
install.packages(package)
}
do.call('library', list(package))
}
#Bioconductor package
if (!("topGO" %in% rownames(installed.packages()))) {
source("http://www.bioconductor.org/biocLite.R")
biocLite("topGO", suppressUpdates = TRUE, suppressAutoUpdate = TRUE, ask = FALSE)
}
library(topGO) # GO enrichment
### ---------------------Parameters
#############
###### Parameter initialization when the script is launched from command line
############
option_list = list(
make_option(c("-f", "--file"), type = "character", default = NULL,
help = "geneID2GO file: tabulated matrix with the correspondence
gene to GO", metavar = "character"),
make_option(c("--fileFormat"), type = "character", default = "topGO",
help = "two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
\n gene_id2 GO_3,...) or twoColumns (only 2 columns
#gene_id GO_id)", metavar = "character"),
make_option(c("-i", "--interest"), type = "character", default = NULL,
help = "matrix with id of genes of interest in the first column
(all these genes must be in the geneID2GO file)",
metavar = "character"),
make_option(c("-o", "--out"), type = "character", default = NULL,
help = "folder path where results are stored",
metavar = "character"),
make_option(c("-t", "--test"), type = "character", default = NULL,
help = "test used to find significant GO ('fisher' or 'KS'
Kolmogorov-Smirnov)", metavar = "character"),
make_option(c("-a", "--algorithm"), type = "character", default = NULL,
help = "algorithm used to correct p-value ('classic', 'elim'
or 'weight')", metavar = "character"),
make_option(c("--paramAlgo"), type = "character", default = NULL,
help = "- for 'classic' algorithm: method used to correct
p-values ('bonferroni' or 'BH') [default=BH]
- for 'elim' algorithm: value used to remove genes from
significant GO [default=0.01]
- for 'weight' algorithm: weighted function used
('ratio' or 'log') [default=ratio] ",
metavar = "character"),
make_option(c("--alpha"), type = "character", default = 0.05,
help = "significance level of the tests [default=%default]",
metavar = "character"),
make_option(c("--target"), type = "character", default = "gene",
help = "type of data [default=%default]",
metavar = "character")
);
opt_parser = OptionParser(usage="Perform an enrichment analysis for
Gene Ontology (GO)", option_list = option_list);
opt = parse_args(opt_parser);
##############
## Parameter initialization when the script is launched from R or Rstudio
##############
#opt=list()
#opt$file <- XXX # for example "C:/My documents/raw_count.csv"
#opt$fileFormat <- "topGO" # "topGO" or "twoColumns"
#opt$interest <- XXX # for example "C:/My documents/interest.csv"
#opt$out <- XXX # for example "C:/My documents/results"
#opt$test <- XXX # "fisher" or "KS"
#opt$algorithm <- XXX # ('classic', 'elim' or 'weight')
#opt$paramAlgo <- XXX # ('bonferroni' or 'BH' for classic, 0.01 for elim and 'ratio' or 'log' for weight)
#opt$alpha <- 0.05 # for example 0.05
#opt$target <- "contigs" # for example contigs, gene...
# table with genes and GO, genes of interest, test and algorithm are needed to
#continue
if (is.null(opt$file) || is.null(opt$interest) || is.null(opt$test) ||
is.null(opt$algorithm)) {
print_help(opt_parser)
stop("Table with genes and GO, genes of interest, test or algorithm missing.\n",
call. = FALSE)
}
#Create output folder if it doesn't exist
ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE)
#Stop the script if files doesn't exist
if (!file.exists(normalizePath(opt$file))) {
print_help(opt_parser)
stop("table with genes and GO doesn't exist.\n",
call. = FALSE)
}
if(is.null(opt$paramAlgo)){
if (opt$algorithm == "classic"){
opt$paramAlgo <- "BH"
} else if (opt$algorithm == "elim"){
opt$paramAlgo <- 0.01
} else if (opt$algorithm == "weight"){
opt$paramAlgo <- "ratio"
}
}
if(opt$algorithm == "classic" & !(opt$paramAlgo == "bonferroni" | opt$paramAlgo == "BH")){
stop("paramAlgo need to be equals to 'bonferroni' or 'BH'\n",
call. = FALSE)
}
if(opt$algorithm == "classic" & !(opt$paramAlgo == "bonferroni" | opt$paramAlgo == "BH")){
stop("paramAlgo need to be equals to 'bonferroni' or 'BH'\n",
call. = FALSE)
}
if(opt$algorithm == "elim"){
opt$paramAlgo <- as.numeric(opt$paramAlgo)
if(is.na(opt$paramAlgo)){
stop("paramAlgo need to be a number\n",
call. = FALSE)
}
if(opt$paramAlgo > 1){
stop("paramAlgo need to be inferior to 1\n",
call. = FALSE)
}
}
if(opt$algorithm == "weight" & !(opt$paramAlgo == "ratio" | opt$paramAlgo == "log")){
stop("paramAlgo need to be equals to 'ratio' or 'log'\n",
call. = FALSE)
}
#####################################################
### ---------------------Input data and preprocessing
#####################################################
## read table with all genes
if (opt$fileFormat == "topGO"){
geneID2GO <- readMappings(normalizePath(opt$file), sep = "\t", IDsep = ",")
} else {
go <- read.table(normalizePath(opt$file), header = TRUE, sep = "\t",
comment.char="")
# obtains a list of GO by genes for all genes (mandatory for topGO package)
geneID2GO <- list()
for (i in unique(go[, 1])){
gene <- go[go[,1] == i,]
listGO <- vector()
for (j in 1:nrow(gene)){
listGO <- c(listGO, as.character(gene[j, 2]))
}
geneID2GO <- c(geneID2GO, list(listGO))
}
names(geneID2GO) <- unique(go[, 1])
}
## read table with genes of interest
genesOfInterest <- read.table(normalizePath(opt$interest), header = TRUE, sep = "\t",
comment.char="")
genesOfInterest <- genesOfInterest[,1]
#names of all genes
geneUniverse <- names(geneID2GO)
# =1 if gene is a gene of interest
geneList <- factor(as.integer(geneUniverse %in% genesOfInterest))
names(geneList) <- geneUniverse
#gene in the list of interest but not in the analysis
notInAnalysis <- genesOfInterest[!(genesOfInterest %in% geneUniverse)]
######################### Functions
########## get name/definition of a GO (hidden function in topGO)
getTermsDefinition <- function(whichTerms, ontology, numChar = 20,
multipLines = FALSE) {
qTerms <- paste(paste("'", whichTerms, "'", sep = ""), collapse = ",")
retVal <- dbGetQuery(GO_dbconn(),
paste("SELECT term, go_id FROM go_term WHERE ontology IN",
"('", ontology, "') AND go_id IN (",
qTerms, ");", sep = ""))
termsNames <- retVal$term
names(termsNames) <- retVal$go_id
if(!multipLines)
shortNames <- paste(substr(termsNames, 1, numChar),
ifelse(nchar(termsNames) > numChar, '...', ''), sep = '')
else
shortNames <- sapply(termsNames,
function(x) {
a <- strwrap(x, numChar)
return(paste(a, sep = "", collapse = "\\\n"))
})
names(shortNames) <- names(termsNames)
return(shortNames[whichTerms])
}
########## find GO at levels 2 for ONE ontology
nbLevels2 <- function(geneList, ontology){
myGOdata <- NULL
try(myGOdata <- new("topGOdata", description="My project", ontology=ontology,
allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO),
silent = TRUE)
count <- 0
if (!is.null(myGOdata)){
# calculate levels for each GO in our data and his ancestors
levels <- buildLevels(myGOdata@graph)
# select only GO at level 2
goLevels2 <- mget(ls(levels$nodes2level), envir=levels$nodes2level)==2
goLevels2 <- names(goLevels2)[goLevels2]
# count genes in each GO
count <- NULL
for (i in goLevels2){
count <- c(count,countGenesInTerm(myGOdata, i))
}
}
return(count)
}
########## obtain percent of total genes for each level 2 GO
percentOfGene <- function(geneList, ontology, DefOntology){
## Not genes of interest
All <- nbLevels2(geneList, ontology)/length(geneList)
## genes of interest
geneListGoI <- geneList[geneList==1]
GoI <- nbLevels2(geneListGoI, ontology)/length(geneListGoI)
## percent of gene by GO
count <- data.frame(name=getTermsDefinition(names(All), ontology,
numChar = 40),
nbGene=as.vector(All), GoI="All genes",
ontology=DefOntology)
if (GoI != 0){
count <- rbind(count,
data.frame(name=getTermsDefinition(names(GoI), ontology,
numChar = 40),
nbGene=as.vector(GoI), GoI="Genes of interest",
ontology=DefOntology))
}
## to obtain the same list of GO between GoI and not GoI, we add line with a percent
## of genes equal to 0
if(length(count$name[!(count$name[count$GoI == "All genes"] %in%
count$name[count$GoI == "Genes of interest"])]) > 0){
count<-rbind(count,
data.frame(name=count$name[!(count$name[count$GoI == "All genes"]
%in% count$name[count$GoI == "Genes of interest"])],
nbGene=0, GoI="Genes of interest", ontology=DefOntology))
}
if(length(count$name[!(count$name[count$GoI == "Genes of interest"] %in%
count$name[count$GoI == "All genes"])]) > 0){
count<-rbind(count,
data.frame(name=count$name[!(count$name[count$GoI == "Genes of interest"]
%in% count$name[count$GoI == "All genes"])],
nbGene=0, GoI="All genes", ontology=DefOntology))
}
return(count)
}
######################### Barplot
countBP <- percentOfGene(geneList, "BP", "Biological Process")
countMF <- percentOfGene(geneList, "MF", "Molecular Function")
countCC <- percentOfGene(geneList, "CC", "Cellular Component")
count <- rbind(countBP, countMF, countCC)
count$GoI <- factor(count$GoI, levels = rev(levels(count$GoI)))
ggplot(count, aes(x=as.factor(name), y=nbGene, fill=as.factor(GoI))) +
geom_bar(stat="identity", position=position_dodge()) +
facet_grid( ~ ontology, scales="free_x") +
theme(axis.text.x = element_text(angle = 70, vjust = 1, hjust=1, colour = "black")) +
scale_fill_discrete(name = "") + ylab(paste0("Percent of ", opt$target)) +
xlab("GO term")
ggsave(filename = normalizePath(file.path(opt$out, "barplot.svg"),
mustWork = FALSE), width = 11, height = 7)
write.table(count, file = normalizePath(file.path(opt$out,
paste0("barplot.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
#################### Tests
GOgeneSig <- NULL
for(ontology in c("BP", "MF", "CC")){
## GO object
myGOdata <- new("topGOdata", description="My project", ontology=ontology,
allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
## Test and p-value correction if algorithm = 'classic'
resultTest <- NULL
if (opt$algorithm == "classic"){
resultTest <- runTest(myGOdata, algorithm = opt$algorithm, statistic = opt$test)
resultTest@score <- p.adjust(resultTest@score, method = opt$paramAlgo)
} else if (opt$algorithm == "elim"){
resultTest <- runTest(myGOdata, algorithm = opt$algorithm, statistic = opt$test,
cutOff = as.numeric(opt$paramAlgo))
} else if (opt$algorithm == "weight"){
resultTest <- runTest(myGOdata, algorithm = opt$algorithm, statistic = opt$test,
sigRatio = opt$paramAlgo)
}
sigRes <- resultTest@score[resultTest@score <= opt$alpha]
if (!is.na(sigRes[1])){
svg(normalizePath(file.path(opt$out, paste0(ontology, "_DAG.svg")),
mustWork = FALSE))
showSigOfNodes(myGOdata, score(resultTest), firstSigNodes = 10, useInfo = 'all')
dev.off()
## Table with significant GOs and genes
for (i in 1:length(sigRes)){
add <- data.frame(ontology = ontology, GO = names(sigRes)[i],
GODef = as.character(getTermsDefinition(names(sigRes)[i],
ontology, numChar = 40)),
gene = genesInTerm(myGOdata, names(sigRes)[i])[[1]],
pval = as.numeric(sigRes[i]))
GOgeneSig <- rbind(GOgeneSig, add)
}
}
}
if(!is.null(GOgeneSig)){
names(GOgeneSig)[4] <- opt$target
}
write.table(GOgeneSig, file = normalizePath(file.path(opt$out,
paste0("GOgeneSig.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
write.table(unique(GOgeneSig[,c("ontology", "GO", "GODef", "pval")]),
file = normalizePath(file.path(opt$out, paste0("GOSig.csv")),
mustWork = FALSE), sep = "\t",
row.names = FALSE, quote = FALSE)
## JSON
stat <- list("notInTheAnalysis" = ifelse(!is.na(notInAnalysis[1]), as.character(notInAnalysis), "none"))
write(gsub(" ", "", toJSON(list(stat=stat), collapse = "")),
file = normalizePath(file.path(opt$out, "results.json"),
mustWork = FALSE))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment