Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
bios4biol
bioinfo-utils
Commits
6907d0b4
Commit
6907d0b4
authored
Mar 14, 2016
by
Celine Noirot
Browse files
Add GoEnrichment
parent
063d9a19
Changes
1
Hide whitespace changes
Inline
Side-by-side
bin/GOEnrichment.R
0 → 100644
View file @
6907d0b4
############################# Enrichment analysis ##############################
################################################################################
### ---------------------Packages
#common packages
packages
<-
c
(
"optparse"
,
# to read arguments from a command line
"ggplot2"
,
# nice plots
"RJSONIO"
)
# work with JSON file
for
(
package
in
packages
){
# if package is installed locally, load
if
(
!
(
package
%in%
rownames
(
installed.packages
())))
{
install.packages
(
package
)
}
do.call
(
'library'
,
list
(
package
))
}
#Bioconductor package
if
(
!
(
"topGO"
%in%
rownames
(
installed.packages
())))
{
source
(
"http://www.bioconductor.org/biocLite.R"
)
biocLite
(
"topGO"
,
suppressUpdates
=
TRUE
,
suppressAutoUpdate
=
TRUE
,
ask
=
FALSE
)
}
library
(
topGO
)
# GO enrichment
### ---------------------Parameters
#############
###### Parameter initialization when the script is launched from command line
############
option_list
=
list
(
make_option
(
c
(
"-f"
,
"--file"
),
type
=
"character"
,
default
=
NULL
,
help
=
"geneID2GO file: tabulated matrix with the correspondence
gene to GO"
,
metavar
=
"character"
),
make_option
(
c
(
"--fileFormat"
),
type
=
"character"
,
default
=
"topGO"
,
help
=
"two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
\n gene_id2 GO_3,...) or twoColumns (only 2 columns
#gene_id GO_id)"
,
metavar
=
"character"
),
make_option
(
c
(
"-i"
,
"--interest"
),
type
=
"character"
,
default
=
NULL
,
help
=
"matrix with id of genes of interest in the first column
(all these genes must be in the geneID2GO file)"
,
metavar
=
"character"
),
make_option
(
c
(
"-o"
,
"--out"
),
type
=
"character"
,
default
=
NULL
,
help
=
"folder path where results are stored"
,
metavar
=
"character"
),
make_option
(
c
(
"-t"
,
"--test"
),
type
=
"character"
,
default
=
NULL
,
help
=
"test used to find significant GO ('fisher' or 'KS'
Kolmogorov-Smirnov)"
,
metavar
=
"character"
),
make_option
(
c
(
"-a"
,
"--algorithm"
),
type
=
"character"
,
default
=
NULL
,
help
=
"algorithm used to correct p-value ('classic', 'elim'
or 'weight')"
,
metavar
=
"character"
),
make_option
(
c
(
"--paramAlgo"
),
type
=
"character"
,
default
=
NULL
,
help
=
"- for 'classic' algorithm: method used to correct
p-values ('bonferroni' or 'BH') [default=BH]
- for 'elim' algorithm: value used to remove genes from
significant GO [default=0.01]
- for 'weight' algorithm: weighted function used
('ratio' or 'log') [default=ratio] "
,
metavar
=
"character"
),
make_option
(
c
(
"--alpha"
),
type
=
"character"
,
default
=
0.05
,
help
=
"significance level of the tests [default=%default]"
,
metavar
=
"character"
),
make_option
(
c
(
"--target"
),
type
=
"character"
,
default
=
"gene"
,
help
=
"type of data [default=%default]"
,
metavar
=
"character"
)
);
opt_parser
=
OptionParser
(
usage
=
"Perform an enrichment analysis for
Gene Ontology (GO)"
,
option_list
=
option_list
);
opt
=
parse_args
(
opt_parser
);
##############
## Parameter initialization when the script is launched from R or Rstudio
##############
#opt=list()
#opt$file <- XXX # for example "C:/My documents/raw_count.csv"
#opt$fileFormat <- "topGO" # "topGO" or "twoColumns"
#opt$interest <- XXX # for example "C:/My documents/interest.csv"
#opt$out <- XXX # for example "C:/My documents/results"
#opt$test <- XXX # "fisher" or "KS"
#opt$algorithm <- XXX # ('classic', 'elim' or 'weight')
#opt$paramAlgo <- XXX # ('bonferroni' or 'BH' for classic, 0.01 for elim and 'ratio' or 'log' for weight)
#opt$alpha <- 0.05 # for example 0.05
#opt$target <- "contigs" # for example contigs, gene...
# table with genes and GO, genes of interest, test and algorithm are needed to
#continue
if
(
is.null
(
opt
$
file
)
||
is.null
(
opt
$
interest
)
||
is.null
(
opt
$
test
)
||
is.null
(
opt
$
algorithm
))
{
print_help
(
opt_parser
)
stop
(
"Table with genes and GO, genes of interest, test or algorithm missing.\n"
,
call.
=
FALSE
)
}
#Create output folder if it doesn't exist
ifelse
(
!
dir.exists
(
opt
$
out
),
dir.create
(
opt
$
out
),
FALSE
)
#Stop the script if files doesn't exist
if
(
!
file.exists
(
normalizePath
(
opt
$
file
)))
{
print_help
(
opt_parser
)
stop
(
"table with genes and GO doesn't exist.\n"
,
call.
=
FALSE
)
}
if
(
is.null
(
opt
$
paramAlgo
)){
if
(
opt
$
algorithm
==
"classic"
){
opt
$
paramAlgo
<-
"BH"
}
else
if
(
opt
$
algorithm
==
"elim"
){
opt
$
paramAlgo
<-
0.01
}
else
if
(
opt
$
algorithm
==
"weight"
){
opt
$
paramAlgo
<-
"ratio"
}
}
if
(
opt
$
algorithm
==
"classic"
&
!
(
opt
$
paramAlgo
==
"bonferroni"
|
opt
$
paramAlgo
==
"BH"
)){
stop
(
"paramAlgo need to be equals to 'bonferroni' or 'BH'\n"
,
call.
=
FALSE
)
}
if
(
opt
$
algorithm
==
"classic"
&
!
(
opt
$
paramAlgo
==
"bonferroni"
|
opt
$
paramAlgo
==
"BH"
)){
stop
(
"paramAlgo need to be equals to 'bonferroni' or 'BH'\n"
,
call.
=
FALSE
)
}
if
(
opt
$
algorithm
==
"elim"
){
opt
$
paramAlgo
<-
as.numeric
(
opt
$
paramAlgo
)
if
(
is.na
(
opt
$
paramAlgo
)){
stop
(
"paramAlgo need to be a number\n"
,
call.
=
FALSE
)
}
if
(
opt
$
paramAlgo
>
1
){
stop
(
"paramAlgo need to be inferior to 1\n"
,
call.
=
FALSE
)
}
}
if
(
opt
$
algorithm
==
"weight"
&
!
(
opt
$
paramAlgo
==
"ratio"
|
opt
$
paramAlgo
==
"log"
)){
stop
(
"paramAlgo need to be equals to 'ratio' or 'log'\n"
,
call.
=
FALSE
)
}
#####################################################
### ---------------------Input data and preprocessing
#####################################################
## read table with all genes
if
(
opt
$
fileFormat
==
"topGO"
){
geneID2GO
<-
readMappings
(
normalizePath
(
opt
$
file
),
sep
=
"\t"
,
IDsep
=
","
)
}
else
{
go
<-
read.table
(
normalizePath
(
opt
$
file
),
header
=
TRUE
,
sep
=
"\t"
,
comment.char
=
""
)
# obtains a list of GO by genes for all genes (mandatory for topGO package)
geneID2GO
<-
list
()
for
(
i
in
unique
(
go
[,
1
])){
gene
<-
go
[
go
[,
1
]
==
i
,]
listGO
<-
vector
()
for
(
j
in
1
:
nrow
(
gene
)){
listGO
<-
c
(
listGO
,
as.character
(
gene
[
j
,
2
]))
}
geneID2GO
<-
c
(
geneID2GO
,
list
(
listGO
))
}
names
(
geneID2GO
)
<-
unique
(
go
[,
1
])
}
## read table with genes of interest
genesOfInterest
<-
read.table
(
normalizePath
(
opt
$
interest
),
header
=
TRUE
,
sep
=
"\t"
,
comment.char
=
""
)
genesOfInterest
<-
genesOfInterest
[,
1
]
#names of all genes
geneUniverse
<-
names
(
geneID2GO
)
# =1 if gene is a gene of interest
geneList
<-
factor
(
as.integer
(
geneUniverse
%in%
genesOfInterest
))
names
(
geneList
)
<-
geneUniverse
#gene in the list of interest but not in the analysis
notInAnalysis
<-
genesOfInterest
[
!
(
genesOfInterest
%in%
geneUniverse
)]
######################### Functions
########## get name/definition of a GO (hidden function in topGO)
getTermsDefinition
<-
function
(
whichTerms
,
ontology
,
numChar
=
20
,
multipLines
=
FALSE
)
{
qTerms
<-
paste
(
paste
(
"'"
,
whichTerms
,
"'"
,
sep
=
""
),
collapse
=
","
)
retVal
<-
dbGetQuery
(
GO_dbconn
(),
paste
(
"SELECT term, go_id FROM go_term WHERE ontology IN"
,
"('"
,
ontology
,
"') AND go_id IN ("
,
qTerms
,
");"
,
sep
=
""
))
termsNames
<-
retVal
$
term
names
(
termsNames
)
<-
retVal
$
go_id
if
(
!
multipLines
)
shortNames
<-
paste
(
substr
(
termsNames
,
1
,
numChar
),
ifelse
(
nchar
(
termsNames
)
>
numChar
,
'...'
,
''
),
sep
=
''
)
else
shortNames
<-
sapply
(
termsNames
,
function
(
x
)
{
a
<-
strwrap
(
x
,
numChar
)
return
(
paste
(
a
,
sep
=
""
,
collapse
=
"\\\n"
))
})
names
(
shortNames
)
<-
names
(
termsNames
)
return
(
shortNames
[
whichTerms
])
}
########## find GO at levels 2 for ONE ontology
nbLevels2
<-
function
(
geneList
,
ontology
){
myGOdata
<-
NULL
try
(
myGOdata
<-
new
(
"topGOdata"
,
description
=
"My project"
,
ontology
=
ontology
,
allGenes
=
geneList
,
annot
=
annFUN.gene2GO
,
gene2GO
=
geneID2GO
),
silent
=
TRUE
)
count
<-
0
if
(
!
is.null
(
myGOdata
)){
# calculate levels for each GO in our data and his ancestors
levels
<-
buildLevels
(
myGOdata
@
graph
)
# select only GO at level 2
goLevels2
<-
mget
(
ls
(
levels
$
nodes2level
),
envir
=
levels
$
nodes2level
)
==
2
goLevels2
<-
names
(
goLevels2
)[
goLevels2
]
# count genes in each GO
count
<-
NULL
for
(
i
in
goLevels2
){
count
<-
c
(
count
,
countGenesInTerm
(
myGOdata
,
i
))
}
}
return
(
count
)
}
########## obtain percent of total genes for each level 2 GO
percentOfGene
<-
function
(
geneList
,
ontology
,
DefOntology
){
## Not genes of interest
All
<-
nbLevels2
(
geneList
,
ontology
)
/
length
(
geneList
)
## genes of interest
geneListGoI
<-
geneList
[
geneList
==
1
]
GoI
<-
nbLevels2
(
geneListGoI
,
ontology
)
/
length
(
geneListGoI
)
## percent of gene by GO
count
<-
data.frame
(
name
=
getTermsDefinition
(
names
(
All
),
ontology
,
numChar
=
40
),
nbGene
=
as.vector
(
All
),
GoI
=
"All genes"
,
ontology
=
DefOntology
)
if
(
GoI
!=
0
){
count
<-
rbind
(
count
,
data.frame
(
name
=
getTermsDefinition
(
names
(
GoI
),
ontology
,
numChar
=
40
),
nbGene
=
as.vector
(
GoI
),
GoI
=
"Genes of interest"
,
ontology
=
DefOntology
))
}
## to obtain the same list of GO between GoI and not GoI, we add line with a percent
## of genes equal to 0
if
(
length
(
count
$
name
[
!
(
count
$
name
[
count
$
GoI
==
"All genes"
]
%in%
count
$
name
[
count
$
GoI
==
"Genes of interest"
])])
>
0
){
count
<-
rbind
(
count
,
data.frame
(
name
=
count
$
name
[
!
(
count
$
name
[
count
$
GoI
==
"All genes"
]
%in%
count
$
name
[
count
$
GoI
==
"Genes of interest"
])],
nbGene
=
0
,
GoI
=
"Genes of interest"
,
ontology
=
DefOntology
))
}
if
(
length
(
count
$
name
[
!
(
count
$
name
[
count
$
GoI
==
"Genes of interest"
]
%in%
count
$
name
[
count
$
GoI
==
"All genes"
])])
>
0
){
count
<-
rbind
(
count
,
data.frame
(
name
=
count
$
name
[
!
(
count
$
name
[
count
$
GoI
==
"Genes of interest"
]
%in%
count
$
name
[
count
$
GoI
==
"All genes"
])],
nbGene
=
0
,
GoI
=
"All genes"
,
ontology
=
DefOntology
))
}
return
(
count
)
}
######################### Barplot
countBP
<-
percentOfGene
(
geneList
,
"BP"
,
"Biological Process"
)
countMF
<-
percentOfGene
(
geneList
,
"MF"
,
"Molecular Function"
)
countCC
<-
percentOfGene
(
geneList
,
"CC"
,
"Cellular Component"
)
count
<-
rbind
(
countBP
,
countMF
,
countCC
)
count
$
GoI
<-
factor
(
count
$
GoI
,
levels
=
rev
(
levels
(
count
$
GoI
)))
ggplot
(
count
,
aes
(
x
=
as.factor
(
name
),
y
=
nbGene
,
fill
=
as.factor
(
GoI
)))
+
geom_bar
(
stat
=
"identity"
,
position
=
position_dodge
())
+
facet_grid
(
~
ontology
,
scales
=
"free_x"
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
70
,
vjust
=
1
,
hjust
=
1
,
colour
=
"black"
))
+
scale_fill_discrete
(
name
=
""
)
+
ylab
(
paste0
(
"Percent of "
,
opt
$
target
))
+
xlab
(
"GO term"
)
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
"barplot.svg"
),
mustWork
=
FALSE
),
width
=
11
,
height
=
7
)
write.table
(
count
,
file
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
"barplot.csv"
)),
mustWork
=
FALSE
),
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
#################### Tests
GOgeneSig
<-
NULL
for
(
ontology
in
c
(
"BP"
,
"MF"
,
"CC"
)){
## GO object
myGOdata
<-
new
(
"topGOdata"
,
description
=
"My project"
,
ontology
=
ontology
,
allGenes
=
geneList
,
annot
=
annFUN.gene2GO
,
gene2GO
=
geneID2GO
)
## Test and p-value correction if algorithm = 'classic'
resultTest
<-
NULL
if
(
opt
$
algorithm
==
"classic"
){
resultTest
<-
runTest
(
myGOdata
,
algorithm
=
opt
$
algorithm
,
statistic
=
opt
$
test
)
resultTest
@
score
<-
p.adjust
(
resultTest
@
score
,
method
=
opt
$
paramAlgo
)
}
else
if
(
opt
$
algorithm
==
"elim"
){
resultTest
<-
runTest
(
myGOdata
,
algorithm
=
opt
$
algorithm
,
statistic
=
opt
$
test
,
cutOff
=
as.numeric
(
opt
$
paramAlgo
))
}
else
if
(
opt
$
algorithm
==
"weight"
){
resultTest
<-
runTest
(
myGOdata
,
algorithm
=
opt
$
algorithm
,
statistic
=
opt
$
test
,
sigRatio
=
opt
$
paramAlgo
)
}
sigRes
<-
resultTest
@
score
[
resultTest
@
score
<=
opt
$
alpha
]
if
(
!
is.na
(
sigRes
[
1
])){
svg
(
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
ontology
,
"_DAG.svg"
)),
mustWork
=
FALSE
))
showSigOfNodes
(
myGOdata
,
score
(
resultTest
),
firstSigNodes
=
10
,
useInfo
=
'all'
)
dev.off
()
## Table with significant GOs and genes
for
(
i
in
1
:
length
(
sigRes
)){
add
<-
data.frame
(
ontology
=
ontology
,
GO
=
names
(
sigRes
)[
i
],
GODef
=
as.character
(
getTermsDefinition
(
names
(
sigRes
)[
i
],
ontology
,
numChar
=
40
)),
gene
=
genesInTerm
(
myGOdata
,
names
(
sigRes
)[
i
])[[
1
]],
pval
=
as.numeric
(
sigRes
[
i
]))
GOgeneSig
<-
rbind
(
GOgeneSig
,
add
)
}
}
}
if
(
!
is.null
(
GOgeneSig
)){
names
(
GOgeneSig
)[
4
]
<-
opt
$
target
}
write.table
(
GOgeneSig
,
file
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
"GOgeneSig.csv"
)),
mustWork
=
FALSE
),
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
write.table
(
unique
(
GOgeneSig
[,
c
(
"ontology"
,
"GO"
,
"GODef"
,
"pval"
)]),
file
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
"GOSig.csv"
)),
mustWork
=
FALSE
),
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
## JSON
stat
<-
list
(
"notInTheAnalysis"
=
ifelse
(
!
is.na
(
notInAnalysis
[
1
]),
as.character
(
notInAnalysis
),
"none"
))
write
(
gsub
(
" "
,
""
,
toJSON
(
list
(
stat
=
stat
),
collapse
=
""
)),
file
=
normalizePath
(
file.path
(
opt
$
out
,
"results.json"
),
mustWork
=
FALSE
))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment