Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Maintenance - Mise à jour mensuelle Lundi 6 Février entre 7h00 et 9h00
Open sidebar
bios4biol
bioinfo-utils
Commits
063d9a19
Commit
063d9a19
authored
Mar 14, 2016
by
Celine Noirot
Browse files
Udate Gaelle R script
parent
241d03ba
Changes
3
Hide whitespace changes
Inline
Side-by-side
bin/DEG.R
View file @
063d9a19
#####################
############## MA plots ##########
#########################
#####################
Differential Analysis Expression
#########################
################################################################################
### ---------------------Packages
...
...
@@ -6,10 +6,13 @@
packages
<-
c
(
"optparse"
,
# to read arguments from a command line
"ggplot2"
,
# nice plots
"gridExtra"
,
# multiple plots on one page
"grid"
,
# multiple plots on one page for MAplots
"RColorBrewer"
,
# color in plots
"mixOmics"
,
# for heatmap plot
"reshape2"
,
"RJSONIO"
)
# reshape grouped data
"reshape2"
,
# reshape grouped data
"knitr"
,
# create a pdf file with images
"markdown"
,
# create a pdf file with images
"RJSONIO"
)
# work with JSON file
for
(
package
in
packages
){
# if package is installed locally, load
...
...
@@ -35,7 +38,6 @@ for(package in bioPackages){
### ---------------------Parameters
#############
...
...
@@ -54,10 +56,10 @@ option_list = list(
help
=
"folder path where results are stored"
,
metavar
=
"character"
),
make_option
(
c
(
"--pool1"
),
type
=
"character"
,
default
=
NULL
,
help
=
"library name in pool 1 separeted by ',
'"
,
help
=
"library name in pool 1 separeted by ','"
,
metavar
=
"character"
),
make_option
(
c
(
"--pool2"
),
type
=
"character"
,
default
=
NULL
,
help
=
"library name in pool 2 separeted by ',
'"
,
help
=
"library name in pool 2 separeted by ','"
,
metavar
=
"character"
),
make_option
(
c
(
"--filter"
),
type
=
"logical"
,
default
=
TRUE
,
help
=
"if TRUE low expressed genes are removed [default=%default]"
,
...
...
@@ -103,9 +105,9 @@ opt = parse_args(opt_parser);
#opt=list()
#opt$file <- XXX # for example "C:/My documents/raw_count.csv"
#opt$norm <- XXX # for example "C:/My documents/RLE_info.csv"
#opt$out <- XXX # for example"C:/My documents/results"
#opt$pool1 <- "lib1, lib2"
#opt$pool2 <- "lib3, lib4"
#opt$out <- XXX # for example
"C:/My documents/results"
#opt$pool1 <-
XXX # for example
"lib1, lib2"
#opt$pool2 <-
XXX # for example
"lib3, lib4"
#opt$alpha <- 0.05
#opt$filter <- TRUE
#opt$correct <- "BH"
...
...
@@ -122,7 +124,7 @@ if (is.null(opt$file) || is.null(opt$norm) || is.null(opt$out) ||
}
#multiple testing correction
if
(
!
(
opt
$
correct
%in%
c
(
"BH"
,
"BY"
,
"bonferroni"
)))
{
if
(
!
(
opt
$
correct
%in%
c
(
"BH"
,
"bonferroni"
)))
{
print_help
(
opt_parser
)
stop
(
"you can't use this correction to adjust p-values.\n"
,
call.
=
FALSE
)
}
...
...
@@ -164,6 +166,7 @@ if (nrow(pool1) < 1 || nrow(pool2) < 1) {
sample_info
<-
rbind
(
pool1
,
pool2
)
PC_before
<-
PC_before
[,
colnames
(
PC_before
)
%in%
as.character
(
sample_info
$
lib_name
)]
PC_before
<-
PC_before
[,
match
(
colnames
(
PC_before
),
sample_info
$
lib_name
)]
norm_fact
<-
norm_fact
[
as.character
(
norm_fact
$
sample.name
)
%in%
as.character
(
sample_info
$
lib_name
),
]
...
...
@@ -182,6 +185,11 @@ PC_after_log <- log2(cpm(dge_after) + 1)
##############################################
### ---------------------Plots : normalization
##############################################
#save images in a pdf file
pdf
(
file.path
(
opt
$
out
,
"de_images.pdf"
),
title
=
"Plots from differential analysis"
,
width
=
12
,
height
=
10
)
PC_before_log_long
<-
melt
(
PC_before_log
)
colnames
(
PC_before_log_long
)
<-
c
(
"name"
,
"lib"
,
"log2CPM"
)
PC_before_log_long
<-
merge
(
PC_before_log_long
,
sample_info
,
by.x
=
"lib"
,
...
...
@@ -195,33 +203,33 @@ PC_after_log_long <- merge(PC_after_log_long, sample_info, by.x = "lib",
#### boxplot
PC_boxplot
<-
function
(
dat
,
tit
,
filename
)
{
ggplot
(
dat
,
aes
(
x
=
lib
,
y
=
log2CPM
))
+
p
<-
ggplot
(
dat
,
aes
(
x
=
lib
,
y
=
log2CPM
))
+
geom_boxplot
(
aes
(
fill
=
factor
(
conditions
)))
+
theme_bw
()
+
xlab
(
label
=
""
)
+
ylab
(
label
=
expression
(
log
[
2
]
*
"(counts + 1)"
))
+
ggtitle
(
tit
)
+
scale_fill_manual
(
name
=
"Conditions"
,
values
=
c
(
"steelblue3"
,
"springgreen3"
))
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
filename
),
mustWork
=
FALSE
),
width
=
11
,
height
=
7
)
print
(
p
)
}
PC_boxplot
(
PC_before_log_long
,
"Boxplot on raw pseudo-counts"
,
"rawcount_boxplot.
sv
g"
)
"rawcount_boxplot.
pn
g"
)
PC_boxplot
(
PC_after_log_long
,
"Boxplot on pseudo-counts after normalization"
,
"normcount_boxplot.
sv
g"
)
"normcount_boxplot.
pn
g"
)
#### MDS
if
(
nrow
(
pool1
)
>
1
&&
nrow
(
pool2
)
>
1
)
{
PC_MDS
<-
function
(
dat
,
tit
,
filename
)
{
dev.new
()
mds
<-
data.frame
(
plotMDS
(
dat
)
$
cmdscale.out
)
dev.off
()
mds
$
lib.name
<-
rownames
(
mds
)
ggplot
(
mds
,
aes
(
x
=
X1
,
y
=
X2
,
label
=
lib.name
,
p
<-
ggplot
(
mds
,
aes
(
x
=
X1
,
y
=
X2
,
label
=
lib.name
,
color
=
as.factor
(
sample_info
$
conditions
)))
+
geom_point
(
size
=
4
)
+
geom_text
(
hjust
=
0.5
,
vjust
=
-0.5
)
+
theme_bw
()
+
xlab
(
label
=
"Dimension 1"
)
+
ylab
(
label
=
"Dimension 2"
)
+
ggtitle
(
tit
)
+
scale_color_manual
(
name
=
"Conditions"
,
values
=
c
(
"steelblue3"
,
"springgreen3"
))
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
filename
,
".svg"
)),
mustWork
=
FALSE
),
width
=
11
,
height
=
7
)
print
(
p
)
write.table
(
mds
,
file
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
filename
,
".csv"
)),
...
...
@@ -238,21 +246,20 @@ if (nrow(pool1) > 1 && nrow(pool2) > 1) {
#### density
PC_density
<-
function
(
dat
,
tit
,
filename
)
{
ggplot
(
dat
,
aes
(
x
=
log2CPM
,
group
=
lib
))
+
p
<-
ggplot
(
dat
,
aes
(
x
=
log2CPM
,
group
=
lib
))
+
geom_density
(
aes
(
colour
=
factor
(
conditions
)))
+
theme_bw
()
+
xlab
(
label
=
expression
(
log
[
2
]
*
"(counts + 1)"
))
+
ylab
(
label
=
"Density"
)
+
ggtitle
(
tit
)
+
guides
(
color
=
guide_legend
(
title
=
"Conditions"
))
+
scale_color_manual
(
name
=
"Conditions"
,
values
=
c
(
"steelblue3"
,
"springgreen3"
))
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
filename
),
mustWork
=
FALSE
),
width
=
8
,
height
=
6
)
print
(
p
)
}
PC_density
(
PC_before_log_long
,
"Density plot on raw pseudo-counts"
,
"rawcount_density.
sv
g"
)
"rawcount_density.
pn
g"
)
PC_density
(
PC_after_log_long
,
"Density plot on pseudo-counts after normalization"
,
"normcount_density.
sv
g"
)
"normcount_density.
pn
g"
)
#save data used for boxplots and density plots
...
...
@@ -270,6 +277,8 @@ write.table(PC_after_log_long, file = normalizePath(file.path(opt$out,
#### MAplots
MAplot_name
<-
character
()
pos
<-
1
if
(
opt
$
MAplots
&&
nrow
(
pool1
)
>
1
&&
nrow
(
pool2
)
>
1
)
{
ifelse
(
!
dir.exists
(
file.path
(
opt
$
out
,
"MAplots"
)),
dir.create
(
file.path
(
opt
$
out
,
"MAplots"
)),
FALSE
)
...
...
@@ -278,12 +287,13 @@ if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) {
avalues
<-
(
lib1
+
lib2
)
/
2
mvalues
<-
(
lib1
-
lib2
)
MA
<-
data.frame
(
avalues
=
avalues
,
mvalues
=
mvalues
)
ggplot
(
MA
,
aes
(
x
=
avalues
,
y
=
mvalues
))
+
geom_point
(
size
=
3
)
+
theme_bw
()
+
ggtitle
(
tit
)
+
xlab
(
"A"
)
+
ylab
(
"M"
)
+
geom_hline
(
yintercept
=
0
,
col
=
"red"
)
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
"MAplots"
,
filename
),
mustWork
=
FALSE
),
width
=
8
,
height
=
6
)
return
(
list
(
avalues
,
mvalues
))
p
<-
ggplot
(
MA
,
aes
(
x
=
avalues
,
y
=
mvalues
))
+
geom_point
(
size
=
1
)
+
theme_bw
()
+
ggtitle
(
tit
)
+
xlab
(
"A"
)
+
ylab
(
"M"
)
+
geom_hline
(
yintercept
=
0
,
col
=
"red"
)
+
theme
(
text
=
element_text
(
size
=
5
))
#print(p)
return
(
list
(
avalues
,
mvalues
,
p
))
}
a_before
<-
data.frame
(
name
=
rownames
(
PC_before_log
))
...
...
@@ -292,43 +302,62 @@ if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) {
m_after
<-
data.frame
(
name
=
rownames
(
PC_after_log
))
for
(
cond
in
unique
(
sample_info
$
conditions
))
{
I
<-
sum
(
sample_info
$
conditions
==
cond
)
-1
J
<-
sum
(
sample_info
$
conditions
==
cond
)
grid.newpage
()
for
(
i
in
1
:
(
sum
(
sample_info
$
conditions
==
cond
)
-1
))
{
for
(
j
in
(
i
+1
)
:
sum
(
sample_info
$
conditions
==
cond
))
{
# Create layout : nrow = 2, ncol = 2
pushViewport
(
viewport
(
layout
=
grid.layout
(
I
,
I
*
2
)))
pos_graph
<-
1
for
(
i
in
1
:
I
)
{
pos_graph
<-
i
*
2
-
1
for
(
j
in
(
i
+1
)
:
J
)
{
col_name
<-
paste0
(
"cond"
,
cond
,
"_"
,
sample_info
$
lib_name
[
sample_info
$
conditions
==
cond
][
i
],
"vs"
,
sample_info
$
lib_name
[
sample_info
$
conditions
==
cond
][
j
])
bef
<-
PC_MAplot
(
PC_before_log
[,
sample_info
$
conditions
==
cond
][,
i
],
PC_before_log
[,
sample_info
$
conditions
==
cond
][,
j
],
paste0
(
"MA plot on raw pseudo-counts: "
,
paste0
(
"MA plot on raw pseudo-counts:
\n
"
,
sample_info
$
lib_name
[
sample_info
$
conditions
==
cond
][
i
],
" vs. "
,
sample_info
$
lib_name
[
sample_info
$
conditions
==
cond
][
j
]),
paste0
(
"raw_MAplot_"
,
cond
,
"_"
,
i
,
"vs"
,
j
,
".svg"
))
paste0
(
"raw_MAplot_"
,
cond
,
"_"
,
i
,
"vs"
,
j
,
".png"
))
MAplot_name
[
pos
]
<-
paste0
(
"raw_MAplot_"
,
cond
,
"_"
,
i
,
"vs"
,
j
,
".png"
)
pos
<-
pos
+
1
a_before
<-
cbind
(
a_before
,
bef
[[
1
]])
m_before
<-
cbind
(
m_before
,
bef
[[
2
]])
colnames
(
a_before
)[
ncol
(
a_before
)]
<-
col_name
colnames
(
m_before
)[
ncol
(
m_before
)]
<-
col_name
print
(
bef
[[
3
]],
vp
=
viewport
(
layout.pos.row
=
i
,
layout.pos.col
=
pos_graph
))
aft
<-
PC_MAplot
(
PC_after_log
[,
sample_info
$
conditions
==
cond
][,
i
],
PC_after_log
[,
sample_info
$
conditions
==
cond
][,
j
],
paste0
(
"MA plot on normalized pseudo-counts: "
,
paste0
(
"MA plot on normalized pseudo-counts:
\n
"
,
sample_info
$
lib_name
[
sample_info
$
conditions
==
cond
][
i
],
" vs. "
,
sample_info
$
lib_name
[
sample_info
$
conditions
==
cond
][
j
]),
paste0
(
"norm_MAplot_"
,
cond
,
"_"
,
i
,
"vs"
,
j
,
".svg"
))
paste0
(
"norm_MAplot_"
,
cond
,
"_"
,
i
,
"vs"
,
j
,
".png"
))
MAplot_name
[
pos
]
<-
paste0
(
"norm_MAplot_"
,
cond
,
"_"
,
i
,
"vs"
,
j
,
".png"
)
pos
<-
pos
+
1
a_after
<-
cbind
(
a_after
,
plot_name
=
aft
[[
1
]])
m_after
<-
cbind
(
m_after
,
plot_name
=
aft
[[
2
]])
colnames
(
a_after
)[
ncol
(
a_after
)]
<-
col_name
colnames
(
m_after
)[
ncol
(
m_after
)]
<-
col_name
print
(
aft
[[
3
]],
vp
=
viewport
(
layout.pos.row
=
i
,
layout.pos.col
=
pos_graph
+1
))
pos_graph
<-
pos_graph
+
2
}
}
}
write.table
(
a_before
,
file
=
normalizePath
(
file.path
(
opt
$
out
,
"MAplots"
,
...
...
@@ -501,16 +530,10 @@ if (opt$filter) {
mustWork
=
FALSE
),
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
svg
(
normalizePath
(
file.path
(
opt
$
out
,
"histograms.svg"
),
mustWork
=
FALSE
),
width
=
12
,
height
=
7
)
grid.arrange
(
h1
,
h2
,
h3
,
ncol
=
3
)
dev.off
()
grid.arrange
(
h1
,
h2
,
h3
,
ncol
=
3
)
}
else
{
svg
(
normalizePath
(
file.path
(
opt
$
out
,
"histograms.svg"
),
mustWork
=
FALSE
),
width
=
12
,
height
=
7
)
grid.arrange
(
h1
,
h2
,
ncol
=
2
)
dev.off
()
}
...
...
@@ -518,12 +541,12 @@ if (opt$filter) {
res
$
DEG
<-
ifelse
(
res
$
FDR
<
opt
$
alpha
,
"Yes"
,
"No"
)
if
(
nrow
(
pool1
)
>
1
|
nrow
(
pool2
)
>
1
)
{
ggplot
(
res
,
aes
(
x
=
logCPM
,
y
=
logFC
,
group
=
DEG
))
+
p
<-
ggplot
(
res
,
aes
(
x
=
logCPM
,
y
=
logFC
,
group
=
DEG
))
+
geom_point
(
aes
(
color
=
as.factor
(
DEG
)),
size
=
3
)
+
theme_bw
()
+
scale_color_manual
(
values
=
c
(
'#000000'
,
'#CC0000'
))
+
guides
(
color
=
guide_legend
(
title
=
"DEG ?"
))
+
ggtitle
(
"MA plot on normalized pseudo-counts"
)
print
(
p
)
write.table
(
cbind
(
A
=
res
$
logCPM
,
M
=
res
$
logFC
,
DEG
=
res
$
DEG
),
file
=
normalizePath
(
file.path
(
opt
$
out
,
"DE_MAplot.csv"
),
mustWork
=
FALSE
),
sep
=
"\t"
,
...
...
@@ -531,21 +554,18 @@ if (nrow(pool1) > 1 | nrow(pool2) > 1) {
}
else
{
avalues
<-
(
PC_after_log
[,
1
]
+
PC_after_log
[,
2
])
/
2
mvalues
<-
(
PC_after_log
[,
1
]
-
PC_after_log
[,
2
])
ggplot
(
data.frame
(
PC_after_log
),
aes
(
x
=
avalues
,
y
=
mvalues
))
+
p
<-
ggplot
(
data.frame
(
PC_after_log
),
aes
(
x
=
avalues
,
y
=
mvalues
))
+
geom_point
(
aes
(
color
=
as.factor
(
res
$
DEG
)),
size
=
3
)
+
theme_bw
()
+
scale_color_manual
(
values
=
c
(
'#000000'
,
'#CC0000'
))
+
guides
(
color
=
guide_legend
(
title
=
"DEG ?"
))
+
ggtitle
(
"MA plot on normalized pseudo-counts"
)
+
xlab
(
"A"
)
+
ylab
(
"M"
)
print
(
p
)
write.table
(
cbind
(
A
=
avalues
,
M
=
mvalues
,
DEG
=
res
$
DEG
),
file
=
normalizePath
(
file.path
(
opt
$
out
,
"DE_MAplot.csv"
),
mustWork
=
FALSE
),
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
}
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
"DE_MAplot.svg"
),
mustWork
=
FALSE
),
width
=
8
,
height
=
6
)
#### heatmap and PCA plot
...
...
@@ -556,11 +576,8 @@ if (nrow(pool1) > 1 && nrow(pool2) > 1 && nrow(diff) > 0) {
cimColor
<-
colorRampPalette
(
rev
(
brewer.pal
(
9
,
"Blues"
)))(
255
)[
255
:
1
]
svg
(
normalizePath
(
file.path
(
opt
$
out
,
"DE_heatmap.svg"
),
mustWork
=
FALSE
))
cim
(
t
(
selY
),
color
=
cimColor
,
symkey
=
FALSE
,
main
=
"Heatmap on differentialy expressed genes"
)
dev.off
()
write.table
(
t
(
selY
),
file
=
normalizePath
(
file.path
(
opt
$
out
,
"DE_heatmap.csv"
),
...
...
@@ -574,8 +591,4 @@ if (nrow(pool1) > 1 && nrow(pool2) > 1 && nrow(diff) > 0) {
}
dev.off
()
bin/GOBarplot.R
0 → 100644
View file @
063d9a19
############################# Enrichment analysis ##############################
################################################################################
### ---------------------Packages
#common packages
packages
<-
c
(
"optparse"
,
# to read arguments from a command line
"ggplot2"
,
# nice plots
"RJSONIO"
)
# work with JSON file
for
(
package
in
packages
){
# if package is installed locally, load
if
(
!
(
package
%in%
rownames
(
installed.packages
())))
{
install.packages
(
package
)
}
do.call
(
'library'
,
list
(
package
))
}
#Bioconductor package
if
(
!
(
"topGO"
%in%
rownames
(
installed.packages
())))
{
source
(
"http://www.bioconductor.org/biocLite.R"
)
biocLite
(
"topGO"
)
}
library
(
topGO
)
# GO enrichment
### ---------------------Parameters
#############
###### Parameter initialization when the script is launched from command line
############
option_list
=
list
(
make_option
(
c
(
"-f"
,
"--files"
),
type
=
"character"
,
default
=
NULL
,
help
=
"geneID2GO files: list of tabulated matrix with the correspondence
gene to GO separated by ','"
,
metavar
=
"character"
),
make_option
(
c
(
"-n"
,
"--names"
),
type
=
"character"
,
default
=
NULL
,
help
=
"names of the samples (same order than files)"
,
metavar
=
"character"
),
make_option
(
c
(
"--fileFormat"
),
type
=
"character"
,
default
=
"topGO"
,
help
=
"two types of file can be used : topGO (gene_id1 GO_1,GO_2,...
\n gene_id2 GO_3,...) or twoColumns (only 2 columns
#gene_id GO_id)"
,
metavar
=
"character"
),
make_option
(
c
(
"-o"
,
"--out"
),
type
=
"character"
,
default
=
NULL
,
help
=
"folder path where results are stored"
,
metavar
=
"character"
)
);
opt_parser
=
OptionParser
(
usage
=
"Perform an enrichment analysis for
Gene Ontology (GO)"
,
option_list
=
option_list
);
opt
=
parse_args
(
opt_parser
);
##############
## Parameter initialization when the script is launched from R or Rstudio
##############
#opt=list()
#opt$files <- XXX # for example c("C:/My documents/raw_count.csv",...)
#opt$names <- XXX # for example c("sample1",...)
#opt$fileFormat <- "topGO" # "topGO" or "twoColumns"
#opt$out <- XXX # for example "C:/My documents/results"
# table with genes and GO, genes of interest, test and algorithm are needed to
#continue
if
(
is.null
(
opt
$
files
))
{
print_help
(
opt_parser
)
stop
(
"Tables with genes and GO missing.\n"
,
call.
=
FALSE
)
}
#Create output folder if it doesn't exist
ifelse
(
!
dir.exists
(
opt
$
out
),
dir.create
(
opt
$
out
),
FALSE
)
#####################################################
### ---------------------Input data and preprocessing
#####################################################
## read tables with all genes
l
<-
unlist
(
strsplit
(
opt
$
files
,
','
))
opt
$
names
<-
unlist
(
strsplit
(
opt
$
names
,
','
))
geneID2GO
<-
list
()
if
(
opt
$
fileFormat
==
"topGO"
){
for
(
i
in
seq_along
(
l
)){
geneID2GO
[[
i
]]
<-
readMappings
(
normalizePath
(
l
[
i
]),
sep
=
"\t"
,
IDsep
=
","
)
}
}
else
{
for
(
i
in
seq_along
(
l
)){
go
<-
read.table
(
normalizePath
(
l
[
i
]),
header
=
TRUE
,
sep
=
"\t"
,
comment.char
=
""
)
# obtains a list of GO by genes for all genes (mandatory for topGO package)
geneID2GO
[[
i
]]
<-
list
()
for
(
i
in
unique
(
go
[,
1
])){
gene
<-
go
[
go
[,
1
]
==
i
,]
listGO
<-
vector
()
for
(
j
in
1
:
nrow
(
gene
)){
listGO
<-
c
(
listGO
,
as.character
(
gene
[
j
,
2
]))
}
geneID2GO
[[
i
]]
<-
c
(
geneID2GO
[[
i
]],
list
(
listGO
))
}
names
(
geneID2GO
[[
i
]])
<-
unique
(
go
[,
1
])
}
}
geneList
<-
list
()
for
(
i
in
seq_along
(
geneID2GO
)){
geneList
[[
i
]]
<-
factor
(
rep
(
1
,
length
(
geneID2GO
[[
i
]])))
levels
(
geneList
[[
i
]])
<-
c
(
levels
(
geneList
[[
i
]]),
0
)
names
(
geneList
[[
i
]])
<-
names
(
geneID2GO
[[
i
]])
}
######################### Functions
########## get name/definition of a GO (hidden function in topGO)
getTermsDefinition
<-
function
(
whichTerms
,
ontology
,
numChar
=
20
,
multipLines
=
FALSE
)
{
qTerms
<-
paste
(
paste
(
"'"
,
whichTerms
,
"'"
,
sep
=
""
),
collapse
=
","
)
retVal
<-
dbGetQuery
(
GO_dbconn
(),
paste
(
"SELECT term, go_id FROM go_term WHERE ontology IN"
,
"('"
,
ontology
,
"') AND go_id IN ("
,
qTerms
,
");"
,
sep
=
""
))
termsNames
<-
retVal
$
term
names
(
termsNames
)
<-
retVal
$
go_id
if
(
!
multipLines
)
shortNames
<-
paste
(
substr
(
termsNames
,
1
,
numChar
),
ifelse
(
nchar
(
termsNames
)
>
numChar
,
'...'
,
''
),
sep
=
''
)
else
shortNames
<-
sapply
(
termsNames
,
function
(
x
)
{
a
<-
strwrap
(
x
,
numChar
)
return
(
paste
(
a
,
sep
=
""
,
collapse
=
"\\\n"
))
})
names
(
shortNames
)
<-
names
(
termsNames
)
return
(
shortNames
[
whichTerms
])
}
########## find GO at levels 2 for ONE ontology
nbLevels2
<-
function
(
geneListI
,
ontology
,
geneID2GO
){
myGOdata
<-
NULL
try
(
myGOdata
<-
new
(
"topGOdata"
,
description
=
"My project"
,
ontology
=
ontology
,
allGenes
=
geneListI
,
annot
=
annFUN.gene2GO
,
gene2GO
=
geneID2GO
),
silent
=
TRUE
)
count
<-
0
if
(
!
is.null
(
myGOdata
)){
# calculate levels for each GO in our data and his ancestors
levels
<-
buildLevels
(
myGOdata
@
graph
)
# select only GO at level 2
goLevels2
<-
mget
(
ls
(
levels
$
nodes2level
),
envir
=
levels
$
nodes2level
)
==
2
goLevels2
<-
names
(
goLevels2
)[
goLevels2
]
# count genes in each GO
count
<-
NULL
for
(
i
in
goLevels2
){
count
<-
c
(
count
,
countGenesInTerm
(
myGOdata
,
i
))
}
}
return
(
count
)
}
########## obtain percent of total genes for each level 2 GO
percentOfGene
<-
function
(
geneList
,
ontology
,
DefOntology
){
prop
<-
list
()
count
<-
NULL
#create a table with GO names and proportion of genes
for
(
i
in
seq_along
(
geneList
)){
prop
[[
i
]]
<-
nbLevels2
(
geneList
[[
i
]],
ontology
,
geneID2GO
[[
i
]])
/
length
(
geneList
[[
i
]])
count
<-
rbind
(
count
,
data.frame
(
name
=
getTermsDefinition
(
names
(
prop
[[
i
]]),
ontology
,
numChar
=
40
),
nbGene
=
as.vector
(
prop
[[
i
]]),
sample
=
opt
$
names
[
i
],
ontology
=
DefOntology
))
}
# table with all GO
all
<-
data.frame
(
name
=
rep
(
unique
(
count
$
name
),
length
(
names
)),
nbGene
=
0
,
sample
=
rep
(
opt
$
names
,
each
=
length
(
unique
(
count
$
name
))),
ontology
=
DefOntology
)
#only GO-name not in count
not_in_count
<-
all
[
!
(
paste0
(
all
$
name
,
"."
,
all
$
sample
)
%in%
paste0
(
count
$
name
,
"."
,
count
$
sample
)),]
count
<-
rbind
(
count
,
not_in_count
)
return
(
count
)
}
######################### Barplot
countBP
<-
percentOfGene
(
geneList
,
"BP"
,
"Biological Process"
)
countMF
<-
percentOfGene
(
geneList
,
"MF"
,
"Molecular Function"
)
countCC
<-
percentOfGene
(
geneList
,
"CC"
,
"Cellular Component"
)
count
<-
rbind
(
countBP
,
countMF
,
countCC
)
ggplot
(
count
,
aes
(
x
=
as.factor
(
name
),
y
=
nbGene
,
fill
=
sample
))
+
geom_bar
(
stat
=
"identity"
,
position
=
position_dodge
())
+
facet_grid
(
~
ontology
,
scales
=
"free_x"
)
+
theme
(
axis.text.x
=
element_text
(
angle
=
70
,
vjust
=
1
,
hjust
=
1
,
colour
=
"black"
))
+
scale_fill_discrete
(
name
=
""
)
+
ylab
(
paste0
(
"Percent of "
,
opt
$
target
))
+
xlab
(
"GO term"
)
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
"barplot.svg"
),
mustWork
=
FALSE
),
width
=
11
,
height
=
7
)
write.table
(
count
,
file
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
"barplot.csv"
)),
mustWork
=
FALSE
),
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
bin/Normalization.R
View file @
063d9a19
...
...
@@ -26,7 +26,7 @@ library(edgeR) # RNAseq data normalization
#############
###### Parameter
s'
initialization when the script is launched from command line
###### Parameter initialization when the script is launched from command line
############
option_list
=
list
(
make_option
(
c
(
"-f"
,
"--file"
),
type
=
"character"
,
default
=
NULL
,
...
...
@@ -43,7 +43,7 @@ opt_parser = OptionParser(usage="Imports data, converts them into a DGEList obje
opt
=
parse_args
(
opt_parser
);
##############
## Parameter
s'
initialization when the script is launched from R or Rstudio
## Parameter initialization when the script is launched from R or Rstudio
#file: tabulated raw count matrix with library name as header
#(e.g. gene_id lib1 lib2)
#out: folder path where results are stored
...
...
@@ -179,15 +179,18 @@ normalization <- function(dge, method) {
mustWork
=
FALSE
),
width
=
8
,
height
=
6
)
#MDS
MDS_PC
<-
data.frame
(
plotMDS
(
PC_after_log
)
$
cmdscale.out
)
if
(
ncol
(
PC_before
)
>
2
){
MDS_PC
<-
data.frame
(
plotMDS
(
PC_after_log
)
$
cmdscale.out
)
ggplot
(
MDS_PC
,
aes
(
x
=
X1
,
y
=
X2
,
label
=
rownames
(
MDS_PC
)))
+
geom_point
(
size
=
4
)
+
geom_text
(
hjust
=
0.5
,
vjust
=
-0.5
)
+
theme_bw
()
+
xlab
(
label
=
"Dimension 1"
)
+
ylab
(
label
=
"Dimension 2"
)
+
ggtitle
(
paste0
(
"PCA plot on pseudo-counts after normalization (method = "
,
tit
,
")"
))
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
method
,
"_MDS.svg"
)),
mustWork
=
FALSE
),
width
=
11
,
height
=
7
)
}
ggplot
(
MDS_PC
,
aes
(
x
=
X1
,
y
=
X2
,
label
=
rownames
(
MDS_PC
)))
+
geom_point
(
size
=
4
)
+
geom_text
(
hjust
=
0.5
,
vjust
=
-0.5
)
+
theme_bw
()
+
xlab
(
label
=
"Dimension 1"
)
+
ylab
(
label
=
"Dimension 2"
)
+
ggtitle
(
paste0
(
"PCA plot on pseudo-counts after normalization (method = "
,
tit
,
")"
))
ggsave
(
filename
=
normalizePath
(
file.path
(
opt
$
out
,
paste0
(
method
,
"_MDS.svg"
)),