PCIT_RIF_gprofiler_analyse.R 14.6 KB
Newer Older
Safia Saci's avatar
Safia Saci committed
1
2
3
4
5
6
7
8
9
# module load compiler/gcc-7.2.0
# module load system/R-4.0.2_gcc-7.2.0

# Packages ------
library(sqldf)
library(dplyr)
library(CeTF)
library(data.table)
library(kableExtra)
Safia Saci's avatar
Safia Saci committed
10
11
12
13
library(rtracklayer)
library(ComplexHeatmap)
library("RColorBrewer")
library(ggplot2)
Safia Saci's avatar
Safia Saci committed
14
15


Safia Saci's avatar
Safia Saci committed
16
# tables with diff expr genes -----
Safia Saci's avatar
Safia Saci committed
17
## Input data -------
Safia Saci's avatar
Safia Saci committed
18
coefLine <- read.csv("C:/Users/ssaci/Documents/safia/projet/RosePigs_analyse/data/coef_line.csv", sep = ";", header = T)
Safia Saci's avatar
Safia Saci committed
19
20
21
coefCondition <- read.csv("data/coef_condition.csv", sep = ";", header = T)
coefSex <- read.csv("data/coef_sex.csv", sep = ";", header = T)

Safia Saci's avatar
Safia Saci committed
22
23
24
## GTF file -----
gtf <- import(con = "data/Sus_scrofa.Sscrofa11.1.102.gtf.gz", format = "GFF")

Safia Saci's avatar
Safia Saci committed
25

Safia Saci's avatar
Safia Saci committed
26
## collection of gene_names into gtf :
Safia Saci's avatar
Safia Saci committed
27
28
29
30
31
32
33
34
35
36
37
gene_name <- function(){
    listeGenes <- as.data.frame(gtf$gene_id)
    listeGenes <- distinct(listeGenes)
    for(i in listeGenes){
        listG <- gtf[gtf$gene_id %in% i, c("gene_id", "gene_name")]
        geneName <- as.data.frame(listG)[ , c("gene_id", "gene_name")]
    }
    return(distinct(geneName))
}
geneNames <- gene_name()

Safia Saci's avatar
Safia Saci committed
38
39


Safia Saci's avatar
Safia Saci committed
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# collection of gene_type into gtf :
gene_type <- function(){
    listeGenes <- as.data.frame(gtf$gene_id)
    listeGenes <- distinct(listeGenes)
    for(i in listeGenes){
        listG <- gtf[gtf$gene_id %in% i, c("gene_id", "gene_biotype")]
        geneType <- as.data.frame(listG)[ , c("gene_id", "gene_biotype")]
    }
    
    return(distinct(geneType))
}
geneType <- gene_type()



# creation of the tsv file containing the genes, their Expression value and their expression status
gene_file <- sqldf("select C.gene, G.gene_name as gene_symbol, T.gene_biotype as gene_type, C.logFC as log2fc_condition, C.adj_P_Val as adjpval_condition, C.expressed as status_condition,
                    L.logFC as log2fc_line, L.adj_P_Val as adjpval_line, L.expressed as status_line,
                    S.logFC as log2fc_sex, S.adj_P_Val as adjpval_sex, S.expressed  as status_sex
                    from coefLine L, coefCondition C, coefSex S, geneNames G, geneType T
                    where l.gene = C.gene
                    and C.gene = S.gene
                    and L.gene = G.gene_id
                    and C.gene = T.gene_id")
write.csv(gene_file, "data/gene_info.csv")


Safia Saci's avatar
Safia Saci committed
67
68


Safia Saci's avatar
Safia Saci committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# differentially expressed genes in line and in condition:
diff_ex <- sqldf("select C.gene, C.expressed as condition_expressed, L.expressed as line_expressed
                 from coefLine L, coefCondition C
                 where C.gene = L.gene
                 and (C.expressed != 'NS' and L.expressed != 'NS')")
# > diff_ex
# gene condition_expressed line_expressed
# 1  ENSSSCG00000000293                  UP             UP
# 2  ENSSSCG00000000555                DOWN           DOWN
# 3  ENSSSCG00000001392                  UP             UP
# 4  ENSSSCG00000002041                DOWN           DOWN
# 5  ENSSSCG00000002835                DOWN           DOWN
# 6  ENSSSCG00000007237                  UP           DOWN
# 7  ENSSSCG00000008559                  UP             UP
# 8  ENSSSCG00000008789                DOWN             UP
# 9  ENSSSCG00000009498                DOWN           DOWN
# 10 ENSSSCG00000010036                DOWN           DOWN
# 11 ENSSSCG00000010431                DOWN           DOWN
# 12 ENSSSCG00000011880                  UP             UP
# 13 ENSSSCG00000012292                  UP             UP
# 14 ENSSSCG00000016140                DOWN           DOWN
# 15 ENSSSCG00000016159                DOWN           DOWN
# 16 ENSSSCG00000032894                DOWN           DOWN


Safia Saci's avatar
Safia Saci committed
94

Safia Saci's avatar
Safia Saci committed
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# genes Network --------
## with Cor ----------
load("data/expMat2.RData")
load("data/expMat.RData")

cormat_gene_keep <- cor(t(expMat2), method = "pearson")
cor_tab <- data.table(cormat_gene_keep)
cor_tab$gene_1 <- rownames(cormat_gene_keep)
cor_table <- melt(cor_tab, id.vars = "gene_1")
cor_table <- cor_table %>% arrange(desc(abs(cor_table$value)))
cor_table2<- cor_table[13739:23738, ]
cor_table2 %>% distinct(value)
corelation_table <- distinct(cor_table2, value, .keep_all = TRUE)
cor_thr = 0.8

corelation_table$value <- replace(corelation_table$value, corelation_table$value < cor_thr, NA_character_)
corelation_table$value <- replace(corelation_table$value, corelation_table$value >= cor_thr, "Pos")
corelation_table$value <- replace(corelation_table$value, corelation_table$value <= cor_thr, "Neg")

## with PCIT ---------

# the tables with genes diff ex
## line and condition
diff_ex_gene <- sqldf("select C.gene, C.expressed as condition_expressed, L.expressed as line_expressed
                 from coefLine L, coefCondition C
                 where C.gene = L.gene
                 and (C.expressed != 'NS' or L.expressed != 'NS')")

# only line
diff_ex_gene_line <- sqldf("select L.gene, L.expressed as line_expressed
                 from coefLine L
                 where L.expressed != 'NS' ")

# only condition
diff_ex_gene_cond <- sqldf("select C.gene, C.expressed as condition_expressed
                 from coefCondition C
                 where C.expressed != 'NS' ")

Safia Saci's avatar
Safia Saci committed
133
134
135
136
137
gene_file_line <- sqldf("select gene, log2fc_line, adjpval_line from gene_file 
                        where gene in (select gene from diff_ex_gene_line)")

gene_file_cond <- sqldf("select * from gene_file 
                        where gene in (select gene from diff_ex_gene_cond)")
Safia Saci's avatar
Safia Saci committed
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

# the scores matrix of genes diff expr
de_mat <- expMat2[diff_ex_gene$gene, ]


# Application of partial correlation
cor_de_mat <- PCIT(de_mat, tolType = "mean" )
pcor_de_table <- cor_de_mat$tab


# remove values of corr2 == null
pcor_mat_distinct <- sqldf("select * from pcor_de_table P
                           where P.corr2 != 0.0000")


# generation of the diff expr genes file with the pvalues nd their expressions.
cyto_file <- sqldf("select G.gene, gene_symbol, gene_type, log2fc_condition, adjpval_condition, status_condition,
                   log2fc_line, adjpval_line, status_line,
                   log2fc_sex, adjpval_sex, status_sex
                   from gene_file G, diff_ex_gene D
                   where G.gene = D.gene")
write.csv(cyto_file, "data/cyto_file.csv")



#  Density plots for PCIT:
svg("plots/densityPlot.svg", width = 8, height = 6)
densityPlot(cor_de_mat$adj_raw, cor_de_mat$adj_sig)
dev.off()

# collection of diff expr genes
diff_ex_gene$condition_expressed<- paste0("Condition_", diff_ex_gene$condition_expressed)
diff_ex_gene$line_expressed <- paste0("Line_", diff_ex_gene$line_expressed)


# case where gene is diff_expr in line and in condition
line_cond_ex <- diff_ex$gene
for (i in line_cond_ex) {
    diff_ex_gene$line_expressed[which(diff_ex_gene$gene == i)] <- "Double_expressed"
    diff_ex_gene$condition_expressed[which(diff_ex_gene$gene == i)] <- "Double_expressed"
    
}
# add the expression column
diff_ex_gene$expression <- "null"
# filling
diff_ex_gene$expression <- ifelse(
    diff_ex_gene$condition_expressed == "Condition_NS",
    diff_ex_gene$line_expressed,
    ifelse(
            diff_ex_gene$line_expressed == "Line_NS",
            diff_ex_gene$condition_expressed,
            "Double_expressed"
    )
)

Safia Saci's avatar
Safia Saci committed
193
194


Safia Saci's avatar
Safia Saci committed
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# recovery of 12640 genes with the best correlation in abs value:
pcor_mat_dist_filter <- pcor_mat_distinct %>% arrange(desc(abs(pcor_mat_distinct$corr2)))
pcor_dist_filter_table <- pcor_mat_dist_filter[1:12640, ]
write.csv(pcor_dist_filter_table, "data/pcor_dist_filter_table.csv")


# save data for cytoscape
write.csv(pcor_de_table, "data/pcor_de_table.csv")
write.csv((pcor_mat_distinct), "data/pcor_de_distinct.csv")
write.csv(diff_ex_gene, "data/diff_ex_gene.csv")






# RIF analysis ---------

genontology <- fread("data/genontology_0003700.txt", col.names = c("ACC_ID", "gene_product", "gene_product_label", "panther_family_label", "panther_family"))
uniprot <- fread("data/uniProt.txt", na.strings="", col.names = c("Genes_table_ID", "UniProtKB_Gene_Name_ID"))

uniprot$UniProtKB_Gene_Name_ID[!is.na(uniprot$UniProtKB_Gene_Name_ID)] <- paste0("UniProtKB:",uniprot$UniProtKB_Gene_Name_ID[!is.na(uniprot$UniProtKB_Gene_Name_ID)])


# combine uniprot and genontology files
Go_pigs <- sqldf("select U.Genes_table_ID, U.UniProtKB_Gene_Name_ID, G.gene_product_label, G.panther_family_label, G.panther_family
                 from uniprot U, genontology G
                 where G.ACC_ID == U.UniProtKB_Gene_Name_ID ")


Safia Saci's avatar
Safia Saci committed
225
226

## per line --------------------
Safia Saci's avatar
Safia Saci committed
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
my_genes_de_line <- diff_ex_gene_line$gene
summary(my_genes_de_line %in% row.names(expMat))

my_genes_tf_line <-  Go_pigs$Genes_table_ID
summary(my_genes_tf_line %in% row.names(expMat))
my_genes_tf_line <- my_genes_tf_line[my_genes_tf_line %in% row.names(expMat)]
de_mat_tf_line <- expMat[c(my_genes_de_line, my_genes_tf_line), ]


RIF_out_line <- RIF(input = de_mat_tf_line,
               nta = length(my_genes_de_line),
               ntf = length(my_genes_tf_line),
               nSamples1 = 12,
               nSamples2 = 12)



Safia Saci's avatar
Safia Saci committed
244
## per cond------------------
Safia Saci's avatar
Safia Saci committed
245
246
247
248
249
250
251
252
253
my_genes_de_cond <- diff_ex_gene_cond$gene
summary(my_genes_de_cond %in% row.names(expMat))

my_genes_tf_cond <-  Go_pigs$Genes_table_ID
summary(my_genes_tf_cond %in% row.names(expMat))
my_genes_tf_cond <- my_genes_tf_cond[my_genes_tf_cond %in% row.names(expMat)]
de_mat_tf_cond <- expMat[c(my_genes_de_cond, my_genes_tf_cond), ]


Safia Saci's avatar
Safia Saci committed
254
## Go_pigs sorting
Safia Saci's avatar
Safia Saci committed
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
de_mat_tf_cond <- cbind(de_mat_tf_cond[,c("G11-_fasted_F_2884","G11-_fasted_M_2889",
                               "G11-_fasted_F_2899","G11-_fasted_M_2902",
                               "G11-_fasted_F_2980","G11-_fasted_M_2987",
                               "G11+_fasted_F_2990","G11+_fasted_M_2992",
                               "G11+_fasted_F_3004","G11+_fasted_M_3011",
                                "G11+_fasted_F_3042","G11+_fasted_M_3050",
                               "G11-_fed_F_2885","G11-_fed_M_2888",
                               "G11-_fed_F_2898","G11-_fed_M_2903",
                               "G11-_fed_F_2981","G11-_fed_M_2985",
                               "G11+_fed_F_2989","G11+_fed_M_2991",
                               "G11+_fed_M_3012","G11+_fed_M_3014",
                               "G11+_fed_F_3039","G11+_fed_M_3049")])

RIF_out_cond <- RIF(input = de_mat_tf_cond,
               nta = length(my_genes_de_cond),
               ntf = length(my_genes_tf_cond),
               nSamples1 = 12,
               nSamples2 = 12)


## saving : 
RIF_out_cond <- RIF_out_cond[order(RIF_out_cond$RIF1, decreasing = TRUE), ]
RIF_out_line <- RIF_out_line[order(RIF_out_line$RIF1, decreasing = TRUE), ]

list_tf <- sqldf("select distinct L.TF, L.avgexpr as avgexpr_line, L.RIF1 as RIF1_line, L.RIF2 as RIF2_line,
                               C.avgexpr as avgexpr_cond, C.RIF1 as RIF1_cond, C.RIF2 as RIF2_cond,
                               GO.UniProtKB_Gene_Name_ID, GO.gene_product_label, GO.panther_family
                 from RIF_out_line L, RIF_out_cond C, Go_pigs GO
                 where L.TF = GO.Genes_table_ID
                 and C.TF = L.TF")
write.csv(list_tf, "data/RIF_Tf_file.csv")

Safia Saci's avatar
Safia Saci committed
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
# Ananlyse RIf resullts-------

list_tf <- read.csv("data/RIF_Tf_file.csv")

# plot 1
rifDF <- read.csv2("data/rif_df.csv")
bplot <- ggplot(rifDF, aes(x = factor(name, levels = c("LRP6","HIF1A","TEAD1","CTNNB1","RLIM","NR1D1","ELK1","CTH","GTF3A","SMAD7","ZNF575","PSMD10","SNAI3","EHF","ONECUT3","MYBL2","PROX2","FOXD4","AIRE","PRMT2")),
                           y =  rif_value, fill = categorie )) +

    geom_bar(stat="identity", position="dodge" )+
    scale_fill_manual(values=c("indianred1", "indianred4","deepskyblue1", "deepskyblue4")) +
    coord_flip() +
    ylab("RIF")+
    xlab("TFs")+
    geom_text(aes(label= rif_value), position=position_dodge(width=0.9))+
    theme(panel.grid.major = element_line(colour = "white"), 
          panel.grid.minor = element_line(colour = "white"))

ggsave(filename = "plots/rif_TF.png", bplot)


# best TFs with RIF value
best_rif <- read.csv2("data/best_rif.csv")
best_rif$val <- as.numeric(best_rif$val)

levels <- group_by(best_rif, name) %>%
    summarise(max_rif = max(val)) %>%
    arrange(max_rif) %>% 
    pull(name)



p2 <- ggplot(best_rif, aes(x = categorie, y = val, fill = categorie))+
    geom_bar(stat = "identity", position = "dodge", width=0.8) +
    scale_fill_manual(values=c("indianred1", "indianred4","deepskyblue1", "deepskyblue4"))+
    ylab("RIF")+
    coord_flip(ylim = c(0, 4.2), expand = FALSE)+
    xlab("TFs")+
    facet_wrap(~factor(name, levels = rev(levels)))+
    theme_bw()+
    #scale_y_discrete(breaks=seq(-1, 4, 0.5))+
    theme(panel.grid.major = element_line(colour = "white"), 
          panel.grid.minor = element_line(colour = "white"),
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
          legend.position = "none")

ggsave("plots/best_rif2.png", p2, height = 4, width = 6)




# TFs with Gprofiler------------  

# listes of TFS from g;profiler
up_cond_tf <- read.csv("data/gprofiler/up_cond.csv")
up_line_tf <- read.csv("data/gprofiler/up_line.csv")
down_cond_tf <- read.csv("data/gprofiler/down_cond.csv")
de_line_tf <- read.csv("data/line_DE_tf_gprofiler.csv")
de_cond_tf <- read.csv("data/gprofiler/DE_cond.csv")

# add "condition" categorie
up_cond_tf$categorie <- "UP_Condition"
up_line_tf$categorie <- "UP_Line"
down_cond_tf$categorie <- "DOWN_Condition"
de_line_tf$categorie <- "DE_Line"
de_cond_tf$categorie <- "DE_Condition"

# order TFs with -log10(pval_adj)
up_cond_tf <- up_cond_tf[order(up_cond_tf$negative_log10_of_adjusted_p_value, decreasing = TRUE), ]
up_line_tf <- up_line_tf[order(up_line_tf$negative_log10_of_adjusted_p_value, decreasing = TRUE), ]
down_cond_tf <- down_cond_tf[order(down_cond_tf$negative_log10_of_adjusted_p_value, decreasing = TRUE), ]
de_cond_tf <- de_cond_tf[order(de_cond_tf$negative_log10_of_adjusted_p_value, decreasing = TRUE), ]

# x <- data.frame(tf = tfs$term_name, val= tfs$negative_log10_of_adjusted_p_value, id = tfs$term_id, categorie = tfs$categorie)
# # save the liste of the best Tfs 
# write.csv(x, "data/tsf.csv")

x <- read.csv("data/tsf.csv")

all_tf <- rbind(up_cond_tf, up_line_tf, down_cond_tf, de_cond_tf, de_line_tf)

motifs <- filter(x, grepl(pattern = "_1$", id)) %>%
    group_by(categorie) %>%
    slice_max(val, n = 5) %>%
    pull(id) %>%
    unique()

tfs <- mutate(x, categorie = factor(categorie, levels = c("DE_Condition", "UP_Condition", "DOWN_Condition", "DE_Line","UP_Line", "DOWN_Line")))%>%
    filter(id %in% motifs)


p4 <- ggplot(tfs, aes(x = categorie, y = val, fill = categorie))+
    geom_bar(stat = "identity")+
    xlab("")+
    ylab("-Log10(pVal_Adj)")+
    scale_fill_manual("Categories", values=c("yellowgreen", "forestgreen","goldenrod1", "mediumorchid4", "Blue4", "brown3"), drop = FALSE)+
    coord_flip(ylim = c(0, 35), expand = TRUE)+
    facet_wrap(~factor(tf,))+
    scale_x_discrete(drop = FALSE)+
    theme_bw()
 
png("plots/tfs_gprofiler.png", width = 800, height = 600, pointsize = 20)
p4
dev.off()

ggsave(filename ="plots/tfs_gprofiler.png", p4)