Commit 1d43b2ff authored by Safia Saci's avatar Safia Saci
Browse files

Upload New File

parent bca95c80
# module load compiler/gcc-7.2.0
# module load system/R-4.0.2_gcc-7.2.0
# Packages ------
library(sqldf)
library(dplyr)
library(CeTF)
library(data.table)
library(kableExtra)
# tables with diff expr genes ------
## Input data -------
coefLine <- read.csv("data/coef_line.csv", sep = ";", header = T)
coefCondition <- read.csv("data/coef_condition.csv", sep = ";", header = T)
coefSex <- read.csv("data/coef_sex.csv", sep = ";", header = T)
## collection of gene_names into gtf :
gene_name <- function(){
listeGenes <- as.data.frame(gtf$gene_id)
listeGenes <- distinct(listeGenes)
for(i in listeGenes){
listG <- gtf[gtf$gene_id %in% i, c("gene_id", "gene_name")]
geneName <- as.data.frame(listG)[ , c("gene_id", "gene_name")]
}
return(distinct(geneName))
}
geneNames <- gene_name()
# collection of gene_type into gtf :
gene_type <- function(){
listeGenes <- as.data.frame(gtf$gene_id)
listeGenes <- distinct(listeGenes)
for(i in listeGenes){
listG <- gtf[gtf$gene_id %in% i, c("gene_id", "gene_biotype")]
geneType <- as.data.frame(listG)[ , c("gene_id", "gene_biotype")]
}
return(distinct(geneType))
}
geneType <- gene_type()
# creation of the tsv file containing the genes, their Expression value and their expression status
gene_file <- sqldf("select C.gene, G.gene_name as gene_symbol, T.gene_biotype as gene_type, C.logFC as log2fc_condition, C.adj_P_Val as adjpval_condition, C.expressed as status_condition,
L.logFC as log2fc_line, L.adj_P_Val as adjpval_line, L.expressed as status_line,
S.logFC as log2fc_sex, S.adj_P_Val as adjpval_sex, S.expressed as status_sex
from coefLine L, coefCondition C, coefSex S, geneNames G, geneType T
where l.gene = C.gene
and C.gene = S.gene
and L.gene = G.gene_id
and C.gene = T.gene_id")
write.csv(gene_file, "data/gene_info.csv")
# differentially expressed genes in line and in condition:
diff_ex <- sqldf("select C.gene, C.expressed as condition_expressed, L.expressed as line_expressed
from coefLine L, coefCondition C
where C.gene = L.gene
and (C.expressed != 'NS' and L.expressed != 'NS')")
# > diff_ex
# gene condition_expressed line_expressed
# 1 ENSSSCG00000000293 UP UP
# 2 ENSSSCG00000000555 DOWN DOWN
# 3 ENSSSCG00000001392 UP UP
# 4 ENSSSCG00000002041 DOWN DOWN
# 5 ENSSSCG00000002835 DOWN DOWN
# 6 ENSSSCG00000007237 UP DOWN
# 7 ENSSSCG00000008559 UP UP
# 8 ENSSSCG00000008789 DOWN UP
# 9 ENSSSCG00000009498 DOWN DOWN
# 10 ENSSSCG00000010036 DOWN DOWN
# 11 ENSSSCG00000010431 DOWN DOWN
# 12 ENSSSCG00000011880 UP UP
# 13 ENSSSCG00000012292 UP UP
# 14 ENSSSCG00000016140 DOWN DOWN
# 15 ENSSSCG00000016159 DOWN DOWN
# 16 ENSSSCG00000032894 DOWN DOWN
# genes Network --------
## with Cor ----------
load("data/expMat2.RData")
load("data/expMat.RData")
cormat_gene_keep <- cor(t(expMat2), method = "pearson")
cor_tab <- data.table(cormat_gene_keep)
cor_tab$gene_1 <- rownames(cormat_gene_keep)
cor_table <- melt(cor_tab, id.vars = "gene_1")
cor_table <- cor_table %>% arrange(desc(abs(cor_table$value)))
cor_table2<- cor_table[13739:23738, ]
cor_table2 %>% distinct(value)
corelation_table <- distinct(cor_table2, value, .keep_all = TRUE)
cor_thr = 0.8
corelation_table$value <- replace(corelation_table$value, corelation_table$value < cor_thr, NA_character_)
corelation_table$value <- replace(corelation_table$value, corelation_table$value >= cor_thr, "Pos")
corelation_table$value <- replace(corelation_table$value, corelation_table$value <= cor_thr, "Neg")
## with PCIT ---------
# the tables with genes diff ex
## line and condition
diff_ex_gene <- sqldf("select C.gene, C.expressed as condition_expressed, L.expressed as line_expressed
from coefLine L, coefCondition C
where C.gene = L.gene
and (C.expressed != 'NS' or L.expressed != 'NS')")
# only line
diff_ex_gene_line <- sqldf("select L.gene, L.expressed as line_expressed
from coefLine L
where L.expressed != 'NS' ")
# only condition
diff_ex_gene_cond <- sqldf("select C.gene, C.expressed as condition_expressed
from coefCondition C
where C.expressed != 'NS' ")
# the scores matrix of genes diff expr
de_mat <- expMat2[diff_ex_gene$gene, ]
# Application of partial correlation
cor_de_mat <- PCIT(de_mat, tolType = "mean" )
pcor_de_table <- cor_de_mat$tab
# remove values of corr2 == null
pcor_mat_distinct <- sqldf("select * from pcor_de_table P
where P.corr2 != 0.0000")
# generation of the diff expr genes file with the pvalues nd their expressions.
cyto_file <- sqldf("select G.gene, gene_symbol, gene_type, log2fc_condition, adjpval_condition, status_condition,
log2fc_line, adjpval_line, status_line,
log2fc_sex, adjpval_sex, status_sex
from gene_file G, diff_ex_gene D
where G.gene = D.gene")
write.csv(cyto_file, "data/cyto_file.csv")
# Density plots for PCIT:
svg("plots/densityPlot.svg", width = 8, height = 6)
densityPlot(cor_de_mat$adj_raw, cor_de_mat$adj_sig)
dev.off()
# collection of diff expr genes
diff_ex_gene$condition_expressed<- paste0("Condition_", diff_ex_gene$condition_expressed)
diff_ex_gene$line_expressed <- paste0("Line_", diff_ex_gene$line_expressed)
# case where gene is diff_expr in line and in condition
line_cond_ex <- diff_ex$gene
for (i in line_cond_ex) {
diff_ex_gene$line_expressed[which(diff_ex_gene$gene == i)] <- "Double_expressed"
diff_ex_gene$condition_expressed[which(diff_ex_gene$gene == i)] <- "Double_expressed"
}
# add the expression column
diff_ex_gene$expression <- "null"
# filling
diff_ex_gene$expression <- ifelse(
diff_ex_gene$condition_expressed == "Condition_NS",
diff_ex_gene$line_expressed,
ifelse(
diff_ex_gene$line_expressed == "Line_NS",
diff_ex_gene$condition_expressed,
"Double_expressed"
)
)
# recovery of 12640 genes with the best correlation in abs value:
pcor_mat_dist_filter <- pcor_mat_distinct %>% arrange(desc(abs(pcor_mat_distinct$corr2)))
pcor_dist_filter_table <- pcor_mat_dist_filter[1:12640, ]
write.csv(pcor_dist_filter_table, "data/pcor_dist_filter_table.csv")
# save data for cytoscape
write.csv(pcor_de_table, "data/pcor_de_table.csv")
write.csv((pcor_mat_distinct), "data/pcor_de_distinct.csv")
write.csv(diff_ex_gene, "data/diff_ex_gene.csv")
# RIF analysis ---------
genontology <- fread("data/genontology_0003700.txt", col.names = c("ACC_ID", "gene_product", "gene_product_label", "panther_family_label", "panther_family"))
uniprot <- fread("data/uniProt.txt", na.strings="", col.names = c("Genes_table_ID", "UniProtKB_Gene_Name_ID"))
uniprot$UniProtKB_Gene_Name_ID[!is.na(uniprot$UniProtKB_Gene_Name_ID)] <- paste0("UniProtKB:",uniprot$UniProtKB_Gene_Name_ID[!is.na(uniprot$UniProtKB_Gene_Name_ID)])
# combine uniprot and genontology files
Go_pigs <- sqldf("select U.Genes_table_ID, U.UniProtKB_Gene_Name_ID, G.gene_product_label, G.panther_family_label, G.panther_family
from uniprot U, genontology G
where G.ACC_ID == U.UniProtKB_Gene_Name_ID ")
#
# my_genes_de <- diff_ex_gene$gene
# summary(my_genes_de %in% row.names(expMat2))
#
# my_genes_tf <- Go_pigs$Genes_table_ID
# summary(my_genes_tf %in% row.names(expMat2))
# my_genes_tf <- my_genes_tf[my_genes_tf %in% row.names(expMat2)]
# de_mat_tf <- expMat2[c(my_genes_de, my_genes_tf), ]
## line --------------------
my_genes_de_line <- diff_ex_gene_line$gene
summary(my_genes_de_line %in% row.names(expMat))
my_genes_tf_line <- Go_pigs$Genes_table_ID
summary(my_genes_tf_line %in% row.names(expMat))
my_genes_tf_line <- my_genes_tf_line[my_genes_tf_line %in% row.names(expMat)]
de_mat_tf_line <- expMat[c(my_genes_de_line, my_genes_tf_line), ]
RIF_out_line <- RIF(input = de_mat_tf_line,
nta = length(my_genes_de_line),
ntf = length(my_genes_tf_line),
nSamples1 = 12,
nSamples2 = 12)
## cond------------------
# ---
my_genes_de_cond <- diff_ex_gene_cond$gene
summary(my_genes_de_cond %in% row.names(expMat))
my_genes_tf_cond <- Go_pigs$Genes_table_ID
summary(my_genes_tf_cond %in% row.names(expMat))
my_genes_tf_cond <- my_genes_tf_cond[my_genes_tf_cond %in% row.names(expMat)]
de_mat_tf_cond <- expMat[c(my_genes_de_cond, my_genes_tf_cond), ]
## Go_pigs sorting
de_mat_tf_cond <- cbind(de_mat_tf_cond[,c("G11-_fasted_F_2884","G11-_fasted_M_2889",
"G11-_fasted_F_2899","G11-_fasted_M_2902",
"G11-_fasted_F_2980","G11-_fasted_M_2987",
"G11+_fasted_F_2990","G11+_fasted_M_2992",
"G11+_fasted_F_3004","G11+_fasted_M_3011",
"G11+_fasted_F_3042","G11+_fasted_M_3050",
"G11-_fed_F_2885","G11-_fed_M_2888",
"G11-_fed_F_2898","G11-_fed_M_2903",
"G11-_fed_F_2981","G11-_fed_M_2985",
"G11+_fed_F_2989","G11+_fed_M_2991",
"G11+_fed_M_3012","G11+_fed_M_3014",
"G11+_fed_F_3039","G11+_fed_M_3049")])
RIF_out_cond <- RIF(input = de_mat_tf_cond,
nta = length(my_genes_de_cond),
ntf = length(my_genes_tf_cond),
nSamples1 = 12,
nSamples2 = 12)
## saving :
RIF_out_cond <- RIF_out_cond[order(RIF_out_cond$RIF1, decreasing = TRUE), ]
RIF_out_line <- RIF_out_line[order(RIF_out_line$RIF1, decreasing = TRUE), ]
list_tf <- sqldf("select distinct L.TF, L.avgexpr as avgexpr_line, L.RIF1 as RIF1_line, L.RIF2 as RIF2_line,
C.avgexpr as avgexpr_cond, C.RIF1 as RIF1_cond, C.RIF2 as RIF2_cond,
GO.UniProtKB_Gene_Name_ID, GO.gene_product_label, GO.panther_family
from RIF_out_line L, RIF_out_cond C, Go_pigs GO
where L.TF = GO.Genes_table_ID
and C.TF = L.TF")
write.csv(list_tf, "data/RIF_Tf_file.csv")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment