Commit 478ccb65 authored by Gosset Simon's avatar Gosset Simon
Browse files

first commit

parent 2d0fd341
Package: appinetwork
Type: Package
Title: construction and Analysis of Protein-Protein Interactions (PPI)
networks for complexes and biological processes
Version: 0.1.0
Description: This package is used to construct and analyse a network about PPI in cellular complexes or biological processes.
There are functions to parse files, to construct network, to modelize assembly intermediary of complexes and to cluster network (TFit).
Parse Databases : IrefIndex, Biogrid, Intact.
Construct Thesaurus.
Build Network.
Analyse Network : Assembly intermediary + TFit clustering to find all interactants.
Author: Marie-Hélène Mucchielli-Giorgi <marie-helene.mucchielli@i2bc.paris-saclay.fr> [aut],
Annie Glatigny [ctb], Benjamin Auder <Benjamin.Auder@u-psud.fr> [ctb], Melina Gallopin <Melina.Gallopin@u-psud.fr> [ctb]
Maintainer: Marie-Hélène Mucchielli-Giorgi <marie-helene.mucchielli@i2bc.paris-saclay.fr>
Depends:
R (>= 3.0.0), gWidgets2, gWidgets2tcltk, tcltk, ape, digest
Imports:
MASS
Suggests:
roxygen2,
testthat
URL: http://www.i2bc.paris-saclay.fr/?lang=en
License: CC0
Encoding: UTF-8
RoxygenNote: 5.0.1
BSD 3-Clause License
Copyright (c) 2019, Mélina Gallopin
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
exportPattern("^[^\\.]")
export(interface)
#export(thesaurus_window)
import(gWidgets2)
import(gWidgets2tcltk)
import(tcltk)
import(ape)
#exportPattern("^[[:alpha:]]+")
useDynLib(appinetwork)
\ No newline at end of file
### function construct
### function branch_length
#############################################################
construct <- function(part1, part2, e, mat, inputID, inputSGD) {
ind1 <- grep(part1, e)
ind2 <- grep(part2, e)
if ((length(ind1) > 0) && (length(ind2) > 0) && (ind1 < ind2)) {
ind2 <- ind2-1
}
if (length(ind1) > 0) {
part1 <- e[ind1]
e <- e[-ind1]
}
if (length(ind2) > 0) {
part2 <- e[ind2]
e <- e[-ind2]
}
if (length(part1) == length(setdiff(inputID, inputID[inputSGD == part2]))) {
d1 <- round(branch_length(mat, part1, setdiff(inputID, inputID[inputSGD == part1])), 3) / 2.0
d2 <- round(branch_length(mat, part2, setdiff(inputID, inputID[inputSGD == part2])), 3) / 2.0
}
else {
d1 <- round(branch_length(mat, part1, setdiff(inputID, inputID[inputSGD == part1])), 3)
d2 <- round(branch_length(mat, part2, setdiff(inputID, inputID[inputSGD == part2])), 3)
}
e <- c(e, paste("(", part1, ":", d1, ",", part2, ":", d2, ")", sep = ""))
return(e)
}
#############################################################
branch_length <- function(matrix, set1, set2) {
goodTriples <- 0
badTriples <- 0
# computing triplets with one element in set2 and 2 elements in set1
if (length(set1) > 1) {
for (i in 1:length(set1)) {
for (j in i:length(set1)) {
for (k in 1:length(set2)) {
if (k != i && k != j && i != j) {
if (matrix[i,j] < max(matrix[i,k], matrix[j,k])) {
badTriples <- badTriples + 1
}
else {
goodTriples <- goodTriples + 1
}
}
}
}
}
}
# computing triplets with one element in set1 and 2 elements in set2
if (length(set2) > 1) {
for (i in 1:length(set2)) {
for (j in i:length(set2)) {
for (k in 1:length(set1)) {
if (k != i && k != j && i != j) {
if (matrix[i,j] < max(matrix[i,k], matrix[j,k])) {
badTriples <- badTriples + 1
}
else {
goodTriples <- goodTriples + 1
}
}
}
}
}
}
return (goodTriples / (goodTriples + badTriples));
}
# Search all the databases having the same PPI coming from one or several publications
DataBases<-function(Final.List.Redondant){
Inter.red<-as.matrix(Final.List.Redondant)
indices<-rep("NA",dim(Inter.red)[1])
for(i in 1:dim(Inter.red)[1])
{
indi<-intersect(c(grep(Inter.red[i,12],Inter.red[,11]),grep(Inter.red[i,12],Inter.red[,12])),c(grep(Inter.red[i,11],Inter.red[,11]),grep(Inter.red[i,11],Inter.red[,12])))
indices[i]<-toString(unique(Inter.red[indi,10]))
}
Inter.red[,10]<-indices
return(Inter.red)
}
search_id <- function(cible, thesaurus) {
# Recherche de la cible dans tout le thesaurus pour associer le bon UniprotID avec le nom de la proteine et le nom du gene
# On regarde si la cible peut etre un ID-isoforme pour cherche l'ID seul dans le thesaurus : UniprotID ou OldID
CIBLE <- unlist(strsplit(cible, "-"))
ciblePrincipale <- CIBLE[1]
if (length(CIBLE[]) > 1) {
complementCible <- paste('-', CIBLE[2], sep = '')
}
else {
complementCible <- ""
}
# On cherche la cible dans la liste des uniprotID du thesaurus : colonne 1
ligneThesaurus <- 1 # on commence à la premiere ligne du thesaurus
ligneMax <- dim(thesaurus)[1]
resultat <- list()
recherche <- 'non'
# while (recherche == 'non' & ligneThesaurus <= ligneMax) {
# if (ciblePrincipale == thesaurus[ligneThesaurus,1]) {
# resultat <- c(paste(thesaurus[ligneThesaurus,1], complementCible, sep = ''), paste(thesaurus[ligneThesaurus,5], complementCible, sep = ''), thesaurus[ligneThesaurus,3])
# recherche <- 'oui'
# }
# else {
# ligneThesaurus <- ligneThesaurus + 1
# }
# }
resultat = grep(pattern = ciblePrincipale, x = thesaurus[,1])
if (length(resultat) != 0) {
resultat = c(thesaurus[resultat,1], thesaurus[resultat,5], thesaurus[resultat,3])
} else {
resultat = grep(pattern = ciblePrincipale, x = thesaurus[,5])
if (length(resultat) != 0) {
resultat = c(thesaurus[resultat,1], thesaurus[resultat,5], thesaurus[resultat,3])
} else {
resultat = grep(pattern = ciblePrincipale, x = thesaurus[,7])
if (length(resultat) != 0) {
resultat = c(paste(thesaurus[resultat,1], complementCible, sep = ''), paste(thesaurus[resultat,5], complementCible, sep = ''), thesaurus[resultat,3])
} else {
resultat = grep(pattern = ciblePrincipale, x = thesaurus[,6])
if (length(resultat) != 0) {
resultat = c(thesaurus[resultat,1], thesaurus[resultat,5], thesaurus[resultat,3])
} else {
resultat = grep(pattern = ciblePrincipale, x = thesaurus[,4])
if (length(resultat) != 0) {
resultat = c(thesaurus[resultat,1], thesaurus[resultat,5], thesaurus[resultat,3])
} else {
resultat = grep(pattern = ciblePrincipale, x = thesaurus[,3])
if (length(resultat) != 0) {
resultat = c(thesaurus[resultat,1], thesaurus[resultat,5], thesaurus[resultat,3])
}
}
}
}
}
}
return (resultat)
}
############################################################################################
recup_ppi <- function(inputListFile, Base) {
# Recuperation des colonnes des bases qui sont utilisees dans le reseau
Base.f <- as.matrix(Base[,c(1:5,7:11,14:15)])
ind <- unique(c(as.vector(grep(inputListFile[1], Base.f[,1])), as.vector(grep(inputListFile[1], Base.f[,2]))))
cat('\n>Searching interactions...')
# Recherche des interactions dans les bases, contenant au moins une proteine du fichier de recherche input list
for (i in 1:length(inputListFile)) {
input_name = paste("^",inputListFile[i],"$", sep = "")
ind <- unique(c(ind, c(as.vector(grep(input_name, Base.f[,1])), as.vector(grep(input_name, Base.f[,2])))))
}
interaction.dir <- Base.f[ind,]
cat('OK')
return(interaction.dir)
}
############################################################################################
pubmed_id <- function(Final.List.Redondant, Name, run) {
cat('\n>Searching pubmed IDs ... ')
print(dim(Final.List.Redondant))
cat("\n")
# Recuperation des donnees du reseau
Inter.red <- as.matrix(Final.List.Redondant)
Inter.partialRed <- as.matrix(Final.List.Redondant[1,])
if (dim(Inter.partialRed)[2] == 1) {
Inter.partialRed <- t(Inter.partialRed)
}
if(run==2)
pb <- txtProgressBar(min = 0, max = dim(Inter.red)[1],style = 3)
# Parcours du reseau pour trouver des interactions redondantes et identifier le nombres d'articles associes (nombre de pubmed ID)
l <- dim(Inter.red)[1]
i <- 1
if (Name != "nofile") {
unredundant <<- as.matrix(t(c(Inter.red[1,], "NA")))
colnames(unredundant) <<- c(colnames(Inter.red), "NbPmids")
nbl <- 1
while (i <= l) {
indi <- intersect(c(grep(Inter.red[i,2], Inter.red[,1]), grep(Inter.red[i,2], Inter.red[,2])), c(grep(Inter.red[i,1], Inter.red[,1]), grep(Inter.red[i,1], Inter.red[,2])))
j <- 1
# Recherche des redondances d'interactions
while (j <= length(indi)) {
if ((((Inter.red[indi[j],1] == Inter.red[i,1]) && (Inter.red[indi[j],2] == Inter.red[i,2])) || ((Inter.red[indi[j],1] == Inter.red[i,2]) && (Inter.red[indi[j],2] == Inter.red[i,1]))) == FALSE) {
indi <- indi[-j]
j <- j - 1
}
j <- j + 1
}
if (nbl > 1) {
unredundant <<- rbind(unredundant, as.matrix(t(c(Inter.red[i,], "NA"))))
}
# Recuperation des differents pubmed-ID
if (length(indi) > 1) {
pmids <- unique(Inter.red[indi,6])
pmids <- as.matrix(pmids)
pmids2 <- c()
for (n in 1:length(pmids)) {
a <- strsplit(pmids[n,1], "\\|")
a1 <- length(a[[1]])
for (p in 1:a1) {
pmids2 <- c(pmids2, a[[1]][p])
}
}
pmids <- unique(as.vector(pmids2))
# Rassemblement des different pubmed-ID de chaque interaction
pmids3 <- c()
b <- length (pmids)
if (b > 1) {
for (o in 1:length(pmids)) {
pmids3 <- paste(pmids3, pmids[o], sep = "|")
}
pmids3 <- substr(pmids3, 2, nchar(pmids3))
}
if (b == 1) {
pmids3 <- paste(pmids3, pmids)
}
tmp <- Inter.red[indi,]
for (k in 1:length(pmids)) {
Inter.partialRed <- rbind(Inter.partialRed, t(as.matrix(tmp[(1:dim(tmp)[1])[tmp[,6] == pmids[k]][1],])))
}
unredundant[nbl,6] <<- pmids3
# On complete la derniere colonne du reseau avec le nombre de pubmed-ID trouves pour chaque interaction
unredundant[nbl,13] <<- length(pmids)
Inter.red <- Inter.red[-indi,]
l = l - length(indi)
i = i - 1
}
if (length(indi) == 1) {
unredundant[nbl,13] <<- 1
Inter.partialRed <- rbind(Inter.partialRed, t(as.matrix(Inter.red[indi,])))
}
i = i + 1
nbl <- nbl + 1
if(run==2)
setTxtProgressBar(pb, i)
}
Inter.partialRed <- Inter.partialRed[-1,]
if(run==2)
close(pb)
}
cat('OK')
return(unredundant)
}
############################################################################################
load_data <- function(db) {
# Recuperation des donnes contenues dans les bases de donnees selectionnees
cat('\n\n>Loading database...')
data.name.list <- db
# Verification de la presence de donnees dans la base
if (length(data.name.list) == 0) {
cat('ERROR : no databases selected')
stop()
}
# Verification de la compatibilite du format des bases de donnees
multiple.database <- c()
for (i in 1:length(data.name.list)) {
one.multiple.database <- read.delim2(data.name.list[i], header = T, sep = '\t')
tryCatch({
colnames(one.multiple.database) <- c( "uidA", "uidB", "aliasA", "aliasB", "method", "author", "pmid", "taxA", "taxB", "interactionType", "sourceBD", "confidence", "numParticipants", "GeneNameA", "GeneNameB")
multiple.database <- rbind(multiple.database, one.multiple.database)
}, error = function(err) {
message(err)
message('Database(s) dimension different')
})
}
cat('OK dim :')
cat(dim(multiple.database))
# Recuperation des bases si le format correspond
return(multiple.database)
}
############################################################################################
load_network <- function(nw) {
# Recuperation des donnes contenues dans le reseau selectionne
cat('\n>Loading network ... ')
network.name.list <- nw
# Verification de la presence d'interactions dans le reseau
if (length(network.name.list) == 0) {
cat('ERROR : no network selected')
stop()
}
# Verification de la compatibilite de format du reseau
multiple.network <- c()
for (i in 1:length(network.name.list)) {
one.multiple.network <- read.delim2(network.name.list[i], header = T, sep = '\t', stringsAsFactors = F)
tryCatch({
colnames(one.multiple.network) <- c("aliasA", "method", "aliasB", "uidA", "uidB", "pmid", "taxA", "taxB", "interactionType", "sourceBD", "GeneNameA", "GeneNameB","NbPmids" )
multiple.network <- rbind(multiple.network, one.multiple.network)
}, error = function(err) {
message(err)
message('Network dimension uncorrect')
})
}
cat('OK dim :')
cat(dim(unique(multiple.network)))
# Recuperation du reseau si le format correspond
return(unique(multiple.network))
}
############################################################################################
proximity_score <- function (listProt1, listProt2, inputUniprotID, allProt, formula) {
score <- (-1.0)
listinter <- intersect(listProt1, listProt2)
ninter <- length(listinter)
listunion <- union(listProt1, listProt2)
nunion <- length(listunion)
nUniprotID <- length(inputUniprotID)
for (i in 1:nUniprotID) {
ninter <- ninter - length(grep(inputUniprotID[i], listinter))
nunion <- nunion - length(grep(inputUniprotID[i], listunion))
}
o1 <- length(listProt1)
o2 <- length(listProt2)
O1 <- as.double(o1)
O2 <- as.double(o2)
OO <- as.double(O1 + O2)
# proteins not in the complex, in the neighborhood of subcomplex 1 and subcomplex 2
O11 <- as.double(ninter)
O <- as.double(nunion)
# proteins not in the complex, in the neighborhood of subcomplex 1 but not of subcomplex 2
O12 <- as.double(O1 - O11)
# proteins not in the complex, in the neighborhood of subcomplex 2 but not of subcomplex 1
O21 <- as.double(O2 - O11)
# O21<-length(listProt2)-length(intersect(listProt2,inputUniprotID))-length(ninter)
# proteins neither in the complex, nor in the neighborhood of subcomplex 1 nor of subcomplex 2
N <- length(allProt) - length(inputUniprotID)
N <- as.double(N)
O22 <- as.double(N - O)
S1 <- as.double(O12 + O22)
S2 <- as.double(O21 + O22)
E11 <- as.double(O1 * O2) / N
E12 <- as.double(O1 * S1) / N
E21 <- as.double(S2 * O2) / N
E22 <- as.double(S2 * S1) / N
if (formula == "jaccard") {
valeur <- (O11 / O)
}
if (formula == "liddell") {
valeur <- (O11 * O22 - O12 * O21) / (O2 * S1)
}
if (formula == "dice") {
valeur <- 2.0 * O11 / (O1 + O2)
}
if (formula == "zscore") {
valeur <- (O11 - E11) ^ 2.0 / sqrt(E11)
}
if (formula == "ms") {
valeur <- min(O11 / O1, O11 / O2)
}
if (formula == "Chi2") {
valeur <- N * (O11 - E11) ^ 2.0 / (E11 * E22)
}
score <- round(valeur, 3)
if (score != 'NA') {
return (score)
}
else {
cat('Error : Unable to generate a score, choose an other one')
stop()
}
}
############################################################################################
normalize_mat <- function(mat) {
matMin = min(mat, na.rm = TRUE)
matMax = max(mat, na.rm = TRUE)
if (matMin != matMax) {
for (i in 1:(length(mat[1,]))) {
for (j in 1:(length(mat[1,]))) {
if (matMin < 0) {
# affine transformation
mat[i,j] <- 0.99 * (mat[i,j] - matMin) / (matMax - matMin)
}
else {
# linear transformation
mat[i,j] <- 0.99 * mat[i,j] / matMax
}
}
}
}
else {
for (i in 1:(length(mat[1,]))) {
for (j in 1:(length(mat[1,]))) {
mat[i,j] <- 0 * mat[i,j]
}
}
}
return(mat)
}
############################################################################################
load_inputlist <- function(data) {
# Recuperation des noms des proteines d'interet
cat('\n>Loading inputlist ... ')
data.name.list <- data
if (length(data.name.list) == 0) {
cat('Error no file selected')
stop()
}
multiple.database <- c()
for (i in 1:length(data.name.list)) {
one.multiple.database <- read.delim2(data.name.list[i], header = T, sep = '\t')
multiple.database <- rbind(multiple.database, one.multiple.database)
}
cat('OK dim :')
cat(dim(unique(multiple.database)))
return(unique(multiple.database))
}
############################################################################################
indices_min_jaccard <- function(mat.JaccardDistance) {
ind <- c(-1, -1)
for (i in 1:(dim(mat.JaccardDistance)[1] - 1)) {
for (j in (i + 1):dim(mat.JaccardDistance)[2]) {
if (mat.JaccardDistance[i,j] == min(mat.JaccardDistance, na.rm = TRUE)) {
ind[1] <- i
ind[2] <- j
}
}
}
return(ind)
}
############################################################################################
remove_redundants <- function(inputtable) {
# Recuperation du reseau d'interactions
tab_PPI <- as.matrix(inputtable)
# On suit l'evolution du parcours
prep_file <- tab_PPI[duplicated(tab_PPI[,c(4,5)]) == F,]
# Dedoublement du reseau pour avoir les interactions dans les deux sens
prep_file_inverse <- as.matrix(data.frame(prep_file[,1:3], prep_file[,5], prep_file[,4], prep_file[,6:10], prep_file[,12], prep_file[,11], stringAsFactors=F))
# Parcours du reseau
i <- 1
while (i <= dim(prep_file)[1]) {
prsbar <- txtProgressBar(min = 1, max = dim(prep_file)[1], style = 3)
j <- 1
nom_pubmed_j <- c()
while (j <= dim(prep_file_inverse)[1]) {
# On enleve les lignes avec des interactions entre les meme proteines, sans enlever les interactions d'une proteine avec elle meme
if (prep_file[i,4] == prep_file_inverse[j,4] && prep_file[i,5] == prep_file_inverse[j,5] && as.character(prep_file[i,4]) != as.character(prep_file[i,5])) {
prep_file <- prep_file[-j,]
prep_file_inverse <- prep_file_inverse[-j,]
}
else {
j <- j + 1
}