Commit 510a963b authored by Simon Gosset's avatar Simon Gosset
Browse files

correction pb transposon

parent 26173fc5
......@@ -168,8 +168,8 @@ format_database = function(organism,organismID, putative, file, uniprot, pathrac
# eliminate transposons
thesaurusNotranNoPut<-thesaurus[thesaurus[,6]=="FALSE",]
if(length(thesaurus[thesaurus[,6]==" TRUE",6])>0){
Tab.transposons<-thesaurus[thesaurus[,6]==" TRUE",]
if(length(thesaurus[thesaurus[,6]=="TRUE",6])>0){
Tab.transposons<-thesaurus[thesaurus[,6]=="TRUE",]
Tab.transposons[,2]<-unlist(apply(Tab.transposons,c(1,2), strsplit,split=";")[,2])
for (i in 1:dim(Tab.transposons)[1]){
if(length(grep(Tab.transposons[i,2],irefindex[,1]))>0)
......@@ -182,9 +182,9 @@ format_database = function(organism,organismID, putative, file, uniprot, pathrac
# eliminate putative proteins
if (putative == " TRUE") {
if (putative == "TRUE") {
thesaurusNotranNoPut <- thesaurusNotranNoPut[thesaurusNotranNoPut[,5] == "FALSE",]
Tab.putatives<-thesaurus[thesaurus[,5]==" TRUE",]
Tab.putatives<-thesaurus[thesaurus[,5]=="TRUE",]
Tab.putatives[,2]<-unlist(apply(Tab.putatives,c(1,2), strsplit,split=";")[,2])
for (i in 1:dim(Tab.putatives)[1]) {
if(length(grep(Tab.putatives[i,2],irefindex[,1]))>0)
......@@ -296,6 +296,7 @@ biogrid_window2 <- function(f_pos, mainpanel, pana, mainpath) {
# File selection
chdb <- ggroup(container = pp, horizontale = T)
addSpring(chdb)
bouton1 <- gbutton("Biogrid file", container = chdb, handler = function(...) {
file <<- gfile(text = "Select a file", type = "open", multi = F, container = chdb)
if (is.null(file) == T) {
......
......@@ -21,7 +21,7 @@ recup_species = function(intact_tab, taxon) {
}
format_intact = function(organism, organismID, file, pathracine, other) {
format_intact = function(organism, organismID, file, pathracine, other, thesaurus_file) {
cat("\n\n>FORMATING INTACT DATABASE")
......@@ -32,6 +32,10 @@ format_intact = function(organism, organismID, file, pathracine, other) {
intact <- read.delim(file = intactFileName, header = T, sep = "\t")
thesaurus = read.delim(file = thesaurus_file, header = T, sep = "\t", stringsAsFactors = F)
intact <- intact[,-c(3,4,14)]
intact <- intact[grep("uniprotkb:", intact[,1]),]
intact <- intact[grep("uniprotkb:", intact[,2]),]
......@@ -161,6 +165,49 @@ format_intact = function(organism, organismID, file, pathracine, other) {
irefindex <- cbind(irefindex, numParticipants, GeneNameA, GeneNameB)
colnames(irefindex) <- c("uidA", "uidB", "aliasA", "aliasB", "method", "author", "pmids", "taxa", "taxb", "interactionType", "sourcedb", "confidence", "numParticipants", "GeneNameA", "GeneNameB")
outputfilename <- paste(organism, "_intact.txt", sep = "")
# Remove from the interaction table every row involving a protein that the
# thesaurus flags as a transposon (thesaurus column 10 == "TRUE").
#   - thesaurus column 1 (current ID) is used as a regular expression matched
#     anywhere in uidA / uidB (irefindex columns 1 and 2);
#   - thesaurus column 7 (";"-separated old IDs) is matched exactly ("^id$").
# The flag is compared after trimming whitespace, and NA flags count as
# "not a transposon", so they can no longer produce all-NA rows.
transposon_flag <- trimws(as.character(thesaurus[, 10])) %in% "TRUE"
if (any(transposon_flag)) {
  Tab.transposons <- thesaurus[transposon_flag, , drop = FALSE]
  uid_a <- as.character(irefindex[, 1])
  uid_b <- as.character(irefindex[, 2])
  # One logical per interaction row: TRUE when the row must be discarded.
  is_transposon_row <- rep(FALSE, length(uid_a))
  for (i in seq_len(nrow(Tab.transposons))) {
    # Current identifier: substring / regex match, as before.
    current_id <- as.character(Tab.transposons[i, 1])
    # An NA or empty pattern would match nothing useful (or everything), skip it.
    if (!is.na(current_id) && nzchar(current_id)) {
      is_transposon_row <- is_transposon_row |
        grepl(current_id, uid_a) | grepl(current_id, uid_b)
    }
    # Former identifiers: exact match only.
    old_uniprot <- unlist(strsplit(as.character(Tab.transposons[i, 7]), split = ";"))
    # Drop NA / empty fragments so that "^NA$" or "^$" are never used as patterns.
    old_uniprot <- old_uniprot[!is.na(old_uniprot) & nzchar(old_uniprot)]
    for (j in old_uniprot) {
      exact_pattern <- paste0("^", j, "$")
      is_transposon_row <- is_transposon_row |
        grepl(exact_pattern, uid_a) | grepl(exact_pattern, uid_b)
    }
  }
  # drop = FALSE keeps the table two-dimensional even if a single row remains.
  irefindex <- irefindex[!is_transposon_row, , drop = FALSE]
}
write.table(irefindex, file = outputfilename, row.names = F, col.names = T, quote = F, sep = "\t")
cat(paste("OK\n\n>Formating intact database is done.\n\nDatabase file is saved in", database.path, sep = " : "))
......@@ -187,6 +234,7 @@ intact_window <- function(f_pos, mainpanel, pana, mainpath) {
# File selection
chdb <- ggroup(container = pp, horizontale = T)
addSpring(chdb)
bouton1 <- gbutton("Intact file", container = chdb, handler = function(...) {
file <<- gfile(text = "Select a file", type = "open", multi = F, container = chdb)
if (is.null(file) == T) {
......@@ -197,6 +245,18 @@ intact_window <- function(f_pos, mainpanel, pana, mainpath) {
cat(paste('\n>File selected : ', file, sep = ''))
}
})
# Path of the thesaurus file chosen by the user; empty until a file is picked.
# It is later handed to format_intact() as its thesaurus_file argument.
true_thesaurus <- character(0)
# Button opening a file chooser for the thesaurus file. The handler stores the
# selection in the enclosing true_thesaurus variable (hence `<<-`), then either
# reports an error or updates the button label and logs the selected path.
bouton3 <- gbutton("Thesaurus file", container = chdb, handler = function(...) {
  true_thesaurus <<- gfile(text = "Select a file", type = "open", multi = FALSE, container = chdb)
  # A cancelled dialog is not necessarily NULL: treat a zero-length, NA or
  # empty-string result as "nothing selected" too, instead of announcing
  # "0 thesaurus file selected".
  nothing_selected <- is.null(true_thesaurus) ||
    length(true_thesaurus) == 0 ||
    all(is.na(true_thesaurus)) ||
    !any(nzchar(true_thesaurus))
  if (nothing_selected) {
    gmessage('Selected thesaurus file is null', icon = 'error')
  } else {
    bouton3$set_value(paste(length(true_thesaurus), 'thesaurus file selected'))
    cat(paste('\n>thesaurus file selected : ', true_thesaurus, sep = ''))
  }
})
ppb <- ggroup(container = pp)
addSpring(ppb)
......@@ -249,7 +309,7 @@ intact_window <- function(f_pos, mainpanel, pana, mainpath) {
# intact_othero(organism, organismID, file, pathracine)
format_intact(organism, organismID, file, pathracine, other)
format_intact(organism, organismID, file, pathracine, other, true_thesaurus)
dispose(bpc)
}, container = bpc)
......@@ -258,7 +318,7 @@ intact_window <- function(f_pos, mainpanel, pana, mainpath) {
else {
format_intact(organism, organismID, file, pathracine, other)
format_intact(organism, organismID, file, pathracine, other, true_thesaurus)
}
......
# Development session set-up: the three statements below run in order.
# 1. Remove every object returned by ls() from the current environment.
rm(list = ls())
# 2. Move to the package's R/ source directory.
#    NOTE(review): hard-coded absolute path under one user's home directory;
#    it will fail on any other machine.
setwd("/home/IPS2/sgosset1/Documents/appinetwork/appinetwork/R")
# 3. Attach the appinetwork package.
library(appinetwork)
......
#!/usr/bin/python
import sys
import re
# Command-line arguments, read relative to the script name:
#   <script> <uniprot file> <thesaurus file> <organism>
# The script name is first searched for literally (historical behaviour); when
# the script was launched through a path (e.g. /some/dir/thesaurusPy.py) the
# literal lookup fails, so fall back to argv[0], which is the script itself.
try:
    script_position = sys.argv.index("thesaurusPy.py")
except ValueError:
    script_position = 0

# The (up to) three values following the script name.
script_arguments = sys.argv[script_position + 1:script_position + 4]

# Each missing argument is reported and ends the run with a non-zero status,
# instead of exiting silently with a success status.
if len(script_arguments) < 1:
    print("ERROR: please, enter uniprot file")
    sys.exit(1)
uniprot = script_arguments[0]

if len(script_arguments) < 2:
    print("ERROR: please, enter thesaurus file")
    sys.exit(1)
thesaurus = script_arguments[1]

if len(script_arguments) < 3:
    print("ERROR: please, enter organism")
    sys.exit(1)
organism = script_arguments[2]
### Data retrieval
# State used while scanning the UniProt flat file line by line. The string
# 'NA' is the "not found yet" sentinel: the loop below tests fields against it
# before filling them.
# Input file handle, iterated line by line by the loop below.
# NOTE(review): no matching close() is visible in this part of the script.
fichier = open(uniprot,'r')
# Not referenced in the visible part of the script.
donnees = []
# Flag set to 1 while a "Synonyms=" list continues on the following GN line(s).
test = 0
# Not referenced in the visible part of the script.
ligne = 'NA'
# Scratch list holding the whitespace-split tokens of the current AC line.
liste = []
# First accession found on the AC line(s) of the current entry.
ID = 'NA'
# Last BioGrid identifier read from a "DR BioGrid;" line ...
Biogrid = 'NA'
# ... and the list of all BioGrid identifiers collected so far.
BIOGRID = []
# Entry name captured from the ID line (the part before the underscore).
NameProt = 'NA'
# Gene name captured from "Name=" on a GN line.
Name = 'NA'
# RefSeq protein reference (NP_... or XP_...) from a "DR RefSeq;" line.
Ref = 'NA'
# ";"-separated gene synonyms collected from the GN line(s).
Syn = 'NA'
# Gene / locus identifier (KEGG id or OrderedLocusNames, depending on organism).
ORF = 'NA'
# Concatenation of the additional accessions listed on the AC line(s).
Iso = 'NA'
# Not referenced in the visible part of the script.
ISO = 'NA'
# Number captured from "Named isoforms=" on a CC line.
NIso = 'NA'
for i in fichier :
i = i.strip("\n")
# Recuperation du nom principal de la proteine (29, 30-37)
if i[0:2] == "ID" :
review_state = i[29:39]
NameProt = re.search(r"(ID)(\s)*(?P<id>\w+)(_)", i)
if NameProt is not None:
NameProt = NameProt.group('id')
NameProt = str(NameProt)
else :
NameProt = 'NA'
if review_state == "Unreviewed" :
continue
# Recuperation de l'identifiant uniprot de la proteine et de ses anciens ID
if i[0:2] == "AC" :
if ID == "NA" :
ID = re.search(r"(AC)(\s)*(?P<ac>\w+)(;)", i)
if ID is not None:
ID = ID.group('ac')
ID = str(ID)
line = str(i)
line = line.strip("AC ")
liste = line.split()
if len(liste) > 1 :
Iso = ''
for j in range(1, len(liste)) :
Iso = Iso + liste[j]
else :
ID = 'NA'
else :
line = str(i)
line = line.strip("AC ")
liste = line.split()
if len(liste) > 0 :
for j in range(0, len(liste)) :
Iso = Iso + liste[j]
# Recuperation de la reference du locus (NP)
elif i[0:12] == "DR RefSeq;" :
if Ref == "NA" :
Ref = re.search(r"(DR RefSeq; NP_)(?P<rsq>\d+)(.)", i)
if Ref is not None:
Ref = Ref.group('rsq')
Ref = "NP_" + str(Ref)
else :
Ref = re.search(r"(DR RefSeq; NP_)(?P<rsq>\d+)(;)", i)
if Ref is not None:
Ref = Ref.group('rsq')
Ref = "NP_" + str(Ref)
else :
Ref = re.search(r"(DR RefSeq; XP_)(?P<rsq>\d+)(.)", i)
if Ref is not None:
Ref = Ref.group('rsq')
Ref = "XP_" + str(Ref)
else :
Ref = re.search(r"(DR RefSeq; XP_)(?P<rsq>\d+)(;)", i)
if Ref is not None:
Ref = Ref.group('rsq')
Ref = "XP_" + str(Ref)
else :
Ref = 'NA'
# Recuperation de l'identifiant biogrid
elif i[0:13].upper() == "DR BIOGRID;" :
Biogrid = re.search(r"(DR BioGrid;)(\s)*(?P<bg>\w+)(;)", i, flags=re.IGNORECASE)
if Biogrid is not None:
Biogrid = Biogrid.group('bg')
Biogrid = str(Biogrid)
BIOGRID.append(Biogrid)
else :
Biogrid = 'NA'
# Recuperation du nombre d'isoformes
elif i[0:2] == "CC" :
if NIso == 'NA' :
NIso = re.search(r"(Named isoforms=)(?P<is>\d+)(;)", i)
if NIso is not None :
NIso = NIso.group('is')
NIso = str(NIso)
else :
NIso = 'NA'
# Recuperation de l'identifiant du gene : Hsapiens, Celegans, Dmelanogaster, Mmusculus, Rnorvegicus, Athaliana
#elif i[0:13] == "DR UniGene;" :
# if organism == "homo+sapiens" :
# if ORF == 'NA' :
# ORF = re.search(r"(DR UniGene;)(\s)*(Hs.)(?P<ug>\w+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'Hs.' + str(ORF)
# else :
# ORF = 'NA'
# if organism == "caenorhabditis+elegans" :
# if ORF == 'NA' :
# ORF = re.search(r"(DR UniGene;)(\s)*(Cel.)(?P<ug>\w+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'Cel.' + str(ORF)
# else :
# ORF = 'NA'
# if organism == "drosophila+melanogaster" :
# if ORF == 'NA' :
# ORF = re.search(r"(DR UniGene;)(\s)*(Dm.)(?P<ug>\w+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'Dm.' + str(ORF)
# else :
# ORF = 'NA'
# if organism == "mus+musculus" :
# if ORF == 'NA' :
# ORF = re.search(r"(DR UniGene;)(\s)*(Mm.)(?P<ug>\w+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'Mm.' + str(ORF)
# else :
# ORF = 'NA'
# if organism == "rattus+norvegicus" :
# if ORF == 'NA' :
# ORF = re.search(r"(DR UniGene;)(\s)*(Rn.)(?P<ug>\w+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'Rn.' + str(ORF)
# else :
# ORF = 'NA'
# if organism == "arabidopsis+thaliana" :
# if ORF == 'NA' :
# ORF = re.search(r"(DR UniGene;)(\s)*(At.)(?P<ug>\w+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'At.' + str(ORF)
# else :
# ORF = 'NA'
# Recuperation de l'identifiant du gene : Hsapiens, Celegans, Dmelanogaster, Mmusculus, Rnorvegicus, Athaliana + Ecoli, Scerevisiae
elif i[0:10] == "DR KEGG;" :
if organism == "homo+sapiens" :
if ORF == 'NA' :
ORF = re.search(r"(hsa:)(?P<ug>\S+)(;)", i)
if ORF is not None:
ORF = ORF.group('ug')
ORF = 'hsa:' + str(ORF)
else :
ORF = 'NA'
if organism == "caenorhabditis+elegans" :
if ORF == 'NA' :
ORF = re.search(r"(cel:)(?P<ug>\S+)(;)", i)
if ORF is not None:
ORF = ORF.group('ug')
ORF = 'cel:' + str(ORF)
else :
ORF = 'NA'
if organism == "drosophila+melanogaster" :
if ORF == 'NA' :
ORF = re.search(r"(dme:)(?P<ug>\S+)(;)", i)
if ORF is not None:
ORF = ORF.group('ug')
ORF = 'dme:' + str(ORF)
else :
ORF = 'NA'
if organism == "mus+musculus" :
if ORF == 'NA' :
ORF = re.search(r"(mmu:)(?P<ug>\S+)(;)", i)
if ORF is not None:
ORF = ORF.group('ug')
ORF = 'mmu:' + str(ORF)
else :
ORF = 'NA'
if organism == "rattus+norvegicus" :
if ORF == 'NA' :
ORF = re.search(r"(rno:)(?P<ug>\S+)(;)", i)
if ORF is not None:
ORF = ORF.group('ug')
ORF = 'rno:' + str(ORF)
else :
ORF = 'NA'
#if organism == "escherichia+coli" :
# if ORF == 'NA' :
# ORF = re.search(r"(eco:)(?P<ug>\S+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'eco:' + str(ORF)
# else :
# ORF = 'NA'
#if organism == "saccharomyces+cerevisiae" :
# if ORF == 'NA' :
# ORF = re.search(r"(sce:)(?P<ug>\S+)(;)", i)
# if ORF is not None:
# ORF = ORF.group('ug')
# ORF = 'sce:' + str(ORF)
# else :
# ORF = 'NA'
# Recuperationdu nom du gene, des synonymes et de l'identifiant du gene
if i[0:2] == "GN" :
# Recuperation des synonymes
if test == 1 :
ligneSyn = i.split(";")
n = len(i)
listeLigne = ligneSyn[0].strip("GN ")
laLigne = listeLigne.replace(" ", "")
listeSyn = laLigne.split(",")
Supprimer = []
for k in range(0, len(listeSyn)) : # enlever {...}
sup = re.search(r"({)(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup) + "}"
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"({)(?P<supp>\S+)$", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup)
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"^(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
Supprimer.append(k)
for s in Supprimer :
del listeSyn[s]
m = len(listeSyn)
if i[n-1] == ";" and m > 0 : # tester si c'est la derniere ligne
test = 0
listeSyn[m - 1] = listeSyn[m - 1].replace(";", "")
elif m == 0 :
test = 0
listeSyn = []
else :
del listeSyn[m - 1]
if len(listeSyn) > 0 :
ajout = ";".join(listeSyn)
Syn = Syn + ajout + ";"
SYN = re.search(r"(Synonyms=)", i)
if SYN is not None :
listeLigne = i.split(";")
if len(listeLigne) == 2 : # la ligne ne contient que les synonymes, et la liste est complete sur une seule ligne
listeLigne[0] = listeLigne[0].strip("GN Synonyms=")
laLigne = listeLigne[0].replace(" ", "")
listeSyn = laLigne.split(",")
Syn = ''
Supprimer = []
for k in range(0, len(listeSyn)) : # enlever {...}
sup = re.search(r"({)(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup) + "}"
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"({)(?P<supp>\S+)$", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup)
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"^(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
Supprimer.append(k)
for s in Supprimer :
del listeSyn[s]
Syn = ";".join(listeSyn)
Syn = Syn + ";"
elif len(listeLigne) == 1 : # la ligne ne contient que les synonymes, et la liste ne se termine pas a la premiere ligne, elle est donc sur au moins deux lignes
test = 1
listeLigne[0] = listeLigne[0].strip("GN Synonyms=")
laLigne = listeLigne[0].replace(" ", "")
listeSyn = laLigne.split(",")
Syn = ''
Supprimer = []
for k in range(0, len(listeSyn)) : # enlever {...}
sup = re.search(r"({)(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup) + "}"
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"({)(?P<supp>\S+)$", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup)
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"^(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
Supprimer.append(k)
for s in Supprimer :
del listeSyn[s]
del listeSyn[-1]
Syn = ";".join(listeSyn)
Syn = Syn + ";"
else : # la ligne contient diverses informations en plus des synonymes, il faut donc retrouver le fragment concernant les synonymes
for j in range(0, len(listeLigne)) :
SYN = re.search(r"(Synonyms=)", listeLigne[j])
if SYN is not None :
if listeLigne[-1] == "" : # la liste se termine sur cette ligne
if j == 0 :
listeLigne[j] = listeLigne[j].strip("GN Synonyms=")
else :
listeLigne[j] = listeLigne[j].strip(" Synonyms=")
laLigne = listeLigne[j].replace(" ", "")
listeSyn = laLigne.split(",")
Syn = ''
Supprimer = []
for k in range(0, len(listeSyn)) : # enlever {...}
sup = re.search(r"({)(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup) + "}"
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"({)(?P<supp>\S+)$", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup)
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"^(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
Supprimer.append(k)
for s in Supprimer :
del listeSyn[s]
Syn = ";".join(listeSyn)
Syn = Syn + ";"
else : # la liste est sur plusieurs lignes
test = 1
listeLigne[j] = listeLigne[j].strip(" Synonyms=")
laLigne = listeLigne[j].replace(" ", "")
listeSyn = laLigne.split(",")
Syn = ''
Supprimer = []
for k in range(0, len(listeSyn)) : # enlever {...}
sup = re.search(r"({)(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup) + "}"
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"({)(?P<supp>\S+)$", listeSyn[k])
if sup is not None :
sup = sup.group('supp')
sup = "{" + str(sup)
listeSyn[k] = listeSyn[k].replace(sup, "")
sup = re.search(r"^(?P<supp>\S+)(})", listeSyn[k])
if sup is not None :
Supprimer.append(k)
for s in Supprimer :
del listeSyn[s]
del listeSyn[-1]
Syn = ";".join(listeSyn)
Syn = Syn + ";"
else:
SYN = 'NA'
else :
SYN = 'NA'
# Recuperation de l'identifiant du gene : Scerevisiae + Ecoli
if organism == "saccharomyces+cerevisiae" or organism == "escherichia+coli" :
if ORF == "NA" :
ligneORF = i.split(";")
for k in ligneORF :
k = k.replace(" ","")
orfs = re.search(r"(.)*(OrderedLocusNames=)(?P<orf>\S+)$", k)
if orfs is not None:
ORF = orfs.group('orf')
ORF = str(ORF)
sup = re.search(r"({)(?P<SUP>\S+)(})", ORF)
if sup is not None :
sup = sup.group('SUP')
sup = '{' + sup + '}'
ORF = ORF.replace(sup, "")
else :
orfs = 'NA'
if organism == "arabidopsis+thaliana" :
if ORF == "NA" :
ligneORF = i.split(";")
for k in ligneORF :
k = k.replace(" ","")
k = k.split("{")[0]
orfs = re.search(r"(.)*(OrderedLocusNames=)(?P<orf>\S+)$", k)
if orfs is not None:
ORF = orfs.group('orf')
ORF = str(ORF)
ORF = ORF.upper()
sup = re.search(r"({)(?P<SUP>\S+)(})", ORF)
if sup is not None :
sup = sup.group('SUP')
sup = '{' + sup + '}'
ORF = ORF.replace(sup, "")
else :
orfs = 'NA'
# Recuperation du nom du gene
if Name == "NA" :
ligneName = i.split(";")
for k in ligneName :
k = k.replace(" ", "")
NAME = re.search(r"(Name=)(?P<id>\S+)$", k)
if NAME is not None:
Name = NAME.group('id')
Name = str(Name)
sup = re.search(r"({)(?P<SUP>\S+)$", Name)
if sup is not None :
sup = sup.group('SUP')
sup = '{' + sup
Name = Name.replace(sup, "")
else :
NAME = 'NA'
# A la fin de chaque bloc (chaque proteine) les donnees sont rassemblees sur une meme ligne
elif i[0:2] == "//" :
if Name == 'NA' :