Skip to content
Snippets Groups Projects
Commit b3554d8f authored by UMEC Mathieu's avatar UMEC Mathieu
Browse files

stable version before modification of metabolite tracking and increase in input data

parent cc9d1854
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,7 @@ import pandas as pd
from bs4 import BeautifulSoup
sys.path.append('..')
from utils import column_recovery
from bioservices import ChEBI
def chebi_horizontal(file, outfile_ful_name, n=0, sep=";", flow=None):
......@@ -464,6 +465,71 @@ def chebi_in_outgouing(file, n, outfill_ful_name, sep=";"):
print(time_of_running)
def chebi_ontgoing_is_a(chebi_list):
    """
    Take a list of ChEBI IDs and return the ID and name of the outgoing
    ontology parents linked by an "is a" relation.

    Arg:
        - chebi_list : list of ChEBI IDs
          e.g. ["CHEBI:15377", "CHEBI:15378"]

    Returns:
        1 dictionary mapping each input ID to a list of
        [parent name, parent ID] pairs (empty list when nothing is found)
    """
    chebi_base = ChEBI()
    dic_return = {}
    for chebi in chebi_list:
        dic_return[chebi] = []
        ontology_parents = chebi_base.getOntologyParents(chebi)
        # getOntologyParents may return None/empty for an unknown ID:
        # guard before subscripting (same pattern as get_ontology_parents)
        if not ontology_parents:
            continue
        for elem in ontology_parents["ListElement"]:
            if elem["type"] == "is a":
                dic_return[chebi].append([elem["chebiName"], elem["chebiId"]])
    return dic_return
def get_ontology_parents(chebi_id):
    """
    Return the relation type and name of the first ChEBI ontology parent.

    Arg:
        - chebi_id : one ChEBI ID, e.g. "CHEBI:15377"

    Returns:
        [relation type, parent name], or None when no parent is found
    """
    parents = ChEBI().getOntologyParents(chebi_id)
    if not parents:
        return None
    first = parents["ListElement"][0]
    return [first["type"], first["chebiName"]]
def get_ontology_links(chebi_id, link):
    """
    Return the ID and name of the first ChEBI ontology child reached
    through the given ontology relation.

    Arg:
        - chebi_id : one ChEBI ID, e.g. "CHEBI:15377"
        - link : the ontology relation to search

    Returns:
        [child ID, child name], or None when no child is found
    """
    children = ChEBI().getAllOntologyChildrenInPath(chebi_id, link)
    if not children:
        return None
    first = children["ListElement"][0]
    return [first["chebiId"], first["chebiAsciiName"]]
"""
def back_to_chebi_ontology(chebi_use, not_to_exceed_level):
url_chebi = "https://www.ebi.ac.uk/chebi/chebiOntology.do?chebiId="
url_chebi += chebi_use + "&treeView=true#vizualisation"
......@@ -490,7 +556,7 @@ def back_to_chebi_ontology(chebi_use, not_to_exceed_level):
ont_lev_che = clean_split_tree.index(name_cheb_treat.lower())
print(ont_lev_che)
print(not_to_exceed_level)
"""
#
for name_kegg_tf in names_k:
if name_kegg_tf.lower() in clean_split_tree:
name_kegg_in_ontology.append(name_kegg_tf)
......@@ -520,7 +586,9 @@ def back_to_chebi_ontology(chebi_use, not_to_exceed_level):
ontology_level_diff.append("NA")
name_kegg_in_ontology.append("None")
"""
"""
if __name__ == "__main__":
"""
FOLDER = "C:\\Users\\mumec\\Desktop\\test_all_functions\\"
......@@ -528,5 +596,7 @@ if __name__ == "__main__":
FI = FOLDER + "data_test.csv"
FI_OUT = FOLDER + "result_ch.xlsx"
chebi_in_outgouing(FI, 0, FI_OUT, sep=";")
back_to_chebi_ontology("CHEBI:17964", 10)
"""
back_to_chebi_ontology("CHEBI:17964", 10)
\ No newline at end of file
chebis_list = ["CHEBI:15377", "CHEBI:15378"]
print(chebi_ontgoing_is_a(chebis_list))
\ No newline at end of file
......@@ -4,10 +4,15 @@ This module is designed to draw network from CPDB mapping using Cytoscape
import sys
import json
import pandas as pd
import requests
import urllib
from utils import excel_file_writer, column_recovery
from bs4 import BeautifulSoup
from bioservices import KEGG, ChEBI
sys.path.append('..')
from mapping_using_api import send_request_to_mapping_api
LOCAL = "C:\\Users\\mumec\\Desktop\\test_all_functions\conversion_metabolites_id\\"
from time import time
LOCAL = "C:\\Users\\mumec\\Desktop\\Mini_codes\\amelioration_conversion_consensus\\"
def chemical_props_ramp_api(metabolites_list, out_folder):
......@@ -111,8 +116,215 @@ def equiv_from_ma_api(metabolites_list, inputype, out_folder):
return api_datas
def id_conv_opti(out_folder, cts_data=None, ma_data=None,
ramp_data=None, mtx_data=None, start_type_id="chebi"):
def kegg_common_names(l_kegg_id):
    """
    Take a list of KEGG IDs and return the associated names.

    Arg:
        - l_kegg_id : list of KEGG IDs, with or without the "kegg:" prefix
          e.g. ["kegg:C00015", "C00016"]

    Returns:
        1 dictionary mapping each ID (except "NA") to the list of names
        found between the NAME and FORMULA fields of the KEGG record
    """
    kegg_base = KEGG()  # Initialisation of KEGG service
    dic_back = {}
    for kegg_id in l_kegg_id:
        if kegg_id != "NA":
            dic_back[kegg_id] = []
            if kegg_id[:4] != "kegg":
                kegg_inf = kegg_base.get("kegg:" + kegg_id)
            else:
                kegg_inf = kegg_base.get(kegg_id)
            # bioservices KEGG.get returns an int HTTP error code (e.g. 404)
            # on failure; only parse a real text answer
            if kegg_inf and isinstance(kegg_inf, str):
                lines = kegg_inf.split('\n')
                for line in lines:
                    # once NAME was seen, keep the continuation lines
                    # until the FORMULA field starts
                    if len(dic_back[kegg_id]) != 0 and not line.startswith("FORMULA"):
                        dic_back[kegg_id].append(line.strip())
                    if line.startswith("NAME"):
                        dic_back[kegg_id].append(line.replace("NAME", "").strip())
                    elif line.startswith("FORMULA"):
                        break
    return dic_back
def chebi_common_names(l_chebi_id):
    """
    Take a list of ChEBI IDs and return the associated names.

    Arg:
        - l_chebi_id : list of ChEBI IDs, with or without a prefix
          e.g. ["CHEBI:27252", "27732", "chebi:27832"]

    Returns:
        1 dictionary mapping each ID to the list of names found
        (empty list when the service returns nothing)
    """
    chebi_base = ChEBI()
    dic_back = {}
    for chebi_id in l_chebi_id:
        dic_back[chebi_id] = []
        if chebi_id.upper()[:5] != "CHEBI":
            chebi_inf = chebi_base.getLiteEntity("CHEBI:" + chebi_id)
        else:
            chebi_inf = chebi_base.getLiteEntity(chebi_id.upper())
        # getLiteEntity can return None when no entity matches: guard the loop
        if chebi_inf:
            for name in chebi_inf:
                dic_back[chebi_id].append(name['chebiAsciiName'])
    return dic_back
def inchikey_common_names(l_inchikey_id):
    """
    Take a list of InChIKey IDs and return the IUPAC names found on PubChem.

    Arg:
        - l_inchikey_id : list of InChIKey IDs
          e.g. ["BSYNRYMUTXBXSQ-UHFFFAOYSA-N", "BSYNRYMUTXBXSQ-UHFCFAOYSA-N"]

    Returns:
        1 dictionary mapping each ID to the first name found,
        or "No result" when nothing is found
    """
    dic_back = {}
    for inchikey in l_inchikey_id:
        dic_back[inchikey] = []
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/json"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            props = data['PC_Compounds'][0]['props']
            # scan every property and keep the first one carrying a string
            # value ('sval'); the previous code always looked at index 6
            # instead of the loop index (see smiles_common_names)
            for posibility in range(len(props)):
                if 'sval' in props[posibility]['value']:
                    dic_back[inchikey] = props[posibility]['value']['sval']
                    break
        if len(dic_back[inchikey]) == 0:
            dic_back[inchikey] = "No result"
    return dic_back
def smiles_common_names(l_smiles):
    """
    Take a list of SMILES strings and return the first name found on PubChem.

    Arg:
        - l_smiles : list of SMILES strings
          e.g. ["C[C@H](CCCC(C)C)[C@H]1CC[C@@H]2..."]

    Returns:
        1 dictionary mapping each SMILES to the first name found,
        or "Query error" when the request fails
    """
    from urllib.parse import quote  # local import: encode SMILES for the URL

    dic_back = {}
    for smiles_id in l_smiles:
        dic_back[smiles_id] = []
        # SMILES may contain characters that are illegal in a URL path
        # ('#', '/', '+', ...): percent-encode the whole identifier
        url = ("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/"
               + quote(smiles_id, safe="") + "/json")
        response = requests.get(url)
        if response.status_code == 200:  # response of URL is fine
            data = response.json()
            for posibility in range(len(data['PC_Compounds'][0]['props'])):
                if 'sval' in data['PC_Compounds'][0]['props'][posibility]['value']:
                    dic_back[smiles_id] = data['PC_Compounds'][0]['props'][posibility]['value']['sval']
                    break
        else:
            dic_back[smiles_id] = "Query error"
    return dic_back
def hmdb_common_names(l_hmdb_id):
    """
    Take a list of HMDB IDs and return one associated name per ID,
    scraped from the hmdb.ca metabocard page.

    Arg:
        - l_hmdb_id : list of HMDB IDs, with or without the "hmdb:" prefix
          e.g. ["HMDB0000532", "hmdb:HMDB0000432"]

    Returns:
        1 dictionary mapping each ID to the name found,
        or "No result found" on any failure
    """
    import urllib.request  # `import urllib` alone does not bind .request

    dic_back = {}
    for hmdb_id in l_hmdb_id:
        # str.strip("hmdb:") would remove matching *characters* from both
        # ends of the ID; remove the prefix explicitly instead
        clean_hmdb_id = hmdb_id.strip()
        if clean_hmdb_id.startswith("hmdb:"):
            clean_hmdb_id = clean_hmdb_id[len("hmdb:"):]
        url = f"https://hmdb.ca/metabolites/{clean_hmdb_id}"
        try:
            soup = BeautifulSoup(urllib.request.urlopen(url).read().decode("utf-8"), "html.parser")
            text = soup.get_text()[:300]
            pos_end = text.index("(" + clean_hmdb_id + ")") - 1
            pos_start = text.index("for") + 4
            # the results is the name contain in the first row of the response :
            # Human Metabolome Database: Showing metabocard for {nam} ({id})
            dic_back[hmdb_id] = text[pos_start:pos_end].strip()
        except Exception:  # network error or unexpected page layout
            dic_back[hmdb_id] = "No result found"
    return dic_back
def identifier_id(l_ids):
    """
    Take a list of IDs and return the detected type of each ID.
    This function only takes into account ChEBI, HMDB, InChIKey,
    KEGG and SMILES.

    Arg:
        - l_ids : list of IDs of mixed types
          ["ODHCTXKNWHHXJC-VKHMYHEASA-N", "chebi:58402", "58402",
           "OC(=O)[C@@H]1CCC(=O)N1", "NA", "C00062", "kegg:C00062",
           "HMDB0001565", "hmdb:HMDB0001565"]

    Returns:
        1 dictionary mapping each ID to its type
        ("HMDB", "KEGG", "ChEBI", "InChIKey", "SMILES" or "NA")
    """
    dic_id_type = {}
    for id_now in l_ids:
        if id_now[:4] in ["HMDB", "hmdb"]:
            dic_id_type[id_now] = "HMDB"
        elif id_now[:4].lower() == "kegg" or (id_now[0] in ["C", "D", "K"]
                                              and 4 < len(id_now) < 8):
            # the length check keeps long SMILES that start with C/D/K
            # (e.g. "C[C@H](O)...") from being mistaken for KEGG IDs;
            # same rule as in common_names_of_id
            dic_id_type[id_now] = "KEGG"
        elif id_now[:5].lower() == "chebi" or 4 < len(id_now) < 8:
            dic_id_type[id_now] = "ChEBI"
        elif len(id_now) == 27 and id_now.count("-") == 2:
            # an InChIKey is always 27 characters in 3 dash-separated parts
            dic_id_type[id_now] = "InChIKey"
        elif len(id_now) > 2:
            dic_id_type[id_now] = "SMILES"
        else:
            dic_id_type[id_now] = "NA"
    return dic_id_type
def common_names_of_id(ids_list):
    """
    Take a list of IDs and return the name associated with each ID.
    The resolver is chosen from the first recognisable ID of the list and
    is then applied to the WHOLE list, so the list is expected to hold a
    single ID type. This function only takes into account ChEBI, HMDB,
    InChIKey, KEGG and SMILES.

    Arg:
        - ids_list : list of IDs
          ["HMDB0001565", "hmdb:HMDB0001565"]

    Returns:
        1 dictionary of names per ID, or None when no ID could be typed
    """
    results = None  # stays None when no ID of the list is recognisable
    for id_now in ids_list:
        if id_now[:4] in ["HMDB", "hmdb"]:
            results = hmdb_common_names(ids_list)
            break
        elif id_now[:4] == "kegg" or (id_now[0] in ["C", "D", "K"]
                                      and 4 < len(id_now) < 8):
            results = kegg_common_names(ids_list)
            break
        elif id_now[:5] == "chebi" or 4 < len(id_now) < 8:
            results = chebi_common_names(ids_list)
            break
        elif len(id_now) == 27 and id_now.count("-") == 2:
            results = inchikey_common_names(ids_list)
            break
        elif len(id_now) > 2:
            results = smiles_common_names(ids_list)
            break
        else:
            # unusable ID (e.g. "NA"): try the next one
            print("first ID of list is NA")
    return results
def id_conv_opti(out_folder, cts_data=None, ma_data=None, ramp_data=None,
mtx_data=None, start_type_id="chebi", return_names="True"):
"""
Take csv file of id convert and give the best consensus conversion
careful you neeed the exact same number of line and
......@@ -128,11 +340,15 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
Returns:
1 excel file with the best conversion
"""
# Create a list for each ID type to be compared
t1 = time()
hmdb_comp = []
kegg_comp = []
chebi_comp = []
inchikey_comp = []
smiles_comp = []
# For each conversion file we divide the results into lists
if mtx_data is not None:
inchikey_mtx = column_recovery(mtx_data, 4)
chebi_mtx = column_recovery(mtx_data, 2) # pas sûr
......@@ -143,7 +359,7 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
l_input = column_recovery(mtx_data, 0)
if cts_data is not None:
# position dépendant des données demander à améliorer
# For now this code can only be use for this exact repartition on CTS
kegg_cts = column_recovery(cts_data, 6)
chebi_cts = column_recovery(cts_data, 1)
hmdb_cts = column_recovery(cts_data, 3)
......@@ -157,6 +373,7 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
hmdb_comp.append(hmdb_cts)
chebi_comp.append(chebi_cts)
inchikey_comp.append(inchikey_cts)
# In the case of we havn't data from MetanetX we use CTS as data marker
if mtx_data is None:
l_input = []
cts_input = column_recovery(cts_data, 0)
......@@ -177,6 +394,8 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
hmdb_comp.append(hmdb_ma)
chebi_comp.append(chebi_ma)
smiles_comp.append(smiles_ma)
# In the case of we havn't data from MetanetX and CTS we use
# MetaboAnalyst as data marker
if mtx_data is None and cts_data is None:
ma_input = column_recovery(ma_data, 0)
l_input = []
......@@ -201,51 +420,75 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
smiles_ramp[smiles_ramp.index([])] = "NA"
inchikey_comp.append(inchikey_ramp)
smiles_comp.append(smiles_ramp)
dic_order = {2: ["metaNetx", "CTS", "MetaBoAnalyst"],
3: ["metaNetx", "CTS", "RaMP"],
4: ["metaNetx", "MetaboAnalyst", "RaMP"]}
end_results = [l_input]
end_results = [l_input] # Create table will be return at the end
all_inf_bases = [hmdb_comp, kegg_comp, chebi_comp,
inchikey_comp, smiles_comp]
for n_cle, inf_bases in enumerate(all_inf_bases):
print("It's' the turn of the " + str(n_cle + 1) + " ID type" )
# If we have more than 1 value for this ID we compare
if len(inf_bases) in [2, 3]:
if inf_bases[0][0] == "reference":
type_same = ["chebi identique"]
if return_names is True:
names = ["Noms associés aux ID chebi"]
type_to_check = ["chebi a verifier"]
else:
type_same = [inf_bases[0][0] + " identique"]
if return_names is True:
names = ["Noms associés aux ID " + inf_bases[0][0]]
type_to_check = [inf_bases[0][0] + " a verifier"]
if len(inf_bases) == 2:
for itn, typenow in enumerate(inf_bases[0][1:]):
# We treat all type of ID
for itn, typenow in enumerate(inf_bases[0][1:]): ######################################### 3
if typenow.strip() == inf_bases[1][itn+1].strip():
type_same.append(typenow)
if return_names is True:
names.append(common_names_of_id([typenow]))
type_to_check.append("NA")
else:
type_same.append("NA")
if return_names is True:
names.append(common_names_of_id([typenow, inf_bases[1][itn+1]]))
type_to_check.append([typenow, inf_bases[1][itn+1]])
if len(inf_bases) == 3:
# We treat not the HMDB and KEGG
convert_ng = ["different tool"]
for itn, typenow in enumerate(inf_bases[0][1:]):
for itn, typenow in enumerate(inf_bases[0][1:]): ############################################ 3
if typenow.strip() == inf_bases[1][itn+1].strip() == inf_bases[2][itn+1].strip():
type_same.append(typenow)
if return_names is True:
names.append(common_names_of_id([typenow]))
type_to_check.append("NA")
convert_ng.append("all same")
elif typenow.strip() == inf_bases[1][itn+1].strip():
type_same.append(typenow)
if return_names is True:
names.append(common_names_of_id([typenow, inf_bases[2][itn+1]]))
type_to_check.append(inf_bases[2][itn+1])
convert_ng.append(dic_order[n_cle][2])
elif inf_bases[2][itn+1].strip() == inf_bases[1][itn+1].strip():
type_same.append(inf_bases[2][itn+1])
if return_names is True:
names.append(common_names_of_id([inf_bases[2][itn+1], typenow]))
type_to_check.append(typenow)
convert_ng.append(dic_order[n_cle][0])
elif typenow.strip() == inf_bases[2][itn+1].strip():
type_same.append(typenow)
if return_names is True:
names.append(common_names_of_id([typenow, inf_bases[1][itn+1]]))
type_to_check.append(inf_bases[1][itn+1])
convert_ng.append(dic_order[n_cle][1])
else:
type_same.append("NA")
if return_names is True:
names.append(common_names_of_id([typenow, inf_bases[1][itn+1],
inf_bases[2][itn+1]]))
type_to_check.append([typenow, inf_bases[1][itn+1],
inf_bases[2][itn+1]])
convert_ng.append("all different")
......@@ -254,18 +497,31 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
end_results.append(type_to_check)
if len(inf_bases) == 3:
end_results.append(convert_ng)
if return_names is True:
end_results.append(names)
out_file = out_folder + "best_conversion.xlsx"
conversion_df = pd.DataFrame(data=end_results).transpose()
excel_file_writer(conversion_df, out_file, sheetname="reume")
excel_file_writer(conversion_df, out_file, sheetname="results")
print(time() - t1)
return "comparaison résultats de comparaison OK"
if __name__ == "__main__":
    # NOTE(review): the two blocks of path assignments below look like both
    # sides of a diff (FOLDER-based test data, then LOCAL-based data): as
    # written the second set overwrites the first and id_conv_opti runs
    # twice with different inputs -- confirm which configuration is intended.
    FOLDER = LOCAL + "ico\\"
    CTS_PATH = FOLDER + "cts_data_test.csv"
    MA_PATH = FOLDER + "MA_data_test.csv"
    RAMP_PATH = FOLDER + "RaMP_data_test.csv"
    MTX_PATH = FOLDER + "metanetx_data_test.csv"
    id_conv_opti(FOLDER, cts_data=CTS_PATH, ma_data=MA_PATH,
                 ramp_data=RAMP_PATH, mtx_data=MTX_PATH)
    CTS_PATH = LOCAL + "CTS_L100_chebi_rev_17-07-2024.csv"
    MA_PATH = LOCAL + "MetaboAnalyst_L100_chebi_rev_17-07-2024.csv"
    RAMP_PATH = LOCAL + "RaMP_L100_chebi_rev_17-07-2024.csv"
    MTX_PATH = LOCAL + "MetaNetx_L100_chebi_rev_17-07-2024.csv"
    # NOTE(review): return_names is compared with `is True` inside
    # id_conv_opti, so the string "False" (and "True") both disable the
    # name lookup -- consider passing a real bool.
    id_conv_opti(LOCAL, cts_data=CTS_PATH, ma_data=MA_PATH,
                 ramp_data=RAMP_PATH, mtx_data=MTX_PATH, return_names="False")
"""
all_inf_bases = [hmdb_comp, kegg_comp, chebi_comp,
inchikey_comp, smiles_comp]
function_names = [hmdb_common_names, ]
identifier_id(["ODHCTXKNWHHXJC-VKHMYHEASA-N", "chebi:58402", "58402",
"OC(=O)[C@@H]1CCC(=O)N1", "NA", "C00062", "kegg:C00062",
"HMDB0001565", "hmdb:HMDB0001565"])
"""
#print(common_names_of_id(["HMDB0001565", "hmdb:HMDB0001565"]).values())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment