begin kegg and chebi ontology work

453f00c9 · UMEC Mathieu · ecfd9f4f · 453f00c9 · 453f00c9 · 453f00c9
Commit 453f00c9 authored 10 months ago by UMEC Mathieu
--- a/src/processing_mapping_results/chebi_id/recovery_of_associated_chebi_id.py
+++ b/src/processing_mapping_results/chebi_id/recovery_of_associated_chebi_id.py
@@ -6,8 +6,8 @@ import time
 import sys
 import pandas as pd
 from bs4 import BeautifulSoup
-sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\traitement_des_données')
-from utils import column_recovery
+sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\traitement_des_données\\src\\processing_mapping_results')
+from utils import column_recovery, excel_file_writer


 def chebi_horizontal(file, outfile_ful_name,  n=0, sep=";", flow=None):
@@ -462,11 +462,88 @@ def chebi_in_outgouing(file, n, outfill_ful_name, sep=";"):
    print(time_of_running)


+def _kegg_names_from_text(page_text):
+    """
+    Extract the compound names from the plain text of a KEGG entry page.
+
+    The names are read from the text located between the first "Name"
+    marker and the first "Formula" marker, and are separated by ";".
+
+    Arg:
+        page_text = text of the KEGG entry page (BeautifulSoup get_text())
+
+    Returns:
+        list of names without surrounding whitespace, or ["NA"] when the
+        markers are missing or no name is found
+    """
+    i_begin = page_text.find("Name")
+    i_end = page_text.find("Formula")
+    # find() returns -1 when a marker is absent; slicing with it would
+    # silently return unrelated text.
+    if i_begin == -1 or i_end == -1 or i_end < i_begin + 5:
+        return ["NA"]
+    # +5 skips the "Name" label itself and the character right after it.
+    name_block = page_text[i_begin + 5: i_end]
+    names = [part.strip() for part in name_block.split(";")]
+    names = [name for name in names if name]
+    return names if names else ["NA"]
+
+
+def _kegg_chebi_from_text(page_text):
+    """
+    Extract the ChEBI ID(s) listed in the plain text of a KEGG entry page.
+
+    The IDs are read from the text that follows the first "ChEBI" marker
+    and stops at "LIPIDMAPS" if present, otherwise at "KCF data".
+
+    Arg:
+        page_text = text of the KEGG entry page (BeautifulSoup get_text())
+
+    Returns:
+        "NA" when the page has no "ChEBI" marker, a string when the field
+        contains no blank (single ID), else the list of blank-separated IDs
+    """
+    i_chebi = page_text.find("ChEBI")
+    if i_chebi == -1:
+        return "NA"
+    i_stop = page_text.find("LIPIDMAPS")
+    if i_stop == -1:
+        i_stop = page_text.find("KCF data")
+    if i_stop == -1:
+        # No known end marker: read up to the end of the page text.
+        i_stop = len(page_text)
+    # +7 skips the marker and its separator, i.e. len("ChEBI: ").
+    chebi_aso = page_text[i_chebi + 7: i_stop].strip()
+    # The choice between one ID and several must depend on the blanks of
+    # the ChEBI field itself, not on the ";" separators of the names.
+    if " " not in chebi_aso:
+        return chebi_aso
+    return [part.strip() for part in chebi_aso.split(" ") if part.strip()]
+
+
+def ontologie_gap_kegg_chebi(csv_file, out_folder):
+    """
+    For each KEGG ID whose associated ChEBI ID is "NA", fetch the KEGG
+    entry page and recover the KEGG names and the ChEBI ID(s) listed on it,
+    then write a summary Excel file.
+
+    Arg:
+        csv_file = csv file read with column_recovery: column 0 is used as
+                   KEGG IDs and column 1 as ChEBI IDs
+        out_folder = folder path (string ending with a separator) in which
+                     "KEGG_and_ChEBI_dif_ontologie_test.xlsx" is written
+
+    Returns:
+        None, the result is written with excel_file_writer
+    """
+    l_kegg = column_recovery(csv_file, 0)
+    l_cheb = column_recovery(csv_file, 1)
+    # The first element of every output list is its column header.
+    name_kegg = ["associated name of KEGG ID"]
+    name_chebi = ["associated name of ChEBI ID"]
+    ontologie_level_chebi = ["associated ontologie level of ChEBI ID"]
+    ontologie_level_equiv_kegg = ["positional difference betwen kegg et chebi ID"]
+    chebi_on_kegg = ["ID chebi from KEGG"]
+    for ac_kegg, ac_cheb in zip(l_kegg, l_cheb):
+        accheb = ac_cheb.strip().upper()
+        ackegg = ac_kegg.strip().upper()
+        if accheb != "NA":
+            # NOTE(review): nothing is appended for these rows, so the
+            # output columns only stay aligned with l_kegg when every data
+            # row has "NA" as ChEBI ID - confirm against the input file.
+            continue
+        url_kegg = "https://www.genome.jp/entry/" + ackegg
+        print(url_kegg)
+        # The context manager closes the HTTP response after reading.
+        with urllib.request.urlopen(url_kegg) as response:
+            html_k = response.read().decode("utf-8")
+        textk = BeautifulSoup(html_k, "html.parser").get_text()
+        name_kegg.append(_kegg_names_from_text(textk))
+        chebi_aso = _kegg_chebi_from_text(textk)
+        if chebi_aso != "NA":
+            print(chebi_aso)
+        chebi_on_kegg.append(chebi_aso)
+        name_chebi.append("NA")
+        ontologie_level_chebi.append("NA")
+        ontologie_level_equiv_kegg.append("NA")
+    print(chebi_on_kegg)
+    out_df = pd.DataFrame([l_kegg, name_kegg, chebi_on_kegg, l_cheb, name_chebi,
+                           ontologie_level_chebi, ontologie_level_equiv_kegg]).transpose()
+    outf_name = out_folder + "KEGG_and_ChEBI_dif_ontologie_test.xlsx"
+    excel_file_writer(out_df, outf_name, sheetname="resume_ontology")
+    # TODO: handle the two remaining kinds of rows (draft kept from the
+    # original work in progress):
+    #   - KEGG ID is "NA": append "NA" as KEGG name, keep the ChEBI ID and
+    #     "NA" for both ontology columns (decide whether the ontology level
+    #     should be output);
+    #   - both IDs known: fetch
+    #     "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=" + ChEBI ID and
+    #     read the name located between "ChEBI Name" and "ChEBI ID" in the
+    #     page text (searched within the 300 characters after "ChEBI Name").
+
 if __name__ == "__main__":
-    LOCAL = "C:\\Users\\mumec\\Desktop\\fichier_mis_en_forme_programme_total\\recovery_asso_chebi"
-    OUTF1 = LOCAL + "outgoing_test.xlsx"
-    OUTF2 = LOCAL + "horizontal_test.xlsx"
-    INF = LOCAL + "l_chebi_to_change_oeil_for_test.csv"
-    COL = [1, 3]
-    chebi_in_outgouing(INF, 0, OUTF1)
-    print(chebi_horizontal(INF, OUTF2))
+    LOCAL = "C:\\Users\\mumec\\Desktop\\Mini_codes\\ontologie_kegg_chebi\\"
+    infile = LOCAL  + "ontologie_gap_chebi_na.csv"
+    ontologie_gap_kegg_chebi(infile, LOCAL)
\ No newline at end of file
--- a/src/processing_mapping_results/conversion_metabolites_id.py
+++ b/src/processing_mapping_results/conversion_metabolites_id.py
@@ -16,7 +16,7 @@ import urlopen
 #from metanetx_sdk import et1_table
 #import rpy2.robjects as robjects

-LOCAL = "C:\\Users\\mumec\\Desktop\\conversion_ids\\Liste_test_chebi\\"
+LOCAL = "C:\\Users\\mumec\\Desktop\\conversion_ids\\c18\\"

 def send_request_to_mapping_api(url, data_json, head, met='POST'):
    """
@@ -152,6 +152,7 @@ def id_conv_opti(out_folder, cts_data=None, ma_data=None,
                 ramp_data=None, mtx_data=None, start_type_id="chebi"):
    """
    Take csv file of id convert and give the best consensus conversion
+    Careful: all input data files must have exactly the same number of lines, and their rows must correspond to one another

    cts_data = csv file of CTS conversion site data
    ma_data = csv file of MeaboAnalyst conversion  site data
@@ -308,7 +309,7 @@ if __name__ == "__main__":
                    "Cholic acid", "Citric acid", "Deoxycholic acid"]
    intyp = 'name'
    print(equiv_from_ma_api(meta_test_ma, intyp, out_fold))
-    """
+
    cts_path = LOCAL + "L100_chebi_cts_rev_23-04-2024.csv"
    ma_path = LOCAL + "L100_chebi_metaboanalyst_rev_23-04-2024.csv"
    ramp_path = LOCAL + "L100_chebi_ramp_rev_23-04-2024.csv"
@@ -317,8 +318,7 @@ if __name__ == "__main__":
    id_conv_opti(out_fold, cts_data=cts_path, ma_data=ma_path,
                 ramp_data=ramp_path, mtx_data=mtx_path)
    """
-    cts_path = LOCAL + "L100_inchikey_cts_rev_23-04-2024.csv"
-    mtx_path = LOCAL + "L100_inchikey_metanetx_rev_23-04-2024.csv"
+    cts_path = LOCAL + "CTS_c18_pos_rev_29-04-2024.csv"
+    ma_path = LOCAL + "MA_c18_pos_rev_29-04-2024.csv"

-    id_conv_opti(out_fold, cts_data=cts_path, mtx_data=mtx_path)
-    """
\ No newline at end of file
+    id_conv_opti(out_fold, cts_data=cts_path, ma_data=ma_path)
\ No newline at end of file
--- a/src/processing_mapping_results/main.py
+++ b/src/processing_mapping_results/main.py
@@ -304,13 +304,13 @@ def workflow(infile, out_folder):
        val_matching = [0, 0, 0]
        for id_map in l_id_map[1:]:
            if id_map == "NA":
-                all_verif[pos_l_id_map].append("Not matched")
+                all_verif[pos_l_id_map].append("not matched")
                val_matching[2] += 1
            elif id_map in exa_id:
-                all_verif[pos_l_id_map].append("Exact matching")
+                all_verif[pos_l_id_map].append("Exact match")
                val_matching[0] += 1
            else:
-                all_verif[pos_l_id_map].append("Non-exact matching")
+                all_verif[pos_l_id_map].append("Non-exact match")
                val_matching[1] += 1
        name_sav_vis = vis_dir + "couverture_mapping " + title + ".png"
        cam_vis(l_matching, val_matching, title, sav=name_sav_vis)

--- a/src/processing_mapping_results/visu_datas_mapping.py
+++ b/src/processing_mapping_results/visu_datas_mapping.py
@@ -104,7 +104,7 @@ def view_recovery(file, title_plot="Recouvrement moyen",
    return fig


-def color_pie(data, labels, colors, explode=None, fsize=(10, 10), pourcent=True):
+def color_pie(data, colors, labels=None, explode=None, fsize=(10, 10), pourcent=True):
    """
    plot a clor pie

@@ -125,16 +125,25 @@ def color_pie(data, labels, colors, explode=None, fsize=(10, 10), pourcent=True)
    for posdata, xdata in enumerate(data):
        if xdata != 0:
            data_clean.append(xdata)
-            labels_clean.append(labels[posdata])
+            if labels is not None:
+                labels_clean.append(labels[posdata])
            explode_clean.append(explode[posdata])
            colors_clean.append(colors[posdata])
    plt.figure(figsize=fsize)
    if pourcent is True:
-        plt.pie(data_clean, labels=labels_clean, pctdistance=0.8, explode=explode_clean,
-                colors=colors_clean, autopct=lambda x: str(round(x, 1)) + '%', textprops={'fontsize': 14})
+        if labels is not None:
+            plt.pie(data_clean, labels=labels_clean, pctdistance=0.9, explode=explode_clean,
+                    colors=colors_clean, autopct=lambda x: str(round(x, 1)) + '%', textprops={'fontsize': 16})
+        else:
+            plt.pie(data_clean,  pctdistance=0.80, explode=explode_clean, colors=colors_clean,
+                    autopct=lambda x: str(round(x, 1)) + '%', textprops={'fontsize': 18})
    else:
-        plt.pie(data_clean, labels=labels_clean, pctdistance=0.8, explode=explode_clean,
-                colors=colors_clean, autopct='%d', textprops={'fontsize': 16})
+        if labels is not False:
+            plt.pie(data_clean, labels=labels_clean, pctdistance=0.8, explode=explode_clean,
+                    colors=colors_clean, autopct='%.f', textprops={'fontsize': 16})
+        else:
+            plt.pie(data_clean, pctdistance=0.8, explode=explode_clean,
+                    colors=colors_clean, autopct='%.f', textprops={'fontsize': 16})
    plt.legend(loc=1, fontsize=8)
    plt.show()

@@ -315,6 +324,84 @@ def cam_vis(name, values, title, sav=None):
        plt.savefig(sav)
    plt.show()

+def barplot_temp(column_x, column_y, df_data, title="barplot", figure_size=(15, 30),
+            ax_x_label="voies métaboliques", ax_y_label='Recouvrement moyen',
+            colors='Spectral', decimal='%.1f', y_lim=None):
+    """
+    Draw a horizontal barplot from a dataframe.
+
+    The values of column_y are plotted along the horizontal axis and the
+    categories of column_x along the vertical axis.
+
+    Arg:
+        column_x = name of the df_data column holding the categories
+                   (drawn on the vertical axis)
+        column_y = name of the df_data column holding the values
+                   (drawn on the horizontal axis)
+        df_data = dataframe of data to plot
+        title = title of the plot
+        figure_size = size of the figure given to plt.subplots
+        ax_x_label = label written on the vertical (categories) axis
+        ax_y_label = label written on the horizontal (values) axis
+        colors = palette given to sns.barplot
+        decimal = NOTE(review): currently not used by this function,
+                  the bar labels use the default format - confirm intent
+        y_lim = upper limit of the values axis, None keeps the default
+
+    Returns:
+        the object returned by sns.barplot
+    """
+    _, ax = plt.subplots(1, 1, figsize=figure_size)
+    # Values and categories are swapped on purpose to get horizontal bars.
+    p1 = sns.barplot(x=column_y, y=column_x, data=df_data, palette=colors, ax=ax)
+    p1.set(title=title)
+    plt.xlabel(ax_y_label)
+    plt.ylabel(ax_x_label)
+    if y_lim is not None:
+        plt.xlim(0, y_lim)
+    p1.bar_label(p1.containers[0], fontsize=16)
+    # Wide left margin keeps room for the category names.
+    plt.subplots_adjust(top=0.95, bottom=0.05, right=0.95, left=0.25)
+    p1.tick_params(axis='y', size=0.05, labelsize=16)
+    return p1
+
+def up_down_path_plot_temp(l_path, up, down, log_p):
+    """
+    Plot the regulation of pathways: stacked bars of down/up regulated
+    metabolites (left axis) and a line of -log(pvalue) (right axis).
+
+    Parameters :
+    l_path : list of pathways names (strings), used as x tick labels
+    up : percentages of metabolites with up regulation, one per pathway
+    down : percentages of metabolites with down regulation, one per pathway
+    log_p : values of -log(pvalue) of the pathways, must not be empty
+
+    return : the figure holding the regulation plot
+    """
+    fig, ax1 = plt.subplots(figsize=(22, 14))
+    max_bot = max((len(boto) for boto in l_path), default=0)
+    # The bottom margin grows with the longest label. It never goes below
+    # 0.20 (so medium labels do not get less room than short ones) and
+    # stays under the top value of 0.95.
+    bottom_margin = min(0.80, max(0.20, 0.40 * (max_bot / 100)))
+    plt.subplots_adjust(top=0.95, bottom=bottom_margin)
+    l_bar = 0.8
+    x = range(len(down))
+    ax1.bar(x, down, width=l_bar, label="Down regulated", color="tomato")
+    # Up regulated bars are stacked on top of the down regulated ones.
+    ax1.bar(x, up, width=l_bar, bottom=down,
+            label="Up regulated", color="mediumturquoise")
+    ax1.set_ylabel("% of metabolites modulate", size=16)
+    ax1.set_ylim(top=100)
+    ax2 = ax1.twinx()
+    ax2.plot(l_path, log_p, marker='o', linestyle='-',
+             label="-log(pvalue)", color="mediumpurple")
+    ax2.set_title("Pathways modulation", size=16)
+    # The x-axis of a twin axes is invisible, so the x label has to be set
+    # on the main axes to be drawn.
+    ax1.set_xlabel("Pathways", size=16)
+    ax2.set_ylabel("-log(pvalue)", size=16)
+    ax2.yaxis.set_tick_params(labelsize = 16)
+    ax1.yaxis.set_tick_params(labelsize = 16)
+    ax2.set_ylim(bottom=0, top=max(log_p)+0.5)
+    fig.legend(loc='upper right', bbox_to_anchor=(0.9, 0.95))
+    ax1.set_xticks(l_path)
+    ax1.set_xticklabels(l_path, size=11, rotation=-90, ha='center')
+    return fig
+
+
 if __name__ == "__main__":
    """
    L_PATH = ["pathways1", "pathways2ffffffffffffffffffffffffffffffffffffffffff", "pathways3"]
@@ -336,9 +423,30 @@ if __name__ == "__main__":
    title = "mapping coverage of MetaboAnalyst on KEGG"
    cam_vis(name, values, title)
    """
-    x = [4, 43, 0, 5, 48, 0]
-    y=["Identical", "different", "CTS", "MetaboAnalyst", "MetaNetx", "RaMP"]
+    """
+    x = [28, 24, 48, 1, 1, 1]
+    y=["Identique", "different", "CTS", "MetaboAnalyst", "MetaNetx", "RaMP"]
    exp=[0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
    co=["steelblue", "firebrick", "darkorange", "limegreen", "yellow", "darkviolet"]
    color_pie(x, y, co, explode=exp, fsize=(15, 15), pourcent=False)
+    x = [2671, 363, 49613, 1377]
+    y=["Reactome","KEGG","HMDB","Wikipathways"]
+    exp=[0.02, 0.04,0.04, 0.04]
+    co=["darkorange","firebrick","wheat","steelblue"]
+    color_pie(x, co, labels=False, explode=exp, fsize=(15, 15), pourcent=True)
+
+    l_p = ["Glycine_ serine_ alanine and threonine metabolism", "Valine_ leucine and isoleucine degradation", "Urea cycle and metabolism of arginine_etc", "Aminosugars metabolism", "Vitamin B3 (nicotinate and nicotinamide) metabolis", "Pentose phosphate pathway", "Lysine metabolism", "Methionine and cysteine metabolism", "Galactose metabolism"]
+    upp =[5.681818182, 8.064516129, 3.2, 7.143, 2.857, 8.108, 3.846, 2.564, 2.128]
+    downn = [4.545454545, 0, 1.6, 2.381, 5.714, 0, 1.923, 1.282, 2.128]
+    llog_p = [20.31115619, 10.35604427, 9.295, 9.106, 6.725, 6.557, 5.575, 4.448, 3.402]
+
+    up_down_path_plot_temp(l_p, upp, downn, llog_p)
+    """
+    x = [14, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    y=["amino acids and derivatives", "FA Carn / FA and derivatives", "carbohydrates and derivatives",
+       "MonoSGL", "LPC", "organic acids", "PC", "PE", "peptides", "PS", "SM", "steroids", "TG", "Autres"]
+    exp=[0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
+    co=["steelblue", "firebrick", "darkorange", "limegreen", "yellow", "darkviolet", "lightsteelblue", "lightcoral", "bisque", "aquamarine", "lightyellow", "plum", "pink", "grey"]
+    color_pie(x, co, labels=None, explode=exp, fsize=(15, 15), pourcent=True)
+    plt.show()