diff --git a/Mapping_using_the_API.py b/Mapping_using_the_API.py index 4884197c1237a8adec91d67e50a3c006c586e156..5f7041ff9096aa9d10f1650a97c408a31671539d 100644 --- a/Mapping_using_the_API.py +++ b/Mapping_using_the_API.py @@ -7,23 +7,10 @@ import json from urllib import request import xmltodict import pandas as pd +from utils import excel_file_writer, pre_cut, recup_all_inf_excel FOLDER = "C:\\Users\\mumec\\Desktop\\Mini_codes\\" -def recup_all_inf_excel(file): - """ - This function takes infos from a .xlsx - - Arg: - file = the file to read - Returns: - Type of return: 1 list of list line - """ - datas = pd.read_excel(file, header=None, na_filter=False) - l_datas = datas.values.tolist() - return l_datas - - def send_request_to_mapping_api(url, data_json, head, met='POST'): """ This function gives the result of mapping of a metabolites list from RAMP. @@ -46,47 +33,7 @@ def send_request_to_mapping_api(url, data_json, head, met='POST'): return out_data -def excel_file_writer(dataframe, name_out_file, sheetname="Resultats"): - """ - write an excel file - - Arg: - dataframe = the data to write on dataframe shape - name_out_file = name of the outfile to write - sheetname = name of the sheet to write - - Returns: - Type of return: 1 excel file whith 5 columns - """ - ex_f = pd.ExcelWriter(name_out_file) - dataframe.to_excel(ex_f, sheet_name=sheetname, index=False, header=False) - ex_f.close() - - -def pre_cut(listed): - """ - cut only 1 type of ID by the first entree - - Arg: - list: 1 list of id - - Returns: - Type of return: 1 list - """ - clean_list = [] - cump = 0 - while listed[cump] == "NA": - cump += 1 - pos_cut = listed[cump].index(":") - for elem in listed: - if elem == "NA": - clean_list.append("NA") - else: - clean_list.append(elem[pos_cut+1:]) - return clean_list - - -def mapping_ramp_api(metabolites_list, outfile, inf="opti"): +def mapping_ramp_api(metabolites_list, outfile, inf="flow", flow=False): """ This function gives the result of mapping of a 
metabolites list from RAMP. Here's an example of 4 metabolites giving 505 lines. @@ -94,7 +41,7 @@ def mapping_ramp_api(metabolites_list, outfile, inf="opti"): Arg: metabolites_list = a list of metabolites id - outfiles = name of the outfile to write + outfile = name of the outfile to write inf = if all give the full information Returns: @@ -132,7 +79,7 @@ def mapping_ramp_api(metabolites_list, outfile, inf="opti"): break print(str(len(l_met_map))+" metabolites were found") return (len(l_met_map), l_met_map) - if inf == "all": + if inf in ("all", "flow"): psource = [] pathwayid = [] commonname = [] @@ -141,13 +88,13 @@ def mapping_ramp_api(metabolites_list, outfile, inf="opti"): onel = datas_to_treat[i_b_l[index_pos]:i_b_l[index_pos+1]] pathwayname.append(onel[16:onel.find("pathwaySource")-3]) inputid.append(onel[onel.find("inputId")+10:onel.find("commonName")-3]) - if inf == "all": + if inf in ("all", "flow"): psource.append(onel[onel.find("pathwaySource")+16:onel.find("pathwayId")-3]) pathwayid.append(onel[onel.find("pathwayId")+12:onel.find("inputId")-3]) commonname.append(onel[onel.find("commonName")+13:len(onel)-3]) pathwayname.insert(0, "pathwayName") inputid.insert(0, "inputid") - if inf == "all": + if inf in ("all", "flow"): psource.insert(0, "pathway_source") pathwayid.insert(0, "pathwayid") commonname.insert(0, "commonname") @@ -431,7 +378,7 @@ def get_cpdb_version(): def m_ora_cpdb(accnumbers, acctype, cpdbidsbg=None, - pthreshold=0.05, infos="all", + pthreshold=0.05, infos="flow", ofile="C:\\Users\\mumec\\Desktop\\test_out_cpdb.xlsx"): """ Give the result of id mapping on CPDB @@ -522,7 +469,7 @@ def m_ora_cpdb(accnumbers, acctype, cpdbidsbg=None, id_cln = tab_cor[1].index(id_t) l_map_cor.append(tab_cor[0][id_cln]) return l_map_cor - if infos == "all": + if infos in ("all", "flow"): splited = details.split("',") fsetid = ["fsetId"] cpdburl = ["URLCPDB"] @@ -596,6 +543,8 @@ def m_ora_cpdb(accnumbers, acctype, cpdbidsbg=None, overlapping, size, e_size, 
fsetid, pmids, cpdburl] out_df = pd.DataFrame(data=out_f).transpose() excel_file_writer(out_df, ofile, sheetname="Resultats") + if infos == "flow": + return out_f return ovlent_c @@ -628,7 +577,7 @@ def multimapping_ramp(file, num_col, outfiles, infpath="Yes"): return l_o_d -def opti_multimapping(file, outfolder, mapping="YES"): +def opti_multimapping(file, outfolder, mapping="flow"): """ Processe optimal mapping of RAMP and CPDB @@ -640,13 +589,19 @@ def opti_multimapping(file, outfolder, mapping="YES"): Returns: Type of return: 2 excel files """ - inf = recup_all_inf_excel(file) + if mapping == "flow": + n_mapped = [] + inf = file + else: + inf = recup_all_inf_excel(file) to_test = [] recap = [[]] id_dif = [] col_id = [] + modulation =[] for line in inf: recap[0].append(line[0]) + modulation.append(line[-1]) for ind_head, headers in enumerate(inf[0][1:-1]): if headers not in id_dif: id_dif.append(headers) @@ -681,18 +636,24 @@ def opti_multimapping(file, outfolder, mapping="YES"): if col_actu[index_change] != "NA": to_test.remove(col_actu[index_change]) if len(cpdb_o_opti) == len(inf[1:]) or n_col == col_id[i_t_i][-1]: - if mapping == "YES": + if mapping == "all": cpdbf = outfolder+acctype+"_mapping_opti_cpdb.xlsx" m_ora_cpdb(cpdb_o_opti, acctype, infos="all", ofile=cpdbf) l_opti_for_this_id[0] = "CPDB "+acctype + if mapping == "flow": + n_mapped.append(len(cpdb_o_opti)) recap.append(l_opti_for_this_id) break + if mapping == "flow": + i_map_opt = n_mapped.index(max(n_mapped)) + 1 + cpdbf = outfolder+recap[i_map_opt][0]+"_mapping_opti.xlsx" + datas_cpdb = m_ora_cpdb(cpdb_o_opti, acctype, infos="flow", ofile=cpdbf) for line in inf[1:]: to_test.append(line[1]) l_opt_ramp = [] l_opt_ramp_tri = ["NA" for i in range(len(inf))] n_meta_map = 0 - ramp_outf = FOLDER+"optimapping_ramp.xlsx" + ramp_outf = outfolder+"optimapping_ramp.xlsx" n_meta_map, l_opt_ramp = mapping_ramp_api(to_test, ramp_outf, inf="opti") if n_meta_map == len(inf)-1: mapping_ramp_api(l_opt_ramp, 
ramp_outf, inf="all") @@ -708,7 +669,6 @@ def opti_multimapping(file, outfolder, mapping="YES"): l_opt_ramp_tri[index_l+1] = li[1] input_col = 2 while n_meta_map != len(inf)-1 and input_col != (len(inf[0])-1): - # prend en compte la derniére colones de fold-change if len(to_test) != 0: n_sup, s_map = mapping_ramp_api(to_test, ramp_outf, inf="opti") n_meta_map += n_sup @@ -725,20 +685,27 @@ def opti_multimapping(file, outfolder, mapping="YES"): for ind_ind in index_still: if inf[ind_ind][input_col] != "NA": to_test.append(inf[ind_ind][input_col]) - if mapping == "YES": - mapping_ramp_api(l_opt_ramp, ramp_outf, inf="all") + if mapping == "all": + n_map, datas_ramp = mapping_ramp_api(l_opt_ramp, ramp_outf, inf="all") + if mapping == "flow": + ramp_outf = outfolder + "ramp_mapping_opti.xlsx" + n_map, datas_ramp = mapping_ramp_api(l_opt_ramp, ramp_outf, inf="flow") + print("lines Ramp", n_map) l_opt_ramp_tri[0] = "RAMP" recap.append(l_opt_ramp_tri) - recap = pd.DataFrame(data=recap).transpose() - n_out_f = outfolder+"recap_mapping_opti_oeil.xlsx" - excel_file_writer(recap, n_out_f, sheetname="Resultats") + recap.append(modulation) + df_recap = pd.DataFrame(data=recap).transpose() + n_out_f = outfolder+"recap_multimapping.xlsx" + excel_file_writer(df_recap, n_out_f, sheetname="Resultats") + if mapping == "flow": + return datas_cpdb, datas_ramp, recap return "all is ok" if __name__ == "__main__": F_ENTER = FOLDER+"Donnees_oeil_mis_en_forme_opti_mapping.xlsx" - #opti_multimapping(F_ENTER, FOLDER) + opti_multimapping(F_ENTER, FOLDER) F_O = FOLDER + "test_enrichment_ramp.xlsx" - a, b = mapping_ramp_api(["KEGG:C01157","hmdb:HMDB0000064","hmdb:HMDB0000148","chebi:16015"], F_O, inf="all") - b = pd.DataFrame(data=b).transpose() - excel_file_writer(b, F_O, sheetname="Resultats") \ No newline at end of file + #a, b = mapping_ramp_api(["KEGG:C01157","hmdb:HMDB0000064","hmdb:HMDB0000148","chebi:16015"], F_O, inf="all") + #b = pd.DataFrame(data=b).transpose() + #excel_file_writer(b, 
F_O, sheetname="Resultats") \ No newline at end of file diff --git a/Visualisation_des_donnes_de_mapping.py b/Visualisation_des_donnes_de_mapping.py index 926f7f4979393a50cab517f6dc35b1340d882229..2ebdf96b102903b7e872bae745cc82f82b2ab9ac 100644 --- a/Visualisation_des_donnes_de_mapping.py +++ b/Visualisation_des_donnes_de_mapping.py @@ -175,6 +175,31 @@ def boite_a_metabolites(file, title_plot="boîte à moustache", num_col_plot=1): plt.show() +def barplot(column_x, column_y, df_data, title="barplot", figure_size=(30, 5), + ax_x_label="voies métaboliques", ax_y_label='Recouvrement moyen', + colors='Spectral', decimal='%.1f', size_of_labels=6): + """ + drawn barplot from data + + Arg: + column_x = data for plot absises axis + column_y = data for plot ordinate axis + df_data = dataframe of data to plot + + Returns: + Type de retour: + """ + plt.subplots(1, 1, figsize=figure_size) + p1 = sns.barplot(x=column_x, y=column_y, data=df_data, palette=colors) + plt.subplots_adjust(top=0.90, bottom=0.26) + p1.set(title=title) + plt.xlabel(ax_x_label) + plt.ylabel(ax_y_label) + p1.bar_label(p1.containers[0], fontsize=7, fmt=decimal) + p1.tick_params(axis='x', rotation=90, size=0.05, labelsize=size_of_labels) + return p1 + + def up_down_path_plot(l_path, up, down, log_p): """ plot regulation of pathways diff --git a/__pycache__/Visualisation_des_donnes_de_mapping.cpython-310.pyc b/__pycache__/Visualisation_des_donnes_de_mapping.cpython-310.pyc index e273d9b1eabea1642dccb1dcff7fafa4f1eeb334..a1fe174ff1b8ff659683d23c083016ede401ef4f 100644 Binary files a/__pycache__/Visualisation_des_donnes_de_mapping.cpython-310.pyc and b/__pycache__/Visualisation_des_donnes_de_mapping.cpython-310.pyc differ diff --git a/complete_processing_of_mapping_results.py b/complete_processing_of_mapping_results.py index c0eb5d3d8efdf3f55fb988523b925b233789f09f..28c5e32931487fb9eae65b3f28ea57516c1185e4 100644 --- a/complete_processing_of_mapping_results.py +++ b/complete_processing_of_mapping_results.py @@ 
-11,53 +11,11 @@ import pandas as pd from math import log, floor import sys sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\traitement_des_données') -from Visualisation_des_donnes_de_mapping import up_down_path_plot +from Visualisation_des_donnes_de_mapping import up_down_path_plot, barplot +from utils import column_recovery, comma_cleaning, cor_index, excel_file_writer, excel_m_file_writer LOCAL = "C:\\Users\\mumec\\Desktop\\fichier_mis_en_forme_programme_total\\" - -def column_recovery(file, n, sep=";", enc=None): - """ - Put the culomn n of the file in list - - Arg: - file : A csv file to read - n : the number of column to read - sep : type of separator - - Returns: - Type of return: list - """ - with open(file, "r", encoding=enc) as f: - r = csv.reader(f, delimiter=sep) - lines = list(r) - res = [] - if abs(n) < len(lines[0]): - for line in lines: - if line[n].strip() != '': - res.append(line[n].strip()) - return res - - -def cor_index(list_objects_to_convert, l_all_obj, l_all_equ): - """ - Change elements of a list by the correspondance elements - - Arg: - list_objects_to_convert : list of object - l_all_obj : list who countain all objet to convert - l_all_equ : correspondance list of all object - - Returns: - Type of return: list - """ - l_to_return = [] - for item_to_replace in (list_objects_to_convert): - l_to_return.append(l_all_equ[l_all_obj.index(item_to_replace.strip())]) - - return l_to_return - - -def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file): +def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file, flow=False): """ Give a list of pathways with the correspondent metabolites names @@ -68,10 +26,16 @@ def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file): Returns: Type of return: list """ - column_pathways_name = column_recovery(ramp_mapping_result, 0) - c_input_id = column_recovery(ramp_mapping_result, 3) - associated_name = column_recovery(correspondence_file, 0) - list_aso = 
column_recovery(correspondence_file, 1) + if flow == True: + column_pathways_name = ramp_mapping_result[0] + c_input_id = ramp_mapping_result[3] + list_aso = correspondence_file[-2] + associated_name = correspondence_file[0] + else: + column_pathways_name = column_recovery(ramp_mapping_result, 0) + c_input_id = column_recovery(ramp_mapping_result, 3) + associated_name = column_recovery(correspondence_file, 0) + list_aso = column_recovery(correspondence_file, 1) all_pathways = [] m_id_asso_p = [] @@ -84,14 +48,14 @@ def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file): m_id_asso_p[all_pathways.index(c_p_n)].append(c_input_id[number]) for path_num, pathways in enumerate(all_pathways): - pat = cor_index(m_id_asso_p[path_num], list_aso, associated_name) + pat = cor_index(m_id_asso_p[path_num], list_aso, associated_name) ######ça bloque a cause de l'output du mutlti mapping ['....'] pat.insert(0, pathways) l_to_return.append(pat) return l_to_return -def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file): +def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file, flow=False): """ Give a list of pathways with the correspondent metabolites names @@ -102,21 +66,49 @@ def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file): Returns: Type of return: list """ + if flow == True: + l_pathways = cpdb_mapping_result[2] + l_path_metabo_whith_top = cpdb_mapping_result[5] + id_use = l_path_metabo_whith_top[1][0] + for itraverse, traverse in enumerate(correspondence_file): + if id_use in traverse: + associated_chebi = correspondence_file[itraverse] + break + associated_name = correspondence_file[0] + p_value = cpdb_mapping_result[0] + m_inp_ol = cpdb_mapping_result[8] + l_pathways = l_pathways[1:] + l_path_metabo = l_path_metabo_whith_top[1:] + l_to_return = [] + + for num_path_t, l_p_m in enumerate(l_path_metabo): + path_cont = l_p_m + paths_to_rec = cor_index(path_cont, associated_chebi, associated_name) + 
paths_to_rec.insert(0, l_pathways[num_path_t]) + paths_to_rec.insert(0, m_inp_ol[num_path_t + 1]) + paths_to_rec.insert(0, p_value[num_path_t + 1]) + l_to_return.append(paths_to_rec) + print(paths_to_rec) + return l_to_return + associated_name = column_recovery(correspondence_file, 0) associated_chebi = column_recovery(correspondence_file, 1) l_pathways = column_recovery(cpdb_mapping_result, 2) - l_pathways = l_pathways[1:] l_path_metabo_whith_top = column_recovery(cpdb_mapping_result, 5) - l_path_metabo = l_path_metabo_whith_top[1:] p_value = column_recovery(cpdb_mapping_result, 0) m_inp_ol = column_recovery(cpdb_mapping_result, 8) + l_pathways = l_pathways[1:] + l_path_metabo = l_path_metabo_whith_top[1:] l_to_return = [] + for num_path_t, l_p_m in enumerate(l_path_metabo): + print(l_p_m) path_cont = [] if (len(l_p_m)) > 6: # regulation a vérifier comma_pos = [] for index, t_l_p_m in enumerate(l_p_m): - if t_l_p_m == ",": # probleme entre ; et , + print(t_l_p_m) + if t_l_p_m == ";": # probleme entre ; et , comma_pos.append(index) for n_comma in range(len(comma_pos)+1): if n_comma == 0: @@ -135,6 +127,7 @@ def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file): paths_to_rec.insert(0, m_inp_ol[num_path_t + 1]) paths_to_rec.insert(0, p_value[num_path_t + 1]) l_to_return.append(paths_to_rec) + print(paths_to_rec) return l_to_return @@ -192,52 +185,6 @@ def recup_ma_pathways_list(ma_mapping_result, number_of_columns): return l_to_return -def comma_cleaning(str_to_clean): - """ - Replace potential ',' by '_' - - Arg: - str_to_clean = list of character with potentialy ',' - - Returns: - Type of return: character - """ - if ',' in str_to_clean: - while ',' in str_to_clean: - str_to_clean = re.sub(",", "_", str(str_to_clean)) - return str_to_clean - - -def excel_file_writer(dataframe, n_o_f, sheetname="Resultats"): - """ - Take a dataframe and write an excel file with this data - - Arg: - dataframe = dataframe of data to write - n_o_f = name and acces path 
of the new excel file - sheetname = The name of the new sheet - """ - ex_f = pd.ExcelWriter(n_o_f) # pylint: disable=abstract-class-instantiated - dataframe.to_excel(ex_f, sheet_name=sheetname, header=False, index=False) - ex_f.close() - - -def excel_m_file_writer(list_of_dataframe, n_outf, list_of_sheetname): - """ - Take a list of dataframe and write an excel file with these data - - Arg: - list_of_dataframe = list of dataframe to write - n_outf = name and acces path of the new excel file - list_of_sheetname = list of sheets names to write - """ - e_f = pd.ExcelWriter(n_outf) # pylint: disable=abstract-class-instantiated - for df_index, l_o_d in enumerate(list_of_dataframe): - s_n = list_of_sheetname[df_index] - l_o_d.to_excel(e_f, sheet_name=s_n, header=False, index=False) - e_f.close() - - def pathways_selection(list_of_list_to_select, list_of_object_to_filter): """ Only keep the object they are not in the filter list @@ -342,35 +289,9 @@ def df_matrix_r(sim_matrix): return look_like -def barplot(column_x, column_y, df_data, title="barplot", figure_size=(30, 5), - ax_x_label="voies métaboliques", ax_y_label='Recouvrement moyen', - colors='Spectral', decimal='%.1f', size_of_labels=6): - """ - drawn barplot from data - - Arg: - column_x = data for plot absises axis - column_y = data for plot ordinate axis - df_data = dataframe of data to plot - - Returns: - Type de retour: - """ - plt.subplots(1, 1, figsize=figure_size) - p1 = sns.barplot(x=column_x, y=column_y, data=df_data, palette=colors) - plt.subplots_adjust(top=0.90, bottom=0.26) - p1.set(title=title) - plt.xlabel(ax_x_label) - plt.ylabel(ax_y_label) - p1.bar_label(p1.containers[0], fontsize=7, fmt=decimal) - p1.tick_params(axis='x', rotation=90, size=0.05, labelsize=size_of_labels) - return p1 - - def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", fold_of_visu_sav=LOCAL, midfile="Yes", - midfile_name=LOCAL+"\\mid_file.xlsx", n_path_to_filt="nothing", modul=None, f_modul=None): """ Do 
the complet treatement of mapping results @@ -384,16 +305,21 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", save_plot = possibility to specify or not the 3 plots fold_of_visu_sav = folder where save plot(s) midfile = if yes the midfile while be write - midfile_name = name of the output midfile n_path_to_filt = list of object to filter """ if mapper == "CPDB": - c_file = input("In which file is the correspondence table?") - l_of_pathways_list = recup_cpdb_pathways_list(file, c_file) + if modul == "flow": + l_of_pathways_list = recup_cpdb_pathways_list(file, f_modul, flow=True) + else: + c_file = input("In which file is the correspondence table?") + l_of_pathways_list = recup_cpdb_pathways_list(file, c_file) elif mapper == "RAMP": - c_file = input("In which file is the correspondence table ?") - l_of_pathways_list = recup_ramp_pathways_list(file, c_file) + if modul == "flow": + l_of_pathways_list = recup_ramp_pathways_list(file, f_modul, flow=True) + else: + c_file = input("In which file is the correspondence table ?") + l_of_pathways_list = recup_ramp_pathways_list(file, c_file) elif mapper == "ME": fold = input("in which folder are the files?") # no "" around acces n_files = int(input("how many files you have in the folder?")) @@ -409,16 +335,24 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", log_p = [] metabo = column_recovery(f_modul, 0) value_modul = column_recovery(f_modul, 1) - #print(l_of_pathways_list) + if modul == "flow": + list_path = [] + up = [] + down = [] + log_p = [] + metabo = f_modul[0] + value_modul = f_modul[-1] for i_p_l, path_l in enumerate(l_of_pathways_list): - l_of_pathways_list[i_p_l][2] = comma_cleaning(path_l[2]) - if modul == True: + if mapper != "RAMP": + l_of_pathways_list[i_p_l][2] = comma_cleaning(path_l[2]) + else: + l_of_pathways_list[i_p_l][0] = comma_cleaning(path_l[0]) + if modul in (True, "flow") and mapper != "RAMP": actu_up = 0 actu_down = 0 
list_path.append(l_of_pathways_list[i_p_l][2]) for path_meta in path_l[3:]: # print(comma_cleaning(path_meta)) Probable probléme de version entre ME et les autres (a vériifer) - #print(path_meta) if mapper == "ME": if float(value_modul[metabo.index(comma_cleaning(path_meta))]) >= 0: actu_up += 1 @@ -432,7 +366,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", up.append((actu_up/int(path_l[1]))*100) down.append((actu_down/int(path_l[1]))*100) log_p.append(-log(float(path_l[0]))) - if modul == True: + if modul in (True, "flow") and mapper != "RAMP": n_m_i_p = 200 if len(log_p) > n_m_i_p: print(len(log_p)) @@ -454,13 +388,14 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", plt.savefig(fold_of_visu_sav+"up_down_path_plot"+str(under_plot + 1)+".png") else : plot = up_down_path_plot(list_path, up, down, log_p) - plt.savefig(fold_of_visu_sav+"up_down_path_plot.png") + plt.savefig(fold_of_visu_sav+ mapper +"up_down_path_plot.png") if midfile == "Yes": + midfile_name = outf + mapper + "fmid_file.xlsx" mid_data = pd.DataFrame(l_of_pathways_list, dtype=object) excel_file_writer(mid_data, midfile_name, sheetname="Resultats") - for index_cleaning, full in enumerate(l_of_pathways_list): - l_of_pathways_list[index_cleaning] = full[2:] - #print(l_of_pathways_list) + if mapper != "RAMP": + for index_cleaning, full in enumerate(l_of_pathways_list): + l_of_pathways_list[index_cleaning] = full[2:] if n_path_to_filt != "nothing": l_path_l_treat = pathways_selection(l_of_pathways_list, n_path_to_filt) else: @@ -481,6 +416,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", sum_l = 0 for num_col in range(n_path_to_treat): shared = 0 + p1 = l_path_l_treat[num_line][1:] p2 = l_path_l_treat[num_col][1:] for metabolite_search in (p1): @@ -489,9 +425,12 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", mir_table[num_line, num_col] = shared sum_l += shared if len(p1) == 1: + print(p1) 
one_metabo_path.append(l_path_l_treat[num_line][0]) met_one_met_path.append(l_path_l_treat[num_line][1]) + print(l_path_l_treat[num_line]) pathways_names.append(l_path_l_treat[num_line][0]) + for metabolite_of_p1 in p1: if metabolite_of_p1 not in all_metabolites: all_metabolites.append(metabolite_of_p1) @@ -513,9 +452,10 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", all_metabolites[n_metab] = comma_cleaning(a_m) approximate_table = np.array(mir_table, dtype=object) - metabo_f1, path_metabo_f1 = list_f_1(metabolite_frequency, all_metabolites, + metabo_f1, path_metabo_f1 = list_f_1(metabolite_frequency, all_metabolites, # probléme a régler pathways_of_metabo, pathways_names, l_path_l_treat) + meta_and_path_p = pa_metabo(all_metabolites, pathways_of_metabo) all_metabolites.insert(0, "Ensemble des métabolites") @@ -533,10 +473,12 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", for index_fm in range(1, len(all_metabolites)): all_metabolites[index_fm] = f_metabo[index_fm-1][1] metabolite_frequency[index_fm] = f_metabo[index_fm-1][0] + inf_shap = [[len(one_metabo_path), one_metabo_path, met_one_met_path], [len(path_metabo_f1), metabo_f1, path_metabo_f1], [len(all_metabolites), all_metabolites, metabolite_frequency]] + inf_shap.sort() counter = 0 metabo_f_order = [] @@ -558,19 +500,19 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", metabo_f_order_for_export = np.array(metabo_f_order, dtype=object) metabo_f_order_for_export = pd.DataFrame(data=metabo_f_order_for_export) - patways_reco_order = recov_pos_path_name(totale_recovery, average_recovery, - pathways_names) + patways_reco_order = recov_pos_path_name(totale_recovery, average_recovery, pathways_names) patways_reco_order_for_export = pd.DataFrame(data=patways_reco_order) df_matrix_table = df_matrix_r(approximate_table) + result_out_file = outf+ mapper+"resultats_traitment_mapping.xlsx" excel_m_file_writer([patways_reco_order_for_export, 
df_matrix_table, metabo_f_order_for_export, - meta_and_path_p], outf, + meta_and_path_p], result_out_file, ["Voies métaboliques", "Table de ressemblance", "Fréquence métabolites", "Métabolites et leurs P"]) - data_for_recovery_visualization = pd.DataFrame(data=patways_reco_order[1:]) colnames_recovery = list(data_for_recovery_visualization.columns) + print(data_for_recovery_visualization) if type_of_view in ("all", "bar_plot", "bar_plot_r", "bar_r_meta_p"): barplot(colnames_recovery[2], colnames_recovery[1], @@ -578,7 +520,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", title="Recouvrement moyen des différentes voies métaboliques", figure_size=(22, 10), size_of_labels=6) if save_plot in ("all", "bar_plot", "bar_plot_r", "bar_r_meta_p"): - plt.savefig(fold_of_visu_sav+"bar_plot_of_recovery.png") + plt.savefig(fold_of_visu_sav+mapper+"bar_plot_of_recovery.png") plt.show() just_frequency = [] @@ -598,7 +540,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", figure_size=(22, 10), ax_x_label="Métabolites d'intérêt", ax_y_label='Fréquence', decimal='%.0f', size_of_labels=7) if save_plot in ("all", "bar_plot", "bar_plot_f", "bar_f_meta_p"): - plt.savefig(fold_of_visu_sav+"bar_plot_of_metabolites.png") + plt.savefig(fold_of_visu_sav+mapper+"bar_plot_of_metabolites.png") plt.show() if type_of_view in ("all", "meta_box", "bar_f_meta_p", "bar_r_meta_p"): @@ -607,20 +549,18 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", b1.set(title="Boîte à moustache des fréquences des métabolites") plt.ylabel("fréquence des métabolites") if save_plot in ("all", "meta_box", "bar_f_meta_p", "bar_r_meta_p"): - plt.savefig(fold_of_visu_sav+"metabolites_bo_of_frequency.png") + plt.savefig(fold_of_visu_sav+ mapper+"metabolites_bo_of_frequency.png") plt.show() - if __name__ == "__main__": #MAP = 'RAMP' MAP = "CPDB" #MAP = "ME" VIEW = "all" SAVE = "all" - INFILE = LOCAL + "ora_cpdb_data_yeux_reactome_rev_18-01-2024.csv" 
+ INFILE = LOCAL + "CPDB\\Resultats_mapping_Chebi_ID_L100_CPDB.csv" #INFILE = "ExportExcel_6843" #INFILE = LOCAL + "RAMP\\sortie_Mapping_RAMP_L100_CheEBI.csv" - FINISHFILE = LOCAL + "test_oeil.xlsx" - FILE_MODUL = LOCAL + "chebi_modulation_intensite_patho_oeil_donnes_estelles_rev_19-01-2024.csv" - #FILE_MODUL = LOCAL + "CPDB\\liste_Chebi_des_100_chebi_ConsensusPAthDB_modul.csv" + FINISHFILE = LOCAL + "test.xlsx" + FILE_MODUL = LOCAL + "CPDB\\liste_Chebi_des_100_chebi_ConsensusPAthDB_modul.csv" c_p_o_m_r(INFILE, FINISHFILE, MAP, type_of_view=VIEW, save_plot=SAVE, modul=True, f_modul=FILE_MODUL) diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..b791e2c45d6f270ec81bceeea01b4a193e0eb0cb --- /dev/null +++ b/main.py @@ -0,0 +1,62 @@ +""" +This module is designed to process the data obtained during metabolite mapping. +The main function is c_p_o_m_r +""" +import re +import csv +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import pandas as pd +from math import log, floor +import sys +sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\traitement_des_données') +sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\chebi-ids.git') +from utils import excel_file_writer, column_recovery, excel_m_file_writer, comma_cleaning, pre_cut, cor_index, recup_all_inf_excel +from Recovery_of_associated_Chebi_IDs import chebi_horizontal, chebi_in_outgouing +from Visualisation_des_donnes_de_mapping import up_down_path_plot, barplot +from complete_processing_of_mapping_results import recup_ramp_pathways_list, recup_cpdb_pathways_list, recup_me_path_list, recup_ma_pathways_list, pathways_selection, list_f_1, pa_metabo, recov_pos_path_name, df_matrix_r, c_p_o_m_r +from Mapping_using_the_API import send_request_to_mapping_api, mapping_ramp_api, m_ora_cpdb, opti_multimapping +from network_visualization import Paths_link_CPDB, network_visu + +FOLDER = 
"C:\\Users\\mumec\\Desktop\\fichier_mis_en_forme_programme_total\\main\\" + + +def shapping_data(file, folder): + """ + Takes data from an excel file and formats it for further workflow steps + + Arg: + file : file with data obtain after analysis + folder : folder in which the Excel file containing the modification results will be saved + + Returns: + Type of return: list and 1 file .xlsx + + """ + beg_datas = recup_all_inf_excel(file) + """ + if "chebi" in beg_datas[0]: + i_c_chebi = beg_datas.find("chebi") + chebi_increased = chebi_horizontal(beg_datas[i_c_chebi]) # soit modifier pour sortir la liste soit créer une fonction qui fait les 2 directement + chebi_increased.append(chebi_in_outgouing(beg_datas[i_c_chebi])) + datas_for_mapping = chebi_increased + beg_datas[1:i_c_chebi] + beg_datas[i_c_chebi+1:] + """ + datas_for_mapping = beg_datas + df_dfm = pd.DataFrame(data=datas_for_mapping) + n_o_f = folder + "Datas_mis_en_forme_pour_le_mapping.xlsx" + excel_file_writer(df_dfm, n_o_f) + return(datas_for_mapping) + + +if __name__ == "__main__": + INFILE = FOLDER + "Donnees_oeil_mis_en_forme_opti_mapping.xlsx" + datas_f_map = shapping_data(INFILE, FOLDER) + result_cpdb, result_ramp, recap = opti_multimapping(datas_f_map, FOLDER, mapping="flow") + #c_p_o_m_r(result_ramp, FOLDER, "RAMP", fold_of_visu_sav=FOLDER, modul="flow", f_modul=recap) + #c_p_o_m_r(result_cpdb, FOLDER, "CPDB", fold_of_visu_sav=FOLDER, modul="flow", f_modul=recap) + l_bdd = ["Reactome", "Wikipathways", "KEGG", "EHMN", "HumanCyc", "SMPDB", "INOH"] + for bdd in l_bdd: + out_path_links = FOLDER + "CPDB_links_network"+ bdd+"datas_base.xlsx" + edge_data, nodes_data = Paths_link_CPDB(result_cpdb, out_path_links , recap, bdd= bdd, flow=True) + print(network_visu(edge_data[0:3], nodes_data, bdd="HumanCyc")) diff --git a/network_visualization.py b/network_visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..3a6692e5fb1d97e397e6c994f220e0cc194ddbd2 --- /dev/null +++ 
"""
Build a pathway network from CPDB over-representation results and
display it in a running Cytoscape session (via py4cytoscape).
"""
import re
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import py4cytoscape as p4c
from py4cytoscape import palette_color_brewer_d_RdBu
from math import log, floor
import sys
sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\traitement_des_données')
from utils import excel_file_writer, column_recovery

LOCAL = "C:\\Users\\mumec\\Desktop\\Mini_codes\\"


def Paths_link_CPDB(csv_file, out_file, int_file, bdd="Reactome", flow=None):
    """
    Link CPDB pathways that share metabolites of interest.

    Arg:
        csv_file = CPDB ORA result: a .csv path when flow is None,
                   otherwise the result already loaded as a list of columns
        out_file = name of the excel file receiving the edge table
        int_file = metabolite intensities: a .csv path when flow is None,
                   otherwise a list of columns
        bdd = pathway source database to keep (e.g. "Reactome")
        flow = None for file mode, anything else for in-memory mode

    Returns:
        Type of return: (edge table as a list of 7 columns,
                         node table as a list of 5 columns)
    """
    if flow is None:
        # File mode: every column is read from the csv files on disk.
        all_l_paths = column_recovery(csv_file, 2)
        all_l_len_path = column_recovery(csv_file, 8)
        all_l_meta_in = column_recovery(csv_file, 5)
        all_l_p_value = column_recovery(csv_file, 0)
        source = column_recovery(csv_file, 3)
        l_all_meta = column_recovery(int_file, 0)[1:]
        int_cas = column_recovery(int_file, 1)[1:]
        int_tem = column_recovery(int_file, 2)[1:]
        modul = []
        # Modulation = case intensity minus control intensity.
        for i_cas, cas in enumerate(int_cas):
            modul.append(float(cas) - float(int_tem[i_cas]))
            l_all_meta[i_cas] = l_all_meta[i_cas].strip()
    else:
        # In-memory mode: the data are already lists of columns.
        all_l_paths = csv_file[2]
        all_l_len_path = csv_file[8]
        all_l_meta_in = csv_file[5]
        all_l_p_value = csv_file[0]
        source = csv_file[3]
        if 'HMDB' in all_l_meta_in[1][0]:
            # NOTE(review): original comment flags a CPDB output-ID issue
            # still to be fixed here.
            l_all_meta = int_file[2][1:]
        else:
            l_all_meta = int_file[1][1:]
        modul = int_file[-1][1:]
    # Keep only the pathways of the requested database.
    l_paths = []
    l_len_path = []
    l_p_value = []
    l_meta_in = []
    # "path_name" instead of "np": the original loop variable shadowed numpy.
    for i_path, path_name in enumerate(all_l_paths):
        if source[i_path] == bdd:
            l_paths.append(path_name.replace(",", ";"))
            l_p_value.append(all_l_p_value[i_path])
            l_len_path.append(all_l_len_path[i_path])
            l_meta_in.append(all_l_meta_in[i_path])
    # Convert p-values (index 0 is the header). float() natively parses
    # scientific notation; the previous hand-written digit expansion broke
    # on single-digit exponents such as "1.2e-7".
    for i_lpval, lpval in enumerate(l_p_value[1:]):
        l_p_value[i_lpval + 1] = float(lpval)
    edge = []
    modul_path = ["modulation de la voie"]
    n_meta_int_in = ["numbers of metabolite of interest"]
    for index_p, act_path in enumerate(l_paths):
        if index_p != 0 and act_path != l_paths[-1]:
            edge_now = []
            if flow is None:
                splited = l_meta_in[index_p].split(",")
            else:
                splited = l_meta_in[index_p]
            # Strip in place: in in-memory mode "splited" IS the shared
            # l_meta_in entry, and later membership tests rely on that.
            for i in range(len(splited)):
                splited[i] = splited[i].strip()
            # Pathway modulation = sum of its metabolites' modulations.
            # Loop-invariant, so computed once instead of once per target.
            mod = 0
            for met in splited:
                mod += modul[l_all_meta.index(met)]
            for index_m, try_met in enumerate(l_meta_in[index_p + 1:]):
                # Edge weight = number of shared metabolites (substring
                # test in file mode, list membership in in-memory mode).
                links = 0
                for met in splited:
                    if met in try_met:
                        links += 1
                edge_now.append([l_paths[index_p + 1 + index_m],
                                 links, mod, len(splited)])
            edge.append(edge_now)
            n_meta_int_in.append(len(splited))
            modul_path.append(mod)
        elif act_path == l_paths[-1]:
            # Last pathway: no remaining target, emit a weight-0 entry.
            if flow is None:
                splited = l_meta_in[index_p].split(",")
            else:
                splited = l_meta_in[index_p]
            for i in range(len(splited)):
                splited[i] = splited[i].strip()
            mod = 0
            for met in splited:
                mod += modul[l_all_meta.index(met)]
            edge.append([[act_path, 0, mod, len(splited)]])
            n_meta_int_in.append(len(splited))
            modul_path.append(mod)
    source = ["Source"]
    target = ["Target"]
    n_edge = ["n_edge"]
    modulation = ["Modulation"]
    n_meta_map = ["Metabo_map"]
    len_path = ["Number of metabolites in Pathway"]
    p_value = ["p-value"]
    # "edge_group": the original rebound "edge" while iterating it.
    for index_edge, edge_group in enumerate(edge):
        for new_entree in edge_group:
            source.append(l_paths[index_edge + 1])
            target.append(new_entree[0])
            n_edge.append(new_entree[1])
            modulation.append(new_entree[2])
            n_meta_map.append(new_entree[3])
            len_path.append(l_len_path[index_edge + 1])
            p_value.append(l_p_value[index_edge + 1])
    out_data = [source, target, n_edge, modulation, n_meta_map, len_path,
                p_value]
    nodes = [l_paths, l_p_value, n_meta_int_in, l_len_path, modul_path]
    print(len(l_paths), len(l_p_value), len(n_meta_int_in),
          len(l_len_path), len(modul_path))
    network = pd.DataFrame(data=out_data).transpose()
    excel_file_writer(network, out_file, sheetname="Network links")
    return out_data, nodes


def network_visu(edge, nodes, bdd="Reactome"):
    """
    Draw the pathway network in a running Cytoscape session.

    Arg:
        edge = [source, target, weight] columns, each with a header cell
        nodes = [paths, p-values, n metabolites mapped, pathway sizes,
                 modulations] columns, each with a header cell
        bdd = database name appended to the network title

    Returns:
        Type of return: [min, mid, max] bounds of the p-value colour mapping
    """
    source = nodes[0][1:]
    p_value = nodes[1][1:]
    n_meta_in_path = nodes[2][1:]
    len_tot_path = nodes[3][1:]
    modul_path = nodes[4][1:]
    source_for_target = edge[0][1:-1]
    target = edge[1][1:-1]
    weight_ege = edge[2][1:-1]
    p4c.cytoscape_ping()
    p4c.cytoscape_version_info()
    df_nodes = pd.DataFrame(data={'id': source, 'p value': p_value,
                                  'N metabolites mapped': n_meta_in_path,
                                  'N metabolites in pathway': len_tot_path,
                                  'Pathway modulation': modul_path})
    df_edges = pd.DataFrame(data={'source': source_for_target,
                                  'target': target,
                                  'weight': weight_ege})
    p4c.create_network_from_data_frames(nodes=df_nodes, edges=df_edges,
                                        title="CPDB_network_" + bdd,
                                        collection="Network_from_mapping")
    # Fixed display parameters.
    p4c.set_node_shape_default('ELLIPSE')
    p4c.set_node_font_size_default(17)
    # Node colour: continuous mapping on the number of mapped metabolites.
    nmm_min = min(n_meta_in_path)
    nmm_max = max(n_meta_in_path)
    nmm_c = nmm_min + (nmm_max - nmm_min) / 2
    p4c.set_node_color_mapping('N metabolites mapped',
                               [nmm_min, nmm_c, nmm_max],
                               ['#e6eeff', '#6699ff', '#000099'],
                               mapping_type='c')
    # Label colour: continuous mapping on the p-value.
    pv_min = min(p_value)
    pv_max = max(p_value)
    pv_c = pv_min + (pv_max - pv_min) / 3
    p4c.set_node_label_color_mapping('p value', [pv_min, pv_c, pv_max],
                                     ['#145214', '#ffb3ff', '#4d004d'],
                                     mapping_type='c')
    # Edge-width bounds come from the edge weights themselves; the original
    # took them from the node p-values, which broke the width mapping.
    w_min = min(weight_ege)
    w_max = max(weight_ege)
    w_c = w_min + (w_max - w_min) / 2
    p4c.set_edge_line_width_mapping('weight', [w_min, w_c, w_max],
                                    [0.5, 1.75, 3], mapping_type='c')
    # Node size proportional to pathway size (halved for readability).
    for i_ltp, ltp in enumerate(len_tot_path):
        len_tot_path[i_ltp] = int(ltp) / 2
    p4c.set_node_height_bypass(source, len_tot_path)
    p4c.set_node_width_bypass(source, len_tot_path)
    p4c.layout_network('degree-circle')
    return [pv_min, pv_c, pv_max]


if __name__ == "__main__":
    csv_f = LOCAL + "ora_cpdb_data_yeux_reactome_rev_18-01-2024.csv"
    out_file = LOCAL + "reseax_edge_tab_data_oeil_cpdb_reactome_v2_rev_19-01-2024.xlsx"
    intens = LOCAL + "chebi_intensite_patho_oeil_donnes_estelles_rev_17-01-2024.csv"
    edge_data, nodes_data = Paths_link_CPDB(csv_f, out_file, intens)
    print(network_visu(edge_data[0:3], nodes_data))
"chebi_intensite_patho_oeil_donnes_estelles_rev_17-01-2024.csv" + edge_data, nodes_data = Paths_link_CPDB(csv_f, out_file, intens) + print(network_visu(edge_data[0:3], nodes_data)) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c3ac3553efc59a1631e4d871375b11d1ac5b4848 --- /dev/null +++ b/utils.py @@ -0,0 +1,130 @@ +""" +all function utils +""" +import pandas as pd +import csv +import re + + +def excel_file_writer(dataframe, n_o_f, sheetname="Resultats"): + """ + Take a dataframe and write an excel file with this data + + Arg: + dataframe = dataframe of data to write + n_o_f = name and acces path of the new excel file + sheetname = The name of the new sheet + """ + ex_f = pd.ExcelWriter(n_o_f) # pylint: disable=abstract-class-instantiated + dataframe.to_excel(ex_f, sheet_name=sheetname, header=False, index=False) + ex_f.close() + + +def column_recovery(file, n, sep=";", enc=None): + """ + Put the culomn n of the file in list + + Arg: + file : A csv file to read + n : the number of column to read + sep : type of separator + + Returns: + Type of return: list + """ + with open(file, "r", encoding=enc) as f: + r = csv.reader(f, delimiter=sep) + lines = list(r) + res = [] + if abs(n) < len(lines[0]): + for line in lines: + if line[n].strip() != '': + res.append(line[n].strip()) + return(res) + + +def excel_m_file_writer(list_of_dataframe, n_outf, list_of_sheetname): + """ + Take a list of dataframe and write an excel file with these data + + Arg: + list_of_dataframe = list of dataframe to write + n_outf = name and acces path of the new excel file + list_of_sheetname = list of sheets names to write + """ + e_f = pd.ExcelWriter(n_outf) # pylint: disable=abstract-class-instantiated + for df_index, l_o_d in enumerate(list_of_dataframe): + s_n = list_of_sheetname[df_index] + l_o_d.to_excel(e_f, sheet_name=s_n, header=False, index=False) + e_f.close() + + +def comma_cleaning(str_to_clean): + """ + Replace potential ',' by 
'_' + + Arg: + str_to_clean = list of character with potentialy ',' + + Returns: + Type of return: character + """ + if ',' in str_to_clean: + while ',' in str_to_clean: + str_to_clean = re.sub(",", "_", str(str_to_clean)) + return str_to_clean + + +def pre_cut(listed): + """ + cut only 1 type of ID by the first entree + + Arg: + list: 1 list of id + + Returns: + Type of return: 1 list + """ + clean_list = [] + cump = 0 + while listed[cump] == "NA": + cump += 1 + pos_cut = listed[cump].index(":") + for elem in listed: + if elem == "NA": + clean_list.append("NA") + else: + clean_list.append(elem[pos_cut+1:]) + return clean_list + + +def recup_all_inf_excel(file): + """ + This function takes infos from a .xlsx + + Arg: + file = the file to read + Returns: + Type of return: 1 list of list line + """ + datas = pd.read_excel(file, header=None, na_filter=False) + l_datas = datas.values.tolist() + return l_datas + + +def cor_index(list_objects_to_convert, l_all_obj, l_all_equ): + """ + Change elements of a list by the correspondance elements + + Arg: + list_objects_to_convert : list of object + l_all_obj : list who countain all objet to convert + l_all_equ : correspondance list of all object + + Returns: + Type of return: list + """ + l_to_return = [] + for item_to_replace in (list_objects_to_convert): + l_to_return.append(l_all_equ[l_all_obj.index(item_to_replace.strip())]) + return l_to_return \ No newline at end of file