From 1858b5dd6b4c6476c116ee56245921c2d8a10f4f Mon Sep 17 00:00:00 2001 From: local_comparaison <mathieu.umec@inrae.fr> Date: Mon, 29 Jan 2024 14:26:08 +0100 Subject: [PATCH] addition of main file and network visualization as well as adjustments to other programs to facilitate workflow --- Mapping_using_the_API.py | 117 +++------ Visualisation_des_donnes_de_mapping.py | 25 ++ ...tion_des_donnes_de_mapping.cpython-310.pyc | Bin 5713 -> 7532 bytes complete_processing_of_mapping_results.py | 246 +++++++----------- main.py | 62 +++++ network_visualization.py | 173 ++++++++++++ utils.py | 130 +++++++++ 7 files changed, 525 insertions(+), 228 deletions(-) create mode 100644 main.py create mode 100644 network_visualization.py create mode 100644 utils.py diff --git a/Mapping_using_the_API.py b/Mapping_using_the_API.py index 4884197..5f7041f 100644 --- a/Mapping_using_the_API.py +++ b/Mapping_using_the_API.py @@ -7,23 +7,10 @@ import json from urllib import request import xmltodict import pandas as pd +from utils import excel_file_writer, pre_cut, recup_all_inf_excel FOLDER = "C:\\Users\\mumec\\Desktop\\Mini_codes\\" -def recup_all_inf_excel(file): - """ - This function takes infos from a .xlsx - - Arg: - file = the file to read - Returns: - Type of return: 1 list of list line - """ - datas = pd.read_excel(file, header=None, na_filter=False) - l_datas = datas.values.tolist() - return l_datas - - def send_request_to_mapping_api(url, data_json, head, met='POST'): """ This function gives the result of mapping of a metabolites list from RAMP. 
@@ -46,47 +33,7 @@ def send_request_to_mapping_api(url, data_json, head, met='POST'): return out_data -def excel_file_writer(dataframe, name_out_file, sheetname="Resultats"): - """ - write an excel file - - Arg: - dataframe = the data to write on dataframe shape - name_out_file = name of the outfile to write - sheetname = name of the sheet to write - - Returns: - Type of return: 1 excel file whith 5 columns - """ - ex_f = pd.ExcelWriter(name_out_file) - dataframe.to_excel(ex_f, sheet_name=sheetname, index=False, header=False) - ex_f.close() - - -def pre_cut(listed): - """ - cut only 1 type of ID by the first entree - - Arg: - list: 1 list of id - - Returns: - Type of return: 1 list - """ - clean_list = [] - cump = 0 - while listed[cump] == "NA": - cump += 1 - pos_cut = listed[cump].index(":") - for elem in listed: - if elem == "NA": - clean_list.append("NA") - else: - clean_list.append(elem[pos_cut+1:]) - return clean_list - - -def mapping_ramp_api(metabolites_list, outfile, inf="opti"): +def mapping_ramp_api(metabolites_list, outfile, inf="flow", flow=False): """ This function gives the result of mapping of a metabolites list from RAMP. Here's an example of 4 metabolites giving 505 lines. 
@@ -94,7 +41,7 @@ def mapping_ramp_api(metabolites_list, outfile, inf="opti"): Arg: metabolites_list = a list of metabolites id - outfiles = name of the outfile to write + outfile = name of the outfile to write inf = if all give the full information Returns: @@ -132,7 +79,7 @@ def mapping_ramp_api(metabolites_list, outfile, inf="opti"): break print(str(len(l_met_map))+" metabolites were found") return (len(l_met_map), l_met_map) - if inf == "all": + if inf in ("all", "flow"): psource = [] pathwayid = [] commonname = [] @@ -141,13 +88,13 @@ def mapping_ramp_api(metabolites_list, outfile, inf="opti"): onel = datas_to_treat[i_b_l[index_pos]:i_b_l[index_pos+1]] pathwayname.append(onel[16:onel.find("pathwaySource")-3]) inputid.append(onel[onel.find("inputId")+10:onel.find("commonName")-3]) - if inf == "all": + if inf in ("all", "flow"): psource.append(onel[onel.find("pathwaySource")+16:onel.find("pathwayId")-3]) pathwayid.append(onel[onel.find("pathwayId")+12:onel.find("inputId")-3]) commonname.append(onel[onel.find("commonName")+13:len(onel)-3]) pathwayname.insert(0, "pathwayName") inputid.insert(0, "inputid") - if inf == "all": + if inf in ("all", "flow"): psource.insert(0, "pathway_source") pathwayid.insert(0, "pathwayid") commonname.insert(0, "commonname") @@ -431,7 +378,7 @@ def get_cpdb_version(): def m_ora_cpdb(accnumbers, acctype, cpdbidsbg=None, - pthreshold=0.05, infos="all", + pthreshold=0.05, infos="flow", ofile="C:\\Users\\mumec\\Desktop\\test_out_cpdb.xlsx"): """ Give the result of id mapping on CPDB @@ -522,7 +469,7 @@ def m_ora_cpdb(accnumbers, acctype, cpdbidsbg=None, id_cln = tab_cor[1].index(id_t) l_map_cor.append(tab_cor[0][id_cln]) return l_map_cor - if infos == "all": + if infos in ("all", "flow"): splited = details.split("',") fsetid = ["fsetId"] cpdburl = ["URLCPDB"] @@ -596,6 +543,8 @@ def m_ora_cpdb(accnumbers, acctype, cpdbidsbg=None, overlapping, size, e_size, fsetid, pmids, cpdburl] out_df = pd.DataFrame(data=out_f).transpose() 
excel_file_writer(out_df, ofile, sheetname="Resultats") + if infos == "flow": + return out_f return ovlent_c @@ -628,7 +577,7 @@ def multimapping_ramp(file, num_col, outfiles, infpath="Yes"): return l_o_d -def opti_multimapping(file, outfolder, mapping="YES"): +def opti_multimapping(file, outfolder, mapping="flow"): """ Processe optimal mapping of RAMP and CPDB @@ -640,13 +589,19 @@ def opti_multimapping(file, outfolder, mapping="YES"): Returns: Type of return: 2 excel files """ - inf = recup_all_inf_excel(file) + if mapping == "flow": + n_mapped = [] + inf = file + else: + inf = recup_all_inf_excel(file) to_test = [] recap = [[]] id_dif = [] col_id = [] + modulation =[] for line in inf: recap[0].append(line[0]) + modulation.append(line[-1]) for ind_head, headers in enumerate(inf[0][1:-1]): if headers not in id_dif: id_dif.append(headers) @@ -681,18 +636,24 @@ def opti_multimapping(file, outfolder, mapping="YES"): if col_actu[index_change] != "NA": to_test.remove(col_actu[index_change]) if len(cpdb_o_opti) == len(inf[1:]) or n_col == col_id[i_t_i][-1]: - if mapping == "YES": + if mapping == "all": cpdbf = outfolder+acctype+"_mapping_opti_cpdb.xlsx" m_ora_cpdb(cpdb_o_opti, acctype, infos="all", ofile=cpdbf) l_opti_for_this_id[0] = "CPDB "+acctype + if mapping == "flow": + n_mapped.append(len(cpdb_o_opti)) recap.append(l_opti_for_this_id) break + if mapping == "flow": + i_map_opt = n_mapped.index(max(n_mapped)) + 1 + cpdbf = outfolder+recap[i_map_opt][0]+"_mapping_opti.xlsx" + datas_cpdb = m_ora_cpdb(cpdb_o_opti, acctype, infos="flow", ofile=cpdbf) for line in inf[1:]: to_test.append(line[1]) l_opt_ramp = [] l_opt_ramp_tri = ["NA" for i in range(len(inf))] n_meta_map = 0 - ramp_outf = FOLDER+"optimapping_ramp.xlsx" + ramp_outf = outfolder+"optimapping_ramp.xlsx" n_meta_map, l_opt_ramp = mapping_ramp_api(to_test, ramp_outf, inf="opti") if n_meta_map == len(inf)-1: mapping_ramp_api(l_opt_ramp, ramp_outf, inf="all") @@ -708,7 +669,6 @@ def opti_multimapping(file, 
outfolder, mapping="YES"): l_opt_ramp_tri[index_l+1] = li[1] input_col = 2 while n_meta_map != len(inf)-1 and input_col != (len(inf[0])-1): - # prend en compte la derniére colones de fold-change if len(to_test) != 0: n_sup, s_map = mapping_ramp_api(to_test, ramp_outf, inf="opti") n_meta_map += n_sup @@ -725,20 +685,27 @@ def opti_multimapping(file, outfolder, mapping="YES"): for ind_ind in index_still: if inf[ind_ind][input_col] != "NA": to_test.append(inf[ind_ind][input_col]) - if mapping == "YES": - mapping_ramp_api(l_opt_ramp, ramp_outf, inf="all") + if mapping == "all": + n_map, datas_ramp = mapping_ramp_api(l_opt_ramp, ramp_outf, inf="all") + if mapping == "flow": + ramp_outf = outfolder + "ramp_mapping_opti.xlsx" + n_map, datas_ramp = mapping_ramp_api(l_opt_ramp, ramp_outf, inf="flow") + print("lines Ramp", n_map) l_opt_ramp_tri[0] = "RAMP" recap.append(l_opt_ramp_tri) - recap = pd.DataFrame(data=recap).transpose() - n_out_f = outfolder+"recap_mapping_opti_oeil.xlsx" - excel_file_writer(recap, n_out_f, sheetname="Resultats") + recap.append(modulation) + df_recap = pd.DataFrame(data=recap).transpose() + n_out_f = outfolder+"recap_multimapping.xlsx" + excel_file_writer(df_recap, n_out_f, sheetname="Resultats") + if mapping == "flow": + return datas_cpdb, datas_ramp, recap return "all is ok" if __name__ == "__main__": F_ENTER = FOLDER+"Donnees_oeil_mis_en_forme_opti_mapping.xlsx" - #opti_multimapping(F_ENTER, FOLDER) + opti_multimapping(F_ENTER, FOLDER) F_O = FOLDER + "test_enrichment_ramp.xlsx" - a, b = mapping_ramp_api(["KEGG:C01157","hmdb:HMDB0000064","hmdb:HMDB0000148","chebi:16015"], F_O, inf="all") - b = pd.DataFrame(data=b).transpose() - excel_file_writer(b, F_O, sheetname="Resultats") \ No newline at end of file + #a, b = mapping_ramp_api(["KEGG:C01157","hmdb:HMDB0000064","hmdb:HMDB0000148","chebi:16015"], F_O, inf="all") + #b = pd.DataFrame(data=b).transpose() + #excel_file_writer(b, F_O, sheetname="Resultats") \ No newline at end of file diff --git 
a/Visualisation_des_donnes_de_mapping.py b/Visualisation_des_donnes_de_mapping.py index 926f7f4..2ebdf96 100644 --- a/Visualisation_des_donnes_de_mapping.py +++ b/Visualisation_des_donnes_de_mapping.py @@ -175,6 +175,31 @@ def boite_a_metabolites(file, title_plot="boîte à moustache", num_col_plot=1): plt.show() +def barplot(column_x, column_y, df_data, title="barplot", figure_size=(30, 5), + ax_x_label="voies métaboliques", ax_y_label='Recouvrement moyen', + colors='Spectral', decimal='%.1f', size_of_labels=6): + """ + drawn barplot from data + + Arg: + column_x = data for plot absises axis + column_y = data for plot ordinate axis + df_data = dataframe of data to plot + + Returns: + Type de retour: + """ + plt.subplots(1, 1, figsize=figure_size) + p1 = sns.barplot(x=column_x, y=column_y, data=df_data, palette=colors) + plt.subplots_adjust(top=0.90, bottom=0.26) + p1.set(title=title) + plt.xlabel(ax_x_label) + plt.ylabel(ax_y_label) + p1.bar_label(p1.containers[0], fontsize=7, fmt=decimal) + p1.tick_params(axis='x', rotation=90, size=0.05, labelsize=size_of_labels) + return p1 + + def up_down_path_plot(l_path, up, down, log_p): """ plot regulation of pathways diff --git a/__pycache__/Visualisation_des_donnes_de_mapping.cpython-310.pyc b/__pycache__/Visualisation_des_donnes_de_mapping.cpython-310.pyc index e273d9b1eabea1642dccb1dcff7fafa4f1eeb334..a1fe174ff1b8ff659683d23c083016ede401ef4f 100644 GIT binary patch literal 7532 zcmc&(TZ|)TR(4%&m)-9Bb&`p9!a%$svrJZE!zjsgHp3=C!(=v>U8o&YRQoTx%Tum$ z|5cv8q%0Oq*tga`2YBdV5mLV_0)d3Y8}rB_ka$>8FMzm2Ezn8`<t0KHzH@%t-RVpq zZ(A+@_e<6J&-u>xpH60GoD!b?pZ{_2oo1=@FH|}ESg35`m;MNeD7ht}2|d#Kx~pqc zHX_3{Ik#M!^K!r9R>ysgTN6JcY*Bt+bL-VgRTGzmBWkG4h@TZRqJdJQI*az76Z2vL zwOR3uSQ5)9%~e-1<AP|3C&UWs^WsTy@%@s!5G;zP?&?x>OQQJ)n!AMjOURdzUqZfu z{AuK?!9uVUEC(yW>V4B)3#v!!4@!71<8A)6zI-rn-q{Y*W<L?bC}^T2f;8;LfoNt) zb0Cv;kfzOSJ7@|&^P9<5=7-cNhBA!1&3=&iTS*jVL9_1<257p1^{5iY&CW1xXJHc4 zs%>xJC0^f?&Ix2#YH7vlE8T9C^aJT<;l1G1XqaXJt$FLi-~9m3`Wu@kB=@#K<HyfM 
z<YoL`#V<Y3OQo#T({etJLw#gqdTtyU4@&nnt(2L$(X;MrBg$=I2=h=ISr}pU%GBR2 zjcokN_*Jq>R?RJ;tz+hW?MCT0o*s?sd3C*%YY_C;^U9Hvn^b>**<7nqY3JG8Mh}<s z^4T#v&tg>@Z{TFLqk8V-PDg*BjcP(0)$yD8%&uP<HS$JYxmS`qd0kjvD~;+qoyU$` zAI;{oJo0<_43C`Ic`vu}2HtiSzZ%x5ua&5l&#aYREbY9v)5-PGTsD)>kt&MHp>fp6 zYj~gee7-k3T7YC0#&TJNOqQ}rKF6|Xji6K`DAZ^*o1-z1&(a-zv^HAL*Yjn_=5O;A ze0Mot8_Q-DwKbN_U*?M_udu|HP+H~IKgs5M3!*BVLprC)KCh29u*U^a8$H32S<E-i zoX3;-1<bGIOVmmyg>U0>{v?%YU3|gk_S2W`{C75gw3t7^W9N^Sa^4}o@Bl`PH(C~T zGDa~g<_=Aa{x^(X5%bw<uHV;I)eiGVYivFXZ<kt&2mk0a@tkUI(~m{-UKs2udkNFb zkK3@RxOuhNR{HJmlviZ;8W+NNFwDG82+O<H<Sgm1p>gj$Kk~x36UHHWQ5gnJbdOuO z&ckAFym*RX@vy%oQyRDH#~BQD(x-NQE69d2PSx)8_V4TucvZ<ASm0`N5G5HeEH#)J zEVUZOBG~iVNmN*!DDgA2St7&C6B^;x1~O<TeSAtFiVBsYAPZVX;her&VFM~;*e=RZ z7~l1~K}$cG#pDw*?_;efcfxKO9t0-|>V>v<;v(<9xAeo;{`ug~uAh+6ADe~g?}cem zkx2&7NaDhzDMdB%w}Oa@2OCiU1PeQzKn8IJ*im9noAFDxTI+>A5QQTHUwFY@JHY1* zfH+xUZlkF%Bb-=K#%B)uF?Puk57Hd$7goCM4}!vye%uWTBMRce2G|9$DD-$xSV4b~ z?Mt9hsW3WW=9bm_ctth90t9fFHHfmp#zziiP?Xc*7D>4<Ijv=3`>r?erQgT4>2Q#R zS%fKRCh>*a$!_bS>zt~J>zpcvB*#@$37b2Z9+3E>7=;v$2#sFmE=L6C-XIxDFOxo? 
zx)0H~3#ZVXdg_JU3uoKJSbxVVSX2f{8j{rAnK*%x1zxh%3$oNLXOL*-^<BH;;cx@j z>X5FucF5Jj3c0#Ssu{@COVB+&7S|b<fcx%(4ANngrITK2TZ}yV8Z`OScF5PmlF`tu zjc0ptvg=l($wYUS?1oiV?c<ULhwZ?V?LwDr*TSoKgRz7L?jjA2v#=AwWW28Q2it*L z9->it;c-d+B4qqg56P?7?!24g6YlhJtnE8Dg7j{d4DQ@W(lo@_E{@IL@}i{eM|Vy| z?ZKmZB8lUN$3c4MTVXo%p=qqZ(=HkjcoP>2o!gj-zwg6tV14RSAe1`ZB;7;On5k=y zMn9APmb5voL3MpjtDr=!ODLgVSNTO<{~yybH64HEC$_GCVjC46SLamCb2R<Cv@ZRr zH7;qFu*w`(TR@w90h@ok@D_}Icu%rAfj{gA@d?>L+e9*Xswhky_IFXqN=I<ma62a8 z+Uyx{k5*>(;C%6?!%dT;w%}guQ8}}FWjHkhP;ccrQd3xYS=fixsFGKDm0S~Lav>;J z@pD84t^saE)D8`9t&JhQDrO1Na~rTd=QjXajBjLoO~1M2OU66`&4Hh7@A~^q0<?R9 z+&_ocpNG_JupX~>jmynWBH0VDHYu2f&cQ6kqqtL91no-%O92nlt_cE@p3)q)Pk=KG zd+a1Ep=|hz!@R;G1@sl13;_8VWRi~W8!fA_Vwvow$nXNGoI`720|;fXH_rEq%G-mW z4MmSymRz6#+J0dY>=fmJ4<7*B)BQc5yTw9be{y1D;K%0WD_1+mdQs^lfN278qtnlL z5xIx~iUWRdd|fW1)~d@5Wby(f!~x|~l(di#UNV-Zz)PTZ8K=phYNSC{R8dpHE1WiV zBJL_v`5Y#;s)b3%SvaSMSJWp0=Otz)H5RSBN_##}djjRVh2igAg+sZ@@Su{fQRfUX z0B@o|@1iMPL*g_XHdN=)oNjCOqq@EVV=^^Kj_z1LAt0A`khF>QP97@L;V|LdM1ct} z?ix&;N<BRXJr$t6IoXc+eo0t|Ht0;LXN+kck(<(y$+V>k{dW%j^{n!@BPi-149+P! 
zll^N=@`$9KR84uB(lV&xq-(5p#3i683Ao2;pK9QF!5%~^h~}`BOsd#C9<z-OsR8;q zBU2<J;JN<~X}P90rmtZ3#d9Q)i*ARXf4|-)o-%o;OiKPXekrMKR?7GiaUVoa=&R^x z8Mh>f{bTKNOP60k3En?}M`&qEN696eIrGYkk>B4E{>#6G_LNX?p>%M8Ki|V^U3rmx zLi#caY8>p#R+1}fgLXCn`=UG+Okpe86=i=27Xu@s*J1r=>7XD<TmA|fZo%-#3gnlO zfMbLKmJPaKCh`~1g5IxF?{8Alreq5VNF2%dyxvx)uS*mxB`;HN)ZsqtM-3gIAEMzS zvcQgfc)T+Vh}y!lHTw^bCwIV<-||xqfWV(Do)tJo;qx58IiSZu9GW9Lw|h1e++dg| zcVPD_!UW7)q}~kkcCRWb3id^vU_Nt>YOL-vILOAqf1Ok9$6&Zp?Nc0p7XGwR{EI1m zM+Q4Mi2bP#Q@XziIUk!!Q(r+@X#pL9{AEhW0_1B*4vfoJI`R#wk|8kk%9~Uo17YmF z^*P{siw3+&$!$u$Ny%H35V$I(WECt+vNQP(B?N}@Ta-+(RQ@_u&zaL4LnwWmrx@w` z7{sPZC^`q!oZu()GD6=+<TJLChd)Fd;^7g<2!aWIdpr14@PO2i@bnCpj=2a24w#FW zii-eGO=28SLStkhWH432tb9~q<uajM*1<opz!}oO(<Y&itz_?~s&vj5#7$q_RsptC zi8bln3CxRXH;F)tWg>8svltwd9<fG|9^GnHAM2M%HQ<O7g8WRW2ZI#rgMD5frm9fr zTc2^Ji(3gUES^8s0}3>yzmMh91`<*Rz!NJ26hWeiH4@&GNdoCGYsS4KJPT9+uT*Z* z=+`MBLuixxojf%ZrkItZLQa<mDhMbP0tln`4061=4mxT8ryPth1?VZrDI9L@^gvq` zIR)O?xy3Pu(h=~BO6afP2izjUnpY4BA;Ri_!ZKyetD?b#)q#@D9u#~aLi)Qg89HlQ zM<#t0Dd5g&M1hVDXN4QAx7U101?`B7@+5yN4O2*hZqYpL%;^2kj83EoW87UoJzR7= zCI@P5zI09)!j!u5+K>0zpa2Lp;4Z2{5X!BQua<X_eE2s%qUS%acmHtklJnyq{M~hM zL<0`6u(y&dOZsQ<Yl$X34!W2z))WN&Dh--KRWUmui=Hofzz9~(!ro(7T-f`ZpTn|N z&815<OLSV=cgtfz$ZueaqNdoiM~>BX{5@|^(M~S!t1^4wqAY@T*!Ls1L1XDMta!lV zfRMHMPq98@-&|usUGKmn6?goUr>$b#lq&uNm9f_9hZ?R^Cazhai$oGZ5h-kKkL=7q zloX@~moQwkptEIYG)D(T_0WPwsx~dRK`&W1iCpH^ku7GFM)%4TF=Ov+5is(_tP0yO z#2f{)#iCeJg!IS}%dGpAqnc0=!3x${n69&$)pJ}2#oB5~tg{uFy&16~E{G?%W+7gw zVy{+j7BOE0!o^df$#z0u{3ZWs_%&8NR(2V5hY{bh&s^+Mg^af;)`A!nu~nXN6XBO^ z8-D-hnDLG;f(RT@Fonkq2k2L~FAlJs36?Z>L(Im2(@lAbNMN%6_aq*2Zd#OdJ=*Lv z==U6&o*7JIfOHR>yrKG{bMZU+<kLk@$CAP0qPGI|I3WmSn*5=n$v?T?{VG3ye_igP zugh`x4^FNhG;Yv$jMqbq&xR%XSwencHx$`+VJX8Q;&@PhcPa)9o$UuA9QF})+!-bS zpOz`_)4uzZkXx5LY5g+2@mY!TKO}%YCjx^&yD8vNXi1?xcThhgjaE@tI}C;ZMl6p& zqc86U5>Y{1_R?%W3eN1acw1fY$N%tP^>8bbMkYpGY=q~Zu7mmWThh)4wIKq2vMIyv zc7~37Xp_H<1myr((b$5&^AIvb$ag!D<3%<$!#{fT=n>dF;!A;S8IrDoa+PN2+kR{A 
z3`!kP&m~HprfvgBUpTnLdJ2^ab3Y3ExJmDZ@t*v78pp`PL&gAR^!+{FT20)exMB*> z!y;T(NYkpiHo_1PM4=A{xUl1Jae}0C^4qlTuOV@*$io{jEu^dN^XEC*{BVGv)^5xT zaZuy0F)gJRo2ygosg7t7#gFVq4z3TCX_>CUD*!S_U4MTM6N-w5*jE^PUdwQ+6UDxI zT73Sr_*Dpz9Qz4j-3ep8aYFRsWQGz#!Ishei;KVWAK!f8+0E{ce?X4psm<>9|LUV> z?tFh?69;Bs46lKueu>K0C?QhZ(r?M{P;ST`weh7(b;$mtp;cDAH>HLlCA`Qcm8OKE zwNaEk2&nCO?reHD3Vaz~8OQKcsMsFh-vxfk3<^;eiZsj1SfiMSZ4rV+;ntPGK9vev z<*Y4;9tR+rGFE==1=^Nk2z)_fHhjv2<V3n0MD<+zrg!_5cV2h(cW=AqjW@r2%e8L4 z`I?6c{8yLnjAzc7SvkivDle-4557#YNNG#6R5vSps)#Qd`Ww3DJaRVlWzN_16^bxA SpH$W>muM9IQDoBm_<sS4C;Y$w delta 2693 zcmZ`)O^n;d73NSBMNyPQ{jUCOE5D8W6W6U>2ZiCJ^(LvE&9<?dq!!Q!lxJ4U4JArL zYO^t@sOkC;q;QL3u08DL<^Tl><kFmS$hAKx+7r>93j?_Z1p*iCdn0+hS+o-P&71e$ zynhbA`0ig$=lpEeRN(hu`Y{Wezs|Mhzk?lBDDjjb=@Npp>Z#p?T(xdeuKF<5HI}k; z*Q70)r1~Aw&Cp{sO--OIdYoD`3sjb#pf)W4m7{H1qGh1+v_h+Ql&;MRbmeW8zcEl~ z?IY4H4lXLR4nzqU4Oq*tHeszWn-y7!m04w1>sFbux3aH50O}3-bRH8Ua+UmC{GMFZ z2Fffgh+3MPQIe9n5}EK2@Uj{iduF8JejhTEI}@qNnMenTq@&bfoynOn9qGGdmI*U^ zR+Nb{8|prpS(MDO@Z>@R<kHJ7*(IyWPtVTsQ9d%Z6&~cGEKOciX4%Oms)~IG$&PF( z{2<CnS#C0ml2IN4wBfPfXtt){F3Q!EOUh(6*^Ja#A<RYvs?!uT?k4tfkp&?XqvBS6 zRtnXqv<oRf(om~eB{ZUf4CK_T3i}m!YBEUB+9lT^!t$&hm7+R0sX|Tu8m&OSDcsAN z)L>VaHTf(m1HB@-6`*R8`$=eT6|l#5!6`Zu?<Xqra@43R%L+U(I~FxT--;^8#SVZk zNk_+!MmrGp|CM7plYfQwUNLG(U3;$-Ne@wTUxl;wRvBlD=4hVUcQv9sF}V-w6<P?Z zk-AG(7B1}Nj3_shPH|n-l7CV=HE};V7F6eHgF2zp3!T3BhyDY-d*YVw^@N$aFjyHn zA$M=F-q@e=UdSCcWJ5LzySB464Hk6Ia|a{VEgs^)d4B{v%N<cXz0k-lcMqk-kw5Hn z)}u^JQYYu_H5V+xt_OVZ<ucw27#Ql^^3WIWKEBlGEe-uI9eb(VV{i_F4!WhqsgVnj zADC(!W=eA>4}`J4u-{FOro-NVJL64wg)yOEu@nDERqagRPlIJ7_Y{6!JZ6ORRY<`Z zLB_IfI}hEtqk#ceNiYV;sTMJbrJ6)bXaxUd)h0OtTv!dDG!=LW1GF@?3>=djSJekv zs!Vj%kWvdYG-AV<97xrlfjwd+EaFIW<Z<|$ATE<42>5rv>z7vmt~02}HkTc25B+!9 zs7+bWrtZeZ=l3|YHE55WaPuAK-JtFFLpGYCblM&|<1wTF%58BqeS&--w(?DJCr$IA zwr9z^2j|LO7bG(ww?&XSOR}O}sE}vGr<v#Gr@=Qr3(!f&+Dop-;$rBA*!k(ApzdVK z;^R<l#(f+B&w<{_5$ZV$4aVaFcH<%2bNbBV-$cXb0OG`kJBSm`&be3}pX29|FW~}E zSK`LD8%!O~4V=)0dE$ra@H!|2Hh>8wtC1FTr@AQHo4~xM-p5Y@zbSrUy`^K4>2>k9 
z?AqyPfg2~eKONBy;YHN>y3^RW<T=AWbuJIYTJEj+O;G8mal-R&@t2WK@@Mc=2Ov(2 zT^8#e8?X^w6e<~OtME>C^)*a1=;~LX9x%+Q7k9><Pg$JY2;AH3sM9pohxfpo19k(9 z%Yq3~Nl69JiT)s$gozjc|I3TwbpCj&O~2ueLTLZzA4B_(H2Yfqm+P2xcAZ~_BOPs# z>GMGI7ZAROfTP6;!ixa66Hh(6!C%6yEK3Qgs|Xr^_)VrYe+74I2(KcnBm59S4vH*` zoR$>-5prKcxQ=iG;h^dK4cs0n`;%X9^#*4!Tku8(y&;1O&iCA~$%41xWRL^Et~Va_ z+!db{wohTo+IpNG_#R~=?$b^pPRhzIvX=Q{)}1s?xy&a{^?htjxX9^I68h034&sm1 z?;jo2xYYOIiiN2^h?WKKV6a$A3|50a%1-{<(x8(*us`{<dQ;8gaE1Oj*86@K`a`i& z{}%DYwfa-`Ht6sW0WEP)%<E4Z0gy5AzY@Q#pVycCf7ZW!QtF}bc4j&rGv4OzU^DEf zgFpTOe_uS$IS4x{?~4~2ml`t9YiKJUO#7IB&~qq$8^ni=PU{p%_$rFcfQ7vs`P#;b zp|iuO_;cfu{$o^lT@;#?%%Mh#bIt16_dygq31C}rC&EQmhBp&$Izz5ljH<l<iXi2` z67MzNyKoWBJ%@nv*iqN{PjF4}P2?Gm1bpiu13rVj)+=W)kvJ7F7!f|=azgRm<ZmM2 z`@&rWtT*`jd+W?=IHHH9Plk`z4ZQh7pH4k?8Dj{Z2Cx(ZqQ=P0g!!sznWkwOrqM<& K{_*zLzWOhf6P=s@ diff --git a/complete_processing_of_mapping_results.py b/complete_processing_of_mapping_results.py index c0eb5d3..28c5e32 100644 --- a/complete_processing_of_mapping_results.py +++ b/complete_processing_of_mapping_results.py @@ -11,53 +11,11 @@ import pandas as pd from math import log, floor import sys sys.path.append('C:\\Users\\mumec\\Desktop\\Dossier_gitlab_local\\traitement_des_données') -from Visualisation_des_donnes_de_mapping import up_down_path_plot +from Visualisation_des_donnes_de_mapping import up_down_path_plot, barplot +from utils import column_recovery, comma_cleaning, cor_index, excel_file_writer, excel_m_file_writer LOCAL = "C:\\Users\\mumec\\Desktop\\fichier_mis_en_forme_programme_total\\" - -def column_recovery(file, n, sep=";", enc=None): - """ - Put the culomn n of the file in list - - Arg: - file : A csv file to read - n : the number of column to read - sep : type of separator - - Returns: - Type of return: list - """ - with open(file, "r", encoding=enc) as f: - r = csv.reader(f, delimiter=sep) - lines = list(r) - res = [] - if abs(n) < len(lines[0]): - for line in lines: - if line[n].strip() != '': - res.append(line[n].strip()) - return 
res - - -def cor_index(list_objects_to_convert, l_all_obj, l_all_equ): - """ - Change elements of a list by the correspondance elements - - Arg: - list_objects_to_convert : list of object - l_all_obj : list who countain all objet to convert - l_all_equ : correspondance list of all object - - Returns: - Type of return: list - """ - l_to_return = [] - for item_to_replace in (list_objects_to_convert): - l_to_return.append(l_all_equ[l_all_obj.index(item_to_replace.strip())]) - - return l_to_return - - -def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file): +def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file, flow=False): """ Give a list of pathways with the correspondent metabolites names @@ -68,10 +26,16 @@ def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file): Returns: Type of return: list """ - column_pathways_name = column_recovery(ramp_mapping_result, 0) - c_input_id = column_recovery(ramp_mapping_result, 3) - associated_name = column_recovery(correspondence_file, 0) - list_aso = column_recovery(correspondence_file, 1) + if flow == True: + column_pathways_name = ramp_mapping_result[0] + c_input_id = ramp_mapping_result[3] + list_aso = correspondence_file[-2] + associated_name = correspondence_file[0] + else: + column_pathways_name = column_recovery(ramp_mapping_result, 0) + c_input_id = column_recovery(ramp_mapping_result, 3) + associated_name = column_recovery(correspondence_file, 0) + list_aso = column_recovery(correspondence_file, 1) all_pathways = [] m_id_asso_p = [] @@ -84,14 +48,14 @@ def recup_ramp_pathways_list(ramp_mapping_result, correspondence_file): m_id_asso_p[all_pathways.index(c_p_n)].append(c_input_id[number]) for path_num, pathways in enumerate(all_pathways): - pat = cor_index(m_id_asso_p[path_num], list_aso, associated_name) + pat = cor_index(m_id_asso_p[path_num], list_aso, associated_name) ######ça bloque a cause de l'output du mutlti mapping ['....'] pat.insert(0, pathways) 
l_to_return.append(pat) return l_to_return -def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file): +def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file, flow=False): """ Give a list of pathways with the correspondent metabolites names @@ -102,21 +66,49 @@ def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file): Returns: Type of return: list """ + if flow == True: + l_pathways = cpdb_mapping_result[2] + l_path_metabo_whith_top = cpdb_mapping_result[5] + id_use = l_path_metabo_whith_top[1][0] + for itraverse, traverse in enumerate(correspondence_file): + if id_use in traverse: + associated_chebi = correspondence_file[itraverse] + break + associated_name = correspondence_file[0] + p_value = cpdb_mapping_result[0] + m_inp_ol = cpdb_mapping_result[8] + l_pathways = l_pathways[1:] + l_path_metabo = l_path_metabo_whith_top[1:] + l_to_return = [] + + for num_path_t, l_p_m in enumerate(l_path_metabo): + path_cont = l_p_m + paths_to_rec = cor_index(path_cont, associated_chebi, associated_name) + paths_to_rec.insert(0, l_pathways[num_path_t]) + paths_to_rec.insert(0, m_inp_ol[num_path_t + 1]) + paths_to_rec.insert(0, p_value[num_path_t + 1]) + l_to_return.append(paths_to_rec) + print(paths_to_rec) + return l_to_return + associated_name = column_recovery(correspondence_file, 0) associated_chebi = column_recovery(correspondence_file, 1) l_pathways = column_recovery(cpdb_mapping_result, 2) - l_pathways = l_pathways[1:] l_path_metabo_whith_top = column_recovery(cpdb_mapping_result, 5) - l_path_metabo = l_path_metabo_whith_top[1:] p_value = column_recovery(cpdb_mapping_result, 0) m_inp_ol = column_recovery(cpdb_mapping_result, 8) + l_pathways = l_pathways[1:] + l_path_metabo = l_path_metabo_whith_top[1:] l_to_return = [] + for num_path_t, l_p_m in enumerate(l_path_metabo): + print(l_p_m) path_cont = [] if (len(l_p_m)) > 6: # regulation a vérifier comma_pos = [] for index, t_l_p_m in enumerate(l_p_m): - if t_l_p_m == ",": # probleme 
entre ; et , + print(t_l_p_m) + if t_l_p_m == ";": # probleme entre ; et , comma_pos.append(index) for n_comma in range(len(comma_pos)+1): if n_comma == 0: @@ -135,6 +127,7 @@ def recup_cpdb_pathways_list(cpdb_mapping_result, correspondence_file): paths_to_rec.insert(0, m_inp_ol[num_path_t + 1]) paths_to_rec.insert(0, p_value[num_path_t + 1]) l_to_return.append(paths_to_rec) + print(paths_to_rec) return l_to_return @@ -192,52 +185,6 @@ def recup_ma_pathways_list(ma_mapping_result, number_of_columns): return l_to_return -def comma_cleaning(str_to_clean): - """ - Replace potential ',' by '_' - - Arg: - str_to_clean = list of character with potentialy ',' - - Returns: - Type of return: character - """ - if ',' in str_to_clean: - while ',' in str_to_clean: - str_to_clean = re.sub(",", "_", str(str_to_clean)) - return str_to_clean - - -def excel_file_writer(dataframe, n_o_f, sheetname="Resultats"): - """ - Take a dataframe and write an excel file with this data - - Arg: - dataframe = dataframe of data to write - n_o_f = name and acces path of the new excel file - sheetname = The name of the new sheet - """ - ex_f = pd.ExcelWriter(n_o_f) # pylint: disable=abstract-class-instantiated - dataframe.to_excel(ex_f, sheet_name=sheetname, header=False, index=False) - ex_f.close() - - -def excel_m_file_writer(list_of_dataframe, n_outf, list_of_sheetname): - """ - Take a list of dataframe and write an excel file with these data - - Arg: - list_of_dataframe = list of dataframe to write - n_outf = name and acces path of the new excel file - list_of_sheetname = list of sheets names to write - """ - e_f = pd.ExcelWriter(n_outf) # pylint: disable=abstract-class-instantiated - for df_index, l_o_d in enumerate(list_of_dataframe): - s_n = list_of_sheetname[df_index] - l_o_d.to_excel(e_f, sheet_name=s_n, header=False, index=False) - e_f.close() - - def pathways_selection(list_of_list_to_select, list_of_object_to_filter): """ Only keep the object they are not in the filter list @@ -342,35 
+289,9 @@ def df_matrix_r(sim_matrix): return look_like -def barplot(column_x, column_y, df_data, title="barplot", figure_size=(30, 5), - ax_x_label="voies métaboliques", ax_y_label='Recouvrement moyen', - colors='Spectral', decimal='%.1f', size_of_labels=6): - """ - drawn barplot from data - - Arg: - column_x = data for plot absises axis - column_y = data for plot ordinate axis - df_data = dataframe of data to plot - - Returns: - Type de retour: - """ - plt.subplots(1, 1, figsize=figure_size) - p1 = sns.barplot(x=column_x, y=column_y, data=df_data, palette=colors) - plt.subplots_adjust(top=0.90, bottom=0.26) - p1.set(title=title) - plt.xlabel(ax_x_label) - plt.ylabel(ax_y_label) - p1.bar_label(p1.containers[0], fontsize=7, fmt=decimal) - p1.tick_params(axis='x', rotation=90, size=0.05, labelsize=size_of_labels) - return p1 - - def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", fold_of_visu_sav=LOCAL, midfile="Yes", - midfile_name=LOCAL+"\\mid_file.xlsx", n_path_to_filt="nothing", modul=None, f_modul=None): """ Do the complet treatement of mapping results @@ -384,16 +305,21 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", save_plot = possibility to specify or not the 3 plots fold_of_visu_sav = folder where save plot(s) midfile = if yes the midfile while be write - midfile_name = name of the output midfile n_path_to_filt = list of object to filter """ if mapper == "CPDB": - c_file = input("In which file is the correspondence table?") - l_of_pathways_list = recup_cpdb_pathways_list(file, c_file) + if modul == "flow": + l_of_pathways_list = recup_cpdb_pathways_list(file, f_modul, flow=True) + else: + c_file = input("In which file is the correspondence table?") + l_of_pathways_list = recup_cpdb_pathways_list(file, c_file) elif mapper == "RAMP": - c_file = input("In which file is the correspondence table ?") - l_of_pathways_list = recup_ramp_pathways_list(file, c_file) + if modul == "flow": + l_of_pathways_list = 
recup_ramp_pathways_list(file, f_modul, flow=True) + else: + c_file = input("In which file is the correspondence table ?") + l_of_pathways_list = recup_ramp_pathways_list(file, c_file) elif mapper == "ME": fold = input("in which folder are the files?") # no "" around acces n_files = int(input("how many files you have in the folder?")) @@ -409,16 +335,24 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", log_p = [] metabo = column_recovery(f_modul, 0) value_modul = column_recovery(f_modul, 1) - #print(l_of_pathways_list) + if modul == "flow": + list_path = [] + up = [] + down = [] + log_p = [] + metabo = f_modul[0] + value_modul = f_modul[-1] for i_p_l, path_l in enumerate(l_of_pathways_list): - l_of_pathways_list[i_p_l][2] = comma_cleaning(path_l[2]) - if modul == True: + if mapper != "RAMP": + l_of_pathways_list[i_p_l][2] = comma_cleaning(path_l[2]) + else: + l_of_pathways_list[i_p_l][0] = comma_cleaning(path_l[0]) + if modul in (True, "flow") and mapper != "RAMP": actu_up = 0 actu_down = 0 list_path.append(l_of_pathways_list[i_p_l][2]) for path_meta in path_l[3:]: # print(comma_cleaning(path_meta)) Probable probléme de version entre ME et les autres (a vériifer) - #print(path_meta) if mapper == "ME": if float(value_modul[metabo.index(comma_cleaning(path_meta))]) >= 0: actu_up += 1 @@ -432,7 +366,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", up.append((actu_up/int(path_l[1]))*100) down.append((actu_down/int(path_l[1]))*100) log_p.append(-log(float(path_l[0]))) - if modul == True: + if modul in (True, "flow") and mapper != "RAMP": n_m_i_p = 200 if len(log_p) > n_m_i_p: print(len(log_p)) @@ -454,13 +388,14 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", plt.savefig(fold_of_visu_sav+"up_down_path_plot"+str(under_plot + 1)+".png") else : plot = up_down_path_plot(list_path, up, down, log_p) - plt.savefig(fold_of_visu_sav+"up_down_path_plot.png") + plt.savefig(fold_of_visu_sav+ mapper 
+"up_down_path_plot.png") if midfile == "Yes": + midfile_name = outf + mapper + "fmid_file.xlsx" mid_data = pd.DataFrame(l_of_pathways_list, dtype=object) excel_file_writer(mid_data, midfile_name, sheetname="Resultats") - for index_cleaning, full in enumerate(l_of_pathways_list): - l_of_pathways_list[index_cleaning] = full[2:] - #print(l_of_pathways_list) + if mapper != "RAMP": + for index_cleaning, full in enumerate(l_of_pathways_list): + l_of_pathways_list[index_cleaning] = full[2:] if n_path_to_filt != "nothing": l_path_l_treat = pathways_selection(l_of_pathways_list, n_path_to_filt) else: @@ -481,6 +416,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", sum_l = 0 for num_col in range(n_path_to_treat): shared = 0 + p1 = l_path_l_treat[num_line][1:] p2 = l_path_l_treat[num_col][1:] for metabolite_search in (p1): @@ -489,9 +425,12 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", mir_table[num_line, num_col] = shared sum_l += shared if len(p1) == 1: + print(p1) one_metabo_path.append(l_path_l_treat[num_line][0]) met_one_met_path.append(l_path_l_treat[num_line][1]) + print(l_path_l_treat[num_line]) pathways_names.append(l_path_l_treat[num_line][0]) + for metabolite_of_p1 in p1: if metabolite_of_p1 not in all_metabolites: all_metabolites.append(metabolite_of_p1) @@ -513,9 +452,10 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", all_metabolites[n_metab] = comma_cleaning(a_m) approximate_table = np.array(mir_table, dtype=object) - metabo_f1, path_metabo_f1 = list_f_1(metabolite_frequency, all_metabolites, + metabo_f1, path_metabo_f1 = list_f_1(metabolite_frequency, all_metabolites, # probléme a régler pathways_of_metabo, pathways_names, l_path_l_treat) + meta_and_path_p = pa_metabo(all_metabolites, pathways_of_metabo) all_metabolites.insert(0, "Ensemble des métabolites") @@ -533,10 +473,12 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", for index_fm in range(1, 
len(all_metabolites)): all_metabolites[index_fm] = f_metabo[index_fm-1][1] metabolite_frequency[index_fm] = f_metabo[index_fm-1][0] + inf_shap = [[len(one_metabo_path), one_metabo_path, met_one_met_path], [len(path_metabo_f1), metabo_f1, path_metabo_f1], [len(all_metabolites), all_metabolites, metabolite_frequency]] + inf_shap.sort() counter = 0 metabo_f_order = [] @@ -558,19 +500,19 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", metabo_f_order_for_export = np.array(metabo_f_order, dtype=object) metabo_f_order_for_export = pd.DataFrame(data=metabo_f_order_for_export) - patways_reco_order = recov_pos_path_name(totale_recovery, average_recovery, - pathways_names) + patways_reco_order = recov_pos_path_name(totale_recovery, average_recovery, pathways_names) patways_reco_order_for_export = pd.DataFrame(data=patways_reco_order) df_matrix_table = df_matrix_r(approximate_table) + result_out_file = outf+ mapper+"resultats_traitment_mapping.xlsx" excel_m_file_writer([patways_reco_order_for_export, df_matrix_table, metabo_f_order_for_export, - meta_and_path_p], outf, + meta_and_path_p], result_out_file, ["Voies métaboliques", "Table de ressemblance", "Fréquence métabolites", "Métabolites et leurs P"]) - data_for_recovery_visualization = pd.DataFrame(data=patways_reco_order[1:]) colnames_recovery = list(data_for_recovery_visualization.columns) + print(data_for_recovery_visualization) if type_of_view in ("all", "bar_plot", "bar_plot_r", "bar_r_meta_p"): barplot(colnames_recovery[2], colnames_recovery[1], @@ -578,7 +520,7 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", title="Recouvrement moyen des différentes voies métaboliques", figure_size=(22, 10), size_of_labels=6) if save_plot in ("all", "bar_plot", "bar_plot_r", "bar_r_meta_p"): - plt.savefig(fold_of_visu_sav+"bar_plot_of_recovery.png") + plt.savefig(fold_of_visu_sav+mapper+"bar_plot_of_recovery.png") plt.show() just_frequency = [] @@ -598,7 +540,7 @@ def 
c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", figure_size=(22, 10), ax_x_label="Métabolites d'intérêt", ax_y_label='Fréquence', decimal='%.0f', size_of_labels=7) if save_plot in ("all", "bar_plot", "bar_plot_f", "bar_f_meta_p"): - plt.savefig(fold_of_visu_sav+"bar_plot_of_metabolites.png") + plt.savefig(fold_of_visu_sav+mapper+"bar_plot_of_metabolites.png") plt.show() if type_of_view in ("all", "meta_box", "bar_f_meta_p", "bar_r_meta_p"): @@ -607,20 +549,18 @@ def c_p_o_m_r(file, outf, mapper, type_of_view="all", save_plot="all", b1.set(title="Boîte à moustache des fréquences des métabolites") plt.ylabel("fréquence des métabolites") if save_plot in ("all", "meta_box", "bar_f_meta_p", "bar_r_meta_p"): - plt.savefig(fold_of_visu_sav+"metabolites_bo_of_frequency.png") + plt.savefig(fold_of_visu_sav+ mapper+"metabolites_bo_of_frequency.png") plt.show() - if __name__ == "__main__": #MAP = 'RAMP' MAP = "CPDB" #MAP = "ME" VIEW = "all" SAVE = "all" - INFILE = LOCAL + "ora_cpdb_data_yeux_reactome_rev_18-01-2024.csv" + INFILE = LOCAL + "CPDB\\Resultats_mapping_Chebi_ID_L100_CPDB.csv" #INFILE = "ExportExcel_6843" #INFILE = LOCAL + "RAMP\\sortie_Mapping_RAMP_L100_CheEBI.csv" - FINISHFILE = LOCAL + "test_oeil.xlsx" - FILE_MODUL = LOCAL + "chebi_modulation_intensite_patho_oeil_donnes_estelles_rev_19-01-2024.csv" - #FILE_MODUL = LOCAL + "CPDB\\liste_Chebi_des_100_chebi_ConsensusPAthDB_modul.csv" + FINISHFILE = LOCAL + "test.xlsx" + FILE_MODUL = LOCAL + "CPDB\\liste_Chebi_des_100_chebi_ConsensusPAthDB_modul.csv" c_p_o_m_r(INFILE, FINISHFILE, MAP, type_of_view=VIEW, save_plot=SAVE, modul=True, f_modul=FILE_MODUL) diff --git a/main.py b/main.py new file mode 100644 index 0000000..b791e2c --- /dev/null +++ b/main.py @@ -0,0 +1,62 @@ +""" +This module is designed to process the data obtained during metabolite mapping. 
def shapping_data(file, folder):
    """
    Read the input Excel file and save a copy formatted for the mapping step.

    Arg:
        file : Excel file with the data obtained after analysis
        folder : folder (with its trailing separator) in which the file
                 "Datas_mis_en_forme_pour_le_mapping.xlsx" is written

    Returns:
        Type of return: list of rows (one list per Excel line) and 1 file .xlsx
    """
    datas_for_mapping = recup_all_inf_excel(file)
    # TODO: when a "chebi" column is present, extend it with the related
    # ChEBI IDs (chebi_horizontal / chebi_in_outgouing) before the mapping.
    # This enrichment is not implemented yet: the data are passed through
    # unchanged.
    df_dfm = pd.DataFrame(data=datas_for_mapping)
    # The folder is expected to end with a path separator, as FOLDER does.
    n_o_f = folder + "Datas_mis_en_forme_pour_le_mapping.xlsx"
    excel_file_writer(df_dfm, n_o_f)
    return datas_for_mapping
def _cpdb_members(raw_members, from_files):
    """Return the stripped metabolite identifiers of one pathway entry."""
    items = raw_members.split(",") if from_files else raw_members
    return [item.strip() for item in items]


def _cpdb_modulation(members, modul, position_of):
    """Sum the modulation of `members`; ValueError on an unknown metabolite."""
    total = 0
    for met in members:
        if met not in position_of:
            # Same exception type as the list.index() lookup used before.
            raise ValueError(f"{met!r} is not in list")
        total += modul[position_of[met]]
    return total


def Paths_link_CPDB(csv_file, out_file, int_file, bdd="Reactome", flow=None):
    """
    Build the edge and node tables of the pathway network of one database.

    Two pathways are linked by the number of mapped metabolites of the first
    one that are also mapped in the second one. The edge table is written to
    an Excel file (sheet "Network links").

    Arg:
        csv_file : when flow is None, path of a ';'-separated csv file read
                   with column_recovery; otherwise a list of columns. The
                   columns used are 0 (p-value), 2 (pathway name),
                   3 (source database), 5 (mapped metabolites) and
                   8 (pathway size) -- meaning presumed from the variable
                   names, to confirm against the CPDB export.
        out_file : name and access path of the Excel file to write
        int_file : when flow is None, path of a csv file whose columns 0, 1
                   and 2 hold the metabolite, the case value and the control
                   value (modulation = case - control); otherwise a list of
                   columns whose last one is the modulation and whose
                   column 1 or 2 holds the metabolite identifiers.
        bdd : name of the source database to keep
        flow : None to read the inputs from files, any other value when the
               inputs are already in memory

    Returns:
        Type of return: 2 lists of columns
            out_data : [source, target, n_edge, modulation, n_meta_map,
                        len_path, p_value], each starting with its title
            nodes : [l_paths, l_p_value, n_meta_int_in, l_len_path,
                     modul_path]

    Raises:
        ValueError: a mapped metabolite is missing from the metabolite list
                    of int_file, or a p-value cannot be converted to float
    """
    from_files = flow is None
    if from_files:
        all_l_paths = column_recovery(csv_file, 2)
        all_l_len_path = column_recovery(csv_file, 8)
        all_l_meta_in = column_recovery(csv_file, 5)
        all_l_p_value = column_recovery(csv_file, 0)
        all_source = column_recovery(csv_file, 3)
        l_all_meta = [meta.strip() for meta in column_recovery(int_file, 0)[1:]]
        int_cas = column_recovery(int_file, 1)[1:]
        int_tem = column_recovery(int_file, 2)[1:]
        modul = [float(cas) - float(int_tem[i_cas])
                 for i_cas, cas in enumerate(int_cas)]
    else:
        all_l_paths = csv_file[2]
        all_l_len_path = csv_file[8]
        all_l_meta_in = csv_file[5]
        all_l_p_value = csv_file[0]
        all_source = csv_file[3]
        # The identifier column depends on the ID type returned by the mapping.
        if 'HMDB' in all_l_meta_in[1][0]:
            l_all_meta = int_file[2][1:]
        else:
            l_all_meta = int_file[1][1:]
        modul = int_file[-1][1:]

    # Keep only the pathways of the requested database.
    l_paths = []
    l_len_path = []
    l_p_value = []
    l_meta_in = []
    for i_path, path_name in enumerate(all_l_paths):
        if all_source[i_path] == bdd:
            l_paths.append(path_name.replace(",", ";"))
            l_p_value.append(all_l_p_value[i_path])
            l_len_path.append(all_l_len_path[i_path])
            l_meta_in.append(all_l_meta_in[i_path])

    # NOTE(review): the first selected row is handled as a title row (it is
    # skipped here and by network_visu). If the title line of the input does
    # not carry `bdd` in its source column, this row is actually the first
    # pathway of the database and is silently dropped -- to confirm.
    # float() parses plain and scientific notation alike ("1.2e-05").
    l_p_value[1:] = [float(p_val) for p_val in l_p_value[1:]]

    members = [_cpdb_members(raw, from_files) for raw in l_meta_in[1:]]
    # Exact membership test: a substring test on the raw text would make
    # "C1" match "C10".
    member_sets = [set(mets) for mets in members]
    position_of = {}
    for position, meta in enumerate(l_all_meta):
        position_of.setdefault(meta, position)  # first occurrence wins

    edge_groups = []
    modul_path = ["modulation de la voie"]
    n_meta_int_in = ["numbers of metabolite of interest"]
    i_last = len(members) - 1
    for i_now, mets in enumerate(members):
        mod = _cpdb_modulation(mets, modul, position_of)
        if i_now == i_last:
            # The last pathway has no following pathway: a single self-edge
            # without link keeps it in the table.
            group = [[l_paths[i_now + 1], 0, mod, len(mets)]]
        else:
            group = []
            for i_next in range(i_now + 1, len(members)):
                links = sum(1 for met in mets if met in member_sets[i_next])
                group.append([l_paths[i_next + 1], links, mod, len(mets)])
        edge_groups.append(group)
        n_meta_int_in.append(len(mets))
        modul_path.append(mod)

    source = ["Source"]
    target = ["Target"]
    n_edge = ["n_edge"]
    modulation = ["Modulation"]
    n_meta_map = ["Metabo_map"]
    len_path = ["Number of metabolites in Pathway"]
    p_value = ["p-value"]
    # Group number i belongs to the pathway at position i of the selection
    # (position 0 being the title row).
    for i_group, group in enumerate(edge_groups, start=1):
        for target_name, links, mod, n_mapped in group:
            source.append(l_paths[i_group])
            target.append(target_name)
            n_edge.append(links)
            modulation.append(mod)
            n_meta_map.append(n_mapped)
            len_path.append(l_len_path[i_group])
            p_value.append(l_p_value[i_group])

    out_data = [source, target, n_edge, modulation, n_meta_map, len_path, p_value]
    nodes = [l_paths, l_p_value, n_meta_int_in, l_len_path, modul_path]
    network = pd.DataFrame(data=out_data).transpose()
    excel_file_writer(network, out_file, sheetname="Network links")
    return out_data, nodes
def network_visu(edge, nodes, bdd="Reactome"):
    """
    Send the pathway network to Cytoscape and apply the visual style.

    A running Cytoscape instance reachable by py4cytoscape is required.

    Arg:
        edge : list of columns [source, target, weight]; the first element
               of each column is its title and the last one is the self-edge
               of the last pathway, both are left out
        nodes : list of columns [pathway names, p-values, number of mapped
                metabolites, pathway sizes, pathway modulation]; the first
                element of each column is left out
        bdd : name of the database, used in the title of the network

    Returns:
        Type of return: list of the 3 p-value points [min, centre, max] used
        for the label colour mapping

    Raises:
        ValueError: there is no node to display
    """
    source = nodes[0][1:]
    p_value = nodes[1][1:]
    n_meta_in_path = nodes[2][1:]
    len_tot_path = nodes[3][1:]
    modul_path = nodes[4][1:]
    source_for_target = edge[0][1:-1]
    target = edge[1][1:-1]
    weight_ege = edge[2][1:-1]
    if not source:
        # min()/max() below would fail with a less explicit ValueError.
        raise ValueError("no pathway to display for " + bdd)

    p4c.cytoscape_ping()
    p4c.cytoscape_version_info()
    df_nodes = pd.DataFrame(data={'id': source, 'p value': p_value,
                                  'N metabolites mapped': n_meta_in_path,
                                  'N metabolites in pathway': len_tot_path,
                                  'Pathway modulation': modul_path})
    df_edges = pd.DataFrame(data={'source': source_for_target, 'target': target,
                                  'weight': weight_ege})
    p4c.create_network_from_data_frames(nodes=df_nodes, edges=df_edges,
                                        title="CPDB_network_"+ bdd,
                                        collection="Network_from_mapping")

    # Fixed style parameters.
    p4c.set_node_shape_default('ELLIPSE')
    p4c.set_node_font_size_default(17)

    # Node colour: continuous mapping on the number of mapped metabolites.
    nmm_min = min(n_meta_in_path)
    nmm_max = max(n_meta_in_path)
    nmm_c = nmm_min + (nmm_max - nmm_min)/2
    p4c.set_node_color_mapping('N metabolites mapped', [nmm_min, nmm_c, nmm_max],
                               ['#e6eeff', '#6699ff', '#000099'],
                               mapping_type='c')

    # Label colour: continuous mapping on the p-value; the middle point is
    # placed at the first third of the range.
    pv_min = min(p_value)
    pv_max = max(p_value)
    pv_c = pv_min + (pv_max - pv_min)/3
    p4c.set_node_label_color_mapping('p value', [pv_min, pv_c, pv_max],
                                     ['#145214', '#ffb3ff', '#4d004d'],
                                     mapping_type='c')

    # Edge width: continuous mapping on the edge weights themselves (the
    # range must come from the 'weight' column, not from the p-values).
    if weight_ege:
        w_min = min(weight_ege)
        w_max = max(weight_ege)
        w_c = w_min + (w_max - w_min)/2
        p4c.set_edge_line_width_mapping('weight', [w_min, w_c, w_max],
                                        [0.5, 1.75, 3], mapping_type='c')

    # Node size: half of the total number of metabolites of the pathway.
    node_sizes = [int(ltp)/2 for ltp in len_tot_path]
    p4c.set_node_height_bypass(source, node_sizes)
    p4c.set_node_width_bypass(source, node_sizes)

    p4c.layout_network('degree-circle')
    return [pv_min, pv_c, pv_max]
"""
all function utils
"""
import csv
import re

import pandas as pd


def excel_file_writer(dataframe, n_o_f, sheetname="Resultats"):
    """
    Take a dataframe and write an excel file with this data

    Arg:
        dataframe = dataframe of data to write
        n_o_f = name and acces path of the new excel file
        sheetname = The name of the new sheet
    """
    # The context manager saves and closes the file even if writing fails.
    with pd.ExcelWriter(n_o_f) as ex_f:  # pylint: disable=abstract-class-instantiated
        dataframe.to_excel(ex_f, sheet_name=sheetname, header=False, index=False)


def column_recovery(file, n, sep=";", enc=None):
    """
    Put the column n of the file in list

    Empty cells are left out, so the returned list can be shorter than the
    number of lines of the file. Lines without a column n (blank or short
    lines) are ignored.

    Arg:
        file : A csv file to read
        n : the number of column to read
        sep : type of separator
        enc : encoding of the file (None = default encoding of the platform)

    Returns:
        Type of return: list of stripped strings, empty when the file is
        empty or when its first line has no column n
    """
    # newline="" lets the csv module handle the line endings itself.
    with open(file, "r", encoding=enc, newline="") as f:
        lines = list(csv.reader(f, delimiter=sep))
    if not lines or abs(n) >= len(lines[0]):
        return []
    res = []
    for line in lines:
        if not -len(line) <= n < len(line):
            continue
        cell = line[n].strip()
        if cell != '':
            res.append(cell)
    return res


def excel_m_file_writer(list_of_dataframe, n_outf, list_of_sheetname):
    """
    Take a list of dataframe and write an excel file with these data

    Arg:
        list_of_dataframe = list of dataframe to write
        n_outf = name and acces path of the new excel file
        list_of_sheetname = list of sheets names to write, one per dataframe

    Raises:
        IndexError: there are fewer sheet names than dataframes
    """
    with pd.ExcelWriter(n_outf) as e_f:  # pylint: disable=abstract-class-instantiated
        for df_index, l_o_d in enumerate(list_of_dataframe):
            s_n = list_of_sheetname[df_index]
            l_o_d.to_excel(e_f, sheet_name=s_n, header=False, index=False)


def comma_cleaning(str_to_clean):
    """
    Replace potential ',' by '_'

    Arg:
        str_to_clean = string with potentially ','

    Returns:
        Type of return: string without ','; the input itself when it holds
        no ','
    """
    if ',' in str_to_clean:
        # One pass replaces every comma.
        return str(str_to_clean).replace(",", "_")
    return str_to_clean


def pre_cut(listed):
    """
    Remove the ID-type prefix of every entry, "NA" entries are kept

    The cut position is the position of the ":" of the first entry that is
    not "NA"; it is applied to all entries, so the list must hold only one
    type of ID.

    Arg:
        listed: 1 list of id

    Returns:
        Type of return: 1 list; a copy of the input when it is empty or
        holds only "NA"

    Raises:
        ValueError: the first entry that is not "NA" has no ":"
    """
    first_id = next((elem for elem in listed if elem != "NA"), None)
    if first_id is None:
        return list(listed)
    pos_cut = first_id.index(":")
    return ["NA" if elem == "NA" else elem[pos_cut+1:] for elem in listed]


def recup_all_inf_excel(file):
    """
    This function takes infos from a .xlsx

    Arg:
        file = the file to read
    Returns:
        Type of return: 1 list of list line (empty cells are empty strings)
    """
    datas = pd.read_excel(file, header=None, na_filter=False)
    return datas.values.tolist()


def cor_index(list_objects_to_convert, l_all_obj, l_all_equ):
    """
    Change elements of a list by the correspondance elements

    Arg:
        list_objects_to_convert : list of strings to convert (stripped
                                  before the search)
        l_all_obj : list who countain all objet to convert
        l_all_equ : correspondance list of all object, same order as
                    l_all_obj

    Returns:
        Type of return: list

    Raises:
        ValueError: an object to convert is not in l_all_obj
    """
    # One lookup table instead of a list search per object; the first
    # occurrence wins, as with list.index().
    first_position = {}
    for position, obj in enumerate(l_all_obj):
        first_position.setdefault(obj, position)
    l_to_return = []
    for item_to_replace in list_objects_to_convert:
        key = item_to_replace.strip()
        if key not in first_position:
            raise ValueError(f"{key!r} is not in list")
        l_to_return.append(l_all_equ[first_position[key]])
    return l_to_return