diff --git a/corpora/florilege/dates.meta b/corpora/florilege/dates.meta new file mode 100644 index 0000000000000000000000000000000000000000..42ca6be6c60cc7a46831132ce70842f521b36cf4 --- /dev/null +++ b/corpora/florilege/dates.meta @@ -0,0 +1,10 @@ + +source,uri,valeur +cirm,/db/maj/cirm/date,17-02-2021 +genbank,/db/maj/genbank/date,27-01-2021 +dsmz,/db/maj/dsmz/date,26-01-2018 +pubmed,/db/maj/pubmed/date,01-09-2021 +ontobiotope,/db/maj/ontobiotope/date,09-04-2021 +eval,migale/evaluation/BB19/date,14-04-2021 +eval,https://sites.google.com/view/bb-2019,Bacteria Biotope 2019 +ncbi,migale/maj/ncbi/taxo/date,09-04-2021 diff --git a/corpora/florilege/labels.stats b/corpora/florilege/labels.stats index b3c558958c7a1be48aa5a28df3c0ef72c9b7de72..1238326878a0b36c83b00568f2613ba6b24b63e3 100644 --- a/corpora/florilege/labels.stats +++ b/corpora/florilege/labels.stats @@ -1,40 +1,43 @@ id,libelle,uri -cirm_000, date de mise à jour des données de cirm, /db/maj/genbank/date +cirm_000,date de mise à jour des données cirm,/db/maj/cirm/date cirm_001,nombre d'entrées de cirm-BIA,cirm/BIA_2021/florilege_export_final_17_02_21.xlsx -cirm_002,nombre d'entrées cirm-Levure,cirm/Levures_2021/Florilege_21012021.xlsx -cirm_003,nombre de taxons de cirm-BIA,cirm/mapped_taxids.txt +cirm_002,nombre d'entrées de cirm-Levure,cirm/Levures_2021/Florilege_21012021.xlsx +cirm_003,nombre de taxons de cirm-BIA,cirm/mapped_bia_taxa.txt cirm_004,nombre de taxons de cirm-Levure,cirm/mapped_yeast_taxa.txt -cirm_005,nombre d'habitats de cirm-BIA,cirm/mapped_habitats.txt -cirm_006,nombre d'habitat de cirm-Levure,cirm/mapped_yeast_habitats.txt -genbank_000, date de mise à jour des données de genbank,/db/maj/genbank/date +cirm_005,nombre d'habitats de cirm-BIA,cirm/mapped_bia_habitats.txt +cirm_006,nombre d'habitats de cirm-Levure,cirm/mapped_yeast_habitats.txt +cirm_007,nombre d'entrées de cirm-cfpb,cirm/CFBP_2020/CFPB_22_sept_2020_Type.xlsx +cirm_008,nombre de taxons de cirm-cfpb,cirm/mapped_cfbp_taxa.txt 
+cirm_009,nombre d'habitats de cirm-cfpb,cirm/mapped_cfbp_habitats.txt +genbank_000,date de mise à jour des données genbank,/db/maj/genbank/date genbank_001,nombre d'entrées de genbank,genbank/GenBank_extraction_20210127.tsv -genbank_002,nombre de taxon de genbank,genbank/mapped_taxids.txt +genbank_002,nombre de taxons de genbank,genbank/mapped_taxids.txt genbank_003,nombre d'entités du type #Habitat de genbank,genbank/mapped_habitats.txt dsmz_000, date de mise à jour des données de dsmz,/db/maj/dsmz/date dsmz_001,nombre d'entrées de dsmz,dsmz/dsmz-data/category=from_ncbi_taxonomy-key=taxid.tsv dsmz_002,nombre de taxon venant de dsmz,dsmz/mapped_taxids.txt dsmz_003,nombre d'entités de type #Habitat de dsmz,dsmz/mapped_habitats.txt pubmed_000, date de mise à jour du corpus pubmed,/db/maj/pubmed/date -pubmed_001,nombre de batches (x1000) pubmed,microbes-2019/list_of_batches.txt -pubmed_002,nombre d'entités du type #Habitat de pubmed,microbes-2019/habitats.full.txt -pubmed_003,nombre d'entités du type #Taxon pubmed,microbes-2019/microorganisms.full.txt -pubmed_004,nombre de relations du type #Phenotype-Taxon pubmed,microbes-2019/phenotype-relations.full.txt -pubmed_005,nombre de relations du type #Phenotype-Relations pubmed,microbes-2019/phenotype-relations.txt -pubmed_006,nombre d'entités du type #Phenotype de pubmed,microbes-2019/phenotypes.full.txt -pubmed_007,nombre de relations du type #Taxon-Habitat de pubmed,microbes-2019/relations.full.txt -pubmed_008,nombre de relations de type #Use pubmed,microbes-2019/uses.full.txt -pubmed_009,nombre de relations du type #Use-Taxon pubmed,microbes-2019/uses-relations.full.txt -eval_001, corpus utilisés, https://sites.google.com/view/bb-2019 -eval_002, date d'évaluation, migale/evaluation/BB19/date -eval_BB19-norm+ner_001, mesure pour l'evaluation de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Mesure -eval_BB19-norm+ner_002, score global sur la prédiction de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Standard_scoring 
-eval_BB19-norm+ner_002, score sur la prédiction des taxons de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Habitat -eval_BB19-norm+ner_003, score sur la prédiction des phénotypes de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Phenotype -eval_BB19-norm+ner_004, score sur la prédiction des habitats de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Microorganism -eval_BB19-rel+ner_001, mesure pour l'evaluation de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Mesure -eval_BB19-rel+ner_002, score global sur la prédiction de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Standard_scoring -eval_BB19-rel+ner_002, score sur la prédiction des Lives-In de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Lives_In -eval_BB19-rel+ner_003, score sur la prédiction des Exhibits de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Exhibits -eval_BB19-kb+ner_001, mesure pour l'evaluation de BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner#Mesure -eval_BB19-kb+ner_002, score moyen sur BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner#Standard_scoring -ontobiotope_000, date de mise à jour de ontobiotope,/db/maj/pubmed/date +pubmed_001,nombre de batches (x1000) pubmed,pubmed/list_of_batches.txt +pubmed_002,nombre d'entités du type #Habitat de pubmed,pubmed/habitats.full.txt +pubmed_003,nombre d'entités du type #Taxon pubmed,pubmed/microorganisms.full.txt +pubmed_004,nombre d'entités du type #Phenotype de pubmed,pubmed/phenotypes.full.txt +pubmed_005,nombre d'entités du type #Use de pubmed,pubmed/uses.full.txt +pubmed_006,nombre de relations du type #Taxon-Habitat de pubmed,pubmed/relations.full.txt +pubmed_007,nombre de relations du type #Phenotype-Taxon pubmed,pubmed/phenotype-relations.full.txt +pubmed_008,nombre de relations du type #Use-Taxon pubmed,pubmed/uses-relations.full.txt +eval_001,corpus utilisés,https://sites.google.com/view/bb-2019 +eval_002,date d'évaluation,migale/evaluation/BB19/date +eval_BB19-norm+ner_001,mesure pour l'evaluation de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Mesure +eval_BB19-norm+ner_002,score 
global sur la prédiction de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Standard_scoring +eval_BB19-norm+ner_002,score sur la prédiction des taxons de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Microorganism +eval_BB19-norm+ner_003,score sur la prédiction des phénotypes de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Phenotype +eval_BB19-norm+ner_004,score sur la prédiction des habitats de BB19-norm+ner,BioNLP-OST-2019/BB19-norm+ner#Habitat +eval_BB19-rel+ner_001,mesure pour l'evaluation de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Mesure +eval_BB19-rel+ner_002,score global sur la prédiction de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Standard_scoring +eval_BB19-rel+ner_002,score sur la prédiction des Lives-In de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Lives_In +eval_BB19-rel+ner_003,score sur la prédiction des Exhibits de BB19-rel+ner,BioNLP-OST-2019/BB19-rel+ner#Exhibits +eval_BB19-kb+ner_001,mesure pour l'evaluation de BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner#Mesure +eval_BB19-kb+ner_002,score moyen sur BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner#Standard_scoring +ontobiotope_000,date de mise à jour de ontobiotope,/db/maj/ontobiotope/date +ncbi_000,date de mise à jour de la taxo ncbi,migale/maj/ncbi/taxo/date diff --git a/corpora/florilege/stats.labels b/corpora/florilege/stats.labels deleted file mode 100644 index 3be2a5401b9528e6ec71754ad96d4fa957e3963a..0000000000000000000000000000000000000000 --- a/corpora/florilege/stats.labels +++ /dev/null @@ -1,22 +0,0 @@ -LIBELLE,file -entrées cirm,cirm/2019-07-05/extraction_3-fv.csv -entrées cirm (levure),cirm/Levures_2017/data_CIRM_levures_extraction_09032017.csv -taxid cirm,cirm/mapped_taxids.txt -yeast cirm,cirm/yeast_taxa.txt -habitats cirm,cirm/mapped_habitats.txt -habitats cirm (yeast),cirm/mapped_yeast_habitats.txt -entrées genbank,genbank/req1_sup800_bacteria-descriptors.csv -taxid genbank,genbank/mapped_taxids.txt -habitats genbank,genbank/mapped_habitats.txt -entrées 
dsmz,dsmz/dsmz-data/category=from_ncbi_taxonomy-key=taxid.tsv -taxid dsmz,dsmz/mapped_taxids.txt -habitats dsmz,dsmz/mapped_habitats.txt -batch (x1000) pubmed,microbes-2019/list_of_batches.txt -habitats pubmed,microbes-2019/habitats.full.txt -microorganisms pubmed,microbes-2019/microorganisms.full.txt -phenotype-relations pubmed,microbes-2019/phenotype-relations.full.txt -phenotype-relations pubmed,microbes-2019/phenotype-relations.txt -phenotypes pubmed,microbes-2019/phenotypes.full.txt -relations pubmed,microbes-2019/relations.full.txt -uses pubmed,microbes-2019/uses.full.txt -uses-relations pubmed,microbes-2019/uses-relations.full.txt diff --git a/generate_stats.snakefile b/generate_stats.snakefile index 40c7626407254592b5cda860fe4fa03f22f7cd4b..4e1d2eb3072e7c5f2bdd4d86dce7602a879ceec6 100644 --- a/generate_stats.snakefile +++ b/generate_stats.snakefile @@ -16,7 +16,7 @@ SOURCES=["cirm", "genbank", "dsmz", "pubmed", "BioNLP-OST-2019"] cirm | nb entrees | count_lines(corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx) cirm | nb yeast entrees | count_lines(corpora/cirm/Levures_2021/Florilege_21012021.xlsx) ''' -ENTREES_CIRM = ["BIA_2021/florilege_export_final_17_02_21.xlsx", "Levures_2021/Florilege_21012021.xlsx"] +ENTREES_CIRM = ["BIA_2021/florilege_export_final_17_02_21.xlsx", "Levures_2021/Florilege_21012021.xlsx", "CFBP_2020/CFPB_22_sept_2020_Type.xlsx"] rule stats_cirm_BIA: input: file="corpora/cirm/{file}" @@ -36,12 +36,14 @@ rule stats_cirm_BIA: df2.to_csv(output.stats, index=False) ''' -cirm | nb entites | count_lines(corpora/cirm/mapped_taxids.txt) +cirm | nb entites | count_lines(corpora/cirm/mapped_bia_taxa.txt) cirm | nb yeast entities | count_lines(corpora/cirm/mapped_yeast_taxa.txt) -cirm | nb entites | count_lines(corpora/cirm/mapped_habitats.txt) +cirm | nb entites | count_lines(corpora/cirm/mapped_bia_habitats.txt) cirm | nb yeast habitats | count_lines(corpora/cirm/mapped_yeast_habitats.txt) +cirm | nb entites | 
count_lines(corpora/cirm/mapped_cfbp_taxa.txt) +cirm | nb cfbp habitats | count_lines(corpora/cirm/mapped_cfbp_habitats.txt) ''' -SORTIES_CIRM= ["mapped_taxids.txt", "mapped_yeast_taxa.txt", "mapped_habitats.txt", "mapped_yeast_habitats.txt" ] +SORTIES_CIRM= ["mapped_bia_taxa.txt", "mapped_yeast_taxa.txt", "mapped_cfbp_taxa.txt", "mapped_bia_habitats.txt", "mapped_yeast_habitats.txt", "mapped_cfbp_habitats.txt" ] ''' ''' rule stats_cirm_Levure: @@ -126,7 +128,7 @@ dsmz | nb entites | count_lines(corpora/dsmz/mapped_taxids.txt) dsmz | nb habitats | count_lines(corpora/dsmz/mapped_habitats.txt) ''' ENTREES_DSMZ = ["dsmz-data/category=from_ncbi_taxonomy-key=taxid.tsv" ] -SORTIES_DSMZ = ["mapped_taxids.txt", "mapped_habitats.txt" ] +SORTIES_DSMZ = [ "mapped_habitats.txt" ] FILES_DSMZ = ENTREES_DSMZ + SORTIES_DSMZ ''' ''' @@ -179,6 +181,17 @@ SORTIES_PUBMED = ["relations.full.txt", "phenotype-relations.full.txt", "uses-re FILES_PUBMED = ENTREES_PUBMED + SORTIES_PUBMED +''' list the batch.xml files found one directory below corpora/pubmed/batches into list_of_batches.txt (rewritten, not appended) +''' +rule get_list_of_batches: + input: + batches="corpora/pubmed/batches/" + output: + list="corpora/pubmed/list_of_batches.txt" + shell:""" + ls {input.batches}/*/batch.xml > {output.list} + """ + ''' ''' @@ -344,19 +357,19 @@ rule merge_all: result.to_csv(output.result, index=False) - ''' merge ''' rule joint_stats: input: full_r="corpora/florilege/stats.full.csv", - concepts="corpora/florilege/labels.stats" + concepts="corpora/florilege/labels.stats", + dates="corpora/florilege/dates.meta" output: result="corpora/florilege/full_stats_with_labels.csv" run: import pandas + frames = [pandas.read_csv(input.full_r), pandas.read_csv(input.dates)] df1=pandas.read_csv(input.concepts) - df2=pandas.read_csv(input.full_r) - df = pandas.merge(df1, pandas.concat(frames), on="uri", how="left") + df = pandas.merge(df1, pandas.concat(frames), on="uri", how="left") df.to_csv(output.result, index=False)