From 8f88c2d35bf18ab8eca544ccbf6bd38801bf1af0 Mon Sep 17 00:00:00 2001 From: "louise.deleger" Date: Thu, 25 Nov 2021 15:49:13 +0100 Subject: [PATCH 1/6] Removed &ontobiotope; entity --- plans/tomap-habitats.plan | 22 +++++++++++----------- plans/tomap-microbial-phenotypes.plan | 26 +++++++++++++------------- plans/use-extraction.plan | 4 ++-- process_PubMed_corpus.snakefile | 16 ++++++++++++---- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan index 46bbb98..205a5fa 100644 --- a/plans/tomap-habitats.plan +++ b/plans/tomap-habitats.plan @@ -7,14 +7,14 @@ habitats concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats
@concept-id
@@ -30,14 +30,14 @@ habitats2 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats2
@concept-id
@@ -62,13 +62,13 @@ habitats3 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats3
@concept-id
@@ -93,14 +93,14 @@ habitats4 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats4
@concept-id
@@ -125,13 +125,13 @@ habitats5 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats5
@concept-id
@@ -714,7 +714,7 @@ - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats
@concept-id
diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan index 1748164..275e64c 100644 --- a/plans/tomap-microbial-phenotypes.plan +++ b/plans/tomap-microbial-phenotypes.plan @@ -6,14 +6,14 @@ phenotypes concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes
@concept-id
@@ -29,14 +29,14 @@ phenotypes2 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes2
@concept-id
@@ -61,13 +61,13 @@ phenotypes3 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes3
@concept-id
@@ -93,14 +93,14 @@ phenotypes4 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes4
@concept-id
@@ -125,13 +125,13 @@ phenotypes5 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes5
@concept-id
@@ -394,7 +394,7 @@ - + @@ -404,7 +404,7 @@ - + @@ -469,7 +469,7 @@ - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes
@concept-id
diff --git a/plans/use-extraction.plan b/plans/use-extraction.plan index 5d12302..87195d8 100644 --- a/plans/use-extraction.plan +++ b/plans/use-extraction.plan @@ -2,7 +2,7 @@ - &ontobiotope-use;.obo + ancillaries/Use_V2.obo uses concept-id @@ -12,7 +12,7 @@ - &ontobiotope-use;.obo + ancillaries/Use_V2.obo uses2 concept-id diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 7fd42f3..2d25617 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -47,8 +47,13 @@ rule run_pubmed_entities: batch="{B}", corpus='pubmed', inhibitSyntax='inhibit-syntax', - onto='ancillaries/BioNLP-OST+EnovFood', - ontobiotopeUse='ancillaries/Use_V2', + onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg', + ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -65,8 +70,11 @@ rule run_pubmed_entities: -environmentEntities \ -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -entity ontobiotope {params.onto} \ - -entity ontobiotope-use {params.ontobiotopeUse} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '{params.tomap_habitat}' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '{params.tomap_pheno}' \ + -alias ontobiotope-use {params.ontobiotopeUse} \ -entity batch {params.batch} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ -- GitLab From cc295712a16652db30acd30e64abe3cdc9061445 Mon Sep 17 00:00:00 2001 From: "louise.deleger" Date: Fri, 26 Nov 2021 14:29:24 +0100 Subject: [PATCH 2/6] removed &corpus; and &batch; entities --- plans/entities.plan | 141 +++++++++++++++++--------- plans/tomap-habitats.plan | 24 ++--- plans/tomap-microbial-phenotypes.plan | 12 +-- process_PubMed_corpus.snakefile | 14 ++- 4 files changed, 122 insertions(+), 69 deletions(-) diff --git a/plans/entities.plan b/plans/entities.plan index f98ea1c..b32f10c 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -17,6 +17,10 @@ + + + + @@ -37,6 +41,23 @@ + + + + + + + + + + + + + + + + + @@ -140,7 +161,28 @@ - + + + + + + + + + + + + + + + + + + + + + + @@ -195,14 +237,15 @@ - corpora/&corpus;/batches/&batch;/batch.xml - ancillaries/&corpus;-pubmed2alvisnlp.xslt + corpora/pubmed/batches/0000/batch.xml + ancillaries/microbes-2019-pubmed2alvisnlp.xslt + batch=0000 true abstract - corpora/&corpus;/batches/&batch;/bionlp-st + corpora/BioNLP-OST-2019/batches/BB19-kb+ner/bionlp-st @@ -404,7 +447,7 @@ @name == "title" or @name == "abstract" - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml tt_pos ancillaries/YaTeA/config-habitats ancillaries/YaTeA/locale @@ -415,7 +458,7 @@ @name == "title" or @name == "abstract" - corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml + yatea-var/candidates.xml tt_pos variant ancillaries/YaTeA/config-habitats @@ -669,12 +712,12 @@ - corpora/&corpus;/batches/&batch; + . $ "doc-mesh.txt" documents.sections:mesh - "&batch;"; + document.@batch; document.@id; @UI; @mesh-name; @@ -683,12 +726,12 @@ - corpora/&corpus;/batches/&batch; + . $ "taxa.txt" documents.sections.layer:taxa - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -702,12 +745,12 @@ - corpora/&corpus;/batches/&batch; + . $ "microorganisms.txt" documents.sections.layer:microorganism - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -721,12 +764,12 @@ - corpora/&corpus;/batches/&batch; + . $ "microorganisms-short.txt" documents.sections.layer:microorganism[outside:words and not @form == outside:words.@form] - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -740,12 +783,12 @@ - corpora/&corpus;/batches/&batch; + . $ "bacteria.txt" documents.sections.layer:bacteria - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -759,12 +802,12 @@ - corpora/&corpus;/batches/&batch; + . $ "habitats.txt" documents.sections.layer:habitats - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -780,12 +823,12 @@ - corpora/&corpus;/batches/&batch; + . $ "phenotypes.txt" documents.sections.layer:phenotypes - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -801,12 +844,12 @@ - corpora/&corpus;/batches/&batch; + . $ "uses.txt" documents.sections.layer:uses - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -822,12 +865,12 @@ - corpora/&corpus;/batches/&batch; + . $ "geo.txt" documents.sections.layer:Geographical - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -837,12 +880,12 @@ - corpora/&corpus;/batches/&batch; + . $ "relations.txt" documents.sections.relations:CooccurrenceLocalization.tuples - "&batch;"; + document.@batch; section.document.@id; section.@name; args:Bacterium.@taxid; @@ -860,12 +903,12 @@ - corpora/&corpus;/batches/&batch; + . $ "phenotype-relations.txt" documents.sections.relations:PhenotypeRelation.tuples - "&batch;"; + document.@batch; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -883,12 +926,12 @@ - corpora/&corpus;/batches/&batch; + . $ "uses-relations.txt" documents.sections.relations:UseRelation.tuples - "&batch;"; + document.@batch; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -974,7 +1017,7 @@ - corpora/&corpus;/batches/&batch; + . $ "sentences.txt" documents.sections.layer:sentences[@name != "author"] @@ -995,7 +1038,7 @@ - corpora/&corpus;/batches/&batch; + . $ "anaphora.txt" documents.sections.relations:coreferences.tuples[args:Ante] @@ -1028,7 +1071,7 @@ - corpora/&corpus;/batches/&batch; + . $ "dependencies.txt" documents.sections[@name != "author"].relations:dependencies.tuples @@ -1247,11 +1290,11 @@ - corpora/&corpus;/batches/&batch;/adb + . documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]] - "&batch;_" ^ id:unique + section.document.@batch ^ id:unique "Localization" "localization" args:Bacterium|args:Localization @@ -1267,7 +1310,7 @@ - corpora/&corpus;/batches/&batch;/index + . 9216 title,abstract,author,full-author,pmid,year,journal,mesh,url @@ -1404,8 +1447,8 @@ - - corpora/&corpus;/batches/&batch;/index-food + @@ -1563,14 +1606,14 @@ false - corpora/&corpus;/batches/&batch;/html + ./html ne-type phenotypes,microorganism,habitats #99cc00,#ffcc99,#ffd333,#ffd666 - corpora/&corpus;/batches/&batch; + . $ "words.txt" documents.sections[@name == "title" or @name == "abstract"].layer:words @@ -1582,7 +1625,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" layer:habitats @@ -1595,7 +1638,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" layer:phenotypes @@ -1608,7 +1651,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" layer:microorganism @@ -1621,7 +1664,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" layer:habitats|layer:phenotypes @@ -1633,7 +1676,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" layer:microorganism @@ -1645,7 +1688,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""] @@ -1657,7 +1700,7 @@ - corpora/&corpus;/batches/&batch;/a2 + . documents.sections document.@id ^ ".a2" relations:PhenotypeRelation.tuples @@ -1670,7 +1713,7 @@ - corpora/&corpus;/batches/&batch; + . $ "success.txt" documents diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan index 205a5fa..da458fd 100644 --- a/plans/tomap-habitats.plan +++ b/plans/tomap-habitats.plan @@ -3,7 +3,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats concept-id explain_ @@ -26,7 +26,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats2 concept-id explain_ @@ -58,7 +58,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats3 concept-id explain_ @@ -89,7 +89,7 @@ - corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml + yatea-var/candidates.xml habitats4 concept-id explain_ @@ -121,7 +121,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats5 concept-id explain_ @@ -196,7 +196,7 @@ yateaTerms - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml lemma @@ -388,20 +388,20 @@ - - + - documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/"] + - remove:habitats + diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan index 275e64c..f55357a 100644 --- a/plans/tomap-microbial-phenotypes.plan +++ b/plans/tomap-microbial-phenotypes.plan @@ -2,7 +2,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes concept-id explain_ @@ -25,7 +25,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes2 concept-id explain_ @@ -57,7 +57,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes3 concept-id explain_ @@ -89,7 +89,7 @@ - corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml + yatea-var/candidates.xml phenotypes4 concept-id explain_ @@ -121,7 +121,7 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes5 concept-id explain_ @@ -166,7 +166,7 @@ yateaTerms - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml lemma diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 2d25617..6b7d9e1 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -47,6 +47,8 @@ rule run_pubmed_entities: batch="{B}", corpus='pubmed', inhibitSyntax='inhibit-syntax', + yatea="corpora/pubmed/batches/{B}/yatea/candidates.xml", + yatea_var="corpora/pubmed/batches/{B}/yatea-var/candidates.xml", onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', @@ -56,6 +58,8 @@ rule run_pubmed_entities: ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', + adbIndex='corpora/pubmed/batches/{B}/adb', + bionlp='corpora/pubmed/batches/{B}/a2', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', taxa_id_full='ancillaries/extended-microorganisms-taxonomy/taxa+id_full.txt', dummy='corpora/pubmed/batches/{B}/bionlp-st' @@ -66,16 +70,22 @@ rule run_pubmed_entities: -alias format pubmed \ -alias input {input.file} \ -alias input-xslt {input.xslt} \ + -alias batch batch={params.batch} \ -alias outputDir {params.dir} \ + -alias adbIndex {params.adbIndex} \ + -alias index {output.index} \ + -alias bionlp-output {params.bionlp} \ -environmentEntities \ - -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ + -alias yatea-file {params.yatea} \ + -alias yatea-var-file {params.yatea_var} \ + -xalias '{params.yatea}' \ + -xalias '{params.yatea_var}' \ -alias ontobiotope-habitat {params.onto_habitat} \ -xalias '{params.tomap_habitat}' \ -alias ontobiotope-phenotypes {params.onto_pheno} \ -xalias '{params.tomap_pheno}' \ -alias ontobiotope-use {params.ontobiotopeUse} \ - -entity batch {params.batch} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan} -- GitLab From d2b71ddb7a7f4fa003aabc28077c1c0176aa193c Mon Sep 17 00:00:00 2001 From: "louise.deleger" Date: Fri, 26 Nov 2021 15:04:15 +0100 Subject: [PATCH 3/6] use -outputDir --- plans/entities.plan | 57 ++++++--------------------------- process_PubMed_corpus.snakefile | 15 ++------- 2 files changed, 11 insertions(+), 61 deletions(-) diff --git a/plans/entities.plan b/plans/entities.plan index b32f10c..55e0338 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -41,23 +41,6 @@ - - - - - - - - - - - - - - - - - @@ -160,28 +143,6 @@ - - - - - - - - - - - - - - - - - - - - - - @@ -1290,7 +1251,7 @@ - . + adb documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]] @@ -1310,7 +1271,7 @@ - . + index 9216 title,abstract,author,full-author,pmid,year,journal,mesh,url @@ -1625,7 +1586,7 @@ - . + a2 documents.sections document.@id ^ ".a2" layer:habitats @@ -1638,7 +1599,7 @@ - . + a2 documents.sections document.@id ^ ".a2" layer:phenotypes @@ -1651,7 +1612,7 @@ - . + a2 documents.sections document.@id ^ ".a2" layer:microorganism @@ -1664,7 +1625,7 @@ - . + a2 documents.sections document.@id ^ ".a2" layer:habitats|layer:phenotypes @@ -1676,7 +1637,7 @@ - . + a2 documents.sections document.@id ^ ".a2" layer:microorganism @@ -1688,7 +1649,7 @@ - . + a2 documents.sections document.@id ^ ".a2" relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""] @@ -1700,7 +1661,7 @@ - . + a2 documents.sections document.@id ^ ".a2" relations:PhenotypeRelation.tuples diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 6b7d9e1..5710034 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -46,9 +46,7 @@ rule run_pubmed_entities: params: batch="{B}", corpus='pubmed', - inhibitSyntax='inhibit-syntax', - yatea="corpora/pubmed/batches/{B}/yatea/candidates.xml", - yatea_var="corpora/pubmed/batches/{B}/yatea-var/candidates.xml", + inhibitSyntax='inhibit-syntax', onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', @@ -58,8 +56,6 @@ rule run_pubmed_entities: ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', - adbIndex='corpora/pubmed/batches/{B}/adb', - bionlp='corpora/pubmed/batches/{B}/a2', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', taxa_id_full='ancillaries/extended-microorganisms-taxonomy/taxa+id_full.txt', dummy='corpora/pubmed/batches/{B}/bionlp-st' @@ -71,16 +67,9 @@ rule run_pubmed_entities: -alias input {input.file} \ -alias input-xslt {input.xslt} \ -alias batch batch={params.batch} \ - -alias outputDir {params.dir} \ - -alias adbIndex {params.adbIndex} \ - -alias index {output.index} \ - -alias bionlp-output {params.bionlp} \ + -outputDir {params.dir} \ -environmentEntities \ -feat inhibit-syntax {params.inhibitSyntax} \ - -alias yatea-file {params.yatea} \ - -alias yatea-var-file {params.yatea_var} \ - -xalias '{params.yatea}' \ - -xalias '{params.yatea_var}' \ -alias ontobiotope-habitat {params.onto_habitat} \ -xalias '{params.tomap_habitat}' \ -alias ontobiotope-phenotypes {params.onto_pheno} \ -- GitLab From bf4c4556640c75cd9a7481c9b94284f4bef01b70 Mon Sep 17 00:00:00 2001 From: "louise.deleger" Date: Fri, 26 Nov 2021 15:48:06 +0100 Subject: [PATCH 4/6] removed entities --- process-evaluate_BioNLP-OST.snakefile | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile index c6b4373..e50a8be 100644 --- a/process-evaluate_BioNLP-OST.snakefile +++ b/process-evaluate_BioNLP-OST.snakefile @@ -39,9 +39,14 @@ rule run_bionlp_prediction: params: batch="{B}", corpus='BioNLP-OST-2019', - inhibitSyntax='inhibit-syntax', - onto='ancillaries/BioNLP-OST+EnovFood', - ontobiotopeUse='ancillaries/Use_V2', + inhibitSyntax='inhibit-syntax', + onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg', + ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/BioNLP-OST-2019/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -52,14 +57,14 @@ rule run_bionlp_prediction: -log {log} \ -alias format bionlp-st \ -alias input-dir {input.dir} \ - -alias input-xslt {input.xslt} \ - -alias outputDir {params.dir} \ + -outputDir {params.dir} \ -environmentEntities \ - -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -entity ontobiotope {params.onto} \ - -entity ontobiotope-use {params.ontobiotopeUse} \ - -entity batch {params.batch} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '{params.tomap_habitat}' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '{params.tomap_pheno}' \ + -alias ontobiotope-use {params.ontobiotopeUse} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan} -- GitLab From f8db036f901025cb56740d9505e3b4a4a820d588 Mon Sep 17 00:00:00 2001 From: "louise.deleger" Date: Fri, 26 Nov 2021 15:49:48 +0100 Subject: [PATCH 5/6] changed default values --- plans/entities.plan | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plans/entities.plan b/plans/entities.plan index 55e0338..8523ddb 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -198,9 +198,9 @@ - corpora/pubmed/batches/0000/batch.xml + corpora/pubmed/batches/0001/batch.xml ancillaries/microbes-2019-pubmed2alvisnlp.xslt - batch=0000 + batch=0001 -- GitLab From 7f044808cfeec89c8bf45de76916ef53c7cbac75 Mon Sep 17 00:00:00 2001 From: "louise.deleger" Date: Fri, 26 Nov 2021 17:07:10 +0100 Subject: [PATCH 6/6] Changed plan used for mapping habitats --- plans/map_habitats.plan | 6 +-- process_CIRM_corpus.snakefile | 69 ++++++++++++++++++-------------- process_DSMZ_corpus.snakefile | 7 +++- process_GenBank_corpus.snakefile | 7 +++- 4 files changed, 52 insertions(+), 37 deletions(-) diff --git a/plans/map_habitats.plan b/plans/map_habitats.plan index 0777778..5aff349 100644 --- a/plans/map_habitats.plan +++ b/plans/map_habitats.plan @@ -168,7 +168,7 @@ - ancillaries/yatea/candidates.xml + yatea/candidates.xml tt_pos ancillaries/YaTeA/config-habitats ancillaries/YaTeA/locale @@ -178,7 +178,7 @@ - ancillaries/yatea-var/candidates.xml + yatea-var/candidates.xml tt_pos variant ancillaries/YaTeA/config-habitats @@ -258,7 +258,7 @@ - + diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile index 2b172cf..f9f49e7 100644 --- a/process_CIRM_corpus.snakefile +++ b/process_CIRM_corpus.snakefile @@ -24,8 +24,8 @@ rule get_cirm_bia_taxa_habitats: strain_index='1', habitat_index='15' output: - taxa='corpora/cirm/bia_taxa.txt', - habitats='corpora/cirm/bia_habitats.txt', + taxa='corpora/cirm/bia/bia_taxa.txt', + habitats='corpora/cirm/bia/bia_habitats.txt', tsv='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -42,8 +42,8 @@ rule get_cirm_yeast_taxa_habitats: taxa_index='1', habitat_index='10,11' output: - taxa='corpora/cirm/yeast_taxa.txt', - habitats='corpora/cirm/yeast_habitats.txt', + taxa='corpora/cirm/levures/yeast_taxa.txt', + habitats='corpora/cirm/levures/yeast_habitats.txt', tsv='corpora/cirm/Levures_2021/Florilege_21012021.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -61,8 +61,8 @@ rule get_cirm_cfbp_taxa_habitats: strain_index='1', habitat_index='6,10,13,14' output: - taxa='corpora/cirm/cfbp_taxa.txt', - habitats='corpora/cirm/cfbp_habitats.txt', + taxa='corpora/cirm/cfbp/cfbp_taxa.txt', + habitats='corpora/cirm/cfbp/cfbp_habitats.txt', tsv='corpora/cirm/CFBP_2021/20210617_PPortier.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -75,9 +75,9 @@ map microorganisms ''' rule map_cirm_bia_microorganisms: input: - taxa='corpora/cirm/bia_taxa.txt' + taxa='corpora/cirm/bia/bia_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_bia_taxa.txt' + mapped_taxaids='corpora/cirm/bia/mapped_bia_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -96,9 +96,9 @@ map microorganisms (CIRM Levures) ''' rule map_cirm_yeast_microorganisms: input: - taxa='corpora/cirm/yeast_taxa.txt' + taxa='corpora/cirm/levures/yeast_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_yeast_taxa.txt' + mapped_taxaids='corpora/cirm/levures/mapped_yeast_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -118,9 +118,9 @@ map microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_microorganisms: input: - taxa='corpora/cirm/cfbp_taxa.txt' + taxa='corpora/cirm/cfbp/cfbp_taxa.txt' output: - mapped_taxa='corpora/cirm/mapped_cfbp_taxa.txt' + mapped_taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -139,19 +139,22 @@ map habitats of microorganisms ''' rule map_cirm_habitats: input: - habitats='corpora/cirm/bia_habitats.txt' + habitats='corpora/cirm/bia/bia_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_bia_habitats.txt' + mapped_habitats='corpora/cirm/bia/mapped_bia_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/bia', + outfile='mapped_bia_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} @@ -162,19 +165,22 @@ map habitats of microorganisms (CIRM Levures) ''' rule map_cirm_yeast_habitats: input: - habitats='corpora/cirm/yeast_habitats.txt' + habitats='corpora/cirm/levures/yeast_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_yeast_habitats.txt' + mapped_habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/levures', + outfile='mapped_yeast_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} @@ -185,19 +191,22 @@ map habitats of microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_habitats: input: - habitats='corpora/cirm/cfbp_habitats.txt' + habitats='corpora/cirm/cfbp/cfbp_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_cfbp_habitats.txt' + mapped_habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/cfbp', + outfile='mapped_cfbp_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} @@ -209,8 +218,8 @@ format results rule format_cirm_results: input: file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv', - taxa='corpora/cirm/mapped_bia_taxa.txt', - habitats='corpora/cirm/mapped_bia_habitats.txt' + taxa='corpora/cirm/bia/mapped_bia_taxa.txt', + habitats='corpora/cirm/bia/mapped_bia_habitats.txt' output: result='corpora/florilege/cirm/cirm-bia-results.txt' params: @@ -226,8 +235,8 @@ format results (CIRM Levures) rule format_cirm_yeast_results: input: file='corpora/cirm/Levures_2021/Florilege_21012021.tsv', - taxa='corpora/cirm/mapped_yeast_taxa.txt', - habitats='corpora/cirm/mapped_yeast_habitats.txt' + taxa='corpora/cirm/levures/mapped_yeast_taxa.txt', + habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' output: result='corpora/florilege/cirm/cirm-yeast-results.txt' params: @@ -242,8 +251,8 @@ format results (CIRM CFBP) rule format_cirm_cfbp_results: input: file='corpora/cirm/CFBP_2021/20210617_PPortier.tsv', - taxa='corpora/cirm/mapped_cfbp_taxa.txt', - habitats='corpora/cirm/mapped_cfbp_habitats.txt' + taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt', + habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' output: result='corpora/florilege/cirm/cirm-cfbp-results.txt' params: diff --git a/process_DSMZ_corpus.snakefile b/process_DSMZ_corpus.snakefile index b4fa67f..8fde651 100644 --- a/process_DSMZ_corpus.snakefile +++ b/process_DSMZ_corpus.snakefile @@ -35,11 +35,14 @@ rule map_dsmz_habitats: onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/dsmz', + outfile='mapped_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} diff --git a/process_GenBank_corpus.snakefile b/process_GenBank_corpus.snakefile index 388f9ab..888835e 100644 --- a/process_GenBank_corpus.snakefile +++ b/process_GenBank_corpus.snakefile @@ -79,11 +79,14 @@ rule map_genbank_habitats: tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', emptywords='ancillaries/stopwords_EN.ttg', - inhibitSyntax='inhibit-syntax' + inhibitSyntax='inhibit-syntax', + outdir='corpora/genbank', + outfile='mapped_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats}\ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -feat inhibit-syntax {params.inhibitSyntax} \ -xalias '{params.tomap}' \ -- GitLab