diff --git a/plans/entities.plan b/plans/entities.plan index f98ea1c1a7e009f4d9006ca56f147057fbb3ad01..8523ddb14c67d45b2edc40e07b71539e55ef52cd 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -17,6 +17,10 @@ + + + + @@ -139,8 +143,7 @@ - - + @@ -195,14 +198,15 @@ - corpora/&corpus;/batches/&batch;/batch.xml - ancillaries/&corpus;-pubmed2alvisnlp.xslt + corpora/pubmed/batches/0001/batch.xml + ancillaries/microbes-2019-pubmed2alvisnlp.xslt + batch=0001 true abstract - corpora/&corpus;/batches/&batch;/bionlp-st + corpora/BioNLP-OST-2019/batches/BB19-kb+ner/bionlp-st @@ -404,7 +408,7 @@ @name == "title" or @name == "abstract" - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml tt_pos ancillaries/YaTeA/config-habitats ancillaries/YaTeA/locale @@ -415,7 +419,7 @@ @name == "title" or @name == "abstract" - corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml + yatea-var/candidates.xml tt_pos variant ancillaries/YaTeA/config-habitats @@ -669,12 +673,12 @@ - corpora/&corpus;/batches/&batch; + . $ "doc-mesh.txt" documents.sections:mesh - "&batch;"; + document.@batch; document.@id; @UI; @mesh-name; @@ -683,12 +687,12 @@ - corpora/&corpus;/batches/&batch; + . $ "taxa.txt" documents.sections.layer:taxa - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -702,12 +706,12 @@ - corpora/&corpus;/batches/&batch; + . $ "microorganisms.txt" documents.sections.layer:microorganism - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -721,12 +725,12 @@ - corpora/&corpus;/batches/&batch; + . $ "microorganisms-short.txt" documents.sections.layer:microorganism[outside:words and not @form == outside:words.@form] - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -740,12 +744,12 @@ - corpora/&corpus;/batches/&batch; + . $ "bacteria.txt" documents.sections.layer:bacteria - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -759,12 +763,12 @@ - corpora/&corpus;/batches/&batch; + . $ "habitats.txt" documents.sections.layer:habitats - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -780,12 +784,12 @@ - corpora/&corpus;/batches/&batch; + . $ "phenotypes.txt" documents.sections.layer:phenotypes - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -801,12 +805,12 @@ - corpora/&corpus;/batches/&batch; + . $ "uses.txt" documents.sections.layer:uses - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -822,12 +826,12 @@ - corpora/&corpus;/batches/&batch; + . $ "geo.txt" documents.sections.layer:Geographical - "&batch;"; + document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -837,12 +841,12 @@ - corpora/&corpus;/batches/&batch; + . $ "relations.txt" documents.sections.relations:CooccurrenceLocalization.tuples - "&batch;"; + document.@batch; section.document.@id; section.@name; args:Bacterium.@taxid; @@ -860,12 +864,12 @@ - corpora/&corpus;/batches/&batch; + . $ "phenotype-relations.txt" documents.sections.relations:PhenotypeRelation.tuples - "&batch;"; + document.@batch; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -883,12 +887,12 @@ - corpora/&corpus;/batches/&batch; + . $ "uses-relations.txt" documents.sections.relations:UseRelation.tuples - "&batch;"; + document.@batch; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -974,7 +978,7 @@ - corpora/&corpus;/batches/&batch; + . $ "sentences.txt" documents.sections.layer:sentences[@name != "author"] @@ -995,7 +999,7 @@ - corpora/&corpus;/batches/&batch; + . $ "anaphora.txt" documents.sections.relations:coreferences.tuples[args:Ante] @@ -1028,7 +1032,7 @@ - corpora/&corpus;/batches/&batch; + . $ "dependencies.txt" documents.sections[@name != "author"].relations:dependencies.tuples @@ -1247,11 +1251,11 @@ - corpora/&corpus;/batches/&batch;/adb + adb documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]] - "&batch;_" ^ id:unique + section.document.@batch ^ id:unique "Localization" "localization" args:Bacterium|args:Localization @@ -1267,7 +1271,7 @@ - corpora/&corpus;/batches/&batch;/index + index 9216 title,abstract,author,full-author,pmid,year,journal,mesh,url @@ -1404,8 +1408,8 @@ - - corpora/&corpus;/batches/&batch;/index-food + @@ -1563,14 +1567,14 @@ false - corpora/&corpus;/batches/&batch;/html + ./html ne-type phenotypes,microorganism,habitats #99cc00,#ffcc99,#ffd333,#ffd666 - corpora/&corpus;/batches/&batch; + . $ "words.txt" documents.sections[@name == "title" or @name == "abstract"].layer:words @@ -1582,7 +1586,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" layer:habitats @@ -1595,7 +1599,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" layer:phenotypes @@ -1608,7 +1612,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" layer:microorganism @@ -1621,7 +1625,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" layer:habitats|layer:phenotypes @@ -1633,7 +1637,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" layer:microorganism @@ -1645,7 +1649,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""] @@ -1657,7 +1661,7 @@ - corpora/&corpus;/batches/&batch;/a2 + a2 documents.sections document.@id ^ ".a2" relations:PhenotypeRelation.tuples @@ -1670,7 +1674,7 @@ - corpora/&corpus;/batches/&batch; + . $ "success.txt" documents diff --git a/plans/map_habitats.plan b/plans/map_habitats.plan index 07777780e184fa6ede0bf8d55785a7e3a463d7f9..5aff349486ac1d6145c03930300193eeb36b05e4 100644 --- a/plans/map_habitats.plan +++ b/plans/map_habitats.plan @@ -168,7 +168,7 @@ - ancillaries/yatea/candidates.xml + yatea/candidates.xml tt_pos ancillaries/YaTeA/config-habitats ancillaries/YaTeA/locale @@ -178,7 +178,7 @@ - ancillaries/yatea-var/candidates.xml + yatea-var/candidates.xml tt_pos variant ancillaries/YaTeA/config-habitats @@ -258,7 +258,7 @@ - + diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan index 46bbb98d0e1a2aadb023988465071097e7a11a72..da458fd1932cceace194e9cae09134bd2bf10027 100644 --- a/plans/tomap-habitats.plan +++ b/plans/tomap-habitats.plan @@ -3,18 +3,18 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats
@concept-id
@@ -26,18 +26,18 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats2 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats2
@concept-id
@@ -58,17 +58,17 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats3 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats3
@concept-id
@@ -89,18 +89,18 @@ - corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml + yatea-var/candidates.xml habitats4 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats4
@concept-id
@@ -121,17 +121,17 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml habitats5 concept-id explain_ - &ontobiotope;-Habitat.tomap + ancillaries/BioNLP-OST+EnovFood-Habitat.tomap score - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats5
@concept-id
@@ -196,7 +196,7 @@ yateaTerms - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml lemma @@ -388,20 +388,20 @@ - - + - documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/"] + - remove:habitats + @@ -714,7 +714,7 @@ - &ontobiotope;-Habitat.obo + ancillaries/BioNLP-OST+EnovFood-Habitat.obo documents.sections.layer:habitats
@concept-id
diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan index 17481643bb3b90c66ab16caa337ee600eb2f1f0f..f55357aa56a7004280b8a5bae6cef17a501a97fe 100644 --- a/plans/tomap-microbial-phenotypes.plan +++ b/plans/tomap-microbial-phenotypes.plan @@ -2,18 +2,18 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes
@concept-id
@@ -25,18 +25,18 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes2 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes2
@concept-id
@@ -57,17 +57,17 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes3 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes3
@concept-id
@@ -89,18 +89,18 @@ - corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml + yatea-var/candidates.xml phenotypes4 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes4
@concept-id
@@ -121,17 +121,17 @@ - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml phenotypes5 concept-id explain_ - &ontobiotope;-Phenotype.tomap + ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap score - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes5
@concept-id
@@ -166,7 +166,7 @@ yateaTerms - corpora/&corpus;/batches/&batch;/yatea/candidates.xml + yatea/candidates.xml lemma @@ -394,7 +394,7 @@ - + @@ -404,7 +404,7 @@ - + @@ -469,7 +469,7 @@ - &ontobiotope;-Phenotype.obo + ancillaries/BioNLP-OST+EnovFood-Phenotype.obo documents.sections.layer:phenotypes
@concept-id
diff --git a/plans/use-extraction.plan b/plans/use-extraction.plan index 5d12302993b0490526cb682ad0934cca5820bda1..87195d8d5434aaa4ac887c01765ea83e3c1d4a05 100644 --- a/plans/use-extraction.plan +++ b/plans/use-extraction.plan @@ -2,7 +2,7 @@ - &ontobiotope-use;.obo + ancillaries/Use_V2.obo uses concept-id @@ -12,7 +12,7 @@ - &ontobiotope-use;.obo + ancillaries/Use_V2.obo uses2 concept-id diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile index c6b43731d8c0626bc0a5d556b984ed992e41dab2..e50a8be95703306378fc1a5da5719bd885c49691 100644 --- a/process-evaluate_BioNLP-OST.snakefile +++ b/process-evaluate_BioNLP-OST.snakefile @@ -39,9 +39,14 @@ rule run_bionlp_prediction: params: batch="{B}", corpus='BioNLP-OST-2019', - inhibitSyntax='inhibit-syntax', - onto='ancillaries/BioNLP-OST+EnovFood', - ontobiotopeUse='ancillaries/Use_V2', + inhibitSyntax='inhibit-syntax', + onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg', + ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/BioNLP-OST-2019/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -52,14 +57,14 @@ rule run_bionlp_prediction: -log {log} \ -alias format bionlp-st \ -alias input-dir {input.dir} \ - -alias input-xslt {input.xslt} \ - -alias outputDir {params.dir} \ + -outputDir {params.dir} \ -environmentEntities \ - -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -entity ontobiotope {params.onto} \ - -entity ontobiotope-use {params.ontobiotopeUse} \ - -entity batch {params.batch} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '{params.tomap_habitat}' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '{params.tomap_pheno}' \ + -alias ontobiotope-use {params.ontobiotopeUse} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan} diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile index 2b172cf78ac91f2f0387201244ff924c6a48a7ee..f9f49e74a7c963826b96a1eed27b4ebbecd48f8c 100644 --- a/process_CIRM_corpus.snakefile +++ b/process_CIRM_corpus.snakefile @@ -24,8 +24,8 @@ rule get_cirm_bia_taxa_habitats: strain_index='1', habitat_index='15' output: - taxa='corpora/cirm/bia_taxa.txt', - habitats='corpora/cirm/bia_habitats.txt', + taxa='corpora/cirm/bia/bia_taxa.txt', + habitats='corpora/cirm/bia/bia_habitats.txt', tsv='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -42,8 +42,8 @@ rule get_cirm_yeast_taxa_habitats: taxa_index='1', habitat_index='10,11' output: - taxa='corpora/cirm/yeast_taxa.txt', - habitats='corpora/cirm/yeast_habitats.txt', + taxa='corpora/cirm/levures/yeast_taxa.txt', + habitats='corpora/cirm/levures/yeast_habitats.txt', tsv='corpora/cirm/Levures_2021/Florilege_21012021.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -61,8 +61,8 @@ rule get_cirm_cfbp_taxa_habitats: strain_index='1', habitat_index='6,10,13,14' output: - taxa='corpora/cirm/cfbp_taxa.txt', - habitats='corpora/cirm/cfbp_habitats.txt', + taxa='corpora/cirm/cfbp/cfbp_taxa.txt', + habitats='corpora/cirm/cfbp/cfbp_habitats.txt', tsv='corpora/cirm/CFBP_2021/20210617_PPortier.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -75,9 +75,9 @@ map microorganisms ''' rule map_cirm_bia_microorganisms: input: - taxa='corpora/cirm/bia_taxa.txt' + taxa='corpora/cirm/bia/bia_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_bia_taxa.txt' + mapped_taxaids='corpora/cirm/bia/mapped_bia_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -96,9 +96,9 @@ map microorganisms (CIRM Levures) ''' rule map_cirm_yeast_microorganisms: input: - taxa='corpora/cirm/yeast_taxa.txt' + taxa='corpora/cirm/levures/yeast_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_yeast_taxa.txt' + mapped_taxaids='corpora/cirm/levures/mapped_yeast_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -118,9 +118,9 @@ map microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_microorganisms: input: - taxa='corpora/cirm/cfbp_taxa.txt' + taxa='corpora/cirm/cfbp/cfbp_taxa.txt' output: - mapped_taxa='corpora/cirm/mapped_cfbp_taxa.txt' + mapped_taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -139,19 +139,22 @@ map habitats of microorganisms ''' rule map_cirm_habitats: input: - habitats='corpora/cirm/bia_habitats.txt' + habitats='corpora/cirm/bia/bia_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_bia_habitats.txt' + mapped_habitats='corpora/cirm/bia/mapped_bia_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/bia', + outfile='mapped_bia_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} @@ -162,19 +165,22 @@ map habitats of microorganisms (CIRM Levures) ''' rule map_cirm_yeast_habitats: input: - habitats='corpora/cirm/yeast_habitats.txt' + habitats='corpora/cirm/levures/yeast_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_yeast_habitats.txt' + mapped_habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/levures', + outfile='mapped_yeast_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} @@ -185,19 +191,22 @@ map habitats of microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_habitats: input: - habitats='corpora/cirm/cfbp_habitats.txt' + habitats='corpora/cirm/cfbp/cfbp_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_cfbp_habitats.txt' + mapped_habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/cfbp', + outfile='mapped_cfbp_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} @@ -209,8 +218,8 @@ format results rule format_cirm_results: input: file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv', - taxa='corpora/cirm/mapped_bia_taxa.txt', - habitats='corpora/cirm/mapped_bia_habitats.txt' + taxa='corpora/cirm/bia/mapped_bia_taxa.txt', + habitats='corpora/cirm/bia/mapped_bia_habitats.txt' output: result='corpora/florilege/cirm/cirm-bia-results.txt' params: @@ -226,8 +235,8 @@ format results (CIRM Levures) rule format_cirm_yeast_results: input: file='corpora/cirm/Levures_2021/Florilege_21012021.tsv', - taxa='corpora/cirm/mapped_yeast_taxa.txt', - habitats='corpora/cirm/mapped_yeast_habitats.txt' + taxa='corpora/cirm/levures/mapped_yeast_taxa.txt', + habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' output: result='corpora/florilege/cirm/cirm-yeast-results.txt' params: @@ -242,8 +251,8 @@ format results (CIRM CFBP) rule format_cirm_cfbp_results: input: file='corpora/cirm/CFBP_2021/20210617_PPortier.tsv', - taxa='corpora/cirm/mapped_cfbp_taxa.txt', - habitats='corpora/cirm/mapped_cfbp_habitats.txt' + taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt', + habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' output: result='corpora/florilege/cirm/cirm-cfbp-results.txt' params: diff --git a/process_DSMZ_corpus.snakefile b/process_DSMZ_corpus.snakefile index b4fa67fa3a0830be87fa78e37a69270333359c55..8fde651932d3c4255ef92b8bbbfb11f822308cf3 100644 --- a/process_DSMZ_corpus.snakefile +++ b/process_DSMZ_corpus.snakefile @@ -35,11 +35,14 @@ rule map_dsmz_habitats: onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/dsmz', + outfile='mapped_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '{params.tomap}' \ {params.plan} diff --git a/process_GenBank_corpus.snakefile b/process_GenBank_corpus.snakefile index 388f9ab1da3af0ffa392a81c44802b85b812d21b..888835e413b1864c216cebcf38a65355270f103e 100644 --- a/process_GenBank_corpus.snakefile +++ b/process_GenBank_corpus.snakefile @@ -79,11 +79,14 @@ rule map_genbank_habitats: tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', emptywords='ancillaries/stopwords_EN.ttg', - inhibitSyntax='inhibit-syntax' + inhibitSyntax='inhibit-syntax', + outdir='corpora/genbank', + outfile='mapped_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats}\ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -feat inhibit-syntax {params.inhibitSyntax} \ -xalias '{params.tomap}' \ diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 7fd42f33b809b64d119f5f88c543976bf8ebc097..5710034a3d3862750f5fcff9027c2a2411ed89db 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -46,9 +46,14 @@ rule run_pubmed_entities: params: batch="{B}", corpus='pubmed', - inhibitSyntax='inhibit-syntax', - onto='ancillaries/BioNLP-OST+EnovFood', - ontobiotopeUse='ancillaries/Use_V2', + inhibitSyntax='inhibit-syntax', + onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg', + ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -61,13 +66,15 @@ rule run_pubmed_entities: -alias format pubmed \ -alias input {input.file} \ -alias input-xslt {input.xslt} \ - -alias outputDir {params.dir} \ + -alias batch batch={params.batch} \ + -outputDir {params.dir} \ -environmentEntities \ - -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -entity ontobiotope {params.onto} \ - -entity ontobiotope-use {params.ontobiotopeUse} \ - -entity batch {params.batch} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '{params.tomap_habitat}' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '{params.tomap_pheno}' \ + -alias ontobiotope-use {params.ontobiotopeUse} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan}