diff --git a/bacdive2alvisnlp.xslt b/bacdive2alvisnlp.xslt index bdb35ea2006b0238ac1e12c19a5f7cbf9af83868..98926a579e244a8b7d2a81f4634d157bb2622fd8 100644 --- a/bacdive2alvisnlp.xslt +++ b/bacdive2alvisnlp.xslt @@ -15,7 +15,14 @@ </xsl:template> <xsl:template match="taxonomy_name/strains/list-item"> + <a:section name="subspecies_epithet" xpath-contents="subspecies_epithet"/> <a:section name="species" xpath-contents="species"/> + <a:section name="genus" xpath-contents="genus"/> + <a:section name="family" xpath-contents="family"/> + <a:section name="ordo" xpath-contents="ordo"/> + <a:section name="class" xpath-contents="class"/> + <a:section name="phylum" xpath-contents="phylum"/> + <a:section name="domain" xpath-contents="domain"/> <a:section name="full_scientific_name" xpath-contents="normalize-space(full_scientific_name)"/> <a:section name="designation" xpath-contents="designation"/> </xsl:template> diff --git a/dsmz-match.plan b/dsmz-match.plan index 9379fae18e5b47a79d83f7ffc63657ce739b5eb1..69f520f934411991e7678e2e636dad329690b97a 100644 --- a/dsmz-match.plan +++ b/dsmz-match.plan @@ -31,21 +31,153 @@ <createSections/> </species-and-number> - <match class="FileMapper"> + <mark-strains class="Action"> <target>documents.(sections:catalog-number | sections:species-and-number)</target> + <action>set:feat:strain("yes")</action> + <setFeatures/> + </mark-strains> + + <mark-candidates class="Action"> + <target>documents.(sections[@strain] | sections:species | sections:genus | sections:family | sections:ordo | sections:class | sections:phylum | sections:domain)</target> + <action>set:feat:candidate("yes")</action> + <setFeatures/> + </mark-candidates> + + <match class="FileMapper"> + <target>documents.sections[@candidate]</target> <form>contents</form> <targetFeatures>,taxid,canonical-name,path,pos,rank</targetFeatures> </match> + <dispatch> + <equivalent class="Action"> + <target>documents[(not @dispatch) and sections[@strain and @rank == "no rank"]]</target> + <action> + set:feat:dispatch("equivalent") + | set:feat:taxid(sections[@strain and @rank == "no rank"]{0}.@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </equivalent> + + <separate-subspecies-type-material class="Action"> + <target>documents[(not @dispatch) and sections[@strain and @rank == "subspecies"]]</target> + <action> + set:feat:dispatch("new") + | set:feat:taxid(sections[@strain and @rank == "subspecies"]{0}.@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </separate-subspecies-type-material> + + <separate-species-type-material class="Action"> + <target>documents[(not @dispatch) and sections[@strain and @rank == "species"]]</target> + <action> + set:feat:dispatch("new") + | set:feat:taxid(sections[@strain and @rank == "species"]{0}.@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </separate-species-type-material> + + <new-strain-for-subspecies class="Action"> + <target>documents[(not @dispatch) and sections:species[@rank == "subspecies"]]</target> + <action> + set:feat:dispatch("new") + | set:feat:taxid(sections:species[@rank == "subspecies"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-subspecies> + + <new-strain-for-species class="Action"> + <target>documents[(not @dispatch) and sections:species[@rank == "species"]]</target> + <action> + set:feat:dispatch("new") + | set:feat:taxid(sections:species[@rank == "species"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-species> + + <new-strain-for-genus class="Action"> + <target>documents[(not @dispatch) and sections:genus[@rank == "genus"]]</target> + <action> + set:feat:dispatch("new-species") + | set:feat:taxid(sections:genus[@rank == "genus"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-genus> + + <new-strain-for-family class="Action"> + <target>documents[(not @dispatch) and sections:family[@rank == "family"]]</target> + <action> + set:feat:dispatch("new-species") + | set:feat:taxid(sections:family[@rank == "family"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-family> + + <new-strain-for-ordo class="Action"> + <target>documents[(not @dispatch) and sections:ordo[@rank == "order"]]</target> + <action> + set:feat:dispatch("new-species") + | set:feat:taxid(sections:ordo[@rank == "order"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-ordo> + + <new-strain-for-class class="Action"> + <target>documents[(not @dispatch) and sections:class[@rank == "class"]]</target> + <action> + set:feat:dispatch("new-species") + | set:feat:taxid(sections:class[@rank == "class"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-class> + + <new-strain-for-phylum class="Action"> + <target>documents[(not @dispatch) and sections:phylum[@rank == "phylum"]]</target> + <action> + set:feat:dispatch("new-species") + | set:feat:taxid(sections:phylum[@rank == "phylum"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-phylum> + + <new-strain-for-domain class="Action"> + <target>documents[(not @dispatch) and sections:domain[@rank == "superkingdom"]]</target> + <action> + set:feat:dispatch("new-species") + | set:feat:taxid(sections:domain[@rank == "superkingdom"].@taxid) + | set:feat:rule(module:id) + </action> + <setFeatures/> + </new-strain-for-domain> + + <no-match class="Action"> + <target>documents[not @dispatch]</target> + <action> + set:feat:dispatch("no-match") + | set:feat:taxid("") + | set:feat:rule(module:id) + </action> + <setFeatures/> + </no-match> + </dispatch> + <export> - <name-match class="TabularExport"> + <full-report class="TabularExport"> <outDir>.</outDir> - <corpusFile>all.txt</corpusFile> - <lines>documents.sections[@taxid]</lines> + <corpusFile>full-report.txt</corpusFile> + <lines>documents.sections[@candidate]</lines> <columns> document.@id, - document.sections:species.contents, - document.sections:full_scientific_name.contents, @name, contents, @taxid, @@ -53,47 +185,145 @@ @rank </columns> <headers> - "ID", - "SPECIES NAME", - "FULL NAME", + "BACDIVE ID", "FIELD", "NAME", "NCBI TAXID", "NCBI CANONICAL", "NCBI RANK" </headers> - </name-match> + </full-report> + + <dispatch-report class="TabularExport"> + <outDir>.</outDir> + <corpusFile>dispatch-report.txt</corpusFile> + <lines>documents</lines> + <columns> + @id, + @dispatch, + @taxid, + @rule + </columns> + <headers> + "BACDIVE ID", + "DISPATCH", + "NCBI TAXID", + "RULE" + </headers> + </dispatch-report> <equivalent-strains class="TabularExport"> <outDir>.</outDir> <corpusFile>equivalent-strains.txt</corpusFile> - <lines>documents[sections[@rank == "no rank"]].sections:catalog-number[contents ?= " "]</lines> <!-- exclude ym20-087 kondo51 etc --> + <lines>documents</lines> <columns separator=";"> - str:replace(str:lower(contents), " ", ":"); - document.sections[@rank == "no rank"]{0}.@taxid + (if @dispatch == "equivalent" then @taxid ^ "\t" else "") ^ + str:join:'\t'(sections:catalog-number[contents ?= " "], str:replace(str:lower(contents), " ", ":")) <!-- exclude ym20-087 kondo51 etc --> </columns> </equivalent-strains> <new-strains> - <identifier class="Action"> - <target>documents[sections[@rank == "species" or @rank == "subspecies"]]</target> - <action>set:feat:new-taxid(str:replace(str:lower(sections:catalog-number[contents ^= "DSM"].contents), " ", ":"))</action> + <canonical> + <first-dsm class="Action"> + <target>documents[@dispatch == "new" or @dispatch == "new-species"].sections:catalog-number[contents ^= "DSM"]{0}</target> + <action>set:feat:canonical("yes")</action> + <setFeatures/> + </first-dsm> + + <first-any class="Action"> + <target>documents[(@dispatch == "new" or @dispatch == "new-species") and not sections:catalog-number[@canonical]].sections:catalog-number{0}</target> + <action>set:feat:canonical("yes")</action> + <setFeatures/> + </first-any> + + <new-taxid class="Action"> + <target>documents[@dispatch == "new" or @dispatch == "new-species"].sections:catalog-number[@canonical]</target> + <action>document.set:feat:new-taxid(str:replace(str:lower(target.contents), " ", ":"))</action> + <setFeatures/> + </new-taxid> + + <scientific-name class="Action"> + <target>documents[@dispatch == "new" or @dispatch == "new-species"].sections:catalog-number[@canonical]</target> + <action>document.sections:species-and-number[contents ?= target.contents]{0}.set:feat:scientific-name("yes")</action> + <setFeatures/> + </scientific-name> + </canonical> + + <new-species-id class="Action"> + <target>documents[@dispatch == "new-species"]</target> + <action>set:feat:new-species-id("prov:" ^ str:replace(sections:species[contents != "unclassified"].contents, " ", "-"))</action> <setFeatures/> - </identifier> + </new-species-id> + + <export-nodes class="TabularExport"> + <outDir>.</outDir> + <corpusFile>dsmz-nodes.dmp</corpusFile> + <lines>documents[@dispatch == "new" or @dispatch == "new-species"]</lines> + <separator trim="false"> | </separator> + <columns> + @new-taxid, + if @dispatch == "new" then @taxid else @new-species-id, + "no rank", + "", + "0", + "1", + "11", + "1", + "0", + "1", + "1", + "0", + "" + </columns> + </export-nodes> + + <export-species-nodes class="TabularExport"> + <outDir>.</outDir> + <corpusFile>dsmz-species-nodes.dmp</corpusFile> + <lines>sort:nsval(documents[@dispatch == "new-species"], @new-species-id ^ "___" ^ @taxid)</lines> + <separator trim="false"> | </separator> + <columns> + @new-species-id, + @taxid, + if sections:subspecies_epithet.contents == "" then "species" else "subspecies", + "", + "0", + "1", + "11", + "1", + "0", + "1", + "1", + "0", + "" + </columns> + </export-species-nodes> + + <export-names class="TabularExport"> + <outDir>.</outDir> + <corpusFile>dsmz-names.dmp</corpusFile> + <lines>documents[@dispatch == "new" or @dispatch == "new-species"].sections[@strain]</lines> + <separator trim="false"> | </separator> + <columns> + document.@new-taxid, + contents, + "", + if @scientific-name then "scientific name" else "equivalent catalog" + </columns> + </export-names> - <export class="TabularExport"> + <export-species-names class="TabularExport"> <outDir>.</outDir> - <corpusFile>taxa+id_dsmz.txt</corpusFile> - <lines>documents[@new-taxid].(sections:catalog-number | sections:species-and-number)</lines> - <columns separator=";"> - contents; - document.@new-taxid; - document.sections:species-and-number[contents ?= "DSM"].contents; - document.sections[@taxid]{0}.@path ^ "/" ^ document.@new-taxid; - document.sections[@taxid]{0}.@pos; - "no rank" + <corpusFile>dsmz-species-names.dmp</corpusFile> + <lines>documents[@dispatch == "new-species"].sections:species</lines> + <separator trim="false"> | </separator> + <columns> + document.@new-species-id, + contents, + "", + "scientific name" </columns> - </export> + </export-species-names> </new-strains> </export> </alvisnlp-plan>