From 584609647e51d0c120d6d12473696fe15e638ee4 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Fri, 19 Feb 2021 02:42:55 +0100
Subject: [PATCH] try to fix everything (defunct method)

---
 bacdive2alvisnlp.xslt |   7 ++
 dsmz-match.plan       | 286 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 265 insertions(+), 28 deletions(-)

diff --git a/bacdive2alvisnlp.xslt b/bacdive2alvisnlp.xslt
index bdb35ea..98926a5 100644
--- a/bacdive2alvisnlp.xslt
+++ b/bacdive2alvisnlp.xslt
@@ -15,7 +15,14 @@
   </xsl:template>
 
   <xsl:template match="taxonomy_name/strains/list-item">
+    <a:section name="subspecies_epithet" xpath-contents="subspecies_epithet"/> <!-- each taxonomic field of the strain record becomes its own section, from subspecies_epithet up to domain -->
     <a:section name="species" xpath-contents="species"/>
+    <a:section name="genus" xpath-contents="genus"/>
+    <a:section name="family" xpath-contents="family"/>
+    <a:section name="ordo" xpath-contents="ordo"/>
+    <a:section name="class" xpath-contents="class"/>
+    <a:section name="phylum" xpath-contents="phylum"/>
+    <a:section name="domain" xpath-contents="domain"/>
     <a:section name="full_scientific_name" xpath-contents="normalize-space(full_scientific_name)"/>
     <a:section name="designation" xpath-contents="designation"/>
   </xsl:template>
diff --git a/dsmz-match.plan b/dsmz-match.plan
index 9379fae..69f520f 100644
--- a/dsmz-match.plan
+++ b/dsmz-match.plan
@@ -31,21 +31,153 @@
     <createSections/>
   </species-and-number>
 
-  <match class="FileMapper">
+  <mark-strains class="Action"> <!-- flags every catalog-number and species-and-number section with strain="yes" -->
     <target>documents.(sections:catalog-number | sections:species-and-number)</target>
+    <action>set:feat:strain("yes")</action>
+    <setFeatures/>
+  </mark-strains>
+  
+  <mark-candidates class="Action"> <!-- flags with candidate="yes" the sections carrying @strain plus the species, genus, family, ordo, class, phylum and domain sections; subspecies_epithet is not in the list -->
+    <target>documents.(sections[@strain] | sections:species | sections:genus | sections:family | sections:ordo | sections:class | sections:phylum | sections:domain)</target>
+    <action>set:feat:candidate("yes")</action>
+    <setFeatures/>
+  </mark-candidates>
+
+  <match class="FileMapper"> <!-- only sections carrying @candidate are mapped, keyed on their contents; target features are taxid, canonical-name, path, pos and rank -->
+    <target>documents.sections[@candidate]</target>
     <form>contents</form>
     <targetFeatures>,taxid,canonical-name,path,pos,rank</targetFeatures>
   </match>
 
+  <dispatch>
+    <equivalent class="Action"> <!-- first dispatch rule. Every rule of this sequence only targets documents without @dispatch, so (assuming the rules run in document order) a document keeps the decision of the first rule that matches it. Each rule sets @dispatch, @taxid and @rule (from module:id). Here: a strain section mapped with rank "no rank" gives dispatch "equivalent" and the taxid of the first such section -->
+      <target>documents[(not @dispatch) and sections[@strain and @rank == "no rank"]]</target>
+      <action>
+	set:feat:dispatch("equivalent")
+	| set:feat:taxid(sections[@strain and @rank == "no rank"]{0}.@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </equivalent>
+
+    <separate-subspecies-type-material class="Action"> <!-- a strain section mapped with rank "subspecies": dispatch "new" with the taxid of the first such section -->
+      <target>documents[(not @dispatch) and sections[@strain and @rank == "subspecies"]]</target>
+      <action>
+	set:feat:dispatch("new")
+	| set:feat:taxid(sections[@strain and @rank == "subspecies"]{0}.@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </separate-subspecies-type-material>
+
+    <separate-species-type-material class="Action"> <!-- a strain section mapped with rank "species": dispatch "new" with the taxid of the first such section -->
+      <target>documents[(not @dispatch) and sections[@strain and @rank == "species"]]</target>
+      <action>
+	set:feat:dispatch("new")
+	| set:feat:taxid(sections[@strain and @rank == "species"]{0}.@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </separate-species-type-material>
+
+    <new-strain-for-subspecies class="Action"> <!-- the species section itself is mapped with rank "subspecies": dispatch "new" with its taxid -->
+      <target>documents[(not @dispatch) and sections:species[@rank == "subspecies"]]</target>
+      <action>
+	set:feat:dispatch("new")
+	| set:feat:taxid(sections:species[@rank == "subspecies"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-subspecies>
+
+    <new-strain-for-species class="Action"> <!-- the species section is mapped with rank "species": dispatch "new" with its taxid -->
+      <target>documents[(not @dispatch) and sections:species[@rank == "species"]]</target>
+      <action>
+	set:feat:dispatch("new")
+	| set:feat:taxid(sections:species[@rank == "species"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-species>
+
+    <new-strain-for-genus class="Action"> <!-- from genus up to domain the dispatch value is "new-species": the section of the given rank must be mapped with the corresponding rank, and its taxid is kept -->
+      <target>documents[(not @dispatch) and sections:genus[@rank == "genus"]]</target>
+      <action>
+	set:feat:dispatch("new-species")
+	| set:feat:taxid(sections:genus[@rank == "genus"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-genus>
+
+    <new-strain-for-family class="Action">
+      <target>documents[(not @dispatch) and sections:family[@rank == "family"]]</target>
+      <action>
+	set:feat:dispatch("new-species")
+	| set:feat:taxid(sections:family[@rank == "family"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-family>
+
+    <new-strain-for-ordo class="Action"> <!-- the section is named ordo but the rank it is compared with is "order" -->
+      <target>documents[(not @dispatch) and sections:ordo[@rank == "order"]]</target>
+      <action>
+	set:feat:dispatch("new-species")
+	| set:feat:taxid(sections:ordo[@rank == "order"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-ordo>
+
+    <new-strain-for-class class="Action">
+      <target>documents[(not @dispatch) and sections:class[@rank == "class"]]</target>
+      <action>
+	set:feat:dispatch("new-species")
+	| set:feat:taxid(sections:class[@rank == "class"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-class>
+
+    <new-strain-for-phylum class="Action">
+      <target>documents[(not @dispatch) and sections:phylum[@rank == "phylum"]]</target>
+      <action>
+	set:feat:dispatch("new-species")
+	| set:feat:taxid(sections:phylum[@rank == "phylum"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-phylum>
+
+    <new-strain-for-domain class="Action"> <!-- the section is named domain but the rank it is compared with is "superkingdom" -->
+      <target>documents[(not @dispatch) and sections:domain[@rank == "superkingdom"]]</target>
+      <action>
+	set:feat:dispatch("new-species")
+	| set:feat:taxid(sections:domain[@rank == "superkingdom"].@taxid)
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </new-strain-for-domain>
+
+    <no-match class="Action"> <!-- fallback: documents still without @dispatch get dispatch "no-match" and an empty taxid -->
+      <target>documents[not @dispatch]</target>
+      <action>
+	set:feat:dispatch("no-match")
+	| set:feat:taxid("")
+	| set:feat:rule(module:id)
+      </action>
+      <setFeatures/>
+    </no-match>
+  </dispatch>
+
   <export>
-    <name-match class="TabularExport">
+    <full-report class="TabularExport">
       <outDir>.</outDir>
-      <corpusFile>all.txt</corpusFile>
-      <lines>documents.sections[@taxid]</lines>
+      <corpusFile>full-report.txt</corpusFile>
+      <lines>documents.sections[@candidate]</lines>
       <columns>
 	document.@id,
-	document.sections:species.contents,
-	document.sections:full_scientific_name.contents,
 	@name,
 	contents,
 	@taxid,
@@ -53,47 +185,145 @@
 	@rank
       </columns>
       <headers>
-	"ID",
-	"SPECIES NAME",
-	"FULL NAME",
+	"BACDIVE ID",
 	"FIELD",
 	"NAME",
 	"NCBI TAXID",
 	"NCBI CANONICAL",
 	"NCBI RANK"
       </headers>
-    </name-match>
+    </full-report>
+
+    <dispatch-report class="TabularExport"> <!-- one line per document: its id, the dispatch value, the taxid and the rule feature set together with them -->
+      <outDir>.</outDir>
+      <corpusFile>dispatch-report.txt</corpusFile>
+      <lines>documents</lines>
+      <columns>
+	@id,
+	@dispatch,
+	@taxid,
+	@rule
+      </columns>
+      <headers>
+	"BACDIVE ID",
+	"DISPATCH",
+	"NCBI TAXID",
+	"RULE"
+      </headers>
+    </dispatch-report>
 
     <equivalent-strains class="TabularExport">
       <outDir>.</outDir>
       <corpusFile>equivalent-strains.txt</corpusFile>
-      <lines>documents[sections[@rank == "no rank"]].sections:catalog-number[contents ?= " "]</lines> <!-- exclude ym20-087 kondo51 etc -->
+      <lines>documents</lines> <!-- one line per document, whatever its dispatch; the single column below starts with the taxid only when the dispatch is "equivalent", then lists the catalog numbers lower-cased with spaces replaced by ":" -->
       <columns separator=";">
-	str:replace(str:lower(contents), " ", ":");
-	document.sections[@rank == "no rank"]{0}.@taxid
+	(if @dispatch == "equivalent" then @taxid ^ "\t" else "") ^
+	str:join:'\t'(sections:catalog-number[contents ?= " "], str:replace(str:lower(contents), " ", ":")) <!-- the ?= " " filter keeps only catalog numbers that contain a space; this excludes ym20-087, kondo51, etc. -->
       </columns>
     </equivalent-strains>
 
     <new-strains>
-      <identifier class="Action">
-	<target>documents[sections[@rank == "species" or @rank == "subspecies"]]</target>
-	<action>set:feat:new-taxid(str:replace(str:lower(sections:catalog-number[contents ^= "DSM"].contents), " ", ":"))</action>
+      <canonical>
+	<first-dsm class="Action"> <!-- this sequence picks one canonical catalog number per document dispatched as "new" or "new-species"; first choice is the first catalog-number section selected by contents ^= "DSM" -->
+	  <target>documents[@dispatch == "new" or @dispatch == "new-species"].sections:catalog-number[contents ^= "DSM"]{0}</target>
+	  <action>set:feat:canonical("yes")</action>
+	  <setFeatures/>
+	</first-dsm>
+
+	<first-any class="Action"> <!-- otherwise, when no catalog-number section carries @canonical yet, the first catalog-number section is taken -->
+	  <target>documents[(@dispatch == "new" or @dispatch == "new-species") and not sections:catalog-number[@canonical]].sections:catalog-number{0}</target>
+	  <action>set:feat:canonical("yes")</action>
+	  <setFeatures/>
+	</first-any>
+
+	<new-taxid class="Action"> <!-- sets new-taxid on the document: the canonical catalog number lower-cased, with spaces replaced by ":" -->
+	  <target>documents[@dispatch == "new" or @dispatch == "new-species"].sections:catalog-number[@canonical]</target>
+	  <action>document.set:feat:new-taxid(str:replace(str:lower(target.contents), " ", ":"))</action>
+	  <setFeatures/>
+	</new-taxid>
+
+	<scientific-name class="Action"> <!-- flags with scientific-name="yes" the first species-and-number section selected by contents ?= the canonical catalog number -->
+	  <target>documents[@dispatch == "new" or @dispatch == "new-species"].sections:catalog-number[@canonical]</target>
+	  <action>document.sections:species-and-number[contents ?= target.contents]{0}.set:feat:scientific-name("yes")</action>
+	  <setFeatures/>
+	</scientific-name>
+      </canonical>
+
+      <new-species-id class="Action"> <!-- provisional species id for documents dispatched as "new-species": "prov:" followed by the species section contents with spaces replaced by "-"; a species section whose contents is "unclassified" is filtered out. NOTE(review): in that case the id presumably collapses to the bare prefix "prov:", confirm this is intended -->
+	<target>documents[@dispatch == "new-species"]</target>
+	<action>set:feat:new-species-id("prov:" ^ str:replace(sections:species[contents != "unclassified"].contents, " ", "-"))</action>
 	<setFeatures/>
-      </identifier>
+      </new-species-id>
+
+      <export-nodes class="TabularExport"> <!-- one line per document dispatched as "new" or "new-species": first column is the new-taxid, second is the matched taxid for "new" and the provisional new-species-id otherwise, third is the constant "no rank". NOTE(review): the 13 columns and the tab-pipe-tab separator look like the NCBI taxdump nodes.dmp layout, confirm before changing the constants -->
+	<outDir>.</outDir>
+	<corpusFile>dsmz-nodes.dmp</corpusFile>
+	<lines>documents[@dispatch == "new" or @dispatch == "new-species"]</lines>
+	<separator trim="false">	|	</separator>
+	<columns>
+	  @new-taxid,
+	  if @dispatch == "new" then @taxid else @new-species-id,
+	  "no rank",
+	  "",
+	  "0",
+	  "1",
+	  "11",
+	  "1",
+	  "0",
+	  "1",
+	  "1",
+	  "0",
+	  ""
+	</columns>
+      </export-nodes>
+
+      <export-species-nodes class="TabularExport"> <!-- lines are the documents dispatched as "new-species", passed through sort:nsval keyed on new-species-id and taxid: first column is the new-species-id, second is the matched taxid, third is "species" when the subspecies_epithet section is empty and "subspecies" otherwise; the remaining constants are the same as in export-nodes -->
+	<outDir>.</outDir>
+	<corpusFile>dsmz-species-nodes.dmp</corpusFile>
+	<lines>sort:nsval(documents[@dispatch == "new-species"], @new-species-id ^ "___"  ^ @taxid)</lines>
+	<separator trim="false">	|	</separator>
+	<columns>
+	  @new-species-id,
+	  @taxid,
+	  if sections:subspecies_epithet.contents == "" then "species" else "subspecies",
+	  "",
+	  "0",
+	  "1",
+	  "11",
+	  "1",
+	  "0",
+	  "1",
+	  "1",
+	  "0",
+	  ""
+	</columns>
+      </export-species-nodes>
+
+      <export-names class="TabularExport"> <!-- one line per section carrying @strain in documents dispatched as "new" or "new-species": the document new-taxid, the section contents, an empty column, then "scientific name" for the section carrying @scientific-name and "equivalent catalog" for the others -->
+	<outDir>.</outDir>
+	<corpusFile>dsmz-names.dmp</corpusFile>
+	<lines>documents[@dispatch == "new" or @dispatch == "new-species"].sections[@strain]</lines>
+	<separator trim="false">	|	</separator>
+	<columns>
+	  document.@new-taxid,
+	  contents,
+	  "",
+	  if @scientific-name then "scientific name" else "equivalent catalog"
+	</columns>
+      </export-names>
 
-      <export class="TabularExport">
+      <export-species-names class="TabularExport"> <!-- one line per species section of the documents dispatched as "new-species": the document new-species-id, the species section contents, an empty column and the constant "scientific name" -->
 	<outDir>.</outDir>
-	<corpusFile>taxa+id_dsmz.txt</corpusFile>
-	<lines>documents[@new-taxid].(sections:catalog-number | sections:species-and-number)</lines>
-	<columns separator=";">
-	  contents;
-	  document.@new-taxid;
-	  document.sections:species-and-number[contents ?= "DSM"].contents;
-	  document.sections[@taxid]{0}.@path ^ "/" ^ document.@new-taxid;
-	  document.sections[@taxid]{0}.@pos;
-	  "no rank"
+	<corpusFile>dsmz-species-names.dmp</corpusFile>
+	<lines>documents[@dispatch == "new-species"].sections:species</lines>
+	<separator trim="false">	|	</separator>
+	<columns>
+	  document.@new-species-id,
+	  contents,
+	  "",
+	  "scientific name"
 	</columns>
-      </export>
+      </export-species-names>
     </new-strains>
   </export>
 </alvisnlp-plan>
-- 
GitLab