<!-- dsmz-match.plan, authored by Robert Bossy -->
<alvisnlp-plan id="match-bacdive">
<!-- Plan parameter "input": alias for the sourcePath parameter of the "read" module. -->
<param name="input">
  <alias module="read" param="sourcePath"/>
</param>
<!-- Plan parameter "taxaFile": alias for the mappingFile parameter of the "match" module. -->
<param name="taxaFile">
  <alias module="match" param="mappingFile"/>
</param>
<!-- Corpus reader: XMLReader configured with the bacdive2alvisnlp.xslt stylesheet.
     Its sourcePath is not set here; the plan exposes it through the "input" parameter alias. -->
<read class="XMLReader">
  <xslTransform>bacdive2alvisnlp.xslt</xslTransform>
</read>
<!-- Turns each "strain_number" section (underscore) into individual "strain-number"
     sections (hyphen), in two Action steps. -->
<saturate-strain-number>
  <!-- Step 1: apply str:split with ',' to the contents of every strain_number section.
       setFeatures is enabled for this action.
       NOTE(review): presumably each comma-separated piece is stored as an "each" feature
       on the section; confirm against the str library documentation. -->
  <feature class="Action">
    <target>documents.sections:strain_number</target>
    <action>str:split:',':each(contents)</action>
    <setFeatures/>
  </feature>
  <!-- Step 2: for every "each" feature of a strain_number section, create a new
       strain-number section in the owning document, using the trimmed feature value
       as contents. createSections is enabled for this action. -->
  <section class="Action">
    <target>documents.sections:strain_number.nav:features:each</target>
    <action>nav:parent.document.new:section:strain-number(str:trim(target.@value))</action>
    <createSections/>
  </section>
</saturate-strain-number>
<!-- For every strain-number section, create a strain-name section in the same document.
     Its contents is built from the species section contents, a single space, and the
     contents of the targeted strain-number section. createSections is enabled. -->
<strain-name class="Action">
  <target>documents.sections:strain-number</target>
  <action>document.new:section:strain-name(sections:species.contents ^ " " ^ target.contents)</action>
  <createSections/>
</strain-name>
<!-- FileMapper over strain-number and strain-name sections, keyed on section contents.
     The mapping file itself is not set here; the plan exposes it through the "taxaFile"
     parameter alias.
     targetFeatures starts with an empty entry, followed by taxid, canonical-name, path,
     pos and rank.
     NOTE(review): the empty first entry presumably corresponds to the key column of the
     mapping file, which is not stored as a feature; confirm. -->
<match class="FileMapper">
  <target>documents.(sections:strain-number | sections:strain-name)</target>
  <form>contents</form>
  <targetFeatures>,taxid,canonical-name,path,pos,rank</targetFeatures>
</match>
<export>
<!-- Tabular export of every section carrying a taxid feature, written to all.txt in the
     current directory. One line per such section; eight columns with matching headers. -->
<name-match class="TabularExport">
  <outDir>.</outDir>
  <corpusFile>all.txt</corpusFile>
  <lines>documents.sections[@taxid]</lines>
  <!-- Columns, in order: document id, species section contents, full_scientific_name
       section contents, then the name feature, contents, taxid, canonical-name and rank
       of the exported section. -->
  <columns>
    document.@id,
    document.sections:species.contents,
    document.sections:full_scientific_name.contents,
    @name,
    contents,
    @taxid,
    @canonical-name,
    @rank
  </columns>
  <!-- Header labels, one per column above and in the same order. -->
  <headers>
    "ID",
    "SPECIES NAME",
    "FULL NAME",
    "FIELD",
    "NAME",
    "NCBI TAXID",
    "NCBI CANONICAL",
    "NCBI RANK"
  </headers>
</name-match>
<!-- Tabular export to equivalent-strains.txt in the current directory, restricted to
     documents that have at least one section whose rank feature is "no rank". -->
<equivalent-strains class="TabularExport">
  <outDir>.</outDir>
  <corpusFile>equivalent-strains.txt</corpusFile>
  <!-- Only strain-number sections satisfying contents ?= " " are exported:
       exclude ym20-087 kondo51 etc -->
  <lines>documents[sections[@rank == "no rank"]].sections:strain-number[contents ?= " "]</lines>
  <!-- Two columns, separated by ';' in this list:
       1. the strain number, lower-cased, with " " replaced by ":";
       2. the taxid of the first "no rank" section of the document. -->
  <columns separator=";">
    str:replace(str:lower(contents), " ", ":");
    document.sections[@rank == "no rank"]{0}.@taxid
  </columns>
</equivalent-strains>
<!-- Builds an identifier for documents matched at species or subspecies rank, then
     exports their strain numbers and strain names to taxa+id_dsmz.txt. -->
<new-strains>
  <!-- On every document having a section whose rank is "species" or "subspecies", set the
       new-taxid feature from the strain-number section whose contents satisfies
       ^= "DSM": lower-cased, with " " replaced by ":". setFeatures is enabled. -->
  <identifier class="Action">
    <target>documents[sections[@rank == "species" or @rank == "subspecies"]]</target>
    <action>set:feat:new-taxid(str:replace(str:lower(sections:strain-number[contents ^= "DSM"].contents), " ", ":"))</action>
    <setFeatures/>
  </identifier>
  <!-- One line per strain-number or strain-name section of documents that have a
       new-taxid feature. -->
  <export class="TabularExport">
    <outDir>.</outDir>
    <corpusFile>taxa+id_dsmz.txt</corpusFile>
    <lines>documents[@new-taxid].(sections:strain-number|sections:strain-name)</lines>
    <!-- Six columns, separated by ';' in this list:
         1. section contents;
         2. the document new-taxid;
         3. contents of the strain-name section satisfying ?= "DSM";
         4. path of the first section with a taxid, then "/" and the new-taxid;
         5. pos of the first section with a taxid;
         6. the literal "no rank".
         NOTE(review): this looks like the same layout as the mapping file consumed by
         the "match" module (key, taxid, canonical-name, path, pos, rank); confirm. -->
    <columns separator=";">
      contents;
      document.@new-taxid;
      document.sections:strain-name[contents ?= "DSM"].contents;
      document.sections[@taxid]{0}.@path ^ "/" ^ document.@new-taxid;
      document.sections[@taxid]{0}.@pos;
      "no rank"
    </columns>
  </export>
</new-strains>
</export>
</alvisnlp-plan>