diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile index 003577ea882a3e0618027c6e1c737aada982b27e..ab0b78defff19118f7a0bfe5a2ef952264f2b360 100644 --- a/rewrite-taxonomy.snakefile +++ b/rewrite-taxonomy.snakefile @@ -9,7 +9,8 @@ rule all: config['OUTDIR'] + '/taxa+id_microorganisms.trie', config['OUTDIR'] + '/taxid_full.txt', config['OUTDIR'] + '/taxa+id_full.txt', - config['OUTDIR'] + '/taxa+id_full.trie' + config['OUTDIR'] + '/taxa+id_full.trie', + config['OUTDIR'] + '/microorganisms-roots-paths.txt' rule check: @@ -24,6 +25,17 @@ rule check: '''{config[ALVISNLP]} -J-Xmx24G -alias taxo {input} -alias trie {output} compile-taxonomy.plan''' +rule root_paths: + output: + config['OUTDIR'] + '/microorganisms-roots-paths.txt' + + input: + 'microorganisms-roots.txt' + + shell: + '''cut -f 5 {input} | sed -e '1d' -e '/^$/d' -e 's,$,/,' >{output}''' + + rule microorganisms: output: config['OUTDIR'] + '/{p}_microorganisms.txt' diff --git a/select-taxa.plan b/select-taxa.plan index 2c0cc744442c2f31dd87c46e9e120ad113275439..c3cec8382e5a3daa97b544a764fc835819395e73 100644 --- a/select-taxa.plan +++ b/select-taxa.plan @@ -1,12 +1,21 @@ <alvisnlp-plan id="select-taxa"> <param name="list"> - <alias module="taxids" param="mappingFile"/> + <alias module="tag.taxids" param="mappingFile"/> + <alias module="tag.taxroots" param="mappingFile"/> </param> <param name="name"> <alias module="name" param="featureValue"/> - <alias module="overlaps" param="layerName"/> + </param> + + <param name="column"> + <alias module="tag.taxids" param="keyColumn"/> + <alias module="tag.taxroots" param="keyColumn"/> + </param> + + <param name="mode"> + <alias module="tag" param="select"/> </param> <name class="SetFeature"> @@ -14,12 +23,23 @@ <featureName>select-taxa-name</featureName> </name> - <taxids class="FileMapper"> - <target>documents.sections.layer:taxa</target> - <form>@taxid</form> - <targetFeatures>selected-taxa</targetFeatures> - </taxids> - + <tag> + <select>taxids</select> + + <taxids class="FileMapper"> + <target>documents.sections.layer:taxa</target> + <form>@taxid</form> + <targetFeatures>selected-taxa</targetFeatures> + </taxids> + + <taxroots class="FileMapper"> + <target>documents.sections.layer:taxa</target> + <form>@path ^ "/"</form> + <operator>prefix</operator> + <targetFeatures>selected-taxa</targetFeatures> + </taxroots> + </tag> + <layer class="Action"> <target>documents.sections.layer:taxa[@selected-taxa]</target> <action> @@ -30,6 +50,4 @@ <addToLayer/> <setFeatures/> </layer> - - <ovrelaps class="RemoveOverlaps"/> </alvisnlp-plan> diff --git a/test.plan b/test.plan index 978cdfb615bf259eccab6da2366844acb57f472b..5c879941a11ad38dcd806492c540d70caa926e63 100644 --- a/test.plan +++ b/test.plan @@ -9,8 +9,14 @@ <compiledDict>output/taxa+id_full.trie</compiledDict> </ner-taxa> - <select-microorganisms href="select-taxa.plan"> + <select-microorganisms-taxids href="select-taxa.plan"> <list>output/taxid_microorganisms.txt</list> - <name>microorganisms</name> - </select-microorganisms> + <name>microorganisms-taxids</name> + </select-microorganisms-taxids> + + <select-microorganisms-taxroots href="select-taxa.plan"> + <list>output/microorganisms-roots-paths.txt</list> + <name>microorganisms-taxroots</name> + <mode>taxroots</mode> + </select-microorganisms-taxroots> </alvisnlp-plan>