diff --git a/.gitignore b/.gitignore index 39c31b34abd2bd16b9c5af391368a2f712e725a6..5ef89f16a580c0cb8dd994d5abe0ba99c79b925a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -taxa+id_microorganisms.txt _attic/ output/ .snakemake/ diff --git a/check-taxonomy.plan b/check-taxonomy.plan new file mode 100644 index 0000000000000000000000000000000000000000..0145527497a67e6b27b85c4a52774760fd09ec0a --- /dev/null +++ b/check-taxonomy.plan @@ -0,0 +1,20 @@ +<alvisnlp-plan id="check-taxonomy"> + <param name="taxo"> + <alias module="read-taxonomy" param="dictFile"/> + </param> + + <param name="finish"> + <alias module="finish" param="corpusFile"/> + </param> + + <read-taxonomy class="TabularProjector"> + <targetLayerName>__dummy</targetLayerName> + <valueFeatures>,taxid,canonical-name,path,pos,rank,species-taxid,species-name</valueFeatures> + </read-taxonomy> + + <finish class="TabularExport"> + <outDir>.</outDir> + <lines>$</lines> + <columns>"checked"</columns> + </finish> +</alvisnlp-plan> diff --git a/dsmz-match.py b/dsmz-match.py index 03379ddf7af2cc20d73cb3de90963a40761a5fcf..db0a2744e8876b4659c848f03d82c95f6aa78477 100755 --- a/dsmz-match.py +++ b/dsmz-match.py @@ -77,7 +77,7 @@ class BacDiveEntry: for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'): if sne.text is None: continue - yield from (snt.strip() for snt in re.split('[,\n]', sne.text)) + yield from (' '.join(snt.strip().split()) for snt in re.split('[,\n]', sne.text)) desig = self.root.findtext('./taxonomy_name/strains/list-item/designation') if desig is not None and desig != '': for d in re.split('[,;]', desig): diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile index c7a2e2af8277cfebf76c478db245f7aa04aba8d6..f848e6cbf01a6da22e06302a8f721bdc346e1ebd 100644 --- a/rewrite-taxonomy.snakefile +++ b/rewrite-taxonomy.snakefile @@ -1,5 +1,18 @@ configfile: 'config.yaml' + +rule check: + input: + config['OUTDIR'] + '/taxa+id_microorganisms.txt' + + output: + config['OUTDIR'] + '/finish.txt' + + + shell: + '''{config[ALVISNLP]} -J-Xmx12G -alias taxo {input} -alias finish {output} check-taxonomy.plan''' + + rule microorganisms: output: config['OUTDIR'] + '/taxa+id_microorganisms.txt'