From e24f206277091807b8e6b16c0b3c1f9f3787cd12 Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inra.fr> Date: Wed, 7 Apr 2021 14:26:14 +0200 Subject: [PATCH] fixed bug some entries contain tab character, added check step --- .gitignore | 1 - check-taxonomy.plan | 20 ++++++++++++++++++++ dsmz-match.py | 2 +- rewrite-taxonomy.snakefile | 13 +++++++++++++ 4 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 check-taxonomy.plan diff --git a/.gitignore b/.gitignore index 39c31b3..5ef89f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -taxa+id_microorganisms.txt _attic/ output/ .snakemake/ diff --git a/check-taxonomy.plan b/check-taxonomy.plan new file mode 100644 index 0000000..0145527 --- /dev/null +++ b/check-taxonomy.plan @@ -0,0 +1,20 @@ +<alvisnlp-plan id="check-taxonomy"> + <param name="taxo"> + <alias module="read-taxonomy" param="dictFile"/> + </param> + + <param name="finish"> + <alias module="finish" param="corpusFile"/> + </param> + + <read-taxonomy class="TabularProjector"> + <targetLayerName>__dummy</targetLayerName> + <valueFeatures>,taxid,canonical-name,path,pos,rank,species-taxid,species-name</valueFeatures> + </read-taxonomy> + + <finish class="TabularExport"> + <outDir>.</outDir> + <lines>$</lines> + <columns>"checked"</columns> + </finish> +</alvisnlp-plan> diff --git a/dsmz-match.py b/dsmz-match.py index 03379dd..db0a274 100755 --- a/dsmz-match.py +++ b/dsmz-match.py @@ -77,7 +77,7 @@ class BacDiveEntry: for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'): if sne.text is None: continue - yield from (snt.strip() for snt in re.split('[,\n]', sne.text)) + yield from (' '.join(snt.strip().split()) for snt in re.split('[,\n]', sne.text)) desig = self.root.findtext('./taxonomy_name/strains/list-item/designation') if desig is not None and desig != '': for d in re.split('[,;]', desig): diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile index c7a2e2a..f848e6c 100644 --- a/rewrite-taxonomy.snakefile +++ b/rewrite-taxonomy.snakefile @@ -1,5 +1,18 @@ configfile: 'config.yaml' + +rule check: + input: + config['OUTDIR'] + '/taxa+id_microorganisms.txt' + + output: + config['OUTDIR'] + '/finish.txt' + + + shell: + '''{config[ALVISNLP]} -J-Xmx12G -alias taxo {input} -alias finish {output} check-taxonomy.plan''' + + rule microorganisms: output: config['OUTDIR'] + '/taxa+id_microorganisms.txt' -- GitLab