From e24f206277091807b8e6b16c0b3c1f9f3787cd12 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Wed, 7 Apr 2021 14:26:14 +0200
Subject: [PATCH] Fix bug where some entries contain tab characters; add check
 step

---
 .gitignore                 |  1 -
 check-taxonomy.plan        | 20 ++++++++++++++++++++
 dsmz-match.py              |  2 +-
 rewrite-taxonomy.snakefile | 13 +++++++++++++
 4 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 check-taxonomy.plan

diff --git a/.gitignore b/.gitignore
index 39c31b3..5ef89f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-taxa+id_microorganisms.txt
 _attic/
 output/
 .snakemake/
diff --git a/check-taxonomy.plan b/check-taxonomy.plan
new file mode 100644
index 0000000..0145527
--- /dev/null
+++ b/check-taxonomy.plan
@@ -0,0 +1,20 @@
+<alvisnlp-plan id="check-taxonomy">
+  <param name="taxo">
+    <alias module="read-taxonomy" param="dictFile"/>
+  </param>
+  <!-- Plan parameters: "taxo" (above) is read-taxonomy's dictFile; "finish" (below) is finish's corpusFile. -->
+  <param name="finish">
+    <alias module="finish" param="corpusFile"/>
+  </param>
+  <!-- Load the "taxo" file with TabularProjector; presumably a malformed entry makes this step fail (TODO confirm). -->
+  <read-taxonomy class="TabularProjector">
+    <targetLayerName>__dummy</targetLayerName>
+    <valueFeatures>,taxid,canonical-name,path,pos,rank,species-taxid,species-name</valueFeatures>
+  </read-taxonomy>
+  <!-- Write the "finish" file via TabularExport; its only column is "checked" (presumably a constant marker). -->
+  <finish class="TabularExport">
+    <outDir>.</outDir>
+    <lines>$</lines>
+    <columns>"checked"</columns>
+  </finish>
+</alvisnlp-plan>
diff --git a/dsmz-match.py b/dsmz-match.py
index 03379dd..db0a274 100755
--- a/dsmz-match.py
+++ b/dsmz-match.py
@@ -77,7 +77,7 @@ class BacDiveEntry:
         for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'):
             if sne.text is None:
                 continue
-            yield from (snt.strip() for snt in re.split('[,\n]', sne.text))
+            yield from (' '.join(snt.strip().split()) for snt in re.split('[,\n]', sne.text))
         desig = self.root.findtext('./taxonomy_name/strains/list-item/designation')
         if desig is not None and desig != '':
             for d in re.split('[,;]', desig):
diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile
index c7a2e2a..f848e6c 100644
--- a/rewrite-taxonomy.snakefile
+++ b/rewrite-taxonomy.snakefile
@@ -1,5 +1,18 @@
 configfile: 'config.yaml'
 
+# Check step: run check-taxonomy.plan over the taxonomy file that rule microorganisms outputs.
+rule check:
+    input:
+        config['OUTDIR'] + '/taxa+id_microorganisms.txt'
+
+    output:
+        config['OUTDIR'] + '/finish.txt'
+    # Bind the plan's "taxo" alias to the input file and its "finish" alias to the output file.
+    # NOTE(review): -J-Xmx12G presumably sets a 12G JVM heap for AlvisNLP; confirm.
+    shell:
+        '''{config[ALVISNLP]} -J-Xmx12G -alias taxo {input} -alias finish {output} check-taxonomy.plan'''
+
+
 rule microorganisms:
     output:
         config['OUTDIR'] + '/taxa+id_microorganisms.txt'
-- 
GitLab