From 53d3404db67ae94c4bd2378c70bd29007e09473b Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Wed, 31 Mar 2021 18:43:18 +0200
Subject: [PATCH] read directly dmp files, improved dispatch, fixed bugs

---
 config.yaml          |   5 +-
 dsmz-match.py        | 178 +++++++++++++++++++++++++++++++------------
 dsmz-match.snakefile |   5 +-
 3 files changed, 137 insertions(+), 51 deletions(-)

diff --git a/config.yaml b/config.yaml
index 2e50341..42faed3 100644
--- a/config.yaml
+++ b/config.yaml
@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
 ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
 
 
-# taxa+id_microorganisms.txt file
-TAXA_FILE: 'taxa+id_microorganisms.txt'
+# NCBI Taxonomy files
+NCBI_NODES_FILE: 'ncbi-taxonomy/nodes.dmp'
+NCBI_NAMES_FILE: 'ncbi-taxonomy/names.dmp'
 
 
 # Output and working directories
diff --git a/dsmz-match.py b/dsmz-match.py
index 685aea9..8294da6 100755
--- a/dsmz-match.py
+++ b/dsmz-match.py
@@ -6,17 +6,47 @@ import sys
 import os
 import itertools
 import re
+import collections
 
 
-def read_taxa_id(filename):
+class NCBITaxon:
+    def __init__(self, taxid, rank):
+        self.taxid = taxid
+        self.rank = rank
+
+    def q(self, taxid=None, rank=None):
+        return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank)
+
+
+class NCBIName:
+    def __init__(self, taxon, name_type):
+        self.taxon = taxon
+        self.name_type = name_type
+
+    def q(self, taxid=None, rank=None, name_type=None):
+        return self.taxon.q(taxid, rank) and (name_type is None or name_type == self.name_type)
+
+
+def read_ncbi_nodes(filename):
     result = {}
     with open(filename) as f:
         for line in f:
-            cols = line.strip().split('\t')
-            result[cols[0]] = {
-                'taxid': cols[1],
-                'rank': cols[5]
-            }
+            cols = line.split('\t|\t')
+            taxid = cols[0]
+            rank = cols[2]
+            result[taxid] = NCBITaxon(taxid, rank)
+    return result
+
+
+def read_ncbi_names(nodes, filename):
+    result = {}
+    with open(filename) as f:
+        for line in f:
+            cols = line.split('\t|\t')
+            taxid = cols[0]
+            name = cols[1]
+            name_type = cols[3].replace('\t|\n', '')
+            result[name] = NCBIName(nodes[taxid], name_type)
     return result
 
 
@@ -44,7 +74,7 @@ class BacDiveEntry:
         for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'):
             if sne.text is None:
                 continue
-            yield from (snt.strip() for snt in sne.text.split(','))
+            yield from (snt.strip() for snt in re.split('[,\n]', sne.text))
         desig = self.root.findtext('./taxonomy_name/strains/list-item/designation')
         if desig is not None and desig != '':
             for d in re.split('[,;]', desig):
@@ -84,76 +114,130 @@ class BacDiveEntry:
             return '%s %s' % (self.species, sn)
         return None
 
-    def _match(self, ncbi_taxa, s, rank=None):
+    def _match(self, ncbi_taxa, s, rank=None, name_type=None):
         if s in ncbi_taxa:
-            taxon = ncbi_taxa[s]
-            if rank is None or taxon['rank'] == rank:
-                return taxon['taxid']
+            name = ncbi_taxa[s]
+            if name.q(rank=rank, name_type=name_type):
+                return name
         return None
 
+    def _match_first(self, ncbi_taxa, rank=None, name_type=None):
+        for n in itertools.chain(self.strain_number, self.complete_name):
+            name = self._match(ncbi_taxa, n, rank, name_type)
+            if name is not None:
+                return name
+
     def _dispatch(self, ncbi_taxa):
         if len(self.strain_number) == 0:
             return 'no-number', None
-        for n in itertools.chain(self.strain_number, self.complete_name):
-            taxid = self._match(ncbi_taxa, n, 'no rank')
-            if taxid is not None:
-                return 'equivalent', taxid
-        for n in itertools.chain(self.strain_number, self.complete_name):
-            taxid = self._match(ncbi_taxa, n)
-            if taxid is not None:
-                return 'type material', taxid
+
+        name = self._match_first(ncbi_taxa, 'no rank')
+        if name is not None:
+            return 'equivalent', name.taxon.taxid
+
+        name = self._match_first(ncbi_taxa, name_type='type material')
+        if name is not None:
+            return 'type-material-of-' + name.taxon.rank, name.taxon.taxid
+
         rank = 'subspecies' if self.is_subspecies else 'species'
-        taxid = self._match(ncbi_taxa, self.species, rank)
-        if taxid is not None:
-            return 'append', taxid
-        taxid = self._match(ncbi_taxa, self.genus, 'genus')
-        if taxid is not None:
-            return 'append-species', taxid
-        taxid = self._match(ncbi_taxa, self.family, 'family')
-        if taxid is not None:
-            return 'append-species', taxid
-        taxid = self._match(ncbi_taxa, self.ordo, 'order')
-        if taxid is not None:
-            return 'append-species', taxid
-        taxid = self._match(ncbi_taxa, self.class_, 'class')
-        if taxid is not None:
-            return 'append-species', taxid
-        taxid = self._match(ncbi_taxa, self.phylum, 'phylum')
-        if taxid is not None:
-            return 'append-species', taxid
-        taxid = self._match(ncbi_taxa, self.domain, 'superkingdom')
-        if taxid is not None:
-            return 'append-species', taxid
+        name = self._match(ncbi_taxa, self.species, rank)
+        if name is not None:
+            return 'append-to-' + rank, name.taxon.taxid
+
+        if rank == 'subspecies':
+            name = self._match(ncbi_taxa, self.genus, 'species')
+            if name is not None:
+                return 'append-to-species', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.genus, 'genus')
+        if name is not None:
+            return 'append-to-genus', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.family, 'family')
+        if name is not None:
+            return 'append-to-family', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.ordo, 'order')
+        if name is not None:
+            return 'append-to-order', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.class_, 'class')
+        if name is not None:
+            return 'append-to-class', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.phylum, 'phylum')
+        if name is not None:
+            return 'append-phylum', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.domain, 'superkingdom')
+        if name is not None:
+            return 'append-to-superkingdom', name.taxon.taxid
         return 'fail', None
 
 
-_, NCBI_TAXA_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
+_, NCBI_NODES_FILE, NCBI_NAMES_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
 
 
 def of(fn):
     return open(os.path.join(OUTPUT_DIR, fn), 'w')
 
 
-def line(f, sep, *cols):
+def line(f, sep, *cols, tail=None):
     f.write(sep.join(str(i) for i in cols))
+    if tail is not None:
+        f.write(tail)
     f.write('\n')
 
 
-sys.stderr.write('Reading NCBI taxonomy: %s\n' % NCBI_TAXA_FILE)
-NCBI_TAXA = read_taxa_id(NCBI_TAXA_FILE)
+sys.stderr.write('Reading NCBI taxonomy: %s %s\n' % (NCBI_NODES_FILE, NCBI_NAMES_FILE))
+NCBI_TAXA = read_ncbi_names(read_ncbi_nodes(NCBI_NODES_FILE), NCBI_NAMES_FILE)
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn:
     sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR)
+    count_entries = 0
+    count_dispatches = collections.defaultdict(int)
+    count_new_nodes = 0
+    count_names_of_new_nodes = 0
+    count_new_synonyms = 0
+    new_ids = set()
     for dirpath, _, filenames in os.walk(INPUT_DIR):
         for fn in filenames:
             if not fn.endswith('.xml'):
                 continue
             e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn))
+            count_entries += 1
+            count_dispatches[e.dispatch] += 1
             line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid)
-            if e.dispatch == 'append':
-                line(out_nodes, '\t|\t', e.strain_taxid, e.ncbi_taxid, 'no rank', '', 0, 1, 11, 1, 0, 1, 1, 0)
+            if e.dispatch.startswith('append-to-'):
+                if e.strain_taxid in new_ids:
+                    line(out_warn, '\t', e.filename, 'duplicate id ' + e.strain_taxid)
+                    continue
+                new_ids.add(e.strain_taxid)
+                line(out_nodes, '\t|\t',
+                     e.strain_taxid,  # taxid
+                     e.ncbi_taxid,  # parent taxid
+                     'no rank',  # rank
+                     '',  # embl code
+                     0,  # division id
+                     1,  # inherited div flag
+                     11,  # genetic code id
+                     1,  # inherited GC  flag
+                     0,  # mitochondrial genetic code id
+                     1,  # inherited MGC flag
+                     1,  # GenBank hidden flag
+                     0,  # hidden subtree root flag
+                     '',  # comments
+                     tail='\t|')
+                count_new_nodes += 1
                 if e.canonical is None:
                     line(out_warn, '\t', e.filename, 'no canonical')
                 else:
                     for name in itertools.chain(e.strain_number, e.complete_name):
-                        line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number')
+                        line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number', tail='\t|')
+                        count_names_of_new_nodes += 1
+            elif e.dispatch == 'equivalent' or e.dispatch.startswith('type-material-of-'):
+                for name in itertools.chain(e.strain_number, e.complete_name):
+                    if name not in NCBI_TAXA:
+                        line(out_names, '\t|\t', e.ncbi_taxid, name, '', 'strain number', tail='\t|')
+                        count_new_synonyms += 1
+    sys.stderr.write('Entries: %d\n' % count_entries)
+    sys.stderr.write('Dispatches:\n')
+    for c in count_dispatches.items():
+        sys.stderr.write('  %s: %d\n' % c)
+    sys.stderr.write('New nodes: %d\n' % count_new_nodes)
+    sys.stderr.write('Names of new nodes: %d\n' % count_names_of_new_nodes)
+    sys.stderr.write('New synonyms: %d\n' % count_new_synonyms)
diff --git a/dsmz-match.snakefile b/dsmz-match.snakefile
index a5fba2e..832e9bb 100644
--- a/dsmz-match.snakefile
+++ b/dsmz-match.snakefile
@@ -11,6 +11,7 @@ rule match:
         directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
 
     input:
-        config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
+        match='./dsmz-match.py',
+        strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
 
-    shell: '''./dsmz-match.py {config[TAXA_FILE]} {input} {output}'''
+    shell: '''{input.match} {config[NCBI_NODES_FILE]} {config[NCBI_NAMES_FILE]} {input.strains} {output}'''
-- 
GitLab