read directly dmp files, improved dispatch, fixed bugs

53d3404d · Robert Bossy · 8caab74c · 53d3404d · 53d3404d · 53d3404d
Commit 53d3404d authored 4 years ago by Robert Bossy
--- a/config.yaml
+++ b/config.yaml
@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
 ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
-# taxa+id_microorganisms.txt file
+# NCBI Taxonomy files
-TAXA_FILE: 'taxa+id_microorganisms.txt'
+NCBI_NODES_FILE: 'ncbi-taxonomy/nodes.dmp'
+NCBI_NAMES_FILE: 'ncbi-taxonomy/names.dmp'
 # Output and working directories

--- a/dsmz-match.py
+++ b/dsmz-match.py
@@ -6,17 +6,47 @@ import sys
 import os
 import itertools
 import re
+import collections
-def read_taxa_id(filename):
+class NCBITaxon:
+    def __init__(self, taxid, rank):
+        self.taxid = taxid
+        self.rank = rank
+    def q(self, taxid=None, rank=None):
+        return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank)
+class NCBIName:
+    def __init__(self, taxon, name_type):
+        self.taxon = taxon
+        self.name_type = name_type
+    def q(self, taxid=None, rank=None, name_type=None):
+        return self.taxon.q(taxid, rank) and (name_type is None or name_type == self.name_type)
+def read_ncbi_nodes(filename):
    result = {}
    with open(filename) as f:
        for line in f:
-            cols = line.strip().split('\t')
+            cols = line.split('\t|\t')
-            result[cols[0]] = {
+            taxid = cols[0]
-                'taxid': cols[1],
+            rank = cols[2]
-                'rank': cols[5]
+            result[taxid] = NCBITaxon(taxid, rank)
-            }
+    return result
+def read_ncbi_names(nodes, filename):
+    result = {}
+    with open(filename) as f:
+        for line in f:
+            cols = line.split('\t|\t')
+            taxid = cols[0]
+            name = cols[1]
+            name_type = cols[3].replace('\t|\n', '')
+            result[name] = NCBIName(nodes[taxid], name_type)
    return result
@@ -44,7 +74,7 @@ class BacDiveEntry:
        for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'):
            if sne.text is None:
                continue
-            yield from (snt.strip() for snt in sne.text.split(','))
+            yield from (snt.strip() for snt in re.split('[,\n]', sne.text))
        desig = self.root.findtext('./taxonomy_name/strains/list-item/designation')
        if desig is not None and desig != '':
            for d in re.split('[,;]', desig):
@@ -84,76 +114,130 @@ class BacDiveEntry:
            return '%s %s' % (self.species, sn)
        return None
-    def _match(self, ncbi_taxa, s, rank=None):
+    def _match(self, ncbi_taxa, s, rank=None, name_type=None):
        if s in ncbi_taxa:
-            taxon = ncbi_taxa[s]
+            name = ncbi_taxa[s]
-            if rank is None or taxon['rank'] == rank:
+            if name.q(rank=rank, name_type=name_type):
-                return taxon['taxid']
+                return name
        return None
+    def _match_first(self, ncbi_taxa, rank=None, name_type=None):
+        for n in itertools.chain(self.strain_number, self.complete_name):
+            name = self._match(ncbi_taxa, n, rank, name_type)
+            if name is not None:
+                return name
    def _dispatch(self, ncbi_taxa):
        if len(self.strain_number) == 0:
            return 'no-number', None
-        for n in itertools.chain(self.strain_number, self.complete_name):
-            taxid = self._match(ncbi_taxa, n, 'no rank')
+        name = self._match_first(ncbi_taxa, 'no rank')
-            if taxid is not None:
+        if name is not None:
-                return 'equivalent', taxid
+            return 'equivalent', name.taxon.taxid
-        for n in itertools.chain(self.strain_number, self.complete_name):
-            taxid = self._match(ncbi_taxa, n)
+        name = self._match_first(ncbi_taxa, name_type='type material')
-            if taxid is not None:
+        if name is not None:
-                return 'type material', taxid
+            return 'type-material-of-' + name.taxon.rank, name.taxon.taxid
        rank = 'subspecies' if self.is_subspecies else 'species'
-        taxid = self._match(ncbi_taxa, self.species, rank)
+        name = self._match(ncbi_taxa, self.species, rank)
-        if taxid is not None:
+        if name is not None:
-            return 'append', taxid
+            return 'append-to-' + rank, name.taxon.taxid
-        taxid = self._match(ncbi_taxa, self.genus, 'genus')
-        if taxid is not None:
+        if rank == 'subspecies':
-            return 'append-species', taxid
+            name = self._match(ncbi_taxa, self.genus, 'species')
-        taxid = self._match(ncbi_taxa, self.family, 'family')
+            if name is not None:
-        if taxid is not None:
+                return 'append-to-species', name.taxon.taxid
-            return 'append-species', taxid
+        name = self._match(ncbi_taxa, self.genus, 'genus')
-        taxid = self._match(ncbi_taxa, self.ordo, 'order')
+        if name is not None:
-        if taxid is not None:
+            return 'append-to-genus', name.taxon.taxid
-            return 'append-species', taxid
+        name = self._match(ncbi_taxa, self.family, 'family')
-        taxid = self._match(ncbi_taxa, self.class_, 'class')
+        if name is not None:
-        if taxid is not None:
+            return 'append-to-family', name.taxon.taxid
-            return 'append-species', taxid
+        name = self._match(ncbi_taxa, self.ordo, 'order')
-        taxid = self._match(ncbi_taxa, self.phylum, 'phylum')
+        if name is not None:
-        if taxid is not None:
+            return 'append-to-order', name.taxon.taxid
-            return 'append-species', taxid
+        name = self._match(ncbi_taxa, self.class_, 'class')
-        taxid = self._match(ncbi_taxa, self.domain, 'superkingdom')
+        if name is not None:
-        if taxid is not None:
+            return 'append-to-class', name.taxon.taxid
-            return 'append-species', taxid
+        name = self._match(ncbi_taxa, self.phylum, 'phylum')
+        if name is not None:
+            return 'append-phylum', name.taxon.taxid
+        name = self._match(ncbi_taxa, self.domain, 'superkingdom')
+        if name is not None:
+            return 'append-to-superkingdom', name.taxon.taxid
        return 'fail', None
-_, NCBI_TAXA_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
+_, NCBI_NODES_FILE, NCBI_NAMES_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
 def of(fn):
    return open(os.path.join(OUTPUT_DIR, fn), 'w')
-def line(f, sep, *cols):
+def line(f, sep, *cols, tail=None):
    f.write(sep.join(str(i) for i in cols))
+    if tail is not None:
+        f.write(tail)
    f.write('\n')
-sys.stderr.write('Reading NCBI taxonomy: %s\n' % NCBI_TAXA_FILE)
+sys.stderr.write('Reading NCBI taxonomy: %s %s\n' % (NCBI_NODES_FILE, NCBI_NAMES_FILE))
-NCBI_TAXA = read_taxa_id(NCBI_TAXA_FILE)
+NCBI_TAXA = read_ncbi_names(read_ncbi_nodes(NCBI_NODES_FILE), NCBI_NAMES_FILE)
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn:
    sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR)
+    count_entries = 0
+    count_dispatches = collections.defaultdict(int)
+    count_new_nodes = 0
+    count_names_of_new_nodes = 0
+    count_new_synonyms = 0
+    new_ids = set()
    for dirpath, _, filenames in os.walk(INPUT_DIR):
        for fn in filenames:
            if not fn.endswith('.xml'):
                continue
            e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn))
+            count_entries += 1
+            count_dispatches[e.dispatch] += 1
            line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid)
-            if e.dispatch == 'append':
+            if e.dispatch.startswith('append-to-'):
-                line(out_nodes, '\t|\t', e.strain_taxid, e.ncbi_taxid, 'no rank', '', 0, 1, 11, 1, 0, 1, 1, 0)
+                if e.strain_taxid in new_ids:
+                    line(out_warn, '\t', e.filename, 'duplicate id ' + e.strain_taxid)
+                    continue
+                new_ids.add(e.strain_taxid)
+                line(out_nodes, '\t|\t',
+                     e.strain_taxid,  # taxid
+                     e.ncbi_taxid,  # parent taxid
+                     'no rank',  # rank
+                     '',  # embl code
+                     0,  # division id
+                     1,  # inherited div flag
+                     11,  # genetic code id
+                     1,  # inherited GC  flag
+                     0,  # mitochondrial genetic code id
+                     1,  # inherited MGC flag
+                     1,  # GenBank hidden flag
+                     0,  # hidden subtree root flag
+                     '',  # comments
+                     tail='\t|')
+                count_new_nodes += 1
                if e.canonical is None:
                    line(out_warn, '\t', e.filename, 'no canonical')
                else:
                    for name in itertools.chain(e.strain_number, e.complete_name):
-                        line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number')
+                        line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number', tail='\t|')
+                        count_names_of_new_nodes += 1
+            elif e.dispatch == 'equivalent' or e.dispatch.startswith('type-material-of-'):
+                for name in itertools.chain(e.strain_number, e.complete_name):
+                    if name not in NCBI_TAXA:
+                        line(out_names, '\t|\t', e.ncbi_taxid, name, '', 'strain number', tail='\t|')
+                        count_new_synonyms += 1
+    sys.stderr.write('Entries: %d\n' % count_entries)
+    sys.stderr.write('Dispatches:\n')
+    for c in count_dispatches.items():
+        sys.stderr.write('  %s: %d\n' % c)
+    sys.stderr.write('New nodes: %d\n' % count_new_nodes)
+    sys.stderr.write('Names of new nodes: %d\n' % count_names_of_new_nodes)
+    sys.stderr.write('New synonyms: %d\n' % count_new_synonyms)
--- a/dsmz-match.snakefile
+++ b/dsmz-match.snakefile
@@ -11,6 +11,7 @@ rule match:
        directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
    input:
-        config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
+        match='./dsmz-match.py',
+        strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
-    shell: '''./dsmz-match.py {config[TAXA_FILE]} {input} {output}'''
+    shell: '''{input.match} {config[NCBI_NODES_FILE]} {config[NCBI_NAMES_FILE]} {input.strains} {output}'''