From 53d3404db67ae94c4bd2378c70bd29007e09473b Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inra.fr> Date: Wed, 31 Mar 2021 18:43:18 +0200 Subject: [PATCH] read directly dmp files, improved dispatch, fixed bugs --- config.yaml | 5 +- dsmz-match.py | 178 +++++++++++++++++++++++++++++++------------ dsmz-match.snakefile | 5 +- 3 files changed, 137 insertions(+), 51 deletions(-) diff --git a/config.yaml b/config.yaml index 2e50341..42faed3 100644 --- a/config.yaml +++ b/config.yaml @@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive' ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp' -# taxa+id_microorganisms.txt file -TAXA_FILE: 'taxa+id_microorganisms.txt' +# NCBI Taxonomy files +NCBI_NODES_FILE: 'ncbi-taxonomy/nodes.dmp' +NCBI_NAMES_FILE: 'ncbi-taxonomy/names.dmp' # Output and working directories diff --git a/dsmz-match.py b/dsmz-match.py index 685aea9..8294da6 100755 --- a/dsmz-match.py +++ b/dsmz-match.py @@ -6,17 +6,47 @@ import sys import os import itertools import re +import collections -def read_taxa_id(filename): +class NCBITaxon: + def __init__(self, taxid, rank): + self.taxid = taxid + self.rank = rank + + def q(self, taxid=None, rank=None): + return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank) + + +class NCBIName: + def __init__(self, taxon, name_type): + self.taxon = taxon + self.name_type = name_type + + def q(self, taxid=None, rank=None, name_type=None): + return self.taxon.q(taxid, rank) and (name_type is None or name_type == self.name_type) + + +def read_ncbi_nodes(filename): result = {} with open(filename) as f: for line in f: - cols = line.strip().split('\t') - result[cols[0]] = { - 'taxid': cols[1], - 'rank': cols[5] - } + cols = line.split('\t|\t') + taxid = cols[0] + rank = cols[2] + result[taxid] = NCBITaxon(taxid, rank) + return result + + +def read_ncbi_names(nodes, filename): + result = {} + with open(filename) as f: + for line in f: + cols = line.split('\t|\t') + taxid = cols[0] + name = cols[1] + name_type = cols[3].replace('\t|\n', '') + result[name] = NCBIName(nodes[taxid], name_type) return result @@ -44,7 +74,7 @@ class BacDiveEntry: for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'): if sne.text is None: continue - yield from (snt.strip() for snt in sne.text.split(',')) + yield from (snt.strip() for snt in re.split('[,\n]', sne.text)) desig = self.root.findtext('./taxonomy_name/strains/list-item/designation') if desig is not None and desig != '': for d in re.split('[,;]', desig): @@ -84,76 +114,130 @@ class BacDiveEntry: return '%s %s' % (self.species, sn) return None - def _match(self, ncbi_taxa, s, rank=None): + def _match(self, ncbi_taxa, s, rank=None, name_type=None): if s in ncbi_taxa: - taxon = ncbi_taxa[s] - if rank is None or taxon['rank'] == rank: - return taxon['taxid'] + name = ncbi_taxa[s] + if name.q(rank=rank, name_type=name_type): + return name return None + def _match_first(self, ncbi_taxa, rank=None, name_type=None): + for n in itertools.chain(self.strain_number, self.complete_name): + name = self._match(ncbi_taxa, n, rank, name_type) + if name is not None: + return name + def _dispatch(self, ncbi_taxa): if len(self.strain_number) == 0: return 'no-number', None - for n in itertools.chain(self.strain_number, self.complete_name): - taxid = self._match(ncbi_taxa, n, 'no rank') - if taxid is not None: - return 'equivalent', taxid - for n in itertools.chain(self.strain_number, self.complete_name): - taxid = self._match(ncbi_taxa, n) - if taxid is not None: - return 'type material', taxid + + name = self._match_first(ncbi_taxa, 'no rank') + if name is not None: + return 'equivalent', name.taxon.taxid + + name = self._match_first(ncbi_taxa, name_type='type material') + if name is not None: + return 'type-material-of-' + name.taxon.rank, name.taxon.taxid + rank = 'subspecies' if self.is_subspecies else 'species' - taxid = self._match(ncbi_taxa, self.species, rank) - if taxid is not None: - return 'append', taxid - taxid = self._match(ncbi_taxa, self.genus, 'genus') - if taxid is not None: - return 'append-species', taxid - taxid = self._match(ncbi_taxa, self.family, 'family') - if taxid is not None: - return 'append-species', taxid - taxid = self._match(ncbi_taxa, self.ordo, 'order') - if taxid is not None: - return 'append-species', taxid - taxid = self._match(ncbi_taxa, self.class_, 'class') - if taxid is not None: - return 'append-species', taxid - taxid = self._match(ncbi_taxa, self.phylum, 'phylum') - if taxid is not None: - return 'append-species', taxid - taxid = self._match(ncbi_taxa, self.domain, 'superkingdom') - if taxid is not None: - return 'append-species', taxid + name = self._match(ncbi_taxa, self.species, rank) + if name is not None: + return 'append-to-' + rank, name.taxon.taxid + + if rank == 'subspecies': + name = self._match(ncbi_taxa, self.genus, 'species') + if name is not None: + return 'append-to-species', name.taxon.taxid + name = self._match(ncbi_taxa, self.genus, 'genus') + if name is not None: + return 'append-to-genus', name.taxon.taxid + name = self._match(ncbi_taxa, self.family, 'family') + if name is not None: + return 'append-to-family', name.taxon.taxid + name = self._match(ncbi_taxa, self.ordo, 'order') + if name is not None: + return 'append-to-order', name.taxon.taxid + name = self._match(ncbi_taxa, self.class_, 'class') + if name is not None: + return 'append-to-class', name.taxon.taxid + name = self._match(ncbi_taxa, self.phylum, 'phylum') + if name is not None: + return 'append-phylum', name.taxon.taxid + name = self._match(ncbi_taxa, self.domain, 'superkingdom') + if name is not None: + return 'append-to-superkingdom', name.taxon.taxid return 'fail', None -_, NCBI_TAXA_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv +_, NCBI_NODES_FILE, NCBI_NAMES_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv def of(fn): return open(os.path.join(OUTPUT_DIR, fn), 'w') -def line(f, sep, *cols): +def line(f, sep, *cols, tail=None): f.write(sep.join(str(i) for i in cols)) + if tail is not None: + f.write(tail) f.write('\n') -sys.stderr.write('Reading NCBI taxonomy: %s\n' % NCBI_TAXA_FILE) -NCBI_TAXA = read_taxa_id(NCBI_TAXA_FILE) +sys.stderr.write('Reading NCBI taxonomy: %s %s\n' % (NCBI_NODES_FILE, NCBI_NAMES_FILE)) +NCBI_TAXA = read_ncbi_names(read_ncbi_nodes(NCBI_NODES_FILE), NCBI_NAMES_FILE) os.makedirs(OUTPUT_DIR, exist_ok=True) with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn: sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR) + count_entries = 0 + count_dispatches = collections.defaultdict(int) + count_new_nodes = 0 + count_names_of_new_nodes = 0 + count_new_synonyms = 0 + new_ids = set() for dirpath, _, filenames in os.walk(INPUT_DIR): for fn in filenames: if not fn.endswith('.xml'): continue e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn)) + count_entries += 1 + count_dispatches[e.dispatch] += 1 line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid) - if e.dispatch == 'append': - line(out_nodes, '\t|\t', e.strain_taxid, e.ncbi_taxid, 'no rank', '', 0, 1, 11, 1, 0, 1, 1, 0) + if e.dispatch.startswith('append-to-'): + if e.strain_taxid in new_ids: + line(out_warn, '\t', e.filename, 'duplicate id ' + e.strain_taxid) + continue + new_ids.add(e.strain_taxid) + line(out_nodes, '\t|\t', + e.strain_taxid, # taxid + e.ncbi_taxid, # parent taxid + 'no rank', # rank + '', # embl code + 0, # division id + 1, # inherited div flag + 11, # genetic code id + 1, # inherited GC flag + 0, # mitochondrial genetic code id + 1, # inherited MGC flag + 1, # GenBank hidden flag + 0, # hidden subtree root flag + '', # comments + tail='\t|') + count_new_nodes += 1 if e.canonical is None: line(out_warn, '\t', e.filename, 'no canonical') else: for name in itertools.chain(e.strain_number, e.complete_name): - line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number') + line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number', tail='\t|') + count_names_of_new_nodes += 1 + elif e.dispatch == 'equivalent' or e.dispatch.startswith('type-material-of-'): + for name in itertools.chain(e.strain_number, e.complete_name): + if name not in NCBI_TAXA: + line(out_names, '\t|\t', e.ncbi_taxid, name, '', 'strain number', tail='\t|') + count_new_synonyms += 1 + sys.stderr.write('Entries: %d\n' % count_entries) + sys.stderr.write('Dispatches:\n') + for c in count_dispatches.items(): + sys.stderr.write(' %s: %d\n' % c) + sys.stderr.write('New nodes: %d\n' % count_new_nodes) + sys.stderr.write('Names of new nodes: %d\n' % count_names_of_new_nodes) + sys.stderr.write('New synonyms: %d\n' % count_new_synonyms) diff --git a/dsmz-match.snakefile b/dsmz-match.snakefile index a5fba2e..832e9bb 100644 --- a/dsmz-match.snakefile +++ b/dsmz-match.snakefile @@ -11,6 +11,7 @@ rule match: directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR']) input: - config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR'] + match='./dsmz-match.py', + strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR'] - shell: '''./dsmz-match.py {config[TAXA_FILE]} {input} {output}''' + shell: '''{input.match} {config[NCBI_NODES_FILE]} {config[NCBI_NAMES_FILE]} {input.strains} {output}''' -- GitLab