diff --git a/dsmz-match.py b/dsmz-match.py index 8294da61e1dc5c263114f51282886622876099d4..03379ddf7af2cc20d73cb3de90963a40761a5fcf 100755 --- a/dsmz-match.py +++ b/dsmz-match.py @@ -10,12 +10,13 @@ import collections class NCBITaxon: - def __init__(self, taxid, rank): + def __init__(self, taxid, rank, division_id): self.taxid = taxid self.rank = rank + self.division_id = division_id - def q(self, taxid=None, rank=None): - return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank) + def q(self, taxid=None, rank=None, division_id='0'): + return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank) and (division_id is None or division_id == self.division_id) class NCBIName: @@ -32,9 +33,10 @@ def read_ncbi_nodes(filename): with open(filename) as f: for line in f: cols = line.split('\t|\t') - taxid = cols[0] + taxid = 'ncbi:' + cols[0] rank = cols[2] - result[taxid] = NCBITaxon(taxid, rank) + division_id = cols[4] + result[taxid] = NCBITaxon(taxid, rank, division_id) return result @@ -43,7 +45,7 @@ def read_ncbi_names(nodes, filename): with open(filename) as f: for line in f: cols = line.split('\t|\t') - taxid = cols[0] + taxid = 'ncbi:' + cols[0] name = cols[1] name_type = cols[3].replace('\t|\n', '') result[name] = NCBIName(nodes[taxid], name_type) @@ -53,6 +55,7 @@ def read_ncbi_names(nodes, filename): class BacDiveEntry: def __init__(self, ncbi_taxa, filename): self.filename = filename + self.bacdive_id = os.path.basename(filename).replace('.xml', '') self.root = ET.parse(filename).getroot() self.strain_number = [] self.species = None @@ -66,7 +69,7 @@ class BacDiveEntry: self.strain_number = tuple(self._get_strain_numbers()) self._set_taxonomy() self.complete_name = tuple('%s %s' % (self.species, sn) for sn in self.strain_number) - self.dispatch, self.ncbi_taxid = self._dispatch(ncbi_taxa) + self.dispatch, self.ncbi_taxon = self._dispatch(ncbi_taxa) self.strain_taxid = self._get_strain_taxid() self.canonical = self._get_canonical() @@ -133,39 +136,39 @@ class BacDiveEntry: name = self._match_first(ncbi_taxa, 'no rank') if name is not None: - return 'equivalent', name.taxon.taxid + return 'equivalent', name.taxon name = self._match_first(ncbi_taxa, name_type='type material') if name is not None: - return 'type-material-of-' + name.taxon.rank, name.taxon.taxid + return 'type-material-of-' + name.taxon.rank, name.taxon rank = 'subspecies' if self.is_subspecies else 'species' name = self._match(ncbi_taxa, self.species, rank) if name is not None: - return 'append-to-' + rank, name.taxon.taxid + return 'append-to-' + rank, name.taxon if rank == 'subspecies': name = self._match(ncbi_taxa, self.genus, 'species') if name is not None: - return 'append-to-species', name.taxon.taxid + return 'append-to-species', name.taxon name = self._match(ncbi_taxa, self.genus, 'genus') if name is not None: - return 'append-to-genus', name.taxon.taxid + return 'append-to-genus', name.taxon name = self._match(ncbi_taxa, self.family, 'family') if name is not None: - return 'append-to-family', name.taxon.taxid + return 'append-to-family', name.taxon name = self._match(ncbi_taxa, self.ordo, 'order') if name is not None: - return 'append-to-order', name.taxon.taxid + return 'append-to-order', name.taxon name = self._match(ncbi_taxa, self.class_, 'class') if name is not None: - return 'append-to-class', name.taxon.taxid + return 'append-to-class', name.taxon name = self._match(ncbi_taxa, self.phylum, 'phylum') if name is not None: - return 'append-phylum', name.taxon.taxid + return 'append-phylum', name.taxon name = self._match(ncbi_taxa, self.domain, 'superkingdom') if name is not None: - return 'append-to-superkingdom', name.taxon.taxid + return 'append-to-superkingdom', name.taxon return 'fail', None @@ -186,7 +189,7 @@ def line(f, sep, *cols, tail=None): sys.stderr.write('Reading NCBI taxonomy: %s %s\n' % (NCBI_NODES_FILE, NCBI_NAMES_FILE)) NCBI_TAXA = read_ncbi_names(read_ncbi_nodes(NCBI_NODES_FILE), NCBI_NAMES_FILE) os.makedirs(OUTPUT_DIR, exist_ok=True) -with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn: +with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('bacdive-to-taxid.txt') as out_link, of('warnings.txt') as out_warn: sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR) count_entries = 0 count_dispatches = collections.defaultdict(int) @@ -201,27 +204,30 @@ with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_node e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn)) count_entries += 1 count_dispatches[e.dispatch] += 1 - line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid) + line(out_dispatch, '\t', e.filename, e.dispatch, '' if e.ncbi_taxon is None else e.ncbi_taxon.taxid) if e.dispatch.startswith('append-to-'): if e.strain_taxid in new_ids: line(out_warn, '\t', e.filename, 'duplicate id ' + e.strain_taxid) continue + if e.ncbi_taxon is not None and e.ncbi_taxon.division_id != '0': + line(out_warn, '\t', e.filename, 'not bacteria (div %s) (strain %s) (taxid %s)' % (e.ncbi_taxon.division_id, e.strain_taxid, e.ncbi_taxon.taxid)) new_ids.add(e.strain_taxid) line(out_nodes, '\t|\t', e.strain_taxid, # taxid - e.ncbi_taxid, # parent taxid + e.ncbi_taxon.taxid, # parent taxid 'no rank', # rank '', # embl code - 0, # division id - 1, # inherited div flag + e.ncbi_taxon.division_id, # division id + 1, # inherited div flag 11, # genetic code id - 1, # inherited GC flag - 0, # mitochondrial genetic code id - 1, # inherited MGC flag - 1, # GenBank hidden flag - 0, # hidden subtree root flag + 1, # inherited GC flag + 0, # mitochondrial genetic code id + 1, # inherited MGC flag + 1, # GenBank hidden flag + 0, # hidden subtree root flag '', # comments tail='\t|') + line(out_link, '\t', e.bacdive_id, e.strain_taxid) count_new_nodes += 1 if e.canonical is None: line(out_warn, '\t', e.filename, 'no canonical') @@ -230,9 +236,10 @@ with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_node line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number', tail='\t|') count_names_of_new_nodes += 1 elif e.dispatch == 'equivalent' or e.dispatch.startswith('type-material-of-'): + line(out_link, '\t', e.bacdive_id, e.ncbi_taxon.taxid) for name in itertools.chain(e.strain_number, e.complete_name): if name not in NCBI_TAXA: - line(out_names, '\t|\t', e.ncbi_taxid, name, '', 'strain number', tail='\t|') + line(out_names, '\t|\t', e.ncbi_taxon.taxid, name, '', 'strain number', tail='\t|') count_new_synonyms += 1 sys.stderr.write('Entries: %d\n' % count_entries) sys.stderr.write('Dispatches:\n')