Skip to content
Snippets Groups Projects
Commit 53d3404d authored by Robert Bossy's avatar Robert Bossy
Browse files

read directly dmp files, improved dispatch, fixed bugs

parent 8caab74c
No related branches found
No related tags found
No related merge requests found
...@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive' ...@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp' ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
# taxa+id_microorganisms.txt file # NCBI Taxonomy files
TAXA_FILE: 'taxa+id_microorganisms.txt' NCBI_NODES_FILE: 'ncbi-taxonomy/nodes.dmp'
NCBI_NAMES_FILE: 'ncbi-taxonomy/names.dmp'
# Output and working directories # Output and working directories
......
...@@ -6,17 +6,47 @@ import sys ...@@ -6,17 +6,47 @@ import sys
import os import os
import itertools import itertools
import re import re
import collections
def read_taxa_id(filename): class NCBITaxon:
def __init__(self, taxid, rank):
self.taxid = taxid
self.rank = rank
def q(self, taxid=None, rank=None):
return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank)
class NCBIName:
def __init__(self, taxon, name_type):
self.taxon = taxon
self.name_type = name_type
def q(self, taxid=None, rank=None, name_type=None):
return self.taxon.q(taxid, rank) and (name_type is None or name_type == self.name_type)
def read_ncbi_nodes(filename):
result = {} result = {}
with open(filename) as f: with open(filename) as f:
for line in f: for line in f:
cols = line.strip().split('\t') cols = line.split('\t|\t')
result[cols[0]] = { taxid = cols[0]
'taxid': cols[1], rank = cols[2]
'rank': cols[5] result[taxid] = NCBITaxon(taxid, rank)
} return result
def read_ncbi_names(nodes, filename):
result = {}
with open(filename) as f:
for line in f:
cols = line.split('\t|\t')
taxid = cols[0]
name = cols[1]
name_type = cols[3].replace('\t|\n', '')
result[name] = NCBIName(nodes[taxid], name_type)
return result return result
...@@ -44,7 +74,7 @@ class BacDiveEntry: ...@@ -44,7 +74,7 @@ class BacDiveEntry:
for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'): for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'):
if sne.text is None: if sne.text is None:
continue continue
yield from (snt.strip() for snt in sne.text.split(',')) yield from (snt.strip() for snt in re.split('[,\n]', sne.text))
desig = self.root.findtext('./taxonomy_name/strains/list-item/designation') desig = self.root.findtext('./taxonomy_name/strains/list-item/designation')
if desig is not None and desig != '': if desig is not None and desig != '':
for d in re.split('[,;]', desig): for d in re.split('[,;]', desig):
...@@ -84,76 +114,130 @@ class BacDiveEntry: ...@@ -84,76 +114,130 @@ class BacDiveEntry:
return '%s %s' % (self.species, sn) return '%s %s' % (self.species, sn)
return None return None
def _match(self, ncbi_taxa, s, rank=None): def _match(self, ncbi_taxa, s, rank=None, name_type=None):
if s in ncbi_taxa: if s in ncbi_taxa:
taxon = ncbi_taxa[s] name = ncbi_taxa[s]
if rank is None or taxon['rank'] == rank: if name.q(rank=rank, name_type=name_type):
return taxon['taxid'] return name
return None return None
def _match_first(self, ncbi_taxa, rank=None, name_type=None):
for n in itertools.chain(self.strain_number, self.complete_name):
name = self._match(ncbi_taxa, n, rank, name_type)
if name is not None:
return name
def _dispatch(self, ncbi_taxa): def _dispatch(self, ncbi_taxa):
if len(self.strain_number) == 0: if len(self.strain_number) == 0:
return 'no-number', None return 'no-number', None
for n in itertools.chain(self.strain_number, self.complete_name):
taxid = self._match(ncbi_taxa, n, 'no rank') name = self._match_first(ncbi_taxa, 'no rank')
if taxid is not None: if name is not None:
return 'equivalent', taxid return 'equivalent', name.taxon.taxid
for n in itertools.chain(self.strain_number, self.complete_name):
taxid = self._match(ncbi_taxa, n) name = self._match_first(ncbi_taxa, name_type='type material')
if taxid is not None: if name is not None:
return 'type material', taxid return 'type-material-of-' + name.taxon.rank, name.taxon.taxid
rank = 'subspecies' if self.is_subspecies else 'species' rank = 'subspecies' if self.is_subspecies else 'species'
taxid = self._match(ncbi_taxa, self.species, rank) name = self._match(ncbi_taxa, self.species, rank)
if taxid is not None: if name is not None:
return 'append', taxid return 'append-to-' + rank, name.taxon.taxid
taxid = self._match(ncbi_taxa, self.genus, 'genus')
if taxid is not None: if rank == 'subspecies':
return 'append-species', taxid name = self._match(ncbi_taxa, self.genus, 'species')
taxid = self._match(ncbi_taxa, self.family, 'family') if name is not None:
if taxid is not None: return 'append-to-species', name.taxon.taxid
return 'append-species', taxid name = self._match(ncbi_taxa, self.genus, 'genus')
taxid = self._match(ncbi_taxa, self.ordo, 'order') if name is not None:
if taxid is not None: return 'append-to-genus', name.taxon.taxid
return 'append-species', taxid name = self._match(ncbi_taxa, self.family, 'family')
taxid = self._match(ncbi_taxa, self.class_, 'class') if name is not None:
if taxid is not None: return 'append-to-family', name.taxon.taxid
return 'append-species', taxid name = self._match(ncbi_taxa, self.ordo, 'order')
taxid = self._match(ncbi_taxa, self.phylum, 'phylum') if name is not None:
if taxid is not None: return 'append-to-order', name.taxon.taxid
return 'append-species', taxid name = self._match(ncbi_taxa, self.class_, 'class')
taxid = self._match(ncbi_taxa, self.domain, 'superkingdom') if name is not None:
if taxid is not None: return 'append-to-class', name.taxon.taxid
return 'append-species', taxid name = self._match(ncbi_taxa, self.phylum, 'phylum')
if name is not None:
return 'append-phylum', name.taxon.taxid
name = self._match(ncbi_taxa, self.domain, 'superkingdom')
if name is not None:
return 'append-to-superkingdom', name.taxon.taxid
return 'fail', None return 'fail', None
_, NCBI_TAXA_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv _, NCBI_NODES_FILE, NCBI_NAMES_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
def of(fn): def of(fn):
return open(os.path.join(OUTPUT_DIR, fn), 'w') return open(os.path.join(OUTPUT_DIR, fn), 'w')
def line(f, sep, *cols): def line(f, sep, *cols, tail=None):
f.write(sep.join(str(i) for i in cols)) f.write(sep.join(str(i) for i in cols))
if tail is not None:
f.write(tail)
f.write('\n') f.write('\n')
sys.stderr.write('Reading NCBI taxonomy: %s\n' % NCBI_TAXA_FILE) sys.stderr.write('Reading NCBI taxonomy: %s %s\n' % (NCBI_NODES_FILE, NCBI_NAMES_FILE))
NCBI_TAXA = read_taxa_id(NCBI_TAXA_FILE) NCBI_TAXA = read_ncbi_names(read_ncbi_nodes(NCBI_NODES_FILE), NCBI_NAMES_FILE)
os.makedirs(OUTPUT_DIR, exist_ok=True) os.makedirs(OUTPUT_DIR, exist_ok=True)
with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn: with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn:
sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR) sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR)
count_entries = 0
count_dispatches = collections.defaultdict(int)
count_new_nodes = 0
count_names_of_new_nodes = 0
count_new_synonyms = 0
new_ids = set()
for dirpath, _, filenames in os.walk(INPUT_DIR): for dirpath, _, filenames in os.walk(INPUT_DIR):
for fn in filenames: for fn in filenames:
if not fn.endswith('.xml'): if not fn.endswith('.xml'):
continue continue
e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn)) e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn))
count_entries += 1
count_dispatches[e.dispatch] += 1
line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid) line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid)
if e.dispatch == 'append': if e.dispatch.startswith('append-to-'):
line(out_nodes, '\t|\t', e.strain_taxid, e.ncbi_taxid, 'no rank', '', 0, 1, 11, 1, 0, 1, 1, 0) if e.strain_taxid in new_ids:
line(out_warn, '\t', e.filename, 'duplicate id ' + e.strain_taxid)
continue
new_ids.add(e.strain_taxid)
line(out_nodes, '\t|\t',
e.strain_taxid, # taxid
e.ncbi_taxid, # parent taxid
'no rank', # rank
'', # embl code
0, # division id
1, # inherited div flag
11, # genetic code id
1, # inherited GC flag
0, # mitochondrial genetic code id
1, # inherited MGC flag
1, # GenBank hidden flag
0, # hidden subtree root flag
'', # comments
tail='\t|')
count_new_nodes += 1
if e.canonical is None: if e.canonical is None:
line(out_warn, '\t', e.filename, 'no canonical') line(out_warn, '\t', e.filename, 'no canonical')
else: else:
for name in itertools.chain(e.strain_number, e.complete_name): for name in itertools.chain(e.strain_number, e.complete_name):
line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number') line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number', tail='\t|')
count_names_of_new_nodes += 1
elif e.dispatch == 'equivalent' or e.dispatch.startswith('type-material-of-'):
for name in itertools.chain(e.strain_number, e.complete_name):
if name not in NCBI_TAXA:
line(out_names, '\t|\t', e.ncbi_taxid, name, '', 'strain number', tail='\t|')
count_new_synonyms += 1
sys.stderr.write('Entries: %d\n' % count_entries)
sys.stderr.write('Dispatches:\n')
for c in count_dispatches.items():
sys.stderr.write(' %s: %d\n' % c)
sys.stderr.write('New nodes: %d\n' % count_new_nodes)
sys.stderr.write('Names of new nodes: %d\n' % count_names_of_new_nodes)
sys.stderr.write('New synonyms: %d\n' % count_new_synonyms)
...@@ -11,6 +11,7 @@ rule match: ...@@ -11,6 +11,7 @@ rule match:
directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR']) directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
input: input:
config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR'] match='./dsmz-match.py',
strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
shell: '''./dsmz-match.py {config[TAXA_FILE]} {input} {output}''' shell: '''{input.match} {config[NCBI_NODES_FILE]} {config[NCBI_NAMES_FILE]} {input.strains} {output}'''
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment