Skip to content
Snippets Groups Projects
Commit 53d3404d authored by Robert Bossy's avatar Robert Bossy
Browse files

read directly dmp files, improved dispatch, fixed bugs

parent 8caab74c
No related branches found
No related tags found
No related merge requests found
......@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
# taxa+id_microorganisms.txt file
TAXA_FILE: 'taxa+id_microorganisms.txt'
# NCBI Taxonomy files
NCBI_NODES_FILE: 'ncbi-taxonomy/nodes.dmp'
NCBI_NAMES_FILE: 'ncbi-taxonomy/names.dmp'
# Output and working directories
......
......@@ -6,17 +6,47 @@ import sys
import os
import itertools
import re
import collections
def read_taxa_id(filename):
class NCBITaxon:
def __init__(self, taxid, rank):
self.taxid = taxid
self.rank = rank
def q(self, taxid=None, rank=None):
return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank)
class NCBIName:
def __init__(self, taxon, name_type):
self.taxon = taxon
self.name_type = name_type
def q(self, taxid=None, rank=None, name_type=None):
return self.taxon.q(taxid, rank) and (name_type is None or name_type == self.name_type)
def read_ncbi_nodes(filename):
result = {}
with open(filename) as f:
for line in f:
cols = line.strip().split('\t')
result[cols[0]] = {
'taxid': cols[1],
'rank': cols[5]
}
cols = line.split('\t|\t')
taxid = cols[0]
rank = cols[2]
result[taxid] = NCBITaxon(taxid, rank)
return result
def read_ncbi_names(nodes, filename):
result = {}
with open(filename) as f:
for line in f:
cols = line.split('\t|\t')
taxid = cols[0]
name = cols[1]
name_type = cols[3].replace('\t|\n', '')
result[name] = NCBIName(nodes[taxid], name_type)
return result
......@@ -44,7 +74,7 @@ class BacDiveEntry:
for sne in self.root.iterfind('./strain_availability/strains/list-item/strain_number'):
if sne.text is None:
continue
yield from (snt.strip() for snt in sne.text.split(','))
yield from (snt.strip() for snt in re.split('[,\n]', sne.text))
desig = self.root.findtext('./taxonomy_name/strains/list-item/designation')
if desig is not None and desig != '':
for d in re.split('[,;]', desig):
......@@ -84,76 +114,130 @@ class BacDiveEntry:
return '%s %s' % (self.species, sn)
return None
def _match(self, ncbi_taxa, s, rank=None):
def _match(self, ncbi_taxa, s, rank=None, name_type=None):
if s in ncbi_taxa:
taxon = ncbi_taxa[s]
if rank is None or taxon['rank'] == rank:
return taxon['taxid']
name = ncbi_taxa[s]
if name.q(rank=rank, name_type=name_type):
return name
return None
def _match_first(self, ncbi_taxa, rank=None, name_type=None):
for n in itertools.chain(self.strain_number, self.complete_name):
name = self._match(ncbi_taxa, n, rank, name_type)
if name is not None:
return name
def _dispatch(self, ncbi_taxa):
if len(self.strain_number) == 0:
return 'no-number', None
for n in itertools.chain(self.strain_number, self.complete_name):
taxid = self._match(ncbi_taxa, n, 'no rank')
if taxid is not None:
return 'equivalent', taxid
for n in itertools.chain(self.strain_number, self.complete_name):
taxid = self._match(ncbi_taxa, n)
if taxid is not None:
return 'type material', taxid
name = self._match_first(ncbi_taxa, 'no rank')
if name is not None:
return 'equivalent', name.taxon.taxid
name = self._match_first(ncbi_taxa, name_type='type material')
if name is not None:
return 'type-material-of-' + name.taxon.rank, name.taxon.taxid
rank = 'subspecies' if self.is_subspecies else 'species'
taxid = self._match(ncbi_taxa, self.species, rank)
if taxid is not None:
return 'append', taxid
taxid = self._match(ncbi_taxa, self.genus, 'genus')
if taxid is not None:
return 'append-species', taxid
taxid = self._match(ncbi_taxa, self.family, 'family')
if taxid is not None:
return 'append-species', taxid
taxid = self._match(ncbi_taxa, self.ordo, 'order')
if taxid is not None:
return 'append-species', taxid
taxid = self._match(ncbi_taxa, self.class_, 'class')
if taxid is not None:
return 'append-species', taxid
taxid = self._match(ncbi_taxa, self.phylum, 'phylum')
if taxid is not None:
return 'append-species', taxid
taxid = self._match(ncbi_taxa, self.domain, 'superkingdom')
if taxid is not None:
return 'append-species', taxid
name = self._match(ncbi_taxa, self.species, rank)
if name is not None:
return 'append-to-' + rank, name.taxon.taxid
if rank == 'subspecies':
name = self._match(ncbi_taxa, self.genus, 'species')
if name is not None:
return 'append-to-species', name.taxon.taxid
name = self._match(ncbi_taxa, self.genus, 'genus')
if name is not None:
return 'append-to-genus', name.taxon.taxid
name = self._match(ncbi_taxa, self.family, 'family')
if name is not None:
return 'append-to-family', name.taxon.taxid
name = self._match(ncbi_taxa, self.ordo, 'order')
if name is not None:
return 'append-to-order', name.taxon.taxid
name = self._match(ncbi_taxa, self.class_, 'class')
if name is not None:
return 'append-to-class', name.taxon.taxid
name = self._match(ncbi_taxa, self.phylum, 'phylum')
if name is not None:
return 'append-phylum', name.taxon.taxid
name = self._match(ncbi_taxa, self.domain, 'superkingdom')
if name is not None:
return 'append-to-superkingdom', name.taxon.taxid
return 'fail', None
_, NCBI_TAXA_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
_, NCBI_NODES_FILE, NCBI_NAMES_FILE, INPUT_DIR, OUTPUT_DIR = sys.argv
def of(fn):
return open(os.path.join(OUTPUT_DIR, fn), 'w')
def line(f, sep, *cols):
def line(f, sep, *cols, tail=None):
f.write(sep.join(str(i) for i in cols))
if tail is not None:
f.write(tail)
f.write('\n')
sys.stderr.write('Reading NCBI taxonomy: %s\n' % NCBI_TAXA_FILE)
NCBI_TAXA = read_taxa_id(NCBI_TAXA_FILE)
sys.stderr.write('Reading NCBI taxonomy: %s %s\n' % (NCBI_NODES_FILE, NCBI_NAMES_FILE))
NCBI_TAXA = read_ncbi_names(read_ncbi_nodes(NCBI_NODES_FILE), NCBI_NAMES_FILE)
os.makedirs(OUTPUT_DIR, exist_ok=True)
with of('dispatch-report.txt') as out_dispatch, of('dsmz-nodes.dmp') as out_nodes, of('dsmz-names.dmp') as out_names, of('warnings.txt') as out_warn:
sys.stderr.write('Reading BacDive entries: %s\n' % INPUT_DIR)
count_entries = 0
count_dispatches = collections.defaultdict(int)
count_new_nodes = 0
count_names_of_new_nodes = 0
count_new_synonyms = 0
new_ids = set()
for dirpath, _, filenames in os.walk(INPUT_DIR):
for fn in filenames:
if not fn.endswith('.xml'):
continue
e = BacDiveEntry(NCBI_TAXA, os.path.join(dirpath, fn))
count_entries += 1
count_dispatches[e.dispatch] += 1
line(out_dispatch, '\t', e.filename, e.dispatch, e.ncbi_taxid)
if e.dispatch == 'append':
line(out_nodes, '\t|\t', e.strain_taxid, e.ncbi_taxid, 'no rank', '', 0, 1, 11, 1, 0, 1, 1, 0)
if e.dispatch.startswith('append-to-'):
if e.strain_taxid in new_ids:
line(out_warn, '\t', e.filename, 'duplicate id ' + e.strain_taxid)
continue
new_ids.add(e.strain_taxid)
line(out_nodes, '\t|\t',
e.strain_taxid, # taxid
e.ncbi_taxid, # parent taxid
'no rank', # rank
'', # embl code
0, # division id
1, # inherited div flag
11, # genetic code id
1, # inherited GC flag
0, # mitochondrial genetic code id
1, # inherited MGC flag
1, # GenBank hidden flag
0, # hidden subtree root flag
'', # comments
tail='\t|')
count_new_nodes += 1
if e.canonical is None:
line(out_warn, '\t', e.filename, 'no canonical')
else:
for name in itertools.chain(e.strain_number, e.complete_name):
line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number')
line(out_names, '\t|\t', e.strain_taxid, name, '', 'scientific name' if name == e.canonical else 'strain number', tail='\t|')
count_names_of_new_nodes += 1
elif e.dispatch == 'equivalent' or e.dispatch.startswith('type-material-of-'):
for name in itertools.chain(e.strain_number, e.complete_name):
if name not in NCBI_TAXA:
line(out_names, '\t|\t', e.ncbi_taxid, name, '', 'strain number', tail='\t|')
count_new_synonyms += 1
sys.stderr.write('Entries: %d\n' % count_entries)
sys.stderr.write('Dispatches:\n')
for c in count_dispatches.items():
sys.stderr.write(' %s: %d\n' % c)
sys.stderr.write('New nodes: %d\n' % count_new_nodes)
sys.stderr.write('Names of new nodes: %d\n' % count_names_of_new_nodes)
sys.stderr.write('New synonyms: %d\n' % count_new_synonyms)
......@@ -11,6 +11,7 @@ rule match:
directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
input:
config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
match='./dsmz-match.py',
strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
shell: '''./dsmz-match.py {config[TAXA_FILE]} {input} {output}'''
shell: '''{input.match} {config[NCBI_NODES_FILE]} {config[NCBI_NAMES_FILE]} {input.strains} {output}'''
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment