Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
Extended Microorganisms Taxonomy
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
Omnicrobe
Extended Microorganisms Taxonomy
Commits
53d3404d
Commit
53d3404d
authored
4 years ago
by
Robert Bossy
Browse files
Options
Downloads
Patches
Plain Diff
read directly dmp files, improved dispatch, fixed bugs
parent
8caab74c
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
config.yaml
+3
-2
3 additions, 2 deletions
config.yaml
dsmz-match.py
+131
-47
131 additions, 47 deletions
dsmz-match.py
dsmz-match.snakefile
+3
-2
3 additions, 2 deletions
dsmz-match.snakefile
with
137 additions
and
51 deletions
config.yaml
+
3
−
2
View file @
53d3404d
...
...
@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
ALVISNLP
:
'
~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
# taxa+id_microorganisms.txt file
TAXA_FILE
:
'
taxa+id_microorganisms.txt'
# NCBI Taxonomy files
NCBI_NODES_FILE
:
'
ncbi-taxonomy/nodes.dmp'
NCBI_NAMES_FILE
:
'
ncbi-taxonomy/names.dmp'
# Output and working directories
...
...
This diff is collapsed.
Click to expand it.
dsmz-match.py
+
131
−
47
View file @
53d3404d
...
...
@@ -6,17 +6,47 @@ import sys
import
os
import
itertools
import
re
import
collections
def
read_taxa_id
(
filename
):
class NCBITaxon:
    """A node of the NCBI taxonomy: a taxon identifier together with its rank."""

    def __init__(self, taxid, rank):
        # Both values are stored exactly as given (read_ncbi_nodes passes the
        # raw column strings from the nodes file).
        self.taxid = taxid
        self.rank = rank

    def __repr__(self):
        return 'NCBITaxon(%r, %r)' % (self.taxid, self.rank)

    def q(self, taxid=None, rank=None):
        """Tell whether this taxon satisfies the given criteria.

        A criterion left to ``None`` is ignored; every criterion that is
        specified must be equal to the corresponding attribute.
        """
        return (taxid is None or taxid == self.taxid) and (rank is None or rank == self.rank)
class NCBIName:
    """A name attached to a taxon, qualified by its name type."""

    def __init__(self, taxon, name_type):
        # taxon must provide a q(taxid, rank) method (see NCBIName.q).
        self.taxon = taxon
        self.name_type = name_type

    def __repr__(self):
        return 'NCBIName(%r, %r)' % (self.taxon, self.name_type)

    def q(self, taxid=None, rank=None, name_type=None):
        """Tell whether this name satisfies the given criteria.

        ``taxid`` and ``rank`` are delegated to the taxon's own ``q``;
        ``name_type`` is compared with this name's type. A criterion left
        to ``None`` is ignored.
        """
        return self.taxon.q(taxid, rank) and (name_type is None or name_type == self.name_type)
def read_ncbi_nodes(filename):
    """Read a nodes file and return a dict mapping taxid to NCBITaxon.

    Each line is split on the ``'\\t|\\t'`` field separator; the first field
    is the taxid and the third field is the rank. Both are kept as strings.
    If a taxid appears more than once, the last line wins.
    """
    result = {}
    with open(filename) as f:
        for line in f:
            # A blank line carries no record; skipping it avoids an
            # IndexError on the column access below.
            if not line.strip():
                continue
            cols = line.split('\t|\t')
            taxid = cols[0]
            rank = cols[2]
            result[taxid] = NCBITaxon(taxid, rank)
    return result
def read_ncbi_names(nodes, filename):
    """Read a names file and return a dict mapping name to NCBIName.

    ``nodes`` maps taxid to taxon (as returned by read_ncbi_nodes); a taxid
    absent from ``nodes`` raises KeyError. Each line is split on the
    ``'\\t|\\t'`` field separator: first field is the taxid, second the name,
    fourth the name type. If a name appears more than once, the last line
    wins.
    """
    result = {}
    with open(filename) as f:
        for line in f:
            # Drop the line ending first so the record terminator is removed
            # even on a final line without newline or on CRLF input.
            cols = line.rstrip('\r\n').split('\t|\t')
            taxid = cols[0]
            name = cols[1]
            name_type = cols[3]
            if name_type.endswith('\t|'):
                name_type = name_type[:-2]
            result[name] = NCBIName(nodes[taxid], name_type)
    return result
...
...
@@ -44,7 +74,7 @@ class BacDiveEntry:
for
sne
in
self
.
root
.
iterfind
(
'
./strain_availability/strains/list-item/strain_number
'
):
if
sne
.
text
is
None
:
continue
yield
from
(
snt
.
strip
()
for
snt
in
sne
.
text
.
split
(
'
,
'
))
yield
from
(
snt
.
strip
()
for
snt
in
re
.
split
(
'
[,
\n
]
'
,
sne
.
text
))
desig
=
self
.
root
.
findtext
(
'
./taxonomy_name/strains/list-item/designation
'
)
if
desig
is
not
None
and
desig
!=
''
:
for
d
in
re
.
split
(
'
[,;]
'
,
desig
):
...
...
@@ -84,76 +114,130 @@ class BacDiveEntry:
return
'
%s %s
'
%
(
self
.
species
,
sn
)
return
None
def _match(self, ncbi_taxa, s, rank=None, name_type=None):
    """Look up the string ``s`` in ``ncbi_taxa`` and filter the result.

    ``ncbi_taxa`` maps names to objects exposing ``q(rank=..., name_type=...)``.
    Returns the name object when ``s`` is a key and the object satisfies the
    ``rank`` and ``name_type`` criteria (``None`` means "any"); returns
    ``None`` otherwise.
    """
    if s in ncbi_taxa:
        name = ncbi_taxa[s]
        if name.q(rank=rank, name_type=name_type):
            return name
    return None
def _match_first(self, ncbi_taxa, rank=None, name_type=None):
    """Return the first match among this entry's strain numbers and complete names.

    Candidates are tried in order: every item of ``self.strain_number``,
    then every item of ``self.complete_name``. Each one is passed to
    ``self._match`` with the given ``rank`` and ``name_type``; the first
    non-``None`` result is returned. Returns ``None`` when nothing matches.
    """
    for n in itertools.chain(self.strain_number, self.complete_name):
        name = self._match(ncbi_taxa, n, rank, name_type)
        if name is not None:
            return name
    return None
def _dispatch(self, ncbi_taxa):
    """Decide how this entry relates to the NCBI taxonomy.

    Returns a ``(dispatch, taxid)`` pair. The checks are tried in this order
    and the first success wins:

    * ``('no-number', None)`` when the entry has no strain number;
    * ``('equivalent', taxid)`` when a strain number or complete name
      matches a name whose taxon has rank ``'no rank'``;
    * ``('type-material-of-<rank>', taxid)`` when one of them matches a name
      of type ``'type material'`` (``<rank>`` is the matched taxon's rank);
    * ``('append-to-<rank>', taxid)`` when the species, genus, family,
      order, class, phylum or domain of the entry matches a name of the
      corresponding rank, from the most specific to the most general;
    * ``('fail', None)`` when nothing matched.
    """
    if len(self.strain_number) == 0:
        return 'no-number', None

    name = self._match_first(ncbi_taxa, 'no rank')
    if name is not None:
        return 'equivalent', name.taxon.taxid

    name = self._match_first(ncbi_taxa, name_type='type material')
    if name is not None:
        return 'type-material-of-' + name.taxon.rank, name.taxon.taxid

    rank = 'subspecies' if self.is_subspecies else 'species'
    name = self._match(ncbi_taxa, self.species, rank)
    if name is not None:
        return 'append-to-' + rank, name.taxon.taxid

    if rank == 'subspecies':
        # NOTE(review): this looks up self.genus with rank 'species'; it may
        # have been meant to look up the parent species name - confirm.
        name = self._match(ncbi_taxa, self.genus, 'species')
        if name is not None:
            return 'append-to-species', name.taxon.taxid

    # (attribute of the entry, NCBI rank) pairs, most specific first. Every
    # label uses the 'append-to-' prefix, including phylum: the code that
    # consumes the dispatch selects new nodes with startswith('append-to-').
    higher_ranks = (
        ('genus', 'genus'),
        ('family', 'family'),
        ('ordo', 'order'),
        ('class_', 'class'),
        ('phylum', 'phylum'),
        ('domain', 'superkingdom'),
    )
    for attr, parent_rank in higher_ranks:
        name = self._match(ncbi_taxa, getattr(self, attr), parent_rank)
        if name is not None:
            return 'append-to-' + parent_rank, name.taxon.taxid

    return 'fail', None
_
,
NCBI_
TAXA
_FILE
,
INPUT_DIR
,
OUTPUT_DIR
=
sys
.
argv
_
,
NCBI_
NODES_FILE
,
NCBI_NAMES
_FILE
,
INPUT_DIR
,
OUTPUT_DIR
=
sys
.
argv
def of(fn):
    """Open the file named ``fn`` inside ``OUTPUT_DIR`` for writing and return it.

    The file is opened in text mode ``'w'``; the caller is responsible for
    closing it (the script uses the result in a ``with`` statement).
    """
    return open(os.path.join(OUTPUT_DIR, fn), 'w')
def line(f, sep, *cols, tail=None):
    """Write one record to the open file ``f``.

    The values in ``cols`` are converted with ``str`` and joined with
    ``sep``; ``tail``, when given, is written right after the last column;
    a newline ends the record.
    """
    f.write(sep.join(str(i) for i in cols))
    if tail is not None:
        f.write(tail)
    f.write('\n')
sys
.
stderr
.
write
(
'
Reading NCBI taxonomy: %s
\n
'
%
NCBI_
TAXA
_FILE
)
NCBI_TAXA
=
read_
taxa_id
(
NCBI_TAXA
_FILE
)
sys
.
stderr
.
write
(
'
Reading NCBI taxonomy:
%s
%s
\n
'
%
(
NCBI_
NODES_FILE
,
NCBI_NAMES
_FILE
)
)
NCBI_TAXA
=
read_
ncbi_names
(
read_ncbi_nodes
(
NCBI_NODES_FILE
),
NCBI_NAMES
_FILE
)
os
.
makedirs
(
OUTPUT_DIR
,
exist_ok
=
True
)
with
of
(
'
dispatch-report.txt
'
)
as
out_dispatch
,
of
(
'
dsmz-nodes.dmp
'
)
as
out_nodes
,
of
(
'
dsmz-names.dmp
'
)
as
out_names
,
of
(
'
warnings.txt
'
)
as
out_warn
:
sys
.
stderr
.
write
(
'
Reading BacDive entries: %s
\n
'
%
INPUT_DIR
)
count_entries
=
0
count_dispatches
=
collections
.
defaultdict
(
int
)
count_new_nodes
=
0
count_names_of_new_nodes
=
0
count_new_synonyms
=
0
new_ids
=
set
()
for
dirpath
,
_
,
filenames
in
os
.
walk
(
INPUT_DIR
):
for
fn
in
filenames
:
if
not
fn
.
endswith
(
'
.xml
'
):
continue
e
=
BacDiveEntry
(
NCBI_TAXA
,
os
.
path
.
join
(
dirpath
,
fn
))
count_entries
+=
1
count_dispatches
[
e
.
dispatch
]
+=
1
line
(
out_dispatch
,
'
\t
'
,
e
.
filename
,
e
.
dispatch
,
e
.
ncbi_taxid
)
if
e
.
dispatch
==
'
append
'
:
line
(
out_nodes
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
e
.
ncbi_taxid
,
'
no rank
'
,
''
,
0
,
1
,
11
,
1
,
0
,
1
,
1
,
0
)
if
e
.
dispatch
.
startswith
(
'
append-to-
'
):
if
e
.
strain_taxid
in
new_ids
:
line
(
out_warn
,
'
\t
'
,
e
.
filename
,
'
duplicate id
'
+
e
.
strain_taxid
)
continue
new_ids
.
add
(
e
.
strain_taxid
)
line
(
out_nodes
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
# taxid
e
.
ncbi_taxid
,
# parent taxid
'
no rank
'
,
# rank
''
,
# embl code
0
,
# division id
1
,
# inherited div flag
11
,
# genetic code id
1
,
# inherited GC flag
0
,
# mitochondrial genetic code id
1
,
# inherited MGC flag
1
,
# GenBank hidden flag
0
,
# hidden subtree root flag
''
,
# comments
tail
=
'
\t
|
'
)
count_new_nodes
+=
1
if
e
.
canonical
is
None
:
line
(
out_warn
,
'
\t
'
,
e
.
filename
,
'
no canonical
'
)
else
:
for
name
in
itertools
.
chain
(
e
.
strain_number
,
e
.
complete_name
):
line
(
out_names
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
name
,
''
,
'
scientific name
'
if
name
==
e
.
canonical
else
'
strain number
'
)
line
(
out_names
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
name
,
''
,
'
scientific name
'
if
name
==
e
.
canonical
else
'
strain number
'
,
tail
=
'
\t
|
'
)
count_names_of_new_nodes
+=
1
elif
e
.
dispatch
==
'
equivalent
'
or
e
.
dispatch
.
startswith
(
'
type-material-of-
'
):
for
name
in
itertools
.
chain
(
e
.
strain_number
,
e
.
complete_name
):
if
name
not
in
NCBI_TAXA
:
line
(
out_names
,
'
\t
|
\t
'
,
e
.
ncbi_taxid
,
name
,
''
,
'
strain number
'
,
tail
=
'
\t
|
'
)
count_new_synonyms
+=
1
sys
.
stderr
.
write
(
'
Entries: %d
\n
'
%
count_entries
)
sys
.
stderr
.
write
(
'
Dispatches:
\n
'
)
for
c
in
count_dispatches
.
items
():
sys
.
stderr
.
write
(
'
%s: %d
\n
'
%
c
)
sys
.
stderr
.
write
(
'
New nodes: %d
\n
'
%
count_new_nodes
)
sys
.
stderr
.
write
(
'
Names of new nodes: %d
\n
'
%
count_names_of_new_nodes
)
sys
.
stderr
.
write
(
'
New synonyms: %d
\n
'
%
count_new_synonyms
)
This diff is collapsed.
Click to expand it.
dsmz-match.snakefile
+
3
−
2
View file @
53d3404d
...
...
@@ -11,6 +11,7 @@ rule match:
directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
input:
config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
match='./dsmz-match.py',
strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
shell: '''
./dsmz-
match
.py
{config[
TAXA
_FILE]} {input} {output}'''
shell: '''
{input.
match
}
{config[
NCBI_NODES_FILE]} {config[NCBI_NAMES
_FILE]} {input
.strains
} {output}'''
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment