Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
Extended Microorganisms Taxonomy
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
A compter du 1er avril, attention à vos pipelines :
Nouvelles limitations de Docker Hub
Show more breadcrumbs
Omnicrobe
Extended Microorganisms Taxonomy
Commits
53d3404d
Commit
53d3404d
authored
4 years ago
by
Robert Bossy
Browse files
Options
Downloads
Patches
Plain Diff
read directly dmp files, improved dispatch, fixed bugs
parent
8caab74c
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
config.yaml
+3
-2
3 additions, 2 deletions
config.yaml
dsmz-match.py
+131
-47
131 additions, 47 deletions
dsmz-match.py
dsmz-match.snakefile
+3
-2
3 additions, 2 deletions
dsmz-match.snakefile
with
137 additions
and
51 deletions
config.yaml
+
3
−
2
View file @
53d3404d
...
@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
...
@@ -7,8 +7,9 @@ BACDIVE_PASSWORD_FILE: '.bacdive'
ALVISNLP
:
'
~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
ALVISNLP
:
'
~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
# taxa+id_microorganisms.txt file
# NCBI Taxonomy files
TAXA_FILE
:
'
taxa+id_microorganisms.txt'
NCBI_NODES_FILE
:
'
ncbi-taxonomy/nodes.dmp'
NCBI_NAMES_FILE
:
'
ncbi-taxonomy/names.dmp'
# Output and working directories
# Output and working directories
...
...
This diff is collapsed.
Click to expand it.
dsmz-match.py
+
131
−
47
View file @
53d3404d
...
@@ -6,17 +6,47 @@ import sys
...
@@ -6,17 +6,47 @@ import sys
import
os
import
os
import
itertools
import
itertools
import
re
import
re
import
collections
def
read_taxa_id
(
filename
):
class
NCBITaxon
:
def
__init__
(
self
,
taxid
,
rank
):
self
.
taxid
=
taxid
self
.
rank
=
rank
def
q
(
self
,
taxid
=
None
,
rank
=
None
):
return
(
taxid
is
None
or
taxid
==
self
.
taxid
)
and
(
rank
is
None
or
rank
==
self
.
rank
)
class
NCBIName
:
def
__init__
(
self
,
taxon
,
name_type
):
self
.
taxon
=
taxon
self
.
name_type
=
name_type
def
q
(
self
,
taxid
=
None
,
rank
=
None
,
name_type
=
None
):
return
self
.
taxon
.
q
(
taxid
,
rank
)
and
(
name_type
is
None
or
name_type
==
self
.
name_type
)
def
read_ncbi_nodes
(
filename
):
result
=
{}
result
=
{}
with
open
(
filename
)
as
f
:
with
open
(
filename
)
as
f
:
for
line
in
f
:
for
line
in
f
:
cols
=
line
.
strip
().
split
(
'
\t
'
)
cols
=
line
.
split
(
'
\t
|
\t
'
)
result
[
cols
[
0
]]
=
{
taxid
=
cols
[
0
]
'
taxid
'
:
cols
[
1
],
rank
=
cols
[
2
]
'
rank
'
:
cols
[
5
]
result
[
taxid
]
=
NCBITaxon
(
taxid
,
rank
)
}
return
result
def
read_ncbi_names
(
nodes
,
filename
):
result
=
{}
with
open
(
filename
)
as
f
:
for
line
in
f
:
cols
=
line
.
split
(
'
\t
|
\t
'
)
taxid
=
cols
[
0
]
name
=
cols
[
1
]
name_type
=
cols
[
3
].
replace
(
'
\t
|
\n
'
,
''
)
result
[
name
]
=
NCBIName
(
nodes
[
taxid
],
name_type
)
return
result
return
result
...
@@ -44,7 +74,7 @@ class BacDiveEntry:
...
@@ -44,7 +74,7 @@ class BacDiveEntry:
for
sne
in
self
.
root
.
iterfind
(
'
./strain_availability/strains/list-item/strain_number
'
):
for
sne
in
self
.
root
.
iterfind
(
'
./strain_availability/strains/list-item/strain_number
'
):
if
sne
.
text
is
None
:
if
sne
.
text
is
None
:
continue
continue
yield
from
(
snt
.
strip
()
for
snt
in
sne
.
text
.
split
(
'
,
'
))
yield
from
(
snt
.
strip
()
for
snt
in
re
.
split
(
'
[,
\n
]
'
,
sne
.
text
))
desig
=
self
.
root
.
findtext
(
'
./taxonomy_name/strains/list-item/designation
'
)
desig
=
self
.
root
.
findtext
(
'
./taxonomy_name/strains/list-item/designation
'
)
if
desig
is
not
None
and
desig
!=
''
:
if
desig
is
not
None
and
desig
!=
''
:
for
d
in
re
.
split
(
'
[,;]
'
,
desig
):
for
d
in
re
.
split
(
'
[,;]
'
,
desig
):
...
@@ -84,76 +114,130 @@ class BacDiveEntry:
...
@@ -84,76 +114,130 @@ class BacDiveEntry:
return
'
%s %s
'
%
(
self
.
species
,
sn
)
return
'
%s %s
'
%
(
self
.
species
,
sn
)
return
None
return
None
def
_match
(
self
,
ncbi_taxa
,
s
,
rank
=
None
):
def
_match
(
self
,
ncbi_taxa
,
s
,
rank
=
None
,
name_type
=
None
):
if
s
in
ncbi_taxa
:
if
s
in
ncbi_taxa
:
taxon
=
ncbi_taxa
[
s
]
name
=
ncbi_taxa
[
s
]
if
rank
is
None
or
taxon
[
'
rank
'
]
==
rank
:
if
name
.
q
(
rank
=
rank
,
name_type
=
name_type
)
:
return
taxon
[
'
taxid
'
]
return
name
return
None
return
None
def
_match_first
(
self
,
ncbi_taxa
,
rank
=
None
,
name_type
=
None
):
for
n
in
itertools
.
chain
(
self
.
strain_number
,
self
.
complete_name
):
name
=
self
.
_match
(
ncbi_taxa
,
n
,
rank
,
name_type
)
if
name
is
not
None
:
return
name
def
_dispatch
(
self
,
ncbi_taxa
):
def
_dispatch
(
self
,
ncbi_taxa
):
if
len
(
self
.
strain_number
)
==
0
:
if
len
(
self
.
strain_number
)
==
0
:
return
'
no-number
'
,
None
return
'
no-number
'
,
None
for
n
in
itertools
.
chain
(
self
.
strain_number
,
self
.
complete_name
):
taxid
=
self
.
_match
(
ncbi_taxa
,
n
,
'
no rank
'
)
name
=
self
.
_match_first
(
ncbi_taxa
,
'
no rank
'
)
if
taxid
is
not
None
:
if
name
is
not
None
:
return
'
equivalent
'
,
taxid
return
'
equivalent
'
,
name
.
taxon
.
taxid
for
n
in
itertools
.
chain
(
self
.
strain_number
,
self
.
complete_name
):
taxid
=
self
.
_match
(
ncbi_taxa
,
n
)
name
=
self
.
_match_first
(
ncbi_taxa
,
name_type
=
'
type material
'
)
if
taxid
is
not
None
:
if
name
is
not
None
:
return
'
type material
'
,
taxid
return
'
type-material-of-
'
+
name
.
taxon
.
rank
,
name
.
taxon
.
taxid
rank
=
'
subspecies
'
if
self
.
is_subspecies
else
'
species
'
rank
=
'
subspecies
'
if
self
.
is_subspecies
else
'
species
'
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
species
,
rank
)
name
=
self
.
_match
(
ncbi_taxa
,
self
.
species
,
rank
)
if
taxid
is
not
None
:
if
name
is
not
None
:
return
'
append
'
,
taxid
return
'
append-to-
'
+
rank
,
name
.
taxon
.
taxid
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
genus
,
'
genus
'
)
if
taxid
is
not
None
:
if
rank
==
'
subspecies
'
:
return
'
append-species
'
,
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
genus
,
'
species
'
)
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
family
,
'
family
'
)
if
name
is
not
None
:
if
taxid
is
not
None
:
return
'
append-to-species
'
,
name
.
taxon
.
taxid
return
'
append-species
'
,
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
genus
,
'
genus
'
)
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
ordo
,
'
order
'
)
if
name
is
not
None
:
if
taxid
is
not
None
:
return
'
append-to-genus
'
,
name
.
taxon
.
taxid
return
'
append-species
'
,
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
family
,
'
family
'
)
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
class_
,
'
class
'
)
if
name
is
not
None
:
if
taxid
is
not
None
:
return
'
append-to-family
'
,
name
.
taxon
.
taxid
return
'
append-species
'
,
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
ordo
,
'
order
'
)
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
phylum
,
'
phylum
'
)
if
name
is
not
None
:
if
taxid
is
not
None
:
return
'
append-to-order
'
,
name
.
taxon
.
taxid
return
'
append-species
'
,
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
class_
,
'
class
'
)
taxid
=
self
.
_match
(
ncbi_taxa
,
self
.
domain
,
'
superkingdom
'
)
if
name
is
not
None
:
if
taxid
is
not
None
:
return
'
append-to-class
'
,
name
.
taxon
.
taxid
return
'
append-species
'
,
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
phylum
,
'
phylum
'
)
if
name
is
not
None
:
return
'
append-phylum
'
,
name
.
taxon
.
taxid
name
=
self
.
_match
(
ncbi_taxa
,
self
.
domain
,
'
superkingdom
'
)
if
name
is
not
None
:
return
'
append-to-superkingdom
'
,
name
.
taxon
.
taxid
return
'
fail
'
,
None
return
'
fail
'
,
None
_
,
NCBI_
TAXA
_FILE
,
INPUT_DIR
,
OUTPUT_DIR
=
sys
.
argv
_
,
NCBI_
NODES_FILE
,
NCBI_NAMES
_FILE
,
INPUT_DIR
,
OUTPUT_DIR
=
sys
.
argv
def
of
(
fn
):
def
of
(
fn
):
return
open
(
os
.
path
.
join
(
OUTPUT_DIR
,
fn
),
'
w
'
)
return
open
(
os
.
path
.
join
(
OUTPUT_DIR
,
fn
),
'
w
'
)
def
line
(
f
,
sep
,
*
cols
):
def
line
(
f
,
sep
,
*
cols
,
tail
=
None
):
f
.
write
(
sep
.
join
(
str
(
i
)
for
i
in
cols
))
f
.
write
(
sep
.
join
(
str
(
i
)
for
i
in
cols
))
if
tail
is
not
None
:
f
.
write
(
tail
)
f
.
write
(
'
\n
'
)
f
.
write
(
'
\n
'
)
sys
.
stderr
.
write
(
'
Reading NCBI taxonomy: %s
\n
'
%
NCBI_
TAXA
_FILE
)
sys
.
stderr
.
write
(
'
Reading NCBI taxonomy:
%s
%s
\n
'
%
(
NCBI_
NODES_FILE
,
NCBI_NAMES
_FILE
)
)
NCBI_TAXA
=
read_
taxa_id
(
NCBI_TAXA
_FILE
)
NCBI_TAXA
=
read_
ncbi_names
(
read_ncbi_nodes
(
NCBI_NODES_FILE
),
NCBI_NAMES
_FILE
)
os
.
makedirs
(
OUTPUT_DIR
,
exist_ok
=
True
)
os
.
makedirs
(
OUTPUT_DIR
,
exist_ok
=
True
)
with
of
(
'
dispatch-report.txt
'
)
as
out_dispatch
,
of
(
'
dsmz-nodes.dmp
'
)
as
out_nodes
,
of
(
'
dsmz-names.dmp
'
)
as
out_names
,
of
(
'
warnings.txt
'
)
as
out_warn
:
with
of
(
'
dispatch-report.txt
'
)
as
out_dispatch
,
of
(
'
dsmz-nodes.dmp
'
)
as
out_nodes
,
of
(
'
dsmz-names.dmp
'
)
as
out_names
,
of
(
'
warnings.txt
'
)
as
out_warn
:
sys
.
stderr
.
write
(
'
Reading BacDive entries: %s
\n
'
%
INPUT_DIR
)
sys
.
stderr
.
write
(
'
Reading BacDive entries: %s
\n
'
%
INPUT_DIR
)
count_entries
=
0
count_dispatches
=
collections
.
defaultdict
(
int
)
count_new_nodes
=
0
count_names_of_new_nodes
=
0
count_new_synonyms
=
0
new_ids
=
set
()
for
dirpath
,
_
,
filenames
in
os
.
walk
(
INPUT_DIR
):
for
dirpath
,
_
,
filenames
in
os
.
walk
(
INPUT_DIR
):
for
fn
in
filenames
:
for
fn
in
filenames
:
if
not
fn
.
endswith
(
'
.xml
'
):
if
not
fn
.
endswith
(
'
.xml
'
):
continue
continue
e
=
BacDiveEntry
(
NCBI_TAXA
,
os
.
path
.
join
(
dirpath
,
fn
))
e
=
BacDiveEntry
(
NCBI_TAXA
,
os
.
path
.
join
(
dirpath
,
fn
))
count_entries
+=
1
count_dispatches
[
e
.
dispatch
]
+=
1
line
(
out_dispatch
,
'
\t
'
,
e
.
filename
,
e
.
dispatch
,
e
.
ncbi_taxid
)
line
(
out_dispatch
,
'
\t
'
,
e
.
filename
,
e
.
dispatch
,
e
.
ncbi_taxid
)
if
e
.
dispatch
==
'
append
'
:
if
e
.
dispatch
.
startswith
(
'
append-to-
'
):
line
(
out_nodes
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
e
.
ncbi_taxid
,
'
no rank
'
,
''
,
0
,
1
,
11
,
1
,
0
,
1
,
1
,
0
)
if
e
.
strain_taxid
in
new_ids
:
line
(
out_warn
,
'
\t
'
,
e
.
filename
,
'
duplicate id
'
+
e
.
strain_taxid
)
continue
new_ids
.
add
(
e
.
strain_taxid
)
line
(
out_nodes
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
# taxid
e
.
ncbi_taxid
,
# parent taxid
'
no rank
'
,
# rank
''
,
# embl code
0
,
# division id
1
,
# inherited div flag
11
,
# genetic code id
1
,
# inherited GC flag
0
,
# mitochondrial genetic code id
1
,
# inherited MGC flag
1
,
# GenBank hidden flag
0
,
# hidden subtree root flag
''
,
# comments
tail
=
'
\t
|
'
)
count_new_nodes
+=
1
if
e
.
canonical
is
None
:
if
e
.
canonical
is
None
:
line
(
out_warn
,
'
\t
'
,
e
.
filename
,
'
no canonical
'
)
line
(
out_warn
,
'
\t
'
,
e
.
filename
,
'
no canonical
'
)
else
:
else
:
for
name
in
itertools
.
chain
(
e
.
strain_number
,
e
.
complete_name
):
for
name
in
itertools
.
chain
(
e
.
strain_number
,
e
.
complete_name
):
line
(
out_names
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
name
,
''
,
'
scientific name
'
if
name
==
e
.
canonical
else
'
strain number
'
)
line
(
out_names
,
'
\t
|
\t
'
,
e
.
strain_taxid
,
name
,
''
,
'
scientific name
'
if
name
==
e
.
canonical
else
'
strain number
'
,
tail
=
'
\t
|
'
)
count_names_of_new_nodes
+=
1
elif
e
.
dispatch
==
'
equivalent
'
or
e
.
dispatch
.
startswith
(
'
type-material-of-
'
):
for
name
in
itertools
.
chain
(
e
.
strain_number
,
e
.
complete_name
):
if
name
not
in
NCBI_TAXA
:
line
(
out_names
,
'
\t
|
\t
'
,
e
.
ncbi_taxid
,
name
,
''
,
'
strain number
'
,
tail
=
'
\t
|
'
)
count_new_synonyms
+=
1
sys
.
stderr
.
write
(
'
Entries: %d
\n
'
%
count_entries
)
sys
.
stderr
.
write
(
'
Dispatches:
\n
'
)
for
c
in
count_dispatches
.
items
():
sys
.
stderr
.
write
(
'
%s: %d
\n
'
%
c
)
sys
.
stderr
.
write
(
'
New nodes: %d
\n
'
%
count_new_nodes
)
sys
.
stderr
.
write
(
'
Names of new nodes: %d
\n
'
%
count_names_of_new_nodes
)
sys
.
stderr
.
write
(
'
New synonyms: %d
\n
'
%
count_new_synonyms
)
This diff is collapsed.
Click to expand it.
dsmz-match.snakefile
+
3
−
2
View file @
53d3404d
...
@@ -11,6 +11,7 @@ rule match:
...
@@ -11,6 +11,7 @@ rule match:
directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
directory(config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'])
input:
input:
config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
match='./dsmz-match.py',
strains=config['OUTDIR'] + '/' + config['DSMZ_STRAINS_DIR']
shell: '''
./dsmz-
match
.py
{config[
TAXA
_FILE]} {input} {output}'''
shell: '''
{input.
match
}
{config[
NCBI_NODES_FILE]} {config[NCBI_NAMES
_FILE]} {input
.strains
} {output}'''
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment