From b52e3ef75e5f5b79cba1bacc642f0a13ad33e072 Mon Sep 17 00:00:00 2001
From: Robert Bossy
Date: Tue, 12 Apr 2022 14:15:07 +0200
Subject: [PATCH] separated taxonomy rewrite and trie compilation

---
 README.md                  | 20 ++++++++++++++++++--
 compile-trie.snakefile     | 19 +++++++++++++++++++
 rewrite-taxonomy.snakefile | 16 +---------------
 3 files changed, 38 insertions(+), 17 deletions(-)
 create mode 100644 compile-trie.snakefile

diff --git a/README.md b/README.md
index 55048b9..1b55922 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ This will download the Taxonomy archive from the NCBI FTP server, then unzip the
 
 The download is anonymous and does not require any registration.
 
-### 2. DSMZ catalog download
+### 2. DSMZ catalog download BROKEN
 
 ```shell
 snakemake -j 1 -s dsmz-download.snakefile
@@ -65,7 +65,7 @@ The output of this step contains 5 files:
 snakemake -j 1 -s rewrite-taxonomy.snakefile
 ```
 
-**ETA: 12 minutes**
+**ETA: 20 minutes**
 
 This will write the merged taxonomy in a format suitable for text projection.
 
@@ -79,3 +79,19 @@ This will write the merged taxonomy in a format suitable for text projection.
 | | | `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id={ID}` |
 | `bd` | BacDive | `https://bacdive.dsmz.de/strain/{ID} |
 | | | `https://api.bacdive.dsmz.de/example/fetch/{ID}` (authentication required) |
+
+
+### 5. Compile taxonomy tries
+
+
+```shell
+snakemake -j 1 -s compile-trie.snakefile
+```
+
+**ETA: 8 minutes**
+
+This will compile taxonomy tries for future use with AlvisNLP.
+
+The output of this step contains 2 files:
+* `taxa+id_full.trie`
+* `taxa+id_microorganisms.trie`
diff --git a/compile-trie.snakefile b/compile-trie.snakefile
new file mode 100644
index 0000000..e3465e3
--- /dev/null
+++ b/compile-trie.snakefile
@@ -0,0 +1,19 @@
+configfile: 'config.yaml'
+
+
+rule all:
+    input:
+        config['OUTDIR'] + '/taxa+id_microorganisms.trie',
+        config['OUTDIR'] + '/taxa+id_full.trie'
+
+
+rule compile:
+    input:
+        config['OUTDIR'] + '/taxa+id_{root}.txt'
+
+    output:
+        config['OUTDIR'] + '/taxa+id_{root}.trie'
+
+
+    shell:
+        '''{config[ALVISNLP]} -J-Xmx24G -alias taxo {input} -alias trie {output} compile-taxonomy.plan'''
diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile
index 329f4f8..19feac3 100644
--- a/rewrite-taxonomy.snakefile
+++ b/rewrite-taxonomy.snakefile
@@ -5,22 +5,8 @@ rule all:
     input:
         config['OUTDIR'] + '/taxid_microorganisms.txt',
         config['OUTDIR'] + '/taxa+id_microorganisms.txt',
-        config['OUTDIR'] + '/taxa+id_microorganisms.trie',
         config['OUTDIR'] + '/taxid_full.txt',
-        config['OUTDIR'] + '/taxa+id_full.txt',
-        config['OUTDIR'] + '/taxa+id_full.trie'
-
-
-rule check:
-    input:
-        config['OUTDIR'] + '/taxa+id_{root}.txt'
-
-    output:
-        config['OUTDIR'] + '/taxa+id_{root}.trie'
-
-
-    shell:
-        '''{config[ALVISNLP]} -J-Xmx24G -alias taxo {input} -alias trie {output} compile-taxonomy.plan'''
+        config['OUTDIR'] + '/taxa+id_full.txt'
 
 
 rule microorganisms:
-- 
GitLab