diff --git a/README.md b/README.md index 55048b96ac5495e94553936399bb26413cad0def..1b55922cd599f4bd14f168969dd077f2c72a1b17 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ This will download the Taxonomy archive from the NCBI FTP server, then unzip the The download is anonymous and does not require any registration. -### 2. DSMZ catalog download +### 2. DSMZ catalog download BROKEN ```shell snakemake -j 1 -s dsmz-download.snakefile @@ -65,7 +65,7 @@ The output of this step contains 5 files: snakemake -j 1 -s rewrite-taxonomy.snakefile ``` -**ETA: 12 minutes** +**ETA: 20 minutes** This will write the merged taxonomy in a format suitable for text projection. @@ -79,3 +79,19 @@ This will write the merged taxonomy in a format suitable for text projection. | | | `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id={ID}` | | `bd` | BacDive | `https://bacdive.dsmz.de/strain/{ID} | | | | `https://api.bacdive.dsmz.de/example/fetch/{ID}` (authentication required) | + + +### 5. Compile taxonomy tries + + +```shell +snakemake -j 1 -s compile-trie.snakefile +``` + +**ETA: 8 minutes** + +This will compile taxonomy tries for future use with AlvisNLP. + +The output of this step contains 2 files: +* `taxa+id_full.trie` +* `taxa+id_microorganisms.trie` diff --git a/compile-trie.snakefile b/compile-trie.snakefile new file mode 100644 index 0000000000000000000000000000000000000000..e3465e3bca85e0979980bc767f81231877c9dcb9 --- /dev/null +++ b/compile-trie.snakefile @@ -0,0 +1,19 @@ +configfile: 'config.yaml' + + +rule all: + input: + config['OUTDIR'] + '/taxa+id_microorganisms.trie', + config['OUTDIR'] + '/taxa+id_full.trie' + + +rule compile: + input: + config['OUTDIR'] + '/taxa+id_{root}.txt' + + output: + config['OUTDIR'] + '/taxa+id_{root}.trie' + + + shell: + '''{config[ALVISNLP]} -J-Xmx24G -alias taxo {input} -alias trie {output} compile-taxonomy.plan''' diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile index 329f4f8a24b91f77f17b68360a3942c3982587a2..19feac3816b41fb4e49b430ad7f0aa3c6e60c10b 100644 --- a/rewrite-taxonomy.snakefile +++ b/rewrite-taxonomy.snakefile @@ -5,22 +5,8 @@ rule all: input: config['OUTDIR'] + '/taxid_microorganisms.txt', config['OUTDIR'] + '/taxa+id_microorganisms.txt', - config['OUTDIR'] + '/taxa+id_microorganisms.trie', config['OUTDIR'] + '/taxid_full.txt', - config['OUTDIR'] + '/taxa+id_full.txt', - config['OUTDIR'] + '/taxa+id_full.trie' - - -rule check: - input: - config['OUTDIR'] + '/taxa+id_{root}.txt' - - output: - config['OUTDIR'] + '/taxa+id_{root}.trie' - - - shell: - '''{config[ALVISNLP]} -J-Xmx24G -alias taxo {input} -alias trie {output} compile-taxonomy.plan''' + config['OUTDIR'] + '/taxa+id_full.txt' rule microorganisms: