From 1e8e026e0f28844872d186edeae30ed4a8579c14 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Mon, 5 Apr 2021 15:53:10 +0200
Subject: [PATCH] NCBI Taxonomy download snakefile

---
 config.yaml             | 11 +++++++----
 dsmz-match.snakefile    |  1 -
 ncbi-download.snakefile | 19 +++++++++++++++++++
 3 files changed, 26 insertions(+), 5 deletions(-)
 create mode 100644 ncbi-download.snakefile

diff --git a/config.yaml b/config.yaml
index 96db8a1..ffa7e07 100644
--- a/config.yaml
+++ b/config.yaml
@@ -8,12 +8,15 @@ ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
 REWRITE_TAXONOMY: '~/code/bibliome-java-utils/test/install/bin/rewrite-taxonomy'
 
 
-# NCBI Taxonomy files
-NCBI_DIR: 'ncbi-taxonomy_2021-03-26'
-
-
 # Output and working directories
 OUTDIR: 'test'
 
+
+
+
+
+
 DSMZ_STRAINS_DIR: 'dsmz-strains'
 DSMZ_MATCH_DIR: 'dsmz-match'
+NCBI_DIR: 'ncbi-taxonomy'
+NCBI_ZIP_URL: 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip'
diff --git a/dsmz-match.snakefile b/dsmz-match.snakefile
index 30dd51e..672a553 100644
--- a/dsmz-match.snakefile
+++ b/dsmz-match.snakefile
@@ -1,6 +1,5 @@
 configfile: 'config.yaml'
 
-import glob
 
 rule match:
     '''
diff --git a/ncbi-download.snakefile b/ncbi-download.snakefile
new file mode 100644
index 0000000..5dd6f11
--- /dev/null
+++ b/ncbi-download.snakefile
@@ -0,0 +1,19 @@
+configfile: 'config.yaml'
+
+
+rule unzip:
+    '''Extract the downloaded NCBI taxonomy archive into OUTDIR/NCBI_DIR.
+    unzip -o overwrites files of an earlier extraction instead of prompting.'''
+    input:
+        config['OUTDIR'] + '/' + config['NCBI_DIR'] + '/taxdmp.zip'
+    output:
+        config['OUTDIR'] + '/' + config['NCBI_DIR'] + '/nodes.dmp'
+    shell:
+        '''unzip -o -d {config[OUTDIR]}/{config[NCBI_DIR]} {input}'''
+
+rule download:
+    '''Download the archive at NCBI_ZIP_URL; curl -f fails on HTTP errors, -L follows redirects.'''
+    output:
+        config['OUTDIR'] + '/' + config['NCBI_DIR'] + '/taxdmp.zip'
+    shell:
+        '''curl -fL -o {output} '{config[NCBI_ZIP_URL]}' '''
-- 
GitLab