From 81a9aa77c6d1754f9b0ee6056f602127ae3aa3d8 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Mon, 5 Apr 2021 15:34:57 +0200
Subject: [PATCH] rewrite snakefile, microorganisms selection

---
 config.yaml                             |   3 +-
 cut-root.py                             |  30 ++++++
 microorganisms-roots/Alveolata          |   1 +
 microorganisms-roots/Amoebozoa          |   1 +
 microorganisms-roots/Archaea            |   1 +
 microorganisms-roots/Bacteria           |   1 +
 microorganisms-roots/Chlamydomonadales  |   1 +
 microorganisms-roots/Chlorella          |   1 +
 microorganisms-roots/Choanoflagellida   |   1 +
 microorganisms-roots/Cryptophyta        |   1 +
 microorganisms-roots/Desmidiales        |   1 +
 microorganisms-roots/Diplomonadida      |   1 +
 microorganisms-roots/Euglenozoa         |   1 +
 microorganisms-roots/Fungi              |   1 +
 microorganisms-roots/Glaucocystophyceae |   1 +
 microorganisms-roots/Haptophyta         |   1 +
 microorganisms-roots/Ichthyosporea      |   1 +
 microorganisms-roots/Nematoda           |   1 +
 microorganisms-roots/Oxymonadida        |   1 +
 microorganisms-roots/Parabasalia        |   1 +
 microorganisms-roots/Prototheca         |   1 +
 microorganisms-roots/Retortamonadidae   |   1 +
 microorganisms-roots/Rhizaria           |   1 +
 microorganisms-roots/Stramenopiles      |   1 +
 microorganisms-roots/Viruses            |   1 +
 microorganisms-roots/_Crenarchaeota     |   1 +
 microorganisms-roots/_Euryarchaeota     |   1 +
 microorganisms-roots/_Korarchaeota      |   1 +
 microorganisms-roots/_Nanoarchaeota     |   1 +
 microorganisms-roots/_Volvox            |   1 +
 reject.txt                              | 137 ++++++++++++++++++++++++
 rewrite-taxonomy.snakefile              |  18 +++-
 saturate.txt                            |   5 +
 33 files changed, 217 insertions(+), 4 deletions(-)
 create mode 100755 cut-root.py
 create mode 100644 microorganisms-roots/Alveolata
 create mode 100644 microorganisms-roots/Amoebozoa
 create mode 100644 microorganisms-roots/Archaea
 create mode 100644 microorganisms-roots/Bacteria
 create mode 100644 microorganisms-roots/Chlamydomonadales
 create mode 100644 microorganisms-roots/Chlorella
 create mode 100644 microorganisms-roots/Choanoflagellida
 create mode 100644 microorganisms-roots/Cryptophyta
 create mode 100644 microorganisms-roots/Desmidiales
 create mode 100644 microorganisms-roots/Diplomonadida
 create mode 100644 microorganisms-roots/Euglenozoa
 create mode 100644 microorganisms-roots/Fungi
 create mode 100644 microorganisms-roots/Glaucocystophyceae
 create mode 100644 microorganisms-roots/Haptophyta
 create mode 100644 microorganisms-roots/Ichthyosporea
 create mode 100644 microorganisms-roots/Nematoda
 create mode 100644 microorganisms-roots/Oxymonadida
 create mode 100644 microorganisms-roots/Parabasalia
 create mode 100644 microorganisms-roots/Prototheca
 create mode 100644 microorganisms-roots/Retortamonadidae
 create mode 100644 microorganisms-roots/Rhizaria
 create mode 100644 microorganisms-roots/Stramenopiles
 create mode 100644 microorganisms-roots/Viruses
 create mode 100644 microorganisms-roots/_Crenarchaeota
 create mode 100644 microorganisms-roots/_Euryarchaeota
 create mode 100644 microorganisms-roots/_Korarchaeota
 create mode 100644 microorganisms-roots/_Nanoarchaeota
 create mode 100644 microorganisms-roots/_Volvox
 create mode 100644 reject.txt
 create mode 100644 saturate.txt

diff --git a/config.yaml b/config.yaml
index 151fab9..96db8a1 100644
--- a/config.yaml
+++ b/config.yaml
@@ -3,8 +3,9 @@ BACDIVE_USER: 'Robert.Bossy@inrae.fr'
 BACDIVE_PASSWORD_FILE: '.bacdive'
 
 
-# AlvisNLP binary
+# AlvisNLP and rewrite-taxonomy binaries
 ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
+REWRITE_TAXONOMY: '~/code/bibliome-java-utils/test/install/bin/rewrite-taxonomy'
 
 
 # NCBI Taxonomy files
diff --git a/cut-root.py b/cut-root.py
new file mode 100755
index 0000000..4c11584
--- /dev/null
+++ b/cut-root.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""Keep the lines of a tab-separated table that fall under selected roots.
+
+Usage: cut-root.py ROOT_FILE... <table >filtered
+
+Each ROOT_FILE contains a single root path such as /ncbi:1/ncbi:131567/ncbi:2.
+A line read from standard input is copied to standard output when it contains
+a tab, then one of the roots, then either '/' or another tab.
+"""
+
+import sys
+import re
+
+
+def read_roots(root_files):
+    """Return (file name, root) pairs read from root_files, dropping empty ones."""
+    candidates = []
+    for rf in root_files:
+        with open(rf) as f:
+            r = f.read().strip()
+        if not r:
+            # An empty root would match every path and subsume all other roots.
+            sys.stderr.write('%s ignored since it is empty\n' % rf)
+            continue
+        candidates.append((rf, r))
+    return candidates
+
+
+def select_roots(candidates):
+    """Return the distinct roots that are not below another candidate root."""
+    roots = []
+    for rf1, r1 in candidates:
+        accept = True
+        for rf2, r2 in candidates:
+            # The trailing '/' keeps ncbi:2 from subsuming ncbi:2157 and also
+            # means a root never subsumes itself.
+            if r1.startswith(r2 + '/'):
+                sys.stderr.write('%s excluded since it is subsumed by %s\n' % (rf1, rf2))
+                accept = False
+        if accept and r1 not in roots:
+            roots.append(r1)
+    return roots
+
+
+def main(argv):
+    """Filter standard input with the roots named on the command line."""
+    roots = select_roots(read_roots(argv[1:]))
+    if not roots:
+        # With no alternative the pattern would accept any line containing a
+        # tab followed by '/' or by another tab.
+        sys.exit('cut-root.py: no usable root given')
+    # Roots are literal paths, so escape them before building the alternation.
+    pattern = re.compile(r'\t(?:' + '|'.join(re.escape(r) for r in roots) + r')[/\t]')
+    for line in sys.stdin:
+        if pattern.search(line) is not None:
+            sys.stdout.write(line)
+
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/microorganisms-roots/Alveolata b/microorganisms-roots/Alveolata
new file mode 100644
index 0000000..68604cd
--- /dev/null
+++ b/microorganisms-roots/Alveolata
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33630
diff --git a/microorganisms-roots/Amoebozoa b/microorganisms-roots/Amoebozoa
new file mode 100644
index 0000000..0d27f58
--- /dev/null
+++ b/microorganisms-roots/Amoebozoa
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:554915
diff --git a/microorganisms-roots/Archaea b/microorganisms-roots/Archaea
new file mode 100644
index 0000000..48279c0
--- /dev/null
+++ b/microorganisms-roots/Archaea
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2157
diff --git a/microorganisms-roots/Bacteria b/microorganisms-roots/Bacteria
new file mode 100644
index 0000000..7abbde7
--- /dev/null
+++ b/microorganisms-roots/Bacteria
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2
diff --git a/microorganisms-roots/Chlamydomonadales b/microorganisms-roots/Chlamydomonadales
new file mode 100644
index 0000000..92eac40
--- /dev/null
+++ b/microorganisms-roots/Chlamydomonadales
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042
diff --git a/microorganisms-roots/Chlorella b/microorganisms-roots/Chlorella
new file mode 100644
index 0000000..945ed27
--- /dev/null
+++ b/microorganisms-roots/Chlorella
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:2511126/ncbi:3071
diff --git a/microorganisms-roots/Choanoflagellida b/microorganisms-roots/Choanoflagellida
new file mode 100644
index 0000000..d6449de
--- /dev/null
+++ b/microorganisms-roots/Choanoflagellida
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:28009
diff --git a/microorganisms-roots/Cryptophyta b/microorganisms-roots/Cryptophyta
new file mode 100644
index 0000000..1255f03
--- /dev/null
+++ b/microorganisms-roots/Cryptophyta
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:3027
diff --git a/microorganisms-roots/Desmidiales b/microorganisms-roots/Desmidiales
new file mode 100644
index 0000000..5a511d1
--- /dev/null
+++ b/microorganisms-roots/Desmidiales
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:35493/ncbi:131221/ncbi:131209/ncbi:2684882/ncbi:131210
diff --git a/microorganisms-roots/Diplomonadida b/microorganisms-roots/Diplomonadida
new file mode 100644
index 0000000..37583ce
--- /dev/null
+++ b/microorganisms-roots/Diplomonadida
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:5738
diff --git a/microorganisms-roots/Euglenozoa b/microorganisms-roots/Euglenozoa
new file mode 100644
index 0000000..c2a12e9
--- /dev/null
+++ b/microorganisms-roots/Euglenozoa
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611352/ncbi:33682
diff --git a/microorganisms-roots/Fungi b/microorganisms-roots/Fungi
new file mode 100644
index 0000000..5669838
--- /dev/null
+++ b/microorganisms-roots/Fungi
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:4751
diff --git a/microorganisms-roots/Glaucocystophyceae b/microorganisms-roots/Glaucocystophyceae
new file mode 100644
index 0000000..3059f6d
--- /dev/null
+++ b/microorganisms-roots/Glaucocystophyceae
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:38254
diff --git a/microorganisms-roots/Haptophyta b/microorganisms-roots/Haptophyta
new file mode 100644
index 0000000..dac52ab
--- /dev/null
+++ b/microorganisms-roots/Haptophyta
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2608109/ncbi:2830
diff --git a/microorganisms-roots/Ichthyosporea b/microorganisms-roots/Ichthyosporea
new file mode 100644
index 0000000..39bc6b1
--- /dev/null
+++ b/microorganisms-roots/Ichthyosporea
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:127916
diff --git a/microorganisms-roots/Nematoda b/microorganisms-roots/Nematoda
new file mode 100644
index 0000000..e8fdf14
--- /dev/null
+++ b/microorganisms-roots/Nematoda
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:6231
diff --git a/microorganisms-roots/Oxymonadida b/microorganisms-roots/Oxymonadida
new file mode 100644
index 0000000..a5a82c3
--- /dev/null
+++ b/microorganisms-roots/Oxymonadida
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:2662611/ncbi:66288
diff --git a/microorganisms-roots/Parabasalia b/microorganisms-roots/Parabasalia
new file mode 100644
index 0000000..f2a90da
--- /dev/null
+++ b/microorganisms-roots/Parabasalia
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:5719
diff --git a/microorganisms-roots/Prototheca b/microorganisms-roots/Prototheca
new file mode 100644
index 0000000..8974461
--- /dev/null
+++ b/microorganisms-roots/Prototheca
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:3110
diff --git a/microorganisms-roots/Retortamonadidae b/microorganisms-roots/Retortamonadidae
new file mode 100644
index 0000000..0c9e290
--- /dev/null
+++ b/microorganisms-roots/Retortamonadidae
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:193075
diff --git a/microorganisms-roots/Rhizaria b/microorganisms-roots/Rhizaria
new file mode 100644
index 0000000..b0fe36e
--- /dev/null
+++ b/microorganisms-roots/Rhizaria
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:543769
diff --git a/microorganisms-roots/Stramenopiles b/microorganisms-roots/Stramenopiles
new file mode 100644
index 0000000..e6f72b6
--- /dev/null
+++ b/microorganisms-roots/Stramenopiles
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33634
diff --git a/microorganisms-roots/Viruses b/microorganisms-roots/Viruses
new file mode 100644
index 0000000..e159dd1
--- /dev/null
+++ b/microorganisms-roots/Viruses
@@ -0,0 +1 @@
+/ncbi:1/ncbi:10239
diff --git a/microorganisms-roots/_Crenarchaeota b/microorganisms-roots/_Crenarchaeota
new file mode 100644
index 0000000..4dbf627
--- /dev/null
+++ b/microorganisms-roots/_Crenarchaeota
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:28889
diff --git a/microorganisms-roots/_Euryarchaeota b/microorganisms-roots/_Euryarchaeota
new file mode 100644
index 0000000..8149b0a
--- /dev/null
+++ b/microorganisms-roots/_Euryarchaeota
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2157/ncbi:28890
diff --git a/microorganisms-roots/_Korarchaeota b/microorganisms-roots/_Korarchaeota
new file mode 100644
index 0000000..60a5d1a
--- /dev/null
+++ b/microorganisms-roots/_Korarchaeota
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:51967
diff --git a/microorganisms-roots/_Nanoarchaeota b/microorganisms-roots/_Nanoarchaeota
new file mode 100644
index 0000000..4aa9262
--- /dev/null
+++ b/microorganisms-roots/_Nanoarchaeota
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783276/ncbi:192989
diff --git a/microorganisms-roots/_Volvox b/microorganisms-roots/_Volvox
new file mode 100644
index 0000000..781fcbd
--- /dev/null
+++ b/microorganisms-roots/_Volvox
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042/ncbi:3065/ncbi:3066
diff --git a/reject.txt b/reject.txt
new file mode 100644
index 0000000..a0ee6d5
--- /dev/null
+++ b/reject.txt
@@ -0,0 +1,137 @@
+ncbi:1
+Be	ncbi:1587
+Bd	ncbi:1613
+unclassified bacterium	ncbi:2338
+unidentified bacteria	ncbi:2338
+unidentified bacterium	ncbi:2338
+unknown bacteria	ncbi:2338
+ncbi:2387
+ncbi:2673
+unidentified proteobacterium	ncbi:2722
+unknown proteobacterium	ncbi:2722
+rape	ncbi:3708
+Glycine	ncbi:3846
+rays	ncbi:7858
+A hybrid	ncbi:8307
+monitors	ncbi:8555
+Ara	ncbi:9225
+euro	ncbi:9319
+man	ncbi:9606
+bear	ncbi:9632
+bears	ncbi:9632
+cat	ncbi:9685
+pig	ncbi:9823
+Axis	ncbi:9855
+Vira	ncbi:10239
+unidentified poxvirus	ncbi:10283
+unidentified entomopoxvirus	ncbi:10291
+ASFV	ncbi:10497
+degu	ncbi:10160
+LGT	ncbi:11085
+LI	ncbi:11086
+PVA	ncbi:12215
+GA-1	ncbi:12345
+other sequences	ncbi:28384
+29278
+Spea	ncbi:30316
+A glycine	ncbi:307491
+ncbi:32630
+ncbi:32644
+flag	ncbi:34205
+plasmids	ncbi:36549
+hybrid	ncbi:37965
+bacteriophage	ncbi:38018
+bacteriophages	ncbi:38018
+unidentified bacteriophage	ncbi:38018
+unidentified phage	ncbi:38018
+mum	ncbi:41568
+Arca	ncbi:44596
+ncbi:45196
+ncbi:45197
+4ncbi:5328
+Thymus	ncbi:49990
+ncbi:52958
+Bacillus	ncbi:55087
+ncbi:187	ncbi:55511
+name	ncbi:55581
+spot	ncbi:59837
+Laser	ncbi:62990
+Idea	ncbi:76236
+Codon	ncbi:79338
+expression vector	ncbi:81076
+unidentified expression vector	ncbi:81076
+Dina	ncbi:83994
+gag	ncbi:103820
+Later	ncbi:123504
+Ada	ncbi:125078
+Side	ncbi:145724
+Aa	ncbi:152839
+tipa	ncbi:162890
+This	ncbi:169495
+aka	ncbi:172644
+permit	ncbi:173331
+Car	ncbi:201850
+Mene	ncbi:206144
+Pero	ncbi:214303
+3A	ncbi:215167
+Luria	ncbi:218032
+Iso	ncbi:238707
+Cis	ncbi:245896
+ray	ncbi:255564
+Pera	ncbi:256812
+Mops	ncbi:258862
+Bias	ncbi:272805
+Sige	ncbi:328602
+Span	ncbi:333408
+California	ncbi:337343
+teta	ncbi:338092
+Circe	ncbi:345438
+Tasa	ncbi:381831
+Nusa	ncbi:468772
+A bacterium	ncbi:494443
+-->	ncbi:545367
+[A-Z]\. alpha
+[A-Z]\. beta
+[A-Z]\. gamma
+[A-Z]\. delta
+[A-Z]\. epsilon
+[A-Z]\. group
+A group
+A major
+A minor
+A central
+A minor
+A delta
+A means
+A maximum
+A minimum
+S medium
+A mouse
+A flagellum
+S complex
+Asp
+Beta
+Helix
+rat
+Tor
+Bio	ncbi:463801
+Chen	ncbi:8842
+Color	ncbi:8869
+Dialysis	ncbi:124307
+Ideas	ncbi:76236
+Indicator	ncbi:189528
+Phyla	ncbi:86858
+163164
+374463
+tetra
+408170
+Delta	ncbi:998453
+is	ncbi:159382
+Are	ncbi:695398
+Electron	ncbi:1118549
+environmental samples
+E	ncbi:178505
+AND	ncbi:1481724
+clinical samples	ncbi:88229
+clinical samples	ncbi:191496
+clinical samples	ncbi:226901
diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile
index 2fbefe5..9acf158 100644
--- a/rewrite-taxonomy.snakefile
+++ b/rewrite-taxonomy.snakefile
@@ -1,11 +1,30 @@
 configfile: 'config.yaml'
 
-rule rewrite:
+# Select from the full taxa table the lines located under one of the roots
+# stored in microorganisms-roots/ (one root path per file); cut-root.py does
+# the filtering.  NOTE(review): as the first rule of this file it is
+# presumably the default target when the snakefile is run on its own.
+rule microorganisms:
     output:
-        config['OUTDIR'] + '/taxa+id_full_with-DSMZ.txt'
+        config['OUTDIR'] + '/taxa+id_microorganisms.txt'
+
+    input:
+        full=config['OUTDIR'] + '/taxa+id_full.txt',
+        roots='microorganisms-roots'
+
+    shell:
+        '''./cut-root.py {input.roots}/* <{input.full} >{output}'''
+
+
+# Build the full taxa table: run the rewrite-taxonomy binary configured as
+# REWRITE_TAXONOMY on the NCBI and DSMZ names/nodes dumps, with reject.txt
+# and saturate.txt as rejection and saturation files.
+rule full:
+    output:
+        config['OUTDIR'] + '/taxa+id_full.txt'
 
     input:
         config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR']
 
     shell:
-        '''../bibliome-java-utils/test/install/bin/rewrite-taxonomy -namesFile {config[NCBI_DIR]}/names.dmp -namesFile {input}/dsmz-names.dmp  -prefix ncbi: -rejectionFile reject.txt -saturationFile saturate.txt -pattern '{{NAME}}\t{{TAXID}}\t{{CANONICAL}}\t{{TAXID_PATH}}\t{{POS_TAG}}\t{{RANK}}\t{{SPECIES_TAXID}}\t{{SPECIES_NAME}}\n' {config[NCBI_DIR]}/nodes.dmp {input}/dsmz-nodes.dmp >{output}'''
+        '''{config[REWRITE_TAXONOMY]} -namesFile {config[NCBI_DIR]}/names.dmp -namesFile {input}/dsmz-names.dmp  -prefix ncbi: -rejectionFile reject.txt -saturationFile saturate.txt -pattern '{{NAME}}\t{{TAXID}}\t{{CANONICAL}}\t{{TAXID_PATH}}\t{{POS_TAG}}\t{{RANK}}\t{{SPECIES_TAXID}}\t{{SPECIES_NAME}}\n' {config[NCBI_DIR]}/nodes.dmp {input}/dsmz-nodes.dmp >{output}'''
diff --git a/saturate.txt b/saturate.txt
new file mode 100644
index 0000000..903d41c
--- /dev/null
+++ b/saturate.txt
@@ -0,0 +1,5 @@
+([A-Z])[a-z]+ ([a-z]+)	genre name abbreviation	{1}. {2}	{1} {2}	{1}.{2}	{1} . {2}
+([^-]*)-(.*)	dash-space replacement	{1} {2}
+(.*) [(]?([A-Z]\w+), (\d{4})[)]?	author name variation	{1} {2}	{1} ({2})
+[A-Z][a-z]*[ao]	redneck plural	{0}s
+([A-Z][a-z]*)us	latin plural	{1}i
-- 
GitLab