From 09133b4f9e5862066d1582552c99ff7312e52742 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Wed, 20 Oct 2021 13:57:31 +0200
Subject: [PATCH] moved microorganisms roots in a single file

---
 cut-root.py                             | 39 +++++++++++++++++--------
 microorganisms-roots.txt                | 28 ++++++++++++++++++
 microorganisms-roots/Alveolata          |  1 -
 microorganisms-roots/Amoebozoa          |  1 -
 microorganisms-roots/Archaea            |  1 -
 microorganisms-roots/Bacteria           |  1 -
 microorganisms-roots/Chlamydomonadales  |  1 -
 microorganisms-roots/Chlorella          |  1 -
 microorganisms-roots/Choanoflagellida   |  1 -
 microorganisms-roots/Cryptophyta        |  1 -
 microorganisms-roots/Desmidiales        |  1 -
 microorganisms-roots/Diplomonadida      |  1 -
 microorganisms-roots/Euglenozoa         |  1 -
 microorganisms-roots/Fungi              |  1 -
 microorganisms-roots/Glaucocystophyceae |  1 -
 microorganisms-roots/Haptophyta         |  1 -
 microorganisms-roots/Ichthyosporea      |  1 -
 microorganisms-roots/Nematoda           |  1 -
 microorganisms-roots/Oxymonadida        |  1 -
 microorganisms-roots/Parabasalia        |  1 -
 microorganisms-roots/Prototheca         |  1 -
 microorganisms-roots/Retortamonadidae   |  1 -
 microorganisms-roots/Rhizaria           |  1 -
 microorganisms-roots/Stramenopiles      |  1 -
 microorganisms-roots/Viruses            |  1 -
 microorganisms-roots/_Crenarchaeota     |  1 -
 microorganisms-roots/_Euryarchaeota     |  1 -
 microorganisms-roots/_Korarchaeota      |  1 -
 microorganisms-roots/_Nanoarchaeota     |  1 -
 microorganisms-roots/_Volvox            |  1 -
 rewrite-taxonomy.snakefile              |  8 ++---
 31 files changed, 59 insertions(+), 44 deletions(-)
 create mode 100644 microorganisms-roots.txt
 delete mode 100644 microorganisms-roots/Alveolata
 delete mode 100644 microorganisms-roots/Amoebozoa
 delete mode 100644 microorganisms-roots/Archaea
 delete mode 100644 microorganisms-roots/Bacteria
 delete mode 100644 microorganisms-roots/Chlamydomonadales
 delete mode 100644 microorganisms-roots/Chlorella
 delete mode 100644 microorganisms-roots/Choanoflagellida
 delete mode 100644 microorganisms-roots/Cryptophyta
 delete mode 100644 microorganisms-roots/Desmidiales
 delete mode 100644 microorganisms-roots/Diplomonadida
 delete mode 100644 microorganisms-roots/Euglenozoa
 delete mode 100644 microorganisms-roots/Fungi
 delete mode 100644 microorganisms-roots/Glaucocystophyceae
 delete mode 100644 microorganisms-roots/Haptophyta
 delete mode 100644 microorganisms-roots/Ichthyosporea
 delete mode 100644 microorganisms-roots/Nematoda
 delete mode 100644 microorganisms-roots/Oxymonadida
 delete mode 100644 microorganisms-roots/Parabasalia
 delete mode 100644 microorganisms-roots/Prototheca
 delete mode 100644 microorganisms-roots/Retortamonadidae
 delete mode 100644 microorganisms-roots/Rhizaria
 delete mode 100644 microorganisms-roots/Stramenopiles
 delete mode 100644 microorganisms-roots/Viruses
 delete mode 100644 microorganisms-roots/_Crenarchaeota
 delete mode 100644 microorganisms-roots/_Euryarchaeota
 delete mode 100644 microorganisms-roots/_Korarchaeota
 delete mode 100644 microorganisms-roots/_Nanoarchaeota
 delete mode 100644 microorganisms-roots/_Volvox

diff --git a/cut-root.py b/cut-root.py
index 4c11584..928aeda 100755
--- a/cut-root.py
+++ b/cut-root.py
@@ -4,27 +4,42 @@
 import sys
 import re
 
-ROOT_FILES = sys.argv[1:]
+ROOT_FILE = sys.argv[1]
 ROOT_CANDIDATES = []
-for rf in ROOT_FILES:
-    with open(rf) as f:
-        r = f.read().strip()
-        ROOT_CANDIDATES.append((rf, r))
+with open(ROOT_FILE) as f:
+    headers = None  # column names, taken from the first line of the TSV file
+    for line in f:
+        cols = [c.strip() for c in line.split('\t')]
+        if headers is None:
+            headers = cols
+            continue
+        record = dict(zip(headers, cols))  # short rows simply lack the trailing keys
+        if not (record.get('Taxonomy ID') and record.get('Taxonomy path')):  # no usable root
+            sys.stderr.write('ignoring %s, not a taxon root\n' % (record['Name'],))
+        else:
+            ROOT_CANDIDATES.append((record['Name'], record['Taxonomy path']))
 
 ROOTS = []
-for rf1, r1 in ROOT_CANDIDATES:
+for name1, path1 in ROOT_CANDIDATES:
     accept = True
-    for rf2, r2 in ROOT_CANDIDATES:
-        if r1 == r2 and rf1 == rf2:
+    for name2, path2 in ROOT_CANDIDATES:
+        if path1 == path2 and name1 == name2:  # skip comparing a candidate with itself
             continue
-        if r1.startswith(r2 + '/'):
-            sys.stderr.write('%s excluded since it is subsumed by %s\n' % (rf1, rf2))
+        if path1.startswith(path2 + '/'):  # path2 is a proper ancestor of path1
+            sys.stderr.write('%s excluded since it is subsumed by %s\n' % (name1, name2))
             accept = False
     if accept:
-        ROOTS.append(r1)
+        sys.stderr.write('Root %s (%s)\n' % (name1, path1))
+        ROOTS.append((name1, path1))
 
-PATTERN = re.compile(r'\t(?:' + '|'.join(ROOTS) + r')[/\t]')
+PATTERN = re.compile(r'\t(?:' + '|'.join(re.escape(path) for (name, path) in ROOTS) + r')[/\t]')
+REMAIN = list(ROOTS)  # roots that have not yet been matched by any input line
 for line in sys.stdin:
     m = PATTERN.search(line)
     if m is not None:
         sys.stdout.write(line)
+        for name, path in list(REMAIN):  # iterate over a copy: REMAIN is mutated below
+            if ('\t' + path + '/') in line or ('\t' + path + '\t') in line:  # same anchors as PATTERN
+                REMAIN.remove((name, path))
+for name, path in REMAIN:
+    sys.stderr.write('%s not seen\n' % (name,))
diff --git a/microorganisms-roots.txt b/microorganisms-roots.txt
new file mode 100644
index 0000000..e938833
--- /dev/null
+++ b/microorganisms-roots.txt
@@ -0,0 +1,28 @@
+Name	MeSH ID	MeSH Tree	Taxonomy ID	Taxonomy path
+Alveolata	D056893	B01.043	ncbi:33630	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33630
+Amoebozoa	D056894	B01.046	ncbi:554915	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:554915
+Archaea	D001105	B02	ncbi:2157	/ncbi:1/ncbi:131567/ncbi:2157
+Bacteria	D001419	B03	ncbi:2	/ncbi:1/ncbi:131567/ncbi:2
+Chlamydomonadales	D000077105	B01.650.940.150.511	ncbi:3042	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042
+Chlorella	D002708	B01.650.940.150.469	ncbi:3071	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:2511126/ncbi:3071
+Choanoflagellida	D056897	B01.175	ncbi:28009	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:28009
+Cryptophyta	D044785	B01.206	ncbi:3027	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:3027
+Desmidiales	D058114	B01.650.940.800.150.200	ncbi:131210	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:35493/ncbi:131221/ncbi:131209/ncbi:2684882/ncbi:131210
+Diplomonadida	D016828	B01.237	ncbi:5738	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:5738
+Euglenozoa	D056898	B01.268	ncbi:33682	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611352/ncbi:33682
+Fungi	D005658	B01.300	ncbi:4751	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:4751
+Glaucocystophyceae	D058108	B01.650.232	ncbi:38254	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:38254
+Haptophyta	D058087	B01.400	ncbi:2830	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2608109/ncbi:2830
+Ichthyosporea	D050298	B01.500	ncbi:127916	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:127916
+Nematoda	D009348	B01.050.500.500.294	ncbi:6231	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:6231
+Oxymonadida	D056899	B01.625	ncbi:66288	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:2662611/ncbi:66288
+Parabasalia	D056900	B01.630	ncbi:5719	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:5719
+Prototheca	D011525	B01.650.940.150.634	ncbi:3110	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:3110
+Retortamonadidae	D056919	B01.675	ncbi:193075	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:193075
+Rhizaria	D056901	B01.680	ncbi:543769	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:543769
+Stramenopiles	D058009	B01.750	ncbi:33634	/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33634
+Viruses	D014780	B04	ncbi:10239	/ncbi:1/ncbi:10239
+Microbiological Phenomena	D008827	G06		
+Microbiology	D008829	H01.158.273.540		
+Microbiological Techniques	D008828	E05.200.875,E01.370.225.875		
+Attachment Sites, Microbiological	D001287	G05.360.340.024.079		
diff --git a/microorganisms-roots/Alveolata b/microorganisms-roots/Alveolata
deleted file mode 100644
index 68604cd..0000000
--- a/microorganisms-roots/Alveolata
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33630
diff --git a/microorganisms-roots/Amoebozoa b/microorganisms-roots/Amoebozoa
deleted file mode 100644
index 0d27f58..0000000
--- a/microorganisms-roots/Amoebozoa
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:554915
diff --git a/microorganisms-roots/Archaea b/microorganisms-roots/Archaea
deleted file mode 100644
index 48279c0..0000000
--- a/microorganisms-roots/Archaea
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2157
diff --git a/microorganisms-roots/Bacteria b/microorganisms-roots/Bacteria
deleted file mode 100644
index 7abbde7..0000000
--- a/microorganisms-roots/Bacteria
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2
diff --git a/microorganisms-roots/Chlamydomonadales b/microorganisms-roots/Chlamydomonadales
deleted file mode 100644
index 92eac40..0000000
--- a/microorganisms-roots/Chlamydomonadales
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042
diff --git a/microorganisms-roots/Chlorella b/microorganisms-roots/Chlorella
deleted file mode 100644
index 945ed27..0000000
--- a/microorganisms-roots/Chlorella
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:2511126/ncbi:3071
diff --git a/microorganisms-roots/Choanoflagellida b/microorganisms-roots/Choanoflagellida
deleted file mode 100644
index d6449de..0000000
--- a/microorganisms-roots/Choanoflagellida
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:28009
diff --git a/microorganisms-roots/Cryptophyta b/microorganisms-roots/Cryptophyta
deleted file mode 100644
index 1255f03..0000000
--- a/microorganisms-roots/Cryptophyta
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:3027
diff --git a/microorganisms-roots/Desmidiales b/microorganisms-roots/Desmidiales
deleted file mode 100644
index 5a511d1..0000000
--- a/microorganisms-roots/Desmidiales
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:35493/ncbi:131221/ncbi:131209/ncbi:2684882/ncbi:131210
diff --git a/microorganisms-roots/Diplomonadida b/microorganisms-roots/Diplomonadida
deleted file mode 100644
index 37583ce..0000000
--- a/microorganisms-roots/Diplomonadida
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:5738
diff --git a/microorganisms-roots/Euglenozoa b/microorganisms-roots/Euglenozoa
deleted file mode 100644
index c2a12e9..0000000
--- a/microorganisms-roots/Euglenozoa
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611352/ncbi:33682
diff --git a/microorganisms-roots/Fungi b/microorganisms-roots/Fungi
deleted file mode 100644
index 5669838..0000000
--- a/microorganisms-roots/Fungi
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:4751
diff --git a/microorganisms-roots/Glaucocystophyceae b/microorganisms-roots/Glaucocystophyceae
deleted file mode 100644
index 3059f6d..0000000
--- a/microorganisms-roots/Glaucocystophyceae
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:38254
diff --git a/microorganisms-roots/Haptophyta b/microorganisms-roots/Haptophyta
deleted file mode 100644
index dac52ab..0000000
--- a/microorganisms-roots/Haptophyta
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2608109/ncbi:2830
diff --git a/microorganisms-roots/Ichthyosporea b/microorganisms-roots/Ichthyosporea
deleted file mode 100644
index 39bc6b1..0000000
--- a/microorganisms-roots/Ichthyosporea
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:127916
diff --git a/microorganisms-roots/Nematoda b/microorganisms-roots/Nematoda
deleted file mode 100644
index e8fdf14..0000000
--- a/microorganisms-roots/Nematoda
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:6231
diff --git a/microorganisms-roots/Oxymonadida b/microorganisms-roots/Oxymonadida
deleted file mode 100644
index a5a82c3..0000000
--- a/microorganisms-roots/Oxymonadida
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:2662611/ncbi:66288
diff --git a/microorganisms-roots/Parabasalia b/microorganisms-roots/Parabasalia
deleted file mode 100644
index f2a90da..0000000
--- a/microorganisms-roots/Parabasalia
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:5719
diff --git a/microorganisms-roots/Prototheca b/microorganisms-roots/Prototheca
deleted file mode 100644
index 8974461..0000000
--- a/microorganisms-roots/Prototheca
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:3110
diff --git a/microorganisms-roots/Retortamonadidae b/microorganisms-roots/Retortamonadidae
deleted file mode 100644
index 0c9e290..0000000
--- a/microorganisms-roots/Retortamonadidae
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:193075
diff --git a/microorganisms-roots/Rhizaria b/microorganisms-roots/Rhizaria
deleted file mode 100644
index b0fe36e..0000000
--- a/microorganisms-roots/Rhizaria
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:543769
diff --git a/microorganisms-roots/Stramenopiles b/microorganisms-roots/Stramenopiles
deleted file mode 100644
index e6f72b6..0000000
--- a/microorganisms-roots/Stramenopiles
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33634
diff --git a/microorganisms-roots/Viruses b/microorganisms-roots/Viruses
deleted file mode 100644
index e159dd1..0000000
--- a/microorganisms-roots/Viruses
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:10239
diff --git a/microorganisms-roots/_Crenarchaeota b/microorganisms-roots/_Crenarchaeota
deleted file mode 100644
index 4dbf627..0000000
--- a/microorganisms-roots/_Crenarchaeota
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:28889
diff --git a/microorganisms-roots/_Euryarchaeota b/microorganisms-roots/_Euryarchaeota
deleted file mode 100644
index 8149b0a..0000000
--- a/microorganisms-roots/_Euryarchaeota
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2157/ncbi:28890
diff --git a/microorganisms-roots/_Korarchaeota b/microorganisms-roots/_Korarchaeota
deleted file mode 100644
index 60a5d1a..0000000
--- a/microorganisms-roots/_Korarchaeota
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:51967
diff --git a/microorganisms-roots/_Nanoarchaeota b/microorganisms-roots/_Nanoarchaeota
deleted file mode 100644
index 4aa9262..0000000
--- a/microorganisms-roots/_Nanoarchaeota
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783276/ncbi:192989
diff --git a/microorganisms-roots/_Volvox b/microorganisms-roots/_Volvox
deleted file mode 100644
index 781fcbd..0000000
--- a/microorganisms-roots/_Volvox
+++ /dev/null
@@ -1 +0,0 @@
-/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042/ncbi:3065/ncbi:3066
diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile
index 87cbf43..d2daf4c 100644
--- a/rewrite-taxonomy.snakefile
+++ b/rewrite-taxonomy.snakefile
@@ -6,10 +6,10 @@ rule all:
         config['OUTDIR'] + '/finish.txt',
         config['OUTDIR'] + '/taxid_microorganisms.txt',
         config['OUTDIR'] + '/taxa+id_microorganisms.txt',
-        config['OUTDIR'] + '/taxa+id_microorganisms.trie',
+        # config['OUTDIR'] + '/taxa+id_microorganisms.trie',
         config['OUTDIR'] + '/taxid_full.txt',
         config['OUTDIR'] + '/taxa+id_full.txt',
-        config['OUTDIR'] + '/taxa+id_full.trie'
+        # config['OUTDIR'] + '/taxa+id_full.trie'
 
 
 rule check:
@@ -30,10 +30,10 @@ rule microorganisms:
 
     input:
         full=config['OUTDIR'] + '/{p}_full.txt',
-        roots='microorganisms-roots'
+        roots='microorganisms-roots.txt'
 
     shell:
-        '''./cut-root.py {input.roots}/* <{input.full} >{output}'''
+        '''./cut-root.py {input.roots} <{input.full} >{output}'''
 
 
 rule taxaid_full:
-- 
GitLab