From 09133b4f9e5862066d1582552c99ff7312e52742 Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inra.fr> Date: Wed, 20 Oct 2021 13:57:31 +0200 Subject: [PATCH] moved microorganisms roots in a single file --- cut-root.py | 39 +++++++++++++++++-------- microorganisms-roots.txt | 28 ++++++++++++++++++ microorganisms-roots/Alveolata | 1 - microorganisms-roots/Amoebozoa | 1 - microorganisms-roots/Archaea | 1 - microorganisms-roots/Bacteria | 1 - microorganisms-roots/Chlamydomonadales | 1 - microorganisms-roots/Chlorella | 1 - microorganisms-roots/Choanoflagellida | 1 - microorganisms-roots/Cryptophyta | 1 - microorganisms-roots/Desmidiales | 1 - microorganisms-roots/Diplomonadida | 1 - microorganisms-roots/Euglenozoa | 1 - microorganisms-roots/Fungi | 1 - microorganisms-roots/Glaucocystophyceae | 1 - microorganisms-roots/Haptophyta | 1 - microorganisms-roots/Ichthyosporea | 1 - microorganisms-roots/Nematoda | 1 - microorganisms-roots/Oxymonadida | 1 - microorganisms-roots/Parabasalia | 1 - microorganisms-roots/Prototheca | 1 - microorganisms-roots/Retortamonadidae | 1 - microorganisms-roots/Rhizaria | 1 - microorganisms-roots/Stramenopiles | 1 - microorganisms-roots/Viruses | 1 - microorganisms-roots/_Crenarchaeota | 1 - microorganisms-roots/_Euryarchaeota | 1 - microorganisms-roots/_Korarchaeota | 1 - microorganisms-roots/_Nanoarchaeota | 1 - microorganisms-roots/_Volvox | 1 - rewrite-taxonomy.snakefile | 8 ++--- 31 files changed, 59 insertions(+), 44 deletions(-) create mode 100644 microorganisms-roots.txt delete mode 100644 microorganisms-roots/Alveolata delete mode 100644 microorganisms-roots/Amoebozoa delete mode 100644 microorganisms-roots/Archaea delete mode 100644 microorganisms-roots/Bacteria delete mode 100644 microorganisms-roots/Chlamydomonadales delete mode 100644 microorganisms-roots/Chlorella delete mode 100644 microorganisms-roots/Choanoflagellida delete mode 100644 microorganisms-roots/Cryptophyta delete mode 100644 microorganisms-roots/Desmidiales delete mode 100644 microorganisms-roots/Diplomonadida delete mode 100644 microorganisms-roots/Euglenozoa delete mode 100644 microorganisms-roots/Fungi delete mode 100644 microorganisms-roots/Glaucocystophyceae delete mode 100644 microorganisms-roots/Haptophyta delete mode 100644 microorganisms-roots/Ichthyosporea delete mode 100644 microorganisms-roots/Nematoda delete mode 100644 microorganisms-roots/Oxymonadida delete mode 100644 microorganisms-roots/Parabasalia delete mode 100644 microorganisms-roots/Prototheca delete mode 100644 microorganisms-roots/Retortamonadidae delete mode 100644 microorganisms-roots/Rhizaria delete mode 100644 microorganisms-roots/Stramenopiles delete mode 100644 microorganisms-roots/Viruses delete mode 100644 microorganisms-roots/_Crenarchaeota delete mode 100644 microorganisms-roots/_Euryarchaeota delete mode 100644 microorganisms-roots/_Korarchaeota delete mode 100644 microorganisms-roots/_Nanoarchaeota delete mode 100644 microorganisms-roots/_Volvox diff --git a/cut-root.py b/cut-root.py index 4c11584..928aeda 100755 --- a/cut-root.py +++ b/cut-root.py @@ -4,27 +4,42 @@ import sys import re -ROOT_FILES = sys.argv[1:] +ROOT_FILE = sys.argv[1] ROOT_CANDIDATES = [] -for rf in ROOT_FILES: - with open(rf) as f: - r = f.read().strip() - ROOT_CANDIDATES.append((rf, r)) +with open(ROOT_FILE) as f: + headers = None + for line in f: + cols = list(c.strip() for c in line.split('\t')) + if headers is None: + headers = cols + continue + record = dict(zip(headers, cols)) + if record['Taxonomy ID'] == '': + sys.stderr.write('ignoring %s, not a taxon root\n' % (record['Name'],)) + else: + ROOT_CANDIDATES.append((record['Name'], record['Taxonomy path'])) ROOTS = [] -for rf1, r1 in ROOT_CANDIDATES: +for name1, path1 in ROOT_CANDIDATES: accept = True - for rf2, r2 in ROOT_CANDIDATES: - if r1 == r2 and rf1 == rf2: + for name2, path2 in ROOT_CANDIDATES: + if path1 == path2 and name1 == path2: continue - if r1.startswith(r2 + '/'): - sys.stderr.write('%s excluded since it is subsumed by %s\n' % (rf1, rf2)) + if path1.startswith(path2 + '/'): + sys.stderr.write('%s excluded since it is subsumed by %s\n' % (name1, name2)) accept = False if accept: - ROOTS.append(r1) + sys.stderr.write('Root %s (%s)\n' % (name1, path1)) + ROOTS.append((name1, path1)) -PATTERN = re.compile(r'\t(?:' + '|'.join(ROOTS) + r')[/\t]') +PATTERN = re.compile(r'\t(?:' + '|'.join(path for (name, path) in ROOTS) + r')[/\t]') +REMAIN = list(ROOTS) for line in sys.stdin: m = PATTERN.search(line) if m is not None: sys.stdout.write(line) + for name, path in REMAIN: + if path in line: + REMAIN.remove((name, path)) +for name, path in REMAIN: + sys.stderr.write('%s not seen' % (name,)) diff --git a/microorganisms-roots.txt b/microorganisms-roots.txt new file mode 100644 index 0000000..e938833 --- /dev/null +++ b/microorganisms-roots.txt @@ -0,0 +1,28 @@ +Name MeSH ID MeSH Tree Taxonomy ID Taxonomy path +Alveolata D056893 B01.043 ncbi:33630 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33630 +Amoebozoa D056894 B01.046 ncbi:554915 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:554915 +Archaea D001105 B02 ncbi:2157 /ncbi:1/ncbi:131567/ncbi:2157 +Bacteria D001419 B03 ncbi:2 /ncbi:1/ncbi:131567/ncbi:2 +Chlamydomonadales D000077105 B01.650.940.150.511 ncbi:3042 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042 +Chlorella D002708 B01.650.940.150.469 ncbi:3071 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:2511126/ncbi:3071 +Choanoflagellida D056897 B01.175 ncbi:28009 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:28009 +Cryptophyta D044785 B01.206 ncbi:3027 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:3027 +Desmidiales D058114 B01.650.940.800.150.200 ncbi:131210 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:35493/ncbi:131221/ncbi:131209/ncbi:2684882/ncbi:131210 +Diplomonadida D016828 B01.237 ncbi:5738 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:5738 +Euglenozoa D056898 B01.268 ncbi:33682 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611352/ncbi:33682 +Fungi D005658 B01.300 ncbi:4751 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:4751 +Glaucocystophyceae D058108 B01.650.232 ncbi:38254 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:38254 +Haptophyta D058087 B01.400 ncbi:2830 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2608109/ncbi:2830 +Ichthyosporea D050298 B01.500 ncbi:127916 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:127916 +Nematoda D009348 B01.050.500.500.294 ncbi:6231 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:6231 +Oxymonadida D056899 B01.625 ncbi:66288 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:2662611/ncbi:66288 +Parabasalia D056900 B01.630 ncbi:5719 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:5719 +Prototheca D011525 B01.650.940.150.634 ncbi:3110 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:3110 +Retortamonadidae D056919 B01.675 ncbi:193075 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:193075 +Rhizaria D056901 B01.680 ncbi:543769 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:543769 +Stramenopiles D058009 B01.750 ncbi:33634 /ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33634 +Viruses D014780 B04 ncbi:10239 /ncbi:1/ncbi:10239 +Microbiological Phenomena D008827 G06 +Microbiology D008829 H01.158.273.540 +Microbiological Techniques D008828 E05.200.875,E01.370.225.875 +Attachment Sites, Microbiological D001287 G05.360.340.024.079 diff --git a/microorganisms-roots/Alveolata b/microorganisms-roots/Alveolata deleted file mode 100644 index 68604cd..0000000 --- a/microorganisms-roots/Alveolata +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33630 diff --git a/microorganisms-roots/Amoebozoa b/microorganisms-roots/Amoebozoa deleted file mode 100644 index 0d27f58..0000000 --- a/microorganisms-roots/Amoebozoa +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:554915 diff --git a/microorganisms-roots/Archaea b/microorganisms-roots/Archaea deleted file mode 100644 index 48279c0..0000000 --- a/microorganisms-roots/Archaea +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2157 diff --git a/microorganisms-roots/Bacteria b/microorganisms-roots/Bacteria deleted file mode 100644 index 7abbde7..0000000 --- a/microorganisms-roots/Bacteria +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2 diff --git a/microorganisms-roots/Chlamydomonadales b/microorganisms-roots/Chlamydomonadales deleted file mode 100644 index 92eac40..0000000 --- a/microorganisms-roots/Chlamydomonadales +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042 diff --git a/microorganisms-roots/Chlorella b/microorganisms-roots/Chlorella deleted file mode 100644 index 945ed27..0000000 --- a/microorganisms-roots/Chlorella +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:2511126/ncbi:3071 diff --git a/microorganisms-roots/Choanoflagellida b/microorganisms-roots/Choanoflagellida deleted file mode 100644 index d6449de..0000000 --- a/microorganisms-roots/Choanoflagellida +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:28009 diff --git a/microorganisms-roots/Cryptophyta b/microorganisms-roots/Cryptophyta deleted file mode 100644 index 1255f03..0000000 --- a/microorganisms-roots/Cryptophyta +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:3027 diff --git a/microorganisms-roots/Desmidiales b/microorganisms-roots/Desmidiales deleted file mode 100644 index 5a511d1..0000000 --- a/microorganisms-roots/Desmidiales +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:35493/ncbi:131221/ncbi:131209/ncbi:2684882/ncbi:131210 diff --git a/microorganisms-roots/Diplomonadida b/microorganisms-roots/Diplomonadida deleted file mode 100644 index 37583ce..0000000 --- a/microorganisms-roots/Diplomonadida +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:5738 diff --git a/microorganisms-roots/Euglenozoa b/microorganisms-roots/Euglenozoa deleted file mode 100644 index c2a12e9..0000000 --- a/microorganisms-roots/Euglenozoa +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611352/ncbi:33682 diff --git a/microorganisms-roots/Fungi b/microorganisms-roots/Fungi deleted file mode 100644 index 5669838..0000000 --- a/microorganisms-roots/Fungi +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:4751 diff --git a/microorganisms-roots/Glaucocystophyceae b/microorganisms-roots/Glaucocystophyceae deleted file mode 100644 index 3059f6d..0000000 --- a/microorganisms-roots/Glaucocystophyceae +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:38254 diff --git a/microorganisms-roots/Haptophyta b/microorganisms-roots/Haptophyta deleted file mode 100644 index dac52ab..0000000 --- a/microorganisms-roots/Haptophyta +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2608109/ncbi:2830 diff --git a/microorganisms-roots/Ichthyosporea b/microorganisms-roots/Ichthyosporea deleted file mode 100644 index 39bc6b1..0000000 --- a/microorganisms-roots/Ichthyosporea +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:127916 diff --git a/microorganisms-roots/Nematoda b/microorganisms-roots/Nematoda deleted file mode 100644 index e8fdf14..0000000 --- a/microorganisms-roots/Nematoda +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:6231 diff --git a/microorganisms-roots/Oxymonadida b/microorganisms-roots/Oxymonadida deleted file mode 100644 index a5a82c3..0000000 --- a/microorganisms-roots/Oxymonadida +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:2662611/ncbi:66288 diff --git a/microorganisms-roots/Parabasalia b/microorganisms-roots/Parabasalia deleted file mode 100644 index f2a90da..0000000 --- a/microorganisms-roots/Parabasalia +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:5719 diff --git a/microorganisms-roots/Prototheca b/microorganisms-roots/Prototheca deleted file mode 100644 index 8974461..0000000 --- a/microorganisms-roots/Prototheca +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:3110 diff --git a/microorganisms-roots/Retortamonadidae b/microorganisms-roots/Retortamonadidae deleted file mode 100644 index 0c9e290..0000000 --- a/microorganisms-roots/Retortamonadidae +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:193075 diff --git a/microorganisms-roots/Rhizaria b/microorganisms-roots/Rhizaria deleted file mode 100644 index b0fe36e..0000000 --- a/microorganisms-roots/Rhizaria +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:543769 diff --git a/microorganisms-roots/Stramenopiles b/microorganisms-roots/Stramenopiles deleted file mode 100644 index e6f72b6..0000000 --- a/microorganisms-roots/Stramenopiles +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33634 diff --git a/microorganisms-roots/Viruses b/microorganisms-roots/Viruses deleted file mode 100644 index e159dd1..0000000 --- a/microorganisms-roots/Viruses +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:10239 diff --git a/microorganisms-roots/_Crenarchaeota b/microorganisms-roots/_Crenarchaeota deleted file mode 100644 index 4dbf627..0000000 --- a/microorganisms-roots/_Crenarchaeota +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:28889 diff --git a/microorganisms-roots/_Euryarchaeota b/microorganisms-roots/_Euryarchaeota deleted file mode 100644 index 8149b0a..0000000 --- a/microorganisms-roots/_Euryarchaeota +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2157/ncbi:28890 diff --git a/microorganisms-roots/_Korarchaeota b/microorganisms-roots/_Korarchaeota deleted file mode 100644 index 60a5d1a..0000000 --- a/microorganisms-roots/_Korarchaeota +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:51967 diff --git a/microorganisms-roots/_Nanoarchaeota b/microorganisms-roots/_Nanoarchaeota deleted file mode 100644 index 4aa9262..0000000 --- a/microorganisms-roots/_Nanoarchaeota +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783276/ncbi:192989 diff --git a/microorganisms-roots/_Volvox b/microorganisms-roots/_Volvox deleted file mode 100644 index 781fcbd..0000000 --- a/microorganisms-roots/_Volvox +++ /dev/null @@ -1 +0,0 @@ -/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042/ncbi:3065/ncbi:3066 diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile index 87cbf43..d2daf4c 100644 --- a/rewrite-taxonomy.snakefile +++ b/rewrite-taxonomy.snakefile @@ -6,10 +6,10 @@ rule all: config['OUTDIR'] + '/finish.txt', config['OUTDIR'] + '/taxid_microorganisms.txt', config['OUTDIR'] + '/taxa+id_microorganisms.txt', - config['OUTDIR'] + '/taxa+id_microorganisms.trie', + # config['OUTDIR'] + '/taxa+id_microorganisms.trie', config['OUTDIR'] + '/taxid_full.txt', config['OUTDIR'] + '/taxa+id_full.txt', - config['OUTDIR'] + '/taxa+id_full.trie' + # config['OUTDIR'] + '/taxa+id_full.trie' rule check: @@ -30,10 +30,10 @@ rule microorganisms: input: full=config['OUTDIR'] + '/{p}_full.txt', - roots='microorganisms-roots' + roots='microorganisms-roots.txt' shell: - '''./cut-root.py {input.roots}/* <{input.full} >{output}''' + '''./cut-root.py {input.roots} <{input.full} >{output}''' rule taxaid_full: -- GitLab