From 014e209bb74e25ca294ffd0210c915b0008eee00 Mon Sep 17 00:00:00 2001
From: Robert Bossy <Robert.Bossy@inra.fr>
Date: Thu, 11 Apr 2024 17:29:00 +0200
Subject: [PATCH] Handle MeSH tree

Removed mesh-tree from the roots config files.
mesh-download.snakefile downloads the current MeSH tree.
scripts/roots2mesh-tree.py translates a roots file into a list of MeSH paths.
mesh-microbio.snakefile creates the MeSH path list for the microorganism roots.
---
 config.yaml                         |  1 +
 mesh-download.snakefile             | 26 ++++++++++++++++
 mesh-microbio.snakefile             | 26 ++++++++++++++++
 resources/ON-roots.yaml             | 33 --------------------
 resources/microorganisms-roots.yaml | 33 --------------------
 resources/vecteurs-roots.yaml       |  2 --
 scripts/roots2mesh-tree.py          | 48 +++++++++++++++++++++++++++++
 7 files changed, 101 insertions(+), 68 deletions(-)
 create mode 100644 mesh-download.snakefile
 create mode 100644 mesh-microbio.snakefile
 create mode 100755 scripts/roots2mesh-tree.py

diff --git a/config.yaml b/config.yaml
index 743ee70..d1a4272 100644
--- a/config.yaml
+++ b/config.yaml
@@ -15,3 +15,4 @@ NCBI_DIR: 'ncbi-taxonomy'
 NCBI_ZIP_URL: 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip'
 EPPO_DIR: 'EPPO'
 EPPO_ZIP_URL: 'https://data.eppo.int/files/xmlfull.zip'
+MESH_DIR: 'mesh'
diff --git a/mesh-download.snakefile b/mesh-download.snakefile
new file mode 100644
index 0000000..5a84957
--- /dev/null
+++ b/mesh-download.snakefile
@@ -0,0 +1,26 @@
+configfile: 'config.yaml'
+
+
+import datetime
+
+
+OUTDIR=config['OUTDIR'] + '/' + config['MESH_DIR']  # directory that receives the downloaded file
+YEAR=str(datetime.date.today().year)  # year of the run, as a string; used in the file name and in the URL
+
+
+rule all:
+    input:
+        OUTDIR + '/d' + YEAR + '.bin'  # same path as the output of rule mesh
+
+
+rule mesh:
+    output:
+        OUTDIR + '/d' + YEAR + '.bin'
+    # YEAR is exposed as a param so that the shell command can use it in the URL.
+    params:
+        year=YEAR
+    # Fetch d<year>.bin from the NLM asciimesh directory with wget, into the output path.
+    shell:
+        '''wget -O {output} 'https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/asciimesh/d{params.year}.bin' '''
+
+
diff --git a/mesh-microbio.snakefile b/mesh-microbio.snakefile
new file mode 100644
index 0000000..8bd3dc2
--- /dev/null
+++ b/mesh-microbio.snakefile
@@ -0,0 +1,26 @@
+configfile: 'config.yaml'
+
+
+import datetime
+
+
+OUTDIR=config['OUTDIR'] + '/' + config['MESH_DIR']  # directory of the MeSH input file and of the term list
+YEAR=str(datetime.date.today().year)  # year of the run; NOTE(review): no rule in this file creates d<YEAR>.bin, it must already exist
+
+
+rule all:
+    input:
+        OUTDIR + '/microbio-mesh-terms.txt'  # same path as the output of rule mesh_microbio
+
+
+rule mesh_microbio:
+    output:
+        OUTDIR + '/microbio-mesh-terms.txt'
+    # The script is declared as an input, which makes it available as {input.script}.
+    input:
+        script='scripts/roots2mesh-tree.py',
+        mesh=OUTDIR + '/d' + YEAR + '.bin',
+        roots='resources/microorganisms-roots.yaml'
+    # Run the script on the MeSH file and the roots file; its stdout is redirected to the output.
+    shell:
+        '''{input.script} {input.mesh} {input.roots} >{output}'''
diff --git a/resources/ON-roots.yaml b/resources/ON-roots.yaml
index a1c93ab..e88c8e7 100644
--- a/resources/ON-roots.yaml
+++ b/resources/ON-roots.yaml
@@ -3,124 +3,91 @@
 - name: Acari
   taxid: ncbi:6933
 - mesh-id: D056893
-  mesh-tree: B01.043
   name: Alveolata
   taxid: ncbi:33630
 - mesh-id: D056894
-  mesh-tree: B01.046
   name: Amoebozoa
   taxid: ncbi:554915
 - mesh-id: D001105
-  mesh-tree: B02
   name: Archaea
   taxid: ncbi:2157
 - mesh-id: D001419
-  mesh-tree: B03
   name: Bacteria
   taxid: ncbi:2
 - mesh-id: D000077105
-  mesh-tree: B01.650.940.150.511
   name: Chlamydomonadales
   taxid: ncbi:3042
 - mesh-id: D002708
-  mesh-tree: B01.650.940.150.469
   name: Chlorella
   taxid: ncbi:3071
 - mesh-id: D056897
-  mesh-tree: B01.175
   name: Choanoflagellida
   taxid: ncbi:28009
 - mesh-id: D044785
-  mesh-tree: B01.206
   name: Cryptophyta
   taxid: ncbi:3027
 - mesh-id: D058114
-  mesh-tree: B01.650.940.800.150.200
   name: Desmidiales
   taxid: ncbi:131210
 - mesh-id: D016828
-  mesh-tree: B01.237
   name: Diplomonadida
   taxid: ncbi:5738
 - mesh-id: D056898
-  mesh-tree: B01.268
   name: Euglenozoa
   taxid: ncbi:33682
 - mesh-id: D005658
-  mesh-tree: B01.300
   name: Fungi
   taxid: ncbi:4751
 - mesh-id: D058108
-  mesh-tree: B01.650.232
   name: Glaucocystophyceae
   taxid: ncbi:38254
 - mesh-id: D058087
-  mesh-tree: B01.400
   name: Haptophyta
   taxid: ncbi:2830
 - mesh-id: D050298
-  mesh-tree: B01.500
   name: Ichthyosporea
   taxid: ncbi:127916
 - mesh-id: D009348
-  mesh-tree: B01.050.500.500.294
   name: Nematoda
   taxid: ncbi:6231
 - mesh-id: D056899
-  mesh-tree: B01.625
   name: Oxymonadida
   taxid: ncbi:66288
 - mesh-id: D056900
-  mesh-tree: B01.630
   name: Parabasalia
   taxid: ncbi:5719
 - mesh-id: D011525
-  mesh-tree: B01.650.940.150.634
   name: Prototheca
   taxid: ncbi:3110
 - mesh-id: D056919
-  mesh-tree: B01.675
   name: Retortamonadidae
   taxid: ncbi:193075
 - mesh-id: D056901
-  mesh-tree: B01.680
   name: Rhizaria
   taxid: ncbi:543769
 - mesh-id: D058009
-  mesh-tree: B01.750
   name: Stramenopiles
   taxid: ncbi:33634
 - mesh-id: D014780
-  mesh-tree: B04
   name: Viruses
   taxid: ncbi:10239
 - mesh-id: D008827
-  mesh-tree: G06
   name: Microbiological Phenomena
 - mesh-id: D008829
-  mesh-tree: H01.158.273.540
   name: Microbiology
 - mesh-id: D008828
-  mesh-tree: E05.200.875,E01.370.225.875
   name: Microbiological Techniques
 - mesh-id: D001287
-  mesh-tree: G05.360.340.024.079
   name: Attachment Sites, Microbiological
 - mesh-id: D064806
-  mesh-tree: C23.550.308
   name: Dysbiosis
 - mesh-id: D054892
-  mesh-tree: G05.360.340.550
   name: Metagenome
 - mesh-id: D064349
-  mesh-tree: G05.360.340.358
   name: Genome, Microbial
 - mesh-id: D056226
-  mesh-tree: A20
   name: Bacterial Structures
 - mesh-id: D056229
-  mesh-tree: A19
   name: Fungal Structures
 - mesh-id: D056224
-  mesh-tree: A21
   name: Viral Structures
diff --git a/resources/microorganisms-roots.yaml b/resources/microorganisms-roots.yaml
index 0bbbeb7..9801ba0 100644
--- a/resources/microorganisms-roots.yaml
+++ b/resources/microorganisms-roots.yaml
@@ -1,123 +1,90 @@
 - mesh-id: D056893
-  mesh-tree: B01.043
   name: Alveolata
   taxid: ncbi:33630
 - mesh-id: D056894
-  mesh-tree: B01.046
   name: Amoebozoa
   taxid: ncbi:554915
 - mesh-id: D001105
-  mesh-tree: B02
   name: Archaea
   taxid: ncbi:2157
 - mesh-id: D001419
-  mesh-tree: B03
   name: Bacteria
   taxid: ncbi:2
 - mesh-id: D000077105
-  mesh-tree: B01.650.940.150.511
   name: Chlamydomonadales
   taxid: ncbi:3042
 - mesh-id: D002708
-  mesh-tree: B01.650.940.150.469
   name: Chlorella
   taxid: ncbi:3071
 - mesh-id: D056897
-  mesh-tree: B01.175
   name: Choanoflagellida
   taxid: ncbi:28009
 - mesh-id: D044785
-  mesh-tree: B01.206
   name: Cryptophyta
   taxid: ncbi:3027
 - mesh-id: D058114
-  mesh-tree: B01.650.940.800.150.200
   name: Desmidiales
   taxid: ncbi:131210
 - mesh-id: D016828
-  mesh-tree: B01.237
   name: Diplomonadida
   taxid: ncbi:5738
 - mesh-id: D056898
-  mesh-tree: B01.268
   name: Euglenozoa
   taxid: ncbi:33682
 - mesh-id: D005658
-  mesh-tree: B01.300
   name: Fungi
   taxid: ncbi:4751
 - mesh-id: D058108
-  mesh-tree: B01.650.232
   name: Glaucocystophyceae
   taxid: ncbi:38254
 - mesh-id: D058087
-  mesh-tree: B01.400
   name: Haptophyta
   taxid: ncbi:2830
 - mesh-id: D050298
-  mesh-tree: B01.500
   name: Ichthyosporea
   taxid: ncbi:127916
 - mesh-id: D009348
-  mesh-tree: B01.050.500.500.294
   name: Nematoda
   taxid: ncbi:6231
 - mesh-id: D056899
-  mesh-tree: B01.625
   name: Oxymonadida
   taxid: ncbi:66288
 - mesh-id: D056900
-  mesh-tree: B01.630
   name: Parabasalia
   taxid: ncbi:5719
 - mesh-id: D011525
-  mesh-tree: B01.650.940.150.634
   name: Prototheca
   taxid: ncbi:3110
 - mesh-id: D056919
-  mesh-tree: B01.675
   name: Retortamonadidae
   taxid: ncbi:193075
 - mesh-id: D056901
-  mesh-tree: B01.680
   name: Rhizaria
   taxid: ncbi:543769
 - mesh-id: D058009
-  mesh-tree: B01.750
   name: Stramenopiles
   taxid: ncbi:33634
 - mesh-id: D014780
-  mesh-tree: B04
   name: Viruses
   taxid: ncbi:10239
 - mesh-id: D008827
-  mesh-tree: G06
   name: Microbiological Phenomena
 - mesh-id: D008829
-  mesh-tree: H01.158.273.540
   name: Microbiology
 - mesh-id: D008828
-  mesh-tree: E05.200.875,E01.370.225.875
   name: Microbiological Techniques
 - mesh-id: D001287
-  mesh-tree: G05.360.340.024.079
   name: Attachment Sites, Microbiological
 - mesh-id: D064806
-  mesh-tree: C23.550.308
   name: Dysbiosis
 - mesh-id: D054892
-  mesh-tree: G05.360.340.550
   name: Metagenome
 - mesh-id: D064349
-  mesh-tree: G05.360.340.358
   name: Genome, Microbial
 - mesh-id: D056226
-  mesh-tree: A20
   name: Bacterial Structures
 - mesh-id: D056229
-  mesh-tree: A19
   name: Fungal Structures
 - mesh-id: D056224
-  mesh-tree: A21
   name: Viral Structures
 
diff --git a/resources/vecteurs-roots.yaml b/resources/vecteurs-roots.yaml
index 1bc763e..fa358f3 100644
--- a/resources/vecteurs-roots.yaml
+++ b/resources/vecteurs-roots.yaml
@@ -3,10 +3,8 @@
 - name: Arachnida
   taxid: ncbi:6854
 - mesh-id: D009348
-  mesh-tree: B01.050.500.500.294
   name: Nematoda
   taxid: ncbi:6231
 - mesh-id: D005658
-  mesh-tree: B01.300
   name: Fungi
   taxid: ncbi:4751
diff --git a/scripts/roots2mesh-tree.py b/scripts/roots2mesh-tree.py
new file mode 100755
index 0000000..8f3d1bf
--- /dev/null
+++ b/scripts/roots2mesh-tree.py
@@ -0,0 +1,48 @@
+#!/bin/env python3
+
+
+import sys
+import yaml
+import re
+import collections
+
+
+MESH_FILE = sys.argv[1]
+ROOTS_FILE = sys.argv[2]
+
+
+with open(ROOTS_FILE) as f:
+    ROOTS = dict((e['mesh-id'], e['name']) for e in yaml.load(f, Loader=yaml.FullLoader) if 'mesh-id' in e)
+
+
+LINE = re.compile('([1A-Z_ ]+) = (.*)')
+
+
+def finish(rec):
+    if 'UI' not in rec:
+        return
+    if rec['UI'][0] not in ROOTS:
+        return
+    if 'MN' not in rec:
+        raise RuntimeError('no tree for ' + rec['UI'][0])
+    for mtree in rec['MN']:
+        print(mtree)
+
+
+current = collections.defaultdict(list)
+with open(MESH_FILE) as f:
+    for line in f:
+        line = line.strip()
+        if line == '':
+            continue
+        if line == '*NEWRECORD':
+            finish(current)
+            current = collections.defaultdict(list)
+            continue
+        m = LINE.fullmatch(line)
+        if m is None:
+            raise RuntimeError('could not parse: ' + line)
+        key = m.group(1)
+        value = m.group(2)
+        current[key].append(value)
+finish(current)
-- 
GitLab