From 014e209bb74e25ca294ffd0210c915b0008eee00 Mon Sep 17 00:00:00 2001 From: Robert Bossy <Robert.Bossy@inra.fr> Date: Thu, 11 Apr 2024 17:29:00 +0200 Subject: [PATCH] Handle MeSH tree removed mesh-tree from roots config files mesh-download.snakefile download current MeSH tree scripts/roots2mesh-tree.py translates a root file into a list of MeSH paths mesh-microbio.snakefile creates the MeSH path list for microorganisms roots --- config.yaml | 1 + mesh-download.snakefile | 26 ++++++++++++++++ mesh-microbio.snakefile | 26 ++++++++++++++++ resources/ON-roots.yaml | 33 -------------------- resources/microorganisms-roots.yaml | 33 -------------------- resources/vecteurs-roots.yaml | 2 -- scripts/roots2mesh-tree.py | 48 +++++++++++++++++++++++++++++ 7 files changed, 101 insertions(+), 68 deletions(-) create mode 100644 mesh-download.snakefile create mode 100644 mesh-microbio.snakefile create mode 100755 scripts/roots2mesh-tree.py diff --git a/config.yaml b/config.yaml index 743ee70..d1a4272 100644 --- a/config.yaml +++ b/config.yaml @@ -15,3 +15,4 @@ NCBI_DIR: 'ncbi-taxonomy' NCBI_ZIP_URL: 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip' EPPO_DIR: 'EPPO' EPPO_ZIP_URL: 'https://data.eppo.int/files/xmlfull.zip' +MESH_DIR: 'mesh' diff --git a/mesh-download.snakefile b/mesh-download.snakefile new file mode 100644 index 0000000..5a84957 --- /dev/null +++ b/mesh-download.snakefile @@ -0,0 +1,26 @@ +configfile: 'config.yaml' + + +import datetime + + +OUTDIR=config['OUTDIR'] + '/' + config['MESH_DIR'] +YEAR=str(datetime.date.today().year) + + +rule all: + input: + OUTDIR + '/d' + YEAR + '.bin' + + +rule mesh: + output: + OUTDIR + '/d' + YEAR + '.bin' + + params: + year=YEAR + + shell: + '''wget -O {output} 'https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/asciimesh/d{params.year}.bin' ''' + + diff --git a/mesh-microbio.snakefile b/mesh-microbio.snakefile new file mode 100644 index 0000000..8bd3dc2 --- /dev/null +++ b/mesh-microbio.snakefile @@ -0,0 +1,26 @@ 
+configfile: 'config.yaml' + + +import datetime + + +OUTDIR=config['OUTDIR'] + '/' + config['MESH_DIR'] +YEAR=str(datetime.date.today().year) + + +rule all: + input: + OUTDIR + '/microbio-mesh-terms.txt' + + +rule mesh_microbio: + output: + OUTDIR + '/microbio-mesh-terms.txt' + + input: + script='scripts/roots2mesh-tree.py', + mesh=OUTDIR + '/d' + YEAR + '.bin', + roots='resources/microorganisms-roots.yaml' + + shell: + '''{input.script} {input.mesh} {input.roots} >{output}''' diff --git a/resources/ON-roots.yaml b/resources/ON-roots.yaml index a1c93ab..e88c8e7 100644 --- a/resources/ON-roots.yaml +++ b/resources/ON-roots.yaml @@ -3,124 +3,91 @@ - name: Acari taxid: ncbi:6933 - mesh-id: D056893 - mesh-tree: B01.043 name: Alveolata taxid: ncbi:33630 - mesh-id: D056894 - mesh-tree: B01.046 name: Amoebozoa taxid: ncbi:554915 - mesh-id: D001105 - mesh-tree: B02 name: Archaea taxid: ncbi:2157 - mesh-id: D001419 - mesh-tree: B03 name: Bacteria taxid: ncbi:2 - mesh-id: D000077105 - mesh-tree: B01.650.940.150.511 name: Chlamydomonadales taxid: ncbi:3042 - mesh-id: D002708 - mesh-tree: B01.650.940.150.469 name: Chlorella taxid: ncbi:3071 - mesh-id: D056897 - mesh-tree: B01.175 name: Choanoflagellida taxid: ncbi:28009 - mesh-id: D044785 - mesh-tree: B01.206 name: Cryptophyta taxid: ncbi:3027 - mesh-id: D058114 - mesh-tree: B01.650.940.800.150.200 name: Desmidiales taxid: ncbi:131210 - mesh-id: D016828 - mesh-tree: B01.237 name: Diplomonadida taxid: ncbi:5738 - mesh-id: D056898 - mesh-tree: B01.268 name: Euglenozoa taxid: ncbi:33682 - mesh-id: D005658 - mesh-tree: B01.300 name: Fungi taxid: ncbi:4751 - mesh-id: D058108 - mesh-tree: B01.650.232 name: Glaucocystophyceae taxid: ncbi:38254 - mesh-id: D058087 - mesh-tree: B01.400 name: Haptophyta taxid: ncbi:2830 - mesh-id: D050298 - mesh-tree: B01.500 name: Ichthyosporea taxid: ncbi:127916 - mesh-id: D009348 - mesh-tree: B01.050.500.500.294 name: Nematoda taxid: ncbi:6231 - mesh-id: D056899 - mesh-tree: B01.625 name: Oxymonadida 
taxid: ncbi:66288 - mesh-id: D056900 - mesh-tree: B01.630 name: Parabasalia taxid: ncbi:5719 - mesh-id: D011525 - mesh-tree: B01.650.940.150.634 name: Prototheca taxid: ncbi:3110 - mesh-id: D056919 - mesh-tree: B01.675 name: Retortamonadidae taxid: ncbi:193075 - mesh-id: D056901 - mesh-tree: B01.680 name: Rhizaria taxid: ncbi:543769 - mesh-id: D058009 - mesh-tree: B01.750 name: Stramenopiles taxid: ncbi:33634 - mesh-id: D014780 - mesh-tree: B04 name: Viruses taxid: ncbi:10239 - mesh-id: D008827 - mesh-tree: G06 name: Microbiological Phenomena - mesh-id: D008829 - mesh-tree: H01.158.273.540 name: Microbiology - mesh-id: D008828 - mesh-tree: E05.200.875,E01.370.225.875 name: Microbiological Techniques - mesh-id: D001287 - mesh-tree: G05.360.340.024.079 name: Attachment Sites, Microbiological - mesh-id: D064806 - mesh-tree: C23.550.308 name: Dysbiosis - mesh-id: D054892 - mesh-tree: G05.360.340.550 name: Metagenome - mesh-id: D064349 - mesh-tree: G05.360.340.358 name: Genome, Microbial - mesh-id: D056226 - mesh-tree: A20 name: Bacterial Structures - mesh-id: D056229 - mesh-tree: A19 name: Fungal Structures - mesh-id: D056224 - mesh-tree: A21 name: Viral Structures diff --git a/resources/microorganisms-roots.yaml b/resources/microorganisms-roots.yaml index 0bbbeb7..9801ba0 100644 --- a/resources/microorganisms-roots.yaml +++ b/resources/microorganisms-roots.yaml @@ -1,123 +1,90 @@ - mesh-id: D056893 - mesh-tree: B01.043 name: Alveolata taxid: ncbi:33630 - mesh-id: D056894 - mesh-tree: B01.046 name: Amoebozoa taxid: ncbi:554915 - mesh-id: D001105 - mesh-tree: B02 name: Archaea taxid: ncbi:2157 - mesh-id: D001419 - mesh-tree: B03 name: Bacteria taxid: ncbi:2 - mesh-id: D000077105 - mesh-tree: B01.650.940.150.511 name: Chlamydomonadales taxid: ncbi:3042 - mesh-id: D002708 - mesh-tree: B01.650.940.150.469 name: Chlorella taxid: ncbi:3071 - mesh-id: D056897 - mesh-tree: B01.175 name: Choanoflagellida taxid: ncbi:28009 - mesh-id: D044785 - mesh-tree: B01.206 name: 
Cryptophyta taxid: ncbi:3027 - mesh-id: D058114 - mesh-tree: B01.650.940.800.150.200 name: Desmidiales taxid: ncbi:131210 - mesh-id: D016828 - mesh-tree: B01.237 name: Diplomonadida taxid: ncbi:5738 - mesh-id: D056898 - mesh-tree: B01.268 name: Euglenozoa taxid: ncbi:33682 - mesh-id: D005658 - mesh-tree: B01.300 name: Fungi taxid: ncbi:4751 - mesh-id: D058108 - mesh-tree: B01.650.232 name: Glaucocystophyceae taxid: ncbi:38254 - mesh-id: D058087 - mesh-tree: B01.400 name: Haptophyta taxid: ncbi:2830 - mesh-id: D050298 - mesh-tree: B01.500 name: Ichthyosporea taxid: ncbi:127916 - mesh-id: D009348 - mesh-tree: B01.050.500.500.294 name: Nematoda taxid: ncbi:6231 - mesh-id: D056899 - mesh-tree: B01.625 name: Oxymonadida taxid: ncbi:66288 - mesh-id: D056900 - mesh-tree: B01.630 name: Parabasalia taxid: ncbi:5719 - mesh-id: D011525 - mesh-tree: B01.650.940.150.634 name: Prototheca taxid: ncbi:3110 - mesh-id: D056919 - mesh-tree: B01.675 name: Retortamonadidae taxid: ncbi:193075 - mesh-id: D056901 - mesh-tree: B01.680 name: Rhizaria taxid: ncbi:543769 - mesh-id: D058009 - mesh-tree: B01.750 name: Stramenopiles taxid: ncbi:33634 - mesh-id: D014780 - mesh-tree: B04 name: Viruses taxid: ncbi:10239 - mesh-id: D008827 - mesh-tree: G06 name: Microbiological Phenomena - mesh-id: D008829 - mesh-tree: H01.158.273.540 name: Microbiology - mesh-id: D008828 - mesh-tree: E05.200.875,E01.370.225.875 name: Microbiological Techniques - mesh-id: D001287 - mesh-tree: G05.360.340.024.079 name: Attachment Sites, Microbiological - mesh-id: D064806 - mesh-tree: C23.550.308 name: Dysbiosis - mesh-id: D054892 - mesh-tree: G05.360.340.550 name: Metagenome - mesh-id: D064349 - mesh-tree: G05.360.340.358 name: Genome, Microbial - mesh-id: D056226 - mesh-tree: A20 name: Bacterial Structures - mesh-id: D056229 - mesh-tree: A19 name: Fungal Structures - mesh-id: D056224 - mesh-tree: A21 name: Viral Structures diff --git a/resources/vecteurs-roots.yaml b/resources/vecteurs-roots.yaml index 
1bc763e..fa358f3 100644 --- a/resources/vecteurs-roots.yaml +++ b/resources/vecteurs-roots.yaml @@ -3,10 +3,8 @@ - name: Arachnida taxid: ncbi:6854 - mesh-id: D009348 - mesh-tree: B01.050.500.500.294 name: Nematoda taxid: ncbi:6231 - mesh-id: D005658 - mesh-tree: B01.300 name: Fungi taxid: ncbi:4751
diff --git a/scripts/roots2mesh-tree.py b/scripts/roots2mesh-tree.py
new file mode 100755
index 0000000..8f3d1bf
--- /dev/null
+++ b/scripts/roots2mesh-tree.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# Usage: roots2mesh-tree.py MESH_ASCII_FILE ROOTS_YAML_FILE
+
+import sys
+import yaml
+import re
+import collections
+
+
+MESH_FILE = sys.argv[1]
+ROOTS_FILE = sys.argv[2]
+
+# mesh-id -> name of every root entry that declares a mesh-id (empty file gives {}).
+with open(ROOTS_FILE) as f:
+    ROOTS = {e['mesh-id']: e['name'] for e in (yaml.safe_load(f) or []) if 'mesh-id' in e}
+
+# Field lines of the MeSH file are expected to look like "KEY = value".
+LINE = re.compile('([1A-Z_ ]+) = (.*)')
+
+
+def finish(rec):
+    """Print each MN value of rec (key -> list of values) if its UI is in ROOTS; raise if MN is missing."""
+    if 'UI' not in rec or rec['UI'][0] not in ROOTS:
+        return
+    if 'MN' not in rec:
+        raise RuntimeError('no tree for ' + rec['UI'][0])
+    for mtree in rec['MN']:
+        print(mtree)
+
+
+current = collections.defaultdict(list)
+with open(MESH_FILE) as f:
+    for line in f:
+        line = line.strip()
+        if line == '':
+            continue
+        if line == '*NEWRECORD':
+            finish(current)
+            current = collections.defaultdict(list)
+            continue
+        m = LINE.fullmatch(line)
+        if m is None:
+            raise RuntimeError('could not parse: ' + line)
+        key = m.group(1)
+        value = m.group(2)
+        current[key].append(value)
+# Records are only flushed when the next *NEWRECORD is seen, so flush the last one here.
+finish(current)
-- 
GitLab