diff --git a/config.yaml b/config.yaml
index 151fab9d2ba66f172cee3ff05acc8f8f1782ae30..96db8a16d94651b34c94045d8ae23a1293712f57 100644
--- a/config.yaml
+++ b/config.yaml
@@ -3,8 +3,9 @@ BACDIVE_USER: 'Robert.Bossy@inrae.fr'
 BACDIVE_PASSWORD_FILE: '.bacdive'
 
 
-# AlvisNLP binary
+# AlvisNLP and rewrite-taxonomy binaries
 ALVISNLP: '~/code/alvisnlp/.test/alvisnlp/bin/alvisnlp'
+REWRITE_TAXONOMY: '~/code/bibliome-java-utils/test/install/bin/rewrite-taxonomy'
 
 
 # NCBI Taxonomy files
diff --git a/cut-root.py b/cut-root.py
new file mode 100755
index 0000000000000000000000000000000000000000..4c115849e066fdc59601d061eb0250c8408c898a
--- /dev/null
+++ b/cut-root.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""Keep the taxa that are located under a set of root taxa.
+
+Usage: cut-root.py ROOT_FILE... <taxa >filtered
+
+Each ROOT_FILE holds one taxid path, e.g. /ncbi:1/ncbi:131567/ncbi:2.
+A line of standard input is copied to standard output when it contains a
+tab, then one of the root paths, then either '/' or another tab.
+"""
+
+import re
+import sys
+
+
+def read_roots(root_files):
+    """Return (file name, path) pairs read from root_files, skipping empty files."""
+    candidates = []
+    for rf in root_files:
+        with open(rf) as f:
+            root = f.read().strip()
+        if not root:
+            # An empty path would subsume every root and select almost any line.
+            sys.stderr.write('%s ignored since it is empty\n' % rf)
+            continue
+        candidates.append((rf, root))
+    return candidates
+
+
+def select_roots(candidates):
+    """Return the distinct paths that are not located under another candidate."""
+    roots = []
+    for rf1, r1 in candidates:
+        accept = True
+        for rf2, r2 in candidates:
+            # The trailing '/' keeps ncbi:2 from subsuming ncbi:2157; a path never
+            # starts with itself plus '/', so no self-comparison guard is needed.
+            if r1.startswith(r2 + '/'):
+                sys.stderr.write('%s excluded since it is subsumed by %s\n' % (rf1, rf2))
+                accept = False
+        if accept and r1 not in roots:
+            roots.append(r1)
+    return roots
+
+
+def build_pattern(roots):
+    """Compile the regex that selects lines holding a path at or under a root."""
+    # Roots are file contents: escape them so that they are matched literally.
+    alternatives = '|'.join(re.escape(root) for root in roots)
+    return re.compile(r'\t(?:' + alternatives + r')[/\t]')
+
+
+def main(argv):
+    """Filter standard input with the root files named in argv[1:]."""
+    roots = select_roots(read_roots(argv[1:]))
+    if not roots:
+        # An empty alternation would match any line containing '\t/' or '\t\t'.
+        sys.stderr.write('usage: %s ROOT_FILE... <taxa >filtered\n' % argv[0])
+        return 1
+    pattern = build_pattern(roots)
+    for line in sys.stdin:
+        if pattern.search(line):
+            sys.stdout.write(line)
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/microorganisms-roots/Alveolata b/microorganisms-roots/Alveolata
new file mode 100644
index 0000000000000000000000000000000000000000..68604cd8348a3e7bc373e5c525529fe1692c6e23
--- /dev/null
+++ b/microorganisms-roots/Alveolata
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33630
diff --git a/microorganisms-roots/Amoebozoa b/microorganisms-roots/Amoebozoa
new file mode 100644
index 0000000000000000000000000000000000000000..0d27f586007b5151384b30f248c0b53f85a5d67d
--- /dev/null
+++ b/microorganisms-roots/Amoebozoa
@@ -0,0 +1 @@
+/ncbi:1/ncbi:131567/ncbi:2759/ncbi:554915
diff --git a/microorganisms-roots/Archaea 
b/microorganisms-roots/Archaea new file mode 100644 index 0000000000000000000000000000000000000000..48279c000caaf57e3212dcd7e171fd48b3ce6fc6 --- /dev/null +++ b/microorganisms-roots/Archaea @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2157 diff --git a/microorganisms-roots/Bacteria b/microorganisms-roots/Bacteria new file mode 100644 index 0000000000000000000000000000000000000000..7abbde76f45ab7a5dff8882ae590b476c6e4de15 --- /dev/null +++ b/microorganisms-roots/Bacteria @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2 diff --git a/microorganisms-roots/Chlamydomonadales b/microorganisms-roots/Chlamydomonadales new file mode 100644 index 0000000000000000000000000000000000000000..92eac407ada8ce72894d36167d9ceb82fe0d54a1 --- /dev/null +++ b/microorganisms-roots/Chlamydomonadales @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042 diff --git a/microorganisms-roots/Chlorella b/microorganisms-roots/Chlorella new file mode 100644 index 0000000000000000000000000000000000000000..945ed2730462e7cf45917b614cfced94e0506f1a --- /dev/null +++ b/microorganisms-roots/Chlorella @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:2511126/ncbi:3071 diff --git a/microorganisms-roots/Choanoflagellida b/microorganisms-roots/Choanoflagellida new file mode 100644 index 0000000000000000000000000000000000000000..d6449de27e3ed938cafaaaa796866aaf60ebb92d --- /dev/null +++ b/microorganisms-roots/Choanoflagellida @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:28009 diff --git a/microorganisms-roots/Cryptophyta b/microorganisms-roots/Cryptophyta new file mode 100644 index 0000000000000000000000000000000000000000..1255f0345bf052c749b865c732c15b45e4fac3e4 --- /dev/null +++ b/microorganisms-roots/Cryptophyta @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:3027 diff --git a/microorganisms-roots/Desmidiales b/microorganisms-roots/Desmidiales new file mode 100644 index 
0000000000000000000000000000000000000000..5a511d1c6a594342277c8b2094dd072571c0bf7b --- /dev/null +++ b/microorganisms-roots/Desmidiales @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:35493/ncbi:131221/ncbi:131209/ncbi:2684882/ncbi:131210 diff --git a/microorganisms-roots/Diplomonadida b/microorganisms-roots/Diplomonadida new file mode 100644 index 0000000000000000000000000000000000000000..37583cee7fe4c2cb33115719b1d37cbfd4478b52 --- /dev/null +++ b/microorganisms-roots/Diplomonadida @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:5738 diff --git a/microorganisms-roots/Euglenozoa b/microorganisms-roots/Euglenozoa new file mode 100644 index 0000000000000000000000000000000000000000..c2a12e942141b8e31fbe62b71f2b8f9c62c15539 --- /dev/null +++ b/microorganisms-roots/Euglenozoa @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611352/ncbi:33682 diff --git a/microorganisms-roots/Fungi b/microorganisms-roots/Fungi new file mode 100644 index 0000000000000000000000000000000000000000..5669838e657ef75a49d2236922da4d6054d883d6 --- /dev/null +++ b/microorganisms-roots/Fungi @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:4751 diff --git a/microorganisms-roots/Glaucocystophyceae b/microorganisms-roots/Glaucocystophyceae new file mode 100644 index 0000000000000000000000000000000000000000..3059f6debd93a0631e004c99c6b6cf3046e2da23 --- /dev/null +++ b/microorganisms-roots/Glaucocystophyceae @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:38254 diff --git a/microorganisms-roots/Haptophyta b/microorganisms-roots/Haptophyta new file mode 100644 index 0000000000000000000000000000000000000000..dac52ab2f42fc3338747ade0f494b9f51dd46d02 --- /dev/null +++ b/microorganisms-roots/Haptophyta @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2608109/ncbi:2830 diff --git a/microorganisms-roots/Ichthyosporea b/microorganisms-roots/Ichthyosporea new file mode 100644 index 
0000000000000000000000000000000000000000..39bc6b1a6f7c0108a638c424287c11668b6f93fc --- /dev/null +++ b/microorganisms-roots/Ichthyosporea @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:127916 diff --git a/microorganisms-roots/Nematoda b/microorganisms-roots/Nematoda new file mode 100644 index 0000000000000000000000000000000000000000..e8fdf146342b6e2bd911c75cc72f412c48c95728 --- /dev/null +++ b/microorganisms-roots/Nematoda @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33154/ncbi:33208/ncbi:6072/ncbi:33213/ncbi:33317/ncbi:1206794/ncbi:6231 diff --git a/microorganisms-roots/Oxymonadida b/microorganisms-roots/Oxymonadida new file mode 100644 index 0000000000000000000000000000000000000000..a5a82c366c10196e8a66a18ae4efc3c97563f197 --- /dev/null +++ b/microorganisms-roots/Oxymonadida @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:2662611/ncbi:66288 diff --git a/microorganisms-roots/Parabasalia b/microorganisms-roots/Parabasalia new file mode 100644 index 0000000000000000000000000000000000000000..f2a90daad821b40889111537f7ac2141fdc48202 --- /dev/null +++ b/microorganisms-roots/Parabasalia @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:5719 diff --git a/microorganisms-roots/Prototheca b/microorganisms-roots/Prototheca new file mode 100644 index 0000000000000000000000000000000000000000..89744619d9a7d7ff9c477330ddfa93df22f3ddcb --- /dev/null +++ b/microorganisms-roots/Prototheca @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:75966/ncbi:35460/ncbi:35461/ncbi:3110 diff --git a/microorganisms-roots/Retortamonadidae b/microorganisms-roots/Retortamonadidae new file mode 100644 index 0000000000000000000000000000000000000000..0c9e2904650ea2e50346585ba562f3a56ef10153 --- /dev/null +++ b/microorganisms-roots/Retortamonadidae @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2611341/ncbi:207245/ncbi:193075 diff --git a/microorganisms-roots/Rhizaria b/microorganisms-roots/Rhizaria new file mode 
100644 index 0000000000000000000000000000000000000000..b0fe36efad1d1ec9ab23aa8e886167f4c5369737 --- /dev/null +++ b/microorganisms-roots/Rhizaria @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:543769 diff --git a/microorganisms-roots/Stramenopiles b/microorganisms-roots/Stramenopiles new file mode 100644 index 0000000000000000000000000000000000000000..e6f72b6244417b5fd5c0159d0ef25627cf1994aa --- /dev/null +++ b/microorganisms-roots/Stramenopiles @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:2698737/ncbi:33634 diff --git a/microorganisms-roots/Viruses b/microorganisms-roots/Viruses new file mode 100644 index 0000000000000000000000000000000000000000..e159dd15a0a6ed9626f9aee4d282c4c097bdee4a --- /dev/null +++ b/microorganisms-roots/Viruses @@ -0,0 +1 @@ +/ncbi:1/ncbi:10239 diff --git a/microorganisms-roots/_Crenarchaeota b/microorganisms-roots/_Crenarchaeota new file mode 100644 index 0000000000000000000000000000000000000000..4dbf627d1ba7c7920705372c2d87f1065be12b7a --- /dev/null +++ b/microorganisms-roots/_Crenarchaeota @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:28889 diff --git a/microorganisms-roots/_Euryarchaeota b/microorganisms-roots/_Euryarchaeota new file mode 100644 index 0000000000000000000000000000000000000000..8149b0a0ffbb982d4c24fa0ddbcc7c9603a29ef5 --- /dev/null +++ b/microorganisms-roots/_Euryarchaeota @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2157/ncbi:28890 diff --git a/microorganisms-roots/_Korarchaeota b/microorganisms-roots/_Korarchaeota new file mode 100644 index 0000000000000000000000000000000000000000..60a5d1ac5bf5b2f88df505bed7fe72fc4c1278ee --- /dev/null +++ b/microorganisms-roots/_Korarchaeota @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783275/ncbi:51967 diff --git a/microorganisms-roots/_Nanoarchaeota b/microorganisms-roots/_Nanoarchaeota new file mode 100644 index 0000000000000000000000000000000000000000..4aa9262740749b844712f8d3d32caa32fec78f1c --- /dev/null +++ 
b/microorganisms-roots/_Nanoarchaeota @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2157/ncbi:1783276/ncbi:192989 diff --git a/microorganisms-roots/_Volvox b/microorganisms-roots/_Volvox new file mode 100644 index 0000000000000000000000000000000000000000..781fcbd9259d6ebdd8ef362ed9e45b87e53979a8 --- /dev/null +++ b/microorganisms-roots/_Volvox @@ -0,0 +1 @@ +/ncbi:1/ncbi:131567/ncbi:2759/ncbi:33090/ncbi:3041/ncbi:2692248/ncbi:3166/ncbi:2812636/ncbi:3042/ncbi:3065/ncbi:3066 diff --git a/reject.txt b/reject.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0ee6d56c0180614f39f74ce4b67c0f670fc5dec --- /dev/null +++ b/reject.txt @@ -0,0 +1,137 @@ +ncbi:1 +Be ncbi:1587 +Bd ncbi:1613 +unclassified bacterium ncbi:2338 +unidentified bacteria ncbi:2338 +unidentified bacterium ncbi:2338 +unknown bacteria ncbi:2338 +ncbi:2387 +ncbi:2673 +unidentified proteobacterium ncbi:2722 +unknown proteobacterium ncbi:2722 +rape ncbi:3708 +Glycine ncbi:3846 +rays ncbi:7858 +A hybrid ncbi:8307 +monitors ncbi:8555 +Ara ncbi:9225 +euro ncbi:9319 +man ncbi:9606 +bear ncbi:9632 +bears ncbi:9632 +cat ncbi:9685 +pig ncbi:9823 +Axis ncbi:9855 +Vira ncbi:10239 +unidentified poxvirus ncbi:10283 +unidentified entomopoxvirus ncbi:10291 +ASFV ncbi:10497 +degu ncbi:10160 +LGT ncbi:11085 +LI ncbi:11086 +PVA ncbi:12215 +GA-1 ncbi:12345 +other sequences ncbi:28384 +29278 +Spea ncbi:30316 +A glycine ncbi:307491 +ncbi:32630 +ncbi:32644 +flag ncbi:34205 +plasmids ncbi:36549 +hybrid ncbi:37965 +bacteriophage ncbi:38018 +bacteriophages ncbi:38018 +unidentified bacteriophage ncbi:38018 +unidentified phage ncbi:38018 +mum ncbi:41568 +Arca ncbi:44596 +ncbi:45196 +ncbi:45197 +4ncbi:5328 +Thymus ncbi:49990 +ncbi:52958 +Bacillus ncbi:55087 +ncbi:187 ncbi:55511 +name ncbi:55581 +spot ncbi:59837 +Laser ncbi:62990 +Idea ncbi:76236 +Codon ncbi:79338 +expression vector ncbi:81076 +unidentified expression vector ncbi:81076 +Dina ncbi:83994 +gag ncbi:103820 +Later ncbi:123504 +Ada ncbi:125078 +Side 
ncbi:145724 +Aa ncbi:152839 +tipa ncbi:162890 +This ncbi:169495 +aka ncbi:172644 +permit ncbi:173331 +Car ncbi:201850 +Mene ncbi:206144 +Pero ncbi:214303 +3A ncbi:215167 +Luria ncbi:218032 +Iso ncbi:238707 +Cis ncbi:245896 +ray ncbi:255564 +Pera ncbi:256812 +Mops ncbi:258862 +Bias ncbi:272805 +Sige ncbi:328602 +Span ncbi:333408 +California ncbi:337343 +teta ncbi:338092 +Circe ncbi:345438 +Tasa ncbi:381831 +Nusa ncbi:468772 +A bacterium ncbi:494443 +--> ncbi:545367 +[A-Z]\. alpha +[A-Z]\. beta +[A-Z]\. gamma +[A-Z]\. delta +[A-Z]\. epsilon +[A-Z]\. group +A group +A major +A minor +A central +A minor +A delta +A means +A maximum +A minimum +S medium +A mouse +A flagellum +S complex +Asp +Beta +Helix +rat +Tor +Bio ncbi:463801 +Chen ncbi:8842 +Color ncbi:8869 +Dialysis ncbi:124307 +Ideas ncbi:76236 +Indicator ncbi:189528 +Phyla ncbi:86858 +163164 +374463 +tetra +408170 +Delta ncbi:998453 +is ncbi:159382 +Are ncbi:695398 +Electron ncbi:1118549 +environmental samples +E ncbi:178505 +AND ncbi:1481724 +clinical samples ncbi:88229 +clinical samples ncbi:191496 +clinical samples ncbi:226901 diff --git a/rewrite-taxonomy.snakefile b/rewrite-taxonomy.snakefile index 2fbefe57feb24d1bb5e758bb985f8e0e21170a3d..9acf158a3926e5a5abd0fcdc857075a0562d463f 100644 --- a/rewrite-taxonomy.snakefile +++ b/rewrite-taxonomy.snakefile @@ -1,11 +1,23 @@ configfile: 'config.yaml' -rule rewrite: +rule microorganisms: output: - config['OUTDIR'] + '/taxa+id_full_with-DSMZ.txt' + config['OUTDIR'] + '/taxa+id_microorganisms.txt' + + input: + full=config['OUTDIR'] + '/taxa+id_full.txt', + roots='microorganisms-roots' + + shell: + '''./cut-root.py {input.roots}/* <{input.full} >{output}''' + + +rule full: + output: + config['OUTDIR'] + '/taxa+id_full.txt' input: config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR'] shell: - '''../bibliome-java-utils/test/install/bin/rewrite-taxonomy -namesFile {config[NCBI_DIR]}/names.dmp -namesFile {input}/dsmz-names.dmp -prefix ncbi: -rejectionFile reject.txt 
-saturationFile saturate.txt -pattern '{{NAME}}\t{{TAXID}}\t{{CANONICAL}}\t{{TAXID_PATH}}\t{{POS_TAG}}\t{{RANK}}\t{{SPECIES_TAXID}}\t{{SPECIES_NAME}}\n' {config[NCBI_DIR]}/nodes.dmp {input}/dsmz-nodes.dmp >{output}''' + '''{config[REWRITE_TAXONOMY]} -namesFile {config[NCBI_DIR]}/names.dmp -namesFile {input}/dsmz-names.dmp -prefix ncbi: -rejectionFile reject.txt -saturationFile saturate.txt -pattern '{{NAME}}\t{{TAXID}}\t{{CANONICAL}}\t{{TAXID_PATH}}\t{{POS_TAG}}\t{{RANK}}\t{{SPECIES_TAXID}}\t{{SPECIES_NAME}}\n' {config[NCBI_DIR]}/nodes.dmp {input}/dsmz-nodes.dmp >{output}''' diff --git a/saturate.txt b/saturate.txt new file mode 100644 index 0000000000000000000000000000000000000000..903d41c9d2b4bc27ede5c1fbc52c9f03a8fed1db --- /dev/null +++ b/saturate.txt @@ -0,0 +1,5 @@ +([A-Z])[a-z]+ ([a-z]+) genre name abbreviation {1}. {2} {1} {2} {1}.{2} {1} . {2} +([^-]*)-(.*) dash-space replacement {1} {2} +(.*) [(]?([A-Z]\w+), (\d{4})[)]? author name variation {1} {2} {1} ({2}) +[A-Z][a-z]*[ao] redneck plural {0}s +([A-Z][a-z]*)us latin plural {1}i