Newer
Older
config['OUTDIR'] + '/taxid_microorganisms.txt',
config['OUTDIR'] + '/taxa+id_microorganisms.txt',
config['OUTDIR'] + '/taxa+id_microorganisms.trie',
config['OUTDIR'] + '/taxid_full.txt',
config['OUTDIR'] + '/taxa+id_full.txt',
config['OUTDIR'] + '/taxa+id_full.trie',
config['OUTDIR'] + '/microorganisms-roots-paths.txt'
shell:
'''{config[ALVISNLP]} -J-Xmx24G -alias taxo {input} -alias trie {output} compile-taxonomy.plan'''
rule root_paths:
    # Build the list of root paths from the roots table:
    #   1. `cut -f 5` keeps only the fifth tab-separated column of the input;
    #   2. sed '1d' drops the first line of that column;
    #   3. sed '/^$/d' drops empty lines;
    #   4. sed 's,$,/,' appends a trailing '/' to every remaining line.
    input:
        'microorganisms-roots.txt'
    output:
        config['OUTDIR'] + '/microorganisms-roots-paths.txt'
    shell:
        '''cut -f 5 {input} | sed -e '1d' -e '/^$/d' -e 's,$,/,' >{output}'''
roots='microorganisms-roots.txt'
'''./cut-root.py {input.roots} <{input.full} >{output}'''
# NOTE(review): the `rule ...:` header for this body is not visible in this excerpt.
# Writes <OUTDIR>/taxa+id_full.txt by running the REWRITE_TAXONOMY command on:
#   - names.dmp / nodes.dmp found under <OUTDIR>/<NCBI_DIR>, and
#   - dsmz-names.dmp / dsmz-nodes.dmp found inside the input directory.
# The command's standard output is redirected to the output file.
output:
config['OUTDIR'] + '/taxa+id_full.txt'
# The input is a directory path (<OUTDIR>/<DSMZ_MATCH_DIR>); the shell command appends file names to it.
input:
config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR']
# The -pattern argument lists eight tab-separated fields per line: NAME, TAXID, CANONICAL,
# TAXID_PATH, POS_TAG, RANK, SPECIES_TAXID, SPECIES_NAME. Braces are doubled so that the
# shell-command formatting leaves literal {NAME}-style placeholders for the tool.
# reject.txt and saturate.txt are given as relative paths, i.e. relative to the working directory.
shell:
'''{config[REWRITE_TAXONOMY]} -namesFile {config[OUTDIR]}/{config[NCBI_DIR]}/names.dmp -namesFile {input}/dsmz-names.dmp -prefix ncbi: -rejectionFile reject.txt -rejectNameType in-part -rejectNameType Includes -saturationFile saturate.txt -pattern '{{NAME}}\t{{TAXID}}\t{{CANONICAL}}\t{{TAXID_PATH}}\t{{POS_TAG}}\t{{RANK}}\t{{SPECIES_TAXID}}\t{{SPECIES_NAME}}\n' {config[OUTDIR]}/{config[NCBI_DIR]}/nodes.dmp {input}/dsmz-nodes.dmp >{output}'''
rule taxid_full:
    # Writes <OUTDIR>/taxid_full.txt by running the REWRITE_TAXONOMY command with -taxaDict
    # on the names.dmp / nodes.dmp files under <OUTDIR>/<NCBI_DIR> and on the
    # dsmz-names.dmp / dsmz-nodes.dmp files inside the input directory.
    # Each output line follows the -pattern argument: TAXID, CANONICAL, TAXID_PATH, RANK,
    # separated by tabs. Braces are doubled so the placeholders reach the tool literally.
    # NOTE(review): reject.txt and saturate.txt are relative paths and carry the same names
    # as in the other REWRITE_TAXONOMY invocation of this file -- confirm the two commands
    # cannot clobber each other's files when run concurrently.
    input:
        config['OUTDIR'] + '/' + config['DSMZ_MATCH_DIR']
    output:
        config['OUTDIR'] + '/taxid_full.txt'
    shell:
        '''{config[REWRITE_TAXONOMY]} -taxaDict -namesFile {config[OUTDIR]}/{config[NCBI_DIR]}/names.dmp -namesFile {input}/dsmz-names.dmp -prefix ncbi: -rejectionFile reject.txt -rejectNameType in-part -rejectNameType Includes -saturationFile saturate.txt -pattern '{{TAXID}}\t{{CANONICAL}}\t{{TAXID_PATH}}\t{{RANK}}\n' {config[OUTDIR]}/{config[NCBI_DIR]}/nodes.dmp {input}/dsmz-nodes.dmp >{output}'''