Commit 1b15e0fb authored by Jerome Mariette's avatar Jerome Mariette
Browse files

add genome_abundance

parent 5552f7e5
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os, re
from ng6.ng6workflow import NG6Workflow
from ng6.utils import Utils
class GAAS (NG6Workflow):
def process(self):
gaas = self.add_component("GAAS", [self.args["data_files"], self.args["databases"], self.args["taxids"]])
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import re
from subprocess import Popen, PIPE
from jflow.utils import get_argument_pattern
from jflow.component import Component
from jflow.iotypes import OutputFileList, InputFileList, InputFile, OutputDirectoryList, Formats
from jflow.abstraction import MultiMap
from jflow import seqio
from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Merge
from ng6.analysis import Analysis
from ng6.utils import Utils
def get_splited_outputs(fasta_file, output_directory, max_sequences):
from jflow import seqio
reader = seqio.SequenceReader(fasta_file)
nb_seq = 0
for id, desc, seq, qualities in reader:
nb_seq += 1
if nb_seq <= int(max_sequences):
return [fasta_file]
else:
splited_files = []
basename_wext = os.path.splitext(os.path.basename(fasta_file))[0]
ext = os.path.splitext(os.path.basename(fasta_file))[1]
for i in range(int(round(float(nb_seq)/float(max_sequences)))):
splited_files.append(os.path.join(output_directory, basename_wext + "_" + str(i+1) + ext))
return splited_files
def split_fasta_file(fasta_file, output_directory, max_sequences, *outputs):
from jflow import seqio
reader = seqio.SequenceReader(fasta_file)
nb_seq = 0
for id, desc, seq, qualities in reader:
nb_seq += 1
if nb_seq > int(max_sequences):
splited_files = []
basename_wext = os.path.splitext(os.path.basename(fasta_file))[0]
ext = os.path.splitext(os.path.basename(fasta_file))[1]
for i in range(int(round(float(nb_seq)/float(max_sequences)))):
splited_files.append(os.path.join(output_directory, basename_wext + "_" + str(i+1) + ext))
reader = seqio.SequenceReader(fasta_file)
nb_seq, file_num = 0, 0
fh = open(splited_files[file_num], "w")
for id, desc, seq, qualities in reader:
nb_seq += 1
if nb_seq <= int(max_sequences):
seqio.writefasta(fh, [[id, desc, seq, qualities]])
else:
fh.close()
file_num += 1
fh = open(splited_files[file_num], "w")
nb_seq = 1
seqio.writefasta(fh, [[id, desc, seq, qualities]])
fh.close()
class GAAS (Analysis):
MAX_SEQUENCES_PER_FILE = 3
def define_parameters(self, fasta_files, databases, taxids, min_percent=0, min_q_perc=0, evalue="1e-05", hit_select="top", split=False):
self.min_percent = min_percent
self.min_q_perc = min_q_perc
self.hit_select = hit_select
self.evalue = evalue
self.split = split
self.databases = InputFileList(databases)
self.fasta_files = InputFileList(fasta_files)
self.taxids = InputFile (taxids)
self.tblastx_files = OutputFileList(self.get_outputs('gaas_{basename_woext}_vs_'+os.path.splitext(os.path.basename(self.databases[0]))[0]+'.tblastx', self.fasta_files))
if len(self.databases) > 1:
for db in self.databases[1:]:
self.tblastx_files.extend(OutputFileList(self.get_outputs('gaas_{basename_woext}_vs_'+os.path.splitext(os.path.basename(db))[0]+'.tblastx', self.fasta_files)))
self.krona_files = OutputFileList(self.get_outputs('{basename_woext}_krona.html', self.fasta_files))
def define_analysis(self):
self.name = "GAAS"
self.description = "Genome relative Abundance and Average Size."
self.software = "gaas"
self.options = "-p " + str(self.min_percent) + " -q " + str(self.min_q_perc) + " -s " + self.hit_select + " -gt 0 -gp 0 -gs 0 -e " + self.evalue
def get_version(self):
cmd = [self.get_exec_path("gaas"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stdout.split()[2]
def _get_nb_lines(self, file):
nb_line = 0
for line in open(file).readlines():
nb_line += 1
return nb_line
def post_process(self):
for tblastx in self.tblastx_files:
self._save_file(tblastx, gzip=True)
for krona in self.krona_files:
self._save_file(krona)
for ffile in self.fasta_files:
reader = seqio.SequenceReader(ffile)
nb_seq = 0
for id, desc, seq, qualities in reader:
nb_seq += 1
self._add_result_element(os.path.basename(ffile), "nb_sequences", str(nb_seq))
for db in self.databases:
for tblastx in self.tblastx_files:
if os.path.basename(tblastx) == "gaas_" + os.path.splitext(os.path.basename(ffile))[0]+"_vs_"+os.path.splitext(os.path.basename(db))[0]+".tblastx":
self._add_result_element(os.path.basename(ffile), "nb_sequences", str(self._get_nb_lines(tblastx)), result_group=os.path.basename(db))
def process(self):
# first split all files
splited_files_by_file, splited_files, splited_files_by_tab = {}, [], []
os.mkdir(os.path.join(self.output_directory, "split"))
for ffile in self.fasta_files:
splited_files_by_file[os.path.basename(ffile)] = get_splited_outputs(ffile, os.path.join(self.output_directory, "split"), GAAS.MAX_SEQUENCES_PER_FILE)
splited_files.extend(get_splited_outputs(ffile, os.path.join(self.output_directory, "split"), GAAS.MAX_SEQUENCES_PER_FILE))
split = PythonFunction(split_fasta_file, cmd_format='{EXE} {IN} ' + os.path.join(self.output_directory, "split") + ' ' + str(GAAS.MAX_SEQUENCES_PER_FILE) + ' {OUT}')
split(inputs=ffile, outputs=splited_files_by_file[os.path.basename(ffile)])
splited_output_dirs, splited_tblastx, splited_tblastx_by_db = {}, None, {}
for database in self.databases:
splited_output_dirs[os.path.basename(database)] = OutputDirectoryList(self.get_outputs('{basename_woext}_vs_'+os.path.splitext(os.path.basename(database))[0], splited_files))
if splited_tblastx:
splited_tblastx.extend(OutputFileList(self.get_outputs('{basename_woext}_vs_'+os.path.splitext(os.path.basename(database))[0]+"/gaas_{basename_woext}.tblastx", splited_files)))
else:
splited_tblastx = OutputFileList(self.get_outputs('{basename_woext}_vs_'+os.path.splitext(os.path.basename(database))[0]+"/gaas_{basename_woext}.tblastx", splited_files))
splited_tblastx_by_db[os.path.basename(database)] = OutputFileList(self.get_outputs('{basename_woext}_vs_'+os.path.splitext(os.path.basename(database))[0]+"/gaas_{basename_woext}.tblastx", splited_files))
# create outputs directories for the stdout
for db in splited_output_dirs.keys():
for directory in splited_output_dirs[db]:
os.mkdir(directory)
# execute gaas
for database in self.databases:
gaas = ShellFunction( self.get_exec_path("gaas") + ' -f $1 -d ' + database + ' -a ' + self.taxids + ' -o $2 -x $2 ' + self.options, cmd_format='{EXE} {IN} {OUT}')
gaas = MultiMap(gaas, splited_files, [splited_output_dirs[os.path.basename(database)], splited_tblastx_by_db[os.path.basename(database)]])
# merge the results
merging_tblastx = {}
for file in splited_files_by_file.keys():
merging_tblastx[file] = []
for splited_file in splited_files_by_file[file]:
lookfor = "gaas_" + os.path.splitext(os.path.basename(splited_file))[0] + ".tblastx"
for tbfile in splited_tblastx:
if os.path.basename(tbfile) == lookfor:
merging_tblastx[file].append(tbfile)
merging_tblastx_db = {}
for ffile in merging_tblastx:
merging_tblastx_db[ffile] = {}
for db in self.databases:
lookfor = os.path.splitext(os.path.basename(db))[0]
merging_tblastx_db[ffile][lookfor] = []
for tblastx_file in merging_tblastx[ffile]:
if os.path.basename(os.path.dirname(tblastx_file)).endswith("_vs_"+lookfor):
merging_tblastx_db[ffile][lookfor].append(tblastx_file)
for ffile in self.fasta_files:
for db in self.databases:
concat_file_name = os.path.join(self.output_directory, "gaas_"+os.path.splitext(os.path.basename(ffile))[0]+"_vs_"+os.path.splitext(os.path.basename(db))[0]+".tblastx")
Merge(merging_tblastx_db[os.path.basename(ffile)][os.path.splitext(os.path.basename(db))[0]], concat_file_name)
for i, ffile in enumerate(self.fasta_files):
tb_files = []
for tf in self.tblastx_files:
if os.path.splitext(os.path.basename(tf))[0].startswith("gaas_"+os.path.splitext(os.path.basename(ffile))[0]):
tb_files.append(tf)
krona = ShellFunction( self.get_exec_path("ktImportBLAST") + ' $1 -o $2', cmd_format='{EXE} {IN} {OUT}')
krona = MultiMap(krona, tf, self.krona_files[i])
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
[global]
name = genome_abundance
description = Genome relative Abundance and Average Size.
#
# Parameter section
# param.name: the parameter display name
# .flag: the command line flag to use the argument
# .help: a brief description of what the parameter does
# .default [None]: the value produced if the parameter is not provided
# .type [str]: the parameter type that should be tested (str|int|date|localfile|bool|... all types defined in the types.py package)
# .choices [None]: a container of the allowable values for the parameter
# .required [False]: whether or not the command-line option may be omitted
# .action [store]: the basic type of action to be taken (store|append)
# .group [None]: gathers arguments into groups when displaying help messages / forms
# .exclude [None]: will make sure that there is only one arguments provided
#
[parameters]
data_files.name = Data files
data_files.flag = --data-files
data_files.help = The path to the file to process.
data_files.action = append
data_files.type = localfile
data_files.required = True
databases.name = Metagenome database
databases.flag = --databases
databases.help = The database to use to make the annotations
databases.action = append
databases.type = localfile
databases.required = True
taxids.name = Taxonomic ids
taxids.flag = --taxids
taxids.help = The taxids file provided by gaas
taxids.type = localfile
taxids.required = True
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment