Commit 10ee2e45 authored by Penom Nom's avatar Penom Nom
Browse files

split sff pre-treatment : shhhflows

parent c39beb06
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os,re
from subprocess import Popen, PIPE
from jflow.iotypes import OutputFile, OutputFileList, InputFile, InputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import PythonFunction, ShellFunction
from ng6.analysis import Analysis
def create_group_file(output_directory, std_out, groups_files_names):
    """
    Concatenate every ".groups" file found in a directory into a single file.

    :param output_directory: directory scanned (non recursively) for files
                             whose name ends with ".groups"
    :type output_directory: str
    :param std_out: not used by this function; kept so the positional call
                    signature stays unchanged
    :param groups_files_names: path of the merged groups file to write
    :type groups_files_names: str
    """
    # The merged file may itself live in output_directory and end with
    # ".groups" (e.g. left over from a previous run). It must never be picked
    # up as an input, otherwise it would be read while being rewritten.
    target = os.path.abspath(groups_files_names)
    groups_files = []
    # os.listdir() returns entries in arbitrary order: sort them so that the
    # merged content is reproducible from one run to the next.
    for entry in sorted(os.listdir(output_directory)):
        entry = entry.strip()
        if not entry.endswith(".groups"):
            continue
        path = os.path.join(output_directory, entry)
        if os.path.abspath(path) == target:
            continue
        groups_files.append(path)
    # Both context managers close their file on exit, no explicit close needed
    with open(groups_files_names, 'w') as outfile:
        for fname in groups_files:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
class MothurShhhFlows(Analysis):
    """
    Analysis chaining the mothur commands sffinfo, trim.flows and shhh.flows
    on a list of sff files, then merging the resulting ".groups" files.
    """

    def define_parameters(self, sff_files, sample_barcodes=None, bdiffs=0, processors=1):
        """
        Define the component parameters and its input/output files.
          :param sff_files: the sff file(s) to process
          :param sample_barcodes: barcodes used in the experimentation, given as
                                  a string of alternating "name sequence" tokens
                                  (tokens are made of word characters and dots)
          :type sample_barcodes: str
          :param bdiffs: the maximum number of differences allowed with the barcode
          :type bdiffs: int
          :param processors: stored on the component
          :type processors: int
        """
        # define parameters (any falsy value is normalised to None)
        self.sample_barcodes = sample_barcodes if sample_barcodes else None
        self.barcodes_file = os.path.join(self.output_directory, 'barcodes.oligos')
        self.bdiffs = bdiffs
        # NOTE(review): processors is not forwarded to any of the mothur
        # commands built in process() -- confirm whether it should be.
        self.processors = processors
        # define input files
        self.sff_files = InputFileList(sff_files, Formats.SFF)
        # define sffinfo output files
        self.fasta_files = OutputFileList(self.get_outputs('{basename_woext}.fasta', self.sff_files), Formats.FASTA)
        self.qual_files = OutputFileList(self.get_outputs('{basename_woext}.qual', self.sff_files), Formats.QUAL)
        self.flow_files = OutputFileList(self.get_outputs('{basename_woext}.flow', self.sff_files), Formats.FLOW)
        self.sffinfo_stdout = OutputFileList(self.get_outputs('{basename_woext}.sffinfo.stdout', self.sff_files))
        # define trim.flows output files
        self.trim_flow_files = OutputFileList(self.get_outputs('{basename_woext}.flow.files', self.flow_files), Formats.FLOW)
        self.scrap_flow_files = OutputFileList(self.get_outputs('{basename_woext}.scrap.flow', self.flow_files))
        self.groups_files = OutputFileList(self.get_outputs('{basename_woext}.trim.groups', self.flow_files), Formats.MOTHUR_GROUPS)
        self.trimflow_stdout = OutputFileList(self.get_outputs('{basename_woext}.trimflow.stdout', self.flow_files))
        # define shhh.flows output files
        self.shhh_fasta_files = OutputFileList(self.get_outputs('{basename_woext}.shhh.fasta', self.flow_files), Formats.FASTA)
        self.names_files = OutputFileList(self.get_outputs('{basename_woext}.shhh.names', self.flow_files), Formats.MOTHUR_NAMES)
        self.shhh_stdout = OutputFileList(self.get_outputs('{basename_woext}.shhhflows.stdout', self.flow_files))

    def define_analysis(self):
        """
        Set the name, description, software and options describing the analysis.
        """
        self.name = "roche 454 cleaning sff"
        self.description = "Cleaning barcodes."
        self.software = "mothur"
        self.options = "#sffinfo(sff=FILE.sff);"
        self.options += "#trim.flow(flow=FILE.flow,oligos=barcodes.oligos,bdiffs="+str(self.bdiffs)+");"
        self.options += "#shhh.flows(file=FILE.flow.files);"

    def get_version(self):
        """
        Return the version reported by "mothur -version".
          :return: the text following '=' in the second whitespace separated
                   token of the command standard output
          :rtype: str
        """
        cmd = [self.get_exec_path("mothur"), "-version"]
        # universal_newlines=True makes communicate() return text instead of
        # bytes, so the str based parsing below also works with Python 3.
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        stdout, _ = p.communicate()
        return stdout.split()[1].split('=')[1]

    def _get_id_seq_failed(self, scrap_files):
        """
        Count, in a scrap file, the occurrences of each failure code.
          :param scrap_files: path of the scrap file to parse
          :type scrap_files: str
          :return: failure code -> number of lines carrying this code; empty
                   when the file is empty
          :rtype: dict
        """
        ids_failed = {}
        with open(scrap_files) as handle:
            # The first line is never parsed; the default value keeps an empty
            # file from raising.
            next(handle, None)
            for line in handle:
                # "<word chars>|<letters>": only the letters part is counted
                m = re.search(r"(\w+)\|([a-zA-Z]*)", line)
                if m:
                    id_failed = m.group(2)
                    ids_failed[id_failed] = ids_failed.get(id_failed, 0) + 1
        return ids_failed

    def post_process(self):
        """
        Store the number of sequences per sample and the number of failures
        per failure code as result elements.
        """
        # NOTE(review): only the first groups file and the first scrap file
        # are read here -- confirm this is intended when several sff files
        # are processed.
        groups_seqs_count = {}
        with open(self.groups_files[0]) as handle:
            for line in handle:
                fields = line.split()
                if not fields:
                    # blank line: nothing to count
                    continue
                # each line is expected to hold exactly "<seq_id> <group>"
                _, group = fields
                groups_seqs_count[group] = groups_seqs_count.get(group, 0) + 1
        # number of sequences per sample
        for sample, count in groups_seqs_count.items():
            self._add_result_element(sample.replace('.', '_'), "nb_seq", count)
        # global sequences failed and not by sample
        ids_failed = self._get_id_seq_failed(self.scrap_flow_files[0])
        for ids, count in ids_failed.items():
            self._add_result_element(ids, "ids_failed_trim_seqs", count)
        #TODO add to the db
        # #save barcodes
        # self._add_result_element(os.path.basename(self.barcodes_file), "barcodes_file", self._save_file(self.barcodes_file, os.path.basename(self.barcodes_file)+".cfg"))

    def process(self):
        """
        Write the barcodes file, then declare the sffinfo, trim.flows and
        shhh.flows steps and the final merge of the groups files.
          :raises ValueError: if sample_barcodes does not hold an even number
                              of tokens (one name and one sequence per barcode)
        """
        # Parse the barcodes before touching the file, so that an invalid
        # value does not leave a truncated barcodes file behind.
        dict_barcode = {}
        if self.sample_barcodes:
            tokens = re.findall(r"[\w.]+", self.sample_barcodes)
            if len(tokens) % 2:
                raise ValueError("sample_barcodes must hold 'name sequence' pairs, got an odd number of tokens: %r" % (self.sample_barcodes,))
            # even positions are names, odd positions are sequences
            dict_barcode = dict(zip(tokens[0::2], tokens[1::2]))
        # create a barcode file to give as input to mothur (left empty when
        # no barcode was provided)
        with open(self.barcodes_file, "w") as f_barcodes:
            for name, sequence in dict_barcode.items():
                f_barcodes.write('barcode\t%s\t%s\n' % (sequence, name))

        # In each shell command $1 is the input file and $2 the file receiving
        # the standard output of mothur.
        sffinfo = ShellFunction(self.get_exec_path("mothur") + ' "#sffinfo(sff=$1,outputdir='+self.output_directory+'/)" > $2', cmd_format='{EXE} {IN} {OUT}')
        sffinfo = MultiMap(sffinfo, inputs=[self.sff_files], outputs=[self.sffinfo_stdout, self.fasta_files, self.qual_files, self.flow_files])

        trimflows = ShellFunction(self.get_exec_path("mothur") + ' "#trim.flows(flow=$1,oligos='+self.barcodes_file+',bdiffs='+str(self.bdiffs)+',outputdir='+self.output_directory +'/)" > $2',
                                  cmd_format='{EXE} {IN} {OUT}')
        trimflows = MultiMap(trimflows, inputs=[self.flow_files], outputs=[self.trimflow_stdout, self.trim_flow_files, self.scrap_flow_files])

        shhhflows = ShellFunction(self.get_exec_path("mothur") + ' "#shhh.flows(file=$1,outputdir='+self.output_directory +'/)" > $2', cmd_format='{EXE} {IN} {OUT}')
        shhhflows = MultiMap(shhhflows, inputs=[self.trim_flow_files], outputs=[self.shhh_stdout, self.shhh_fasta_files, self.names_files])

        # merge the ".groups" files of the output directory once shhh.flows
        # standard outputs are available
        group_files = PythonFunction(create_group_file, cmd_format="{EXE} {ARG} {IN} {OUT}")
        group_files(arguments=[self.output_directory], inputs=[self.shhh_stdout], outputs=[self.groups_files])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment