Commit b1116c43 authored by Penom Nom's avatar Penom Nom
Browse files

add ustack component to radseq workflow

parent 9699b1c6
......@@ -19,7 +19,7 @@ import os
import glob
import sys
from ng6.ng6workflow import NG6Workflow, BasicNG6Workflow
from ng6.ng6workflow import NG6Workflow
class RADseq (NG6Workflow):
......@@ -33,34 +33,42 @@ class RADseq (NG6Workflow):
raise ValueError, "Duplicated pool id." + p['id']
pools[p["id"]] = (p, [])
barcodes = [] # array of tuples
indivs_by_name = {}
for indiv in self.args["individu"]:
pool_id = indiv['pool_id']
if not pools.has_key(pool_id):
raise ValueError, "The pool id " + pool_id + " does not exists in (individual " + indiv['indiv_name'] + ")"
pools[pool_id][1].append(indiv)
barcodes.append((indiv['indiv_name'],indiv['barcode']))
if indivs_by_name.has_key(indiv['indiv_name']) :
raise ValueError, "Duplicated individual name " + indiv['indiv_name']
indivs_by_name[indiv['indiv_name'] ] = indiv
# prepare fastq files and create a barcode file per pool
barcode_files = []
indiv_names = []
fastq_files_1 = []
fastq_files_2 = []
for pool_id, data in pools.iteritems() :
pooldata = data[0]
indivs = data[1]
fastq_files_1.append(pooldata['read1']);
if pooldata['read2'] :
fastq_files_2.append(pooldata['read2'])
# write barcode file
barcode_file = self.get_temporary_file()
barcode_files.append(barcode_file)
inames = []
with open(barcode_file, "w") as ff:
for i in barcodes :
ff.write(i[0] + "\t" + i[1] + "\n")
# ----- DEMULTIPLEX -----
# process each pairs of fastq for each pool
for pool_id, data in pools.iteritems() :
pooldata = data[0]
splitbc = self.add_component("SplitBC", [ pooldata['read1'], pooldata['read2'] if pooldata['read2'] else None,
indivs_by_name.keys(), barcode_file, self.args['enzyme'], self.args['mismatches'], self.args['tag_mismatch'],
self.args['trim'], self.args['forward']], component_prefix = pool_id)
for indiv in indivs :
inames.append(indiv['indiv_name'])
ff.write(indiv['indiv_name'] + "\t" + indiv['barcode'] +"\n")
indiv_names.append(inames)
#ustacks = self.add_component("Ustacks", [self.args["read_1"]])
ustacks = self.add_component("Ustacks" , [], {"indiv_dic": self.args["individu"], "read1_files" : splitbc.output_read1 , "max_locus" : 3 } , component_prefix = "ustacks")
splitbc = self.add_component("SplitBC", [ fastq_files_1,fastq_files_2 if fastq_files_2 else None,
indiv_names, barcode_files, self.args['enzyme'], self.args['mismatches'], self.args['tag_mismatch'],
self.args['trim'], self.args['forward']])
ustacks = self.add_component("Ustacks" , [], {"indiv_dic": indivs_by_name, "read1_files" : splitbc.output_read1 , "max_locus" : 3 } )
#cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]])
......@@ -17,12 +17,13 @@
import os
from jflow.component import Component
from jflow.iotypes import OutputFileList,OutputFile, InputFile, InputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction
from ng6.analysis import Analysis
class SplitBC (Component):
class SplitBC (Analysis):
ENZYMES = {
'sbfI' : {
......@@ -37,15 +38,28 @@ class SplitBC (Component):
return (self.ENZYMES[name]['rad'], self.ENZYMES[name]['radtag'])
def define_parameters(self, fastq_file1, fastq_file2, indiv_names, barcode_file, enzyme,
def define_parameters(self, fastq_file1, fastq_file2, matrix_indiv_name, barcode_file, enzyme,
mismatches, tag_mismatch, trim = True, forward = True ):
self.indiv_names = indiv_names
self.fastq1 = InputFile(fastq_file1, Formats.FASTQ)
"""
@param fastq_file1: list of fastq_files path
@param fastq_file2: list of fastq_files path
@param matrix_indiv_name: list of list of individual names
@param barcode_file: list of bascode file path
@param enzyme: enzyme name
"""
check_len = len(fastq_file1) == len(matrix_indiv_name) == len(barcode_file)
self.fastq1 = OutputFileList(fastq_file1, Formats.FASTQ)
self.fastq2 = None
if fastq_file2 is not None:
self.fastq2 = InputFile(fastq_file2, Formats.FASTQ)
check_len = len(fastq_file1) == len(matrix_indiv_name) == len(fastq_file1) == len(barcode_file)
self.fastq2 = OutputFileList(fastq_file2, Formats.FASTQ)
if not check_len :
raise Exception("length of fastq_file1, fastq_file2, matrix_indiv_name and barcode_file must be the same")
self.barcode_file = InputFile(barcode_file, Formats.FASTQ)
self.barcode_file = OutputFileList(barcode_file)
self.mismatches = mismatches
self.rad, self.rad_tag = self.get_enzyme(enzyme )
self.tag_mismatch = tag_mismatch
......@@ -53,28 +67,54 @@ class SplitBC (Component):
self.forward = forward
self.prefix_r1 = os.path.join(self.output_directory , "%_1.fq")
self.prefix_r2 = os.path.join(self.output_directory , "%_2.fq")
self.output_read1 = OutputFileList(self.get_outputs('{basename_woext}_1.fq', self.indiv_names), Formats.FASTQ)
self.output_read2 = OutputFileList(self.get_outputs('{basename_woext}_2.fq', self.indiv_names), Formats.FASTQ)
self.stdout = OutputFile(os.path.join(self.output_directory, "splitbc.stdout"))
self.matrix_read1 = []
self.matrix_read2 = []
self.output_read1 = []
self.output_read2 = []
self.stdout = []
for id, inames in enumerate(matrix_indiv_name) :
outr1 = OutputFileList(self.get_outputs('{basename_woext}_1.fq', inames), Formats.FASTQ)
outr2 = OutputFileList(self.get_outputs('{basename_woext}_2.fq', inames), Formats.FASTQ)
self.matrix_read1.append(outr1)
self.matrix_read2.append(outr2)
self.output_read1 += outr1
self.output_read2 += outr2
self.stdout.append(OutputFile(os.path.join(self.output_directory , "splitbc" + str(id) + ".stdout")))
def get_version(self):
return "1.1"
def define_analysis(self):
self.name = "splitbc"
self.description = "Demultiplex individual"
self.software = "splitbc.pl"
self.options = ' '
def process(self):
strand = "--bol" if self.forward else "--eol"
if self.fastq2 is not None :
command = " ".join([
self.get_exec_path("splitbc.pl"), self.fastq1, self.fastq2, "--mismatches", self.mismatches,
"--bcfile", self.barcode_file, "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch,
"--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, "--prefix-r2", self.prefix_r2, strand, ' 2>&1 > ', self.stdout
self.get_exec_path("splitbc.pl"), "$1", "$2", "--mismatches", self.mismatches,
"--bcfile", "$3", "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch,
"--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, "--prefix-r2", self.prefix_r2, strand , ' 2>&1 >> $4 '
])
splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
splitbc(inputs =[self.fastq1, self.fastq2], outputs = [self.output_read1, self.output_read2, self.stdout])
MultiMap(splitbc, inputs=[self.fastq1, self.fastq2, self.barcode_file], outputs=[self.stdout, self.matrix_read1, self.matrix_read2])
else :
command = " ".join([
self.get_exec_path("splitbc.pl"), self.fastq1,"--mismatches", self.mismatches,
"--bcfile", self.barcode_file, "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch,
"--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, strand, ' 2>&1 > ', self.stdout
self.get_exec_path("splitbc.pl"), "$1","--mismatches", self.mismatches,
"--bcfile", "$2", "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch,
"--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, strand, ' 2>&1 > $3 '
])
splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
splitbc(inputs = self.fastq1, outputs = [self.output_read1, self.stdout])
MultiMap(splitbc, inputs=[self.fastq1, self.barcode_file], outputs=[self.stdout, self.matrix_read1])
def post_process(self):
print ""
\ No newline at end of file
......@@ -55,16 +55,36 @@ class Ustacks (Analysis):
self.software = "ustacks"
self.options = "-m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) + \
" --max_locus_stacks " + str(self.max_locus) + " -r -d -t gzfastq --model_type " + self.model_type
" --max_locus_stacks " + str(self.max_locus) + " -r -d -t gzfastq --model_type " + str(self.model_type)
if self.model_type=="snp" or self.model_type=="bounded" :
self.options+= " --alpha " + self.alpha
if self.model_type == "snp" or self.model_type == "bounded" :
self.options += " --alpha " + str(self.alpha)
if self.model_type == "bounded" :
self.options+= " --bound_low " + self.bound_low + " --bound_high " + self.bound_high
self.options += " --bound_low " + str(self.bound_low) + " --bound_high " + str(self.bound_high)
if self.model_type == "fixed" :
self.options+= " --bc_err_freq " + self.bc_err_freq
self.options+= " --bc_err_freq " + str(self.bc_err_freq)
def get_version(self):
cmd = [self.get_exec_path("ustacks"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[1]
def process(self):
for idx, input in enumerate(self.read1_files):
parts = os.path.splitext(os.path.basename(input))
indiv = parts [0]
if parts[1] == ".gz" :
indiv = os.path.splitext(parts[0])
indiv = re.sub( r'_1$', '', indiv)
individ = self.indiv_dic[indiv]["id"]
format = "fastq" if os.path.splitext(os.path.basename(input))[1] != ".gz" else "gzfastq"
ustacks = ShellFunction(self.get_exec_path("ustacks") + " -t " + format + " -f $1 -d -r -o " + self.output_directory + " -i " + str(individ) +
" -m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) +
" 2> $2", cmd_format='{EXE} {IN} {OUT}')
ustacks(inputs = input, outputs = [self.stderrs[idx], self.tags[idx], self.alleles[idx], self.snps[idx]])
def post_process(self):
# Parse stderr
......@@ -146,21 +166,6 @@ class Ustacks (Analysis):
self._add_result_element("all", "meansnp_cov", meansnp_cov)
self._add_result_element("all", "mediansnp_cov", mediansnp_cov)
def get_version(self):
cmd = [self.get_exec_path("ustacks"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[1]
def process(self):
for input in self.read1_files:
indiv = os.path.splitext(os.path.basename(input))[0] if os.path.splitext(os.path.basename(input))[1] != ".gz" else os.path.splitext(os.path.splitext(os.path.basename(input))[0])[0]
id=self.indiv_dic[indiv]["id"]
format="fastq" if os.path.splitext(os.path.basename(input))[1] != ".gz" else gzfastq
ustacks = ShellFunction(self.get_exec_path("ustacks") + " -t " + format + " -f $1 -d -r -o " + self.output_directory + " -i " + id + \
" -m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) + \
" 2> $2", cmd_format='{EXE} {IN} {OUT}')
ustacks(inputs = input, outputs = [self.stderrs[id], self.tags[id], self.alleles[id], self.snps[id]])
def __parse_stderr(self, stderr):
nb_seq_tot, nb_seq_used = "", ""
......
......@@ -52,6 +52,20 @@ read1=workflows/radseq/data/omymune_r1.fq.gz
read2=workflows/radseq/data/omymune_r2.fq.gz
#run
--project-id
1
--name
radseq
--description
The radseq demonstration workflow.
--date
02/01/2014
--data-nature
DNA
--sequencer
HiSeq 2000
--type
Region 1
--species
Escherichia coli
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment