Commit b1116c43 authored by Penom Nom's avatar Penom Nom
Browse files

add ustack component to radseq workflow

parent 9699b1c6
...@@ -19,7 +19,7 @@ import os ...@@ -19,7 +19,7 @@ import os
import glob import glob
import sys import sys
from ng6.ng6workflow import NG6Workflow, BasicNG6Workflow from ng6.ng6workflow import NG6Workflow
class RADseq (NG6Workflow): class RADseq (NG6Workflow):
...@@ -33,34 +33,42 @@ class RADseq (NG6Workflow): ...@@ -33,34 +33,42 @@ class RADseq (NG6Workflow):
raise ValueError, "Duplicated pool id." + p['id'] raise ValueError, "Duplicated pool id." + p['id']
pools[p["id"]] = (p, []) pools[p["id"]] = (p, [])
barcodes = [] # array of tuples
indivs_by_name = {} indivs_by_name = {}
for indiv in self.args["individu"]: for indiv in self.args["individu"]:
pool_id = indiv['pool_id'] pool_id = indiv['pool_id']
if not pools.has_key(pool_id): if not pools.has_key(pool_id):
raise ValueError, "The pool id " + pool_id + " does not exists in (individual " + indiv['indiv_name'] + ")" raise ValueError, "The pool id " + pool_id + " does not exists in (individual " + indiv['indiv_name'] + ")"
pools[pool_id][1].append(indiv) pools[pool_id][1].append(indiv)
barcodes.append((indiv['indiv_name'],indiv['barcode']))
if indivs_by_name.has_key(indiv['indiv_name']) : if indivs_by_name.has_key(indiv['indiv_name']) :
raise ValueError, "Duplicated individual name " + indiv['indiv_name'] raise ValueError, "Duplicated individual name " + indiv['indiv_name']
indivs_by_name[indiv['indiv_name'] ] = indiv indivs_by_name[indiv['indiv_name'] ] = indiv
# write barcode file # prepare fastq files and create a barcode file per pool
barcode_file = self.get_temporary_file() barcode_files = []
with open(barcode_file, "w") as ff: indiv_names = []
for i in barcodes : fastq_files_1 = []
ff.write(i[0] + "\t" + i[1] + "\n") fastq_files_2 = []
# ----- DEMULTIPLEX -----
# process each pairs of fastq for each pool
for pool_id, data in pools.iteritems() : for pool_id, data in pools.iteritems() :
pooldata = data[0] pooldata = data[0]
splitbc = self.add_component("SplitBC", [ pooldata['read1'], pooldata['read2'] if pooldata['read2'] else None, indivs = data[1]
indivs_by_name.keys(), barcode_file, self.args['enzyme'], self.args['mismatches'], self.args['tag_mismatch'], fastq_files_1.append(pooldata['read1']);
self.args['trim'], self.args['forward']], component_prefix = pool_id) if pooldata['read2'] :
fastq_files_2.append(pooldata['read2'])
#ustacks = self.add_component("Ustacks", [self.args["read_1"]]) # write barcode file
ustacks = self.add_component("Ustacks" , [], {"indiv_dic": self.args["individu"], "read1_files" : splitbc.output_read1 , "max_locus" : 3 } , component_prefix = "ustacks") barcode_file = self.get_temporary_file()
barcode_files.append(barcode_file)
inames = []
with open(barcode_file, "w") as ff:
for indiv in indivs :
inames.append(indiv['indiv_name'])
ff.write(indiv['indiv_name'] + "\t" + indiv['barcode'] +"\n")
indiv_names.append(inames)
splitbc = self.add_component("SplitBC", [ fastq_files_1,fastq_files_2 if fastq_files_2 else None,
indiv_names, barcode_files, self.args['enzyme'], self.args['mismatches'], self.args['tag_mismatch'],
self.args['trim'], self.args['forward']])
ustacks = self.add_component("Ustacks" , [], {"indiv_dic": indivs_by_name, "read1_files" : splitbc.output_read1 , "max_locus" : 3 } )
#cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]]) #cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]])
...@@ -17,12 +17,13 @@ ...@@ -17,12 +17,13 @@
import os import os
from jflow.component import Component
from jflow.iotypes import OutputFileList,OutputFile, InputFile, InputFileList, Formats from jflow.iotypes import OutputFileList,OutputFile, InputFile, InputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction from weaver.function import ShellFunction
from ng6.analysis import Analysis
class SplitBC (Component):
class SplitBC (Analysis):
ENZYMES = { ENZYMES = {
'sbfI' : { 'sbfI' : {
...@@ -37,15 +38,28 @@ class SplitBC (Component): ...@@ -37,15 +38,28 @@ class SplitBC (Component):
return (self.ENZYMES[name]['rad'], self.ENZYMES[name]['radtag']) return (self.ENZYMES[name]['rad'], self.ENZYMES[name]['radtag'])
def define_parameters(self, fastq_file1, fastq_file2, indiv_names, barcode_file, enzyme, def define_parameters(self, fastq_file1, fastq_file2, matrix_indiv_name, barcode_file, enzyme,
mismatches, tag_mismatch, trim = True, forward = True ): mismatches, tag_mismatch, trim = True, forward = True ):
self.indiv_names = indiv_names """
self.fastq1 = InputFile(fastq_file1, Formats.FASTQ) @param fastq_file1: list of fastq_files path
@param fastq_file2: list of fastq_files path
@param matrix_indiv_name: list of list of individual names
@param barcode_file: list of bascode file path
@param enzyme: enzyme name
"""
check_len = len(fastq_file1) == len(matrix_indiv_name) == len(barcode_file)
self.fastq1 = OutputFileList(fastq_file1, Formats.FASTQ)
self.fastq2 = None self.fastq2 = None
if fastq_file2 is not None: if fastq_file2 is not None:
self.fastq2 = InputFile(fastq_file2, Formats.FASTQ) check_len = len(fastq_file1) == len(matrix_indiv_name) == len(fastq_file1) == len(barcode_file)
self.fastq2 = OutputFileList(fastq_file2, Formats.FASTQ)
self.barcode_file = InputFile(barcode_file, Formats.FASTQ)
if not check_len :
raise Exception("length of fastq_file1, fastq_file2, matrix_indiv_name and barcode_file must be the same")
self.barcode_file = OutputFileList(barcode_file)
self.mismatches = mismatches self.mismatches = mismatches
self.rad, self.rad_tag = self.get_enzyme(enzyme ) self.rad, self.rad_tag = self.get_enzyme(enzyme )
self.tag_mismatch = tag_mismatch self.tag_mismatch = tag_mismatch
...@@ -53,28 +67,54 @@ class SplitBC (Component): ...@@ -53,28 +67,54 @@ class SplitBC (Component):
self.forward = forward self.forward = forward
self.prefix_r1 = os.path.join(self.output_directory , "%_1.fq") self.prefix_r1 = os.path.join(self.output_directory , "%_1.fq")
self.prefix_r2 = os.path.join(self.output_directory , "%_2.fq") self.prefix_r2 = os.path.join(self.output_directory , "%_2.fq")
self.output_read1 = OutputFileList(self.get_outputs('{basename_woext}_1.fq', self.indiv_names), Formats.FASTQ) self.matrix_read1 = []
self.output_read2 = OutputFileList(self.get_outputs('{basename_woext}_2.fq', self.indiv_names), Formats.FASTQ) self.matrix_read2 = []
self.stdout = OutputFile(os.path.join(self.output_directory, "splitbc.stdout")) self.output_read1 = []
self.output_read2 = []
self.stdout = []
for id, inames in enumerate(matrix_indiv_name) :
outr1 = OutputFileList(self.get_outputs('{basename_woext}_1.fq', inames), Formats.FASTQ)
outr2 = OutputFileList(self.get_outputs('{basename_woext}_2.fq', inames), Formats.FASTQ)
self.matrix_read1.append(outr1)
self.matrix_read2.append(outr2)
self.output_read1 += outr1
self.output_read2 += outr2
self.stdout.append(OutputFile(os.path.join(self.output_directory , "splitbc" + str(id) + ".stdout")))
def get_version(self):
return "1.1"
def define_analysis(self):
self.name = "splitbc"
self.description = "Demultiplex individual"
self.software = "splitbc.pl"
self.options = ' '
def process(self): def process(self):
strand = "--bol" if self.forward else "--eol" strand = "--bol" if self.forward else "--eol"
if self.fastq2 is not None : if self.fastq2 is not None :
command = " ".join([ command = " ".join([
self.get_exec_path("splitbc.pl"), self.fastq1, self.fastq2, "--mismatches", self.mismatches, self.get_exec_path("splitbc.pl"), "$1", "$2", "--mismatches", self.mismatches,
"--bcfile", self.barcode_file, "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch, "--bcfile", "$3", "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch,
"--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, "--prefix-r2", self.prefix_r2, strand, ' 2>&1 > ', self.stdout "--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, "--prefix-r2", self.prefix_r2, strand , ' 2>&1 >> $4 '
]) ])
splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}') splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
splitbc(inputs =[self.fastq1, self.fastq2], outputs = [self.output_read1, self.output_read2, self.stdout]) MultiMap(splitbc, inputs=[self.fastq1, self.fastq2, self.barcode_file], outputs=[self.stdout, self.matrix_read1, self.matrix_read2])
else : else :
command = " ".join([ command = " ".join([
self.get_exec_path("splitbc.pl"), self.fastq1,"--mismatches", self.mismatches, self.get_exec_path("splitbc.pl"), "$1","--mismatches", self.mismatches,
"--bcfile", self.barcode_file, "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch, "--bcfile", "$2", "--rad", self.rad, "--radTAG", self.rad_tag, "--TAG_mismatch", self.tag_mismatch,
"--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, strand, ' 2>&1 > ', self.stdout "--trim" if self.trim else "", "--prefix-r1", self.prefix_r1, strand, ' 2>&1 > $3 '
]) ])
splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}') splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
splitbc(inputs = self.fastq1, outputs = [self.output_read1, self.stdout]) MultiMap(splitbc, inputs=[self.fastq1, self.barcode_file], outputs=[self.stdout, self.matrix_read1])
def post_process(self):
print ""
\ No newline at end of file
...@@ -55,17 +55,37 @@ class Ustacks (Analysis): ...@@ -55,17 +55,37 @@ class Ustacks (Analysis):
self.software = "ustacks" self.software = "ustacks"
self.options = "-m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) + \ self.options = "-m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) + \
" --max_locus_stacks " + str(self.max_locus) + " -r -d -t gzfastq --model_type " + self.model_type " --max_locus_stacks " + str(self.max_locus) + " -r -d -t gzfastq --model_type " + str(self.model_type)
if self.model_type=="snp" or self.model_type=="bounded" : if self.model_type == "snp" or self.model_type == "bounded" :
self.options+= " --alpha " + self.alpha self.options += " --alpha " + str(self.alpha)
if self.model_type == "bounded" : if self.model_type == "bounded" :
self.options+= " --bound_low " + self.bound_low + " --bound_high " + self.bound_high self.options += " --bound_low " + str(self.bound_low) + " --bound_high " + str(self.bound_high)
if self.model_type == "fixed" : if self.model_type == "fixed" :
self.options+= " --bc_err_freq " + self.bc_err_freq self.options+= " --bc_err_freq " + str(self.bc_err_freq)
def get_version(self):
cmd = [self.get_exec_path("ustacks"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[1]
def process(self):
for idx, input in enumerate(self.read1_files):
parts = os.path.splitext(os.path.basename(input))
indiv = parts [0]
if parts[1] == ".gz" :
indiv = os.path.splitext(parts[0])
indiv = re.sub( r'_1$', '', indiv)
individ = self.indiv_dic[indiv]["id"]
format = "fastq" if os.path.splitext(os.path.basename(input))[1] != ".gz" else "gzfastq"
ustacks = ShellFunction(self.get_exec_path("ustacks") + " -t " + format + " -f $1 -d -r -o " + self.output_directory + " -i " + str(individ) +
" -m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) +
" 2> $2", cmd_format='{EXE} {IN} {OUT}')
ustacks(inputs = input, outputs = [self.stderrs[idx], self.tags[idx], self.alleles[idx], self.snps[idx]])
def post_process(self): def post_process(self):
# Parse stderr # Parse stderr
nb_tot, nb_used = 0, 0 nb_tot, nb_used = 0, 0
...@@ -146,22 +166,7 @@ class Ustacks (Analysis): ...@@ -146,22 +166,7 @@ class Ustacks (Analysis):
self._add_result_element("all", "meansnp_cov", meansnp_cov) self._add_result_element("all", "meansnp_cov", meansnp_cov)
self._add_result_element("all", "mediansnp_cov", mediansnp_cov) self._add_result_element("all", "mediansnp_cov", mediansnp_cov)
def get_version(self):
cmd = [self.get_exec_path("ustacks"), "--version"]
p = Popen(cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
return stderr.split()[1]
def process(self):
for input in self.read1_files:
indiv = os.path.splitext(os.path.basename(input))[0] if os.path.splitext(os.path.basename(input))[1] != ".gz" else os.path.splitext(os.path.splitext(os.path.basename(input))[0])[0]
id=self.indiv_dic[indiv]["id"]
format="fastq" if os.path.splitext(os.path.basename(input))[1] != ".gz" else gzfastq
ustacks = ShellFunction(self.get_exec_path("ustacks") + " -t " + format + " -f $1 -d -r -o " + self.output_directory + " -i " + id + \
" -m " + str(self.min_cov) + " -M " + str(self.primary_mismatch) + " -N " + str(self.secondary_mismatch) + \
" 2> $2", cmd_format='{EXE} {IN} {OUT}')
ustacks(inputs = input, outputs = [self.stderrs[id], self.tags[id], self.alleles[id], self.snps[id]])
def __parse_stderr(self, stderr): def __parse_stderr(self, stderr):
nb_seq_tot, nb_seq_used = "", "" nb_seq_tot, nb_seq_used = "", ""
for line in open(stderr).readlines(): for line in open(stderr).readlines():
......
...@@ -52,6 +52,20 @@ read1=workflows/radseq/data/omymune_r1.fq.gz ...@@ -52,6 +52,20 @@ read1=workflows/radseq/data/omymune_r1.fq.gz
read2=workflows/radseq/data/omymune_r2.fq.gz read2=workflows/radseq/data/omymune_r2.fq.gz
#run
--project-id
1
--name
radseq
--description
The radseq demonstration workflow.
--date
02/01/2014
--data-nature
DNA
--sequencer
HiSeq 2000
--type
Region 1
--species
Escherichia coli
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment