Commit 6f1dbb49 authored by Penom Nom
Browse files

Modify Splitbc so that it is fully independent of the RADseq pipeline

parent c4102f5a
......@@ -25,21 +25,24 @@ from weaver.function import ShellFunction
class Splitbc (Analysis):
def define_parameters(self, fastq_file1, barcode_file, indiv_names, fastq_file2 = [],
rad = None, rad_tag = None, mismatches = None , tag_mismatch = None , trim_barcode = False,
trim_reads2 = False, bol = True, eol = False, partial = None, no_adapt = False):
def define_parameters(self, fastq_file1, barcode_file, fastq_file2 = None, rad = None, rad_tag = None,
mismatches = None , tag_mismatch = None , trim_barcode = False, trim_reads2 = False,
bol = True, eol = False, partial = None, no_adapt = False):
"""
@param fastq_file1: list of fastq_files path
@param barcode_file: barcode file
@param indiv_names: list of list of individual names (example : [[], [] , []]). The length of this
indiv_names must be the same as the length of fastq_file1 (and fastq_file2 if provided)
@param fastq_file2: list of fastq_files path
@param rad:
@param rad_tag:
...
@param fastq_file1: fastq_file1 path or list of fastq file path
@param barcode_file: barcode file path or list of barcode file path
@param fastq_file2: fastq_file2 path or list of fastq file path
"""
if not isinstance(fastq_file1, list):
raise Exception("fastq_file1 must be a list of file path")
if isinstance(fastq_file1, str) :
fastq_file1 = [fastq_file1]
elif not isinstance(fastq_file1, list):
raise Exception("fastq_file1 must be a string or a list of file path")
if isinstance(barcode_file, str) :
barcode_file = [barcode_file]
elif not isinstance(barcode_file, list) :
raise Exception("barcode_file must be a string or a list of file path")
if trim_barcode and trim_reads2 :
raise Exception("you must specify either trim_barcode or trim_reads2, but not both")
......@@ -50,8 +53,6 @@ class Splitbc (Analysis):
if (rad is not None and rad_tag is None) or (rad is None and rad_tag is not None) :
raise Exception("You must specify rad with rad_tag")
self.indiv_names = indiv_names
self.barcode_file = InputFile(barcode_file)
self.mismatches = mismatches
self.tag_mismatch = tag_mismatch
self.trim_barcode = trim_barcode
......@@ -62,17 +63,21 @@ class Splitbc (Analysis):
self.no_adapt = no_adapt
self.rad = rad
self.rad_tag = rad_tag
check_len = len(fastq_file1) == len(indiv_names)
self.fastq1 = InputFileList(fastq_file1, Formats.FASTQ)
self.fastq2 = None
if fastq_file2 :
check_len = len(fastq_file1) == len(indiv_names) == len(fastq_file1)
self.fastq2 = InputFileList(fastq_file2, Formats.FASTQ)
self.bcfile = self.get_temporary_file()
self.barcodes = InputFileList(barcode_file)
self.fastqs1 = InputFileList(fastq_file1, Formats.FASTQ)
self.fastqs2 = None
if not check_len :
raise Exception("length of fastq_file1, fastq_file2 and indiv_names must be the same")
if fastq_file2 :
if isinstance(fastq_file2, str) :
fastq_file2 = [fastq_file2]
elif not isinstance(fastq_file2, list) :
raise Exception("fastq_file2 must be a string or a list of file path")
self.fastqs2 = InputFileList(fastq_file2, Formats.FASTQ)
assert len(self.fastqs1) == len(self.fastqs2) , "if fastq_file1 and fastq_file2 are list, they must be of the same length"
assert len(self.fastqs1) == len(self.barcodes) , "if fastq_file1 and barcode_file are list, they must be of the same length"
self.matrix_read1 = []
self.matrix_read2 = []
......@@ -81,32 +86,39 @@ class Splitbc (Analysis):
self.stdouts = []
self.pools_output_dirs = []
for id, inames in enumerate(self.indiv_names) :
pool_outdir = "pool_" + str(id)
self.pools_output_dirs.append( os.path.join( self.output_directory , pool_outdir ) )
outr1 = OutputFileList(self.get_outputs(os.path.join( pool_outdir, '{basename_woext}_1.fq'), inames), Formats.FASTQ)
self.matrix_read1.append(outr1)
self.output_read1 += outr1
if self.fastq2 is not None :
outr2 = OutputFileList(self.get_outputs(os.path.join(pool_outdir, '{basename_woext}_2.fq'), inames), Formats.FASTQ)
self.matrix_read2.append(outr2)
self.output_read2 += outr2
self.stdouts = OutputFileList( self.get_outputs("splitBC_pool{FULL}.stdout", range(len(self.indiv_names))))
with open(self.bcfile, 'w') as bc_fh :
for id in range(len(self.fastqs1)):
pool_outdir = "pool_" + str(id)
self.pools_output_dirs.append( os.path.join( self.output_directory , pool_outdir ) )
inames = []
with open(self.barcodes[id]) as fh :
for line in fh :
line = line.rstrip()
if line :
bc_fh.write(line + "\n")
inames.append(line.split()[0])
outr1 = OutputFileList(self.get_outputs(os.path.join( pool_outdir, '{basename_woext}_1.fq'), inames), Formats.FASTQ)
self.matrix_read1.append(outr1)
self.output_read1 += outr1
if self.fastqs2 is not None :
outr2 = OutputFileList(self.get_outputs(os.path.join(pool_outdir, '{basename_woext}_2.fq'), inames), Formats.FASTQ)
self.matrix_read2.append(outr2)
self.output_read2 += outr2
self.stdouts = OutputFileList( self.get_outputs("splitBC_pool{FULL}.stdout", range(len(self.fastqs1))))
def get_version(self):
# Report the version of the splitbc.pl executable: the command is the path
# returned by get_exec_path("splitbc.pl") followed by ' --version', run
# through the shell.
proc = subprocess.Popen( self.get_exec_path("splitbc.pl") + ' --version' , shell=True, stdout = subprocess.PIPE )
# Only stdout is piped, so the `stderr` value returned here is None and is
# never used.
stdout, stderr = proc.communicate()
# The captured stdout is returned as-is (no decoding or stripping applied).
return stdout
def define_analysis(self):
self.name = "Demultiplexing"
self.description = "demultiplexing samples"
self.software = os.path.basename(self.get_exec_path("splitbc.pl"))
self.options = [ "--bcfile", self.barcode_file ]
self.options = [ "--bcfile", self.bcfile ]
if self.bol :
self.options.append('--bol')
......@@ -136,7 +148,7 @@ class Splitbc (Analysis):
self.options = ' '.join(self.options)
def post_process(self):
self._add_result_element("barcode_file", "barcode_file", self._save_file(self.barcode_file, "barcode_file") )
self._add_result_element("barcode_file", "barcode_file", self._save_file(self.bcfile, "barcode_file") )
rkey = 'R'
if self.output_read2 :
rkey = 'R1+R2'
......@@ -152,7 +164,7 @@ class Splitbc (Analysis):
self._add_result_element(name, rkey, val)
def process(self):
command_base = [self.get_exec_path("splitbc.pl"), "--bcfile", self.barcode_file]
command_base = [self.get_exec_path("splitbc.pl")]
if self.bol :
command_base.append('--bol')
......@@ -179,23 +191,24 @@ class Splitbc (Analysis):
if self.rad is not None and self.rad_tag is not None :
command_base.extend(["--rad", self.rad, "--radTAG", self.rad_tag])
for id, fastq1 in enumerate(self.fastq1):
for id, fastq1 in enumerate(self.fastqs1):
stdout = self.stdouts[id]
barcode = self.barcodes[id]
outputs_read1 = self.matrix_read1[id]
pool_outdir = self.pools_output_dirs[id]
command = ["mkdir", pool_outdir, ";"] + command_base
command.extend(["--prefix-r1", os.path.join(pool_outdir, "%_1.fq") ])
if self.fastq2 is not None :
fastq2 = self.fastq2[id]
command.extend(["--prefix-r1", os.path.join(pool_outdir, "%_1.fq") , '--bcfile', barcode])
if self.fastqs2 is not None :
fastq2 = self.fastqs2[id]
outputs_read2 = self.matrix_read2[id]
command.extend(["--prefix-r2", os.path.join(pool_outdir, "%_2.fq") , "$1", "$2", '2>&1 >> $3'])
command = ' '.join(command)
splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
splitbc(includes = [self.barcode_file] , inputs=[ fastq1, fastq2 ], outputs=[stdout, outputs_read1, outputs_read2])
splitbc(includes = [barcode] , inputs=[fastq1, fastq2], outputs=[stdout, outputs_read1, outputs_read2])
else :
command.extend([ "$1", '2>&1 >> $2' ])
command = ' '.join(command)
splitbc = ShellFunction(command, cmd_format='{EXE} {IN} {OUT}')
splitbc(includes = [self.barcode_file] , inputs=[ fastq1], outputs=[stdout, outputs_read1])
splitbc(includes = [barcode] , inputs=[fastq1], outputs=[stdout, outputs_read1])
......@@ -62,25 +62,24 @@ class RADseq (NG6Workflow):
indiv_names = []
fastq_files_1 = []
fastq_files_2 = []
barcode_file = self.get_temporary_file()
barcode_files = []
for pool_id, data in pools.iteritems() :
pooldata = data[0]
indivs = data[1]
tmp_barcode = self.get_temporary_file()
barcode_files.append(tmp_barcode)
fastq_files_1.append(pooldata['read1']);
if pooldata['read2'] :
fastq_files_2.append(pooldata['read2'])
# write barcode file
inames = []
with open(barcode_file, "a") as ff:
with open(tmp_barcode, "w") as ff:
for indiv in indivs :
inames.append(indiv['indiv_name'])
ff.write(indiv['indiv_name'] + "\t" + indiv['barcode'] +"\n")
indiv_names.append(inames)
splitbc = self.add_component("Splitbc", [ fastq_files_1,barcode_file, indiv_names, fastq_files_2,
rad, rad_tag, self.args['mismatches'], self.args['tag_mismatch'],
self.args['trim_barcode'], self.args['trim_reads2']])
splitbc = self.add_component("Splitbc", [ fastq_files_1, barcode_files, fastq_files_2, rad, rad_tag, self.args['mismatches'],
self.args['tag_mismatch'], self.args['trim_barcode'], self.args['trim_reads2']])
#ustacks = self.add_component("Ustacks" , [], {"indiv_dic": indivs_by_name, "read1_files" : splitbc.output_read1 , "max_locus" : 3 } )
#cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.args["catalog_mismatches"]])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment