Commit b246a603 authored by Maria Bernard
Browse files

No commit message

No commit message
parent b890ec4f
......@@ -28,8 +28,11 @@ from ng6.analysis import Analysis
from ng6.utils import Utils
import jflow.seqio as seqio
# Restriction enzyme names; this is the same set of names that the
# enzyme_name parameter documentation lists as currently supported.
list_enz = [
    'apeKI', 'bamHI', 'claI', 'dpnII', 'eaeI', 'ecoRI',
    'ecoT22I', 'hindIII', 'mluCI', 'mseI', 'mspI', 'ndeI',
    'nheI', 'nlaIII', 'notI', 'nsiI', 'pstI', 'sau3AI',
    'sbfI', 'sexAI', 'sgrAI', 'sphI', 'taqI', 'xbaI',
]
class Process_radtag (Analysis):
def recover_mate_discards (read1,read2, output_file):
# enregistrement des ID de sequences 1
id_R1=[]
......@@ -93,46 +96,47 @@ class Process_radtag (Analysis):
i=0
handle.close()
def define_parameters(self, read1_files, read2_files, uncall_remove=True, discard_low_qual=True,
                      rescue_radtag=False, max_length=None, quality_encode='phred33',
                      keep_discard_read=True, window_size=0.15, limit_score=10,
                      enzyme_name=None, archive_name=False):
    """
    Declare the inputs, options and outputs of the process_radtags component.

    The letters in parentheses are the process_radtags command line flags
    each parameter corresponds to.

    @param read1_files (-1) : paths to reads 1
    @param read2_files (-2) : paths to reads 2 (must contain as many files as read1_files)
    @param uncall_remove (-c) : clean data, remove any read with an uncalled base.
    @param discard_low_qual (-q) : discard reads with low quality scores.
    @param rescue_radtag (-r) : rescue barcodes and RAD-Tags.
    @param max_length (-t) : truncate final read length to this value. (default none)
    @param quality_encode (-E) : specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger, default) or 'phred64' (Illumina 1.3 - 1.5).
    @param keep_discard_read (-D) : capture discarded reads to a file.
    @param window_size (-w) : set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).
    @param limit_score (-s) : set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).
    @param enzyme_name (-e) : provide the restriction enzyme used (cut site occurs on single-end read);
                              presumably one of the names in the module-level list_enz -- TODO confirm.
    @param archive_name : name for the output archive
    @raise Exception : if read1_files and read2_files do not have the same number of entries
    """
    # Input files
    self.add_input_file_list("read1_files", "paths to reads 1", default=read1_files, required=True, file_format='fastq')
    # Fixed: the help text of read2_files used to say "paths to reads 1".
    self.add_input_file_list("read2_files", "paths to reads 2", default=read2_files, required=True, file_format='fastq')
    # Reads are paired file by file, so both lists must have the same length.
    if len(read1_files) != len(read2_files):
        raise Exception("[ERROR] : the number of files is not correct! (the number of files in read1_files and in read2_files must be the same)")

    # process_radtags options
    self.add_parameter("uncall_remove", "clean data, remove any read with an uncalled base.", default=uncall_remove)
    self.add_parameter("discard_low_qual", "discard reads with low quality scores.", default=discard_low_qual)
    self.add_parameter("rescue_radtag", "rescue barcodes and RAD-Tags.", default=rescue_radtag)
    self.add_parameter("max_length", "truncate final read length to this value. (default none)", default=max_length)
    self.add_parameter("quality_encode", "specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger, default) or 'phred64' (Illumina 1.3 - 1.5).", default=quality_encode)
    self.add_parameter("keep_discard_read", "capture discarded reads to a file.", default=keep_discard_read)
    self.add_parameter("window_size", "set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).", default=window_size)
    self.add_parameter("limit_score", "set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).", default=limit_score)
    # Fixed: enzyme_name used to be registered with default=limit_score,
    # which made the enzyme default to the score limit (10).
    self.add_parameter("enzyme_name", "provide the restriction enzyme used (cut site occurs on single-end read)", default=enzyme_name)
    self.archive_name = archive_name

    # Output files: one compressed kept-read file and one compressed
    # discarded-read file per input, plus one stderr capture per prefix.
    self.prefixes = self.get_outputs('{basename_woext}', [read1_files, read2_files])
    self.output_read_1 = OutputFileList(self.get_outputs('{basename}.gz', self.read1_files), Formats.FASTQ)
    self.output_read_2 = OutputFileList(self.get_outputs('{basename}.gz', self.read2_files), Formats.FASTQ)
    self.discard_read_1 = OutputFileList(self.get_outputs('{basename}.discard.gz', self.read1_files), Formats.FASTQ)
    self.discard_read_2 = OutputFileList(self.get_outputs('{basename}.discard.gz', self.read2_files), Formats.FASTQ)
    self.stderrs = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.prefixes))
def define_analysis(self):
self.name = "Process radtag"
......@@ -206,26 +210,24 @@ Currently supported enzymes include:
return stdout.split()[1]
def process(self):
# Tmp output
# Creates list for temporary uncompressed files
tmp_output_read_1 = os.path.join(self.output_directory, self.get_outputs('{basename}',self.read1_files)
tmp_output_read_2 = os.path.join(self.output_directory, self.get_outputs('{basename}',self.read2_files)
# Tmp output
# Creates list for temporary uncompressed files
tmp_output_read_1 = os.path.join(self.output_directory, self.get_outputs('{basename}',self.read1_files))
tmp_output_read_2 = os.path.join(self.output_directory, self.get_outputs('{basename}',self.read2_files))
tmp_discard_read_1 = os.path.join(self.output_directory, self.get_outputs('{basename}.discard',self.read1_files))
tmp_discard_read_2 = os.path.join(self.output_directory, self.get_outputs('{basename}.discard',self.read2_files))
tmp_discard_read_1 = os.path.join(self.output_directory, self.get_outputs('{basename}.discard',self.read1_files)
tmp_discard_read_2 = os.path.join(self.output_directory, self.get_outputs('{basename}.discard',self.read2_files)
# Process radtags read1 files
for i in range(0, len(self.prefixes)):
for i in range(0, len(self.prefixes)):
process_radtag = ShellFunction(self.get_exec_path("process_radtags") + " -f $1 " + self.options + " -o " + self.output_directory + " 2> $2 ", cmd_format='{EXE} {IN} {OUT}')
process_radtag(inputs = [self.read1_files[i]], outputs = [tmp_output_read_1[i], tmp_discard_read_1[i], self.stderrs[i]])
# Recover_mate and recover_discard
recover_mate = PythonFunction(recover_mate_ok, cmd_format="{EXE} {IN} {OUT}")
recover_mate = Map(recover_mate, inputs = [tmp_output_read_1, self.input_read_2, tmp_output_read_2], outputs=[tmp_output_read_2])
recover_discard = PythonFunction(recover_mate_discard, cmd_format="{EXE} {IN} {OUT}")
recover_discard = Map(recover_discard, inputs = [tmp_discard_read_1, self.input_read_2, tmp_discard_read_2], outputs=[tmp_discard_read_2])
recover_mate = PythonFunction(recover_mate_ok, cmd_format="{EXE} {IN} {OUT}")
recover_mate = Map(recover_mate, inputs = [tmp_output_read_1, self.input_read_2, tmp_output_read_2], outputs=[tmp_output_read_2])
recover_discard = PythonFunction(recover_mate_discard, cmd_format="{EXE} {IN} {OUT}")
recover_discard = Map(recover_discard, inputs = [tmp_discard_read_1, self.input_read_2, tmp_discard_read_2], outputs=[tmp_discard_read_2])
# Compress
compress = ShellFunction("gzip $1 $2 $3", cmd_format='{EXE} {IN} {OUT}')
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment