Commit 48d51111 authored by Penom Nom

Use split_pair and basename prefix

parent 8010a925
@@ -36,6 +36,10 @@ class Utils(object):
    # Files with these extensions won't be compressed if asked
    UNCOMPRESS_EXTENSION = [".gz", ".zip", ".bam", ".bz", ".bz2"]
    # CASAVA filename format
    CASAVA_FILENAME = { 'sample' : 1, 'barcode' : 2, 'lane' : 3, 'read' : 4, 'package' : 5 }
    CASAVA_FILENAME_SEPARATOR = '_'

    @staticmethod
    def gzip(file, out_dir, delete=False):
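
Aside: a minimal sketch, not part of the commit, of how the 1-based CASAVA_FILENAME positions line up with a file name once it is split on CASAVA_FILENAME_SEPARATOR; the example file name is made up.

# Sketch only: parse a hypothetical CASAVA 1.8-style file name into its fields.
import os

CASAVA_FILENAME = { 'sample' : 1, 'barcode' : 2, 'lane' : 3, 'read' : 4, 'package' : 5 }
CASAVA_FILENAME_SEPARATOR = '_'

name = "/home/sampleA_ATGCTC_L001_R1_001.fastq.gz"
fields = os.path.basename(name).split(CASAVA_FILENAME_SEPARATOR)

# Positions are 1-based, so subtract 1 when indexing the split fields.
parsed = dict((key, fields[position - 1]) for key, position in CASAVA_FILENAME.items())
# parsed == {'sample': 'sampleA', 'barcode': 'ATGCTC', 'lane': 'L001',
#            'read': 'R1', 'package': '001.fastq.gz'}
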
@@ -236,4 +240,54 @@ class Utils(object):
        for n in xrange(1,len(items)+1):
            res += Utils.getUniqueCombinaison(items,n)
        return res
\ No newline at end of file

    @staticmethod
    def split_pair( file_list, is_casava=False ):
        """
        Return the list of read 1 files and the list of read 2 files from a list of files
        @param file_list : the list of files
        @param is_casava : file names in file_list are in CASAVA format
        """
        read_1_list = []
        read_2_list = []
        if is_casava:
            for file in file_list:
                file_name_fields = os.path.basename(file).split(Utils.CASAVA_FILENAME_SEPARATOR)
                read_tag = file_name_fields[Utils.CASAVA_FILENAME['read']-1]
                if read_tag == "R1":
                    read_1_list.append(file)
                else:
                    read_2_list.append(file)
        else:
            sorted_files = sorted(file_list)
            for i in range(0, len(sorted_files), 2):
                read_1_list.append(sorted_files[i])
                read_2_list.append(sorted_files[i+1])
        return [read_1_list, read_2_list]
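
A rough usage sketch for split_pair, not part of the commit; the file names are hypothetical and the Utils class above is assumed to be importable.

# Sketch only: CASAVA-named files are paired by their read tag (R1/R2).
casava_files = [ "/home/sampleA_ATGCTC_L001_R1_001.fastq.gz",
                 "/home/sampleA_ATGCTC_L001_R2_001.fastq.gz" ]
read1, read2 = Utils.split_pair(casava_files, True)
# read1 == ["/home/sampleA_ATGCTC_L001_R1_001.fastq.gz"]
# read2 == ["/home/sampleA_ATGCTC_L001_R2_001.fastq.gz"]

# Sketch only: without CASAVA names the list is sorted and taken two by two,
# so mates are expected to sort next to each other.
plain_files = [ "/data/libA_2.fastq", "/data/libA_1.fastq" ]
read1, read2 = Utils.split_pair(plain_files)
# read1 == ["/data/libA_1.fastq"], read2 == ["/data/libA_2.fastq"]
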

    @staticmethod
    def get_group_basenames( file_list, group_by ):
        """
        Return a dictionary grouping files by basename prefix, where the prefix is made of the CASAVA filename fields up to the given keyword.
        Ex : [/home/sampleA_ATGCTC_L001_R1_001.fastq.gz, /home/sampleA_ATGCTC_L001_R2_001.fastq.gz, /home/sampleB_ATGCCG_L001_R1_001.fastq.gz] grouped by read => keys [sampleA_ATGCTC_L001_R1, sampleA_ATGCTC_L001_R2, sampleB_ATGCCG_L001_R1]
        @param file_list : the list of files (file names are in CASAVA format)
        @param group_by : CASAVA_FILENAME key (ex : read)
        """
        group_basenames = {}
        for file in file_list:
            file_name_fields = os.path.basename(file).split(Utils.CASAVA_FILENAME_SEPARATOR)
            group_tag = Utils.CASAVA_FILENAME_SEPARATOR.join( file_name_fields[:Utils.CASAVA_FILENAME[group_by]] )
            if group_tag in group_basenames:
                group_basenames[group_tag].append(file)
            else:
                group_basenames[group_tag] = [file]
        return group_basenames
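
A rough usage sketch for get_group_basenames, not part of the commit; the paths reuse the docstring example and the Utils class above is assumed to be importable.

# Sketch only: grouping by the 'read' field keeps sample, barcode, lane and
# read in the prefix, so R1 and R2 files of the same sample land in
# different groups.
files = [ "/home/sampleA_ATGCTC_L001_R1_001.fastq.gz",
          "/home/sampleA_ATGCTC_L001_R2_001.fastq.gz",
          "/home/sampleB_ATGCCG_L001_R1_001.fastq.gz" ]
groups = Utils.get_group_basenames(files, "read")
# groups == { 'sampleA_ATGCTC_L001_R1': ['/home/sampleA_ATGCTC_L001_R1_001.fastq.gz'],
#             'sampleA_ATGCTC_L001_R2': ['/home/sampleA_ATGCTC_L001_R2_001.fastq.gz'],
#             'sampleB_ATGCCG_L001_R1': ['/home/sampleB_ATGCCG_L001_R1_001.fastq.gz'] }
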
@@ -22,6 +22,7 @@ import re
from ng6.ng6workflow import NG6Workflow
from ng6.project import Project
from ng6.run import Run
from ng6.utils import Utils
class Casava18 (NG6Workflow):
@@ -104,10 +105,17 @@ class Casava18 (NG6Workflow):
    def process(self):
        # fastq illumina filter
        fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.read1_files+self.read2_files, self.args["illumina_filter_gathering_regexp"], self.args["keep_reads"], self.runobj.name+"_fastqilluminafilter.tar.gz"])
        group_prefix = Utils.get_group_basenames(self.read1_files+self.read2_files, "read")
        fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.read1_files+self.read2_files, self.args["keep_reads"], group_prefix, self.runobj.name+"_fastqilluminafilter.tar.gz"])

        ##### TODO: concatenate fastq

        # Split read 1 and read 2 from the filtered files list
        [filtered_read1_files, filtered_read2_files] = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, True)

        # archive the files
        #addrawfiles = self.add_component("AddRawFiles", [self.runobj, files_to_save, self.args["compression"]])

        # make some statistics on the raw files
        fastqc = self.add_component("FastQC", [fastqilluminafilter.fastq_files_filtered, True, True, self.runobj.name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
@@ -119,6 +127,6 @@ class Casava18 (NG6Workflow):
        # index the reference genome if not already indexed
        bwaindex = self.add_component("BWAIndex", [self.args["reference_genome"]])
        # align reads against the indexed genome
        bwa = self.add_component("BWA", [bwaindex.databank, self.read1_files, self.read2_files])
        bwa = self.add_component("BWA", [bwaindex.databank, filtered_read1_files, filtered_read2_files])
        # make some statistics on the alignment
        alignmentstats = self.add_component("AlignmentStats", [bwa.bam_files])
@@ -102,8 +102,3 @@ keep_reads.flag = --keep
keep_reads.help = Keep reads which pass the Illumina filters or keep reads which do not pass the Illumina filters (pass_illumina_filters|not_pass_illumina_filters)
keep_reads.default = pass_illumina_filters
keep_reads.choices = pass_illumina_filters|not_pass_illumina_filters
illumina_filter_gathering_regexp.name = illumina_filter_gathering_regexp
illumina_filter_gathering_regexp.flag = --illumina-filter-gathering-regexp
illumina_filter_gathering_regexp.help = regexp used to gather files by read and by sample
illumina_filter_gathering_regexp.default = *_*_*_*_(*).stdout
@@ -30,14 +30,14 @@ from weaver.function import ShellFunction
class FastqIlluminaFilter (Analysis):

    def define_parameters(self, fastq_files, gathering_regexp="([^_]+).stdout", keep_reads="pass_illumina_filters", archive_name=None):
    def define_parameters(self, fastq_files, keep_reads="pass_illumina_filters", group_prefix=None, archive_name=None):
        self.fastq_files = InputFileList(fastq_files, Formats.FASTQ)
        self.stdout = OutputFileList(self.get_outputs('{basename_woext}.stdout', self.fastq_files))
        self.keep_reads = "N"
        if keep_reads != "pass_illumina_filters":
            self.keep_reads = "Y"
        self.archive_name = archive_name
        self.regexp = gathering_regexp.replace("*", ".*")
        self.group_prefix = group_prefix
        # Outputs list if the file is not gzipped
        if not self.fastq_files[0].endswith(".gz"):
            self.fastq_files_filtered = OutputFileList(self.get_outputs('{basename_woext}_filtered.fastq', self.fastq_files), Formats.FASTQ)
@@ -56,16 +56,20 @@ class FastqIlluminaFilter (Analysis):
    def post_process(self):
        files = {}
        # Create dictionary : key = file name, value = file path
        for file in self.stdout:
            file_name = os.path.basename(file)
            reg = re.search(self.regexp, file_name)
            file_name = file_name[:reg.start(1)]+"0"+file_name[reg.end(1):]
            if files.has_key(file_name):
                files[file_name].append(file)
            else:
        # Create dictionary : key = file name or prefix, value = file paths
        if self.group_prefix is not None:
            for file in self.stdout:
                for prefix in self.group_prefix:
                    if os.path.basename(file).startswith(prefix):
                        if prefix in files:
                            files[prefix].append(file)
                        else:
                            files[prefix] = [file]
        else:
            for file in self.stdout:
                file_name = os.path.splitext(os.path.basename(file))[0]
                files[file_name] = [file]
        # Merge analyses stat
        for sample_file in files.keys():
            tot_input = 0
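
A standalone sketch of the prefix-grouping loop added in the hunk above, not part of the commit; the log names and prefixes are hypothetical, and group_prefix is assumed to iterate over prefix strings, as the keys of Utils.get_group_basenames do.

# Sketch only: bucket per-file filter reports under their shared prefix.
stdout_logs = [ "sampleA_ATGCTC_L001_R1_001.stdout",
                "sampleA_ATGCTC_L001_R1_002.stdout",
                "sampleA_ATGCTC_L001_R2_001.stdout" ]
group_prefix = [ "sampleA_ATGCTC_L001_R1", "sampleA_ATGCTC_L001_R2" ]
files = {}
for log in stdout_logs:
    for prefix in group_prefix:
        if log.startswith(prefix):
            files.setdefault(prefix, []).append(log)
# files == { 'sampleA_ATGCTC_L001_R1': ['sampleA_ATGCTC_L001_R1_001.stdout',
#                                       'sampleA_ATGCTC_L001_R1_002.stdout'],
#            'sampleA_ATGCTC_L001_R2': ['sampleA_ATGCTC_L001_R2_001.stdout'] }
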
@@ -107,10 +111,10 @@ class FastqIlluminaFilter (Analysis):
    def process(self):
        # If the file is not gzipped
        if not self.fastq_files[0].endswith(".gz"):
            fastq_illumina_filter = ShellFunction(self.get_exec_path("fastq_illumina_filter") + " --keep " + self.keep_reads + " -v -o $1 $3 > $2", cmd_format='{EXE} {OUT} {IN}')
        # If the file is gzipped
        else:
            fastq_illumina_filter = ShellFunction("zcat $3 | " + self.get_exec_path("fastq_illumina_filter") + " --keep " + self.keep_reads + " -v 2> $2 | gzip > $1", cmd_format='{EXE} {OUT} {IN}')
        fastq_illumina_filter = MultiMap(fastq_illumina_filter, inputs = self.fastq_files, outputs = [self.fastq_files_filtered,self.stdout])
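
For orientation, a sketch of roughly what the gzipped branch runs once MultiMap has substituted one input/output set, assuming $1, $2 and $3 map to the filtered output, the stdout report and the input fastq in the order given above; the paths are hypothetical.

# Sketch only: the gzipped-input command line with hypothetical paths filled in.
cmd = "zcat %s | fastq_illumina_filter --keep %s -v 2> %s | gzip > %s" % (
    "sampleA_ATGCTC_L001_R1_001.fastq.gz",           # $3 : input fastq
    "N",                                             # keep reads that pass the Illumina filter
    "sampleA_ATGCTC_L001_R1_001.stdout",             # $2 : per-file filter report
    "sampleA_ATGCTC_L001_R1_001_filtered.fastq.gz",  # $1 : filtered, re-compressed output
)
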