Commit 1c19a659 authored by Penom Nom's avatar Penom Nom
Browse files

Merge CASAVA and Illumina pipelines (part 1).

parent 95da49cf
......@@ -262,13 +262,12 @@ class Utils(object):
else:
read_2_list.append(file)
else:
sort_list = file_list
sort_list.sort()
sorted_list = sorted( file_list )
for i in range(0,len(sort_list),2):
read_1_list.append(file_list[i])
read_2_list.append(file_list[i+1])
for i in range(0,len(sorted_list),2):
read_1_list.append(sorted_list[i])
read_2_list.append(sorted_list[i+1])
return [read_1_list, read_2_list]
......@@ -328,4 +327,73 @@ class Utils(object):
if os.path.basename(file_path).startswith(current_prefix):
path_groups[current_prefix].append(file_path)
return path_groups
\ No newline at end of file
return path_groups
@staticmethod
def filesFromCasava( casava_directory, project_name, lane_number ):
    """
    Returns index description, list of R1 and list of R2 files from a CASAVA directory.
    @param casava_directory : path to CASAVA output directory (must contain SampleSheet.mk)
    @param project_name : files in returned lists are part of this project
    @param lane_number : files in returned lists are sequenced on this lane
    @return : a tuple (mids_desc_array, read1_files, read2_files) where
              mids_desc_array maps each barcode (except "NoIndex") to its sample id,
              and read1_files / read2_files are the paths of the "*.fastq.gz" files
              of the lane whose names contain "_R1_" / "_R2_"
    @raise ValueError : if SampleSheet.mk is missing or if no sample of the
                        project is declared for the lane
    """
    lane = str(lane_number)
    sample_sheet = os.path.join(casava_directory, "SampleSheet.mk")
    if not os.path.isfile(sample_sheet):
        raise ValueError("The folder " + casava_directory + " doesn't contain the file SampleSheet.mk.")

    # Retrieve all information for samples in lane. The BARCODES line creates
    # the sample entries; SAMPLEIDS and SUBDIRS lines fill them by position.
    samples = []
    with open(sample_sheet) as sheet:
        for line in sheet:
            if line.startswith("l" + lane + "_BARCODES"):
                for barcode in line.strip().split(":=")[1].split(" "):
                    samples.append({'barcode': barcode})
            elif line.startswith("l" + lane + "_SAMPLEIDS"):
                for i, sample_id in enumerate(line.strip().split(":=")[1].split(" ")):
                    samples[i]['sample_id'] = sample_id
            elif line.startswith("l" + lane + "_SUBDIRS"):
                for i, subdir in enumerate(line.strip().split(":=")[1].split(" ")):
                    samples[i]['subdir'] = subdir

    # Filter on project name. The name is escaped so that characters such as
    # "." are matched literally instead of being interpreted as regex syntax.
    project_pattern = re.compile("Project_" + re.escape(project_name) + "/Sample_.+")
    samples = [sample for sample in samples if project_pattern.match(sample['subdir']) is not None]
    if not samples:
        raise ValueError("The project " + project_name + " in lane " + lane + " doesn't exist in CASAVA directory.")

    # Create files lists. Every remaining subdir starts with "Project_", so the
    # "Undetermined_indices" folders are already excluded by the filter above.
    mids_desc_array = {}
    read1_files = []
    read2_files = []
    lane_tag = "_L00" + lane + "_"
    for current_sample in samples:
        # Write line in the index description
        if current_sample['barcode'] != "NoIndex":
            mids_desc_array[current_sample['barcode']] = current_sample['sample_id']
        # Write files lists. os.listdir() returns entries in arbitrary order:
        # sorting keeps the R1 and R2 lists aligned and the result reproducible.
        sample_dir = os.path.join(casava_directory, current_sample['subdir'])
        for file_name in sorted(os.listdir(sample_dir)):
            if not file_name.endswith(".fastq.gz") or lane_tag not in file_name:
                continue
            file_path = os.path.join(sample_dir, file_name)
            if "_R1_" in file_name:
                read1_files.append(file_path)
            if "_R2_" in file_name:
                read2_files.append(file_path)
    return mids_desc_array, read1_files, read2_files
\ No newline at end of file
......@@ -26,107 +26,60 @@ from ng6.utils import Utils
class CasavaQualityCheck (NG6Workflow):
def pre_process(self):
self.project = None
self.runobj = None
# Build the project
self.project = Project.get_from_id(self.args["project_id"])
# Build the run
self.runobj = Run(self.args["run_name"], self.args["run_date"], self.args["species"], self.args["data_nature"],
self.args["type"], self.args["run_description"], self.args["sequencer"])
# Parse the sample sheet
if not os.path.isfile(os.path.join(self.args["casava_directory"], "SampleSheet.mk")):
raise ValueError, "The folder " + self.args["casava_directory"] + " doesn't contain the file SampleSheet.mk."
barcodes = ""
sample_ids = ""
subdirs = ""
samples = []
# Retrieve all information for samples in lane
for line in open(os.path.join(self.args["casava_directory"], "SampleSheet.mk")).readlines():
# Retrieve barcodes
if line.startswith("l"+self.args["lane_number"]+"_BARCODES"):
parts = line.strip().split(":=")
barcodes_list = parts[1].split(" ")
for i in range(len(barcodes_list)):
samples.append({'barcode':barcodes_list[i]})
# Retrieve samples ids
elif line.startswith("l"+self.args["lane_number"]+"_SAMPLEIDS"):
parts = line.strip().split(":=")
sample_ids_list = parts[1].split(" ")
for i in range(len(sample_ids_list)):
samples[i]['sample_id'] = sample_ids_list[i]
# Retrieve folder
elif line.startswith("l"+self.args["lane_number"]+"_SUBDIRS"):
parts = line.strip().split(":=")
subdirs_list = parts[1].split(" ")
for i in range(len(subdirs_list)):
samples[i]['subdir'] = subdirs_list[i]
# Filter on project name
aux_samples = []
for current_sample in samples:
if re.match("Project_" + self.project.get_name() + "/Sample_.+", current_sample['subdir']) is not None:
aux_samples.append(current_sample)
samples = aux_samples
if len(samples) == 0:
raise ValueError, "The project " + self.project.get_name() + " in lane " + self.args["lane_number"] + " doesn't exist in CASVA directory."
# Create files lists
self.read1_files = []
self.read2_files = []
mids_desc_array = {}
for current_sample in samples:
if not current_sample['subdir'].startswith("Undetermined_indices"): # Skip the folder with the incorrect indexes
# Write line in the index description
if current_sample['barcode'] != "NoIndex":
mids_desc_array[current_sample['barcode']] = current_sample['sample_id']
# Write files lists
for file in os.listdir(self.args["casava_directory"] + "/" + current_sample['subdir']):
if file.endswith(".fastq.gz") and re.search(".*_L00" + self.args["lane_number"] + "_.*", file):
if re.search(".*_R1_.*", file):
self.read1_files.append(self.args["casava_directory"] + "/" + current_sample['subdir'] + "/" + file)
if re.search(".*_R2_.*", file):
self.read2_files.append(self.args["casava_directory"] + "/" + current_sample['subdir'] + "/" + file)
self.runobj.add_mids_description(mids_desc_array)
# then add the run to the project
self.project.add_run(self.runobj)
def process(self):
def process(self):
# handle if run name have spaces
run_name = "_".join(self.runobj.name.split())
# manage the sequences files
group_prefix = None
if self.args['casava_directory'] is not None :
mids_desc_array, self.read1_files, self.read2_files = Utils.filesFromCasava( self.args['casava_directory'], self.project.get_name(), self.args['lane_number'] )
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
self.runobj.add_mids_description(mids_desc_array)
elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0) :
self.read1_files = []
self.read2_files = []
for file in self.args["read_1"]:
if os.path.isfile(file):
self.read1_files.append(file)
else:
raise IOError, file + " file does not exists"
if self.args["read_2"]:
for file in self.args["read_2"]:
if os.path.isfile(file):
self.read2_files.append(file)
else:
raise IOError, file + " file does not exists"
else:
raise ValueError, "casava_directory OR read(s) must be specified"
# fastq illumina filter
group_prefix = (Utils.get_group_basenames(self.read1_files+self.read2_files, "read")).keys()
fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.read1_files+self.read2_files, self.args["keep_reads"], group_prefix, run_name+"_fastqilluminafilter.tar.gz"])
# split read 1 and read 2 from filtered files list
[filtered_read1_files, filtered_read2_files] = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, True)
filtered_read1_files.sort()
filtered_read2_files.sort()
# concatenate fastq
reads_prefixes = (Utils.get_group_basenames(filtered_read1_files+filtered_read2_files, "read")).keys()
concatenatefastq = self.add_component("ConcatenateFilesGroups", [filtered_read1_files+filtered_read2_files, reads_prefixes])
# list filtered files
if self.read2_files :
# split read 1 and read 2 from filtered files list
[filtered_read1_files, filtered_read2_files] = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, (group_prefix is not None))
else:
filtered_read1_files = fastqilluminafilter.fastq_files_filtered
filtered_read2_files = []
filtered_read1_files = sorted(filtered_read1_files)
filtered_read2_files = sorted(filtered_read2_files)
# archive the files
addrawfiles = self.add_component("AddRawFiles", [self.runobj, concatenatefastq.concat_files, self.args["compression"]])
saved_files = filtered_read1_files + filtered_read2_files
reads_prefixes = None
if group_prefix is not None :
# concatenate fastq
reads_prefixes = (Utils.get_group_basenames(saved_files, "read")).keys()
concatenatefastq = self.add_component("ConcatenateFilesGroups", [saved_files, reads_prefixes])
saved_files = concatenatefastq.concat_files
addrawfiles = self.add_component("AddRawFiles", [self.runobj, saved_files, self.args["compression"]])
# make some statistics on raw file
fastqc = self.add_component("FastQC", [fastqilluminafilter.fastq_files_filtered, True, True, run_name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
fastqc = self.add_component("FastQC", [fastqilluminafilter.fastq_files_filtered, (group_prefix is not None), True, run_name+"_fastqc.tar.gz"], parent = fastqilluminafilter)
# contamination_search
if self.args["databank"]:
......@@ -140,12 +93,14 @@ class CasavaQualityCheck (NG6Workflow):
indexed_ref = bwaindex.databank
# align reads against indexed genome
sample_lane_prefixes = (Utils.get_group_basenames(filtered_read1_files+filtered_read2_files, "lane")).keys()
sample_lane_prefixes = None
if group_prefix is not None :
sample_lane_prefixes = (Utils.get_group_basenames(filtered_read1_files+filtered_read2_files, "lane")).keys()
bwa = self.add_component("BWA", [indexed_ref, filtered_read1_files, filtered_read2_files, sample_lane_prefixes], parent = fastqilluminafilter)
# make some statistic on the alignement
alignmentstats = self.add_component("AlignmentStats", [bwa.bam_files], parent = bwa)
if len(self.read2_files) > 0:
#if len(self.read2_files) > 0:
# process insert sizes
insertssizes = self.add_component("InsertsSizes", [bwa.bam_files, self.args["histogram_width"], self.args["min_pct"], "LENIENT", "inserts_sizes.tar.gz"], parent = bwa)
\ No newline at end of file
#insertssizes = self.add_component("InsertsSizes", [bwa.bam_files, self.args["histogram_width"], self.args["min_pct"], "LENIENT", "inserts_sizes.tar.gz"], parent = bwa)
\ No newline at end of file
......@@ -34,12 +34,22 @@ description = casava 1.8 quality check pipeline
casava_directory.name = casava_directory
casava_directory.flag = --casava-directory
casava_directory.help = Where are stored casava results
casava_directory.required = True
casava_directory.required = False
lane_number.name = lane_number
lane_number.flag = --lane-number
lane_number.help = Which lane should be processed
lane_number.required = True
lane_number.required = False
read_1.name = read_1
read_1.flag = --read-1
read_1.help = Which read1 files should be used
read_1.action = append
read_2.name = read_2
read_2.flag = --read-2
read_2.help = Which read2 files should be used (if single end, leave empty)
read_2.action = append
compression.name = compression
compression.flag = --compression
......
......@@ -40,9 +40,9 @@ class FastQC (Analysis):
self.stdouts = OutputFileList(self.get_outputs('{basename_woext}.stdout', self.input_files))
self.stderrs = OutputFileList(self.get_outputs('{basename_woext}.stderr', self.input_files))
else:
group_prefix = Utils.get_group_basenames(self.input_files, "read")
self.stdouts = OutputFileList(self.get_outputs('{basename_woext}.stdout', group_prefix.keys()))
self.stderrs = OutputFileList(self.get_outputs('{basename_woext}.stderr', group_prefix.keys()))
group_prefix = Utils.get_group_basenames(self.input_files, "read").keys()
self.stdouts = OutputFileList(self.get_outputs('{basename_woext}.stdout', group_prefix))
self.stderrs = OutputFileList(self.get_outputs('{basename_woext}.stderr', group_prefix))
def define_analysis(self):
self.name = "ReadsStats"
......
......@@ -42,10 +42,10 @@ class FastqIlluminaFilter (Analysis):
self.group_prefix = group_prefix
# Outputs list if the file is not zip
if not self.fastq_files[0].endswith(".gz"):
self.fastq_files_filtered = OutputFileList(self.get_outputs('{basename_woext}.fastq', self.fastq_files), Formats.FASTQ)
self.fastq_files_filtered = OutputFileList(self.get_outputs('{basename_woext}.fastq', fastq_files), Formats.FASTQ)
# Outputs list if the file is zip
else:
self.fastq_files_filtered = OutputFileList(self.get_outputs('{basename_woext}.fastq.gz', self.fastq_files), Formats.FASTQ)
self.fastq_files_filtered = OutputFileList(self.get_outputs('{basename_woext}.fastq.gz', fastq_files), Formats.FASTQ)
def define_analysis(self):
......@@ -112,4 +112,4 @@ class FastqIlluminaFilter (Analysis):
# If the file is zip
else:
fastq_illumina_filter = ShellFunction("zcat $3 | " + self.get_exec_path("fastq_illumina_filter") + " --keep " + self.keep_reads + " -v 2> $2 | gzip > $1", cmd_format='{EXE} {OUT} {IN}')
fastq_illumina_filter = MultiMap(fastq_illumina_filter, inputs = self.fastq_files, outputs = [self.fastq_files_filtered,self.stdout])
fastq_illumina_filter = MultiMap(fastq_illumina_filter, inputs = self.fastq_files, outputs = [self.fastq_files_filtered, self.stdout])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment