Commit 58b2cef3 authored by Gerald Salin's avatar Gerald Salin

add concatenatefastq component in the return statement of illumina_process
parent 592b26d2
......@@ -384,6 +384,7 @@ class CasavaNG6Workflow(NG6Workflow):
all_samples, all_samples_id = self._process_casava_216(casava_directory, project_name, lane_number, input_files)
selected_samples = self.casava['select_sample_id']
logging.getLogger("CasavaNG6Workflow").debug("__create_samples__. all_samples_id = a"+", ".join(all_samples_id)+"a")
if selected_samples :
for sid in selected_samples :
assert sid in all_samples_id , "The sample id %s is not in the SampleSheet.mk" % sid
......@@ -417,6 +418,7 @@ class CasavaNG6Workflow(NG6Workflow):
# open casava samplesheet again to associate our files with a sample
with open(os.path.join(casava_directory, "SampleSheet.mk")) as fh :
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 SampleSheet.mk exists")
barcodes_list = []
sample_ids_list = []
subdirs_list = []
......@@ -432,7 +434,7 @@ class CasavaNG6Workflow(NG6Workflow):
subdirs_list = parts[1].split(" ")
assert len(barcodes_list) == len(sample_ids_list) == len(subdirs_list), "Invalid lane {0} in SampleSheet.mk".format(lane_number)
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 SampleSheet.mk parsed")
# parse samples
for i in range(len(barcodes_list)):
sample = {
......@@ -442,9 +444,10 @@ class CasavaNG6Workflow(NG6Workflow):
'reads1' : [],
'reads2' : []
}
# filter on project name
if re.match("Project_" + project_name + "/Sample_.+", sample['subdir']) or sample['subdir'].startswith("Undetermined_indices"):
for file in os.listdir(casava_directory + "/" + sample['subdir']):
filepath = casava_directory + "/" + sample['subdir'] + "/" + file
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane_number) + "_.*", file):
......@@ -464,12 +467,14 @@ class CasavaNG6Workflow(NG6Workflow):
break
if not sample['subdir'].startswith("Undetermined_indices") :
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 create sample " + sample['sample_id'])
sp_object = Sample(sample['barcode'], sample['reads1'], reads2 = sample['reads2'], name=sample['sample_id'])
sp_object.add_metadata('barcode', sample['barcode'])
sp_object.add_metadata('is_casava', True)
all_samples.append(sp_object)
all_samples_id.append(sample['sample_id'])
for file in os.listdir(casava_directory):
filepath = casava_directory + "/" + file
if file.endswith(".log"):
......@@ -491,6 +496,7 @@ class CasavaNG6Workflow(NG6Workflow):
def illumina_process(self):
fastqilluminafilter = None
concatenatefastq = None
filtered_read1_files = []
filtered_read2_files = []
saved_files = []
......@@ -558,10 +564,10 @@ class CasavaNG6Workflow(NG6Workflow):
except : pass
# contamination_search
#if contam :
# if self.contamination_databank: contam.extend(self.contamination_databank)
# contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files+filtered_read2_files, contam, reads_prefixes], parent = fastqilluminafilter)
if contam :
if self.contamination_databank: contam.extend(self.contamination_databank)
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files+filtered_read2_files, contam, reads_prefixes], parent = fastqilluminafilter)
# make some statistics on raw file
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (self.group_prefix is not None), self.no_group, "fastqc.tar.gz"], parent = fastqilluminafilter)
return fastqilluminafilter, filtered_read1_files, filtered_read2_files, saved_files
return fastqilluminafilter, filtered_read1_files, filtered_read2_files, saved_files, concatenatefastq
......@@ -18,6 +18,7 @@
import os
import sys
import re
import logging
from ng6.ng6workflow import CasavaNG6Workflow
from ng6.utils import Utils
......@@ -40,15 +41,13 @@ class IlluminaDiversityQC (CasavaNG6Workflow):
self.add_parameter("max_overlap", "Maximum overlap length expected in approximately 90 percent of read pairs.", default = 55, type = int, group="JOIN section")
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
concatenate1 = self.add_component("ConcatenateFilesGroups", [filtered_read1_files, list((Utils.get_group_basenames(self.get_all_reads("read1"), "read")).keys())],component_prefix="read1")
concatenate2 = self.add_component("ConcatenateFilesGroups", [filtered_read2_files, list((Utils.get_group_basenames(self.get_all_reads("read2"), "read")).keys())],component_prefix="read2")
concatenate1.concat_files = sorted(concatenate1.concat_files)
concatenate2.concat_files = sorted(concatenate2.concat_files)
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files, concatenatefastq = self.illumina_process()
logging.getLogger("IlluminaDiversityQC").debug("process Utils.concat_files = " + ",".join(concat_files))
logging.getLogger("IlluminaDiversityQC").debug("process concatenatefastq.r1_files = " + ",".join(concatenatefastq.r1_files))
logging.getLogger("IlluminaDiversityQC").debug("process concatenatefastq.r2_files = " + ",".join(concatenatefastq.r2_files))
# merge overlapping pair
join_pairs = self.add_component("Flash", [concatenate1.concat_files, concatenate2.concat_files, self.mismatch_ratio, self.min_overlap, self.max_overlap], parent=fastqilluminafilter)
join_pairs = self.add_component("Flash", [concatenatefastq.r1_files, concatenatefastq.r2_files, self.mismatch_ratio, self.min_overlap, self.max_overlap], parent=fastqilluminafilter)
if self.assignation_databank != None:
# subset assignation
......
......@@ -39,11 +39,11 @@ class IlluminaMatePair (CasavaNG6Workflow):
" fewer than this percentage of overall reads", type=float, default = 0.01, group="INSERTSIZE section")
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files, concatenatefastq = self.illumina_process()
# mate_pair analyse
concatenate1 = self.add_component("ConcatenateFilesGroups", [filtered_read1_files, list((Utils.get_group_basenames(self.get_all_reads('read1'), "read")).keys())],component_prefix="read1")
concatenate2 = self.add_component("ConcatenateFilesGroups", [filtered_read2_files, list((Utils.get_group_basenames(self.get_all_reads('read2'), "read")).keys())],component_prefix="read2")
concatenate1 = self.add_component("ConcatenateFilesGroups", [self.runobj,filtered_read1_files, list((Utils.get_group_basenames(self.get_all_reads('read1'), "read")).keys())],component_prefix="read1")
concatenate2 = self.add_component("ConcatenateFilesGroups", [self.runobj,filtered_read2_files, list((Utils.get_group_basenames(self.get_all_reads('read2'), "read")).keys())],component_prefix="read2")
concatenate1.concat_files = sorted(concatenate1.concat_files)
concatenate2.concat_files = sorted(concatenate2.concat_files)
......
......@@ -42,7 +42,7 @@ class IlluminaQualityCheck (CasavaNG6Workflow):
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files, concatenatefastq = self.illumina_process()
if self.reference_genome:
# index the reference genome if not already indexed
......
......@@ -40,7 +40,7 @@ class RnaSeqQualityCheck (CasavaNG6Workflow):
self.add_input_file_list("annotation", "Which annotation file should be used for processing RNAseq quality")
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files, concatenatefastq = self.illumina_process()
logging.getLogger("RnaSeqQualityCheck").debug("process. filtered_read1_files = "+",".join(filtered_read1_files))
logging.getLogger("RnaSeqQualityCheck").debug("process. filtered_read2_files = "+",".join(filtered_read2_files))
logging.getLogger("RnaSeqQualityCheck").debug("process. concat_files = "+",".join(concat_files))
......
......@@ -57,7 +57,7 @@ class Methylseq (CasavaNG6Workflow):
def process(self):
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files = self.illumina_process()
fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files, concatenatefastq = self.illumina_process()
# handle if run name have spaces
run_name = "_".join(self.runobj.name.split())
......
......@@ -46,7 +46,7 @@ class QIlluminaFilter (Analysis):
def process(self):
if self.input_fastqs[0].endswith(".gz") :
fastq_illumina_filter = ShellFunction("gunzip -c $1 | " + self.get_exec_path("fastq_illumina_filter") + " -v --keep " + self.keep + " 2> $3 | gzip > $2", cmd_format='{EXE} {IN} {OUT}')
fastq_illumina_filter = ShellFunction("gunzip -c $1 | " + self.get_exec_path("fastq_illumina_filter") + " -v --keep " + self.keep + " 2> $3 | "+self.get_exec_path("gzip")+" > $2", cmd_format='{EXE} {IN} {OUT}')
else :
fastq_illumina_filter = ShellFunction( self.get_exec_path("fastq_illumina_filter") + " --keep " + self.keep + " -v -o $2 > $3 $1", cmd_format='{EXE} {IN} {OUT}')
MultiMap(fastq_illumina_filter, inputs = self.input_fastqs, outputs = [self.fastq_files_filtered, self.stdout])
......
......@@ -350,12 +350,12 @@ class ProcessRadtag (Analysis):
self.add_python_execution(recover_mate_discards, cmd_format="{EXE} {IN} {OUT}", inputs = [tmp_discard_read_1, self.read2_files], outputs=[tmp_discard_read_2], map=True)
# Compress
self.add_shell_execution("gzip -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}",inputs = tmp_output_read_1, outputs = self.out_process_read1_files, map=True )
self.add_shell_execution("gzip -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}", inputs = tmp_discard_read_1, outputs = self.discard_process_read1_files, map=True )
self.add_shell_execution(self.get_exec_path("gzip") + " -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}",inputs = tmp_output_read_1, outputs = self.out_process_read1_files, map=True )
self.add_shell_execution(self.get_exec_path("gzip") + " -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}", inputs = tmp_discard_read_1, outputs = self.discard_process_read1_files, map=True )
if (self.read2_files):
self.add_shell_execution("gzip -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}",inputs = tmp_output_read_2, outputs = self.out_process_read2_files, map=True )
self.add_shell_execution("gzip -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}",inputs = tmp_discard_read_2, outputs = self.discard_process_read2_files, map=True )
self.add_shell_execution(self.get_exec_path("gzip") + " -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}",inputs = tmp_output_read_2, outputs = self.out_process_read2_files, map=True )
self.add_shell_execution(self.get_exec_path("gzip") + " -c $1 > $2 ", cmd_format="{EXE} {IN} {OUT}",inputs = tmp_discard_read_2, outputs = self.discard_process_read2_files, map=True )
# clone_filter
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment