Commit 1a0a3f8c authored by ckuchly
Browse files

Add process for 10X data

parent e5178b47
......@@ -178,7 +178,7 @@ class NG6ConfigReader(object):
for barcode in barcodes:
#print("config reader")
#print(barcode[0])
logging.getLogger("ng6").debug("barcode[0] = " + barcode[0].upper())
#logging.getLogger("ng6").debug("barcode[0] = " + barcode[0].upper())
barcode_array[barcode[0].upper()] = barcode[1].upper()
return barcode_array
except :
......
......@@ -33,7 +33,7 @@ from ng6.project import Project
from ng6.run import Run
from ng6.sample import Sample
from ng6.utils import Utils
from ng6.config_reader import NG6ConfigReader
class BasicNG6Workflow (Workflow):
......@@ -108,6 +108,7 @@ class BasicNG6Workflow (Workflow):
raise ValueError("Component " + cmpt_object.__class__.__name__ + " with prefix " +
cmpt_object.prefix + " already exist in this pipeline!")
self.component_nameids[cmpt_object.get_nameid()] = None
return cmpt_object
else:
raise ImportError(component_name + " component cannot be loaded, available components are: {0}".format(
......@@ -141,6 +142,7 @@ class NG6Workflow (BasicNG6Workflow):
self.samples = []
self.reads1 = []
self.reads2 = []
self.index = []
self.samples_names = []
self.reads1_indexes = []
self.reads2_indexes = []
......@@ -167,10 +169,11 @@ class NG6Workflow (BasicNG6Workflow):
self.add_parameter_list("metadata", "Add metadata to the sample", type='samplemetadata' ,add_to = "input_sample")
self.add_input_file_list("read1", "Read 1 data file path", required = True, add_to = "input_sample")
self.add_input_file_list("read2", "Read 2 data file path", add_to = "input_sample")
self.add_input_file_list("index", "Index data file path", add_to = "input_sample")
def __create_samples__(self):
for sd in self.input_sample :
sp_object = Sample( sd['sample_id'], sd['read1'], sd['read2'], name = sd['sample_name'], description = sd['sample_description'], type = sd['type'],
sp_object = Sample( sd['sample_id'], sd['read1'], sd['read2'], sd['index'],name = sd['sample_name'], description = sd['sample_description'], type = sd['type'],
insert_size = sd['insert_size'], species = sd['species'] )
for metadata in sd['metadata'] :
......@@ -205,6 +208,9 @@ class NG6Workflow (BasicNG6Workflow):
for rfile in sample.reads2 :
self.reads2_indexes.append(sample.sample_id)
self.reads2.append(rfile)
for rfile in sample.index :
self.index.append(rfile)
if len(self.samples_names) != 0 :
if len(self.samples_names) != len (self.samples) :
......@@ -215,8 +221,9 @@ class NG6Workflow (BasicNG6Workflow):
return self.reads1
elif type == 'read2' :
return self.reads2
return self.reads1 + self.reads2
elif type == 'index' :
return self.index
return self.reads1 + self.reads2 + self.index
def get_files_index(self, type = None):
if type == 'read1' :
......@@ -225,7 +232,6 @@ class NG6Workflow (BasicNG6Workflow):
return self.reads2_indexes
return self.reads1_indexes + self.reads2_indexes
def is_paired_end(self):
    """
    Tells whether the workflow holds paired-end data
      @return: True when at least one read 2 file is registered in self.reads2
    """
    reads2_count = len(self.reads2)
    return reads2_count != 0
......@@ -288,6 +294,25 @@ def get_files_from_casava(casava_directory, project_name, lane_number):
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane) + "_.*", file):
files.append(filepath);
return files
def bcl2fastq_10X(directory, pname, lane):
    """
    Lists the fastq files of one lane of a longranger (10X) demultiplexing directory

    The sub-directories to scan are read from the l<lane>_SUBDIRS line of
    <directory>/SampleSheet_10X.mk. Only the sub-directories of the requested
    project (Project_<pname>/Sample_*) and the Undetermined_indices ones are
    scanned, and only the *.fastq.gz files whose name contains _L00<lane>_
    are returned.

      @param directory: demultiplexing directory holding SampleSheet_10X.mk
      @param pname: project name, matched literally against Project_<pname>
      @param lane: lane number
      @return: list of "<directory>/<subdir>/<file>" paths, in sample sheet
               order of the sub-directories and sorted by file name inside
               each of them
    """
    lane_key = "l" + str(lane) + "_SUBDIRS"
    subdirs_list = []
    with open(os.path.join(directory, "SampleSheet_10X.mk")) as fh:
        for line in fh:
            if line.startswith(lane_key):
                # partition() tolerates a line without ":=" (no entries) and
                # split() tolerates leading or repeated blanks
                subdirs_list = line.strip().partition(":=")[2].split()

    # the project name is escaped so that characters such as "+" or "." in it
    # are matched literally instead of being read as regex operators
    project_subdir = re.compile("Project_" + re.escape(pname) + "/Sample_.+")
    lane_tag = "_L00" + str(lane) + "_"

    files = []
    for subdir in subdirs_list:
        # filter on project name
        if not (project_subdir.match(subdir) or subdir.startswith("Undetermined_indices")):
            continue
        # plain "/" concatenation is kept on purpose: the returned strings
        # stay exactly what they were before
        subdir_path = directory + "/" + subdir
        # os.listdir() returns entries in arbitrary order, sort them to get
        # a reproducible result
        for filename in sorted(os.listdir(subdir_path)):
            if filename.endswith(".fastq.gz") and lane_tag in filename:
                files.append(subdir_path + "/" + filename)
    return files
def bcl2fastq_216(directory, pname, lane):
"""bcl2fastq >= 1.9"""
......@@ -315,6 +340,8 @@ def get_files_from_casava(casava_directory, project_name, lane_number):
return bcl2fastq_18(casava_directory, project_name, lane_number)
elif os.path.exists(os.path.join( casava_directory, 'Stats', 'DemultiplexingStats.xml')) :
return bcl2fastq_216(casava_directory, project_name, lane_number)
elif os.path.exists(os.path.join(casava_directory, "SampleSheet_10X.mk")) :
return bcl2fastq_10X(casava_directory, project_name, lane_number)
......@@ -326,8 +353,10 @@ class CasavaNG6Workflow(NG6Workflow):
self.group_prefix = None
self.undetermined_reads1 = []
self.undetermined_reads2 = []
self.undetermined_index = []
self.log_files = []
self.is_casava = False
self.is_10Xcasava = False
def __add_sample_parameters__(self):
self.add_multiple_parameter('casava', 'Provide the options to retrieve samples from a CASAVA directory', group="Sample description")
......@@ -383,9 +412,16 @@ class CasavaNG6Workflow(NG6Workflow):
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ before self._process_casava_18")
all_samples, all_samples_id = self._process_casava_18(casava_directory, project_name, lane_number, input_files)
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ before self._process_casava_18")
elif os.path.exists(os.path.join( casava_directory, 'Stats', 'DemultiplexingStats.xml')) :
all_samples, all_samples_id = self._process_casava_216(casava_directory, project_name, lane_number, input_files)
elif os.path.exists(os.path.join( casava_directory, "SampleSheet_10X.mk")):
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ before self._process_casava_10X")
all_samples, all_samples_id = self._process_casava_10X(casava_directory, project_name, lane_number, input_files)
self.is_10Xcasava = True
logging.getLogger("ng6").debug("CasavaNG6Workflow.__create_samples__ after self._process_casava_10X")
selected_samples = self.casava['select_sample_id']
logging.getLogger("CasavaNG6Workflow").debug("__create_samples__. all_samples_id = a"+", ".join(all_samples_id)+"a")
if selected_samples :
......@@ -450,7 +486,6 @@ class CasavaNG6Workflow(NG6Workflow):
# filter on project name
if re.match("Project_" + project_name + "/Sample_.+", sample['subdir']) or sample['subdir'].startswith("Undetermined_indices"):
for file in os.listdir(casava_directory + "/" + sample['subdir']):
filepath = casava_directory + "/" + sample['subdir'] + "/" + file
if file.endswith(".fastq.gz") and re.search(".*_L00" + str(lane_number) + "_.*", file):
......@@ -477,7 +512,6 @@ class CasavaNG6Workflow(NG6Workflow):
all_samples.append(sp_object)
all_samples_id.append(sample['sample_id'])
for file in os.listdir(casava_directory):
filepath = casava_directory + "/" + file
if file.endswith(".log"):
......@@ -485,9 +519,104 @@ class CasavaNG6Workflow(NG6Workflow):
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 self.log_files = " + ",".join(self.log_files))
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 all_samples_id = " + ",".join(all_samples_id))
logging.getLogger("ng6").debug("CasavaNG6Workflow._process_casava_18 exiting")
return all_samples, all_samples_id
def _process_casava_10X(self, casava_directory, project_name, lane_number, input_files):
    """
    Creates samples from casavadir from longranger (10X) demultiplexing

    Reads the l<lane_number>_BARCODES, l<lane_number>_SAMPLEIDS and
    l<lane_number>_SUBDIRS lines of SampleSheet_10X.mk, then attaches to each
    sample of the project the R1, R2 and I1 fastq files of its sub-directory
    that are listed in input_files. Files found under Undetermined_indices
    are stored in self.undetermined_reads1, self.undetermined_reads2 and
    self.undetermined_index instead of a sample. The *.log files of the
    directory are appended to self.log_files.

      @param casava_directory: directory holding SampleSheet_10X.mk and the
                               sample sub-directories
      @param project_name: only the Project_<project_name>/Sample_*
                           sub-directories produce samples (matched literally)
      @param lane_number: lane to process
      @param input_files: list of fastq paths to dispatch; every path that is
                          matched is removed from this list in place
      @return: tuple (all_samples, all_samples_id), the created Sample objects
               and their sample ids, in sample sheet order
      @raise AssertionError: when the three lane lines of the sample sheet do
                             not hold the same number of entries
    """
    logger = logging.getLogger("ng6")
    logger.debug("CasavaNG6Workflow._process_casava_10X enter")
    logger.debug("CasavaNG6Workflow._process_casava_10X casava_directory = " + casava_directory + ", project_name = " + str(project_name))

    all_samples = []
    all_samples_id = []

    # open casava samplesheet again to associate our files with a sample
    lane_prefix = "l" + str(lane_number)
    barcodes_list = []
    sample_ids_list = []
    subdirs_list = []
    with open(os.path.join(casava_directory, "SampleSheet_10X.mk")) as fh:
        for line in fh:
            # the three lists are split the same way (any run of blanks), so a
            # leading or doubled blank cannot make their lengths diverge
            values = line.strip().partition(":=")[2].split()
            if line.startswith(lane_prefix + "_BARCODES"):
                barcodes_list = [x.replace("_", "") for x in values]
            elif line.startswith(lane_prefix + "_SAMPLEIDS"):
                sample_ids_list = values
            elif line.startswith(lane_prefix + "_SUBDIRS"):
                subdirs_list = values

    # raised explicitly (same exception type as the former assert statement)
    # so that the check is still performed when python runs with -O
    if not (len(barcodes_list) == len(sample_ids_list) == len(subdirs_list)):
        raise AssertionError("Invalid lane {0} in SampleSheet_10X.mk".format(lane_number))

    indexs = NG6ConfigReader().get_10X_indexs()

    # the project name is escaped so that regex metacharacters in it are
    # matched literally
    project_subdir = re.compile("Project_" + re.escape(project_name) + "/Sample_.+")
    lane_tag = "_L00" + str(lane_number) + "_"

    # parse samples
    for barcode_name, sample_id, subdir in zip(barcodes_list, sample_ids_list, subdirs_list):
        is_undetermined = subdir.startswith("Undetermined_indices")
        # filter on project name
        if not (is_undetermined or project_subdir.match(subdir)):
            continue

        reads1, reads2, index = [], [], []
        if is_undetermined:
            r1_target = self.undetermined_reads1
            r2_target = self.undetermined_reads2
            i1_target = self.undetermined_index
        else:
            r1_target, r2_target, i1_target = reads1, reads2, index

        # plain "/" concatenation is kept on purpose: the paths are compared
        # by equality with the entries of input_files
        subdir_path = casava_directory + "/" + subdir
        # sorted so that the chunks of R1, R2 and I1 are stored in the same,
        # reproducible order (os.listdir() order is arbitrary)
        for filename in sorted(os.listdir(subdir_path)):
            if not (filename.endswith(".fastq.gz") and lane_tag in filename):
                continue
            filepath = subdir_path + "/" + filename
            try:
                # a file is dispatched once: drop it from the remaining inputs
                input_files.remove(filepath)
            except ValueError:
                # not one of the requested input files
                continue
            if "_R1_" in filename:
                r1_target.append(filepath)
            if "_R2_" in filename:
                r2_target.append(filepath)
            if "_I1_" in filename:
                if not is_undetermined:
                    logger.debug("CasavaNG6Workflow.__process_casava_10X__index_files = " + filepath)
                i1_target.append(filepath)

        if not is_undetermined:
            # the index is only resolved for the samples actually created; the
            # lookup fails here if the sample sheet barcode name is not known
            # to the configuration
            if barcode_name == 'Undetermined':
                barcode = 'Undetermined'
            else:
                barcode = indexs[barcode_name]
            sp_object = Sample(barcode, reads1, reads2 = reads2, index = index, name=sample_id)
            sp_object.add_metadata('barcode', barcode)
            sp_object.add_metadata('is_casava', True)
            all_samples.append(sp_object)
            all_samples_id.append(sample_id)

    for filename in sorted(os.listdir(casava_directory)):
        if filename.endswith(".log"):
            self.log_files.append(casava_directory + "/" + filename)

    logger.debug("CasavaNG6Workflow._process_casava_10X all_samples_id = " + ",".join(all_samples_id))
    logger.debug("CasavaNG6Workflow._process_casava_10X exiting")
    return all_samples, all_samples_id
def _process_casava_216(self,casava_directory, project_name, lane_number, input_files):
"""
Creates samples from casavadir (>=1.9) using input files
......@@ -509,12 +638,21 @@ class CasavaNG6Workflow(NG6Workflow):
logging.getLogger("ng6").debug("illumina_process self.is_casava")
if len(self.log_files) > 0 :
add_log = self.add_component("BasicAnalysis", [self.log_files,"Log Files","Log files generated during primary analysis","-","-","-","gz", "","log.gz"])
if len(self.undetermined_reads1) > 0 :
if self.casava['mismatch_index'] :
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1, self.get_files_index('read1')])
elif self.is_10Xcasava :
logging.getLogger("ng6").debug("illumina_process self.is_10Xcasava = ")
logging.getLogger("ng6").debug(self.get_all_reads("read1"))
logging.getLogger("ng6").debug("illumina_process undetermined reads = " )
logging.getLogger("ng6").debug(self.undetermined_reads1)
logging.getLogger("ng6").debug("illumina_process file index =")
logging.getLogger("ng6").debug(self.get_files_index("read1"))
#demultiplex_stats = self.add_component("Demultiplex10XStats", [self.get_all_reads("read1"), self.undetermined_reads1, self.get_files_index("read1")])
else :
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1])
demultiplex_stats = self.add_component("DemultiplexStats", [self.get_all_reads("read1"), self.undetermined_reads1])
if self.keep_reads != "all" :
logging.getLogger("ng6").debug("illumina_process self.keep_reads != all")
......@@ -568,9 +706,9 @@ class CasavaNG6Workflow(NG6Workflow):
except : pass
# contamination_search
if contam :
if self.contamination_databank: contam.extend(self.contamination_databank)
contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files, contam, list((Utils.get_group_basenames(filtered_read1_files, "read")).keys())], parent = fastqilluminafilter)
#if contam :
# if self.contamination_databank: contam.extend(self.contamination_databank)
# contamination_search = self.add_component("ContaminationSearch", [filtered_read1_files+filtered_read2_files, contam, reads_prefixes], parent = fastqilluminafilter)
# make some statistics on raw file
fastqc = self.add_component("FastQC", [filtered_read1_files+filtered_read2_files, (self.group_prefix is not None), self.no_group, "fastqc.tar.gz"], parent = fastqilluminafilter)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment