Commit f459dde2 authored by Gerald Salin's avatar Gerald Salin
Browse files

#56 remove rmdup component, not needed for quality control

parent de6e7c09
......@@ -97,7 +97,9 @@ class Methylseq (CasavaNG6Workflow):
if self.is_paired_end():
fastq_cleaned = trim_galore.output_files_R1 + trim_galore.output_files_R2
fastqcCleaned = self.add_component("FastQC", [fastq_cleaned, (self.group_prefix is not None), True, run_name+"_trim_galore_fastqc.tar.gz"], component_prefix="trim_galore", parent = trim_galore)
#Alignement against the reference genome
indexed_ref = self.reference_genome
# index the reference genome if not already indexed
......@@ -106,29 +108,43 @@ class Methylseq (CasavaNG6Workflow):
indexed_ref = bismark_genome_preparation.databank
if self.is_paired_end() :
# align the PE reads against the reference genome with bismark
bismarkReference = self.add_component("Bismark", [indexed_ref,trim_galore.output_files_R1, trim_galore.output_files_R2, self.samples_names,self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Library reads alignment"], component_prefix="paired",parent = trim_galore)
bam_for_next_step = bismarkReference.output_bam
parent_for_next_step = bismarkReference
if not self.rrbs :
rmDuplicate = self.add_component("RemoveDuplicate", [bismarkReference.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="paired",parent = bismarkReference)
bam_for_next_step = rmDuplicate.output
print("RMDUP" , bam_for_next_step)
parent_for_next_step = rmDuplicate
if self.align_subset_reads:
subset = self.add_component("SubsetSeqFiles", [trim_galore.output_files_R1, trim_galore.output_files_R2], parent = trim_galore)
# trim_galore.output_files_R1 = subset.subset_read1
# trim_galore.output_files_R2 = subset.subset_read2
# align the PE reads against the reference genome with bismark
bismarkReference = self.add_component("Bismark", [indexed_ref,subset.subset_read1, subset.subset_read2, self.samples_names,self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Library reads alignment"], component_prefix="paired",parent = subset)
bam_for_next_step = bismarkReference.output_bam
parent_for_next_step = bismarkReference
else:
# align the PE reads against the reference genome with bismark
bismarkReference = self.add_component("Bismark", [indexed_ref,trim_galore.output_files_R1, trim_galore.output_files_R2, self.samples_names,self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Library reads alignment"], component_prefix="paired",parent = trim_galore)
bam_for_next_step = bismarkReference.output_bam
parent_for_next_step = bismarkReference
# if not self.rrbs :
# rmDuplicate = self.add_component("RemoveDuplicate", [bismarkReference.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="paired",parent = bismarkReference)
# bam_for_next_step = rmDuplicate.output
# print("RMDUP" , bam_for_next_step)
# parent_for_next_step = rmDuplicate
# process insert sizes of the aligned reads
insertssizesReference = self.add_component("InsertsSizes", [bam_for_next_step, insert_size_histogram_width, insert_size_min_percentage, "LENIENT", "inserts_sizes.tar.gz"], component_prefix="paired", parent = parent_for_next_step)
#insertssizesReference = self.add_component("InsertsSizes", [bam_for_next_step, self.histogram_width, self.min_pct, "LENIENT", "inserts_sizes.tar.gz"], component_prefix="paired", parent = parent_for_next_step)
# compute the methylation extraction from the alignement
bismarkReference_extract = self.add_component("BismarkMethylationExtractor", [indexed_ref, bam_for_next_step, "paired" , self.methylation_extractor_no_overlap, self.large_genome], component_prefix="paired",parent = parent_for_next_step)
else:
# if self.align_subset_reads:
# subset = self.add_component("SubsetSeqFiles", [trim_galore.output_files_R1, None], parent = trim_galore)
# trim_galore.output_files_R1 = subset.subset_read1
# align the SE reads against the reference genome with bismark
bismarkReference = self.add_component("Bismark", [indexed_ref,trim_galore.output_files_R1, None, self.samples_names, self.non_directional,self.bowtie1,self.alignment_mismatch,max_insert_size,"Library reads Alignment"],parent = trim_galore)
bam_for_next_step = bismarkReference.output_bam
parent_for_next_step = bismarkReference
if not self.rrbs :
rmDuplicate = self.add_component("RemoveDuplicate", [bismarkReference.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="paired",parent = bismarkReference)
bam_for_next_step = rmDuplicate.output
parent_for_next_step = rmDuplicate
# if not self.rrbs :
# rmDuplicate = self.add_component("RemoveDuplicate", [bismarkReference.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="paired",parent = bismarkReference)
# bam_for_next_step = rmDuplicate.output
# parent_for_next_step = rmDuplicate
# compute the methylation extraction from the alignement
bismarkReference_extract = self.add_component("BismarkMethylationExtractor", [indexed_ref, bam_for_next_step, "single" , self.methylation_extractor_no_overlap, self.large_genome], parent = parent_for_next_step)
......@@ -142,16 +158,15 @@ class Methylseq (CasavaNG6Workflow):
if self.is_paired_end() :
# align the PE reads against the control sequence with bismark
bismarkControl = self.add_component("Bismark", [indexed_control,trim_galore.output_files_R1, trim_galore.output_files_R2, self.samples_names, self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Control reads Alignment"], component_prefix="control_paired", parent = trim_galore)
bam_for_next_step = bismarkControl.output_bam
parent_for_next_step = bismarkControl
if not self.rrbs :
#remove duplicate
rmDuplicate = self.add_component("RemoveDuplicate", [bismarkControl.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="control_paired",parent = bismarkControl)
bam_for_next_step = rmDuplicate.output
parent_for_next_step = rmDuplicate
# if not self.rrbs :
# #remove duplicate
# rmDuplicate = self.add_component("RemoveDuplicate", [bismarkControl.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="control_paired",parent = bismarkControl)
# bam_for_next_step = rmDuplicate.output
# parent_for_next_step = rmDuplicate
# process insert sizes of the aligned reads
insertssizesControl = self.add_component("InsertsSizes", [bam_for_next_step, insert_size_histogram_width, insert_size_min_percentage, "LENIENT", "inserts_sizes.tar.gz"], component_prefix="control_paired", parent = parent_for_next_step)
#insertssizesControl = self.add_component("InsertsSizes", [bam_for_next_step, insert_size_histogram_width, insert_size_min_percentage, "LENIENT", "inserts_sizes.tar.gz"], component_prefix="control_paired", parent = parent_for_next_step)
# compute the methylation extraction from the alignement
bismarkControl_extract = self.add_component("BismarkMethylationExtractor", [indexed_control, bam_for_next_step, "paired" , self.methylation_extractor_no_overlap], component_prefix="control_paired", parent = parent_for_next_step)
else:
......@@ -159,11 +174,11 @@ class Methylseq (CasavaNG6Workflow):
bismarkControl = self.add_component("Bismark", [indexed_control,trim_galore.output_files_R1, None ,self.samples_names, self.non_directional,self.bowtie1,self.alignment_mismatch,max_insert_size,"Control reads Alignment"], component_prefix="control", parent = trim_galore)
bam_for_next_step = bismarkControl.output_bam
parent_for_next_step = bismarkControl
if not self.rrbs :
#remove duplicate
rmDuplicate = self.add_component("RemoveDuplicate", [bismarkControl.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="control",parent = bismarkControl)
bam_for_next_step = rmDuplicate.output
parent_for_next_step = rmDuplicate
# if not self.rrbs :
# #remove duplicate
# rmDuplicate = self.add_component("RemoveDuplicate", [bismarkControl.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="control",parent = bismarkControl)
# bam_for_next_step = rmDuplicate.output
# parent_for_next_step = rmDuplicate
# compute the methylation extraction from the alignement
bismarkControl_extract = self.add_component("BismarkMethylationExtractor", [indexed_control, bam_for_next_step, "single" , self.methylation_extractor_no_overlap], component_prefix="control", parent = parent_for_next_step)
......@@ -134,6 +134,7 @@ class Bismark (Analysis):
def post_process(self):
self._save_files(self.output_bam)
results_files = self.output_report
logging.getLogger("ng6").debug("bismark.post_process self.output_report = " + ",".join(self.output_report))
# Finally create and add the archive to the analysis
for file in self.output_report:
# faire un test si contient _trimmed prendre ce qu'il y a avant comme nom d'echantillon.
......@@ -184,7 +185,9 @@ class Bismark (Analysis):
@return : {"nbseq" : x, ...}
"""
stats = {}
logging.getLogger("ng6").debug("bismark.__parse_report_file report_file = " + report_file)
for line in open(report_file, 'r').readlines():
logging.getLogger("ng6").debug("bismark.__parse_report_file line = " + line + ", line.strip = " + ",".join(line.strip().split()))
#if paired
if line.startswith("Sequence pairs analysed in total:"):
stats["totalPairs"] = line.strip().split()[-1]
......@@ -230,6 +233,7 @@ class Bismark (Analysis):
stats["CinUnknown"] = 'NA'
if line.startswith("Total number of C's analysed:"):
stats["totalC"] = line.strip().split()[-1]
logging.getLogger("ng6").debug("bismark.__parse_report_file totalC line.strip = " + line.strip().split()[-1])
if line.startswith("Total methylated C's in CpG context:"):
stats["totalmCinCpG"] = line.strip().split()[-1]
if line.startswith("Total methylated C's in CHG context:"):
......
......@@ -60,12 +60,12 @@ class BismarkMethylationExtractor (Analysis):
def post_process(self):
logging.getLogger("ng6").debug("BismarkMethylationExtractor.post_process " +", ".join(self.output_directories))
for directory in self.output_directories :
methylationExtractorResultFiles = []
sample = os.path.basename(directory)
for file in os.listdir(directory):
logging.getLogger("ng6").debug("BismarkMethylationExtractor.file " +file + " in " + directory)
if file.endswith("M-bias.txt"):
graph=self.__parse_mbias_file(os.path.join(directory,file))
for read in list(graph.keys()):
......@@ -75,7 +75,8 @@ class BismarkMethylationExtractor (Analysis):
self._add_result_element(sample, "unmethylated", ",".join(graph[read][context]["Cunmeth"]), read+"_"+context )
if file.endswith("bam_splitting_report.txt"):
if file.endswith("splitting_report.txt"):
logging.getLogger("ng6").debug("BismarkMethylationExtractor. file " +file + " is splitting_report.txt")
report_info = self.__parse_report_file(os.path.join(directory, file))
self._add_result_element(sample, "totalC", str(report_info["totalC"]))
self._add_result_element(sample, "totalMethylatedCinCpG", str(report_info["totalMethylatedCinCpG"]))
......@@ -91,6 +92,7 @@ class BismarkMethylationExtractor (Analysis):
self._archive_files(methylationExtractorResultFiles,self.compression,os.path.basename(directory)+".tar")
def __parse_report_file (self, report_file):
logging.getLogger("ng6").debug("BismarkMethylationExtractor.__parse_report_file " +report_file)
"""
Parse the data file
@param data_file : the fastqc data file
......@@ -98,8 +100,10 @@ class BismarkMethylationExtractor (Analysis):
"""
stats = {}
for line in open(report_file, 'r').readlines():
logging.getLogger("ng6").debug("BismarkMethylationExtractor.__parse_report_file line" +line)
if line.startswith("Total number of C's analysed:"):
stats["totalC"] = line.strip().split()[-1]
logging.getLogger("ng6").debug("BismarkMethylationExtractor.__parse_report_file totalC = " +stats["totalC"])
if line.startswith("Total methylated C's in CpG context:"):
stats["totalMethylatedCinCpG"] = line.strip().split()[-1]
if line.startswith("Total methylated C's in CHG context:"):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment