__init__.py 12.8 KB
Newer Older
Penom Nom's avatar
Penom Nom committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import glob
import sys

from ng6.ng6workflow import CasavaNG6Workflow
from ng6.utils import Utils

#TODO : tested in paired-end mode with fatsq.gz files, with casava_directory and --read-1 / --read-2
#TODO : tested in paired-end mode with fatsq files (not gzipped), with --read-1 / --read-2
#TODO : not tested in single-end mode
#TODO : max insert size for bismark not yet managed
class Methylseq (CasavaNG6Workflow):
30
    BISMARK_CPU=8
Celine Noirot's avatar
debug    
Celine Noirot committed
31
    BISMARK_RAM="8G"
Penom Nom's avatar
Penom Nom committed
32
33
34
35
36
37
38
39
40
41
42
43
44
45
    def get_name(self):
        return 'methylseq'
    
    def get_description(self):
        return "illumina methylation calling pipeline"
    
    def define_parameters(self, function="process"):
        self.add_input_file("reference_genome", "Which genome should the read being align on", required = True)
        self.add_input_file("control_genome", "Control reference sequence ")
                
        self.add_parameter("histogram_width", "Explicitly sets the histogram width, overriding automatic truncation of histogram tail", type=int, default = 800, group="INSERTSIZE section")
        self.add_parameter("min_pct", "When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have"+
                           " fewer than this percentage of overall reads", type=float, default = 0.01, group="INSERTSIZE section")
        # Bisulfite parameters 
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
46
        self.add_parameter("rrbs", "Workflow for RRBS data : clean data (digested with MspI) and do not perform RMdup", type="bool", default=False)
Penom Nom's avatar
Penom Nom committed
47
48
49
50
        self.add_parameter("non_directional", "To set if the library is non directional (Default : False)", type="bool", default=False)
        self.add_parameter("alignment_mismatch", "Sets the number of mismatches to allowed in a seed alignment during multiseed alignment ", type="int", default=1)
        
        self.add_parameter("max_insert_size", "The maximum insert size for valid paired-end alignments.", type="int", default=800)
Celine Noirot's avatar
Celine Noirot committed
51
        self.add_parameter("bowtie1", "Use bowtie1  (longer, better for reads < 50bp) instead of bowtie2 - default False ", type=bool, default=False, flag="--bowtie1")
Penom Nom's avatar
Penom Nom committed
52
53
54
55
56
57
58
59
        self.add_parameter("keep_cleaned_fastq", "Keep fastq files after cleaning.", type="bool", default = False)
        
        #Extraction parameter
        self.add_parameter("methylation_extractor_no_overlap", "This option avoids scoring overlapping methylation calls twice for paired data with too short inserts.", type="bool", default = True)
        self.add_parameter("large_genome", "Set this option if reference genome have lot of scaffolds", default=False, type="bool")
        
    def process(self):

60
        fastqilluminafilter, filtered_read1_files, filtered_read2_files, concat_files, concatenatefastq = self.illumina_process()
61
     
Penom Nom's avatar
Penom Nom committed
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
        # handle if run name have spaces
        run_name = "_".join(self.runobj.name.split())
        
        #manage the max insert size parameter for bismark component and the histogram width for insertSize component 
        if not self.is_paired_end():
            max_insert_size = 0
            insert_size_min_percentage = self.min_pct
        else:
            max_insert_size = self.max_insert_size
            insert_size_min_percentage = self.min_pct
            
        try:
            int(max_insert_size)
            insert_size_histogram_width = max_insert_size
            float(insert_size_min_percentage)
        except ValueError:
            raise IOError("One of the parameter (max_insert_size, histogram_width, min_pct) is not numeric, please check your values.")

        if self.is_paired_end() and (self.group_prefix != None):
            # split read 1 and read 2 from filtered files list
            [concat_read1_files, concat_read2_files] = Utils.split_pair(concat_files, (self.group_prefix != None))
        elif self.group_prefix != None:
            concat_read1_files = concat_files
            concat_read2_files = []
        else:
            concat_read1_files = filtered_read1_files
            concat_read2_files = filtered_read2_files
        concat_read1_files = sorted(concat_read1_files)
        concat_read2_files = sorted(concat_read2_files)
        
        #cleaning raw files (quality and adapter trimming)
        trim_galore = self.add_component("TrimGalore", [ concat_read1_files, concat_read2_files, self.non_directional, self.rrbs], parent = fastqilluminafilter)
        
        # make some statistics on cleaned files
        fastq_cleaned = trim_galore.output_files_R1
        if self.is_paired_end():
            fastq_cleaned = trim_galore.output_files_R1 + trim_galore.output_files_R2
        fastqcCleaned = self.add_component("FastQC", [fastq_cleaned, (self.group_prefix is not None), True, run_name+"_trim_galore_fastqc.tar.gz"], component_prefix="trim_galore", parent = trim_galore)
100
101
102
        
        
        
Penom Nom's avatar
Penom Nom committed
103
104
105
106
        #Alignement against the reference genome
        indexed_ref = self.reference_genome
        # index the reference genome if not already indexed
        if not os.path.exists(  os.path.join(os.path.dirname(indexed_ref),"Bisulfite_Genome" )):
Celine Noirot's avatar
Celine Noirot committed
107
            bismark_genome_preparation = self.add_component("BismarkGenomePreparation", [ self.reference_genome, self.bowtie1])
Penom Nom's avatar
Penom Nom committed
108
109
110
            indexed_ref = bismark_genome_preparation.databank
        
        if self.is_paired_end() :
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
            if self.align_subset_reads:
                subset = self.add_component("SubsetSeqFiles", [trim_galore.output_files_R1, trim_galore.output_files_R2], parent = trim_galore)
#                 trim_galore.output_files_R1 = subset.subset_read1
#                 trim_galore.output_files_R2 = subset.subset_read2
            
                # align the PE reads against the reference genome with bismark
                bismarkReference = self.add_component("Bismark", [indexed_ref,subset.subset_read1, subset.subset_read2, self.samples_names,self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Library reads alignment"], component_prefix="paired",parent = subset)
                bam_for_next_step = bismarkReference.output_bam
                parent_for_next_step = bismarkReference
            else:
                # align the PE reads against the reference genome with bismark
                bismarkReference = self.add_component("Bismark", [indexed_ref,trim_galore.output_files_R1, trim_galore.output_files_R2, self.samples_names,self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Library reads alignment"], component_prefix="paired",parent = trim_galore)
                bam_for_next_step = bismarkReference.output_bam
                parent_for_next_step = bismarkReference
#             if not self.rrbs :
#                 rmDuplicate = self.add_component("RemoveDuplicate", [bismarkReference.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="paired",parent = bismarkReference)
#                 bam_for_next_step = rmDuplicate.output
#                 print("RMDUP" , bam_for_next_step)
#                 parent_for_next_step = rmDuplicate
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
130

Penom Nom's avatar
Penom Nom committed
131
            # process insert sizes of the aligned reads
Gerald Salin's avatar
Gerald Salin committed
132
            insertssizesReference = self.add_component("InsertsSizes", [bam_for_next_step, self.histogram_width, self.min_pct, "LENIENT", "inserts_sizes.tar.gz"], component_prefix="paired", parent = parent_for_next_step)
Penom Nom's avatar
Penom Nom committed
133
            # compute the methylation extraction from the alignement
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
134
            bismarkReference_extract = self.add_component("BismarkMethylationExtractor", [indexed_ref, bam_for_next_step, "paired" , self.methylation_extractor_no_overlap, self.large_genome], component_prefix="paired",parent = parent_for_next_step)
Penom Nom's avatar
Penom Nom committed
135
        else:
136
137
138
139
#             if self.align_subset_reads:
#                 subset = self.add_component("SubsetSeqFiles", [trim_galore.output_files_R1, None], parent = trim_galore)
#                 trim_galore.output_files_R1 = subset.subset_read1
                
Penom Nom's avatar
Penom Nom committed
140
            # align the SE reads against the reference genome with bismark
Celine Noirot's avatar
Celine Noirot committed
141
            bismarkReference = self.add_component("Bismark", [indexed_ref,trim_galore.output_files_R1, None, self.samples_names, self.non_directional,self.bowtie1,self.alignment_mismatch,max_insert_size,"Library reads Alignment"],parent = trim_galore)
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
142
143
            bam_for_next_step = bismarkReference.output_bam
            parent_for_next_step = bismarkReference
144
145
146
147
#             if not self.rrbs :
#                 rmDuplicate = self.add_component("RemoveDuplicate", [bismarkReference.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="paired",parent = bismarkReference)
#                 bam_for_next_step = rmDuplicate.output
#                 parent_for_next_step = rmDuplicate
Penom Nom's avatar
Penom Nom committed
148
            # compute the methylation extraction from the alignement
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
149
            bismarkReference_extract = self.add_component("BismarkMethylationExtractor", [indexed_ref, bam_for_next_step, "single" , self.methylation_extractor_no_overlap, self.large_genome], parent = parent_for_next_step)
Penom Nom's avatar
Penom Nom committed
150
151
152
153
154
155
        
        #Alignement against the control sequence if specified
        if self.control_genome:
            indexed_control = self.control_genome
            # index the control sequence if not already indexed
            if not os.path.exists(  os.path.join(os.path.dirname(indexed_control),"Bisulfite_Genome" )):
Celine Noirot's avatar
Celine Noirot committed
156
                bismark_genome_preparation_control = self.add_component("BismarkGenomePreparation", [ self.control_genome, self.bowtie1], component_prefix="control")
Penom Nom's avatar
Penom Nom committed
157
158
159
                indexed_control = bismark_genome_preparation_control.databank
            if self.is_paired_end() :
                # align the PE reads against the control sequence with bismark
Celine Noirot's avatar
Celine Noirot committed
160
                bismarkControl = self.add_component("Bismark", [indexed_control,trim_galore.output_files_R1, trim_galore.output_files_R2, self.samples_names, self.non_directional,self.bowtie1,self.alignment_mismatch, max_insert_size, "Control reads Alignment"], component_prefix="control_paired", parent = trim_galore)
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
161
162
                bam_for_next_step = bismarkControl.output_bam
                parent_for_next_step = bismarkControl
163
164
165
166
167
#                 if not self.rrbs :
#                     #remove duplicate
#                     rmDuplicate = self.add_component("RemoveDuplicate", [bismarkControl.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="control_paired",parent = bismarkControl)
#                     bam_for_next_step = rmDuplicate.output
#                     parent_for_next_step = rmDuplicate
Penom Nom's avatar
Penom Nom committed
168
                # process insert sizes of the aligned reads
Gerald Salin's avatar
Gerald Salin committed
169
                insertssizesControl = self.add_component("InsertsSizes", [bam_for_next_step, insert_size_histogram_width, insert_size_min_percentage, "LENIENT", "inserts_sizes.tar.gz"], component_prefix="control_paired", parent = parent_for_next_step)
Penom Nom's avatar
Penom Nom committed
170
                # compute the methylation extraction from the alignement
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
171
                bismarkControl_extract = self.add_component("BismarkMethylationExtractor", [indexed_control, bam_for_next_step, "paired" , self.methylation_extractor_no_overlap], component_prefix="control_paired", parent = parent_for_next_step)
Penom Nom's avatar
Penom Nom committed
172
173
            else:
                # align the SE reads against the control sequence with bismark
Celine Noirot's avatar
Celine Noirot committed
174
                bismarkControl = self.add_component("Bismark", [indexed_control,trim_galore.output_files_R1, None ,self.samples_names, self.non_directional,self.bowtie1,self.alignment_mismatch,max_insert_size,"Control reads Alignment"], component_prefix="control", parent = trim_galore)
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
175
176
                bam_for_next_step = bismarkControl.output_bam
                parent_for_next_step = bismarkControl    
177
178
179
180
181
#                 if not self.rrbs :
#                      #remove duplicate
#                     rmDuplicate = self.add_component("RemoveDuplicate", [bismarkControl.output_bam,self.is_paired_end(),Methylseq.BISMARK_RAM,Methylseq.BISMARK_CPU], component_prefix="control",parent = bismarkControl)
#                     bam_for_next_step = rmDuplicate.output    
#                     parent_for_next_step = rmDuplicate
Penom Nom's avatar
Penom Nom committed
182
                # compute the methylation extraction from the alignement
Celine Noirot's avatar
DEbug :    
Celine Noirot committed
183
                bismarkControl_extract = self.add_component("BismarkMethylationExtractor", [indexed_control, bam_for_next_step, "single" , self.methylation_extractor_no_overlap], component_prefix="control", parent = parent_for_next_step)
Penom Nom's avatar
Penom Nom committed
184