#
# Copyright (C) 2015 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

from jflow.workflow import Workflow

import os
import sys
from io import StringIO
from configparser import ConfigParser

from jflow import seqio

class VariantCalling (Workflow):
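    """
    Variant calling workflow: aligns the sequencing libraries against a
    reference with BWA, pre-processes the alignments with GATK, then calls
    and filters variants with GATK HaplotypeCaller, with optional two-step
    calling and joint calling over all samples.
    """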

    def get_description(self):
        return "Variation calling"

    def define_parameters(self, function="process"):
        self.add_input_file("reference", "The reference used to align reads.", required=True)
        # Libraries
        self.add_parameter("is_rna", "The sequences in the BAM are RNA (adds '-recoverDanglingHeads -dontUseSoftClippedBases' to the calling options). Keep in mind to change the minimum confidence and quality when you use RNA-Seq data.", default=False, type="bool" )
        self.add_multiple_parameter_list("library", "Sequencing library to process.", required=True)
        self.add_input_file("R1", "Path to R1 file.", required=True, add_to="library")
        self.add_input_file("R2", "Path to R2 file.", add_to="library")
        self.add_parameter("sample", "Sample name.", add_to="library")
        self.add_parameter("sequencer", "The sequencer type.", choices=["HiSeq2000", "ILLUMINA", "SLX", "SOLEXA", "SOLID", "454", "COMPLETE", "PACBIO", "IONTORRENT", "CAPILLARY", "HELICOS", "UNKNOWN"], default="HiSeq2000", add_to="library")
        self.add_input_file("sequenced_regions", "BED file describing which regions are sequenced. This information is used to reduce the analysis regions.")
        # GATK parameters
        self.add_parameter( "global_calling", "The variant discovery is led with all samples together and not sample by sample.", default=False, type="bool" )
        self.add_parameter( "min_call_confidence", "The minimum confidence needed for a given base for it to be used in variant calling.", default=30, type="float" )
        self.add_parameter( "min_emit_confidence", "This argument allows you to emit low quality calls as filtered records.", default=10, type="float" )
        self.add_parameter( "min_qual_score", "The minimum base quality required to consider a base for calling.", default=10, type="int" )
        self.add_parameter("two_steps_calling", "The SNP calling is realised in two steps. The first step (recalibration, calling, filtering) uses hard filters. The second step (recalibration, calling, filtering) uses standard filters, and the variants detected in the first step are used as a database of known polymorphic sites.",
                           display_name="Two steps for SNP calling", type="bool", default=False, group="SNP Calling")
        self.add_input_file("known_snp", "Known SNPs for the reference.", group="SNP Calling")
        self.add_exclusion_rule( "known_snp", "two_steps_calling" )

    def update_sequenced_regions(self):
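        """
        When no sequenced regions are provided and global calling is
        requested, write every sequence name of the reference into a
        temporary '.intervals' file so that the whole reference is used as
        the analysis region.
        """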
        if self.sequenced_regions is None and self.global_calling:
            self.sequenced_regions = self.get_temporary_file(".intervals")
            FH_sequenced_regions = open(self.sequenced_regions, "w")
            reader = seqio.FastaReader(self.reference)
            for seq_id, desc, seq, qualities in reader:
                FH_sequenced_regions.write( seq_id + "\n" )
            FH_sequenced_regions.close()

    def process(self):
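        """
        Build the workflow: index the reference if needed, standardise the
        quality offset of the FASTQ files, align them with BWA (mem), index
        the BAM files with samtools, apply the GATK pre-processing, merge
        the BAM files belonging to the same sample, then run recalibration,
        HaplotypeCaller and variant filtering (in one or two calling steps).
        """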
        JAVA_HUGE_MEM = 2  # memory setting passed to the Java-based GATK components
        self.update_sequenced_regions()
        samples = dict()
        lib_paired_idx = 0
        lib_single_idx = 0
        single_fastq, pair1_fastq, pair2_fastq = [], [], []
        single_librairies_sequencers, pair_librairies_sequencers = [], []
        for lib_arg in self.library:
            if lib_arg["sample"] is not None and lib_arg["sample"] not in samples:
                samples[lib_arg["sample"]] = { "paired": list(), "single": list() }
            if lib_arg["R2"] is not None and lib_arg["R2"] != "":
                if lib_arg["sample"] is not None:
                    samples[lib_arg["sample"]]["paired"].append( lib_paired_idx )
                pair1_fastq.append(lib_arg["R1"])
                pair2_fastq.append(lib_arg["R2"])
                if lib_arg["sequencer"].lower().startswith("hiseq") or lib_arg["sequencer"].lower().startswith("miseq"):
                    pair_librairies_sequencers.append( "ILLUMINA" )
                else:
                    pair_librairies_sequencers.append( lib_arg["sequencer"] )
                lib_paired_idx += 1
            else:
                if lib_arg["sample"] is not None:
                    samples[lib_arg["sample"]]["single"].append( lib_single_idx )
                single_fastq.append(lib_arg["R1"])
                if lib_arg["sequencer"].lower().startswith("hiseq") or lib_arg["sequencer"].lower().startswith("miseq"):
                    single_librairies_sequencers.append( "ILLUMINA" )
                else:
                    single_librairies_sequencers.append( lib_arg["sequencer"] )
                lib_single_idx += 1

        # Index
        # ##############################
        databank_fai = self.reference
        databank_bwa = self.reference
        if not os.path.exists(self.reference + ".fai") or not os.path.exists(os.path.splitext(self.reference)[0] + ".dict"):
            index_fai_dict = self.add_component( "IndexFaiDict", [self.reference] )
            databank_fai = index_fai_dict.databank
        if not os.path.exists(self.reference + ".bwt"):
            index_bwa = self.add_component( "BWAIndex", [self.reference] )
            databank_bwa = index_bwa.databank


        # Change quality offset
        # ##############################
        # Paired-end reads
        pair1_fastq_qual = pair1_fastq
        pair2_fastq_qual = pair2_fastq
        if len(pair1_fastq) > 0:
            change_offset_paired = self.add_component( "ChangeQualOffset", [pair1_fastq, pair2_fastq], component_prefix="paired" )
            pair1_fastq_qual = change_offset_paired.R1_out_files
            pair2_fastq_qual = change_offset_paired.R2_out_files

        # Single reads
        single_fastq_qual = single_fastq
        if len(single_fastq) > 0:
            change_offset_single = self.add_component( "ChangeQualOffset", [single_fastq], component_prefix="single" )
            single_fastq_qual = change_offset_single.R1_out_files


        # Alignment
        # ##############################
        aln_algorithm = "mem"
        all_bams = []
        all_bais = []

        # Paired-end reads
        if len(pair2_fastq_qual) > 0 and len(pair1_fastq_qual) > 0:
            align_paired     = self.add_component( "BWA", [databank_bwa, pair1_fastq_qual, pair2_fastq_qual, None, aln_algorithm], component_prefix="paired" )
            index_bam_paired = self.add_component( "SamtoolsIndex", [align_paired.bam_files, 2, 2], component_prefix="paired" )
            all_bams = all_bams + index_bam_paired.sorted_bams
            all_bais = all_bais + index_bam_paired.output_files

        # Single reads
        if len(single_fastq) > 0:
            align_single     = self.add_component( "BWA", [databank_bwa, single_fastq_qual, None, None, aln_algorithm], component_prefix="single" )
            index_bam_single = self.add_component( "SamtoolsIndex", [align_single.bam_files, 2, 2], component_prefix="single" )
            all_bams = all_bams + index_bam_single.sorted_bams
            all_bais = all_bais + index_bam_single.output_files


        # Variant calling and annotation
        # ##############################
        # Pre-process
        spliced_aln = (aln_algorithm == "mem")
        if len(single_fastq) > 0:
            variant_preprocess_single = self.add_component("VariantPreprocess", [databank_fai, index_bam_single.sorted_bams, single_librairies_sequencers, False, 30, JAVA_HUGE_MEM], component_prefix="single" )
            gatk_preprocess_single    = self.add_component("GatkPreprocess", [variant_preprocess_single.output_files, variant_preprocess_single.index_files, databank_fai, JAVA_HUGE_MEM, spliced_aln, None, self.sequenced_regions], component_prefix="single" )
        if len(pair2_fastq) > 0:
            variant_preprocess_paired = self.add_component("VariantPreprocess", [databank_fai, index_bam_paired.sorted_bams, pair_librairies_sequencers, True, 30, JAVA_HUGE_MEM], component_prefix="paired" )
            gatk_preprocess_paired    = self.add_component("GatkPreprocess", [variant_preprocess_paired.output_files, variant_preprocess_paired.index_files, databank_fai, JAVA_HUGE_MEM, spliced_aln, None, self.sequenced_regions], component_prefix="paired" )

        # Merge BAMs belonging to the same sample
        gatk_preprocess_bams = list()
        processed_idx_paired = dict()
        processed_idx_single = dict()
        for sample_name in list(samples.keys()):
            current_sample = samples[sample_name]
            bams = list()
            for idx_bam in current_sample['paired']:
                bams.append( gatk_preprocess_paired.output_files[idx_bam] )
                processed_idx_paired[idx_bam] = True
            for idx_bam in current_sample['single']:
                bams.append( gatk_preprocess_single.output_files[idx_bam] )
                processed_idx_single[idx_bam] = True
            merged_bam = self.add_component("SamtoolsMerge", [bams, sample_name], component_prefix=sample_name )
            gatk_preprocess_bams.append( merged_bam.merged_bam )
        if len(pair2_fastq) > 0:
            for idx, bam_path in enumerate(gatk_preprocess_paired.output_files):
                if idx not in processed_idx_paired:
                    gatk_preprocess_bams.append( bam_path )
        if len(single_fastq) > 0:
            for idx, bam_path in enumerate(gatk_preprocess_single.output_files):
                if idx not in processed_idx_single:
                    gatk_preprocess_bams.append( bam_path )

        # Variant calling
        pre_vcf_file = self.known_snp
        if self.two_steps_calling: # known_snp cannot be provided when two_steps_calling is set (exclusion rule)
            # SNP calling with hard parameters to produce a database of safe polymorphic sites
            gatk_recalibration_pre_step    = self.add_component("GatkRecalibration", [gatk_preprocess_bams, databank_fai, None, self.sequenced_regions, JAVA_HUGE_MEM], component_prefix="pre_step")
            gatk_haplotype_caller_pre_step = self.add_component("GatkHaplotypeCaller", [gatk_recalibration_pre_step.output_files, databank_fai, self.min_call_confidence+10, self.min_emit_confidence+10, self.min_qual_score, self.is_rna, self.sequenced_regions, JAVA_HUGE_MEM, self.global_calling], component_prefix="pre_step")
            gatk_filter_pre_step           = self.add_component("GatkVariantFilter", [gatk_haplotype_caller_pre_step.output_file, databank_fai,
                                                                                      '-window 35 -cluster 3 --filterExpression "QD < 2.0 || FS > 100.0 || MQ < 40.0 || HaplotypeScore > 13.0 || MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0" --filterName "hard_filter"',
                                                                                      '-window 35 -cluster 3 --filterExpression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" --filterName "hard_filter"',
                                                                                      JAVA_HUGE_MEM],
                                                                component_prefix="pre_step")
            pre_vcf_file = gatk_filter_pre_step.output_file
        # SNP calling with standard parameters to produce the final variants
        gatk_recalibration    = self.add_component("GatkRecalibration", [gatk_preprocess_bams, databank_fai, pre_vcf_file, self.sequenced_regions, JAVA_HUGE_MEM], component_prefix="final")
        gatk_haplotype_caller = self.add_component("GatkHaplotypeCaller", [gatk_recalibration.output_files, databank_fai, self.min_call_confidence, self.min_emit_confidence, self.min_qual_score, self.is_rna, self.sequenced_regions, JAVA_HUGE_MEM, self.global_calling], component_prefix="final")
        gatk_filter           = self.add_component("GatkVariantFilter", [gatk_haplotype_caller.output_file, databank_fai,
                                                                         '-window 18 -cluster 3 -filterName FS --filterExpression "FS > 30.0"',
                                                                         '-window 18 -cluster 3 -filterName FS --filterExpression "FS > 30.0"',
                                                                         JAVA_HUGE_MEM],
                                                   component_prefix="final")

    @staticmethod
    def config_parser(arg_lines):
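        """
        Turn the lines of an INI-like configuration file into the command
        line argument list expected by this workflow.

        Illustrative sketch (section and option names come from the parsing
        below, paths and values are hypothetical):

            [General]
            reference = /path/to/genome.fa
            min_call_confidence = 30

            [Libraries]
            lib1.r1 = /path/to/lib1_R1.fastq.gz
            lib1.r2 = /path/to/lib1_R2.fastq.gz
            lib1.sample = sampleA

            [SNP calling]
            two_steps_calling = true

        would be converted into:

            ['--reference', '/path/to/genome.fa',
             '--min-call-confidence', '30',
             '--library', 'R1=/path/to/lib1_R1.fastq.gz',
             'R2=/path/to/lib1_R2.fastq.gz', 'sample=sampleA',
             '--two-steps-calling']
        """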
        config = ConfigParser()
        config.read_file(StringIO('\n'.join(arg_lines)))
        arguments = []

        section = 'General'
        if config.has_section(section):
            if config.has_option(section, 'reference'):
                arguments.extend( [ '--reference', config.get(section, 'reference') ] )
            if config.has_option(section, 'global_calling') and config.getboolean(section, 'global_calling'):
                arguments.extend( [ '--global-calling' ] )
            if config.has_option(section, 'is_rna') and config.getboolean(section, 'is_rna'):
                arguments.extend( [ '--is-rna' ] )
            if config.has_option(section, 'min_call_confidence'):
                arguments.extend( [ '--min-call-confidence', config.get(section, 'min_call_confidence') ] )
            if config.has_option(section, 'min_emit_confidence'):
                arguments.extend( [ '--min-emit-confidence', config.get(section, 'min_emit_confidence') ] )
            if config.has_option(section, 'min_qual_score'):
                arguments.extend( [ '--min-qual-score', config.get(section, 'min_qual_score') ] )

        section = 'Libraries'
        if config.has_section(section):
            libraries = dict()
            items = config.items(section)
            for tag, value in items:
                lib_id, param = tag.split('.')
                if lib_id not in libraries:
                    libraries[lib_id] = list()
                if param == 'r1':
                    libraries[lib_id].append( 'R1=' + value )
                if param == 'r2':
                    libraries[lib_id].append( 'R2=' + value )
                if param == 'sample':
                    libraries[lib_id].append( 'sample=' + value )
                if param == 'sequencer':
                    libraries[lib_id].append( 'sequencer=' + value )
                if param == 'sequenced_regions':
                    libraries[lib_id].append( 'sequenced-regions=' + value )
            for current_lib in libraries:
                arguments.append('--library')
                for param in libraries[current_lib]:
                    arguments.append( param )

        section = 'SNP calling'
        if config.has_section(section):
            if config.has_option(section, 'known_snp'):
                arguments.extend( [ '--known-snp', config.get(section, 'known_snp') ] )
            if config.has_option(section, 'two_steps_calling') and config.getboolean(section, 'two_steps_calling'):
                arguments.extend( [ '--two-steps-calling' ] )

        return arguments