__init__.py 9.85 KB
Newer Older
Jerome Mariette's avatar
Jerome Mariette committed
1
2
#
# Copyright (C) 2012 INRA
3
#
Jerome Mariette's avatar
Jerome Mariette committed
4
5
6
7
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
8
#
Jerome Mariette's avatar
Jerome Mariette committed
9
10
11
12
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
13
#
Jerome Mariette's avatar
Jerome Mariette committed
14
15
16
17
18
19
20
21
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import glob
import sys

22
from ng6.ng6workflow import NG6Workflow
23
from ng6.utils import Utils
Jerome Mariette's avatar
Jerome Mariette committed
24

Maria Bernard's avatar
Maria Bernard committed
25
class RADseq (NG6Workflow):
26

Penom Nom's avatar
Penom Nom committed
27
28
29
30
31
    ENZYMES = {
              'sbfI' : {
                        'rad'    : 'CCTGCAGG',
                        'radtag' : 'TGCAGG'
            }
32
33
34
35
36
37
38
39
              ,'ecoRI' : {
                        'rad'    : 'GAATTC',
                        'radtag' : 'AATTC'
	        }
	          ,'taqI' : {
                        'rad'    : 'TCGA',
                        'radtag' : 'GCA'
	        }
Penom Nom's avatar
Penom Nom committed
40
    }
41

42

Penom Nom's avatar
Penom Nom committed
43
    def get_enzyme (self, name):
Penom Nom's avatar
Penom Nom committed
44
45
        if name not in self.ENZYMES :
            raise ValueError("The enzyme name " + str(name) + " does not exists. Accepted names are " + str (list(self.ENZYMES.keys())))
46

Penom Nom's avatar
Penom Nom committed
47
        return (self.ENZYMES[name]['rad'], self.ENZYMES[name]['radtag'])
48

49
50
    def get_name(self):
        return 'radseq'
51

52
    def get_description(self):
Maria Bernard's avatar
Maria Bernard committed
53
        return "RADseq data quality analysis workflow"
54

55
    def define_parameters(self, function="process"):
56

57
        # if metadata.barcode faire splitbc. Sinon liste read1 = liste des échantillons.
58

Maria Bernard's avatar
Maria Bernard committed
59
        # split_bc
60
        self.add_multiple_parameter_list("pool", "Sequence pool", required = False)
61
        self.add_parameter("id", "Pool uniq identifier", add_to = "pool")
62
        # required = True que si pool précisé
63
64
        #TODO check rules
        self.add_input_file("read1", "Read 1 fastq file path", rules="RequiredIf?ALL[poll!=None]", add_to = "pool")
65
        self.add_input_file("read2", "Read 2 fastq file path", add_to = "pool")
66
67
68
69
70
        # options bougées dans process_radtag
        #self.add_parameter("enzyme", "Restriction enzyme name", required = True , choices = list(self.ENZYMES.keys()))
        #self.add_parameter("enzyme2", "Restriction enzyme 2 name", required = False, choices = list(self.ENZYMES.keys()))
        # option inutile maintenant, puisque potentiel rescue et check RAD dans process_radtag
        #self.add_parameter("tag_mismatch", "Max. number of mismatches allowed in the radTAG sequence",default = 0, type=int)
Claire Kuchly's avatar
Claire Kuchly committed
71
        self.add_parameter("mismatches", "Max. number of mismatches allowed.",default = 0, type=int)
72
        self.add_parameter("trim_barcode", "Should the barecode be trimmed",default = True, type=bool, rules="Exclude=trim_reads2")
73
        self.add_parameter("trim_reads2", " Shoud the read 2 be trimmed to have the same length as the read1",default = False, type=bool)
74

Maria Bernard's avatar
Maria Bernard committed
75
        #clone_filter
Claire Kuchly's avatar
Claire Kuchly committed
76
        self.add_parameter("dereplicate", "Should we remove PCR duplcat. Only for paired end - single RADSeq protocol",default = False, type=bool)
77

Claire Kuchly's avatar
Claire Kuchly committed
78
        # process_radtag
79
80
        self.add_parameter("enzyme", "Restriction enzyme name", required = True , choices = list(self.ENZYMES.keys()))
        self.add_parameter("enzyme2", "Restriction enzyme 2 name", required = False, default="", choices = list(self.ENZYMES.keys()))
Claire Kuchly's avatar
Claire Kuchly committed
81
82
83
        self.add_parameter("uncall_remove", "clean data, remove any read with an uncalled base.", default=True, type=bool)
        self.add_parameter("discard_low_qual", "discard reads with low quality scores.", default=True, type=bool)
        self.add_parameter("rescue_radtag", "rescue barcodes and RAD-Tags.", default=False, type=bool)
84
        self.add_parameter("max_length", "truncate final read length to this value. (default none)", required = False, type=int)
Claire Kuchly's avatar
Claire Kuchly committed
85
86
        self.add_parameter("quality_encode", "specify how quality scores are encoded, 'phred33' (Illumina 1.8+, Sanger, default) or 'phred64' (Illumina 1.3 - 1.5).", choices = ['phred33', 'phred64'], default='phred33')
        self.add_parameter("keep_discard_read", "capture discarded reads to a file.", default = True, type=bool)
87
        self.add_parameter("window_size", "set the size of the sliding window as a fraction of the read length, between 0 and 1 (default 0.15).", default=0.15, type=float)
Claire Kuchly's avatar
Claire Kuchly committed
88
        self.add_parameter("limit_score", "set the score limit. If the average score within the sliding window drops below this value, the read is discarded (default 10).", default=10, type=int)
89

Maria Bernard's avatar
Maria Bernard committed
90
        # ustacks
91
92
93
94
95
96
97
        self.add_parameter("min_cov", "stacks minimum coverage", type=int)
        self.add_parameter("primary_mismatch", "stacks max mismatch", type=int)
        self.add_parameter("secondary_mismatch", "2nd read max mismatch", type=int)
        self.add_parameter("uncall_hap", "uncall hap with 2nd read", type=bool)
        self.add_parameter("disable_removal_algo", "removal cluster correction algo", type=bool)
        self.add_parameter("disable_deleveraging_algo", "deleveraging cluster correction algo", type=bool)
        self.add_parameter("max_locus", "max stacks per cluster", type=int)
98

Maria Bernard's avatar
Maria Bernard committed
99
        # cstacks
Maria Bernard's avatar
Maria Bernard committed
100
        self.add_parameter("batch_id", " batch_id of this radseq analysis",default = 1, type=int)
101
        self.add_parameter("catalog_mismatches", "How many mismatches allowed between sample to generate the cluster catalog",default = 1, type=int)
102

Maria Bernard's avatar
Maria Bernard committed
103
        # ng6
104
105
        self.add_parameter("compression", "How should data be compressed once archived (none|gz|bz2)",default = 'gz', choices=['none', 'gz', 'bz2'])
        self.add_parameter("databank", "Which databank should be used to seek contamination (as to be nr databank)")
106
107


Jerome Mariette's avatar
Jerome Mariette committed
108
    def process(self):
109

110
111
        read1_list=[]
        read2_list= []
112

113
114
115
116
        # plus besoins on fait le check du rad dans process_radtag, sinon ça ne fonctionne pas dans le cas des double digest.
        # rad, rad_tag = self.get_enzyme(self.enzyme)
        # si aucun pool, alors données démultipléxées et donc les sample read1 sont les vraies sample read1
        if not self.pool:
117
118
119
120
121
            for s in self.samples:
                for r in s.reads1 :
                    read1_list.append(r)
                for r in s.reads2:
                    read2_list.append(r)
122
123
124
125
126
127
128
129
        # si démultiplexage alors définition de pool, et les sample read1 sont égaux au pool read1
        else:
            # group all individual by pool
            pools = {}
            for p in self.pool :
                if p["id"] in pools :
                    raise ValueError("Duplicated pool id." + p['id'])
                pools[p["id"]] = (p, [])
130

131
132
133
134
135
            for sample in self.samples :
                pool_id = sample.get_metadata('pool_id')
                if pool_id not in pools:
                    raise ValueError("The pool id " + pool_id + " does not exists in (individual " + sample.name + ")")
                pools[pool_id][1].append(sample)
136
137

            # prepare fastq files
138
139
140
            fastq_files_1 = []
            fastq_files_2 = []
            barcode_files = []
141

142
143
144
145
146
147
148
149
            for pool_id, data in pools.items() :
                p = data[0]
                samples = data[1]
                tmp_barcode = self.get_temporary_file()
                barcode_files.append(tmp_barcode)
                fastq_files_1.append(p['read1']);
                if p['read2'] :
                    fastq_files_2.append(p['read2'])
150

151
152
153
154
                # write barcode file
                with open(tmp_barcode, "w") as ff:
                    for sample in samples :
                        ff.write(sample.name + "\t" + sample.get_metadata('barcode') +"\n")
155

156
157
158
159
            #splitbc = self.add_component("Splitbc", [ fastq_files_1, barcode_files, fastq_files_2, rad, rad_tag, self.mismatches, self.tag_mismatch, self.trim_barcode, self.trim_reads2])
            splitbc = self.add_component("Splitbc", [ fastq_files_1, barcode_files, fastq_files_2, self.mismatches, self.trim_barcode, self.trim_reads2])
            read1_list=splitbc.output_read1
            read2_list=splitbc.output_read2
160

161
        # PROCESS_RADTAG
162
        print("before process",read1_list,read2_list)
Claire Kuchly's avatar
Claire Kuchly committed
163
        process_radtag = self.add_component("ProcessRadtag", [read1_list, read2_list, self.dereplicate, self.enzyme, self.enzyme2, self.window_size, self.limit_score, self.quality_encode, self.max_length, self.uncall_remove, self.discard_low_qual, self.rescue_radtag, self.keep_discard_read ])
164
"""
Claire Kuchly's avatar
Claire Kuchly committed
165

166
#         # USTACKS
Maria Bernard's avatar
Maria Bernard committed
167
168
169
170
#         # creation des listes pour conserver l'ordre entre read1 read2 et sample id
#         samples_id = []
#         read1_files = []
#         read2_files = []
171
#         # check non empty and MAJ sample_id
172
#         for idx, read1 in enumerate(process_radtag.output_read_1):
Maria Bernard's avatar
Maria Bernard committed
173
174
175
176
177
#             if os.path.getsize(read1) > 0:
#                 read1_files.append(read1)
#                 read2_files.append(process_radtag.output_read_2[idx])
#                 samples_id.append(idx)
#
178
        ustacks_opt={"samples_id":samples_id,"read1_files":read1_list}
179
180

        if self.min_cov :
181
            ustacks_opt["min_cov"] = self.min_cov
182
        if self.primary_mismatch :
183
            ustacks_opt["primary_mismatch"] = self.primary_mismatch
184
        if self.secondary_mismatch :
185
            ustacks_opt["secondary_mismatch"] =  self.secondary_mismatch
186
        if self.uncall_hap :
187
            ustacks_opt["uncall_hap"] = self.uncall_hap
188
        if self.disable_removal_algo :
189
            ustacks_opt["removal_algo"] = False
190
        if self.disable_deleveraging_algo :
191
            ustacks_opt["deleveraging_algo"] = False
192
        if self.max_locus :
193
            ustacks_opt["max_locus"] = self.max_locus
194

195
        ustacks = self.add_component("Ustacks", [],ustacks_opt)
196

Maria Bernard's avatar
Maria Bernard committed
197
        # CSTACKS
Maria Bernard's avatar
Maria Bernard committed
198
#         cstacks = self.add_component("Cstacks", [ustacks.alleles, ustacks.snps, ustacks.tags, self.batch_id, self.catalog_mismatches])
199

Claire Kuchly's avatar
Claire Kuchly committed
200
"""