__init__.py 5.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
import sys
import re

from ng6.ng6workflow import NG6Workflow
from ng6.project import Project
from ng6.run import Run
from ng6.utils import Utils


class IlluminaDiversityQC (NG6Workflow):
    """NG6 workflow chaining the quality-control steps of an Illumina diversity run.

    The input reads come either from a CASAVA directory (``casava_directory``
    together with ``lane_number``) or from explicit ``read_1`` / ``read_2``
    file lists; see ``process`` for the components that are registered.
    """

    def process(self):
        """Register the workflow components through ``add_component``.

        Steps, in order:
          1. collect the read 1 / read 2 files (CASAVA directory or explicit lists);
          2. add "DemultiplexStats" when CASAVA reports undetermined read 1 files;
          3. add "FastqIlluminaFilter" unless ``keep_reads`` is "all";
          4. add "ConcatenateFilesGroups" (CASAVA input only) and "AddRawFiles";
          5. add "FastQC";
          6. add "ContaminationSearch" when at least one databank is available;
          7. add "Flash" to merge overlapping pairs;
          8. add "SubsetAssignation" when ``assignation_databank`` is given.

        Raises:
            ValueError: if ``casava_directory`` is given without ``lane_number``,
                or if neither ``casava_directory`` nor ``read_1`` is given.
            IOError: if a ``read_1`` / ``read_2`` path is not an existing file,
                or if ``read_1`` is given without ``read_2``.
        """

        def existing_files(paths):
            # Return the paths as a new list, refusing any path that is not a file.
            checked = []
            for path in paths:
                if not os.path.isfile(path):
                    raise IOError(path + " file does not exists.")
                checked.append(path)
            return checked

        # A run name may contain whitespace: collapse every run of it into "_".
        run_name = "_".join(self.runobj.name.split())

        # Stays None unless the reads come from a CASAVA directory; the later
        # steps use "group_prefix is not None" to tell the two input modes apart.
        group_prefix = None

        if self.args['casava_directory'] is not None:
            if self.args['lane_number'] is None:
                raise ValueError("lane-number must be specified with casava-directory.")
            mids_desc_array, self.read1_files, self.read2_files, undetermined_read1_files, undetermined_read2_files = Utils.filesFromCasava(self.args['casava_directory'], self.project.get_name(), self.args['lane_number'])
            # list(...) keeps a real list on Python 3, where keys() is only a view.
            group_prefix = list(Utils.get_group_basenames(self.read1_files + self.read2_files, "read").keys())
            self.runobj.add_mids_description(mids_desc_array)
            # statistics about demultiplexing
            if len(undetermined_read1_files) > 0:
                self.add_component("DemultiplexStats", [self.read1_files, undetermined_read1_files])
        elif (self.args['read_1'] is not None) and (len(self.args['read_1']) > 0):
            self.read2_files = []
            # Read 1 paths are validated before the presence of read 2 is checked,
            # so a missing read 1 file is reported first.
            self.read1_files = existing_files(self.args["read_1"])
            if not self.args["read_2"]:
                raise IOError("read-2 must be specified with read-1.")
            self.read2_files = existing_files(self.args["read_2"])
        else:
            raise ValueError("[casava-directory and lane-number] OR [read-1 and read-2] must be specified.")

        if self.args["keep_reads"] != "all":
            # fastq illumina filter
            fastqilluminafilter = self.add_component("FastqIlluminaFilter", [self.read1_files + self.read2_files, self.args["keep_reads"], group_prefix, run_name + "_fastqilluminafilter.tar.gz"])

            # list filtered files
            filtered_read1_files, filtered_read2_files = Utils.split_pair(fastqilluminafilter.fastq_files_filtered, (group_prefix is not None))
            filtered_read1_files = sorted(filtered_read1_files)
            filtered_read2_files = sorted(filtered_read2_files)
        else:
            # No filtering requested: downstream components work on the input
            # files directly and get no parent component.
            fastqilluminafilter = None
            filtered_read1_files = self.read1_files
            filtered_read2_files = self.read2_files

        # archive the files
        saved_files = filtered_read1_files + filtered_read2_files
        reads_prefixes = None
        if group_prefix is not None:
            # concatenate fastq
            reads_prefixes = list(Utils.get_group_basenames(saved_files, "read").keys())
            concatenatefastq = self.add_component("ConcatenateFilesGroups", [saved_files, reads_prefixes])
            saved_files = concatenatefastq.concat_files
        self.add_component("AddRawFiles", [self.runobj, saved_files, self.args["compression"]])

        # make some statistics on raw file
        self.add_component("FastQC", [filtered_read1_files + filtered_read2_files, (group_prefix is not None), True, run_name + "_fastqc.tar.gz"], parent = fastqilluminafilter)

        # contamination_search
        # Best effort: the phix / ecoli / yeast banks are appended to the user
        # databanks; on any failure the databank list is left as it was.
        # NOTE(review): presumably this fails when no databank was given or a
        # resource is not configured - confirm. Narrowed from a bare "except:"
        # so KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            self.args["databank"].extend([self.get_resource("phix_bwa"), self.get_resource("ecoli_bwa"), self.get_resource("yeast_bwa")])
        except Exception:
            pass
        if self.args["databank"]:
            self.add_component("ContaminationSearch", [filtered_read1_files + filtered_read2_files, self.args["databank"], reads_prefixes], parent = fastqilluminafilter)

        # merge overlapping pair
        join_pairs = self.add_component("Flash", [filtered_read1_files, filtered_read2_files, self.args["mismatch_ratio"], self.args["min_overlap"], self.args["max_overlap"], self.args["avg_reads_length"], self.args["avg_fragment_length"]], parent=fastqilluminafilter)

        if self.args["assignation_databank"] is not None:
            # subset assignation
            self.add_component("SubsetAssignation", [join_pairs.extended_frags, self.args["assignation_databank"]], parent=join_pairs)