fastqilluminafilter.py 5.76 KB
Newer Older
Penom Nom's avatar
Penom Nom committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

Penom Nom's avatar
Penom Nom committed
18
import re, os
19 20
import logging

Penom Nom's avatar
Penom Nom committed
21 22 23
from subprocess import Popen, PIPE

from ng6.analysis import Analysis
24
from ng6.utils import Utils
Penom Nom's avatar
Penom Nom committed
25 26 27

class FastqIlluminaFilter(Analysis):
    """
    Analysis component that runs fastq_illumina_filter on a list of FASTQ
    files and records, per sample, the read counts reported by the tool.
    """

    def define_parameters(self, runobj, fastq_files, keep_reads="pass_illumina_filters", group_prefix=None):
        """
        Declare the inputs, parameters and outputs of the component.
          @param runobj       : object stored as self.runobj
          @param fastq_files  : the FASTQ files to filter
          @param keep_reads   : one of "pass_illumina_filters",
                                "not_pass_illumina_filters" or "all"
          @param group_prefix : optional prefixes used in post_process to group
                                the per-file reports
        """
        self.runobj = runobj
        self.add_input_file_list("fastq_files", "fastq_files", default=fastq_files, required=True, file_format='fastq')
        self.add_parameter("keep_reads", "keep_reads", default=keep_reads,
                           choices=["pass_illumina_filters", "not_pass_illumina_filters", "all"])
        self.add_parameter_list("group_prefix", "group_prefix", default=group_prefix)

        # The first input file decides the extension of every filtered file.
        if self.fastq_files[0].endswith(".gz"):
            output_ext = '.fastq.gz'
        else:
            output_ext = '.fastq'
        self.add_output_file_list("fastq_files_filtered", "fastq_files_filtered",
                                  pattern='{basename_woext}' + output_ext,
                                  items=self.fastq_files, file_format='fastq')

        # Value handed to the tool's --keep option.
        # NOTE(review): "all" ends up with the same "Y" as
        # "not_pass_illumina_filters" -- confirm this is what the tool should
        # receive when every read has to be kept.
        if self.keep_reads == "pass_illumina_filters":
            self.keep_option = "N"
        else:
            self.keep_option = "Y"

        # One report file per input; it receives the verbose output of the tool.
        self.add_output_file_list("stdout", "stdout", pattern='{basename_woext}.stdout', items=self.fastq_files)

    def define_analysis(self):
        """
        Set the descriptive fields of the analysis (name, description,
        software and the options string).
        """
        self.name = "IlluminaFilter"
        self.description = "Filters FASTQ file generated by CASAVA 2.20"
        self.software = "fastq_illumina_filter"
        self.options = "--keep " + self.keep_option + " -v"

    def post_process(self):
        """
        Sum the input and output read counts of the report files of each
        sample and store both totals as result elements ("input", "output").
        Reports are grouped with Utils.get_filepath_by_prefix when
        group_prefix is set; otherwise each report is its own sample, named
        after the report file without its extension.
        """
        logger = logging.getLogger("FastqIlluminaFilter")
        logger.debug("post_process entering")

        # Build the mapping : key = file name or prefix, value = report paths
        if self.group_prefix:
            logger.debug("post_process self.group_prefix is true")
            reports_by_sample = Utils.get_filepath_by_prefix(self.stdout, self.group_prefix)
        else:
            logger.debug("post_process self.group_prefix is false")
            reports_by_sample = {}
            for report in self.stdout:
                logger.debug("post_process self.group_prefix is false, work on %s", report)
                sample_name = os.path.splitext(os.path.basename(report))[0]
                reports_by_sample[sample_name] = [report]

        # Merge the statistics of all the reports of a sample
        for sample_file, reports in reports_by_sample.items():
            logger.debug("post_process, work on %s", sample_file)
            tot_input = 0
            tot_output = 0
            for report in reports:
                logger.debug("post_process, work on %s", report)
                nb_input, nb_output = self.__parse_stat_file(report)
                tot_input += int(nb_input)
                tot_output += int(nb_output)

            self._add_result_element(sample_file, "input", str(tot_input))
            self._add_result_element(sample_file, "output", str(tot_output))

    def get_version(self):
        """
        Return the version of fastq_illumina_filter.
          @return : the third whitespace-separated token printed on the
                    standard output by "fastq_illumina_filter -help", without
                    its last character
        """
        cmd = [self.get_exec_path("fastq_illumina_filter"), "-help"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, _ = p.communicate()
        # NOTE(review): the pipe is not opened in text mode, so on Python 3 the
        # value returned here is a bytes object -- confirm callers expect that.
        return stdout.split()[2][:-1]

    def __parse_stat_file(self, stat_file):
        """
        Parse the stat file
          @param stat_file : the fastq_illumina_filter summary file
          @return          : [input, output], the values captured from the
                             "Input: ... reads" and "Output: ... reads (...)"
                             lines with the commas removed (strings); a value
                             whose line is not found is returned as 0
        """
        logger = logging.getLogger("FastqIlluminaFilter")
        logger.debug("__parse_stat_file, entering")

        nb_input = 0
        nb_output = 0

        # The with block closes the report even if parsing raises.
        with open(stat_file, "r") as handle:
            for line in handle:
                line = line.strip()
                input_reg = re.search(r"Input: (.*) reads", line)
                output_reg = re.search(r"Output: (.*) reads \(.*\)", line)
                # When a pattern matches several lines the last match wins.
                if input_reg:
                    nb_input = input_reg.group(1).replace(",", "")
                if output_reg:
                    nb_output = output_reg.group(1).replace(",", "")

        logger.debug("__parse_stat_file, returning")
        return [nb_input, nb_output]

    def process(self):
        """
        Register one shell execution per input file that runs
        fastq_illumina_filter. Inputs whose first file ends with ".gz" are
        streamed through zcat and the result is re-compressed with gzip;
        otherwise the tool reads and writes the files directly.
        """
        logger = logging.getLogger("FastqIlluminaFilter")
        compressed = self.fastq_files[0].endswith(".gz")

        if not compressed:
            logger.debug("process self.fastq_files = " + ",".join(self.fastq_files))
            logger.debug("process self.fastq_files_filtered = " + ",".join(self.fastq_files_filtered))

        filter_cmd = self.get_exec_path("fastq_illumina_filter") + " --keep " + self.keep_option + " -v"

        # Given cmd_format '{EXE} {OUT} {IN}' and the outputs order below,
        # presumably $1 = filtered file, $2 = report, $3 = input file
        # -- TODO confirm against add_shell_execution.
        if compressed:
            # If the file is zip : the report is taken from stderr
            command = "zcat $3 | " + filter_cmd + " 2> $2 | " + self.get_exec_path("gzip") + " > $1"
        else:
            # If the file is not zip : the report is taken from stdout
            command = filter_cmd + " -o $1 $3 > $2"

        self.add_shell_execution(command,
                                 cmd_format='{EXE} {OUT} {IN}', map=True,
                                 inputs=self.fastq_files,
                                 outputs=[self.fastq_files_filtered, self.stdout])