fastqilluminafilter.py 5.76 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

Penom Nom's avatar
Penom Nom committed
18
import re, os
19 20
import logging

21 22 23
from subprocess import Popen, PIPE

from ng6.analysis import Analysis
24
from ng6.utils import Utils
25 26 27

class FastqIlluminaFilter (Analysis):
    
28 29
    def define_parameters(self, runobj, fastq_files, keep_reads="pass_illumina_filters", group_prefix=None):
        """
        Declare the inputs, parameters and outputs of the analysis
          @param runobj       : the run object, stored as self.runobj
          @param fastq_files  : the fastq files to filter
          @param keep_reads   : one of "pass_illumina_filters", "not_pass_illumina_filters" or "all"
          @param group_prefix : optional prefixes, declared as the "group_prefix" parameter list
        """
        self.runobj = runobj

        # Inputs and parameters: the attributes read below (self.fastq_files,
        # self.keep_reads) are expected to be set by these declarations.
        self.add_input_file_list("fastq_files", "fastq_files", default=fastq_files,
                                 required=True, file_format='fastq')
        self.add_parameter("keep_reads", "keep_reads", default=keep_reads,
                           choices=["pass_illumina_filters", "not_pass_illumina_filters", "all"])
        self.add_parameter_list("group_prefix", "group_prefix", default=group_prefix)

        # The extension of the filtered files follows the first input file only.
        first_is_gzipped = self.fastq_files[0].endswith(".gz")
        output_ext = '.fastq.gz' if first_is_gzipped else '.fastq'
        self.add_output_file_list("fastq_files_filtered", "fastq_files_filtered",
                                  pattern='{basename_woext}' + output_ext,
                                  items=self.fastq_files, file_format='fastq')

        # Value later appended to "--keep": "N" only for the default
        # "pass_illumina_filters" choice, "Y" for every other choice.
        self.keep_option = "N" if self.keep_reads == "pass_illumina_filters" else "Y"

        # One stdout file per input file.
        self.add_output_file_list("stdout", "stdout", pattern='{basename_woext}.stdout',
                                  items=self.fastq_files)
43 44
        
    def define_analysis(self):
        """
        Set the name, description, software and options of the analysis
        """
        fixed_attributes = (
            ("name", "IlluminaFilter"),
            ("description", "Filters FASTQ file generated by CASAVA 2.20"),
            ("software", "fastq_illumina_filter"),
        )
        for attribute, value in fixed_attributes:
            setattr(self, attribute, value)

        # Options string built from the keep_option attribute of the instance.
        self.options = " ".join(("--keep", self.keep_option, "-v"))

50 51
        
    def post_process(self):
        """
        Sum the read counts of the stat files and store them as results
          When group_prefix is set, the stat files (self.stdout) are grouped with
          Utils.get_filepath_by_prefix; otherwise each stat file is its own sample,
          named after its basename without extension.
          For each sample, the "input" and "output" totals are stored as strings
          with _add_result_element.
        """
        log = logging.getLogger("FastqIlluminaFilter")
        log.debug("post_process entering")

        # Create dictionary : key = file name or prefix, value = files path
        if self.group_prefix:
            log.debug("post_process self.group_prefix is true")
            stat_files_by_sample = Utils.get_filepath_by_prefix( self.stdout, self.group_prefix )
        else:
            log.debug("post_process self.group_prefix is false")
            stat_files_by_sample = {}
            for stat_path in self.stdout:
                log.debug("post_process self.group_prefix is false, work on " + stat_path)
                sample_name = os.path.splitext(os.path.basename(stat_path))[0]
                stat_files_by_sample[sample_name] = [stat_path]

        # Merge analyses stat
        for sample_name in list(stat_files_by_sample.keys()):
            log.debug("post_process, work on " + sample_name)
            total_in = 0
            total_out = 0
            for stat_path in stat_files_by_sample[sample_name]:
                log.debug("post_process, work on " + stat_path)
                reads_in, reads_out = self.__parse_stat_file(stat_path)
                total_in += int(reads_in)
                total_out += int(reads_out)

            self._add_result_element(sample_name, "input", str(total_in))
            self._add_result_element(sample_name, "output", str(total_out))
    
    def get_version(self):
        """
        Return the version read from the "-help" output of fastq_illumina_filter
          @return : the third whitespace-separated token written on stdout,
                    without its last character
        """
        executable = self.get_exec_path("fastq_illumina_filter")
        help_process = Popen([executable, "-help"], stdout=PIPE, stderr=PIPE)
        # stderr is captured as well but only stdout is used.
        help_text = help_process.communicate()[0]
        version_token = help_text.split()[2]
        return version_token[:-1]
        
    
    def __parse_stat_file (self, stat_file):
87
        logging.getLogger("FastqIlluminaFilter").debug("__parse_stat_file, entering")
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
        """
        Parse the stat file
          @param stat_file : the fastq_illumina_filter summary file
          @return             : {"before" : "nb_seq", ...}
        """
        input = 0
        output = 0
        
        for line in open(stat_file, "r").readlines():
            line = line.strip()
            input_reg = re.search("Input: (.*) reads", line)
            output_reg = re.search("Output: (.*) reads \(.*\)", line)
            if input_reg:
                input = input_reg.group(1).replace(",", "")
            if output_reg:
                output = output_reg.group(1).replace(",", "")
        
105
        logging.getLogger("FastqIlluminaFilter").debug("__parse_stat_file, returning")
106 107 108 109
        return [input, output]
        
                     
    def process(self):
        """
        Register the shell command that filters the fastq files
          The command is chosen from the extension of the first input file:
          gzipped inputs are read with zcat and the filtered reads are gzipped again,
          other inputs are given directly to fastq_illumina_filter.
          Presumably $1 and $2 are the outputs (filtered file, stdout file) and
          $3 the input file, following cmd_format - to be confirmed against
          add_shell_execution.
        """
        if self.fastq_files[0].endswith(".gz"):
            # If the file is zip
            command = "".join([
                "zcat $3 | ",
                self.get_exec_path("fastq_illumina_filter"),
                " --keep ",
                self.keep_option,
                " -v 2> $2 | ",
                self.get_exec_path("gzip"),
                " > $1",
            ])
        else:
            # If the file is not zip
            log = logging.getLogger("FastqIlluminaFilter")
            log.debug("process self.fastq_files = " + ",".join(self.fastq_files))
            log.debug("process self.fastq_files_filtered = " + ",".join(self.fastq_files_filtered))
            command = "".join([
                self.get_exec_path("fastq_illumina_filter"),
                " --keep ",
                self.keep_option,
                " -v -o $1 $3 > $2",
            ])

        self.add_shell_execution(command,
                                 cmd_format='{EXE} {OUT} {IN}', map=True,
                                 inputs=self.fastq_files,
                                 outputs=[self.fastq_files_filtered, self.stdout])