uclust.py 11.8 KB
Newer Older
Penom Nom's avatar
Penom Nom committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
from subprocess import Popen, PIPE

from workflows.gene_diversity.lib.biomstat import *

from ng6.analysis import Analysis

from jflow.iotypes import OutputFileList, InputFileList, Formats
from jflow.abstraction import MultiMap

from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map


def to_biom( output_biom, clusters_file, precluster_sample_sep, precluster_size_sep ):
    """
    @summary : Write a biom file from cdhit results.
    @param output_biom : [str] path to the output file.
    @param clusters_file : [str] path to the '.clstr' file.
    @param precluster_size_sep : [str] used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit). The number of sequences
                                 represented by each pre-cluster can be added to the end of its ID. In this place the number is separated by the character 
                                 precluster_size_sep.
                                 Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
    @param precluster_size_sep : [str] used if sequences provided to Cdhit come from differents samples ("none" otherwise). The sample name is stored in each
                                 sequence id. It is separated by the character precluster_sample_sep and it is placed before the pre-cluster size information.
                                 Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                           OR
                                           sequence ID = 'seq10001|lake_spot_1'.
    """
    from workflows.gene_diversity.lib.Biom import Biom, BiomIO
    
    samples_seen = dict()
    biom = Biom( generated_by='cdhit', matrix_type="sparse" )

    # Process count
    cluster_name = None
    for line in open(clusters_file).readlines():
        # New cluster
        if line.startswith(">"): # Line example : '>Cluster 0' => 'Cluster_0'
            if cluster_name is not None:
                # Add previous cluster on biom
                biom.add_observation( cluster_name )
                for sample_name in cluster_count:
                    if not samples_seen.has_key( sample_name ):
                        biom.add_sample( sample_name )
                        samples_seen[sample_name] = 1
                    biom.add_count( cluster_name, sample_name, cluster_count[sample_name] )
            # Start new cluster
            cluster_name = line.strip()[1:].replace(" ", "_")
            cluster_count = dict()
        # Additional information on current cluster
        else: # Line example : '60    130aa, >c6104.0;1... at 99.23%'
            pre_cluster_id = line.strip().split()[2][1:-3] # c6104.0;1
            # Count
            pre_cluster_count = 1
            if precluster_size_sep != "none":
                pre_cluster_count = int( pre_cluster_id.split(precluster_size_sep)[-1] )
                pre_cluster_id = precluster_size_sep.join( pre_cluster_id.split(precluster_size_sep)[0:-1] ) # trim cluster size from ID
            # Sample
            pre_cluster_sample = "all"
            if precluster_sample_sep != "none":
                pre_cluster_sample = pre_cluster_id.split(precluster_sample_sep)[-1]
            # Store
            if cluster_count.has_key(pre_cluster_sample):
                cluster_count[pre_cluster_sample] += pre_cluster_count
            else:
                cluster_count[pre_cluster_sample] = pre_cluster_count
    if cluster_name is not None:
        # Add last cluster on biom
        biom.add_observation( cluster_name )
        for sample_name in cluster_count:
            if not samples_seen.has_key( sample_name ):
                biom.add_sample( sample_name )
                samples_seen[sample_name] = 1
            biom.add_count( cluster_name, sample_name, cluster_count[sample_name] ) 
    # Write
    BiomIO.write( output_biom, biom )


class Uclust (Analysis):
    """
     @summary: Produces a set of representative sequences.
     After the clustering step some statistics are produced : 
         - Number of cluster by depth.
         - Hierarchical clustering between samples. This clustering is based on clusters's normalised expression.
    """
    
    def define_parameters(self, input_fasta, identity_treshold=0.97, is_nucleic=True, distance_method='euclidean', linkage_method='average', 
                             precluster_size_sep=None, precluster_sample_sep=None):
        """
         @param input_fasta : [str] fasta list to process
         @param identity_threshold : [float] sequence identity threshold to cluster.
         @param is_nucleic : [bool] False => the sequences are amino acids.
         @param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
         @param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
         @param precluster_size_sep : [str] used if sequences provided to swarm are pre-clusters (ex : dereplication step before swarm). 
                                      The number of sequences represented by each pre-cluster can be added to the end of its ID. 
                                      In this place the number is separated by the character precluster_size_sep.
                                      Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
         @param precluster_sample_sep : [str] used if sequences provided to swarm come from differents samples. The sample name is stored in 
                                        each sequence id. It is separated by the character precluster_sample_sep and it is placed before the 
                                        pre-cluster size information.
                                        Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                                  OR
                                                  sequence ID = 'seq10001|lake_spot_1'.
        """
        # Parameters
        self.identity_treshold = identity_treshold
        self.molecule_type = "nucleo" if is_nucleic else "amino"
        self.distance_method = distance_method
        self.linkage_method = linkage_method
        self.precluster_size_sep = self.precluster_size_sep = "none" if precluster_size_sep is None else precluster_size_sep
        self.precluster_sample_sep = self.precluster_sample_sep = "none" if precluster_sample_sep is None else precluster_sample_sep
        
        # Files
        self.input_fasta = InputFileList(input_fasta)
        self.output_files = OutputFileList( self.get_outputs('{basename_woext}_uclust.fasta', self.input_fasta), Formats.FASTA )
        self.cluster_files = OutputFileList( self.get_outputs('{basename_woext}.clstr', self.input_fasta) )
        self.uclust_files = OutputFileList( self.get_outputs('{basename_woext}.uc', self.input_fasta) )
        self.biom_files = OutputFileList( self.get_outputs('{basename_woext}_uclust.biom', self.input_fasta) )
        self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_depth.tsv', self.input_fasta) )
        self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_hclust.json', self.input_fasta) )
        self.stderr = os.path.join( self.output_directory, 'uclust.stderr' )

    def get_template(self):
        return "ClustersStats"

    def define_analysis(self):
        self.name = "Cluster"
        self.description = "Cluster sequences."
        self.software = "uclust"
        self.options = "uclust --" + str(self.molecule_type) + " -id " + str(self.identity_treshold) + ";hierarchical_clustering distance=" + self.distance_method + " linkage=" + self.linkage_method

    def get_version(self):
        cmd = [self.get_exec_path("uclust"), "--version"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        return stdout.split()[1]

    def post_process(self):
        self._save_files( self.biom_files + self.output_files + self.depth_files )
        # Parse depths
        for filepath in self.depth_files:
            [depths, counts, nb_observations, nb_seq, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
            sample_name = os.path.basename(filepath).split("_depth.tsv")[0]
            self._add_result_element( sample_name, "depths", ",".join(map(str, depths)) )
            self._add_result_element( sample_name, "counts", ",".join(map(str, counts)) )
            self._add_result_element( sample_name, "nb_observations", str(nb_observations) )
            self._add_result_element( sample_name, "nb_sequences", str(nb_seq) )
            self._add_result_element( sample_name, "upper_quartile", str(upper_quartile) )
            self._add_result_element( sample_name, "median", str(median) )
            self._add_result_element( sample_name, "lower_quartile", str(lower_quartile) )
        # Parse JSON
        for filepath in self.hclust_files:
            json_fh = open( filepath )
            hierarchical_clustering = ' '.join( line.strip() for line in json_fh )
            json_fh.close()
            sample_name = os.path.basename(filepath).split("_hclust.json")[0]
            self._add_result_element( sample_name, "linkage", hierarchical_clustering )
    
    def process(self):
        self.sorted_fasta = self.get_outputs('{basename_woext}_sorted.fasta', self.input_fasta)
        
        # Sort sequences by decreasing abundance
        sort = ShellFunction( self.get_exec_path("uclust") + " --sort $1 --output $2  2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        sort = Map( sort, inputs=self.input_fasta, outputs=self.sorted_fasta )
       
        # Build clusters
        uclust = ShellFunction( self.get_exec_path("uclust") + " --" + str(self.molecule_type) + " --id " + str(self.identity_treshold) + " --input $1 --uc $2 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        uclust = Map( uclust, inputs=self.sorted_fasta, outputs=self.uclust_files )
        convert = ShellFunction( self.get_exec_path("uclust") + " --uc2clstr $1 --output $2 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        convert = Map( convert, inputs=self.uclust_files, outputs=self.cluster_files )
        
        # Retrieve cluster's sequences
        fasta = ShellFunction( self.get_exec_path("uclust") + " --uc2fasta $1 --input $2 --output $3 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        fasta = MultiMap( fasta, inputs=[self.uclust_files, self.sorted_fasta], outputs=self.output_files )
        
        # Build biom
        biom = PythonFunction( to_biom, cmd_format="{EXE} {OUT} {IN} '" + self.precluster_sample_sep + "' '" + str(self.precluster_size_sep) + "' 2>> " + self.stderr )
        biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
        
        # Depths stats
        depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        Map( depth, inputs=self.biom_files, outputs=self.depth_files )
        
        # Linkage stats
        dendrogram = PythonFunction( samples_hclassification, cmd_format='{EXE} {IN} {OUT} ' + self.distance_method + ' ' + self.linkage_method + ' 2>> ' + self.stderr )
        Map( dendrogram, inputs=self.biom_files, outputs=self.hclust_files )