swarm.py 13.6 KB
Newer Older
Penom Nom's avatar
Penom Nom committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
#
# Copyright (C) 2012 INRA
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import os
from subprocess import Popen, PIPE

from workflows.gene_diversity.lib.biomstat import *

from ng6.analysis import Analysis

from jflow.iotypes import OutputFileList, InputFileList, Formats
from jflow.abstraction import MultiMap

from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map


def filter_seq( input_fasta, clusters_file, cluster_fasta ):
    """
     @summary: Write a fasta where the representative sequences ID will be replaced by the ID of the cluster.
     @param input_fasta : [str] path to the fasta to process.
     @param clusters_file : [str] path to the '.clstr' (one cluster per line, representative sequence ID first).
     @param cluster_fasta : [str] path to the fasta after process.
    """
    import jflow.seqio as seqio
    
    # Map each representative sequence ID (first field of each cluster line) to its cluster name.
    # Cluster names are numbered by line order, matching the numbering used in to_biom.
    cluster_representative = dict()
    cluster_idx = 1
    clusters_fh = open( clusters_file )
    for line in clusters_fh:
        line = line.strip()
        if not line: # skip blank lines instead of raising IndexError
            continue
        representative_id = line.split()[0]
        cluster_representative[representative_id] = "Cluster_" + str(cluster_idx)
        cluster_idx += 1
    clusters_fh.close()

    # Keep only the representative sequences, renamed with their cluster name.
    reader = seqio.SequenceReader( input_fasta )
    out_fh = open( cluster_fasta, "w" )
    for id, desc, seq, qual in reader :
        if id in cluster_representative: # 'in' instead of deprecated dict.has_key
            seqio.writefasta( out_fh, [[cluster_representative[id], desc, seq, qual]] )
    out_fh.close() # previously leaked: handle was never closed, risking unflushed output
            
def to_biom( output_biom, clusters_file, precluster_sample_sep, precluster_size_sep ):
    """
    @summary : Write a biom file from swarm results.
    @param output_biom : [str] path to the output file.
    @param clusters_file : [str] path to the '.clstr' file (one cluster per line, whitespace-separated sequence IDs).
    @param precluster_sample_sep : [str] used if sequences provided to swarm come from differents samples ("none" otherwise). The sample name is stored in each
                                 sequence id. It is separated by the character precluster_sample_sep and it is placed before the pre-cluster size information.
                                 Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                           OR
                                           sequence ID = 'seq10001|lake_spot_1'.
    @param precluster_size_sep : [str] used if sequences provided to swarm are pre-clusters (ex : dereplication step before swarm). The number of sequences
                                 represented by each pre-cluster can be added to the end of its ID. In this place the number is separated by the character 
                                 precluster_size_sep.
                                 Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
    """
    from workflows.gene_diversity.lib.Biom import Biom, BiomIO
    
    samples_seen = dict() # samples already declared in the biom (avoid duplicate add_sample)
    biom = Biom( generated_by='swarm', matrix_type="sparse" )
    
    # Process count
    cluster_idx = 1
    clusters_fh = open( clusters_file )
    for line in clusters_fh:
        cluster_name = "Cluster_" + str(cluster_idx)
        cluster_count = dict() # count by sample
        line_fields = line.strip().split()
        # Retrieve count by sample
        for seq_id in line_fields:
            real_id = seq_id
            # Count : the size suffix is the LAST separator-delimited field
            precluster_count = 1
            if precluster_size_sep != "none":
                precluster_count = int( real_id.split(precluster_size_sep)[-1] )
                real_id = precluster_size_sep.join( real_id.split(precluster_size_sep)[0:-1] ) # trim cluster size from ID
            # Sample : after size trimming, the sample name is the last field
            sample_name = "all"
            if precluster_sample_sep != "none":
                sample_name = real_id.split(precluster_sample_sep)[-1]
                real_id = precluster_sample_sep.join( real_id.split(precluster_sample_sep)[0:-1] ) # trim sample name from ID
            # Store
            if sample_name in cluster_count: # 'in' instead of deprecated dict.has_key
                cluster_count[sample_name] += precluster_count
            else:
                cluster_count[sample_name] = precluster_count
        # Add cluster on biom
        biom.add_observation( cluster_name )
        for sample_name in cluster_count:
            if sample_name not in samples_seen:
                biom.add_sample( sample_name )
                samples_seen[sample_name] = 1
            biom.add_count( cluster_name, sample_name, cluster_count[sample_name] )
        # Next cluster
        cluster_idx += 1
    clusters_fh.close() # previously leaked: handle was never closed
           
    # Write
    BiomIO.write( output_biom, biom )


def sort_by_abundance( input_fasta, output_fasta, size_separator, output_folder, sort_type="desc", new_size_separator="_" ):
    """
    @summary : Sort the sequences of a fasta by their abundance suffix, rewriting the suffix separator.
    @param input_fasta : [str] path to the fasta to sort.
    @param output_fasta : [str] path to the sorted fasta.
    @param size_separator : [str] separator between the sequence ID and its abundance ("none" if IDs carry no abundance; every sequence then counts as 1).
    @param output_folder : [str] directory used for the per-abundance temporary files.
    @param sort_type : [str] "desc" for decreasing abundance, anything else for increasing.
    @param new_size_separator : [str] separator written between the ID and the abundance in the output.
    """
    import os
    import jflow.seqio as seqio
    
    # Split sequences into one temporary file per abundance value.
    abundances_files = dict()
    reader = seqio.SequenceReader( input_fasta )
    for id, desc, seq, qual in reader :
        abundance = 1
        id_without_abundance = id
        if size_separator != "none":
            abundance = id.split(size_separator)[-1]
            id_without_abundance = size_separator.join(id.split(size_separator)[0:-1])
        if abundance not in abundances_files: # 'in' instead of deprecated dict.has_key
            abundances_files[abundance] = dict()
            abundances_files[abundance]["path"] = os.path.join(output_folder, 'tmp_abundance_' + abundance +  '.fasta' )
            abundances_files[abundance]["filehandle"] = open( abundances_files[abundance]["path"], "w" )
        seqio.writefasta( abundances_files[abundance]["filehandle"], [[id_without_abundance + new_size_separator + abundance, desc, seq, qual]] )
    
    for abundance in abundances_files.keys():
        abundances_files[abundance]["filehandle"].close()
    
    try:
        # Merge the temporary files in abundance order.
        output_fh = open( output_fasta, "w" )
        reverse_order = True if sort_type == "desc" else False
        abundances = sorted( abundances_files.keys(), key=int, reverse=reverse_order ) # numeric sort: abundances are stored as strings
        for current_abundance in abundances:
            fh = open( abundances_files[current_abundance]["path"] )
            for line in fh:
                output_fh.write( line.strip() + "\n" )
            fh.close()
        output_fh.close()
    finally:
        # Remove temporary files even if the merge fails (previously they were leaked on error).
        for tmp_file in abundances_files.keys():
            os.remove( abundances_files[tmp_file]["path"] )


class Swarm (Analysis):
    """
     @summary: Produces a set of representative sequences.
     After the clustering step some statistics are produced : 
         - Number of cluster by depth.
         - Hierarchical clustering between samples. This clustering is based on clusters's normalised expression.
    """
    
    def define_parameters(self, input_fasta, max_differences=1, threads=1, distance_method='euclidean', linkage_method='average', 
                             precluster_size_sep=None, precluster_sample_sep=None):
        """
         @param input_fasta : [str] fasta list to process
         @param max_differences : [int] the maximum number of differences between sequences in each cluster.
         @param threads : [int] number of threads to use.
         @param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
         @param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
         @param precluster_size_sep : [str] used if sequences provided to swarm are pre-clusters (ex : dereplication step before swarm). 
                                      The number of sequences represented by each pre-cluster can be added to the end of its ID. 
                                      In this place the number is separated by the character precluster_size_sep.
                                      Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
         @param precluster_sample_sep : [str] used if sequences provided to swarm come from differents samples. The sample name is stored in 
                                        each sequence id. It is separated by the character precluster_sample_sep and it is placed before the 
                                        pre-cluster size information.
                                        Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                                  OR
                                                  sequence ID = 'seq10001|lake_spot_1'.
        """
        # Parameters
        self.max_differences = max_differences
        self.threads = threads
        self.distance_method = distance_method
        self.linkage_method = linkage_method
        # "none" is the sentinel understood by sort_by_abundance/to_biom when no separator applies.
        # (Fixed: these two lines previously contained a duplicated 'self.x = self.x =' assignment.)
        self.precluster_size_sep = "none" if precluster_size_sep is None else precluster_size_sep
        self.precluster_sample_sep = "none" if precluster_sample_sep is None else precluster_sample_sep
        
        # Files
        self.input_fasta = InputFileList(input_fasta)
        self.output_files = OutputFileList( self.get_outputs('{basename_woext}_swarm.fasta', self.input_fasta), Formats.FASTA )
        self.cluster_files = OutputFileList( self.get_outputs('{basename_woext}.clstr', self.input_fasta) )
        self.log_files = OutputFileList( self.get_outputs('{basename_woext}.log', self.input_fasta) )
        self.biom_files = OutputFileList( self.get_outputs('{basename_woext}_swarm.biom', self.input_fasta) )
        self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_depth.tsv', self.input_fasta) )
        self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_hclust.json', self.input_fasta) )
        self.stderr = os.path.join( self.output_directory, 'swarm.stderr' )

    def get_template(self):
        """ @return : [str] the name of the web report template used for this analysis. """
        return "ClustersStats"

    def define_analysis(self):
        """ @summary : Set the analysis metadata displayed in the report. """
        self.name = "Cluster"
        self.description = "Cluster sequences."
        self.software = "swarm"
        self.options = "swarm -d " + str(self.max_differences) + ";hierarchical_clustering distance=" + self.distance_method + " linkage=" + self.linkage_method

    def get_version(self):
        """
        @summary : Return the swarm version.
        @return : [str] the second whitespace-separated token of 'swarm --version' (written on stderr).
        """
        cmd = [self.get_exec_path("swarm"), "--version"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        return stderr.split()[1]

    def post_process(self):
        """ @summary : Save outputs and parse depth/hclust statistics into the analysis results. """
        self._save_files( self.biom_files + self.output_files + self.depth_files )#+ self.hclust_files )
        # Parse depths
        for filepath in self.depth_files:
            [depths, counts, nb_observations, nb_seq, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
            sample_name = os.path.basename(filepath).split("_depth.tsv")[0]
            self._add_result_element( sample_name, "depths", ",".join(map(str, depths)) )
            self._add_result_element( sample_name, "counts", ",".join(map(str, counts)) )
            self._add_result_element( sample_name, "nb_observations", str(nb_observations) )
            self._add_result_element( sample_name, "nb_sequences", str(nb_seq) )
            self._add_result_element( sample_name, "upper_quartile", str(upper_quartile) )
            self._add_result_element( sample_name, "median", str(median) )
            self._add_result_element( sample_name, "lower_quartile", str(lower_quartile) )
        # Parse JSON : store the dendrogram as a single-line string
        for filepath in self.hclust_files:
            json_fh = open( filepath )
            hierarchical_clustering = ' '.join( line.strip() for line in json_fh )
            json_fh.close()
            sample_name = os.path.basename(filepath).split("_hclust.json")[0]
            self._add_result_element( sample_name, "linkage", hierarchical_clustering )
    
    def process(self):
        """ @summary : Build the workflow DAG : sort -> swarm -> filter -> biom -> depth/hclust stats. """
        self.sorted_fasta = self.get_outputs('{basename_woext}_sorted.fasta', self.input_fasta)
        
        # Sort sequences by decreasing abundance (swarm expects most abundant first)
        sort = PythonFunction( sort_by_abundance, cmd_format="{EXE} {IN} {OUT} '" + str(self.precluster_size_sep) + "' '" + self.output_directory + "' 2>> " + self.stderr )
        sort = Map( sort, inputs=self.input_fasta, outputs=self.sorted_fasta )
        
        # Build clusters
        swarm = ShellFunction( self.get_exec_path("swarm") + " -d " + str(self.max_differences) + " -t " + str(self.threads) + " -s $2 -o $3 $1 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        swarm = MultiMap( swarm, inputs=self.sorted_fasta, outputs=[self.log_files, self.cluster_files] )
        
        # Retrieve cluster's sequences
        filter = PythonFunction( filter_seq, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        filter = MultiMap( filter, inputs=[self.sorted_fasta, self.cluster_files], outputs=self.output_files )
        
        # Build biom ('_' is the size separator because sort_by_abundance rewrote IDs with new_size_separator='_')
        biom = PythonFunction( to_biom, cmd_format="{EXE} {OUT} {IN} '" + self.precluster_sample_sep + "' '_' 2>> " + self.stderr )
        biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
        
        # Depths stats
        depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        Map( depth, inputs=self.biom_files, outputs=self.depth_files )
        
        # Linkage stats
        dendrogram = PythonFunction( samples_hclassification, cmd_format='{EXE} {IN} {OUT} ' + self.distance_method + ' ' + self.linkage_method + ' 2>> ' + self.stderr )
        Map( dendrogram, inputs=self.biom_files, outputs=self.hclust_files )