Commit 88935c70 authored by Penom Nom's avatar Penom Nom
Browse files

Add wrapper to swarm and uclust.

parent 029d4fc9
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from workflows.gene_diversity.lib.biomstat import *
from ng6.analysis import Analysis
from jflow.iotypes import OutputFileList, InputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map
def filter_seq( input_fasta, clusters_file, cluster_fasta ):
    """
    @summary: Write a fasta containing only the representative sequences, each
              renamed with the ID of its cluster ('Cluster_<idx>').
    @param input_fasta : [str] path to the fasta to process.
    @param clusters_file : [str] path to the '.clstr' file (one cluster per line,
                           the representative sequence ID is the first field).
    @param cluster_fasta : [str] path to the output fasta.
    """
    import jflow.seqio as seqio
    cluster_representative = dict()
    # Map each representative sequence ID to its cluster name.
    # Clusters are numbered from 1 in file order.
    cluster_idx = 1
    clusters_fh = open( clusters_file )
    for line in clusters_fh:
        representative_id = line.strip().split()[0]
        cluster_representative[representative_id] = "Cluster_" + str(cluster_idx)
        cluster_idx += 1
    clusters_fh.close()
    # Keep only representative sequences, renamed with their cluster name
    reader = seqio.SequenceReader( input_fasta )
    out_fh = open( cluster_fasta, "w" )
    for id, desc, seq, qual in reader:
        if id in cluster_representative:  # 'in' instead of has_key (removed in Python 3)
            seqio.writefasta( out_fh, [[cluster_representative[id], desc, seq, qual]] )
    out_fh.close()  # BUGFIX: output handle was never closed (risk of buffered data loss)
def to_biom( output_biom, clusters_file, precluster_sample_sep, precluster_size_sep ):
    """
    @summary : Write a biom file from swarm results.
    @param output_biom : [str] path to the output file.
    @param clusters_file : [str] path to the '.clstr' file (one cluster per line,
                           whitespace-separated pre-cluster IDs).
    @param precluster_sample_sep : [str] used if sequences provided to swarm come from differents samples ("none" otherwise). The sample name is stored in each
                                   sequence id. It is separated by the character precluster_sample_sep and it is placed before the pre-cluster size information.
                                   Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                             OR
                                             sequence ID = 'seq10001|lake_spot_1'.
    @param precluster_size_sep : [str] used if sequences provided to swarm are pre-clusters (ex : dereplication step before swarm). The number of sequences
                                 represented by each pre-cluster can be added to the end of its ID. In this place the number is separated by the character
                                 precluster_size_sep.
                                 Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
    """
    from workflows.gene_diversity.lib.Biom import Biom, BiomIO
    samples_seen = dict()
    biom = Biom( generated_by='swarm', matrix_type="sparse" )
    # One cluster per line; clusters are numbered from 1 in file order
    cluster_idx = 1
    clusters_fh = open( clusters_file )
    for line in clusters_fh:
        cluster_name = "Cluster_" + str(cluster_idx)
        cluster_count = dict()  # count by sample
        for seq_id in line.strip().split():
            real_id = seq_id
            # Pre-cluster abundance is the suffix after the last precluster_size_sep
            precluster_count = 1
            if precluster_size_sep != "none":
                precluster_count = int( real_id.split(precluster_size_sep)[-1] )
                real_id = precluster_size_sep.join( real_id.split(precluster_size_sep)[0:-1] )  # trim cluster size from ID
            # Sample name is the suffix after the last precluster_sample_sep
            sample_name = "all"
            if precluster_sample_sep != "none":
                sample_name = real_id.split(precluster_sample_sep)[-1]
                real_id = precluster_sample_sep.join( real_id.split(precluster_sample_sep)[0:-1] )  # trim sample name from ID
            # Accumulate ('get' replaces the Python-2-only has_key check)
            cluster_count[sample_name] = cluster_count.get( sample_name, 0 ) + precluster_count
        # Add cluster on biom, registering any sample seen for the first time
        biom.add_observation( cluster_name )
        for sample_name in cluster_count:
            if sample_name not in samples_seen:
                biom.add_sample( sample_name )
                samples_seen[sample_name] = 1
            biom.add_count( cluster_name, sample_name, cluster_count[sample_name] )
        # Next cluster
        cluster_idx += 1
    clusters_fh.close()  # BUGFIX: handle was never closed
    # Write
    BiomIO.write( output_biom, biom )
def sort_by_abundance( input_fasta, output_fasta, size_separator, output_folder, sort_type="desc", new_size_separator="_" ):
    """
    @summary: Sort fasta sequences by abundance and rewrite the abundance suffix
              of each sequence ID with new_size_separator.
    @param input_fasta : [str] path to the fasta to process.
    @param output_fasta : [str] path to the sorted output fasta.
    @param size_separator : [str] character separating the abundance from the rest
                            of the sequence ID ("none" if IDs carry no abundance).
    @param output_folder : [str] folder used for the temporary per-abundance files.
    @param sort_type : [str] "desc" for decreasing abundance, anything else for increasing.
    @param new_size_separator : [str] separator written between the ID and the abundance in the output.
    """
    import os
    import jflow.seqio as seqio
    # Split sequences into one temporary file per abundance value
    abundances_files = dict()
    reader = seqio.SequenceReader( input_fasta )
    for id, desc, seq, qual in reader:
        # BUGFIX: abundance must be a string ("1", not the int 1) because it is
        # concatenated into file names and sequence IDs below.
        abundance = "1"
        id_without_abundance = id
        if size_separator != "none":
            abundance = id.split(size_separator)[-1]
            id_without_abundance = size_separator.join( id.split(size_separator)[0:-1] )
        if abundance not in abundances_files:  # 'in' instead of has_key (removed in Python 3)
            abundances_files[abundance] = dict()
            abundances_files[abundance]["path"] = os.path.join( output_folder, 'tmp_abundance_' + abundance + '.fasta' )
            abundances_files[abundance]["filehandle"] = open( abundances_files[abundance]["path"], "w" )
        seqio.writefasta( abundances_files[abundance]["filehandle"], [[id_without_abundance + new_size_separator + abundance, desc, seq, qual]] )
    for abundance in abundances_files.keys():
        abundances_files[abundance]["filehandle"].close()
    # Merge the temporary files in (numeric) abundance order
    output_fh = open( output_fasta, "w" )
    reverse_order = (sort_type == "desc")
    abundances = sorted( abundances_files.keys(), key=int, reverse=reverse_order )
    for current_abundance in abundances:
        fh = open( abundances_files[current_abundance]["path"] )
        for line in fh:
            output_fh.write( line.strip() + "\n" )
        fh.close()
    output_fh.close()
    # Remove temporary files
    for tmp_file in abundances_files.keys():
        os.remove( abundances_files[tmp_file]["path"] )
class Swarm (Analysis):
    """
    @summary: Produces a set of representative sequences with swarm.
              After the clustering step some statistics are produced :
                - Number of cluster by depth.
                - Hierarchical clustering between samples. This clustering is based on clusters's normalised expression.
    """

    def define_parameters(self, input_fasta, max_differences=1, threads=1, distance_method='euclidean', linkage_method='average',
                          precluster_size_sep=None, precluster_sample_sep=None):
        """
        @param input_fasta : [str] fasta list to process
        @param max_differences : [int] the maximum number of differences between sequences in each cluster.
        @param threads : [int] number of threads to use.
        @param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
        @param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
        @param precluster_size_sep : [str] used if sequences provided to swarm are pre-clusters (ex : dereplication step before swarm).
                                     The number of sequences represented by each pre-cluster can be added to the end of its ID.
                                     In this place the number is separated by the character precluster_size_sep.
                                     Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
        @param precluster_sample_sep : [str] used if sequences provided to swarm come from differents samples. The sample name is stored in
                                       each sequence id. It is separated by the character precluster_sample_sep and it is placed before the
                                       pre-cluster size information.
                                       Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                                 OR
                                                 sequence ID = 'seq10001|lake_spot_1'.
        """
        # Parameters
        self.max_differences = max_differences
        self.threads = threads
        self.distance_method = distance_method
        self.linkage_method = linkage_method
        # BUGFIX: removed the duplicated chained assignment ('self.x = self.x = ...')
        self.precluster_size_sep = "none" if precluster_size_sep is None else precluster_size_sep
        self.precluster_sample_sep = "none" if precluster_sample_sep is None else precluster_sample_sep
        # Files
        self.input_fasta = InputFileList(input_fasta)
        self.output_files = OutputFileList( self.get_outputs('{basename_woext}_swarm.fasta', self.input_fasta), Formats.FASTA )
        self.cluster_files = OutputFileList( self.get_outputs('{basename_woext}.clstr', self.input_fasta) )
        self.log_files = OutputFileList( self.get_outputs('{basename_woext}.log', self.input_fasta) )
        self.biom_files = OutputFileList( self.get_outputs('{basename_woext}_swarm.biom', self.input_fasta) )
        self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_depth.tsv', self.input_fasta) )
        self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_hclust.json', self.input_fasta) )
        self.stderr = os.path.join( self.output_directory, 'swarm.stderr' )

    def get_template(self):
        return "ClustersStats"

    def define_analysis(self):
        self.name = "Cluster"
        self.description = "Cluster sequences."
        self.software = "swarm"
        self.options = "swarm -d " + str(self.max_differences) + ";hierarchical_clustering distance=" + self.distance_method + " linkage=" + self.linkage_method

    def get_version(self):
        """
        @return : [str] the swarm version number (swarm prints its version on stderr).
        """
        cmd = [self.get_exec_path("swarm"), "--version"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        return stderr.split()[1]

    def post_process(self):
        self._save_files( self.biom_files + self.output_files + self.depth_files )#+ self.hclust_files )
        # Parse depths
        for filepath in self.depth_files:
            [depths, counts, nb_observations, nb_seq, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
            sample_name = os.path.basename(filepath).split("_depth.tsv")[0]
            self._add_result_element( sample_name, "depths", ",".join(map(str, depths)) )
            self._add_result_element( sample_name, "counts", ",".join(map(str, counts)) )
            self._add_result_element( sample_name, "nb_observations", str(nb_observations) )
            self._add_result_element( sample_name, "nb_sequences", str(nb_seq) )
            self._add_result_element( sample_name, "upper_quartile", str(upper_quartile) )
            self._add_result_element( sample_name, "median", str(median) )
            self._add_result_element( sample_name, "lower_quartile", str(lower_quartile) )
        # Parse JSON
        for filepath in self.hclust_files:
            json_fh = open( filepath )
            hierarchical_clustering = ' '.join( line.strip() for line in json_fh )
            json_fh.close()
            sample_name = os.path.basename(filepath).split("_hclust.json")[0]
            self._add_result_element( sample_name, "linkage", hierarchical_clustering )

    def process(self):
        self.sorted_fasta = self.get_outputs('{basename_woext}_sorted.fasta', self.input_fasta)
        # Sort sequences by decreasing abundance (renames the size suffix to '_')
        sort = PythonFunction( sort_by_abundance, cmd_format="{EXE} {IN} {OUT} '" + str(self.precluster_size_sep) + "' '" + self.output_directory + "' 2>> " + self.stderr )
        sort = Map( sort, inputs=self.input_fasta, outputs=self.sorted_fasta )
        # Build clusters
        swarm = ShellFunction( self.get_exec_path("swarm") + " -d " + str(self.max_differences) + " -t " + str(self.threads) + " -s $2 -o $3 $1 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        swarm = MultiMap( swarm, inputs=self.sorted_fasta, outputs=[self.log_files, self.cluster_files] )
        # Retrieve cluster's sequences (renamed local: 'filter' shadows the builtin)
        seq_filter = PythonFunction( filter_seq, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        seq_filter = MultiMap( seq_filter, inputs=[self.sorted_fasta, self.cluster_files], outputs=self.output_files )
        # Build biom (size separator is '_' because sort_by_abundance rewrote it)
        biom = PythonFunction( to_biom, cmd_format="{EXE} {OUT} {IN} '" + self.precluster_sample_sep + "' '_' 2>> " + self.stderr )
        biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
        # Depths stats
        depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        Map( depth, inputs=self.biom_files, outputs=self.depth_files )
        # Linkage stats
        dendrogram = PythonFunction( samples_hclassification, cmd_format='{EXE} {IN} {OUT} ' + self.distance_method + ' ' + self.linkage_method + ' 2>> ' + self.stderr )
        Map( dendrogram, inputs=self.biom_files, outputs=self.hclust_files )
\ No newline at end of file
#
# Copyright (C) 2012 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
from subprocess import Popen, PIPE
from workflows.gene_diversity.lib.biomstat import *
from ng6.analysis import Analysis
from jflow.iotypes import OutputFileList, InputFileList, Formats
from jflow.abstraction import MultiMap
from weaver.function import ShellFunction, PythonFunction
from weaver.abstraction import Map
def to_biom( output_biom, clusters_file, precluster_sample_sep, precluster_size_sep ):
    """
    @summary : Write a biom file from cdhit results.
    @param output_biom : [str] path to the output file.
    @param clusters_file : [str] path to the '.clstr' file.
    @param precluster_sample_sep : [str] used if sequences provided to Cdhit come from differents samples ("none" otherwise). The sample name is stored in each
                                   sequence id. It is separated by the character precluster_sample_sep and it is placed before the pre-cluster size information.
                                   Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                             OR
                                             sequence ID = 'seq10001|lake_spot_1'.
    @param precluster_size_sep : [str] used if sequences provided to Cdhit are pre-clusters (ex : dereplication step before cd-hit). The number of sequences
                                 represented by each pre-cluster can be added to the end of its ID. In this place the number is separated by the character
                                 precluster_size_sep.
                                 Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
    """
    from workflows.gene_diversity.lib.Biom import Biom, BiomIO
    samples_seen = dict()
    biom = Biom( generated_by='cdhit', matrix_type="sparse" )

    def _add_cluster( cluster_name, cluster_count ):
        # Register a finished cluster and its per-sample counts on the biom.
        biom.add_observation( cluster_name )
        for sample_name in cluster_count:
            if sample_name not in samples_seen:  # 'in' instead of has_key (removed in Python 3)
                biom.add_sample( sample_name )
                samples_seen[sample_name] = 1
            biom.add_count( cluster_name, sample_name, cluster_count[sample_name] )

    # Process count
    cluster_name = None
    cluster_count = dict()
    clusters_fh = open( clusters_file )  # iterate lazily instead of readlines()
    for line in clusters_fh:
        # New cluster
        if line.startswith(">"):  # Line example : '>Cluster 0' => 'Cluster_0'
            if cluster_name is not None:
                _add_cluster( cluster_name, cluster_count )
            # Start new cluster
            cluster_name = line.strip()[1:].replace(" ", "_")
            cluster_count = dict()
        # Additional information on current cluster
        else:  # Line example : '60 130aa, >c6104.0;1... at 99.23%'
            pre_cluster_id = line.strip().split()[2][1:-3]  # strips leading '>' and trailing '...' => 'c6104.0;1'
            # Count
            pre_cluster_count = 1
            if precluster_size_sep != "none":
                pre_cluster_count = int( pre_cluster_id.split(precluster_size_sep)[-1] )
                pre_cluster_id = precluster_size_sep.join( pre_cluster_id.split(precluster_size_sep)[0:-1] )  # trim cluster size from ID
            # Sample
            pre_cluster_sample = "all"
            if precluster_sample_sep != "none":
                pre_cluster_sample = pre_cluster_id.split(precluster_sample_sep)[-1]
            # Store
            cluster_count[pre_cluster_sample] = cluster_count.get( pre_cluster_sample, 0 ) + pre_cluster_count
    clusters_fh.close()  # BUGFIX: handle was never closed
    # Add last cluster on biom
    if cluster_name is not None:
        _add_cluster( cluster_name, cluster_count )
    # Write
    BiomIO.write( output_biom, biom )
class Uclust (Analysis):
    """
    @summary: Produces a set of representative sequences with uclust.
              After the clustering step some statistics are produced :
                - Number of cluster by depth.
                - Hierarchical clustering between samples. This clustering is based on clusters's normalised expression.
    """

    def define_parameters(self, input_fasta, identity_treshold=0.97, is_nucleic=True, distance_method='euclidean', linkage_method='average',
                          precluster_size_sep=None, precluster_sample_sep=None):
        """
        @param input_fasta : [str] fasta list to process
        @param identity_treshold : [float] sequence identity threshold to cluster.
        @param is_nucleic : [bool] False => the sequences are amino acids.
        @param distance_method : [str] distance method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
        @param linkage_method : [str] linkage method for the hierarchical clustering. Accepted values @see biomstat.samples_hclassification.
        @param precluster_size_sep : [str] used if sequences provided to uclust are pre-clusters (ex : dereplication step before uclust).
                                     The number of sequences represented by each pre-cluster can be added to the end of its ID.
                                     In this place the number is separated by the character precluster_size_sep.
                                     Example : precluster_size_sep=';' where sequence ID = 'seq10001;83'.
        @param precluster_sample_sep : [str] used if sequences provided to uclust come from differents samples. The sample name is stored in
                                       each sequence id. It is separated by the character precluster_sample_sep and it is placed before the
                                       pre-cluster size information.
                                       Example : sequence ID = 'seq10001|lake_spot_1;83'.
                                                 OR
                                                 sequence ID = 'seq10001|lake_spot_1'.
        """
        # Parameters
        self.identity_treshold = identity_treshold
        self.molecule_type = "nucleo" if is_nucleic else "amino"
        self.distance_method = distance_method
        self.linkage_method = linkage_method
        # BUGFIX: removed the duplicated chained assignment ('self.x = self.x = ...')
        self.precluster_size_sep = "none" if precluster_size_sep is None else precluster_size_sep
        self.precluster_sample_sep = "none" if precluster_sample_sep is None else precluster_sample_sep
        # Files
        self.input_fasta = InputFileList(input_fasta)
        self.output_files = OutputFileList( self.get_outputs('{basename_woext}_uclust.fasta', self.input_fasta), Formats.FASTA )
        self.cluster_files = OutputFileList( self.get_outputs('{basename_woext}.clstr', self.input_fasta) )
        self.uclust_files = OutputFileList( self.get_outputs('{basename_woext}.uc', self.input_fasta) )
        self.biom_files = OutputFileList( self.get_outputs('{basename_woext}_uclust.biom', self.input_fasta) )
        self.depth_files = OutputFileList( self.get_outputs('{basename_woext}_depth.tsv', self.input_fasta) )
        self.hclust_files = OutputFileList( self.get_outputs('{basename_woext}_hclust.json', self.input_fasta) )
        self.stderr = os.path.join( self.output_directory, 'uclust.stderr' )

    def get_template(self):
        return "ClustersStats"

    def define_analysis(self):
        self.name = "Cluster"
        self.description = "Cluster sequences."
        self.software = "uclust"
        self.options = "uclust --" + str(self.molecule_type) + " -id " + str(self.identity_treshold) + ";hierarchical_clustering distance=" + self.distance_method + " linkage=" + self.linkage_method

    def get_version(self):
        """
        @return : [str] the uclust version number (uclust prints its version on stdout).
        """
        cmd = [self.get_exec_path("uclust"), "--version"]
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()
        return stdout.split()[1]

    def post_process(self):
        self._save_files( self.biom_files + self.output_files + self.depth_files )
        # Parse depths
        for filepath in self.depth_files:
            [depths, counts, nb_observations, nb_seq, upper_quartile, median, lower_quartile] = observations_depth_to_stat(filepath)
            sample_name = os.path.basename(filepath).split("_depth.tsv")[0]
            self._add_result_element( sample_name, "depths", ",".join(map(str, depths)) )
            self._add_result_element( sample_name, "counts", ",".join(map(str, counts)) )
            self._add_result_element( sample_name, "nb_observations", str(nb_observations) )
            self._add_result_element( sample_name, "nb_sequences", str(nb_seq) )
            self._add_result_element( sample_name, "upper_quartile", str(upper_quartile) )
            self._add_result_element( sample_name, "median", str(median) )
            self._add_result_element( sample_name, "lower_quartile", str(lower_quartile) )
        # Parse JSON
        for filepath in self.hclust_files:
            json_fh = open( filepath )
            hierarchical_clustering = ' '.join( line.strip() for line in json_fh )
            json_fh.close()
            sample_name = os.path.basename(filepath).split("_hclust.json")[0]
            self._add_result_element( sample_name, "linkage", hierarchical_clustering )

    def process(self):
        self.sorted_fasta = self.get_outputs('{basename_woext}_sorted.fasta', self.input_fasta)
        # Sort sequences by decreasing abundance
        sort = ShellFunction( self.get_exec_path("uclust") + " --sort $1 --output $2 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        sort = Map( sort, inputs=self.input_fasta, outputs=self.sorted_fasta )
        # Build clusters
        uclust = ShellFunction( self.get_exec_path("uclust") + " --" + str(self.molecule_type) + " --id " + str(self.identity_treshold) + " --input $1 --uc $2 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        uclust = Map( uclust, inputs=self.sorted_fasta, outputs=self.uclust_files )
        convert = ShellFunction( self.get_exec_path("uclust") + " --uc2clstr $1 --output $2 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        convert = Map( convert, inputs=self.uclust_files, outputs=self.cluster_files )
        # Retrieve cluster's sequences
        fasta = ShellFunction( self.get_exec_path("uclust") + " --uc2fasta $1 --input $2 --output $3 2>> " + self.stderr, cmd_format='{EXE} {IN} {OUT}' )
        fasta = MultiMap( fasta, inputs=[self.uclust_files, self.sorted_fasta], outputs=self.output_files )
        # Build biom
        biom = PythonFunction( to_biom, cmd_format="{EXE} {OUT} {IN} '" + self.precluster_sample_sep + "' '" + str(self.precluster_size_sep) + "' 2>> " + self.stderr )
        biom = Map( biom, inputs=self.cluster_files, outputs=self.biom_files )
        # Depths stats
        depth = PythonFunction( observations_depth, cmd_format='{EXE} {IN} {OUT} 2>> ' + self.stderr )
        Map( depth, inputs=self.biom_files, outputs=self.depth_files )
        # Linkage stats
        dendrogram = PythonFunction( samples_hclassification, cmd_format='{EXE} {IN} {OUT} ' + self.distance_method + ' ' + self.linkage_method + ' 2>> ' + self.stderr )
        Map( dendrogram, inputs=self.biom_files, outputs=self.hclust_files )
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment