Commit 68568b65 authored by Antoine Lucas

delete

parent 1029ec2c
distpar <- function(x, method = "euclidean", nbproc = 2, diag = FALSE, upper = FALSE)
{
  if(class(x) == "exprSet")
    x <- exprs(x)

  ## account for possible spellings of euclid?an
  if(!is.na(pmatch(method, "euclidian")))
    method <- "euclidean"

  METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
               "binary", "pearson", "correlation", "spearman")
  method <- pmatch(method, METHODS)
  if(is.na(method))
    stop("invalid distance method")
  if(method == -1)
    stop("ambiguous distance method")

  N <- nrow(x <- as.matrix(x))

  d <- .C("R_distancepar",
          x = as.double(x),
          nr = N,
          nc = ncol(x),
          d = double(N*(N - 1)/2),
          diag = as.integer(FALSE),
          method = as.integer(method),
          nbproc = as.integer(nbproc),
          err = as.integer(0),
          DUP = FALSE, NAOK = TRUE, PACKAGE = "amap")$d

  attr(d, "Size") <- N
  attr(d, "Labels") <- dimnames(x)[[1]]
  attr(d, "Diag") <- diag
  attr(d, "Upper") <- upper
  attr(d, "method") <- METHODS[method]
  attr(d, "call") <- match.call()
  class(d) <- "dist"
  return(d)
}
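## Usage sketch (illustrative, not part of the package source): the "dist"
## object returned by distpar() can be fed to stats::hclust exactly like the
## result of dist(); the matrix and the nbproc value below are arbitrary.
##   m <- matrix(rnorm(200), nrow = 20)
##   d <- distpar(m, method = "manhattan", nbproc = 4)
##   h <- hclust(d, method = "average")
##   plot(h)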
## Hierarchical clustering parallelized
##
## Created : 18/11/02
## Last Modified : Time-stamp: <2005-11-13 21:06:46 antoine>
##
## This function is a "mix" of function dist and function hclust.
##
## Author : Antoine Lucas
##
hclusterpar <- function (x, method = "euclidean", diag = FALSE, upper = FALSE,
                         link = "complete", members = NULL, nbproc = 2)
{
  if(class(x) == "exprSet")
    x <- exprs(x)

  ## taken from dist
  if (!is.na(pmatch(method, "euclidian")))
    method <- "euclidean"
  METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
               "binary", "pearson", "correlation", "spearman")
  method <- pmatch(method, METHODS)
  if (is.na(method))
    stop("invalid distance method")
  if (method == -1)
    stop("ambiguous distance method")

  N <- nrow(x <- as.matrix(x))

  ## taken from hclust
  METHODSLINKS <- c("ward", "single", "complete", "average", "mcquitty",
                    "median", "centroid")
  link <- pmatch(link, METHODSLINKS)
  if (is.na(link))
    stop("invalid clustering method")
  if (link == -1)
    stop("ambiguous clustering method")
  if (N < 2)
    stop("Must have n >= 2 objects to cluster")
  if (is.null(members))
    members <- rep(1, N)
  if (length(members) != N)
    stop("Invalid length of members")

  n <- N
  hcl <- .C("hclusterpar",
            x = as.double(x),
            nr = as.integer(n),
            nc = as.integer(ncol(x)),
            diag = as.integer(FALSE),
            method = as.integer(method),
            iopt = as.integer(link),
            ia = integer(n),
            ib = integer(n),
            order = integer(n),
            crit = double(n),
            members = as.double(members),
            nbprocess = as.integer(nbproc),
            err = as.integer(0),
            DUP = FALSE,
            NAOK = TRUE, PACKAGE = "amap")

  if(hcl$err == 2)
    stop("Cannot allocate memory")
  if(hcl$err == 3)
    stop("Missing values in distance Matrix")
  if(hcl$err == 1)
    stop("Error")

  tree <- list(merge = cbind(hcl$ia[1:(N - 1)],
                             hcl$ib[1:(N - 1)]),
               height = hcl$crit[1:(N - 1)],
               order = hcl$order,
               labels = dimnames(x)[[1]],
               method = METHODSLINKS[link],
               call = match.call(),
               dist.method = METHODS[method])
  class(tree) <- "hclust"
  tree
}
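## Illustrative check (not part of the package source), assuming the parallel
## C routine computes the same pairwise distances as the serial Dist():
## hclusterpar() should then reproduce hclust() on that distance matrix.
##   x  <- matrix(rnorm(60), nrow = 12)
##   h1 <- hclusterpar(x, method = "euclidean", link = "complete", nbproc = 2)
##   h2 <- hclust(Dist(x, method = "euclidean"), method = "complete")
##   all.equal(h1$height, h2$height)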
\name{distpar}
\title{Parallelized Distance Matrix Computation}
\usage{
distpar(x, method = "euclidean", nbproc = 2, diag = FALSE, upper = FALSE)
}
\alias{distpar}
\arguments{
\item{x}{numeric matrix (or data frame), or an object of class
\code{"exprSet"}. Distances between the rows of
\code{x} will be computed.}
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"correlation"} or \code{"spearman"}.
Any unambiguous substring can be given.}
\item{nbproc}{integer, the number of subprocesses (threads) used for the
parallel computation.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
\item{upper}{logical value indicating whether the upper triangle of the
distance matrix should be printed by \code{print.dist}.}
}
\description{
This function computes and returns the distance matrix computed by
using the specified distance measure to compute the distances between
the rows of a data matrix.
}
\value{
An object of class \code{"dist"}.
}
\references{
Antoine Lucas and Sylvain Jasson, \emph{Using amap and ctc Packages
for Huge Clustering}, R News, 2006, vol 6, issue 5 pages 58-60.
}
\seealso{
\code{\link{Dist}}
}
\examples{
x <- matrix(rnorm(100), nrow = 5)

## compute dist with 8 threads
distpar(x, nbproc = 8)

## compute pearson dist with 8 threads
distpar(x, nbproc = 8, method = "pearson")
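
## A hedged consistency check: the parallel result is expected to match the
## serial Dist() from this package (up to numerical precision)
d1 <- as.matrix(distpar(x, nbproc = 2))
d2 <- as.matrix(Dist(x))
all.equal(d1, d2)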
}
\keyword{multivariate}
\keyword{cluster}
\name{hclusterpar}
\title{Parallelized Hierarchical Clustering}
\alias{hclusterpar}
\description{
Parallelized hierarchical cluster analysis.
}
\usage{
hclusterpar(x, method = "euclidean", diag = FALSE, upper = FALSE,
link = "complete", members = NULL, nbproc = 2)
}
\arguments{
\item{x}{
A numeric matrix of data, or an object that can be coerced to such a
matrix (such as a numeric vector or a data frame with all numeric
columns), or an object of class \code{"exprSet"}.
}
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"} \code{"binary"} \code{"pearson"},
\code{"correlation"} or \code{"spearman"}.
Any unambiguous substring can be given.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
\item{upper}{logical value indicating whether the upper triangle of the
distance matrix should be printed by \code{print.dist}.}
\item{link}{the agglomeration method to be used. This should
be (an unambiguous abbreviation of) one of
\code{"ward"}, \code{"single"}, \code{"complete"},
\code{"average"}, \code{"mcquitty"}, \code{"median"} or
\code{"centroid"}.}
\item{members}{\code{NULL} or a vector whose length equals the number of
rows of \code{x}.}
\item{nbproc}{integer, the number of subprocesses (threads) used for the
parallel computation.}
}
\value{
An object of class \bold{hclust} which describes the
tree produced by the clustering process.
The object is a list with components:
\item{merge}{an \eqn{n-1} by 2 matrix.
Row \eqn{i} of \code{merge} describes the merging of clusters
at step \eqn{i} of the clustering.
If an element \eqn{j} in the row is negative,
then observation \eqn{-j} was merged at this stage.
If \eqn{j} is positive then the merge
was with the cluster formed at the (earlier) stage \eqn{j}
of the algorithm.
Thus negative entries in \code{merge} indicate agglomerations
of singletons, and positive entries indicate agglomerations
of non-singletons.}
\item{height}{a set of \eqn{n-1} non-decreasing real values.
The clustering \emph{height}: that is, the value of
the criterion associated with the clustering
\code{method} for the particular agglomeration.}
\item{order}{a vector giving the permutation of the original
observations suitable for plotting, in the sense that a cluster
plot using this ordering and matrix \code{merge} will not have
crossings of the branches.}
\item{labels}{labels for each of the objects being clustered.}
\item{call}{the call which produced the result.}
\item{method}{the cluster method that has been used.}
\item{dist.method}{the distance measure that has been used to build the
tree.}
There is a \code{\link{print}} and a \code{\link{plot}} method for
\code{hclust} objects.
The \code{plclust()} function is basically the same as the plot method,
\code{plot.hclust}, primarily for backward compatibility with S-PLUS. Its
extra arguments are not yet implemented.
}
\details{
This function is a mix of function \code{hclust} and function
\code{distpar}: \code{hclusterpar(x, method = "euclidean", link = "complete")}
gives the same tree as
\code{hclust(dist(x, method = "euclidean"), method = "complete")}.
For more details, see the documentation of \code{hclust} and \code{dist}.
}
\author{
The \code{hclusterpar} function is based on C code
by Antoine Lucas \url{http://mulcyber.toulouse.inra.fr/projects/amap/}.
}
\seealso{
\code{\link{Dist}}, \code{\link{hcluster}}, \code{\link[stats]{hclust}}, \code{\link[stats]{kmeans}}.
}
\references{
Antoine Lucas and Sylvain Jasson, \emph{Using amap and ctc Packages
for Huge Clustering}, R News, 2006, vol 6, issue 5 pages 58-60.
}
\examples{
data(USArrests)

hc <- hclusterpar(USArrests, link = "ave", nbproc = 8)
plot(hc)
plot(hc, hang = -1)

## Check against the non-parallel version: the trees should be the same
hc2 <- hcluster(USArrests, link = "ave")
plot(hc2)
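
## A hedged numerical comparison: the merge heights of the parallel and the
## serial trees are expected to agree, since each pairwise distance is
## computed identically regardless of the number of subprocesses
all.equal(hc$height, hc2$height)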
}
\keyword{multivariate}
\keyword{cluster}
/*! \file distance.c
* \brief all functions required for the R dist function and the C hcluster function.
*
* \date Created: probably in 1995
* \date Last modified: Time-stamp: <2005-10-09 13:12:06 antoine>
*
* \author R core members, and lately: Antoine Lucas
*
*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
* Copyright (C) 1998, 2001 Robert Gentleman, Ross Ihaka and the
* R Development Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <float.h>
#include <R_ext/Arith.h>
#include <R_ext/Error.h>
#include "mva.h"
/** \brief Distance euclidean (i.e. sqrt(sum of square) )
*/
double R_euclidean(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
double dev, dist;
int count, j;
count= 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
dev = (x[i1] - x[i2]);
dist += dev * dev;
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
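/* as in R's dist(): when some columns were skipped because of NAs,
* rescale the accumulated sum so it estimates the full-length distance */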
if(count != nc) dist /= ((double)count/nc);
return sqrt(dist);
}
/** \brief Distance maximum (supremum norm)
*/
double R_maximum(double *x, int nr, int nc, int i1, int i2, int * flag, void ** opt)
{
double dev, dist;
int count, j;
count = 0;
dist = -DBL_MAX;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
dev = fabs(x[i1] - x[i2]);
if(dev > dist)
dist = dev;
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
return dist;
}
/** \brief Distance manhattan
*/
double R_manhattan(double *x, int nr, int nc, int i1, int i2, int * flag, void ** opt)
{
double dist;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
dist += fabs(x[i1] - x[i2]);
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
if(count != nc) dist /= ((double)count/nc);
return dist;
}
/** \brief Distance canberra
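*  \note term-by-term sum of |x_j - y_j| / |x_j + y_j|; columns where both
*  |x_j - y_j| and |x_j + y_j| are essentially zero (<= DBL_MIN) are skipped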
*/
double R_canberra(double *x, int nr, int nc, int i1, int i2, int * flag, void ** opt)
{
double dist, sum, diff;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
sum = fabs(x[i1] + x[i2]);
diff = fabs(x[i1] - x[i2]);
if (sum > DBL_MIN || diff > DBL_MIN) {
dist += diff/sum;
count++;
}
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
if(count != nc) dist /= ((double)count/nc);
return dist;
}
/** \brief Distance binary
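*  \note values are treated as zero / non-zero: returns the number of
*  discordant columns divided by the number of columns with at least one
*  non-zero value (one minus the Jaccard similarity)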
*/
double R_dist_binary(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
int total, count, dist;
int j;
total = 0;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
if(x[i1] || x[i2]){
count++;
if( ! (x[i1] && x[i2]) ) dist++;
}
total++;
}
i1 += nr;
i2 += nr;
}
if(total == 0)
{
*flag = 0;
return NA_REAL;
}
if(count == 0) return 0;
return (double) dist / count;
}
/** \brief Distance "pearson": one minus the uncentered correlation,
*  i.e. 1 - sum(x*y) / sqrt(sum(x^2) * sum(y^2))
* \note Added by A. Lucas
*/
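/*
* verification in R (illustrative, assuming no missing values):
*  Dist(x, method = "pearson")[1]
*  1 - sum(x[1,] * x[2,]) / sqrt(sum(x[1,]^2) * sum(x[2,]^2))
*/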
double R_pearson(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
double num,sum1,sum2, dist;
int count,j;
count= 0;
num = 0;
sum1 = 0;
sum2 = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
num += (x[i1] * x[i2]);
sum1 += (x[i1] * x[i1]);
sum2 += (x[i2] * x[i2]);
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
dist = 1 - ( num / sqrt(sum1 * sum2) );
return dist;
}
/** \brief Distance "correlation": one minus the centered Pearson correlation,
*  i.e. 1 - cor(x, y)
* \note Added by A. Lucas
*/
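/*
* verification in R (illustrative, assuming no missing values):
*  Dist(x, method = "correlation")[1]
*  1 - cor(x[1,], x[2,])
*/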
double R_correlation(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
double num,denum,sumx,sumy,sumxx,sumyy,sumxy;
int count,j;
count= 0;
sumx=0;
sumy=0;
sumxx=0;
sumyy=0;
sumxy=0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
sumxy += (x[i1] * x[i2]);
sumx += x[i1];
sumy += x[i2];
sumxx += x[i1] * x[i1];
sumyy += x[i2] * x[i2];
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
num = sumxy - ( sumx*sumy /count );
denum = sqrt( (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) );
return 1 - (num / denum);
}
/** \brief Spearman distance (rank-based metric)
* \note Added by A. Lucas
*/
double R_spearman(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
int j;
double * data_tri;
int * order_tri;
int * rank_tri;
int n;
double diffrang=0;
/* initialisation of variables */
data_tri = (double * ) opt[0];
order_tri = (int * ) opt[1];
rank_tri = (int * ) opt[2];
for(j = 0 ; j < nc ; j++) {
if(!(R_FINITE(x[i1]) && R_FINITE(x[i2])))
{
*flag = 0;
return NA_REAL;
}
order_tri[j] = rank_tri[j] = j;
order_tri[j+nc] = rank_tri[j+nc] = j;
data_tri[j] = x[i1];
data_tri[j+nc] = x[i2];
i1 += nr;
i2 += nr;
}
n = nc;
/* sort and compute rank */
/* First list */
rsort_rank_order(data_tri, order_tri,rank_tri, &n);
/* Second list */
rsort_rank_order(&(data_tri[nc]),&( order_tri[nc]),&(rank_tri[nc]), &n);
for(j=0;j<nc;j++)
{
diffrang += pow((double) ( rank_tri[j] - rank_tri[j+nc]),2);
}
return( diffrang );
/*
* verification in R:
* Dist(x,method='spearman') ; n =dim(x)[2]
* l=c(x[3,],x[4,]); sum((rank(l[1:n])-rank(l[(n+1):(2*n)]))^2)
* cor.test(x[3,],x[4,],method="spearm")$statistic
*/
}
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY ,PEARSON, CORRELATION, SPEARMAN};
/* == 1,2,..., defined by order in the R function dist */
/**
* R_distance: compute distance. Function called directly by R
* \brief compute distance and call one of function R_euclidean or R_...
* \param x input matrix
* \param nr,nc number of row and columns
* nr individuals with nc values.
* \param d distance half matrix.
* \param diag if we compute diagonal of dist matrix (usually: no).
* \param method 1, 2,... method used
* \param ierr error return; 1 good; 0 missing values
*/
void R_distance(double *x, int *nr, int *nc, double *d, int *diag,
int *method, int * ierr)
{
int dc, i, j, ij;
/* for spearman dist */
void ** opt;
double * data_tri;
int * order_tri;
int * rank_tri;
double (*distfun)(double*, int, int, int, int, int *, void **) = NULL;
switch(*method) {