Commit d77f8fea authored by Antoine Lucas

distance and hierarchical clustering with templates => computation with float precision available

parent 4bdd00ae
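In user-facing terms, this commit adds an nbproc argument to Dist and nbproc / doubleprecision arguments to hcluster, while the distance code is moved to C++ templates so it can be instantiated for float as well as double. A minimal R sketch of the new arguments (the argument values are illustrative):

    library(amap)
    x <- matrix(rnorm(100), nrow = 5)
    ## distance matrix computed with 8 subprocesses
    d <- Dist(x, method = "euclidean", nbproc = 8)
    ## clustering with 2 subprocesses, distances kept in single precision
    hc <- hcluster(USArrests, method = "euclidean", link = "complete",
                   nbproc = 2, doubleprecision = FALSE)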
......@@ -58,32 +58,6 @@ W <- function(x,h,D=NULL,kernel="gaussien")
}
WsansC <- function(x,h,D=NULL,kernel="gaussien")
{
x <- as.matrix(x)
n <- dim(x)[1]
p <- dim(x)[2]
if (is.null(D)) {
D <- diag(1,p)
}
som <- 0
result <- 0
for(i in 1:(n-1) )
{
for(j in (i+1):n)
{
Delta <- x[i,]-x[j,]
norm <- sqrt(t(Delta) %*%D %*%Delta)
k <- K ( norm /h,kernel) # K ( |Delta|/h )
k <- as.numeric(k)
som <- som + k
result <- result + k * Delta %*% t(Delta)
}
}
result / som
}
varrob <- function(x,h,D=NULL,kernel="gaussien")
{
x <- as.matrix(x)
......@@ -120,32 +94,6 @@ varrob <- function(x,h,D=NULL,kernel="gaussien")
}
varrobsansC <- function(x,h,D=NULL,kernel="gaussien")
{
n <- dim(x)[1]
p <- dim(x)[2]
if (is.null(D)) {
D <- diag(1,p)
}
x <- as.matrix(x)
x <- scale(x ,center = TRUE, scale = FALSE)
som <- 0
res <- 0
for (i in 1:n )
{
k <- K ( sqrt(t(x[i,]) %*%D %*% x[i,]) /h,kernel)
k <- as.numeric(k)
res <- res + k * x[i,] %*% t( x[i,])
som <- som + k
}
S <- res / som
Sinv <- solve(S)
solve ( Sinv - D / h )
}
acpgen <- function(x,h1,h2,center=TRUE,reduce=TRUE,kernel="gaussien")
{
# CENTRONS, ET REDUISONS
......
Dist <- function(x, method="euclidean", diag=FALSE, upper=FALSE)
Dist <- function(x, method="euclidean", nbproc = 1, diag=FALSE, upper=FALSE)
{
if(class(x) == "exprSet")
......@@ -25,6 +25,7 @@ Dist <- function(x, method="euclidean", diag=FALSE, upper=FALSE)
d = double(N*(N - 1)/2),
diag = as.integer(FALSE),
method= as.integer(method),
nbproc = as.integer(nbproc),
ierr=as.integer(0),
DUP = FALSE,
NAOK=TRUE,
......
......@@ -10,7 +10,7 @@
hcluster <- function (x, method = "euclidean", diag = FALSE, upper = FALSE, link = "complete", members = NULL)
hcluster <- function (x, method = "euclidean", diag = FALSE, upper = FALSE, link = "complete", members = NULL, nbproc = 2, doubleprecision = TRUE)
{
if(class(x) == "exprSet")
......@@ -46,6 +46,10 @@ hcluster <- function (x, method = "euclidean", diag = FALSE, upper = FALSE, link
if (length(members) != N)
stop("Invalid length of members")
n <- N
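## precision flag passed to the C code: 2 = double precision, otherwise single (float)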
precision <- 1
if(doubleprecision)
precision <- 2
hcl <- .C("hcluster",
x = as.double(x),
......@@ -59,6 +63,8 @@ hcluster <- function (x, method = "euclidean", diag = FALSE, upper = FALSE, link
order = integer(n),
crit = double(n),
members = as.double(members),
nbprocess = as.integer(nbproc),
precision = as.integer(precision),
res = as.integer (1),
DUP = FALSE,
NAOK=TRUE,
......
......@@ -2,7 +2,7 @@
\alias{acpgen}
\alias{K}
\alias{W}
\alias{WsansC}
\title{Generalised principal component analysis}
......@@ -12,7 +12,6 @@
acpgen(x,h1,h2,center=TRUE,reduce=TRUE,kernel="gaussien")
K(u,kernel="gaussien")
W(x,h,D=NULL,kernel="gaussien")
WsansC(x,h,D=NULL,kernel="gaussien")
}
\arguments{
......
\name{Dist}
\title{Distance Matrix Computation}
\usage{
Dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
Dist(x, method = "euclidean", nbproc = 1, diag = FALSE, upper = FALSE)
}
\alias{Dist}
......@@ -15,6 +15,7 @@ Dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"correlation"} or \code{"spearman"}.
Any unambiguous substring can be given.}
\item{nbproc}{integer: number of subprocesses used to parallelize the computation.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
\item{upper}{logical value indicating whether the upper triangle of the
......@@ -111,7 +112,7 @@ Dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
\code{\link[cluster]{daisy}} in the \file{cluster} package with more
possibilities in the case of \emph{mixed} (continuous / categorical)
variables.
\code{\link[stats]{dist}} \code{\link{hclust}}.
\code{\link[stats]{dist}} \code{\link{hcluster}}.
}
\examples{
x <- matrix(rnorm(100), nrow=5)
......@@ -119,6 +120,10 @@ Dist(x)
Dist(x, diag = TRUE)
Dist(x, upper = TRUE)
## compute dist with 8 threads
Dist(x,nbproc=8)
}
\keyword{multivariate}
\keyword{cluster}
......@@ -6,7 +6,8 @@
}
\usage{
hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
link = "complete", members = NULL)
link = "complete", members = NULL, nbproc = 2,
doubleprecision = TRUE)
}
\arguments{
\item{x}{
......@@ -29,6 +30,10 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
\code{"average"}, \code{"mcquitty"}, \code{"median"} or
\code{"centroid"}.}
\item{members}{\code{NULL} or a vector with length size of \code{d}.}
\item{nbproc}{integer: number of subprocesses used to parallelize the computation.}
\item{doubleprecision}{logical. If \code{TRUE}, the distance matrix is
computed in double precision; if \code{FALSE}, single precision is used.}
}
\value{
An object of class \bold{hclust} which describes the
......@@ -87,7 +92,7 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
by Antoine Lucas \url{http://mulcyber.toulouse.inra.fr/projects/amap/}.
}
\seealso{
\code{\link{Dist}}, \code{\link{hclusterpar}},
\code{\link{Dist}},
\code{\link[stats]{hclust}}, \code{\link[stats]{kmeans}}.
}
......@@ -118,6 +123,29 @@ opar <- par(mfrow = c(1, 2))
plot(hc, labels = FALSE, hang = -1, main = "Original Tree")
plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters")
par(opar)
## other combinations are possible
hc <- hcluster(USArrests,method = "euc",link = "ward", nbproc= 1,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "max",link = "single", nbproc= 2,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "man",link = "complete", nbproc= 1,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "can",link = "average", nbproc= 2,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "bin",link = "mcquitty", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "pea",link = "median", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "cor",link = "centroid", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "spe",link = "complete", nbproc= 2,
doubleprecision = FALSE)
}
\keyword{multivariate}
\keyword{cluster}
\name{VarRob}
\alias{varrob}
\alias{varrobsansC}
\title{Robust variance}
\description{Compute a robust variance}
\usage{
varrob(x,h,D=NULL,kernel="gaussien")
varrobsansC(x,h,D=NULL,kernel="gaussien")
}
\arguments{
......
#include "distance.h"
#include "distance_template.h"
/**
 * R_distance: compute parallelized distance. Function called directly by R
 * \brief compute distance and call function thread_dist,
 * which calls one of the functions R_euclidean, R_...
 * \param x input matrix
 * \param nr,nc number of rows and columns:
 * nr individuals with nc values.
 * \param d distance half matrix.
 * \param diag whether to compute the diagonal of the dist matrix (usually: no).
 * \param method 1, 2,... method used
 * \param nbprocess: number of threads to create
 * \param ierr error return; 1: ok; 0: missing values
*/
void R_distance(double *x, int *nr, int *nc, double *d, int *diag, int *method,int *nbprocess, int * ierr)
{
#ifndef __MINGW_H
if(*nbprocess != 1)
amaptemplate::distancepar<double>(x,nr,nc, d,diag,method,
nbprocess, ierr);
else
#endif
amaptemplate::T_distance<double>(x,nr, nc, d, diag,
method, ierr);
}
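/* Note: the threaded distancepar path is compiled out under MinGW (Windows),
 * so there the sequential T_distance is always used, whatever value of
 * *nbprocess (the R-side nbproc argument) is given. */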
/**
* Sort, and return order and rank
* \brief This function is used by R_spearman.
 * order and rank must be initialised with 0:(n-1).
 * Sorts x in place and returns, 0-based,
 * order = order(x) - 1
 * rank  = rank(x) - 1
*
*/
void rsort_rank_order(double *x, int *order, int *rank, int * n)
{
double v;
int i, j, h, iv;
for (h = 1; h <= *n / 9; h = 3 * h + 1);
for (; h > 0; h /= 3)
for (i = h; i < *n; i++) {
v = x[i]; iv = order[i];
j = i;
while (j >= h && (x[j - h] > v))
{
x[j] = x[j - h]; order[j] = order[j-h];
rank[order[j]] = j;
j -= h;
}
x[j] = v; order[j] = iv;
rank[order[j]] = j;
}
}
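/* Example (0-based): for x = {3.1, 1.2, 2.5} with order and rank initialised
 * to {0, 1, 2}, the call leaves x sorted as {1.2, 2.5, 3.1},
 * order = {1, 2, 0} and rank = {2, 0, 1}. */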
extern "C"
{
/**
 * R_distance: compute parallelized distance. Function called directly by R
 * \brief compute distance and call function thread_dist,
 * which calls one of the functions R_euclidean, R_...
 * \param x input matrix
 * \param nr,nc number of rows and columns:
 * nr individuals with nc values.
 * \param d distance half matrix.
 * \param diag whether to compute the diagonal of the dist matrix (usually: no).
 * \param method 1, 2,... method used
 * \param nbprocess: number of threads to create
 * \param ierr error return; 1: ok; 0: missing values
*/
void R_distance(double *x, int *nr, int *nc, double *d, int *diag,
int *method,int * nbprocess, int * ierr);
void rsort_rank_order(double *x, int *order, int *rank, int * n);
}
/*! \file distance.c
 * \brief all functions required for the R dist function and the C hcluster function.
*
* \date Created: probably in 1995
* \date Last modified: Time-stamp: <2005-10-09 13:12:06 antoine>
*
* \author R core members, and lately: Antoine Lucas
*
*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
* Copyright (C) 1998, 2001 Robert Gentleman, Ross Ihaka and the
* R Development Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#define _AMAP_DISTANCE_TEMPLATE_CPP 1
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include "distance_template.h"
#include "distance.h"
#include <float.h>
#include <R_ext/Arith.h>
#include <R_ext/Error.h>
#include "mva.h"
#include <limits>
#include <pthread.h>
#define MAX( A , B ) ( ( A ) > ( B ) ? ( A ) : ( B ) )
#define MIN( A , B ) ( ( A ) < ( B ) ? ( A ) : ( B ) )
namespace amaptemplate
{
/** \brief Euclidean distance (i.e. sqrt(sum of squares))
*/
template <class T> T R_euclidean(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
T dev, dist;
int count, j;
count= 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
dev = (x[i1] - x[i2]);
dist += dev * dev;
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
if(count != nc) dist /= ((T)count/nc); /* rescale when some coordinates were skipped as non-finite */
return sqrt(dist);
}
/** \brief Maximum distance (supremum norm)
*/
template <class T> T R_maximum(double *x, int nr, int nc, int i1, int i2, int * flag, void ** opt)
{
T dev, dist;
int count, j;
count = 0;
dist = std::numeric_limits<T>::min();
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
dev = fabs(x[i1] - x[i2]);
if(dev > dist)
dist = dev;
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
return dist;
}
/** \brief Manhattan distance
*/
template< class T> T R_manhattan(double *x, int nr, int nc, int i1, int i2, int * flag, void ** opt)
{
T dist;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
dist += fabs(x[i1] - x[i2]);
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
if(count != nc) dist /= ((T)count/nc);
return dist;
}
/** \brief Canberra distance
*/
template <class T> T R_canberra(double *x, int nr, int nc, int i1, int i2, int * flag, void ** opt)
{
T dist, sum, diff;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
sum = fabs(x[i1] + x[i2]);
diff = fabs(x[i1] - x[i2]);
if (sum > DBL_MIN || diff > DBL_MIN) {
dist += diff/sum;
count++;
}
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
if(count != nc) dist /= ((T)count/nc);
return dist;
}
/** \brief Binary distance
*/
template <class T> T R_dist_binary(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
int total, count, dist;
int j;
total = 0;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
if(x[i1] || x[i2]){
count++;
if( ! (x[i1] && x[i2]) ) dist++;
}
total++;
}
i1 += nr;
i2 += nr;
}
if(total == 0)
{
*flag = 0;
return NA_REAL;
}
if(count == 0) return 0;
return (T) dist / count;
}
/** \brief Pearson distance, i.e. 1 - r where r is the uncentered correlation
* \note Added by A. Lucas
*/
template<class T> T R_pearson(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
T num,sum1,sum2, dist;
int count,j;
count= 0;
num = 0;
sum1 = 0;
sum2 = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
num += (x[i1] * x[i2]);
sum1 += (x[i1] * x[i1]);
sum2 += (x[i2] * x[i2]);
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
dist = 1 - ( num / sqrt(sum1 * sum2) );
return dist;
}
/** \brief Correlation distance, i.e. 1 - r where r is the centered Pearson correlation
* \note Added by A. Lucas
*/
template<class T> T R_correlation(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
T num,denum,sumx,sumy,sumxx,sumyy,sumxy;
int count,j;
count= 0;
sumx=0;
sumy=0;
sumxx=0;
sumyy=0;
sumxy=0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
sumxy += (x[i1] * x[i2]);
sumx += x[i1];
sumy += x[i2];
sumxx += x[i1] * x[i1];
sumyy += x[i2] * x[i2];
count++;
}
i1 += nr;
i2 += nr;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
num = sumxy - ( sumx*sumy /count );
denum = sqrt( (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) );
return 1 - (num / denum);
}
/** \brief Spearman distance (rank-based metric)
* \note Added by A. Lucas
*/
template <class T> T R_spearman(double *x, int nr, int nc, int i1, int i2,int * flag, void ** opt)
{
int j;
double * data_tri;
int * order_tri;
int * rank_tri;
int n;
T diffrang=0;
/* initialisation of variables */
data_tri = (double * ) opt[0];
order_tri = (int * ) opt[1];
rank_tri = (int * ) opt[2];
for(j = 0 ; j < nc ; j++) {
if(!(R_FINITE(x[i1]) && R_FINITE(x[i2])))
{
*flag = 0;
return NA_REAL;
}
order_tri[j] = rank_tri[j] = j;