Commit 8ba38d49 authored by Antoine Lucas's avatar Antoine Lucas
Browse files

add centroid2 method

parent 4889bfaf
Package: amap
Version: 0.8-5
Version: 0.8-6
Date: 2010-01-21
Suggests: Biobase
Title: Another Multidimensional Analysis Package
......
## Hierarchical clustering
##
## Created : 18/11/02
## Last Modified : Time-stamp: <2011-11-03 21:50:46 antoine>
## Last Modified : Time-stamp: <2011-11-04 22:55:20 antoine>
##
## This function is a "mix" of function dist and function hclust.
##
......@@ -33,7 +33,7 @@ hclusterpar <- hcluster <- function (x, method = "euclidean", diag = FALSE, uppe
#take from hclust
METHODSLINKS <- c("ward", "single", "complete", "average", "mcquitty",
"median", "centroid")
"median", "centroid","centroid2")
link <- pmatch(link, METHODSLINKS)
if (is.na(link))
......
......@@ -30,7 +30,8 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
be (an unambiguous abbreviation of) one of
\code{"ward"}, \code{"single"}, \code{"complete"},
\code{"average"}, \code{"mcquitty"}, \code{"median"} or
\code{"centroid"}.}
\code{"centroid"},\code{"centroid2"}.}
\item{members}{\code{NULL} or a vector with length size of \code{d}.}
\item{nbproc}{integer, number of subprocess for parallelization [Linux
& Mac only]}
......
......@@ -20,9 +20,8 @@
void R_distance(double *x, int *nr, int *nc, double *d, int *diag, int *method,int *nbprocess, int * ierr)
{
distance_T<double>::distance(x,nr,nc, d,diag,method,
nbprocess, ierr);
nbprocess, ierr,-1);
}
......
......@@ -2,7 +2,7 @@
* \brief all functions requiered for R dist function and C hcluster function.
*
* \date Created: probably in 1995
* \date Last modified: Time-stamp: <2011-11-03 21:32:14 antoine>
* \date Last modified: Time-stamp: <2011-11-04 22:05:55 antoine>
*
* \author R core members, and lately: Antoine Lucas
*
......@@ -38,7 +38,6 @@
#include "distance_T.h"
#include "distance.h"
#include <float.h>
#include <math.h>
#include <stdlib.h>
......@@ -47,7 +46,7 @@
#include <R_ext/Error.h>
#include <limits>
#ifndef WIN32
#ifndef __MINGW_H
#include <pthread.h>
#endif
......@@ -332,7 +331,6 @@ template<class T> T distance_T<T>::R_pearson(double * x, double * y , int nr_x,
num = 0;
sum1 = 0;
sum2 = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
num += (x[i1] * y[i2]);
......@@ -348,14 +346,10 @@ template<class T> T distance_T<T>::R_pearson(double * x, double * y , int nr_x,
*flag = 0;
return NA_REAL;
}
if((sum1 == 0) || (sum2 == 0))
dist = 1; // one vector is null.
else
dist = 1 - ( num / sqrt(sum1 * sum2) );
dist = 1 - ( num / sqrt(sum1 * sum2) );
return dist;
}
/** \brief Absoulute Pearson / Pearson uncentered (correlation)
* \note Added by L. Cerulo
*/
......@@ -385,15 +379,10 @@ template<class T> T distance_T<T>::R_abspearson(double * x, double * y , int nr
*flag = 0;
return NA_REAL;
}
if((sum1 == 0) || (sum2 == 0))
dist = 0; // one vector is null.
else
dist = ( num / sqrt(sum1 * sum2) );
if (dist<0)
{
dist*=-1;
}
dist = ( num / sqrt(sum1 * sum2) );
if (dist<0) {
dist*=-1;
}
return (1-dist);
}
......@@ -406,7 +395,7 @@ template<class T> T distance_T<T>::R_correlation(double * x, double * y , int n
int i1, int i2,
int * flag, T_tri & opt)
{
T num,denum2,sumx,sumy,sumxx,sumyy,sumxy;
T num,denum,sumx,sumy,sumxx,sumyy,sumxy;
int count,j;
count= 0;
......@@ -435,16 +424,10 @@ template<class T> T distance_T<T>::R_correlation(double * x, double * y , int n
return NA_REAL;
}
num = sumxy - ( sumx*sumy /count );
denum2 = (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) ;
if(denum2 <=0) // some apporximations of 0, with floating precision can
// give negative values
return 1;
return 1 - (num / sqrt(denum2));
denum = sqrt( (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) );
return 1 - (num / denum);
}
/** \brief Absolute Distance correlation (Uncentered Pearson)
* \note Added by L. Cerulo
*/
......@@ -492,7 +475,6 @@ template<class T> T distance_T<T>::R_abscorrelation(double * x, double * y , in
return (1-dist);
}
// ---------------------------------------------------------
// Distance Spearman
//
......@@ -739,7 +721,7 @@ template<class T> T distance_T<T>::R_kendall(double * x, double * y , int nr_x,
template<class T> void distance_T<T>::distance(double *x, int *nr,
int *nc, T *d, int *diag,
int *method,int *nbprocess,
int * ierr)
int * ierr,int i2)
{
......@@ -777,12 +759,13 @@ template<class T> void distance_T<T>::distance(double *x, int *nr,
arguments[i].method = method;
arguments[i].nbprocess= *nbprocess;
arguments[i].ierr=ierr;
arguments[i].i2=i2;
}
*ierr = 1; /* res = 1 => no missing values
res = 0 => missings values */
#ifndef WIN32
#ifndef __MINGW_H
pthread_t * th = (pthread_t *) malloc ( *nbprocess * sizeof(pthread_t));
for (i=0; i < *nbprocess ; i++)
......@@ -802,18 +785,70 @@ template<class T> void distance_T<T>::distance(double *x, int *nr,
// p_thread not yet used on windows.
arguments[0].nbprocess = 1;
arguments[0].i2 = i2;
thread_dist((void *)arguments);
#endif
free( arguments );
}
//template<class T> T distance_T<T>::distance(double * x, double * y , int nr_x, int nr_y, int nc,
//
// get the distance function
//
template<class T> void distance_T<T>::getDistfunction(int method,distfunction& distfun)
{
// T (*distfun)(double*,double*,int, int, int, int, int, int *, T_tri &) = NULL;
switch(method)
{
case EUCLIDEAN:
distfun = R_euclidean;
break;
case MAXIMUM:
distfun = R_maximum;
break;
case MANHATTAN:
distfun = R_manhattan;
break;
case CANBERRA:
distfun = R_canberra;
break;
case BINARY:
distfun = R_dist_binary;
break;
case PEARSON:
distfun = R_pearson;
break;
case CORRELATION:
distfun = R_correlation;
break;
case SPEARMAN:
distfun = R_spearman;
break;
case KENDALL:
distfun = R_kendall;
break;
case ABSPEARSON:
distfun = R_abspearson;
break;
case ABSCORRELATION:
distfun = R_abscorrelation;
break;
default:
{
error("distance(): invalid distance");
distfun = R_euclidean;
}
}
}
/** thread_dist function that compute distance.
*
*/
......@@ -829,9 +864,10 @@ template <class T> void* distance_T<T>::thread_dist(void* arguments_void)
/* for spearman dist */
T_tri opt ;
T (*distfun)(double*,double*,int, int, int, int, int, int *, T_tri &) = NULL;
distfunction distfun;
short int no = arguments[0].id;
nr = *arguments[0].nr;
nc = *arguments[0].nc;
......@@ -841,54 +877,10 @@ template <class T> void* distance_T<T>::thread_dist(void* arguments_void)
method = arguments[0].method;
nbprocess = arguments[0].nbprocess;
ierr = arguments[0].ierr;
int i2 = arguments[0].i2;
switch(*method) {
case EUCLIDEAN:
distfun = R_euclidean;
break;
case MAXIMUM:
distfun = R_maximum;
break;
case MANHATTAN:
distfun = R_manhattan;
break;
case CANBERRA:
distfun = R_canberra;
break;
case BINARY:
distfun = R_dist_binary;
break;
case PEARSON:
distfun = R_pearson;
break;
case CORRELATION:
distfun = R_correlation;
break;
case SPEARMAN:
distfun = R_spearman;
break;
case KENDALL:
distfun = R_kendall;
break;
case ABSPEARSON:
distfun = R_abspearson;
break;
case ABSCORRELATION:
distfun = R_abscorrelation;
break;
default:
{
if(no ==0)
error("distance(): invalid distance");
return (void*)0;
}
}
getDistfunction(*method,distfun);
if( (*method == SPEARMAN) || (*method == KENDALL))
......@@ -901,7 +893,7 @@ template <class T> void* distance_T<T>::thread_dist(void* arguments_void)
opt.rank_tri_y = (int * ) malloc ( (nc) * sizeof(int));
if( (opt.data_tri_x == NULL) || (opt.order_tri_x == NULL) || (opt.rank_tri_x == NULL) ||
(opt.data_tri_y == NULL) || (opt.order_tri_y == NULL) || (opt.rank_tri_y == NULL))
error("distance(): unable to alloc memory");
error("distance(): unable to alloc memory");
}
/*
......@@ -917,28 +909,45 @@ template <class T> void* distance_T<T>::thread_dist(void* arguments_void)
int fin = (int) floor(((nr+1.)*nbprocess - sqrt( (nr+1.)*(nr+1.) * nbprocess * nbprocess - (nr+1.)*(nr+1.) * nbprocess * (no+1.) ) )/nbprocess);
//printf("Thread %d debut %d fin %d\n",no,debut,fin);
//printf("Thread %d debut %d fin %d i2=%d met=%d\n",no,debut,fin,i2,*method);
// here: the computation !
// for(j = 0 ; j <= nr ; j++)
for(j = debut ; j < fin ; j++)
if (i2==-1) /* compute all distance matrix*/
{
ij = (2 * (nr-dc) - j +1) * (j) /2 ;
for(i = j+dc ; i < nr ; i++)
for(j = debut ; j < fin ; j++)
{
d[ij++] = distfun(x,x,nr, nr, nc, i, j,ierr,opt);
ij = (2 * (nr-dc) - j +1) * (j) /2 ;
for(i = j+dc ; i < nr ; i++)
{
d[ij++] = distfun(x,x,nr, nr, nc, i, j,ierr,opt);
}
}
}
}
else { /* updates the distance only for i2*/
for(j = debut ; j < fin ; j++)
{
if (i2!=j) {
int ind1 = j+i2*nr-(i2+1)*(i2+2)/2;
if (i2 > j)
ind1 = i2+j*nr-(j+1)*(j+2)/2;
d[ind1] = distfun(x,x,nr, nr, nc, i2, j,ierr,opt);
//printf("updated dist %d %d %f\n",i2,j,d[ind1]);
}
}
}
if( (*method == SPEARMAN) || (*method == KENDALL))
if( (*method == SPEARMAN) || (*method == KENDALL))
{
free(opt.data_tri_x);
free(opt.rank_tri_x);
free(opt.order_tri_x);
free(opt.data_tri_y);
free(opt.rank_tri_y);
free(opt.order_tri_y);
free(opt.data_tri_x);
free(opt.rank_tri_x);
free(opt.order_tri_x);
free(opt.data_tri_y);
free(opt.rank_tri_y);
free(opt.order_tri_y);
}
return (void*)0;
......@@ -979,47 +988,9 @@ template <class T> T distance_T<T>::distance_kms(double *x,double *y, int nr1,in
T res;
T (*distfun)(double*,double*,int, int, int, int, int, int *, T_tri &) = NULL;
// choice of distance
switch(*method) {
case EUCLIDEAN:
distfun = R_euclidean;
break;
case MAXIMUM:
distfun = R_maximum;
break;
case MANHATTAN:
distfun = R_manhattan;
break;
case CANBERRA:
distfun = R_canberra;
break;
case BINARY:
distfun = R_dist_binary;
break;
case PEARSON:
distfun = R_pearson;
break;
case CORRELATION:
distfun = R_correlation;
break;
case SPEARMAN:
distfun = R_spearman;
break;
case KENDALL:
distfun = R_kendall;
break;
case ABSPEARSON:
distfun = R_abspearson;
break;
case ABSCORRELATION:
distfun = R_abscorrelation;
break;
default:
error("distance_kms(): invalid distance");
}
distfunction distfun;
getDistfunction(*method,distfun);
// here: distance computation
res = distfun(x,y, nr1,nr2, nc, i1, i2,ierr, opt);
......
......@@ -23,6 +23,8 @@ template<class T> class distance_T
};
typedef T (* distfunction)(double*,double*,int, int, int, int, int, int *, T_tri &);
private:
/** \brief arguments sent to distance thread */
......@@ -37,6 +39,7 @@ template<class T> class distance_T
int * method;
int nbprocess;
int * ierr;
int i2;
};
......@@ -49,6 +52,10 @@ template<class T> class distance_T
public:
/** return a distance function, depending on method
*/
static void getDistfunction(int method,distfunction & distfun);
/** \brief R_distance compute parallelized distance.
*
* compute distance and call function thread_dist
......@@ -62,10 +69,13 @@ template<class T> class distance_T
* \param method 1, 2,... method used (correspond to the enum)
* \param nbprocess: number of threads to create
* \param ierr error return; 1 good; 0 missing values
* \param i2: if -1: ignored, else, compute
* distance between individual i2 and others
*/
static void distance(double *x, int *nr, int *nc, T *d, int *diag,
int *method,int *nbprocess, int * ierr);
int *method,int *nbprocess, int * ierr,int i2);
/** \brief R_distance_kms: compute distance between individual i1 and
* centroid i2
......@@ -263,7 +273,7 @@ template<class T> class distance_T
int * flag, T_tri & opt);
/** \brief Pearson / Pearson centered (correlation)
* \note Added by A. Lucas
* \note Added by L. Cerulo
*/
static T R_abspearson(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
......
......@@ -3,7 +3,7 @@
* \brief Hierarchical Clustering.
*
* \date Created : 14/11/02
* \date Last Modified : Time-stamp: <2007-10-03 20:33:11 antoine>
* \date Last Modified : Time-stamp: <2011-11-04 22:29:32 antoine>
*
* \author F. Murtagh, ESA/ESO/STECF, Garching, February 1986.
......@@ -72,8 +72,13 @@
*/
void hclust(int *n,int *len, int *iopt ,int *ia , int *ib,int *iorder,double *crit,double *membr,double *diss,int *result)
{
hclust_T::hclust<double>(n,len,iopt ,ia ,ib,iorder,
crit,membr,diss, result);
int nbprocess = 1;
if (*iopt!=hclust_T::CENTROID2)
{
hclust_T::hclust<double>(&nbprocess,NULL,*n,*n,NULL,n,len,iopt ,ia ,ib,iorder,
crit,membr,diss, result);
}
} /* end function hclust */
......
......@@ -2,12 +2,14 @@
#define _AMAP_HCLUST_TEMPLATE_CPP 1
#include "hclust_T.h"
#include "distance_T.h"
#include "hclust.h"
#include "distance_T.h"
#include "distance.h"
#include "hclust_T.h"
#include <stdio.h>
namespace hclust_T
{
......@@ -50,7 +52,7 @@ namespace hclust_T
/*
* Calculate d: distance matrix
*/
distance_T<T>::distance(x,nr,nc,d,diag,method,nbprocess,&flag);
distance_T<T>::distance(x,nr,nc,d,diag,method,nbprocess,&flag,-1);
if(flag == 0)
{
printf("AMAP: Unable to compute Hierarchical Clustering: missing values in distance matrix\n");
......@@ -58,30 +60,39 @@ namespace hclust_T
return;
}
/*
* Hierarchical clustering
*/
hclust_T::hclust<T>(nr,&len,iopt ,ia ,ib,iorder,crit,membr,d,result);
free(d);
hclust_T::hclust<T>(nbprocess,x,*nr,*nc,method,nr,&len,iopt ,ia ,ib,iorder,crit,membr,d,result);
free(d);
*result = 0;
}
template <class T> void hclust(int *n,int *len, int *iopt ,int *ia , int *ib,int *iorder,double *crit,double *membr,T *diss,int *result)
template <class T> void hclust(int * nbprocess,double *mx,int nr, int nc,
int *method,int *n,int *len, int *iopt ,int *ia ,
int *ib,int *iorder,double *crit,double *membr,T *diss,int *result)
{
int im,jm,jj,i,j,ncl,ind,i2,j2,k,ind1,ind2,ind3;
double inf,dmin,x,xx;
int *nn;
int *items = NULL;
double *disnn;
short int *flag;
int *iia;
int *iib;
int count,h,idx1,idxitem1,idx2;
*result = 1;
nn = (int*) malloc (*n * sizeof(int));
if(*iopt==CENTROID2)
{
items = (int*) malloc (*n*nc * sizeof(int));
}
disnn = (double*) malloc (*n * sizeof(double));
flag = (short int*) malloc (*n * sizeof(short int));
if(nn == NULL || disnn == NULL || flag == NULL )
......@@ -94,7 +105,18 @@ namespace hclust_T
/* Initialisation */
for ( i=0; i<*n ; i++)
flag[i]=1;
{
flag[i]=1;
idxitem1=i;
if(items != NULL)
{
for ( j=0; j<nc ; j++)
{
items[idxitem1]=1; /* only one item per cluster and per coordinate for iopt==8*/
idxitem1+=nr;
}
}
}
ncl=*n;
inf= 1e20;
......@@ -126,11 +148,13 @@ namespace hclust_T
/*
* Next, determine least diss. using list of NNs
*/
dmin = inf;
for ( i=0; i<(*n-1) ; i++)
{
if( flag[i] )
{
if (disnn[i] < dmin )
{
dmin = disnn[i];
......@@ -139,6 +163,7 @@ namespace hclust_T
}
}
}
ncl = ncl-1;
/*
......@@ -157,6 +182,69 @@ namespace hclust_T
*/
flag[j2]=0;
dmin=inf;
jj=0;
/*
* Cluster3 CENTROID METHOD - IOPT=8.
*/
if (*iopt==CENTROID2)
{
/* compute centroind coordinates*/
idx1=i2;
idx2=j2;
ind3=ioffst(*n,i2,j2);
//printf("Aggregate %d-%d %d-%d (method=%d, dmin=%f (%f))\n",i2,j2,im,jm,*method,dmin,diss[ind3]);
for(h = 0 ; h< nc ; h++)
{
if(R_FINITE(mx[idx1]) && R_FINITE(mx[idx2]))
{
mx[idx1] = (