Commit 4889bfaf authored by Antoine Lucas's avatar Antoine Lucas
Browse files

add 'absolute' distances

parent 06b2f8cf
......@@ -23,7 +23,7 @@ function(x, centers, iter.max = 10, nstart = 1,
METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
"binary","pearson","correlation","spearman","kendall")
"binary","pearson","correlation","spearman","kendall","abspearson","abscorrelation")
method <- pmatch(method, METHODS)
if (is.na(method))
stop("invalid distance method")
......
......@@ -10,7 +10,7 @@ Dist <- function(x, method="euclidean", nbproc = 2, diag=FALSE, upper=FALSE)
method <- "euclidean"
METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
"binary","pearson","correlation","spearman","kendall")
"binary","pearson","correlation","spearman","kendall","abspearson","abscorrelation")
method <- pmatch(method, METHODS)
if(is.na(method))
stop("invalid distance method")
......
## Hierarchical clustering
##
## Created : 18/11/02
## Last Modified : Time-stamp: <2005-10-01 20:14:25 antoine>
## Last Modified : Time-stamp: <2011-11-03 21:50:46 antoine>
##
## This function is a "mix" of function dist and function hclust.
##
......@@ -20,7 +20,8 @@ hclusterpar <- hcluster <- function (x, method = "euclidean", diag = FALSE, uppe
if (!is.na(pmatch(method, "euclidian")))
method <- "euclidean"
METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
"binary","pearson","correlation","spearman","kendall")
"binary","pearson","correlation","spearman","kendall",
"abspearson","abscorrelation")
method <- pmatch(method, METHODS)
if (is.na(method))
stop("invalid distance method")
......
......@@ -29,6 +29,7 @@ Kmeans(x, centers, iter.max = 10, nstart = 1,
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"} ,
\code{"abspearson"} , \code{"abscorrelation"},
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
}
......@@ -77,6 +78,7 @@ plot(x, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)
Kmeans(x, 5,nstart = 25, method="abscorrelation")
}
......
......@@ -13,7 +13,8 @@ Dist(x, method = "euclidean", nbproc = 2, diag = FALSE, upper = FALSE)
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
\code{"abspearson"}, \code{"correlation"},
\code{"abscorrelation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
\item{nbproc}{integer, Number of subprocess for parallelization}
\item{diag}{logical value indicating whether the diagonal of the
......@@ -53,12 +54,31 @@ Dist(x, method = "euclidean", nbproc = 2, diag = FALSE, upper = FALSE)
\item{\code{pearson}:}{Also named "not centered Pearson"
\eqn{1 - \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2 %
\sum_i y_i^2}}}{%
sum(x_i y_i) / sqrt [sum(x_i^2) sum(y_i^2)]}.
1 - sum(x_i y_i) / sqrt [sum(x_i^2) sum(y_i^2)]}.
}
\item{\code{abspearson}:}{Absolute Pearson
\eqn{1 - \left| \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2 %
\sum_i y_i^2}} \right| }{%
1 - |sum(x_i y_i) / sqrt [sum(x_i^2) sum(y_i^2)] |}.
}
\item{\code{correlation}:}{Also named "Centered Pearson"
\eqn{1 - corr(x,y)}.
}
\item{\code{abscorrelation}:}{Absolute correlation
\eqn{1 - | corr(x,y) |}
with
\eqn{ corr(x,y) = \frac{\sum_i x_i y_i -\frac1n \sum_i x_i \sum_i%
y_i}{% frac: 2nd part
\sqrt{\left(\sum_i x_i^2 -\frac1n \left( \sum_i x_i \right)^2 %
\right)%
\left( \sum_i y_i^2 -\frac1n \left( \sum_i y_i \right)^2 %
\right)} }}.
}
\item{\code{spearman}:}{Compute a distance based on rank.
\eqn{\sum(d_i^2)}{sum (d_i^2)} where \eqn{d_i} is the difference
in rank between \eqn{x_i} and \eqn{y_i}.
......@@ -136,6 +156,10 @@ Dist(x, upper = TRUE)
## compute dist with 8 threads
Dist(x,nbproc=8)
Dist(x,method="abscorrelation")
Dist(x,method="kendall")
}
\keyword{multivariate}
\keyword{cluster}
......@@ -18,8 +18,9 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
}
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"} \code{"binary"} \code{"pearson"},
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"abspearson"}, \code{"correlation"},
\code{"abscorrelation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
......@@ -85,7 +86,7 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
= hclust(dist(x, method = "euclidean"),method = "complete"))}
It use twice less memory, as it doesn't store distance matrix.
For more details, see documentation of \code{hclust} and \code{dist}.
For more details, see documentation of \code{hclust} and \code{Dist}.
}
......@@ -144,10 +145,16 @@ hc <- hcluster(USArrests,method = "bin",link = "mcquitty", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "pea",link = "median", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "abspea",link = "median", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "cor",link = "centroid", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "abscor",link = "centroid", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "spe",link = "complete", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "ken",link = "complete", nbproc= 2,
doubleprecision = FALSE)
......
......@@ -2,7 +2,7 @@
* \brief all functions requiered for R dist function and C hcluster function.
*
* \date Created: probably in 1995
* \date Last modified: Time-stamp: <2010-01-21 18:42:14 antoine>
* \date Last modified: Time-stamp: <2011-11-03 21:32:14 antoine>
*
* \author R core members, and lately: Antoine Lucas
*
......@@ -348,11 +348,57 @@ template<class T> T distance_T<T>::R_pearson(double * x, double * y , int nr_x,
*flag = 0;
return NA_REAL;
}
dist = 1 - ( num / sqrt(sum1 * sum2) );
if((sum1 == 0) || (sum2 == 0))
dist = 1; // one vector is null.
else
dist = 1 - ( num / sqrt(sum1 * sum2) );
return dist;
}
/** \brief Absoulute Pearson / Pearson uncentered (correlation)
* \note Added by L. Cerulo
*/
template<class T> T distance_T<T>::R_abspearson(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt)
{
T num,sum1,sum2, dist;
int count,j;
count= 0;
num = 0;
sum1 = 0;
sum2 = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
num += (x[i1] * y[i2]);
sum1 += (x[i1] * x[i1]);
sum2 += (y[i2] * y[i2]);
count++;
}
i1 += nr_x;
i2 += nr_y;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
if((sum1 == 0) || (sum2 == 0))
dist = 0; // one vector is null.
else
dist = ( num / sqrt(sum1 * sum2) );
if (dist<0)
{
dist*=-1;
}
return (1-dist);
}
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by A. Lucas
*/
......@@ -360,7 +406,7 @@ template<class T> T distance_T<T>::R_correlation(double * x, double * y , int n
int i1, int i2,
int * flag, T_tri & opt)
{
T num,denum,sumx,sumy,sumxx,sumyy,sumxy;
T num,denum2,sumx,sumy,sumxx,sumyy,sumxy;
int count,j;
count= 0;
......@@ -389,10 +435,64 @@ template<class T> T distance_T<T>::R_correlation(double * x, double * y , int n
return NA_REAL;
}
num = sumxy - ( sumx*sumy /count );
denum = sqrt( (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) );
return 1 - (num / denum);
denum2 = (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) ;
if(denum2 <=0) // some apporximations of 0, with floating precision can
// give negative values
return 1;
return 1 - (num / sqrt(denum2));
}
/** \brief Absolute Distance correlation (Uncentered Pearson)
* \note Added by L. Cerulo
*/
template<class T> T distance_T<T>::R_abscorrelation(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt)
{
T num,denum,sumx,sumy,sumxx,sumyy,sumxy,dist,term;
int count,j;
count= 0;
sumx=0;
sumy=0;
sumxx=0;
sumyy=0;
sumxy=0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
sumxy += (x[i1] * y[i2]);
sumx += x[i1];
sumy += y[i2];
sumxx += x[i1] * x[i1];
sumyy += y[i2] * y[i2];
count++;
}
i1 += nr_x;
i2 += nr_y;
}
if(count == 0)
{
*flag = 0;
return NA_REAL;
}
num = sumxy - ( sumx*sumy /count );
term=(sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) );
if (term<=0) return 1;
denum = sqrt( term );
dist=num/denum;
if (dist<0) {
dist*=-1;
}
return (1-dist);
}
// ---------------------------------------------------------
// Distance Spearman
//
......@@ -773,10 +873,21 @@ template <class T> void* distance_T<T>::thread_dist(void* arguments_void)
case KENDALL:
distfun = R_kendall;
break;
case ABSPEARSON:
distfun = R_abspearson;
break;
case ABSCORRELATION:
distfun = R_abscorrelation;
break;
default:
error("distance(): invalid distance");
{
if(no ==0)
error("distance(): invalid distance");
return (void*)0;
}
}
......@@ -899,9 +1010,15 @@ template <class T> T distance_T<T>::distance_kms(double *x,double *y, int nr1,in
case KENDALL:
distfun = R_kendall;
break;
case ABSPEARSON:
distfun = R_abspearson;
break;
case ABSCORRELATION:
distfun = R_abscorrelation;
break;
default:
error("distance(): invalid distance");
error("distance_kms(): invalid distance");
}
// here: distance computation
......
......@@ -9,8 +9,7 @@ template<class T> class distance_T
public:
/* == 1,2,..., defined by order in the r function dist */
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY ,PEARSON, CORRELATION, SPEARMAN,
KENDALL};
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY ,PEARSON, CORRELATION, SPEARMAN, KENDALL, ABSPEARSON, ABSCORRELATION};
struct T_tri
......@@ -263,6 +262,19 @@ template<class T> class distance_T
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Pearson / Pearson centered (correlation)
* \note Added by A. Lucas
*/
static T R_abspearson(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by L. Cerulo
*/
static T R_abscorrelation(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Spearman distance (rank base metric)
*
......
......@@ -4,7 +4,7 @@
* \brief K-means clustering
*
* \date Created : before 2005
* \date Last Modified : Time-stamp: <2009-10-11 09:45:13 antoine>
* \date Last Modified : Time-stamp: <2011-11-03 22:19:14 antoine>
*
* \author R core team. Modified by A. Lucas for distance choice.
*
......@@ -62,9 +62,9 @@ void kmeans_Lloyd2(double *x, int *pn, int *pp, double *cen, int *pk, int *cl,
Rboolean updated;
distance_T<double>::T_tri opt;
int ierr[1];
double * data_tri;
int * order_tri;
int * rank_tri;
//double * data_tri;
//int * order_tri;
//int * rank_tri;
if( (*method == distance_T<double>::SPEARMAN) || (*method == distance_T<double>::KENDALL))
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment