Commit 21e3d1e8 authored by Antoine Lucas's avatar Antoine Lucas
Browse files

add kendall distance

parent ae1d7dea
......@@ -23,7 +23,7 @@ function(x, centers, iter.max = 10, nstart = 1,
METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
"binary","pearson","correlation")
"binary","pearson","correlation","spearman","kendall")
method <- pmatch(method, METHODS)
if (is.na(method))
stop("invalid distance method")
......
......@@ -10,7 +10,7 @@ Dist <- function(x, method="euclidean", nbproc = 1, diag=FALSE, upper=FALSE)
method <- "euclidean"
METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
"binary","pearson","correlation","spearman")
"binary","pearson","correlation","spearman","kendall")
method <- pmatch(method, METHODS)
if(is.na(method))
stop("invalid distance method")
......
......@@ -20,7 +20,7 @@ hclusterpar <- hcluster <- function (x, method = "euclidean", diag = FALSE, uppe
if (!is.na(pmatch(method, "euclidian")))
method <- "euclidean"
METHODS <- c("euclidean", "maximum", "manhattan", "canberra",
"binary","pearson","correlation","spearman")
"binary","pearson","correlation","spearman","kendall")
method <- pmatch(method, METHODS)
if (is.na(method))
stop("invalid distance method")
......
......@@ -28,8 +28,8 @@ Kmeans(x, centers, iter.max = 10, nstart = 1,
should be chosen?}
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"} or
\code{"correlation"}.
\code{"canberra"}, \code{"binary"}, \code{"pearson"} ,
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
}
\details{
......
......@@ -13,7 +13,7 @@ Dist(x, method = "euclidean", nbproc = 1, diag = FALSE, upper = FALSE)
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"correlation"} or \code{"spearman"}.
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
\item{nbproc}{integer, Number of subprocess for parallelization}
\item{diag}{logical value indicating whether the diagonal of the
......@@ -68,6 +68,12 @@ Dist(x, method = "euclidean", nbproc = 1, diag = FALSE, upper = FALSE)
\code{cor.test(x[i,],x[j,],method="spearman")$statistic}
}
\item{\code{kendall}:}{Compute a distance based on rank.
\eqn{\sum_{i,j} K_{i,j}(x,y)} with \eqn{K_{i,j}(x,y)}
is 0 if \eqn{x_i, x_j} in same order as \eqn{y_i,y_j},
1 if not.
}
}
Missing values are allowed, and are excluded from all computations
......@@ -107,6 +113,10 @@ Dist(x, method = "euclidean", nbproc = 1, diag = FALSE, upper = FALSE)
\references{
Mardia, K. V., Kent, J. T. and Bibby, J. M. (1979)
\emph{Multivariate Analysis.} London: Academic Press.
Wikipedia
\url{http://en.wikipedia.org/wiki/Kendall_tau_distance}
}
\seealso{
\code{\link[cluster]{daisy}} in the \file{cluster} package with more
......
......@@ -19,7 +19,7 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"} \code{"binary"} \code{"pearson"},
\code{"correlation"} or \code{"spearman"}.
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
......
......@@ -8,7 +8,7 @@
\description{Graphics for Principal component Analysis}
\usage{
plot.acp(x,i=1,j=2,text=TRUE,label='Composants',col='darkblue',
main='Individuals PCA',variables=TRUE,...)
main='Individuals PCA',variables=TRUE,labels=NULL,...)
biplot.acp(x,i=1,j=2,label='Composants',col='darkblue',length=0.1,
main='Variables PCA',circle=TRUE,...)
plot2(x,pourcent=FALSE,eigen=TRUE,label='Comp.',col='lightgrey',
......@@ -26,6 +26,7 @@ values}
\item{eigen}{a logical value indicating whether we use eigen values or
standard deviation}
\item{label}{label for X and Y axis}
\item{labels}{labels naming individuals}
\item{col}{Color of plot}
\item{main}{Title of graphic}
\item{ylab}{Y label}
......
This diff is collapsed.
......@@ -2,12 +2,27 @@
#ifndef _AMAP_DISTANCE_TEMPLATE
#define _AMAP_DISTANCE_TEMPLATE 1
/* == 1,2,..., defined by order in the r function dist */
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY ,PEARSON, CORRELATION, SPEARMAN};
template<class T> class distance_T
{
public:
/* == 1,2,..., defined by order in the r function dist */
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY ,PEARSON, CORRELATION, SPEARMAN,
KENDALL};
struct T_tri
{
double * data_tri_x;
int * order_tri_x;
int * rank_tri_x;
double * data_tri_y;
int * order_tri_y;
int * rank_tri_y;
};
private:
struct T_argument
......@@ -36,44 +51,88 @@ template<class T> class distance_T
static void distance(double *x, int *nr, int *nc, T *d, int *diag,
int *method,int *nbprocess, int * ierr);
/** \brief R_distance_kms: compute distance between individual i1 and
* centroid i2
*
* compute distance and call one of function R_euclidean or R_...
* This function is called by kmeans_Lloyd2
*
* \param x input matrix (individuals)
* \param y input matrix (centroids)
* \param nr1,nr2,nc number of row (nr1:x, nr2:y) and columns
* nr individuals with nc values.
* \param i1, i2: indice of individuals (individual i1, centroid i2)
* \param method 1, 2,... method used
* \param ierr for NA 0 if no value can be comuted due to NA
* \param opt optional parameter required for spearman
*/
static T distance_kms(double *x,double *y, int nr1,int nr2,
int nc,int i1,int i2, int *method,
int * ierr, T_tri & opt);
private:
static void* thread_dist(void* arguments);
/** \brief Distance euclidean (i.e. sqrt(sum of square) )
*/
static T R_euclidean(double * x, int nr, int nc, int i1, int i2,int * flag, void ** opt);
static T R_euclidean(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Distance maximum (supremum norm)
*/
static T R_maximum(double * x, int nr, int nc, int i1, int i2, int * flag, void ** opt);
static T R_maximum(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Distance manhattan
*/
static T R_manhattan(double * x, int nr, int nc, int i1, int i2, int * flag, void ** opt);
static T R_manhattan(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Distance canberra
*/
static T R_canberra(double * x, int nr, int nc, int i1, int i2, int * flag, void ** opt);
static T R_canberra(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Distance binary
*/
static T R_dist_binary(double * x, int nr, int nc, int i1, int i2,int * flag, void ** opt);
static T R_dist_binary(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Pearson / Pearson centered (correlation)
* \note Added by A. Lucas
*/
static T R_pearson(double * x, int nr, int nc, int i1, int i2,int * flag, void ** opt);
static T R_pearson(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by A. Lucas
*/
static T R_correlation(double * x, int nr, int nc, int i1, int i2,int * flag, void ** opt);
static T R_correlation(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Spearman distance (rank base metric)
* \note Added by A. Lucas
*/
static T R_spearman(double * x, int nr, int nc, int i1, int i2,int * flag, void ** opt);
static T R_spearman(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
/** \brief Kendall distance (rank base metric)
* \note Added by A. Lucas
*/
static T R_kendall(double * x, double * y , int nr_x, int nr_y, int nc,
int i1, int i2,
int * flag, T_tri & opt);
};
......
/*! \file distance_kms.c
* \brief all functions requiered for C function kmeans_Lloyd2.
*
* \date Created: 2005
* \date Last modified: Time-stamp: <2005-10-09 13:38:54 antoine>
*
* \author Adapted from distance.c (R core members) by Antoine Lucas
*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
* Copyright (C) 1998, 2001 Robert Gentleman, Ross Ihaka and the
* R Development Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <float.h>
#include <R_ext/Arith.h>
#include <R_ext/Error.h>
#include "mva.h"
/** \brief Distance euclidean (i.e. sqrt(sum of square) )
*/
double R_euclidean_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
double dev, dist;
int count, j;
count= 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
dev = (x[i1] - y[i2]);
dist += dev * dev;
count++;
}
i1 += nr1;
i2 += nr2;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return sqrt(dist);
}
/** \brief Distance maximum (supremum norm)
*/
double R_maximum_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
double dev, dist;
int count, j;
count = 0;
dist = -DBL_MAX;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
dev = fabs(x[i1] - y[i2]);
if(dev > dist)
dist = dev;
count++;
}
i1 += nr1;
i2 += nr2;
}
if(count == 0) return NA_REAL;
return dist;
}
/** \brief Distance manhattan
*/
double R_manhattan_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
double dist;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
dist += fabs(x[i1] - y[i2]);
count++;
}
i1 += nr1;
i2 += nr2;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return dist;
}
/** \brief Distance canberra
*/
double R_canberra_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
double dist, sum, diff;
int count, j;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
sum = fabs(x[i1] + y[i2]);
diff = fabs(x[i1] - y[i2]);
if (sum > DBL_MIN || diff > DBL_MIN) {
dist += diff/sum;
count++;
}
}
i1 += nr1;
i2 += nr2;
}
if(count == 0) return NA_REAL;
if(count != nc) dist /= ((double)count/nc);
return dist;
}
/** \brief Distance binary
*/
double R_dist_binary_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
int total, count, dist;
int j;
total = 0;
count = 0;
dist = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
if(x[i1] || y[i2]){
count++;
if( ! (x[i1] && y[i2]) ) dist++;
}
total++;
}
i1 += nr1;
i2 += nr2;
}
if(total == 0) return NA_REAL;
if(count == 0) return 0;
return (double) dist / count;
}
/** \brief Pearson / Pearson centered (correlation)
* \note Added by A. Lucas
*/
double R_pearson_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
double num,sum1,sum2, dist;
int count,j;
count= 0;
num = 0;
sum1 = 0;
sum2 = 0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
num += (x[i1] * y[i2]);
sum1 += (x[i1] * x[i1]);
sum2 += (y[i2] * y[i2]);
count++;
}
i1 += nr1;
i2 += nr2;
}
if(count == 0) return NA_REAL;
dist = 1 - ( num / sqrt(sum1 * sum2) );
return dist;
}
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by A. Lucas
*/
double R_correlation_kms(double *x, double *y, int nr1, int nr2, int nc, int i1, int i2)
{
double num,denum,sumx,sumy,sumxx,sumyy,sumxy;
int count,j;
count= 0;
sumx=0;
sumy=0;
sumxx=0;
sumyy=0;
sumxy=0;
for(j = 0 ; j < nc ; j++) {
if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
sumxy += (x[i1] * y[i2]);
sumx += x[i1];
sumy += y[i2];
sumxx += x[i1] * x[i1];
sumyy += y[i2] * y[i2];
count++;
}
i1 += nr1;
i2 += nr2;
}
if(count == 0) return NA_REAL;
num = sumxy - ( sumx*sumy /count );
denum = sqrt( (sumxx - (sumx*sumx /count ) )* (sumyy - (sumy*sumy /count ) ) );
return 1 - (num / denum);
}
enum { EUCLIDEAN=1, MAXIMUM, MANHATTAN, CANBERRA, BINARY ,PEARSON, CORRELATION};
/* == 1,2,..., defined by order in the R function dist */
/**
* R_distance_kms: compute distance between individual i1 and
* centroid i2
* \brief compute distance and call one of function R_euclidean or R_...
* \brief This function is called by kmeans_Lloyd2
* \param x input matrix (individuals)
* \param y input matrix (centroids)
* \param nr1,nr2,nc number of row (nr1:x, nr2:y) and columns
* nr individuals with nc values.
* \param i1, i2: indice of individuals (individual i1, centroid i2)
* \param method 1, 2,... method used
*/
double R_distance_kms(double *x,double *y, int nr1,int nr2, int nc,int i1,int i2, int *method)
{
/*
* compute distance x[i1,*] - y[i2,*]
* x matrix n x p
* y matrix m x p
* nr1 = n; nr2 = m; nc =p
*/
double res;
double (*distfun)(double*, double*, int, int, int, int, int) = NULL;
switch(*method) {
case EUCLIDEAN:
distfun = R_euclidean_kms;
break;
case MAXIMUM:
distfun = R_maximum_kms;
break;
case MANHATTAN:
distfun = R_manhattan_kms;
break;
case CANBERRA:
distfun = R_canberra_kms;
break;
case BINARY:
distfun = R_dist_binary_kms;
break;
case PEARSON:
distfun = R_pearson_kms;
break;
case CORRELATION:
distfun = R_correlation_kms;
break;
default:
error("distance(): invalid distance");
}
res = distfun(x,y, nr1,nr2, nc, i1, i2);
return( res);
}
......@@ -44,7 +44,6 @@
#include <stdlib.h>
#include <math.h>
#include "mva.h"
#include "hclust.h"
#include "distance.h"
#include "hclust_T.h"
......
......@@ -5,6 +5,7 @@
#include "hclust_T.h"
#include "distance_T.h"
#include "hclust.h"
#include <stdio.h>
namespace hclust_T
......
......@@ -300,7 +300,7 @@ C
C
DOUBLE PRECISION COUTS (N*N) ,
, BORNTH , Z0 , Z , DELTAZ , ZSAVE , ZNEW ,
, FLOAT , ABS
, ABS
C
INTEGER FMBVR , TRIABS , ALLSOL
C
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment