Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sylvain Jasson
amap
Commits
4889bfaf
Commit
4889bfaf
authored
Nov 03, 2011
by
Antoine Lucas
Browse files
add 'absolute' distances
parent
06b2f8cf
Changes
9
Hide whitespace changes
Inline
Side-by-side
R/Kmeans.R
View file @
4889bfaf
...
...
@@ -23,7 +23,7 @@ function(x, centers, iter.max = 10, nstart = 1,
METHODS
<-
c
(
"euclidean"
,
"maximum"
,
"manhattan"
,
"canberra"
,
"binary"
,
"pearson"
,
"correlation"
,
"spearman"
,
"kendall"
)
"binary"
,
"pearson"
,
"correlation"
,
"spearman"
,
"kendall"
,
"abspearson"
,
"abscorrelation"
)
method
<-
pmatch
(
method
,
METHODS
)
if
(
is.na
(
method
))
stop
(
"invalid distance method"
)
...
...
R/dist.R
View file @
4889bfaf
...
...
@@ -10,7 +10,7 @@ Dist <- function(x, method="euclidean", nbproc = 2, diag=FALSE, upper=FALSE)
method
<-
"euclidean"
METHODS
<-
c
(
"euclidean"
,
"maximum"
,
"manhattan"
,
"canberra"
,
"binary"
,
"pearson"
,
"correlation"
,
"spearman"
,
"kendall"
)
"binary"
,
"pearson"
,
"correlation"
,
"spearman"
,
"kendall"
,
"abspearson"
,
"abscorrelation"
)
method
<-
pmatch
(
method
,
METHODS
)
if
(
is.na
(
method
))
stop
(
"invalid distance method"
)
...
...
R/hcluster.R
View file @
4889bfaf
## Hierarchical clustering
##
## Created : 18/11/02
## Last Modified : Time-stamp: <20
05
-1
0
-0
1
2
0:14:25
antoine>
## Last Modified : Time-stamp: <20
11
-1
1
-0
3
2
1:50:46
antoine>
##
## This function is a "mix" of function dist and function hclust.
##
...
...
@@ -20,7 +20,8 @@ hclusterpar <- hcluster <- function (x, method = "euclidean", diag = FALSE, uppe
if
(
!
is.na
(
pmatch
(
method
,
"euclidian"
)))
method
<-
"euclidean"
METHODS
<-
c
(
"euclidean"
,
"maximum"
,
"manhattan"
,
"canberra"
,
"binary"
,
"pearson"
,
"correlation"
,
"spearman"
,
"kendall"
)
"binary"
,
"pearson"
,
"correlation"
,
"spearman"
,
"kendall"
,
"abspearson"
,
"abscorrelation"
)
method
<-
pmatch
(
method
,
METHODS
)
if
(
is.na
(
method
))
stop
(
"invalid distance method"
)
...
...
man/Kmeans.Rd
View file @
4889bfaf
...
...
@@ -29,6 +29,7 @@ Kmeans(x, centers, iter.max = 10, nstart = 1,
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"} ,
\code{"abspearson"} , \code{"abscorrelation"},
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
}
...
...
@@ -77,6 +78,7 @@ plot(x, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)
Kmeans(x, 5,nstart = 25, method="abscorrelation")
}
...
...
man/dist.Rd
View file @
4889bfaf
...
...
@@ -13,7 +13,8 @@ Dist(x, method = "euclidean", nbproc = 2, diag = FALSE, upper = FALSE)
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
\code{"abspearson"}, \code{"correlation"},
\code{"abscorrelation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
\item{nbproc}{integer, Number of subprocess for parallelization}
\item{diag}{logical value indicating whether the diagonal of the
...
...
@@ -53,12 +54,31 @@ Dist(x, method = "euclidean", nbproc = 2, diag = FALSE, upper = FALSE)
\item{\code{pearson}:}{Also named "not centered Pearson"
\eqn{1 - \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2 %
\sum_i y_i^2}}}{%
sum(x_i y_i) / sqrt [sum(x_i^2) sum(y_i^2)]}.
1 - sum(x_i y_i) / sqrt [sum(x_i^2) sum(y_i^2)]}.
}
\item{\code{abspearson}:}{Absolute Pearson
\eqn{1 - \left| \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2 %
\sum_i y_i^2}} \right| }{%
1 - |sum(x_i y_i) / sqrt [sum(x_i^2) sum(y_i^2)] |}.
}
\item{\code{correlation}:}{Also named "Centered Pearson"
\eqn{1 - corr(x,y)}.
}
\item{\code{abscorrelation}:}{Absolute correlation
\eqn{1 - | corr(x,y) |}
with
\eqn{ corr(x,y) = \frac{\sum_i x_i y_i -\frac1n \sum_i x_i \sum_i%
y_i}{% frac: 2nd part
\sqrt{\left(\sum_i x_i^2 -\frac1n \left( \sum_i x_i \right)^2 %
\right)%
\left( \sum_i y_i^2 -\frac1n \left( \sum_i y_i \right)^2 %
\right)} }}.
}
\item{\code{spearman}:}{Compute a distance based on rank.
\eqn{\sum(d_i^2)}{sum (d_i^2)} where \eqn{d_i} is the difference
in rank between \eqn{x_i} and \eqn{y_i}.
...
...
@@ -136,6 +156,10 @@ Dist(x, upper = TRUE)
## compute dist with 8 threads
Dist(x,nbproc=8)
Dist(x,method="abscorrelation")
Dist(x,method="kendall")
}
\keyword{multivariate}
\keyword{cluster}
man/hcluster.Rd
View file @
4889bfaf
...
...
@@ -18,8 +18,9 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
}
\item{method}{the distance measure to be used. This must be one of
\code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
\code{"canberra"} \code{"binary"} \code{"pearson"},
\code{"correlation"}, \code{"spearman"} or \code{"kendall"}.
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"abspearson"}, \code{"correlation"},
\code{"abscorrelation"}, \code{"spearman"} or \code{"kendall"}.
Any unambiguous substring can be given.}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
...
...
@@ -85,7 +86,7 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
= hclust(dist(x, method = "euclidean"),method = "complete"))}
It use twice less memory, as it doesn't store distance matrix.
For more details, see documentation of \code{hclust} and \code{
d
ist}.
For more details, see documentation of \code{hclust} and \code{
D
ist}.
}
...
...
@@ -144,10 +145,16 @@ hc <- hcluster(USArrests,method = "bin",link = "mcquitty", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "pea",link = "median", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "abspea",link = "median", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "cor",link = "centroid", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "abscor",link = "centroid", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "spe",link = "complete", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "ken",link = "complete", nbproc= 2,
doubleprecision = FALSE)
...
...
src/distance_T.cpp
View file @
4889bfaf
...
...
@@ -2,7 +2,7 @@
* \brief all functions requiered for R dist function and C hcluster function.
*
* \date Created: probably in 1995
* \date Last modified: Time-stamp: <201
0-01-21 18:4
2:14 antoine>
* \date Last modified: Time-stamp: <201
1-11-03 21:3
2:14 antoine>
*
* \author R core members, and lately: Antoine Lucas
*
...
...
@@ -348,11 +348,57 @@ template<class T> T distance_T<T>::R_pearson(double * x, double * y , int nr_x,
*
flag
=
0
;
return
NA_REAL
;
}
dist
=
1
-
(
num
/
sqrt
(
sum1
*
sum2
)
);
if
((
sum1
==
0
)
||
(
sum2
==
0
))
dist
=
1
;
// one vector is null.
else
dist
=
1
-
(
num
/
sqrt
(
sum1
*
sum2
)
);
return
dist
;
}
/** \brief Absoulute Pearson / Pearson uncentered (correlation)
* \note Added by L. Cerulo
*/
template
<
class
T
>
T
distance_T
<
T
>::
R_abspearson
(
double
*
x
,
double
*
y
,
int
nr_x
,
int
nr_y
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
T_tri
&
opt
)
{
T
num
,
sum1
,
sum2
,
dist
;
int
count
,
j
;
count
=
0
;
num
=
0
;
sum1
=
0
;
sum2
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
y
[
i2
]))
{
num
+=
(
x
[
i1
]
*
y
[
i2
]);
sum1
+=
(
x
[
i1
]
*
x
[
i1
]);
sum2
+=
(
y
[
i2
]
*
y
[
i2
]);
count
++
;
}
i1
+=
nr_x
;
i2
+=
nr_y
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
if
((
sum1
==
0
)
||
(
sum2
==
0
))
dist
=
0
;
// one vector is null.
else
dist
=
(
num
/
sqrt
(
sum1
*
sum2
)
);
if
(
dist
<
0
)
{
dist
*=-
1
;
}
return
(
1
-
dist
);
}
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by A. Lucas
*/
...
...
@@ -360,7 +406,7 @@ template<class T> T distance_T<T>::R_correlation(double * x, double * y , int n
int
i1
,
int
i2
,
int
*
flag
,
T_tri
&
opt
)
{
T
num
,
denum
,
sumx
,
sumy
,
sumxx
,
sumyy
,
sumxy
;
T
num
,
denum
2
,
sumx
,
sumy
,
sumxx
,
sumyy
,
sumxy
;
int
count
,
j
;
count
=
0
;
...
...
@@ -389,10 +435,64 @@ template<class T> T distance_T<T>::R_correlation(double * x, double * y , int n
return
NA_REAL
;
}
num
=
sumxy
-
(
sumx
*
sumy
/
count
);
denum
=
sqrt
(
(
sumxx
-
(
sumx
*
sumx
/
count
)
)
*
(
sumyy
-
(
sumy
*
sumy
/
count
)
)
);
return
1
-
(
num
/
denum
);
denum2
=
(
sumxx
-
(
sumx
*
sumx
/
count
)
)
*
(
sumyy
-
(
sumy
*
sumy
/
count
)
)
;
if
(
denum2
<=
0
)
// some apporximations of 0, with floating precision can
// give negative values
return
1
;
return
1
-
(
num
/
sqrt
(
denum2
));
}
/** \brief Absolute Distance correlation (Uncentered Pearson)
* \note Added by L. Cerulo
*/
template
<
class
T
>
T
distance_T
<
T
>::
R_abscorrelation
(
double
*
x
,
double
*
y
,
int
nr_x
,
int
nr_y
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
T_tri
&
opt
)
{
T
num
,
denum
,
sumx
,
sumy
,
sumxx
,
sumyy
,
sumxy
,
dist
,
term
;
int
count
,
j
;
count
=
0
;
sumx
=
0
;
sumy
=
0
;
sumxx
=
0
;
sumyy
=
0
;
sumxy
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
y
[
i2
]))
{
sumxy
+=
(
x
[
i1
]
*
y
[
i2
]);
sumx
+=
x
[
i1
];
sumy
+=
y
[
i2
];
sumxx
+=
x
[
i1
]
*
x
[
i1
];
sumyy
+=
y
[
i2
]
*
y
[
i2
];
count
++
;
}
i1
+=
nr_x
;
i2
+=
nr_y
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
num
=
sumxy
-
(
sumx
*
sumy
/
count
);
term
=
(
sumxx
-
(
sumx
*
sumx
/
count
)
)
*
(
sumyy
-
(
sumy
*
sumy
/
count
)
);
if
(
term
<=
0
)
return
1
;
denum
=
sqrt
(
term
);
dist
=
num
/
denum
;
if
(
dist
<
0
)
{
dist
*=-
1
;
}
return
(
1
-
dist
);
}
// ---------------------------------------------------------
// Distance Spearman
//
...
...
@@ -773,10 +873,21 @@ template <class T> void* distance_T<T>::thread_dist(void* arguments_void)
case
KENDALL
:
distfun
=
R_kendall
;
break
;
case
ABSPEARSON
:
distfun
=
R_abspearson
;
break
;
case
ABSCORRELATION
:
distfun
=
R_abscorrelation
;
break
;
default:
error
(
"distance(): invalid distance"
);
{
if
(
no
==
0
)
error
(
"distance(): invalid distance"
);
return
(
void
*
)
0
;
}
}
...
...
@@ -899,9 +1010,15 @@ template <class T> T distance_T<T>::distance_kms(double *x,double *y, int nr1,in
case
KENDALL
:
distfun
=
R_kendall
;
break
;
case
ABSPEARSON
:
distfun
=
R_abspearson
;
break
;
case
ABSCORRELATION
:
distfun
=
R_abscorrelation
;
break
;
default:
error
(
"distance(): invalid distance"
);
error
(
"distance
_kms
(): invalid distance"
);
}
// here: distance computation
...
...
src/distance_T.h
View file @
4889bfaf
...
...
@@ -9,8 +9,7 @@ template<class T> class distance_T
public:
/* == 1,2,..., defined by order in the r function dist */
enum
{
EUCLIDEAN
=
1
,
MAXIMUM
,
MANHATTAN
,
CANBERRA
,
BINARY
,
PEARSON
,
CORRELATION
,
SPEARMAN
,
KENDALL
};
enum
{
EUCLIDEAN
=
1
,
MAXIMUM
,
MANHATTAN
,
CANBERRA
,
BINARY
,
PEARSON
,
CORRELATION
,
SPEARMAN
,
KENDALL
,
ABSPEARSON
,
ABSCORRELATION
};
struct
T_tri
...
...
@@ -263,6 +262,19 @@ template<class T> class distance_T
int
i1
,
int
i2
,
int
*
flag
,
T_tri
&
opt
);
/** \brief Pearson / Pearson centered (correlation)
* \note Added by A. Lucas
*/
static
T
R_abspearson
(
double
*
x
,
double
*
y
,
int
nr_x
,
int
nr_y
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
T_tri
&
opt
);
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by L. Cerulo
*/
static
T
R_abscorrelation
(
double
*
x
,
double
*
y
,
int
nr_x
,
int
nr_y
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
T_tri
&
opt
);
/** \brief Spearman distance (rank base metric)
*
...
...
src/kmeans.cpp
View file @
4889bfaf
...
...
@@ -4,7 +4,7 @@
* \brief K-means clustering
*
* \date Created : before 2005
* \date Last Modified : Time-stamp: <20
09-10-11 09:45
:1
3
antoine>
* \date Last Modified : Time-stamp: <20
11-11-03 22:19
:1
4
antoine>
*
* \author R core team. Modified by A. Lucas for distance choice.
*
...
...
@@ -62,9 +62,9 @@ void kmeans_Lloyd2(double *x, int *pn, int *pp, double *cen, int *pk, int *cl,
Rboolean
updated
;
distance_T
<
double
>::
T_tri
opt
;
int
ierr
[
1
];
double
*
data_tri
;
int
*
order_tri
;
int
*
rank_tri
;
//
double * data_tri;
//
int * order_tri;
//
int * rank_tri;
if
(
(
*
method
==
distance_T
<
double
>::
SPEARMAN
)
||
(
*
method
==
distance_T
<
double
>::
KENDALL
))
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment