Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Sylvain Jasson
amap
Commits
d77f8fea
Commit
d77f8fea
authored
May 20, 2007
by
Antoine Lucas
Browse files
distance and hirerachical clustering with template => compute with float precision available
parent
4bdd00ae
Changes
18
Hide whitespace changes
Inline
Side-by-side
R/acprob.R
View file @
d77f8fea
...
...
@@ -58,32 +58,6 @@ W <- function(x,h,D=NULL,kernel="gaussien")
}
WsansC
<-
function
(
x
,
h
,
D
=
NULL
,
kernel
=
"gaussien"
)
{
x
<-
as.matrix
(
x
)
n
<-
dim
(
x
)[
1
]
p
<-
dim
(
x
)[
2
]
if
(
is.null
(
D
))
{
D
<-
diag
(
1
,
p
)
}
som
<-
0
result
<-
0
for
(
i
in
1
:
(
n
-1
)
)
{
for
(
j
in
(
i
+1
)
:
n
)
{
Delta
<-
x
[
i
,]
-
x
[
j
,]
norm
<-
sqrt
(
t
(
Delta
)
%*%
D
%*%
Delta
)
k
<-
K
(
norm
/
h
,
kernel
)
# K ( |Delta|/h )
k
<-
as.numeric
(
k
)
som
<-
som
+
k
result
<-
result
+
k
*
Delta
%*%
t
(
Delta
)
}
}
result
/
som
}
varrob
<-
function
(
x
,
h
,
D
=
NULL
,
kernel
=
"gaussien"
)
{
x
<-
as.matrix
(
x
)
...
...
@@ -120,32 +94,6 @@ varrob <- function(x,h,D=NULL,kernel="gaussien")
}
varrobsansC
<-
function
(
x
,
h
,
D
=
NULL
,
kernel
=
"gaussien"
)
{
n
<-
dim
(
x
)[
1
]
p
<-
dim
(
x
)[
2
]
if
(
is.null
(
D
))
{
D
<-
diag
(
1
,
p
)
}
x
<-
as.matrix
(
x
)
x
<-
scale
(
x
,
center
=
TRUE
,
scale
=
FALSE
)
som
<-
0
res
<-
0
for
(
i
in
1
:
n
)
{
k
<-
K
(
sqrt
(
t
(
x
[
i
,])
%*%
D
%*%
x
[
i
,])
/
h
,
kernel
)
k
<-
as.numeric
(
k
)
res
<-
res
+
k
*
x
[
i
,]
%*%
t
(
x
[
i
,])
som
<-
som
+
k
}
S
<-
res
/
som
Sinv
<-
solve
(
S
)
solve
(
Sinv
-
D
/
h
)
}
acpgen
<-
function
(
x
,
h1
,
h2
,
center
=
TRUE
,
reduce
=
TRUE
,
kernel
=
"gaussien"
)
{
# CENTRONS, ET REDUISONS
...
...
R/dist.R
View file @
d77f8fea
Dist
<-
function
(
x
,
method
=
"euclidean"
,
diag
=
FALSE
,
upper
=
FALSE
)
Dist
<-
function
(
x
,
method
=
"euclidean"
,
nbproc
=
1
,
diag
=
FALSE
,
upper
=
FALSE
)
{
if
(
class
(
x
)
==
"exprSet"
)
...
...
@@ -25,6 +25,7 @@ Dist <- function(x, method="euclidean", diag=FALSE, upper=FALSE)
d
=
double
(
N
*
(
N
-
1
)
/
2
),
diag
=
as.integer
(
FALSE
),
method
=
as.integer
(
method
),
nbproc
=
as.integer
(
nbproc
),
ierr
=
as.integer
(
0
),
DUP
=
FALSE
,
NAOK
=
TRUE
,
...
...
R/hcluster.R
View file @
d77f8fea
...
...
@@ -10,7 +10,7 @@
hcluster
<-
function
(
x
,
method
=
"euclidean"
,
diag
=
FALSE
,
upper
=
FALSE
,
link
=
"complete"
,
members
=
NULL
)
hcluster
<-
function
(
x
,
method
=
"euclidean"
,
diag
=
FALSE
,
upper
=
FALSE
,
link
=
"complete"
,
members
=
NULL
,
nbproc
=
2
,
doubleprecision
=
TRUE
)
{
if
(
class
(
x
)
==
"exprSet"
)
...
...
@@ -46,6 +46,10 @@ hcluster <- function (x, method = "euclidean", diag = FALSE, upper = FALSE, link
if
(
length
(
members
)
!=
N
)
stop
(
"Invalid length of members"
)
n
<-
N
precision
<-
1
if
(
doubleprecision
)
precision
<-
2
hcl
<-
.C
(
"hcluster"
,
x
=
as.double
(
x
),
...
...
@@ -59,6 +63,8 @@ hcluster <- function (x, method = "euclidean", diag = FALSE, upper = FALSE, link
order
=
integer
(
n
),
crit
=
double
(
n
),
members
=
as.double
(
members
),
nbprocess
=
as.integer
(
nbproc
),
precision
=
as.integer
(
precision
),
res
=
as.integer
(
1
),
DUP
=
FALSE
,
NAOK
=
TRUE
,
...
...
man/acpgen.Rd
View file @
d77f8fea
...
...
@@ -2,7 +2,7 @@
\alias{acpgen}
\alias{K}
\alias{W}
\alias{WsansC}
\title{Generalised principal component analysis}
...
...
@@ -12,7 +12,6 @@
acpgen(x,h1,h2,center=TRUE,reduce=TRUE,kernel="gaussien")
K(u,kernel="gaussien")
W(x,h,D=NULL,kernel="gaussien")
WsansC(x,h,D=NULL,kernel="gaussien")
}
\arguments{
...
...
man/dist.Rd
View file @
d77f8fea
\name{Dist}
\title{Distance Matrix Computation}
\usage{
Dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
Dist(x, method = "euclidean",
nbproc = 1,
diag = FALSE, upper = FALSE)
}
\alias{Dist}
...
...
@@ -15,6 +15,7 @@ Dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
\code{"canberra"}, \code{"binary"}, \code{"pearson"},
\code{"correlation"} or \code{"spearman"}.
Any unambiguous substring can be given.}
\item{nbproc}{integer, Number of subprocess for parallelization}
\item{diag}{logical value indicating whether the diagonal of the
distance matrix should be printed by \code{print.dist}.}
\item{upper}{logical value indicating whether the upper triangle of the
...
...
@@ -111,7 +112,7 @@ Dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
\code{\link[cluster]{daisy}} in the \file{cluster} package with more
possibilities in the case of \emph{mixed} (contiuous / categorical)
variables.
\code{\link[stats]{dist}} \code{\link{hclust}}.
\code{\link[stats]{dist}} \code{\link{hclust
er
}}.
}
\examples{
x <- matrix(rnorm(100), nrow=5)
...
...
@@ -119,6 +120,10 @@ Dist(x)
Dist(x, diag = TRUE)
Dist(x, upper = TRUE)
## compute dist with 8 threads
Dist(x,nbproc=8)
}
\keyword{multivariate}
\keyword{cluster}
man/hcluster.Rd
View file @
d77f8fea
...
...
@@ -6,7 +6,8 @@
}
\usage{
hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
link = "complete", members = NULL)
link = "complete", members = NULL, nbproc = 2,
doubleprecision = TRUE)
}
\arguments{
\item{x}{
...
...
@@ -29,6 +30,10 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
\code{"average"}, \code{"mcquitty"}, \code{"median"} or
\code{"centroid"}.}
\item{members}{\code{NULL} or a vector with length size of \code{d}.}
\item{nbproc}{integer, number of subprocess for parallelization}
\item{doubleprecision}{True: use of double precision for distance
matrix computation; False: use simple precision}
}
\value{
An object of class \bold{hclust} which describes the
...
...
@@ -87,7 +92,7 @@ hcluster(x, method = "euclidean", diag = FALSE, upper = FALSE,
by Antoine Lucas \url{http://mulcyber.toulouse.inra.fr/projects/amap/}.
}
\seealso{
\code{\link{Dist}},
\code{\link{hclusterpar}},
\code{\link{Dist}},
\code{\link[stats]{hclust}}, \code{\link[stats]{kmeans}}.
}
...
...
@@ -118,6 +123,29 @@ opar <- par(mfrow = c(1, 2))
plot(hc, labels = FALSE, hang = -1, main = "Original Tree")
plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters")
par(opar)
## other combinaison are possible
hc <- hcluster(USArrests,method = "euc",link = "ward", nbproc= 1,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "max",link = "single", nbproc= 2,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "man",link = "complete", nbproc= 1,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "can",link = "average", nbproc= 2,
doubleprecision = TRUE)
hc <- hcluster(USArrests,method = "bin",link = "mcquitty", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "pea",link = "median", nbproc= 2,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "cor",link = "centroid", nbproc= 1,
doubleprecision = FALSE)
hc <- hcluster(USArrests,method = "spe",link = "complete", nbproc= 2,
doubleprecision = FALSE)
}
\keyword{multivariate}
\keyword{cluster}
man/varrob.Rd
View file @
d77f8fea
\name{VarRob}
\alias{varrob}
\alias{varrobsansC}
\title{Robust variance}
\description{Compute a robust variance}
\usage{
varrob(x,h,D=NULL,kernel="gaussien")
varrobsansC(x,h,D=NULL,kernel="gaussien")
}
\arguments{
...
...
src/distance.cpp
0 → 100644
View file @
d77f8fea
#include
"distance.h"
#include
"distance_template.h"
/**
* R_distance: compute parallelized distance. Function called direclty by R
* \brief compute distance and call function thread_dist
* that call one of function R_euclidean or R_...
* \param x input matrix
* \param nr,nc number of row and columns
* nr individuals with nc values.
* \param d distance half matrix.
* \param diag if we compute diagonal of dist matrix (usualy: no).
* \param method 1, 2,... method used
* \param nbprocess: number of threads to create
* \param ierr error return; 1 good; 0 missing values
*/
void
R_distance
(
double
*
x
,
int
*
nr
,
int
*
nc
,
double
*
d
,
int
*
diag
,
int
*
method
,
int
*
nbprocess
,
int
*
ierr
)
{
#ifndef __MINGW_H
if
(
*
nbprocess
!=
1
)
amaptemplate
::
distancepar
<
double
>
(
x
,
nr
,
nc
,
d
,
diag
,
method
,
nbprocess
,
ierr
);
else
#endif
amaptemplate
::
T_distance
<
double
>
(
x
,
nr
,
nc
,
d
,
diag
,
method
,
ierr
);
}
/**
* Sort, and return order and rank
* \brief This function is used by R_spearman.
* order and rank must be initialised with 0:(n-1)
* make sort, return x sorted
* order = order(x) - 1
* rank = rank(x) -1
*
*/
void
rsort_rank_order
(
double
*
x
,
int
*
order
,
int
*
rank
,
int
*
n
)
{
double
v
;
int
i
,
j
,
h
,
iv
;
for
(
h
=
1
;
h
<=
*
n
/
9
;
h
=
3
*
h
+
1
);
for
(;
h
>
0
;
h
/=
3
)
for
(
i
=
h
;
i
<
*
n
;
i
++
)
{
v
=
x
[
i
];
iv
=
order
[
i
];
j
=
i
;
while
(
j
>=
h
&&
(
x
[
j
-
h
]
>
v
))
{
x
[
j
]
=
x
[
j
-
h
];
order
[
j
]
=
order
[
j
-
h
];
rank
[
order
[
j
]]
=
j
;
j
-=
h
;
}
x
[
j
]
=
v
;
order
[
j
]
=
iv
;
rank
[
order
[
j
]]
=
j
;
}
}
src/distance.h
0 → 100644
View file @
d77f8fea
extern
"C"
{
/**
* R_distance: compute parallelized distance. Function called direclty by R
* \brief compute distance and call function thread_dist
* that call one of function R_euclidean or R_...
* \param x input matrix
* \param nr,nc number of row and columns
* nr individuals with nc values.
* \param d distance half matrix.
* \param diag if we compute diagonal of dist matrix (usualy: no).
* \param method 1, 2,... method used
* \param nbprocess: number of threads to create
* \param ierr error return; 1 good; 0 missing values
*/
void
R_distance
(
double
*
x
,
int
*
nr
,
int
*
nc
,
double
*
d
,
int
*
diag
,
int
*
method
,
int
*
nbprocess
,
int
*
ierr
);
void
rsort_rank_order
(
double
*
x
,
int
*
order
,
int
*
rank
,
int
*
n
);
}
src/distance_template.cpp
0 → 100644
View file @
d77f8fea
/*! \file distance.c
* \brief all functions requiered for R dist function and C hcluster function.
*
* \date Created: probably in 1995
* \date Last modified: Time-stamp: <2005-10-09 13:12:06 antoine>
*
* \author R core members, and lately: Antoine Lucas
*
*
* R : A Computer Language for Statistical Data Analysis
* Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
* Copyright (C) 1998, 2001 Robert Gentleman, Ross Ihaka and the
* R Development Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#define _AMAP_DISTANCE_TEMPLATE_CPP 1
#ifdef HAVE_CONFIG_H
#include
<config.h>
#endif
#include
"distance_template.h"
#include
"distance.h"
#include
<float.h>
#include
<R_ext/Arith.h>
#include
<R_ext/Error.h>
#include
"mva.h"
#include
<limits>
#include
<pthread.h>
#define MAX( A , B ) ( ( A ) > ( B ) ? ( A ) : ( B ) )
#define MIN( A , B ) ( ( A ) < ( B ) ? ( A ) : ( B ) )
namespace
amaptemplate
{
/** \brief Distance euclidean (i.e. sqrt(sum of square) )
*/
template
<
class
T
>
T
R_euclidean
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
T
dev
,
dist
;
int
count
,
j
;
count
=
0
;
dist
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
dev
=
(
x
[
i1
]
-
x
[
i2
]);
dist
+=
dev
*
dev
;
count
++
;
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
if
(
count
!=
nc
)
dist
/=
((
T
)
count
/
nc
);
return
sqrt
(
dist
);
}
/** \brief Distance maximum (supremum norm)
*/
template
<
class
T
>
T
R_maximum
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
T
dev
,
dist
;
int
count
,
j
;
count
=
0
;
// TODO: use c++ limits
dist
=
std
::
numeric_limits
<
T
>::
min
();
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
dev
=
fabs
(
x
[
i1
]
-
x
[
i2
]);
if
(
dev
>
dist
)
dist
=
dev
;
count
++
;
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
return
dist
;
}
/** \brief Distance manhattan
*/
template
<
class
T
>
T
R_manhattan
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
T
dist
;
int
count
,
j
;
count
=
0
;
dist
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
dist
+=
fabs
(
x
[
i1
]
-
x
[
i2
]);
count
++
;
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
if
(
count
!=
nc
)
dist
/=
((
T
)
count
/
nc
);
return
dist
;
}
/** \brief Distance canberra
*/
template
<
class
T
>
T
R_canberra
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
T
dist
,
sum
,
diff
;
int
count
,
j
;
count
=
0
;
dist
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
sum
=
fabs
(
x
[
i1
]
+
x
[
i2
]);
diff
=
fabs
(
x
[
i1
]
-
x
[
i2
]);
if
(
sum
>
DBL_MIN
||
diff
>
DBL_MIN
)
{
dist
+=
diff
/
sum
;
count
++
;
}
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
if
(
count
!=
nc
)
dist
/=
((
T
)
count
/
nc
);
return
dist
;
}
/** \brief Distance binary
*/
template
<
class
T
>
T
R_dist_binary
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
int
total
,
count
,
dist
;
int
j
;
total
=
0
;
count
=
0
;
dist
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
if
(
x
[
i1
]
||
x
[
i2
]){
count
++
;
if
(
!
(
x
[
i1
]
&&
x
[
i2
])
)
dist
++
;
}
total
++
;
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
total
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
if
(
count
==
0
)
return
0
;
return
(
T
)
dist
/
count
;
}
/** \brief Pearson / Pearson centered (correlation)
* \note Added by A. Lucas
*/
template
<
class
T
>
T
R_pearson
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
T
num
,
sum1
,
sum2
,
dist
;
int
count
,
j
;
count
=
0
;
num
=
0
;
sum1
=
0
;
sum2
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
num
+=
(
x
[
i1
]
*
x
[
i2
]);
sum1
+=
(
x
[
i1
]
*
x
[
i1
]);
sum2
+=
(
x
[
i2
]
*
x
[
i2
]);
count
++
;
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
dist
=
1
-
(
num
/
sqrt
(
sum1
*
sum2
)
);
return
dist
;
}
/** \brief Distance correlation (Uncentered Pearson)
* \note Added by A. Lucas
*/
template
<
class
T
>
T
R_correlation
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
T
num
,
denum
,
sumx
,
sumy
,
sumxx
,
sumyy
,
sumxy
;
int
count
,
j
;
count
=
0
;
sumx
=
0
;
sumy
=
0
;
sumxx
=
0
;
sumyy
=
0
;
sumxy
=
0
;
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
]))
{
sumxy
+=
(
x
[
i1
]
*
x
[
i2
]);
sumx
+=
x
[
i1
];
sumy
+=
x
[
i2
];
sumxx
+=
x
[
i1
]
*
x
[
i1
];
sumyy
+=
x
[
i2
]
*
x
[
i2
];
count
++
;
}
i1
+=
nr
;
i2
+=
nr
;
}
if
(
count
==
0
)
{
*
flag
=
0
;
return
NA_REAL
;
}
num
=
sumxy
-
(
sumx
*
sumy
/
count
);
denum
=
sqrt
(
(
sumxx
-
(
sumx
*
sumx
/
count
)
)
*
(
sumyy
-
(
sumy
*
sumy
/
count
)
)
);
return
1
-
(
num
/
denum
);
}
/** \brief Spearman distance (rank base metric)
* \note Added by A. Lucas
*/
template
<
class
T
>
T
R_spearman
(
double
*
x
,
int
nr
,
int
nc
,
int
i1
,
int
i2
,
int
*
flag
,
void
**
opt
)
{
int
j
;
double
*
data_tri
;
int
*
order_tri
;
int
*
rank_tri
;
int
n
;
T
diffrang
=
0
;
/* initialisation of variables */
data_tri
=
(
double
*
)
opt
[
0
];
order_tri
=
(
int
*
)
opt
[
1
];
rank_tri
=
(
int
*
)
opt
[
2
];
for
(
j
=
0
;
j
<
nc
;
j
++
)
{
if
(
!
(
R_FINITE
(
x
[
i1
])
&&
R_FINITE
(
x
[
i2
])))
{
*
flag
=
0
;
return
NA_REAL
;
}
order_tri
[
j
]
=
rank_tri
[
j
]
=
j
;
order_tri
[
j
+