Skip to content
Snippets Groups Projects
Commit 39dd4a39 authored by CARDENAS GWENDAELLE's avatar CARDENAS GWENDAELLE
Browse files

correction pour comparer des clusters de taille differentes

parent b47cf615
No related branches found
No related tags found
No related merge requests found
##### Main function
###############################################################################
# Questions to answer:
# 2. Types of the elements of the list(s)? For the moment I choose to deal only
# with 'hclust' objects but we should think of dendrograms, chac, or classes
# dedicated to phylogeny
##### TODO later
# Library
library("limma")
library("tidyverse")
# 6. Should we propose a maximum value for nb_max? Is there a "natural" one?
##### NB: max number of blocks arbitrarily set to 10!
##### utiliser clustering divisif (diana) => Not implemented!
## sub - fonctions
# 7. Should we make available the other criteria from ClusterR to choose the
# number of clusters? More importantly, I am not sure that a threshold for the
# current criteria can be given in advance: it is probably strongly data
# dependent. I would try to go for broken stick or BIC that are a bit more
# independent from data
compute_squeeze <- function(dist_coph, replicats){
nb_cluster <- ncol(dist_coph)/sum(replicats)
##### correspondance des feuilles à implémenter
##### test apparié
###############################################################################
#' Titre
#'
#' Description
#'
#' Long description (several lines)
#'
#' @param trees1 List of trees (\code{\link[stats]{hclust}} types) of the first
#' condition
#' @param trees2 list of trees (\code{\link[stats]{hclust}} types) of the second
#' condition
#' @param scale Logical. If \code{TRUE} (the default value), the trees are all
#' rescalled to have a minimum height equal to 0 (translation) and a maximum
#' height equal to 1 (dilatation)
#' @param threshold.p proportion threshold for the percentage of cophenetic
#' distance of a given entry across trees above which the entry is not
#' considered. Default to \code{0.1}
#' @param nsim number of simulations performed for the estimation of the
#' empirical distribution. Default to \code{10000}
#'
#' @return A list with classes \code{htest} and \code{tree_test} containing the
#' following components: \describe{ ## TODO
#' \item{\code{statistics}}{ the value of the observed statistic (Hotelling
#' based statistics).}
#' \item{\code{parameter}}{ the rank of the observed statistic.}
#' \item{\code{p.value}}{ the pseudo p-value of the test.}
#' \item{\code{p.value_Simes}}{the p-value of the test obtained by Simes aggregation
#' of univariate p-values}
#' \item{\code{alternative}}{ a character string describing the alternative
#' hypothesis (always \code{"greater"} here).}
#' \item{\code{method}}{ a character string giving the method used.}
#' \item{\code{data.name}}{ a character string giving the name(s) of the data,
#' and the number of simulations.}
#' \item{\code{res}}{ a list with components \code{simulations} (\code{nsim}
#' simulated values of statistic, final value is observed statistic),
#' \code{cophenetics} (the cophenetic distances of the input trees),
#' \code{truncated_cophenetics} (the selected cophenetic distances after
#' truncation) and \code{truncated_indices} (the indices of the entries
#' corresponding to selected cophenetic distances after truncation),
#' \code{p} (the effective number of pairs of leaves after filtering),
#' \code{d0} (the additional degrees of freedom for the moderated test statistic),
#' \code{d} (the initial degrees of freedom for the moderated test statistic),
#' \code{Tk} (the vector of p test statistics before aggregation),
#' \code{pk} (the vector of p p-values corresponding to Tk)}
#' }
#'
#' @seealso \code{\link{fisher_mix}} to obtain the empirical distribution under
#' the null hypothesis and \code{\link{compute_stats}} for intermediate
#' computation of statistics
#'
#' @note The returned p-value is based on the rank of the observed statistics
#' when compared to all simulated values obtained from the theoretical
#' distribution. For \code{nsim = 99}, for instance, it can not be smaller than
#' 0.01.
#'
#' @references TODO
#'
#' @examples
#' set.seed(12081238) # for reproducibility
#' # data: generated from a normal distribution with nothing special
#' base_data <- matrix(rnorm(2000), nrow = 100, ncol = 200)
#'
#' # then create two groups by simply subsampling the columns differently
#' group1 <- replicate(20, sample(1:100, 50, replace = FALSE))
#' group2 <- replicate(20, sample(101:200, 50, replace = FALSE))
#' conditions <- factor(rep(c(1, 2), each = 20))
#'
#' # obtain hclust for the two groups
#' trees <- apply(cbind(group1, group2), 2, function(asample) {
#' samples <- base_data[ ,asample]
#' out <- hclust(dist(samples), method = "ward.D2")
#' return(out)
#' })
#'
#' trees1 <- trees[1:20]
#' trees2 <- trees[21:40]
#' treediff(trees1, trees2, nsim = 1e5)$p.value
#'
#' # with a weaker threshold
#' treediff(trees1, trees2, threshold.p = 0.2, nsim = 1e5)
#' @export
#'
#' @importFrom stats punif
#' @importFrom limma squeezeVar
groups1 <- rep(1:nb_cluster, each = replicats[1])
groups2 <- rep(1:nb_cluster, each = replicats[2])
treediff <- function(trees1, trees2, scale = TRUE, threshold.p = 0.1,
nsim = 10000) {
all_stats <- compute_stats(trees1, trees2, scale, threshold.p)
n1 <- replicats[1]
n2 <- replicats[2]
col1 <- 1 : length(groups1)
col2 <- 1 : length(groups2) + length(col1)
# for standard htest outputs
xname1 <- deparse(substitute(trees1))
xname2 <- deparse(substitute(trees2))
data_name <- paste("list1:", xname1, "\nlist2:", xname2,
"\nnumber of simulations + 1:", nsim + 1, "\n")
# Means per groups and conditions
average_coph_trees1 <- by(t(dist_coph[,col1]), groups1, colMeans)
condition1 <- as.vector(Reduce(cbind,average_coph_trees1))
squeezed_var <- squeezeVar(all_stats$variances,
df = length(all_stats$conditions) - 2,
robust = FALSE)
all_var <- squeezed_var$var.post
average_coph_trees2 <- by(t(dist_coph[,col2]), groups2, colMeans)
condition2 <- as.vector(Reduce(cbind,average_coph_trees2))
denominator <- all_var * length(all_stats$conditions) /
prod(table(all_stats$conditions))
len1 <- sapply(average_coph_trees1, length)
cluster1 <- rep(1:nb_cluster, len1)
# computation of Hotelling
sqT <- all_stats$numerator / sqrt(denominator)
Tk <- sqT
sqT <- crossprod(sqT) / length(all_var) # T^2/p
sqT <- as.numeric(sqT)
names(sqT) <- "statistic"
cluster <- cluster1
stopifnot(!is.na(sqT)) ## sanity check
average_coph <- data.frame(condition1, condition2, cluster)
average_coph_woNA <- na.omit(average_coph)
# corresponding distribution under null hypothesis
p <- length(all_var)
d0 <- squeezed_var$df.prior
d <- length(all_stats$conditions)
simulated <- fisher_mix(nsim, p = p, nobs = d0 + d)
# Variance
sq_average_coph <- sweep(average_coph_woNA[-3]^2, 2, replicats, "*")
# computation of the p-value (same as in standard permutation tests in R)
parameter <- rank(c(sqT, simulated))[1]
names(parameter) <- "observed rank"
diff <- length(simulated) - parameter
diff <- ifelse(diff > 0, diff, 0)
pvalue <- punif((diff + 1) / (length(simulated) + 1))
pvalue <- unname(pvalue)
sum_sq_coph_trees1 <- unlist(by(t(dist_coph[,col1])^2, groups1, colSums))
sum_sq_coph_trees2 <- unlist(by(t(dist_coph[,col2])^2, groups2, colSums))
sum_sq_coph <- data.frame(na.omit(sum_sq_coph_trees1), na.omit(sum_sq_coph_trees2))
variances <- sum_sq_coph - sq_average_coph
variances <- rowSums(variances)
variances <- as.vector(variances / (n1 + n2 - 2))
# Squeeze variance
squeezed_var <- squeezeVar(variances, df = n1 + n2 - 2, robust = FALSE)
# univariate p-values
pk <- 1 - pf(Tk^2, df1 = 1, df2 = d0 + d - 2)
# pk <- 2*(1 - pt(abs(Tk), d0 + d - 2)) # should be identical to pk
return(list("average_coph" = average_coph_woNA, "squeezed_var" = squeezed_var))
}
# Simes aggregation
pvalue_Simes <- min(sort(pk)/(1:p))*p
compute_pvalue <- function(average_coph, squeezed_var, replicats){
# preparing outputs
out <- list("statistic" = sqT, "parameter" = parameter,
"p.value" = pvalue,
"p.value_Simes" = pvalue_Simes,
"alternative" = "greater",
"method" = "Tree test based on Fisher mix simulations",
"data.name" = data_name,
"res" = list("simulations" = c(simulated, sqT),
"cophenetics" = all_stats$cophenetics,
"truncated_cophenetics" = all_stats$truncated_cophenetics,
"truncated_indices" = all_stats$truncated_indices,
"p" = p,
"d0" = d0,
"d" = d,
"Tk" = Tk,
"pk" = pk))
class(out) <- c("htest", "tree_test")
# Statistics
cluster <- average_coph$cluster
numerator <- Reduce("-", average_coph[,-3])
n1 <- replicats[1]
n2 <- replicats[2]
denominator <- squeezed_var$var.post * (n1 + n2) / (n1 * n2)
statistics <- numerator / sqrt(denominator)
#P-values
d0 <- squeezed_var$df.prior
p.value <- 2*(1 - pt(abs(statistics), d0 + n1 + n2 - 2))
nb_rep <- rep(as.vector(table(cluster)))
p <- rep(nb_rep, nb_rep)
out <- data.frame ("statistics" = statistics, "p.value" = p.value, "cluster" = factor(cluster), "p"= p)
return(out)
}
#' Titre
#'
#' Description
#'
#' Long description (several lines)
#'
#' @param trees1 List of trees (\code{\link[stats]{hclust}} types) of the first
#' condition
#' @param trees2 list of trees (\code{\link[stats]{hclust}} types) of the second
#' condition
#' @param scale Logical. If \code{TRUE} (the default value), the trees are all
#' rescalled to have a minimum height equal to 0 (translation) and a maximum
#' height equal to 1 (dilatation)
#' @param threshold.p proportion threshold for the percentage of cophenetic
#' distance of a given entry across trees above which the entry is not
#' considered. Default to \code{0.1}
#'
#' @return A list with classes \code{htest} and \code{tree_test} containing the
#' following components: \describe{
#' \item{\code{conditions}}{ the experiment conditions}
#' \item{\code{cophenetics}}{ the cophenetic distances of the input trees}
#' \item{\code{truncated_cophenetics}}{ the selected cophenetic distances
#' after truncation}
#' \item{\code{truncated_indices}}{ the indices of the entries corresponding
#' to selected cophenetic distances after truncation}
#' \item{\code{numerator}}{the numerator of the t-test statistics}
#' \item{\code{variances}}{the variances of the cophenetic entries with
#' selected indices}
#' }
#' # FIX IT (numerator: not really)
#'
#' @seealso \code{\link{treediff}} to perform the test
#'
#' @references TODO
#'
#' @examples
#' set.seed(12081238) # for reproducibility
#' # data: generated from a normal distribution with nothing special
#' base_data <- matrix(rnorm(2000), nrow = 100, ncol = 200)
#'
#' # then create two groups by simply subsampling the columns differently
#' group1 <- replicate(20, sample(1:100, 50, replace = FALSE))
#' group2 <- replicate(20, sample(101:200, 50, replace = FALSE))
#' conditions <- factor(rep(c(1, 2), each = 20))
#'
#' # obtain hclust for the two groups
#' trees <- apply(cbind(group1, group2), 2, function(asample) {
#' samples <- base_data[ ,asample]
#' out <- hclust(dist(samples), method = "ward.D2")
#' return(out)
#' })
#'
#' trees1 <- trees[1:20]
#' trees2 <- trees[21:40]
#' compute_stats(trees1, trees2)$variances
#'
#' @export
#'
#' @importFrom stats cophenetic
#'
compute_stats <- function(trees1, trees2, scale = TRUE, threshold.p = 0.1) {
trees <- c(trees1, trees2)
conditions <- factor(c(rep(1, length(trees1)), rep(2, length(trees2))))
rm(trees1, trees2)
input_classes <- sapply(trees, function(object) class(object)[1])
if (any(input_classes != "hclust"))
stop("'trees' must be a list of 'hclust' objects.", call. = TRUE)
treediff <- function(trees1, trees2, replicats){
# obtain cophenetic distances from the trees
coph_dist <- sapply(trees, cophenetic, simplify = FALSE)
cluster <- length(trees1)/replicats[1]
if (length(trees1[[1]]$order) == length(trees1[[length(trees1)]]$order)){
leaves <- TRUE
} else{
leaves <- FALSE
}
# normalize
if (scale)
coph_dist <- normalize_trees(coph_dist)
trees <- c(trees1, trees2)
# extract cophenetic vector
coph_vect <- sapply(coph_dist, function(adist) {
# cophenetic distance
coph_dist <- sapply(trees, cophenetic, simplify = FALSE)
coph_list_vect <- sapply(coph_dist, function(adist) {
adist <- as.matrix(adist)
return(adist[upper.tri(adist)])
})
# remove 1...
max_indices <- rowSums(coph_vect == 1)
if (all(max_indices == 1)) {
stop("All pairs of leaves have at least one cophenetic distance equal to 1 (across all trees)")
if (cluster != 1 & leaves == FALSE){
len_max <- max(unlist(lapply(coph_list_vect, length)))
coph_list_vect <- lapply(coph_list_vect, FUN = function(col){
col <- c(col, rep(NA, len_max - length(col)))
})
coph_vect <- c()
for (i in 1:length(coph_list_vect)){
coph_vect <- cbind(coph_vect, coph_list_vect[[i]])
}
}else{
coph_vect <- coph_list_vect
}
# get rid of those with *p%* of distances at 1
truncated_indices <- which(max_indices <= threshold.p * ncol(coph_vect))
coph_vect <- coph_vect[truncated_indices, , drop = FALSE]
# average cophenetic distances by condition (bar(X)^k and bar(Y)^k)
average_coph <- by(t(coph_vect), conditions, colMeans)
for_numerator <- Reduce("-", average_coph)
## note: here, I am using sum_i X_i^2 - n bar(X)^2 (by cophenetic distance and condition)
average_coph <- Reduce(cbind, average_coph) # 2 columns with bar{X} and bar{Y}
sq_average_coph <- sweep(average_coph^2, 2, as.vector(table(conditions)), "*") # n bar(X)^2 and n bar(Y)^2
sum_sq_coph <- by(t(coph_vect)^2, conditions, colSums)
sum_sq_coph <- Reduce(cbind, sum_sq_coph) # 2 columns with \sum X_i^2 and \sum Y_i^2
all_var <- sum_sq_coph - sq_average_coph # 2 columns: sum_i X_i^2 - n bar(X)^2 and same with Y
all_var <- rowSums(all_var) # sum_i X_i^2 - n bar(X)^2 + same with Y
all_var <- all_var / (length(conditions) - 2) # sigma^2
out <- list("conditions" = conditions,
"cophenetics" = coph_dist,
"truncated_cophenetics" = coph_vect,
"truncated_indices" = truncated_indices,
"numerator" = for_numerator,
"variances" = all_var)
outs <- compute_squeeze(coph_vect, replicats)
outp <- compute_pvalue(outs$average_coph, outs$squeezed_var, replicats)
return(out)
# pvalue aggre
out_aggr <- outp %>% group_by(cluster) %>%
summarise("p.value" = min(sort(p.value) / (1:p)) * p) %>% unique()
out <- list("method" = "Tree test based on t-test",
"data.name" = paste(substitute(trees1), "and", substitute(trees2)),
"p.value" = out_aggr$p.value,
"res" = list("statistic" = outp$statistics,
"p.value.indiv" = outp$p.value))
class(out) <- "htest"
return(out)
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment