correction pour comparer des clusters de taille differentes

39dd4a39 · CARDENAS GWENDAELLE · b47cf615 · 39dd4a39
Commit 39dd4a39 authored 2 years ago by CARDENAS GWENDAELLE
--- a/R/treediff.R
+++ b/R/treediff.R
-##### Main function
-###############################################################################
-# Questions to answer:
-# 2. Types of the elements of the list(s)? For the moment I choose to deal only
-# with 'hclust' objects but we should think of dendrograms, chac, or classes
-# dedicated to phylogeny
-##### TODO later
+# Library
+library("limma")
+library("tidyverse")

-# 6. Should we propose a maximum value for nb_max? Is there a "natural" one?
-##### NB: max number of blocks arbitrarily set to 10!
-##### utiliser clustering divisif (diana) => Not implemented!
+## sub - fonctions

-# 7. Should we make available the other criteria from ClusterR to choose the 
-# number of clusters? More importantly, I am not sure that a threshold for the
-# current criteria can be given in advance: it is probably strongly data 
-# dependent. I would try to go for broken stick or BIC that are a bit more 
-# independent from data
+compute_squeeze <- function(dist_coph, replicats){
+  
+  nb_cluster <- ncol(dist_coph)/sum(replicats)

-##### correspondance des feuilles à implémenter
-##### test apparié
-###############################################################################
-#' Titre
-#'
-#' Description
-#'
-#' Long description (several lines)
-#'
-#' @param trees1 List of trees (\code{\link[stats]{hclust}} types) of the first 
-#' condition
-#' @param trees2 list of trees (\code{\link[stats]{hclust}} types) of the second
-#' condition
-#' @param scale Logical. If \code{TRUE} (the default value), the trees are all
-#' rescalled to have a minimum height equal to 0 (translation) and a maximum
-#' height equal to 1 (dilatation)
-#' @param threshold.p proportion threshold for the percentage of cophenetic 
-#' distance of a given entry across trees above which the entry is not 
-#' considered. Default to \code{0.1}
-#' @param nsim number of simulations performed for the estimation of the 
-#' empirical distribution. Default to \code{10000}
-#'
-#' @return A list with classes \code{htest} and \code{tree_test} containing the
-#' following components: \describe{ ## TODO
-#'   \item{\code{statistics}}{ the value of the observed statistic (Hotelling 
-#'   based statistics).}
-#'   \item{\code{parameter}}{ the rank of the observed statistic.}
-#'   \item{\code{p.value}}{ the pseudo p-value of the test.}
-#'   \item{\code{p.value_Simes}}{the p-value of the test obtained by Simes aggregation 
-#'   of univariate p-values}
-#'   \item{\code{alternative}}{ a character string describing the alternative 
-#'   hypothesis (always \code{"greater"} here).}
-#'   \item{\code{method}}{ a character string giving the method used.}
-#'   \item{\code{data.name}}{ a character string giving the name(s) of the data,
-#'   and the number of simulations.}
-#'   \item{\code{res}}{ a list with components \code{simulations} (\code{nsim}
-#'   simulated values of statistic, final value is observed statistic),
-#'   \code{cophenetics} (the cophenetic distances of the input trees),
-#'   \code{truncated_cophenetics} (the selected cophenetic distances after 
-#'   truncation) and \code{truncated_indices} (the indices of the entries 
-#'   corresponding to selected cophenetic distances after truncation),
-#'   \code{p} (the effective number of pairs of leaves after filtering),
-#'   \code{d0} (the additional degrees of freedom for the moderated test statistic),
-#'   \code{d} (the initial degrees of freedom for the moderated test statistic),
-#'   \code{Tk} (the vector of p test statistics before aggregation),
-#'   \code{pk} (the  vector of p p-values corresponding to Tk)}
-#' }
-#'
-#' @seealso \code{\link{fisher_mix}} to obtain the empirical distribution under 
-#' the null hypothesis and \code{\link{compute_stats}} for intermediate 
-#' computation of statistics
-#' 
-#' @note The returned p-value is based on the rank of the observed statistics
-#' when compared to all simulated values obtained from the theoretical 
-#' distribution. For \code{nsim = 99}, for instance, it can not be smaller than
-#' 0.01.
-#' 
-#' @references TODO
-#'
-#' @examples  
-#' set.seed(12081238) # for reproducibility
-#' # data: generated from a normal distribution with nothing special
-#' base_data <- matrix(rnorm(2000), nrow = 100, ncol = 200)
-#' 
-#' # then create two groups by simply subsampling the columns differently
-#' group1 <- replicate(20, sample(1:100, 50, replace = FALSE))
-#' group2 <- replicate(20, sample(101:200, 50, replace = FALSE))
-#' conditions <- factor(rep(c(1, 2), each = 20))
-#' 
-#' # obtain hclust for the two groups
-#' trees <- apply(cbind(group1, group2), 2, function(asample) {
-#' samples <- base_data[ ,asample]
-#'   out <- hclust(dist(samples), method = "ward.D2")
-#'     return(out)
-#' })
-#' 
-#' trees1 <- trees[1:20]
-#' trees2 <- trees[21:40]
-#' treediff(trees1, trees2, nsim = 1e5)$p.value
-#' 
-#' # with a weaker threshold
-#' treediff(trees1, trees2, threshold.p = 0.2, nsim = 1e5)
-#' @export
-#' 
-#' @importFrom stats punif
-#' @importFrom limma squeezeVar
+  groups1 <- rep(1:nb_cluster, each = replicats[1])
+  groups2 <- rep(1:nb_cluster, each = replicats[2])

-treediff <- function(trees1, trees2, scale = TRUE, threshold.p = 0.1,
-                     nsim = 10000) {
-  all_stats <- compute_stats(trees1, trees2, scale, threshold.p)
+  n1 <- replicats[1]
+  n2 <- replicats[2]
+  
+  col1 <- 1 : length(groups1)
+  col2 <- 1 : length(groups2) + length(col1)
  
-  # for standard htest outputs
-  xname1 <- deparse(substitute(trees1))
-  xname2 <- deparse(substitute(trees2))
-  data_name <- paste("list1:", xname1, "\nlist2:", xname2,
-                     "\nnumber of simulations + 1:", nsim + 1, "\n")
+  # Means per groups and conditions
+  average_coph_trees1 <- by(t(dist_coph[,col1]), groups1, colMeans)
+  condition1 <- as.vector(Reduce(cbind,average_coph_trees1))
  
-  squeezed_var <- squeezeVar(all_stats$variances, 
-                             df = length(all_stats$conditions) - 2,
-                             robust = FALSE)
-  all_var <- squeezed_var$var.post
+  average_coph_trees2 <- by(t(dist_coph[,col2]), groups2, colMeans)
+  condition2 <- as.vector(Reduce(cbind,average_coph_trees2))

-  denominator <- all_var * length(all_stats$conditions) / 
-    prod(table(all_stats$conditions))
+  len1 <- sapply(average_coph_trees1, length)
+  cluster1 <- rep(1:nb_cluster, len1)
  
-  # computation of Hotelling
-  sqT <- all_stats$numerator / sqrt(denominator)
-  Tk <- sqT
-  sqT <- crossprod(sqT) / length(all_var) # T^2/p
-  sqT <- as.numeric(sqT)
-  names(sqT) <- "statistic"
+  cluster <- cluster1
  
-  stopifnot(!is.na(sqT)) ## sanity check
+  average_coph <- data.frame(condition1, condition2, cluster)
+
+  average_coph_woNA <- na.omit(average_coph)
  
-  # corresponding distribution under null hypothesis
-  p <- length(all_var)
-  d0 <- squeezed_var$df.prior
-  d <- length(all_stats$conditions)
-  simulated <- fisher_mix(nsim, p = p, nobs = d0 + d)
+  # Variance
+  sq_average_coph <- sweep(average_coph_woNA[-3]^2, 2, replicats, "*")
  
-  # computation of the p-value (same as in standard permutation tests in R)
-  parameter <- rank(c(sqT, simulated))[1]
-  names(parameter) <- "observed rank"
-  diff <- length(simulated) - parameter
-  diff <- ifelse(diff > 0, diff, 0)
-  pvalue <- punif((diff + 1) / (length(simulated) + 1))
-  pvalue <- unname(pvalue)
+  sum_sq_coph_trees1 <- unlist(by(t(dist_coph[,col1])^2, groups1, colSums))
+  sum_sq_coph_trees2 <- unlist(by(t(dist_coph[,col2])^2, groups2, colSums))
+  sum_sq_coph <- data.frame(na.omit(sum_sq_coph_trees1), na.omit(sum_sq_coph_trees2))
+
+  variances <- sum_sq_coph - sq_average_coph
+  variances <- rowSums(variances)
+  variances <- as.vector(variances / (n1 + n2 - 2))
+  
+  # Squeeze variance
+  squeezed_var <- squeezeVar(variances, df = n1 + n2 - 2, robust = FALSE)
  
-  # univariate p-values
-  pk <- 1 - pf(Tk^2, df1 = 1, df2 = d0 + d - 2)
-  # pk <- 2*(1 - pt(abs(Tk), d0 + d - 2)) # should be identical to pk
+  return(list("average_coph" = average_coph_woNA, "squeezed_var" = squeezed_var))
+}

-  # Simes aggregation
-  pvalue_Simes <- min(sort(pk)/(1:p))*p
+compute_pvalue <- function(average_coph, squeezed_var, replicats){
  
-  # preparing outputs
-  out <- list("statistic" = sqT, "parameter" = parameter, 
-              "p.value" = pvalue,
-              "p.value_Simes" = pvalue_Simes,
-              "alternative" = "greater", 
-              "method" = "Tree test based on Fisher mix simulations",
-              "data.name" = data_name,
-              "res" = list("simulations" = c(simulated, sqT),
-                           "cophenetics" = all_stats$cophenetics,
-                           "truncated_cophenetics" = all_stats$truncated_cophenetics,
-                           "truncated_indices" = all_stats$truncated_indices,
-                           "p" = p,
-                           "d0" = d0,
-                           "d" = d,
-                           "Tk" = Tk,
-                           "pk" = pk))
-  class(out) <- c("htest", "tree_test")
+  # Statistics
+  cluster <- average_coph$cluster
+  numerator <- Reduce("-", average_coph[,-3])
+ 
+  n1 <- replicats[1]
+  n2 <- replicats[2]
+  
+  denominator <- squeezed_var$var.post * (n1 + n2) / (n1 * n2)
+  statistics <- numerator / sqrt(denominator)
+  
+  #P-values
+  d0 <- squeezed_var$df.prior
+  p.value <- 2*(1 - pt(abs(statistics), d0 + n1 + n2 - 2))
+  
+  nb_rep <- rep(as.vector(table(cluster)))
+  p <- rep(nb_rep, nb_rep)
+  
+  out <- data.frame ("statistics" = statistics, "p.value" = p.value, "cluster" = factor(cluster), "p"= p)
  
  return(out)
 }

-
-#' Titre
-#'
-#' Description
-#'
-#' Long description (several lines)
-#'
-#' @param trees1 List of trees (\code{\link[stats]{hclust}} types) of the first 
-#' condition
-#' @param trees2 list of trees (\code{\link[stats]{hclust}} types) of the second
-#' condition
-#' @param scale Logical. If \code{TRUE} (the default value), the trees are all
-#' rescalled to have a minimum height equal to 0 (translation) and a maximum
-#' height equal to 1 (dilatation)
-#' @param threshold.p proportion threshold for the percentage of cophenetic 
-#' distance of a given entry across trees above which the entry is not 
-#' considered. Default to \code{0.1}
-#'
-#' @return A list with classes \code{htest} and \code{tree_test} containing the
-#' following components: \describe{ 
-#'   \item{\code{conditions}}{ the experiment conditions}
-#'   \item{\code{cophenetics}}{ the cophenetic distances of the input trees}
-#'   \item{\code{truncated_cophenetics}}{ the selected cophenetic distances 
-#'   after truncation}
-#'   \item{\code{truncated_indices}}{ the indices of the entries corresponding 
-#'   to selected cophenetic distances after truncation}
-#'   \item{\code{numerator}}{the numerator of the t-test statistics} 
-#'   \item{\code{variances}}{the variances of the cophenetic entries with
-#'   selected indices}
-#' }
-#' # FIX IT (numerator: not really)
-#'
-#' @seealso \code{\link{treediff}} to perform the test
-#' 
-#' @references TODO
-#'
-#' @examples  
-#' set.seed(12081238) # for reproducibility
-#' # data: generated from a normal distribution with nothing special
-#' base_data <- matrix(rnorm(2000), nrow = 100, ncol = 200)
-#' 
-#' # then create two groups by simply subsampling the columns differently
-#' group1 <- replicate(20, sample(1:100, 50, replace = FALSE))
-#' group2 <- replicate(20, sample(101:200, 50, replace = FALSE))
-#' conditions <- factor(rep(c(1, 2), each = 20))
-#' 
-#' # obtain hclust for the two groups
-#' trees <- apply(cbind(group1, group2), 2, function(asample) {
-#' samples <- base_data[ ,asample]
-#'   out <- hclust(dist(samples), method = "ward.D2")
-#'     return(out)
-#' })
-#' 
-#' trees1 <- trees[1:20]
-#' trees2 <- trees[21:40]
-#' compute_stats(trees1, trees2)$variances
-#' 
-#' @export
-#' 
-#' @importFrom stats cophenetic
-#' 
-
-compute_stats <- function(trees1, trees2, scale = TRUE, threshold.p = 0.1) {
-  trees <- c(trees1, trees2)
-  conditions <- factor(c(rep(1, length(trees1)), rep(2, length(trees2))))
-  rm(trees1, trees2)
-  
-  input_classes <- sapply(trees, function(object) class(object)[1])
-  if (any(input_classes != "hclust"))
-    stop("'trees' must be a list of 'hclust' objects.", call. = TRUE)
+treediff <- function(trees1, trees2, replicats){
  
-  # obtain cophenetic distances from the trees
-  coph_dist <- sapply(trees, cophenetic, simplify = FALSE)
+  cluster <- length(trees1)/replicats[1]
+  if (length(trees1[[1]]$order) == length(trees1[[length(trees1)]]$order)){
+    leaves <- TRUE
+  } else{
+    leaves <- FALSE
+  }
  
-  # normalize
-  if (scale)
-    coph_dist <- normalize_trees(coph_dist)
+  trees <- c(trees1, trees2)
  
-  # extract cophenetic vector
-  coph_vect <- sapply(coph_dist, function(adist) {
+  # cophenetic distance
+  coph_dist <- sapply(trees, cophenetic, simplify = FALSE)
+
+  coph_list_vect <- sapply(coph_dist, function(adist) {
    adist <- as.matrix(adist)
    return(adist[upper.tri(adist)])
  })
  
-  # remove 1...
-  max_indices <- rowSums(coph_vect == 1)
-  if (all(max_indices == 1)) {
-    stop("All pairs of leaves have at least one cophenetic distance equal to 1 (across all trees)")
+  if (cluster != 1 & leaves == FALSE){
+    len_max <- max(unlist(lapply(coph_list_vect, length)))
+    
+    coph_list_vect <- lapply(coph_list_vect, FUN = function(col){
+      col <- c(col, rep(NA, len_max - length(col)))
+    })
+    
+    coph_vect <- c()
+    for (i in 1:length(coph_list_vect)){
+      coph_vect <- cbind(coph_vect, coph_list_vect[[i]])
+    }
+  }else{
+    coph_vect <- coph_list_vect
  }
-  # get rid of those with *p%* of distances at 1
-  truncated_indices <- which(max_indices <= threshold.p * ncol(coph_vect))
-  coph_vect <- coph_vect[truncated_indices, , drop = FALSE]
-  
-  # average cophenetic distances by condition (bar(X)^k and bar(Y)^k)
-  average_coph <- by(t(coph_vect), conditions, colMeans)
-  for_numerator <- Reduce("-", average_coph)
-  
-  ## note: here, I am using sum_i X_i^2 - n bar(X)^2 (by cophenetic distance and condition)
-  average_coph <- Reduce(cbind, average_coph) # 2 columns with bar{X} and bar{Y}
-  sq_average_coph <- sweep(average_coph^2, 2, as.vector(table(conditions)), "*") # n bar(X)^2 and n bar(Y)^2
-  
-  sum_sq_coph <- by(t(coph_vect)^2, conditions, colSums) 
-  sum_sq_coph <- Reduce(cbind, sum_sq_coph) # 2 columns with \sum X_i^2 and \sum Y_i^2
-  
-  all_var <- sum_sq_coph - sq_average_coph # 2 columns: sum_i X_i^2 - n bar(X)^2 and same with Y
-  all_var <- rowSums(all_var) # sum_i X_i^2 - n bar(X)^2 + same with Y
-  all_var <- all_var / (length(conditions) - 2) # sigma^2

-  out <- list("conditions" = conditions,
-              "cophenetics" = coph_dist,
-              "truncated_cophenetics" = coph_vect,
-              "truncated_indices" = truncated_indices,
-              "numerator" = for_numerator,
-              "variances" = all_var)
+  outs <- compute_squeeze(coph_vect, replicats)
+   
+  outp <- compute_pvalue(outs$average_coph, outs$squeezed_var, replicats)
  
-  return(out)
+   # pvalue aggre
+   out_aggr <- outp %>% group_by(cluster) %>%
+     summarise("p.value" = min(sort(p.value) / (1:p)) * p) %>% unique()
+
+   out <- list("method" = "Tree test based on t-test",
+               "data.name" = paste(substitute(trees1), "and", substitute(trees2)),
+               "p.value" = out_aggr$p.value,
+               "res" = list("statistic" = outp$statistics,
+                            "p.value.indiv" = outp$p.value))
+   
+   class(out) <- "htest" 
+   
+   return(out)
 }
\ No newline at end of file