diff --git a/R/treediff.R b/R/treediff.R index a44c930e1496569a46154fb7b80102fffd77dbd5..172e847142d75cca3f39739f707da9ab01e82095 100644 --- a/R/treediff.R +++ b/R/treediff.R @@ -75,12 +75,13 @@ treediff <- function(trees1, trees2, replicates){ # Check if the length of replicates is 2 if (length(replicates) != 2){ - stop("Incorrect replicates size, `replicates` must be a vector of 2 elements") + stop("`replicates` must be a vector of length 2.") } # Check if the number of clusters is equal for both conditions - if (length(trees1) / replicates[1] != length(trees2) / replicates[2]){ - stop("The number of clusters is different between conditions or `replicates' is uncorrect") + if (length(trees1) / replicates[1] != length(trees2) / replicates[2]) { + stop(paste("The number of clusters is different between conditions (or", + "`replicates' is not correct).")) } # Merge trees from both conditions @@ -90,7 +91,7 @@ treediff <- function(trees1, trees2, replicates){ coph_dist <- sapply(trees, cophenetic, simplify = FALSE) # Convert cophenetic distances to vector - coph_vect <- sapply(coph_dist, function(adist) { + coph_vect <- lapply(coph_dist, function(adist) { adist <- as.matrix(adist) return(adist[upper.tri(adist)]) }) @@ -107,8 +108,9 @@ treediff <- function(trees1, trees2, replicates){ unique() # Store results in a list + data_name <- paste(substitute(trees1), "and", substitute(trees2)) out <- list("method" = "Tree test based on t-test", - "data.name" = paste(substitute(trees1), "and", substitute(trees2)), + "data.name" = data_name, "p.value" = out_aggr$p.value, "statistic" = outp$statistics, "p.value.indiv" = outp$p.value) @@ -120,57 +122,57 @@ treediff <- function(trees1, trees2, replicates){ return(out) } -compute_squeeze <- function(dist_coph, replicates){ - - if (inherits(dist_coph, "list") == TRUE){ - max_pr <- as.numeric(max(summary(dist_coph)[,1])) - dist_coph <- sapply(dist_coph, FUN = function(col) { - c(col, rep(NA, max_pr - length(col))) - }) - } +compute_squeeze <- function(dist_coph, replicates) { # Calculate number of clusters - nb_cluster <- ncol(dist_coph)/sum(replicates) + nb_cluster <- length(dist_coph) / sum(replicates) - groups1 <- rep(1:nb_cluster, each = replicates[1]) - groups2 <- rep(1:nb_cluster, each = replicates[2]) + clusters1 <- rep(1:nb_cluster, each = replicates[1]) + clusters2 <- rep(1:nb_cluster, each = replicates[2]) # Store replicates values for each group n1 <- replicates[1] n2 <- replicates[2] # Indices for each group - col1 <- 1:length(groups1) - col2 <- 1:length(groups2) + length(col1) + set1 <- 1:length(clusters1) + set2 <- 1:length(clusters2) + length(col1) # Average per groups and conditions - average_coph_trees1 <- by(t(dist_coph[, col1]), groups1, colMeans) - condition1 <- as.vector(Reduce(cbind, average_coph_trees1)) - - average_coph_trees2 <- by(t(dist_coph[, col2]), groups2, colMeans) - condition2 <- as.vector(Reduce(cbind, average_coph_trees2)) - - # Cluster vector - len1 <- sapply(average_coph_trees1, length) - cluster1 <- rep(1:nb_cluster, len1) - - cluster <- cluster1 + average_coph_trees1 <- lapply(unique(clusters1), function(acluster) { + where_clust <- which(clusters1 == acluster) + colMeans(Reduce(rbind, dist_coph[where_clust])) + }) + average_coph_trees2 <- lapply(unique(clusters1), function(acluster) { + where_clust <- which(clusters2 == acluster) + length(clusters1) + colMeans(Reduce(rbind, dist_coph[where_clust])) + }) # Merge average values and cluster vector - average_coph <- data.frame(condition1, condition2, cluster) - - # Remove missing values from average_coph - average_coph_woNA <- na.omit(average_coph) + cluster_length <- sapply(average_coph_trees1, length) + average_coph <- data.frame("set1" = Reduce(c, average_coph_trees1), + "set2" = Reduce(c, average_coph_trees2), + "cluster" = rep(unique(clusters1), cluster_length)) # Calculate variance - sq_average_coph <- sweep(average_coph_woNA[-3]^2, 2, replicates, "*") + sq_average_coph <- sweep(average_coph[-3]^2, 2, replicates, "*") # Sum of squared values for each group - sum_sq_coph_trees1 <- unlist(by(t(dist_coph[,col1])^2, groups1, colSums)) - sum_sq_coph_trees2 <- unlist(by(t(dist_coph[,col2])^2, groups2, colSums)) - sum_sq_coph <- data.frame(na.omit(sum_sq_coph_trees1), na.omit(sum_sq_coph_trees2)) + ## FIX IT: PROBLEM WITH VARIANCES HERE + sum_sq_coph_trees1 <- lapply(unique(clusters1), function(acluster) { + where_clust <- which(clusters1 == acluster) + colMeans(Reduce(rbind, dist_coph[where_clust])^2) + }) + sum_sq_coph_trees2 <- lapply(unique(clusters1), function(acluster) { + where_clust <- which(clusters2 == acluster) + length(clusters1) + colMeans(Reduce(rbind, dist_coph[where_clust])^2) + }) + + sum_sq_coph <- data.frame("set1" = Reduce(c, sum_sq_coph_trees1), + "set2" = Reduce(c, sum_sq_coph_trees2), + "cluster" = rep(unique(clusters1), cluster_length)) - variances <- sum_sq_coph - sq_average_coph + variances <- sum_sq_coph[, 1:2] - sq_average_coph variances <- rowSums(variances) variances <- as.vector(variances / (n1 + n2 - 2)) @@ -178,11 +180,10 @@ compute_squeeze <- function(dist_coph, replicates){ squeezed_var <- squeezeVar(variances, df = n1 + n2 - 2, robust = FALSE) # Return list of average_coph and squeezed_var - return(list("average_coph" = average_coph_woNA, - "squeezed_var" = squeezed_var)) + return(list("average_coph" = average_coph, "squeezed_var" = squeezed_var)) } -compute_pvalue <- function(average_coph, squeezed_var, replicates){ +compute_pvalue <- function(average_coph, squeezed_var, replicates) { # Extract the cluster information from the average_coph data frame cluster <- average_coph$cluster