diff --git a/R/treediff.R b/R/treediff.R index e128bd307af867461faa6bfeeb173a732310819e..923732df8389f86e0b173e7b5ce698fb8c748712 100644 --- a/R/treediff.R +++ b/R/treediff.R @@ -1,12 +1,26 @@ -#' Treediff +#' @title Treediff #' -#' The treediff function calculates the difference between two trees. +#' @description Comparison of two sets of trees. #' -#' @param trees1 A list of trees. +#' @details This function compares two sets of trees using a p-value aggregation +#' method. The p-values are obtained by performing a student test on the averaged +#' cophenetic distances of the two sets of trees. +#' +#' @param trees1 A list of trees . #' @param trees2 A list of trees. -#' @param replicats A vector of the number of replicates for each group. +#' @param replicates A vector of the number of replicates for each group. +#' +#' @return A list with class "htest": +#' \item{p.value}{the p-value for the tree test.} +#' \item{statistic}{the value of the statistic of each leaf peer of the tree test.} +#' \item{p.value.indiv}{the p-value for each leaf peer of the tree test.} +#' \item{method}{a character string indicating what type of test was performed.} +#' \item{data.name}{a character string giving the names of the tree subset} +#' +#' @author +#' +#' @references #' -#' @return A data frame with statistics and p-values. #' @export #' #' @importFrom dplyr %>% @@ -32,29 +46,23 @@ #' trees1 <- trees[1:16] #' trees2 <- trees[17:40] #' -#' replicats <- c(4,6) +#' replicates <- c(4,6) #' -#' tree_pvals <- treediff(trees1, trees2, replicats) +#' tree_pvals <- treediff(trees1, trees2, replicates) #' tree_pvals$p.value -treediff <- function(trees1, trees2, replicats){ - # Check if the length of replicats is 2 - if (length(replicats) != 2){ - stop("Incorrect replicats size, `replicats` must be a vector of 2 elements") +treediff <- function(trees1, trees2, replicates){ + + # Check if the length of replicates is 2 + if (length(replicates) != 2){ + stop("Incorrect replicates size, `replicates` must be a vector of 2 elements") } # Check if the number of clusters is equal for both conditions - if (length(trees1) / replicats[1] != length(trees2) / replicats[2]){ - stop("The number of clusters is different between conditions or `replicats' is uncorrect") + if (length(trees1) / replicates[1] != length(trees2) / replicates[2]){ + stop("The number of clusters is different between conditions or `replicates' is uncorrect") } - # # Extract the number of pair of the trees - # tree_order <- lapply(trees1, "[[", "order") - # leaves <- sapply(tree_order, length) - # - # # Get unique values of the number of leaves - # leaves <- unique(leaves) - # Merge trees from both conditions trees <- c(trees1, trees2) @@ -67,31 +75,23 @@ treediff <- function(trees1, trees2, replicats){ return(adist[upper.tri(adist)]) }) - # # Fill the cophenetic vector with NA if the number of leaves is different from the max. - # if (length(leaves) != 1){ - # len_max <- choose(max(leaves), 2) - # - # coph_vect <- sapply(coph_vect, FUN = function(col) { - # c(col, rep(NA, len_max - length(col))) - # }) - # } - # Compute squeeze factor - outs <- compute_squeeze(coph_vect, replicats) + outs <- compute_squeeze(coph_vect, replicates) # Compute p-values - outp <- compute_pvalue(outs$average_coph, outs$squeezed_var, replicats) + outp <- compute_pvalue(outs$average_coph, outs$squeezed_var, replicates) # Aggregate p-values out_aggr <- outp %>% group_by(cluster) %>% - summarise("p.value" = min(sort(p.value) / (1:p)) * p) %>% unique() + summarise("p.value" = min(sort(p.value) / (1:p)) * p) %>% + unique() # Store results in a list out <- list("method" = "Tree test based on t-test", "data.name" = paste(substitute(trees1), "and", substitute(trees2)), "p.value" = out_aggr$p.value, - "res" = list("statistic" = outp$statistics, - "p.value.indiv" = outp$p.value)) + "statistic" = outp$statistics, + "p.value.indiv" = outp$p.value) # Assign class to the list class(out) <- "htest" @@ -100,9 +100,9 @@ treediff <- function(trees1, trees2, replicats){ return(out) } -compute_squeeze <- function(dist_coph, replicats){ +compute_squeeze <- function(dist_coph, replicates){ - if (class(dist_coph) == "list"){ + if (inherits(dist_coph, "list") == TRUE){ max_pr <- as.numeric(max(summary(dist_coph)[,1])) dist_coph <- sapply(dist_coph, FUN = function(col) { c(col, rep(NA, max_pr - length(col))) @@ -110,14 +110,14 @@ compute_squeeze <- function(dist_coph, replicats){ } # Calculate number of clusters - nb_cluster <- ncol(dist_coph)/sum(replicats) + nb_cluster <- ncol(dist_coph)/sum(replicates) - groups1 <- rep(1:nb_cluster, each = replicats[1]) - groups2 <- rep(1:nb_cluster, each = replicats[2]) + groups1 <- rep(1:nb_cluster, each = replicates[1]) + groups2 <- rep(1:nb_cluster, each = replicates[2]) - # Store replicats values for each group - n1 <- replicats[1] - n2 <- replicats[2] + # Store replicates values for each group + n1 <- replicates[1] + n2 <- replicates[2] # Indices for each group col1 <- 1:length(groups1) @@ -143,7 +143,7 @@ compute_squeeze <- function(dist_coph, replicats){ average_coph_woNA <- na.omit(average_coph) # Calculate variance - sq_average_coph <- sweep(average_coph_woNA[-3]^2, 2, replicats, "*") + sq_average_coph <- sweep(average_coph_woNA[-3]^2, 2, replicates, "*") # Sum of squared values for each group sum_sq_coph_trees1 <- unlist(by(t(dist_coph[,col1])^2, groups1, colSums)) @@ -162,7 +162,7 @@ compute_squeeze <- function(dist_coph, replicats){ "squeezed_var" = squeezed_var)) } -compute_pvalue <- function(average_coph, squeezed_var, replicats){ +compute_pvalue <- function(average_coph, squeezed_var, replicates){ # Extract the cluster information from the average_coph data frame cluster <- average_coph$cluster @@ -170,9 +170,9 @@ compute_pvalue <- function(average_coph, squeezed_var, replicats){ # Calculate the numerator for the t-statistic numerator <- Reduce("-", average_coph[,-3]) - # Store replicats values for each group - n1 <- replicats[1] - n2 <- replicats[2] + # Store replicates values for each group + n1 <- replicates[1] + n2 <- replicates[2] # Calculate the denominator for the t-statistic denominator <- squeezed_var$var.post * (n1 + n2) / (n1 * n2) diff --git a/man/treediff.Rd b/man/treediff.Rd index 49cab52ad24499a4060222ba3789c6704075ba8a..9715feed9074b8096fe27c1ba1d7994f2619d5c1 100644 --- a/man/treediff.Rd +++ b/man/treediff.Rd @@ -4,20 +4,30 @@ \alias{treediff} \title{Treediff} \usage{ -treediff(trees1, trees2, replicats) +treediff(trees1, trees2, replicates) } \arguments{ -\item{trees1}{A list of trees.} +\item{trees1}{A list of trees .} \item{trees2}{A list of trees.} -\item{replicats}{A vector of the number of replicates for each group.} +\item{replicates}{A vector of the number of replicates for each group.} } \value{ -A data frame with statistics and p-values. +A list with class "htest": +\item{p.value}{the p-value for the tree test.} +\item{statistic}{the value of the statistic of each leaf peer of the tree test.} +\item{p.value.indiv}{the p-value for each leaf peer of the tree test.} +\item{method}{a character string indicating what type of test was performed.} +\item{data.name}{a character string giving the names of the tree subset} } \description{ -The treediff function calculates the difference between two trees. +Comparison of two sets of trees. +} +\details{ +This function compares two sets of trees using a p-value aggregation +method. The p-values are obtained by performing a student test on the averaged +cophenetic distances of the two sets of trees. } \examples{ @@ -34,8 +44,8 @@ return(out) trees1 <- trees[1:16] trees2 <- trees[17:40] -replicats <- c(4,6) +replicates <- c(4,6) -tree_pvals <- treediff(trees1, trees2, replicats) +tree_pvals <- treediff(trees1, trees2, replicates) tree_pvals$p.value }