diff --git a/NAMESPACE b/NAMESPACE index 73886567a8bed37394b8007ee961f765b234b288..c69ae31fb177b19cc3db5f1898260f28878145b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,11 @@ # Generated by roxygen2: do not edit by hand +S3method(clusterTree,default) +S3method(clusterTree,list) +S3method(normalizeCount,HiCDOCDataSet) +S3method(normalizeCount,HiCDOCDataSetList) +S3method(normalizeCount,default) +S3method(normalizeCount,list) S3method(print,treeTest) S3method(summary,treeTest) export(HiC2Tree) diff --git a/R/HiC2Tree.R b/R/HiC2Tree.R index d1615779578d3111a5b911c97f3a3b0e9c1d611b..3dc738e8f21b34da291160c51ded2d5d1c69a1f4 100644 --- a/R/HiC2Tree.R +++ b/R/HiC2Tree.R @@ -26,89 +26,68 @@ #' \item{testRes}{ A list of treediff results for each cluster.} #' } #' +#' @example +#' dd <- "../data/" +#' replicat <- 1:3 +#' cond <- c("90", "110") +#' +#' all_begins <- interaction(expand.grid(replicat, cond), sep = "-") +#' all_begins <- as.character(all_begins) +#' +#' nb_chr <- 2 +#' chromosomes <- 1:nb_chr +# +#' all_mat_chr <- lapply(chromosomes, function(chr){ +#' all_mat <- lapply(all_begins, function(ab) { +#' mat_file <- paste0(dd, "Rep", ab, "-chr", chr, "_200000.bed") +#' }) +#' all_mat <- unlist(all_mat) +#' }) +#' +#' index <- paste0(dd, "index.200000.longest18chr.abs.bed") +#' format <- rep(rep("HiC-Pro", 6), nb_chr) +#' binsize <- 200000 +#' files <- unlist(all_mat_chr) +#' replicates <- c(3,3) +#' +#' HiC2Tree(files, format, binsize, index, chromosomes, replicates) +#' #' @export HiC2Tree <- function(files, format, binsize = NULL, index = NULL, chromosomes, replicates) { + if (length(files) != length(chromosomes) * sum(replicates)){ + stop("`files`, `chromosomes` or `replicates` is incorrectly filled in.") + } + # create an HiCDOCDataSet object HiCDOCDataSet <- HiCDOCDataSet(files, format, binsize, chromosomes, index) # Extract the HiCDOCDataSet and index_mat_chr variables - res <- HiCDOCDataSet$interactionMat + res <- HiCDOCDataSet$HiCDOCDataSet index_mat_chr <- HiCDOCDataSet$index_mat_chr # Normalize the count data using cyclic loess - norm_matrices <- lapply(res, normalizeCount) - - # Create cluster for each chromosome - cur_clust <- lapply(norm_matrices, create_cluster) - - # Create a list of trees - all_trees <- lapply(seq_along(cur_clust), function(i) { - - # List of all clusters - all_clusters <- unique(cur_clust[[i]][, 1]) - - # Create a tree for each cluster - trees <- sapply(all_clusters, function(ac) { - - # Get the indices of the rows and columns in the cluster - selected <- rownames(cur_clust[[i]])[cur_clust[[i]] == ac] - - # Select the sub-matrix - red_mat <- ((norm_matrices[[i]]$index1 %in% selected) & - (norm_matrices[[i]]$index2 %in% selected)) - red_mat <- norm_matrices[[i]][red_mat, ] + norm_matrices <- normalizeCount(res) - # Create trees - trees <- create_trees(red_mat, selected) - - return(trees) - }, simplify = FALSE) + res_clus <- clusterTree(norm_matrices) + nb_cluster <- sapply(chromosomes, function(chr) { + max(res_clus$metadata$cluster[which(res_clus$metadata$chromosome == chr)]) }) - # Count the number of clusters for each chromosome - nb_cluster <- sapply(all_trees, length) - - # Count the number of matrices for each chromosome - nb_mat <- sapply(chromosomes, - function(chr) sum(index_mat_chr$chromosome == chr)) - - # Create a vector of chromosomes for each tree - met_chr <- rep(chromosomes, nb_cluster * nb_mat) - - # Create a vector of cluster numbers for each tree - met_cluster <- sapply(seq_along(chromosomes), - function(i) rep(seq(nb_cluster[i]), each = nb_mat[i])) - - # Create a vector of file names for each tree - files <- lapply(seq_along(chromosomes), function(i) - rep(files[which(index_mat_chr$chromosome == chromosomes[i])], nb_cluster[i])) - - all_trees <- unlist(unlist(all_trees, recursive = FALSE), recursive = FALSE) - - # Give each tree a name - names <- paste0("tree", seq_along(all_trees)) - names(all_trees) <- names - clus_sum <- sum(nb_cluster) T1 <- rep(rep(c(TRUE, FALSE), replicates), clus_sum) T2 <- rep(rep(c(FALSE, TRUE), replicates), clus_sum) - trees1 <- all_trees[T1] - trees2 <- all_trees[T2] - - # Add metadata to the trees - metadata <- data.frame("names" = names, "chromosome" = met_chr, - "cluster" = unlist(met_cluster), - "file" = unlist(files)) + trees1 <- res_clus$trees[T1] + trees2 <- res_clus$trees[T2] testRes <- treediff(trees1, trees2, replicates) # Return a list containing the trees, metadata, and indexData - return(list("trees" = all_trees, "metadata" = metadata, + return(list("trees" = res_clus$trees, "metadata" = res_clus$metadata, "index" = HiCDOCDataSet$indexData, "testRes" = testRes)) } @@ -131,7 +110,8 @@ HiC2Tree <- function(files, format, binsize = NULL, index = NULL, #' #' @return A list containing the following objects: #' \describe{ -#' \item{interactionMat}{A list of interaction matrices, one for each file} +#' \item{HiCDOCDataSet}{A list of interaction matrices of the HiCDOCDataSet +#' class of the HiCDOC package, one for each file} #' \item{indexData}{A data frame of index data for each interaction in the #' matrices.} #' \item{index_mat_chr}{A data frame containing the name of the matrices and @@ -150,11 +130,39 @@ HiC2Tree <- function(files, format, binsize = NULL, index = NULL, #' @importFrom purrr reduce #' @importFrom SummarizedExperiment assay #' +#' @example +#' dd <- "../data/" +#' replicat <- 1:3 +#' cond <- c("90", "110") +#' +#' all_begins <- interaction(expand.grid(replicat, cond), sep = "-") +#' all_begins <- as.character(all_begins) +#' +#' nb_chr <- 2 +#' chromosomes <- 1:nb_chr +# +#' all_mat_chr <- lapply(chromosomes, function(chr) { +#' all_mat <- lapply(all_begins, function(ab) { +#' mat_file <- paste0(dd, "Rep", ab, "-chr", chr, "_200000.bed") +#' }) +#' all_mat <- unlist(all_mat) +#' }) +#' +#' index <- paste0(dd, "index.200000.longest18chr.abs.bed") +#' format <- rep(rep("HiC-Pro", 6), nb_chr) +#' binsize <- 200000 +#' files <- unlist(all_mat_chr) +#' replicates <- c(3,3) +#' +#' DataFrameSet(files, format, binsize, chromosomes, index) +#' #' @export HiCDOCDataSet <- function(files, format, binsize = NULL, chromosomes, index = NULL) { + if (is.null(chromosomes))stop("`chromsomes` is empty.") + if (length(files) != length(format)) { stop("`files` or `format` is incorrectly filled in.") } @@ -166,19 +174,14 @@ HiCDOCDataSet <- function(files, format, binsize = NULL, chromosomes, }) lapply(unique(format), function(var) { - if (var == "HiC-Pro" & is.null(index)) { - stop("`index` is empty.") - } + if (var == "HiC-Pro" & is.null(index)) stop("`index` is empty.") }) lapply(unique(format), function(var) { - if (var != "tabular" & is.null(binsize)) { - stop("`binsize` is empty.") - } else { + if (var != "tabular" & is.null(binsize)) stop("`binsize` is empty.") + else { if (!is.null(binsize)){ - if (!is.numeric(binsize)) { - stop("`binsize` is not a numeric vector.") - } + if (!is.numeric(binsize)) stop("`binsize` is not a numeric vector.") } } }) @@ -204,6 +207,12 @@ HiCDOCDataSet <- function(files, format, binsize = NULL, chromosomes, return(HiCDOCDataSet) }) + if (length(HiCDOCDataSet) > 1) { + class(HiCDOCDataSet) <- "HiCDOCDataSetList" + } else { + HiCDOCDataSet <- HiCDOCDataSet[[1]] + } + # Create a matrix of chromosomes and matrix index_mat_chr <- sapply(HiCDOCDataSet, function(data) { chromosome <- data@chromosomes @@ -216,7 +225,7 @@ HiCDOCDataSet <- function(files, format, binsize = NULL, chromosomes, index_mat_chr$chromosome <- unlist(index_mat_chr$chromosome) index_mat_chr$variables <- unlist(index_mat_chr$variables) - mat_count <- lapply(chromosomes, function(chr) { + indexData <- lapply(chromosomes, function(chr) { dataSet <- HiCDOCDataSet[index_mat_chr$chromosome == chr] @@ -228,40 +237,17 @@ HiCDOCDataSet <- function(files, format, binsize = NULL, chromosomes, indexData <- unique(indexData) indexData <- indexData[order(indexData$index),] - - data <- lapply(dataSet, function(data) { - interaction <- interactions(data) - interaction <- as.data.frame(interaction) - interaction <- cbind(interaction[, c(6,12)], assay(data)) - }) - - interaction_mat <- suppressWarnings(reduce(data, merge, by = c("index1", "index2"), - all = TRUE)) - names(interaction_mat) <- c("index1", "index2", index_mat_chr$variables[ - index_mat_chr$chromosome == chr]) - - interaction_mat[is.na(interaction_mat)] <- 0 - chr <- rep(chr, nrow(interaction_mat)) - interaction_mat <- cbind("chromosome" = chr,interaction_mat) - - return(list("interaction_mat" = interaction_mat, "indexData" = indexData)) }) - indexData <- lapply(mat_count, function(data) { - indexData <- data$indexData}) - - interaction_mat <- lapply(mat_count, function(data) { - interaction_mat <- data$interaction_mat}) - indexData <- Reduce(rbind, indexData) - return(list("interactionMat" = interaction_mat, "indexData" = indexData, + return(list("HiCDOCDataSet" = HiCDOCDataSet, "indexData" = indexData, "index_mat_chr" = index_mat_chr)) } #' @title Normalize count matrix using cyclic loess #' -#' @deescription This function normalizes the count matrix using loess +#' @description This function normalizes the count matrix using loess #' regression. #' #' @param count_matrice The count matrix to normalize. @@ -289,8 +275,18 @@ HiCDOCDataSet <- function(files, format, binsize = NULL, chromosomes, #' mat <- cbind(chromosome, index1, index2, m) #' #' normalizeCount(mat) +normalizeCount <- function(count_matrice){ + UseMethod("normalizeCount") +} -normalizeCount <- function(count_matrice) { +#' @export +normalizeCount.default <- function(count_matrice) { + + if (!identical(names(count_matrice[ ,c(1:3)]), c("chromosome", "index1", + "index2"))) { + message("Check that the first 3 columns contain in order `chromsome`, + `index1` and `index2`") + } # Extract the count values as a matrix counts <- as.matrix(count_matrice[ ,-c(1:3)]) @@ -314,6 +310,62 @@ normalizeCount <- function(count_matrice) { return(count_matrice) } +#' @export +normalizeCount.list <- function(count_matrice) { + count_matrice <- lapply(count_matrice, normalizeCount.default) + return(count_matrice) +} + +#' @export +normalizeCount.HiCDOCDataSet <- function(count_matrice){ + + chromosome <- count_matrice@chromosomes + + interaction <- interactions(count_matrice) + interaction <- as.data.frame(interaction) + interaction_mat <- cbind(interaction[, c(6,12)], assay(count_matrice)) + + names(interaction_mat) <- c("index1", "index2", "mat_1") + + interaction_mat[is.na(interaction_mat)] <- 0 + chr <- rep(chromosome, nrow(interaction_mat)) + count_matrice <- cbind("chromosome" = chr, interaction_mat) + + return(count_matrice) +} + +#' @export +normalizeCount.HiCDOCDataSetList <- function(count_matrice){ + + matChr <- sapply(count_matrice, function(data) data@chromosomes) + chromosomes <- unique(matChr) + + mat_count <- lapply(chromosomes, function(chr) { + index <- matChr == chr + dataSet <- count_matrice[index] + + data <- lapply(dataSet, function(data) { + interaction <- interactions(data) + interaction <- as.data.frame(interaction) + interaction <- cbind(interaction[, c(6,12)], assay(data)) + }) + + interaction_mat <- suppressWarnings(reduce(data, merge, + by = c("index1", "index2"), all = TRUE)) + names_mat <- paste("mat", which(index), sep = "_") + names(interaction_mat) <- c("index1", "index2", names_mat) + + interaction_mat[is.na(interaction_mat)] <- 0 + chr <- rep(chr, nrow(interaction_mat)) + interaction_mat <- cbind("chromosome" = chr, interaction_mat) + + return(interaction_mat) + }) + + mat_count <- normalizeCount.list(mat_count) + return(mat_count) +} + #' @title Create hierarchical clustering trees for each cluster in a given #' matrix #' @@ -322,40 +374,114 @@ normalizeCount <- function(count_matrice) { #' using the "broken stick" method and then converts the clusters into trees. #' #' @param mat A matrix containing the data to cluster. It should have columns -#' named 'index1', 'index2', 'chromosome and one colonne for each matrices. -#' -#' @return A list of hierarchical clustering trees, one for each cluster in -#' the matrix. +#' named 'index1', 'index2', 'chromosome and one column for each matrices. #' +#' @return +#' @return A list containing the following objects: +#' \describe{ +#' \item{trees}{ A list of hierarchical clustering trees, one for each cluster +#' in the matrix.} +#' \item{metadata}{ A data frame containing the following columns: names (name +#' of each tree), chromosome, cluster, and file. } +#' } #' #' @importFrom adjclust adjClust #' @importFrom adjclust select #' @importFrom stats hclust #' #' @export +#' +#' @example +#' n <- 5 +#' +#' matrice <- matrix(runif(n*n), nrow = n, ncol = n) +#' matrice[lower.tri(matrice)] <- t(matrice)[lower.tri(matrice)] +#' matrice <- as.data.frame(matrice) +#' names(matrice) <- c("mat_1", "mat_2", "mat_3", "mat_4", "mat_5") +#' +#' chromosome <- rep(1, n) +#' index1 <- sample(1:100, n, replace = TRUE) +#' index2 <- sample(1:100, n, replace = TRUE) +#' +#' mat <- cbind(chromosome, index1, index2, matrice_symetrique) +#' +#' res <- clusterTree(mat) +clusterTree <- function(mat){ + UseMethod("clusterTree") +} + +#' @export +clusterTree.default <- function(mat) { + + if (!inherits(mat, "data.frame")){ + stop("`mat` is not a `data.frame`") + } + + if (!identical(names(mat[ ,c(1:3)]), c("chromosome", "index1", + "index2"))) { + message("Check that the first 3 columns contain in order `chromsome`, + `index1` and `index2`") + } -clusterTree <- function(mat) { # Create cluster for each chromosome cur_clust <- create_cluster(mat) - # List of all clusters - all_clusters <- unique(cur_clust) + # List of all clusters + all_clusters <- unique(cur_clust) + + # Create a tree for each cluster + trees <- sapply(seq_along(all_clusters$merged_clust), function(ac) { + + # Get the indices of the rows and columns in the cluster + selected <- rownames(cur_clust)[cur_clust == ac] + + # Select the sub-matrix + red_mat <- ((mat$index1 %in% selected) & (mat$index2 %in% selected)) + red_mat <- mat[red_mat, ] + + # Create trees + trees <- create_trees(red_mat, selected) + + return(trees) + }, simplify = FALSE) + + chromosome <- unique(mat$chromosome) + nb_cluster <- length(trees) + + nb_mat_chr <- unique(sapply(trees, length)) + + chromosomes <- rep(chromosome, nb_cluster * nb_mat_chr) + clusters <- rep(1:nb_cluster, each = nb_mat_chr) + + trees <- unlist(trees, recursive = FALSE) + names_mat <- names(trees) + + names <- paste0("tree", seq_along(trees)) + names(trees) <- names + + metadata <- data.frame("names" = names, "chromosome" = chromosomes, + "cluster" = clusters, "mat" = names_mat) + + return(list("trees" = trees, "metadata" = metadata)) + +} + +#' @export +clusterTree.list <- function(mat){ + res <- lapply(mat, clusterTree.default) - # Create a tree for each cluster - trees <- sapply(seq_along(all_clusters$merged_clust), function(ac) { + metadata <- lapply(res, function(data) data$metadata) + metadata <- Reduce(rbind, metadata) - # Get the indices of the rows and columns in the cluster - selected <- rownames(cur_clust)[cur_clust == ac] + trees <- lapply(res, function(data) data$trees) + trees <- unlist(trees, recursive = FALSE) - # Select the sub-matrix - red_mat <- ((mat$index1 %in% selected) & (mat$index2 %in% selected)) - red_mat <- mat[red_mat, ] + names <- paste0("tree", seq_along(trees)) + names(trees) <- names - # Create trees - trees <- create_trees(red_mat, selected) + metadata$names <- names - return(trees) - }, simplify = FALSE) + return(list("trees" = trees, "metadata" = metadata)) } create_cluster <- function(res) { diff --git a/man/HiCDOCDataSet.Rd b/man/HiCDOCDataSet.Rd index e20520be5924dea74cbafa0a6ab9b64a9baa9b3f..b9fd0201f16b26cb9b7d3892b12d1a4128106074 100644 --- a/man/HiCDOCDataSet.Rd +++ b/man/HiCDOCDataSet.Rd @@ -25,7 +25,8 @@ HiC-Pro format. Ignored for other formats.} \value{ A list containing the following objects: \describe{ - \item{interactionMat}{A list of interaction matrices, one for each file} + \item{HiCDOCDataSet}{A list of interaction matrices of the HiCDOCDataSet + class of the HiCDOC package, one for each file} \item{indexData}{A data frame of index data for each interaction in the matrices.} \item{index_mat_chr}{A data frame containing the name of the matrices and diff --git a/man/clusterTree.Rd b/man/clusterTree.Rd index 5234528d31d646c51c58b6ae3fac7cc1f6f1416b..80fa7e20813cf00475fe9d2c71131ec2e571dcea 100644 --- a/man/clusterTree.Rd +++ b/man/clusterTree.Rd @@ -9,11 +9,16 @@ clusterTree(mat) } \arguments{ \item{mat}{A matrix containing the data to cluster. It should have columns -named 'index1', 'index2', 'chromosome and one colonne for each matrices.} +named 'index1', 'index2', 'chromosome and one column for each matrices.} } \value{ -A list of hierarchical clustering trees, one for each cluster in -the matrix. +A list containing the following objects: +\describe{ + \item{trees}{ A list of hierarchical clustering trees, one for each cluster + in the matrix.} + \item{metadata}{ A data frame containing the following columns: names (name + of each tree), chromosome, cluster, and file. } +} } \description{ This function creates a hierarchical clustering tree for each diff --git a/man/normalizeCount.Rd b/man/normalizeCount.Rd index 6c90ea55be09da1d118dfeadbf0dd3fb76e07dfc..1e2453a3d0ccec05957bbcb578c95bb3611a8f67 100644 --- a/man/normalizeCount.Rd +++ b/man/normalizeCount.Rd @@ -15,5 +15,6 @@ normalizeCount(count_matrice) } } \description{ -Normalize count matrix using cyclic loess +This function normalizes the count matrix using loess +regression. }