diff --git a/R/HiC2Tree.R b/R/HiC2Tree.R index 9f4abdc26d9bd206039fdc1e6553d232b7044068..ba43ffe1f14cc7ab415edf3d1329092ca8dabe63 100644 --- a/R/HiC2Tree.R +++ b/R/HiC2Tree.R @@ -1,193 +1,251 @@ -HiC2Tree <- function(file, type, binsize, index = NULL, - chromosomes) { - - InteractionDataSet <- InteractionDataSet(file, type, binsize, chromosome, index) - +HiC2Tree <- function(files, format, binsize, index = NULL, + chromosomes){ + + # create an InteractionDataSet object + InteractionDataSet <- InteractionDataSet(files, format, binsize, chromosomes, + index) + + # Extract the InteractionDataSet and index_mat_chr variables res <- InteractionDataSet$InteractionDataSet + index_mat_chr <- InteractionDataSet$index_mat_chr - # Perform MA normalization with cyclic loess - norm_matrices <- lapply(res, FUN = normalize_count) - - # Find common clustering + # Normalize the count data using cyclic loess + norm_matrices <- normalize_count(res, index_mat_chr) + + # Create cluster for each chromosome cur_clust <- lapply(norm_matrices, FUN = create_cluster) - - all_trees <- lapply(seq_along(cur_clust), function(i){ - - all_clusters <- unique(cur_clust[[i]][, 1]) - + + # Create a list of trees + all_trees <- future_lapply(seq_along(cur_clust), function(i){ + + # List of all clusters + all_clusters <- unique(cur_clust[[i]][, 1]) + + # Create a tree for each cluster trees <- sapply(all_clusters, function(ac) { - cat("cluster", ac, ":\n") - - #### for each cluster, create a dendrogram for each sample - + + # Get the indices of the rows and columns in the cluster selected <- rownames(cur_clust[[i]])[cur_clust[[i]] == ac] + + # Select the sub-matrix red_mat <- ((norm_matrices[[i]]$index1 %in% selected) & (norm_matrices[[i]]$index2 %in% selected)) red_mat <- norm_matrices[[i]][red_mat, ] - + + # Create trees trees <- create_trees(red_mat, selected) - + return(trees) - }, simplify = FALSE) + }, simplify = FALSE) + }) + + # Count the number of clusters for each chromosome + nb_cluster <- sapply(all_trees, length) + + # Count the number of matrices for each chromosome + nb_mat <- sapply(chromosomes, function(chr){ + nb_mat <- sum(index_mat_chr$chromosome == chr) + return(nb_mat) + }) + + # Create a vector of chromosomes for each tree + met_chr <- rep(chromosomes, nb_cluster * nb_mat) + + # Create a vector of cluster numbers for each tree + met_cluster <- sapply(seq_along(chromosomes), function(i){ + met_cluster <-rep(seq(nb_cluster[i]), each = nb_mat[i]) + return(met_cluster) }) - - all_trees <- unlist(all_trees, recursive = FALSE) - - return(list("all_trees" = all_trees, "metadata" = InteractionDataSet$indexData)) - + + # Create a vector of file names for each tree + files <- sapply(seq_along(chromosomes), function(i){ + files <- rep(files[which(index_mat_chr$chromosome == chromosomes[i])], + nb_cluster[i]) + return(files) + }) + + all_trees <- unlist(unlist(all_trees, recursive = FALSE), recursive = FALSE) + + # Give each tree a name + names <- sapply(seq_along(all_trees), function(i){ + names <- paste0("tree", i) + return(names) + }) + + names(all_trees) <- names + + # Add metadata to the trees + metadata <- data.frame("names" = names, "chromosome" = met_chr, + "cluster" = unlist(met_cluster), + "file" = unlist(files)) + + # Return a list containing the trees, metadata, and indexData + return(list("trees" = all_trees, "metadata" = metadata, + "index" = InteractionDataSet$indexData)) } -InteractionDataSet <- function(file, type, binsize, chromosome, index = NULL) { - + +InteractionDataSet <- function(file, format, binsize, chromosome, index = NULL){ + + # Set the conditions to "mat" conditions <- "mat" - + + # Create a list of HiCDOCDataSet objects HiCDOCDataSet <- sapply(seq_along(file), function(i){ - if (type[i] == "tabular") { - HiCDOCDataSet <- HiCDOCDataSetFromTabular(file[i], i, conditions, + if (format[i] == "tabular") { + HiCDOCDataSet <- HiCDOCDataSetFromTabular(file[i], i, conditions, binsize) - } else if (type[i] == "cooler") { - HiCDOCDataSet <- HiCDOCDataSetFromCool(file[i], i, conditions, + } else if (format[i] == "cooler") { + HiCDOCDataSet <- HiCDOCDataSetFromCool(file[i], i, conditions, binsize) - } else if (type[i] == "juicer") { - HiCDOCDataSet <- HiCDOCDataSetFromHiC(file[i], i, conditions, + } else if (format[i] == "juicer") { + HiCDOCDataSet <- HiCDOCDataSetFromHiC(file[i], i, conditions, binsize) - - } else if (type[i] == "HiC-Pro") { - HiCDOCDataSet <- HiCDOCDataSetFromHiCPro(file[i], bedPaths = index, i, + + } else if (format[i] == "HiC-Pro") { + HiCDOCDataSet <- HiCDOCDataSetFromHiCPro(file[i], bedPaths = index, i, conditions) } return(HiCDOCDataSet) }) - - InteractionData <- lapply(HiCDOCDataSet, function(data){ - interactionSet <- data@interactions - return(interactionSet) - }) - - indexData <- sapply(HiCDOCDataSet, function(data){ + + # Create a matrix of chromosomes and matrix + index_mat_chr <- sapply(HiCDOCDataSet, function(data){ chromosome <- data@chromosomes - variables <- paste(data@colData$condition, data@colData$replicate, sep = "_") + variables <- paste(data@colData$condition, data@colData$replicate, + sep = "_") return(list("chromosome" = chromosome, "variables" = variables )) }) - - indexData <- as.data.frame(t(indexData)) - - fill <- NA - - InteractionDataSet <- lapply(chromosome, function(chr){ - - mat <- indexData$chromosome == chr - unionInteractions <- Reduce(union, InteractionData[mat]) - - InteractionSet <- lapply(which(mat), function(i){ - over <- match(HiCDOCDataSet[[i]], unionInteractions) - totalColumns <- ncol(HiCDOCDataSet[[i]]) - newAssays <- matrix( - rep(fill, length(unionInteractions) * totalColumns), - ncol = totalColumns - ) - - newAssays[over, ] <- SummarizedExperiment::assay(HiCDOCDataSet[[i]]) - - InteractionSet <- InteractionSet::InteractionSet( - newAssays, - unionInteractions, - colData = colData(HiCDOCDataSet[[i]])) - - - return(InteractionSet) - }) - - assayChromosome <- lapply(InteractionSet, function(data){ - assayChromosome <- SummarizedExperiment::assay(data) - assayChromosome <- as.data.table(assayChromosome) - setnames(assayChromosome, paste(data$condition, - data$replicate, sep = "_")) - return(assayChromosome) - }) - - assayChromosome <- Reduce(cbind, assayChromosome) - - unionInteractions <- as.data.table(unionInteractions) - - dataplot <- cbind(unionInteractions[, .(chromosome = chromosome, index1, - index2)], assayChromosome) - dataplot[is.na(dataplot)] <- 0 - - return(dataplot) + + index_mat_chr <- as.data.frame(t(index_mat_chr)) + index_mat_chr$chromosome <- unlist(index_mat_chr$chromosome) + index_mat_chr$variables <- unlist(index_mat_chr$variables) + + # Merge all the InteractionSet objects + suppressWarnings(mergedinteractionSet <- Reduce( + f = HiCDOC:::.mergeInteractionSet, + x = HiCDOCDataSet + )) + + HiCDOCSet <- new("HiCDOCDataSet", mergedinteractionSet) + HiCDOCSet <- HiCDOC:::.fillHiCDOCDataSet(HiCDOCSet) + + # Extract the regions from the HiCDOCSet object + indexData <- HiCDOCSet@interactions@regions + indexData <- as.data.frame(indexData) + + # For each chromosome, extract the interaction counts + all_mat_count <- future_lapply(chromosome, function(chr){ + rowsChromosome <- (S4Vectors::mcols(HiCDOCSet)$chromosome == chr) + assayChromosome <- SummarizedExperiment::assay(HiCDOCSet[rowsChromosome, ]) + assayChromosome <- as.data.table(assayChromosome) + data.table::setnames( + assayChromosome, + paste(HiCDOCSet$condition, HiCDOCSet$replicate, sep = "_") + ) + + interactionsChromosome <- InteractionSet::interactions( + HiCDOCSet[rowsChromosome, ] + ) + interactionsChromosome <- as.data.table(interactionsChromosome) + interaction_mat <- cbind( interactionsChromosome[, .( + chromosome = chromosome, index1, index2 )], assayChromosome) + return(interaction_mat) }) - - return(list("InteractionDataSet" = InteractionDataSet, "indexData" = indexData)) + + mat_count <- Reduce(rbind, all_mat_count) + mat_count[is.na(mat_count)] <- 0 + + return(list("InteractionDataSet" = mat_count, "indexData" = indexData, + "index_mat_chr" = index_mat_chr)) } create_cluster <- function(res) { - + + # extract all unique bin all_bins <- unique(c(res$index1, res$index2)) + + # create index mapping for the bins res$bindex1 <- match(res$index1, all_bins) res$bindex2 <- match(res$index2, all_bins) - - # create a merged squared similarity matrix with sum of normalized counts + merged_mat <- rowSums(res[ ,-c(1:3)]) cur_mat <- matrix(0, ncol = length(all_bins), nrow = length(all_bins)) cur_mat[cbind(res$bindex1, res$bindex2)] <- merged_mat cur_mat[cbind(res$bindex2, res$bindex1)] <- merged_mat - + # log transformation cur_mat <- log(cur_mat + 1) - - # perform constrained HAC (and use bin identifiers as labels) + + # perform constrained hierarchical clustering merged_res <- adjClust(cur_mat, type = "similarity", h = length(all_bins) - 1) merged_res$labels <- as.character(all_bins) - - # select the number of clusters with broken stick + + # select the number of clusters with broken stick method merged_clust <- select(merged_res, type = "bstick") names(merged_clust) <- all_bins - + return(data.frame(merged_clust)) } create_trees <- function(red_mat, selected){ - + + # match bin indices in the reduced similarity matrix to selected indices match1 <- match(red_mat$index1, selected) - match2 <- match(red_mat$index2, selected) - + match2 <- match(red_mat$index2, selected) + + # perform hierarchical clustering for each column adjclust_out <- sapply(red_mat[, -c(1:3)], function(arep) { - - ##### create a merged squared similarity matrix with normalized counts + mat_crep <- matrix(0, ncol = length(selected), nrow = length(selected)) - mat_crep[cbind(match1, match2)] <- arep mat_crep[cbind(match2, match1)] <- arep - + + # perform hierarchical clustering on the similarity matrix out <- adjClust(log(mat_crep + 1), type = "similarity") class(out) <- "hclust" - + return(out) }, simplify = FALSE) - + return(adjclust_out) } -normalize_count <- function(count_matrice) { - - counts <- as.matrix(count_matrice[ ,-c(1:3)]) - - colnames(counts) <- NULL - - cur_dge <- SummarizedExperiment(list(counts = counts)) - cur_dge$totals <- colSums(count_matrice[ ,-c(1:3)]) - - offsets <- csaw::normOffsets(cur_dge, se.out = FALSE) - - rm(cur_dge) - gc(verbose = FALSE) - - counts <- counts / exp(offsets) - count_matrice <- as.data.frame(count_matrice) - count_matrice[ ,-c(1:3)] <- data.frame(counts) - - rm(counts) - gc(verbose = FALSE) - - return(count_matrice) +normalize_count <- function(count_matrice, index_mat_chr){ + + # Get all unique chromosomes in the count matrix + chromosome <- unique(count_matrice$chromosome) + + all_count_matrice <- future_lapply(chromosome, function(chr){ + + # Subset the count matrix + count_matrice <- count_matrice[which(count_matrice$chromosome == chr), ] + + # Subset the index matrix + col <- index_mat_chr$variables[which(index_mat_chr$chromosome == chr)] + + # Create a new count matrix + deb <- count_matrice[, c(1:3)] + end <- subset(count_matrice, select = col) + count_matrice <- cbind(deb, end) + + # Extract the count values as a matrix + counts <- as.matrix(count_matrice[ ,-c(1:3)]) + colnames(counts) <- NULL + + # Compute the total number of reads for each sample + cur_dge <- SummarizedExperiment(list(counts = counts)) + cur_dge$totals <- colSums(count_matrice[ ,-c(1:3)]) + + # Normalize the counts + offsets <- csaw::normOffsets(cur_dge, se.out = FALSE) + counts <- counts / exp(offsets) + count_matrice <- as.data.frame(count_matrice) + count_matrice[ ,-c(1:3)] <- data.frame(counts) + + return(count_matrice) + }) + + # Return the list of normalized count matrices + return(all_count_matrice) }