change interactionset fonction

9721fe35 · CARDENAS GWENDAELLE · 4904522d · 9721fe35
Commit 9721fe35 authored 2 years ago by CARDENAS GWENDAELLE
--- a/R/HiC2Tree.R
+++ b/R/HiC2Tree.R
-HiC2Tree <- function(file, type, binsize, index = NULL, 
-                        chromosomes) {
-  
-  InteractionDataSet <- InteractionDataSet(file, type, binsize, chromosome, index)
-  
+HiC2Tree <- function(files, format, binsize, index = NULL,
+                     chromosomes){
+
+  # create an InteractionDataSet object
+  InteractionDataSet <- InteractionDataSet(files, format, binsize, chromosomes,
+                                           index)
+
+  # Extract the InteractionDataSet and index_mat_chr variables
  res <- InteractionDataSet$InteractionDataSet
+  index_mat_chr <- InteractionDataSet$index_mat_chr

-  # Perform MA normalization with cyclic loess
-  norm_matrices <- lapply(res, FUN = normalize_count)
-  
-  # Find common clustering
+  # Normalize the count data using cyclic loess
+  norm_matrices <- normalize_count(res, index_mat_chr)
+
+  # Create cluster for each chromosome
  cur_clust <- lapply(norm_matrices, FUN = create_cluster)
-  
-  all_trees <- lapply(seq_along(cur_clust), function(i){
-    
-    all_clusters <- unique(cur_clust[[i]][, 1]) 
-    
+
+  # Create a list of trees
+  all_trees <- future_lapply(seq_along(cur_clust), function(i){
+
+    # List of all clusters
+    all_clusters <- unique(cur_clust[[i]][, 1])
+
+    # Create a tree for each cluster
    trees <- sapply(all_clusters, function(ac) {
-      cat("cluster", ac, ":\n")
-      
-      #### for each cluster, create a dendrogram for each sample
-      
+
+      # Get the indices of the rows and columns in the cluster
      selected <- rownames(cur_clust[[i]])[cur_clust[[i]] == ac]
+
+      # Select the sub-matrix
      red_mat <- ((norm_matrices[[i]]$index1 %in% selected) &
                    (norm_matrices[[i]]$index2 %in% selected))
      red_mat <- norm_matrices[[i]][red_mat, ]
-      
+
+      # Create trees
      trees <- create_trees(red_mat, selected)
-      
+
      return(trees)
-     }, simplify = FALSE)
+    }, simplify = FALSE)
+  })
+
+  # Count the number of clusters for each chromosome
+  nb_cluster <- sapply(all_trees, length)
+
+  # Count the number of matrices for each chromosome
+  nb_mat <- sapply(chromosomes, function(chr){
+    nb_mat <- sum(index_mat_chr$chromosome == chr)
+    return(nb_mat)
+  })
+
+  # Create a vector of chromosomes for each tree
+  met_chr <- rep(chromosomes, nb_cluster * nb_mat)
+
+  # Create a vector of cluster numbers for each tree
+  met_cluster <- sapply(seq_along(chromosomes), function(i){
+    met_cluster <-rep(seq(nb_cluster[i]), each = nb_mat[i])
+    return(met_cluster)
  })
-  
-  all_trees <- unlist(all_trees, recursive = FALSE)
-  
-  return(list("all_trees" = all_trees, "metadata" = InteractionDataSet$indexData))  
-  
+
+  # Create a vector of file names for each tree
+  files <- sapply(seq_along(chromosomes), function(i){
+    files <- rep(files[which(index_mat_chr$chromosome == chromosomes[i])],
+                 nb_cluster[i])
+    return(files)
+  })
+
+  all_trees <- unlist(unlist(all_trees, recursive = FALSE), recursive = FALSE)
+
+  # Give each tree a name
+  names <- sapply(seq_along(all_trees), function(i){
+    names <- paste0("tree", i)
+    return(names)
+  })
+
+  names(all_trees) <- names
+
+  # Add metadata to the trees
+  metadata <- data.frame("names" = names, "chromosome" = met_chr,
+                         "cluster" = unlist(met_cluster),
+                         "file" = unlist(files))
+
+  # Return a list containing the trees, metadata, and indexData
+  return(list("trees" = all_trees, "metadata" = metadata,
+              "index" = InteractionDataSet$indexData))
 }

-InteractionDataSet <- function(file, type, binsize, chromosome, index = NULL) {
-  
+
+InteractionDataSet <- function(file, format, binsize, chromosome, index = NULL){
+
+  # Set the conditions to "mat"
  conditions <- "mat"
-  
+
+  # Create a list of HiCDOCDataSet objects
  HiCDOCDataSet <- sapply(seq_along(file), function(i){
-    if (type[i] == "tabular") {
-      HiCDOCDataSet <- HiCDOCDataSetFromTabular(file[i], i, conditions, 
+    if (format[i] == "tabular") {
+      HiCDOCDataSet <- HiCDOCDataSetFromTabular(file[i], i, conditions,
                                                binsize)
-    } else if (type[i] == "cooler") {
-      HiCDOCDataSet <- HiCDOCDataSetFromCool(file[i], i, conditions, 
+    } else if (format[i] == "cooler") {
+      HiCDOCDataSet <- HiCDOCDataSetFromCool(file[i], i, conditions,
                                             binsize)
-    } else if (type[i] == "juicer") {
-      HiCDOCDataSet <- HiCDOCDataSetFromHiC(file[i], i, conditions, 
+    } else if (format[i] == "juicer") {
+      HiCDOCDataSet <- HiCDOCDataSetFromHiC(file[i], i, conditions,
                                            binsize)
-    
-    } else if (type[i] == "HiC-Pro") {
-      HiCDOCDataSet <- HiCDOCDataSetFromHiCPro(file[i], bedPaths = index, i, 
+
+    } else if (format[i] == "HiC-Pro") {
+      HiCDOCDataSet <- HiCDOCDataSetFromHiCPro(file[i], bedPaths = index, i,
                                               conditions)
    }
    return(HiCDOCDataSet)
  })
-  
-  InteractionData <- lapply(HiCDOCDataSet, function(data){
-    interactionSet <- data@interactions
-    return(interactionSet)
-  })
-  
-  indexData <- sapply(HiCDOCDataSet, function(data){
+
+  # Create a matrix of chromosomes and matrix
+  index_mat_chr <- sapply(HiCDOCDataSet, function(data){
    chromosome <- data@chromosomes
-    variables <- paste(data@colData$condition, data@colData$replicate, sep = "_")
+    variables <- paste(data@colData$condition, data@colData$replicate,
+                       sep = "_")
    return(list("chromosome" = chromosome, "variables" = variables ))
  })
-  
-  indexData <- as.data.frame(t(indexData))
-  
-  fill <- NA
-  
-  InteractionDataSet <- lapply(chromosome, function(chr){
-    
-    mat <- indexData$chromosome == chr
-    unionInteractions <- Reduce(union, InteractionData[mat])
-    
-    InteractionSet <- lapply(which(mat), function(i){  
-      over <- match(HiCDOCDataSet[[i]], unionInteractions)
-      totalColumns <- ncol(HiCDOCDataSet[[i]])
-      newAssays <- matrix(
-        rep(fill, length(unionInteractions) * totalColumns),
-        ncol = totalColumns
-      )
-      
-      newAssays[over, ] <- SummarizedExperiment::assay(HiCDOCDataSet[[i]])
-      
-      InteractionSet <- InteractionSet::InteractionSet(
-        newAssays,
-        unionInteractions,
-        colData = colData(HiCDOCDataSet[[i]]))
-      
-      
-      return(InteractionSet)
-    })
-    
-    assayChromosome <- lapply(InteractionSet, function(data){
-      assayChromosome <- SummarizedExperiment::assay(data)
-      assayChromosome <- as.data.table(assayChromosome)
-      setnames(assayChromosome, paste(data$condition, 
-                                      data$replicate, sep = "_"))
-      return(assayChromosome)
-    })
-
-    assayChromosome <- Reduce(cbind, assayChromosome)
-    
-    unionInteractions <- as.data.table(unionInteractions)
-    
-    dataplot <- cbind(unionInteractions[, .(chromosome = chromosome, index1, 
-                                            index2)], assayChromosome)
-    dataplot[is.na(dataplot)] <- 0
-    
-    return(dataplot)
+
+  index_mat_chr <- as.data.frame(t(index_mat_chr))
+  index_mat_chr$chromosome <- unlist(index_mat_chr$chromosome)
+  index_mat_chr$variables <- unlist(index_mat_chr$variables)
+
+  # Merge all the InteractionSet objects
+  suppressWarnings(mergedinteractionSet <- Reduce(
+    f = HiCDOC:::.mergeInteractionSet,
+    x = HiCDOCDataSet
+  ))
+
+  HiCDOCSet <- new("HiCDOCDataSet", mergedinteractionSet)
+  HiCDOCSet <- HiCDOC:::.fillHiCDOCDataSet(HiCDOCSet)
+
+  # Extract the regions from the HiCDOCSet object
+  indexData <- HiCDOCSet@interactions@regions
+  indexData <- as.data.frame(indexData)
+
+  # For each chromosome, extract the interaction counts
+  all_mat_count <- future_lapply(chromosome, function(chr){
+    rowsChromosome <- (S4Vectors::mcols(HiCDOCSet)$chromosome == chr)
+    assayChromosome <- SummarizedExperiment::assay(HiCDOCSet[rowsChromosome, ])
+    assayChromosome <- as.data.table(assayChromosome)
+    data.table::setnames(
+      assayChromosome,
+      paste(HiCDOCSet$condition, HiCDOCSet$replicate, sep = "_")
+    )
+
+    interactionsChromosome <- InteractionSet::interactions(
+      HiCDOCSet[rowsChromosome, ]
+    )
+    interactionsChromosome <- as.data.table(interactionsChromosome)
+    interaction_mat <- cbind( interactionsChromosome[, .(
+      chromosome = chromosome, index1, index2 )], assayChromosome)
+    return(interaction_mat)
  })
-  
-  return(list("InteractionDataSet" = InteractionDataSet, "indexData" = indexData))
+
+  mat_count <- Reduce(rbind, all_mat_count)
+  mat_count[is.na(mat_count)] <- 0
+
+  return(list("InteractionDataSet" = mat_count, "indexData" = indexData,
+              "index_mat_chr" = index_mat_chr))
 }

 create_cluster <- function(res) {
-  
+
+  # extract all unique bin
  all_bins <- unique(c(res$index1, res$index2))
+
+  # create index mapping for the bins
  res$bindex1 <- match(res$index1, all_bins)
  res$bindex2 <- match(res$index2, all_bins)
-  
-  # create a merged squared similarity matrix with sum of normalized counts
+
  merged_mat <- rowSums(res[ ,-c(1:3)])
  cur_mat <- matrix(0, ncol = length(all_bins), nrow = length(all_bins))
  cur_mat[cbind(res$bindex1, res$bindex2)] <- merged_mat
  cur_mat[cbind(res$bindex2, res$bindex1)] <- merged_mat
-  
+
  # log transformation
  cur_mat <- log(cur_mat + 1)
-  
-  # perform constrained HAC (and use bin identifiers as labels)
+
+  # perform constrained hierarchical clustering
  merged_res <- adjClust(cur_mat, type = "similarity", h = length(all_bins) - 1)
  merged_res$labels <- as.character(all_bins)
-  
-  # select the number of clusters with broken stick
+
+  # select the number of clusters with broken stick method
  merged_clust <- select(merged_res, type = "bstick")
  names(merged_clust) <- all_bins
-  
+
  return(data.frame(merged_clust))
 }

 create_trees <- function(red_mat, selected){
-  
+
+  # match bin indices in the reduced similarity matrix to selected indices
  match1 <- match(red_mat$index1, selected)
-  match2 <- match(red_mat$index2, selected) 
-  
+  match2 <- match(red_mat$index2, selected)
+
+  # perform hierarchical clustering for each column
  adjclust_out <- sapply(red_mat[, -c(1:3)], function(arep) {
-    
-    ##### create a merged squared similarity matrix with normalized counts
+
    mat_crep <- matrix(0, ncol = length(selected), nrow = length(selected))
-    
    mat_crep[cbind(match1, match2)] <- arep
    mat_crep[cbind(match2, match1)] <- arep
-    
+
+    # perform hierarchical clustering on the similarity matrix
    out <- adjClust(log(mat_crep + 1), type = "similarity")
    class(out) <- "hclust"
-    
+
    return(out)
  }, simplify = FALSE)
-  
+
  return(adjclust_out)
 }

-normalize_count <- function(count_matrice) {
-  
-  counts <- as.matrix(count_matrice[ ,-c(1:3)])
-  
-  colnames(counts) <- NULL
-  
-  cur_dge <- SummarizedExperiment(list(counts = counts))
-  cur_dge$totals <- colSums(count_matrice[ ,-c(1:3)])
-  
-  offsets <- csaw::normOffsets(cur_dge, se.out = FALSE)
-  
-  rm(cur_dge)
-  gc(verbose = FALSE)
-  
-  counts <- counts / exp(offsets)
-  count_matrice <- as.data.frame(count_matrice)
-  count_matrice[ ,-c(1:3)] <- data.frame(counts)
- 
-  rm(counts) 
-  gc(verbose = FALSE)
-  
-  return(count_matrice)
+normalize_count <- function(count_matrice, index_mat_chr){
+
+  # Get all unique chromosomes in the count matrix
+  chromosome <- unique(count_matrice$chromosome)
+
+  all_count_matrice <- future_lapply(chromosome, function(chr){
+
+    # Subset the count matrix
+    count_matrice <- count_matrice[which(count_matrice$chromosome == chr), ]
+
+    # Subset the index matrix
+    col <- index_mat_chr$variables[which(index_mat_chr$chromosome == chr)]
+
+    # Create a new count matrix
+    deb <- count_matrice[, c(1:3)]
+    end <- subset(count_matrice, select = col)
+    count_matrice <- cbind(deb, end)
+
+    # Extract the count values as a matrix
+    counts <- as.matrix(count_matrice[ ,-c(1:3)])
+    colnames(counts) <- NULL
+
+    # Compute the total number of reads for each sample
+    cur_dge <- SummarizedExperiment(list(counts = counts))
+    cur_dge$totals <- colSums(count_matrice[ ,-c(1:3)])
+
+    # Normalize the counts
+    offsets <- csaw::normOffsets(cur_dge, se.out = FALSE)
+    counts <- counts / exp(offsets)
+    count_matrice <- as.data.frame(count_matrice)
+    count_matrice[ ,-c(1:3)] <- data.frame(counts)
+
+    return(count_matrice)
+  })
+
+  # Return the list of normalized count matrices
+  return(all_count_matrice)
 }