utils.R 2.97 KB
Newer Older
1
## Template for reading biom file 
2
read_frogs_biom <- function(biom_file) {
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
  phyloseq.extended::import_frogs(biom_file)
}

## Template for reading tsv files
read_multihits <- function(multihits_file) {
  readr::read_tsv(multihits_file)
}

## Functions to create short taxa names -------------------------------------
long_taxa_names <- function(taxa_names, max_size = 32) {
  ## any taxa name longer than 32 characters
  any(taxa_names >= max_size)
}

ambiguous_taxa <- function(physeq) {
  phyloseq::tax_table(physeq) %>% as("matrix") %>% 
    apply(MARGIN = 1, FUN = function(x) { any(x == "Multi-affiliation")}) %>% 
    which() %>% names()
}

create_otu_dictionary <- function(physeq) {
  otu_dictionary <- tibble::tibble(
    sequence  = phyloseq::taxa_names(physeq), 
    abundance = phyloseq::taxa_sums(physeq)
  ) %>% 
    dplyr::arrange(desc(abundance))
29
  ## If all taxa names are long, sequences are likely to have been processed by DADA2
30
  ## Move OTU names to sequence column and rename OTU with shorter names
31
  if (long_taxa_names(otu_dictionary$sequence)) {
32
33
   otu_dictionary <- otu_dictionary %>% 
     dplyr::mutate(OTU = paste0("ASV", 1:nrow(otu_dictionary)))
34
35
36
37
38
39
40
41
42
43
44
45
  } else {
    otu_dictionary <- otu_dictionary %>% dplyr::mutate(OTU = sequence)
  }
  otu_dictionary
}

sanitize_physeq_and_affi <- function(physeq, affi) {
  ## Create dictionary
  otu_dictionary <- create_otu_dictionary(physeq)
  old_to_new <- otu_dictionary %>% dplyr::select(-abundance) %>% tibble::deframe()
  
  ## Add abundance information to affiliation table
46
  affi <- dplyr::inner_join(otu_dictionary, affi, by = c("sequence" = "#observation_name")) %>% 
47
    tidyr::separate(blast_taxonomy, into = phyloseq::rank_names(physeq), sep = ";")
48
49
  
  ## Rename taxa in physeq object
50
  phyloseq::taxa_names(physeq) <- old_to_new[phyloseq::taxa_names(physeq)] %>% unname()
51
52
  
  ## Remove previously curated taxa from affi
53
  affi <- affi %>% dplyr::filter(OTU %in% ambiguous_taxa(physeq))
54
55
56
57
58
59
60
61
  
  list(
    physeq         = physeq, 
    affi           = affi, 
    otu_dictionary = otu_dictionary
  )
}

62
63
## Other utilities ------------------------------------------

64
65
66
67
68
69
70
71
72
## Order OTU by priority level
sort_ambiguous_otu <- function(physeq, affi) {
  affi %>% dplyr::arrange(desc(abundance)) %>% dplyr::pull(OTU)
}

## Extract all affiliation for a given OTU
extract_affiliation <- function(affi, otu) {
  affi %>% 
    dplyr::filter(OTU == otu) %>% 
Sandra Derozier's avatar
Sandra Derozier committed
73
74
75
76
77
78
79
80
81
82
    dplyr::select(Kingdom:Species) %>% 
    dplyr::distinct()
}

## Extract all affiliation for a given OTU
extract_sequence <- function(affi, otu) {
  affi %>% 
    dplyr::filter(OTU == otu) %>% 
    dplyr::pull(sequence) %>% 
    head(1)
83
}
84
85
86
87
88
89
90
91
92

## Find level of ambiguity
find_level <- function(data) {
  not_consistent <- function(x) { !all(x == x[1]) }
  rank_names <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
  for (rank in rank_names) {
    rank_content <- data[[rank]]
    if (not_consistent(rank_content)) return(rank)
  }
93
94
  warning("The provided taxa is not ambiguous")
  NULL
95
}
96