Commit b6b30a21 authored by Olivier Rue's avatar Olivier Rue
Browse files

references analyses

parent 29d93bb8
Pipeline #41904 canceled with stage
---
title: "References analysis"
description: |
METABARFOOD project.
author: Olivier Rué
date: '`r Sys.Date()`'
output:
html_document:
self_contained: false
number_sections: FALSE
code_folding: "show"
toc: true
toc_depth: 5
toc_float: true
---
<link href="https://cdnjs.cloudflare.com/ajax/libs/noUiSlider/7.0.10/jquery.nouislider.min.css" rel="stylesheet"/>
<script src="https://cdnjs.cloudflare.com/ajax/libs/noUiSlider/7.0.10/jquery.nouislider.min.js"></script>
<link href="https://cdnjs.cloudflare.com/ajax/libs/selectize.js/0.12.0/css/selectize.bootstrap3.min.css" rel="stylesheet" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/selectize.js/0.12.0/js/standalone/selectize.min.js"></script>
```{r setup, include=FALSE}
knitr::opts_chunk$set(eval=FALSE, echo =T, cache = FALSE, message = FALSE, warning = FALSE, cache.lazy = FALSE, fig.height = 3.5, fig.width = 10.5)
```
```{r, setup1, echo=FALSE, message = FALSE, warning = FALSE, eval=TRUE}
library(DT)
library(ggpubr)
library(kableExtra)
library(seqinr)
library(gridExtra)
library(phyloseq)
library(phyloseq.extended)
library(tidyverse)
library(plotly)
library(qiime2R)
library(tidyverse)
library(ComplexHeatmap)
library(phyloseqCompanion)
library(circlize)
```
```{r datatable global options, echo=FALSE, eval=TRUE}
options(DT.options = list(pageLength = 10,
scrollX = TRUE,
language = list(search = 'Filter:'),
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print')))
```
```{r, eval=T}
ggd <- read.delim("REFERENCES/ALL_26jan2021.tsv", sep="\t", header=T, stringsAsFactors = F)
ggd <- ggd %>% as_tibble()
ggd
```
# Assignations of references against main databases
The 142 sequences were compared to those present in commonly used databanks:
* UNITE 8.2 for ITS1 and ITS2
* SILVA 28S v138 for D1D2
* nt (nt_2021-07-30) for RPB2
The best hit from blast output is kept and % of coverage and % of identity are ploted for each marker:
```{r, eval=T}
its1 <- read.delim("REFERENCES/ALL_ITS1.blast_vs_UNITE", sep="\t", header=F)
colnames(its1) <- c("Strain", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen","taxonomy")
t_its1 <- its1 %>% as_tibble() %>% mutate(pcov = length/qlen)%>% mutate(pident = pident/100) %>% mutate(marker = "ITS1") %>% separate(taxonomy, ";(?=[\\S])", into = c("Ssss", "SKingdom","SPhylum","SClass","SOrder","SFamily","SGenus","SSpecies"), remove = TRUE) %>% mutate(SSpecies = gsub("\\[.*","",SSpecies)) %>% mutate(SSpecies = gsub("s__","",SSpecies)) %>% mutate(SSpecies = gsub("_"," ",trimws(SSpecies)))
its2 <- read.delim("REFERENCES/ALL_ITS2.blast_vs_UNITE", sep="\t", header=F)
colnames(its2) <- c("Strain", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "taxonomy")
t_its2 <- its2 %>% as_tibble() %>% mutate(pcov = length/qlen)%>% mutate(pident = pident/100) %>% mutate(marker = "ITS2") %>% separate(taxonomy, ";(?=[\\S])", into = c("Ssss", "SKingdom","SPhylum","SClass","SOrder","SFamily","SGenus","SSpecies"), remove = TRUE) %>% mutate(SSpecies = gsub("\\[.*","",SSpecies)) %>% mutate(SSpecies = gsub("s__","",SSpecies)) %>% mutate(SSpecies = gsub("_"," ",trimws(SSpecies)))
d1d2 <- read.delim("REFERENCES/ALL_D1D2.blast_vs_SILVA138-28S", sep="\t", header=F)
colnames(d1d2) <- c("Strain", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "taxonomy")
t_d1d2 <- d1d2 %>% as_tibble() %>% mutate(pcov = length/qlen)%>% mutate(pident = pident/100) %>% mutate(marker = "D1D2")%>% separate(taxonomy, ";(?=[\\S])", into = c("Ssss", "SKingdom","SPhylum","SClass","SOrder","SFamily","SGenus","SSpecies"), remove = TRUE) %>% mutate(SSpecies = gsub("\\[.*","",SSpecies)) %>% mutate(SSpecies = gsub("s__","",SSpecies)) %>% mutate(SSpecies = gsub("_"," ",trimws(SSpecies)))
rpb2 <- read.delim("REFERENCES/ALL_RPB2.blast_vs_NR", sep="\t", header=F)
colnames(rpb2) <- c("Strain", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "taxonomy")
t_rpb2 <- rpb2 %>% as_tibble() %>% mutate(pcov = length/qlen) %>% mutate(pident = pident/100)%>% mutate(marker = "RPB2")%>% separate(taxonomy, ";(?=[\\S])", into = c("Ssss", "SKingdom","SPhylum","SClass","SOrder","SFamily","SGenus","SSpecies"), remove = TRUE) %>% mutate(SSpecies = gsub("\\[.*","",SSpecies)) %>% mutate(SSpecies = gsub("s__","",SSpecies)) %>% mutate(SSpecies = gsub("_"," ",trimws(SSpecies)))
t <- bind_rows(t_its1, t_its2, t_d1d2, t_rpb2)
```
```{r, eval=T}
p <- ggplot(t, aes(x=pident, y=pcov, color=marker)) + geom_point(size = 3, alpha = 0.3)
p + stat_ellipse()
#ggplot(t) + geom_violin(aes(x = marker, y = pident, color=marker))
#ggplot(t) + geom_violin(aes(x = marker, y = pcov, color=marker))
p <- ggplot(t, aes(x=pident, y=pcov, color=marker)) + geom_point(size = 3, alpha = 0.3)
p + stat_ellipse()
p<-ggplot(t, aes(x=marker, y=pcov, fill=marker)) +
geom_boxplot(position=position_dodge(1))
p
t
b <- pivot_longer(data = t, cols = c(pident,pcov), names_to = "metric", values_to = "percent")
p<-ggplot(b, aes(x=marker, y=percent, fill=metric)) +
geom_boxplot(position=position_dodge(1))
p
```
`ITS1` and `ITS2` sequences are truncated in UNITE (low % coverage).
For % of identity, `RPB2` has the highest median `r summary(t %>% filter(marker == "ITS1") %>% select(pident))`
```{r}
summary(t %>% filter(marker == "ITS1") %>% select(pident))
summary(t %>% filter(marker == "ITS2") %>% select(pident))
summary(t %>% filter(marker == "D1D2") %>% select(pident))
summary(t %>% filter(marker == "RPB2") %>% select(pident))
```
# Species present perfectly in databases {.tabset}
## ITS1
```{r, eval=T}
infos <- full_join(t_its1,ggd,by="Strain") %>% filter(marker == "ITS1") %>% select(Species, SSpecies, pident, pcov)
infos %>% datatable()
```
`r nrow(infos %>% filter(SSpecies == Species))` hits have the same Species.
## ITS2
```{r, eval=T}
infos <- full_join(t_its2,ggd,by="Strain") %>% filter(marker == "ITS2") %>% select(Species, SSpecies, pident, pcov)
infos %>% datatable()
```
`r nrow(infos %>% filter(SSpecies == Species))` hits have the same Species.
## D1D2
```{r, eval=T}
infos <- full_join(t_d1d2,ggd,by="Strain") %>% filter(marker == "D1D2") %>% select(Species, SSpecies, pident, pcov)
infos %>% datatable()
```
`r nrow(infos %>% filter(SSpecies == Species))` hits have the same Species.
## RPB2
```{r, eval=T}
infos <- full_join(t_rpb2,ggd,by="Strain") %>% filter(marker == "RPB2") %>% select(Species, SSpecies, pident, pcov)
infos %>% datatable()
```
`r infos %>% filter(SSpecies == Species)` hits have the same Species.
```{r, eval=T}
a <- full_join(t,ggd,by="Strain")
a
```
```{r, eval=T}
p <- ggplot(a, aes(x=pident, y=pcov, color=Phylum)) + geom_point(size = 3, alpha = 0.3)
p + stat_ellipse()
```
```{r, eval=T}
its1_unite <- read.delim("REFERENCES/ALL_ITS1.blast_vs_UNITE", sep="\t", header=F)
colnames(its1_unite) <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen","taxonomy")
t_its1_unite <- its1_unite %>% as_tibble() %>% mutate(pcov = length/qlen) %>% mutate(marker = "ITS1", db = "UNITE")
its1_nt <- read.delim("REFERENCES/ALL_ITS1.blast_vs_NR", sep="\t", header=F)
colnames(its1_nt) <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen")
t_its1_nt <- its1_nt %>% as_tibble() %>% mutate(pcov = length/qlen) %>% mutate(marker = "ITS1", db = "NT")
t <- bind_rows(t_its1_unite, t_its1_nt)
p <- ggplot(t, aes(x=pident, y=pcov, color=db)) + geom_point(size = 3, alpha = 0.3)
p + stat_ellipse()
```
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment