library(phyloseq)
library(DESeq2)
library(vsn)
library(microbiome)
library(dplyr)
library(ggplot2)
library(gridExtra)
<- readRDS("data/soil_processed/soil.RDS") soil
Exploratory analysis of compositional data (part II)
Exploratory analysis of a microbial data set
Here we use the “88 soils” data set (Lauber et al. 2009) containing bacterial communities in 88 soils from across North and South America.
Load packages and data data
Exploration of the phyloseq object
soil
phyloseq-class experiment-level object
otu_table() OTU Table: [ 116 taxa and 89 samples ]
sample_data() Sample Data: [ 89 samples by 1 sample variables ]
tax_table() Taxonomy Table: [ 116 taxa by 7 taxonomic ranks ]
# Read count table
<- otu_table(soil)
otutab 1:10, 1:6] otutab[
OTU Table: [6 taxa and 10 samples]
taxa are columns
1124701 697997 203969 205391 843189 3431064
103.CA2 15 2 0 0 0 3
103.CO3 14 4 0 0 0 1
103.SR3 1 0 0 0 0 1
103.IE2 8 0 0 1 0 0
103.BP1 13 67 0 0 0 3
103.VC2 7 0 0 1 0 7
103.SA2 6 1 0 0 0 0
103.GB2 3 3 0 0 0 0
103.CO2 2 0 0 1 0 5
103.KP1 2 1 0 0 0 1
# Taxonomy table
<- tax_table(soil)
taxtab head(taxtab)
Taxonomy Table: [6 taxa by 7 taxonomic ranks]:
Kingdom Phylum Class
1124701 "k__Bacteria" "p__Bacteroidetes" "c__[Saprospirae]"
697997 "k__Bacteria" "p__Acidobacteria" "c__[Chloracidobacteria]"
203969 "k__Bacteria" "p__Acidobacteria" "c__DA052"
205391 "k__Bacteria" "p__Acidobacteria" "c__Solibacteres"
843189 "k__Bacteria" "p__Acidobacteria" "c__Solibacteres"
3431064 "k__Bacteria" "p__Gemmatimonadetes" "c__Gemmatimonadetes"
Order Family Genus
1124701 "o__[Saprospirales]" "f__Chitinophagaceae" "g__"
697997 "o__RB41" "f__" "g__"
203969 "o__Ellin6513" "f__" "g__"
205391 "o__Solibacterales" "f__" "g__"
843189 "o__Solibacterales" "f__Solibacteraceae" "g__Candidatus Solibacter"
3431064 "o__N1423WL" "f__" "g__"
Species
1124701 "s__"
697997 "s__"
203969 "s__"
205391 "s__"
843189 "s__"
3431064 "s__"
# Sample data
<- sample_data(soil)
sampdata head(sampdata)
ph
103.CA2 8.02
103.CO3 6.02
103.SR3 6.95
103.IE2 5.52
103.BP1 7.53
103.VC2 5.99
Add sample variables
We add a few sample variables we will need later in this tutorial:
- sampleID
- phType (acidic if pH<6.7 and basic if pH>6.7)
- totalReads (total number of reads per sample)
<- data.frame(sampleID = rownames(sampdata),
sampdata totalReads = sample_sums(soil),
ph = sampdata$ph)
$phType <- as.factor(ifelse(sampdata$ph < 6.7, "acid", "basic"))
sampdata
head(sampdata)
sampleID totalReads ph phType
103.CA2 103.CA2 205 8.02 basic
103.CO3 103.CO3 233 6.02 acid
103.SR3 103.SR3 169 6.95 basic
103.IE2 103.IE2 244 5.52 acid
103.BP1 103.BP1 297 7.53 basic
103.VC2 103.VC2 256 5.99 acid
table(sampdata$phType)
acid basic
59 30
# Add data frame to phyloseq object
sample_data(soil) <- sampdata
Sequencing depth / library size
Plot the sequencing depth (total number of reads) of each sample.
<- ggplot(sampdata, aes(x = sampleID, y = totalReads)) +
p theme_bw() +
geom_bar(stat = "identity") +
theme(axis.text.x = element_blank()) +
xlab("Sample")
p
min(sampdata$totalReads)
[1] 1
<- sampdata[which.min(sampdata$totalReads), "sampleID"] torm
Remove the sample with only one read count.
<- subset_samples(soil, sampleID != torm)
soil soil
phyloseq-class experiment-level object
otu_table() OTU Table: [ 116 taxa and 88 samples ]
sample_data() Sample Data: [ 88 samples by 4 sample variables ]
tax_table() Taxonomy Table: [ 116 taxa by 7 taxonomic ranks ]
<- as(otu_table(soil), "matrix")
otutab <- sample_data(soil) sampdata
Sparsity
Number of zeros and percentage of zeros in the OTU table
<-dim(otutab)[2]
nvar <- dim(otutab)[1]
nsamp
sum(otutab == 0)
[1] 6797
sum(otutab == 0) / (nvar * nsamp) * 100
[1] 66.58503
67 of the data are zeros.
Visualize microbial composition
Have a look at the microbial compositions on phylum level.
# Agglomerate to phylum level
<- tax_glom(soil, taxrank = "Phylum") soil_phyla
Stacked bar plot of the observed “absolute” abundances:
<- plot_bar(soil_phyla, fill = "Phylum")
p
+ theme_bw() +
p ylab("Absolute abundance") +
theme(axis.text.x = element_blank()) +
scale_fill_brewer(palette = "Set2")
Stacked bar plot of the relative abundances:
# Compute relative abundances
<- transform_sample_counts(soil_phyla, function(x) x/sum(x)) soil_phyla_rel
<- plot_bar(soil_phyla_rel, fill = "Phylum")
p
+ theme_bw() +
p ylab("Absolute abundance") +
theme(axis.text.x = element_blank()) +
scale_fill_brewer(palette = "Set2")
Zero replacement
Zero counts are replaced by a unit pseudo count. In doing so, ratios between non-zero counts are preserved, which is not the case if a pseudo count is added to the whole matrix (which is also common).
<- soil
soil_zrepl otu_table(soil_zrepl)[otu_table(soil_zrepl) == 0] <- 1
<- as(otu_table(soil_zrepl), "matrix") otutab_zrepl
The zCompositions
R package provides more complex methods for zero replacement.
Mean - variance relationship
::meanSdPlot(t(otutab), plot = FALSE)$gg +
vsntheme_bw() + theme(text = element_text(size = 16))
Normalization
We compare three normalization methods we have already seen in the lectures:
- CLR (centered log-ratio) transformation
- Variance stabilizing transformation (vst)
- Regularized log transformation (similar to vst but more robust when the size factors vary widely).
# clr
<- microbiome::transform(soil_zrepl, transform = "clr")
soil_clr <- otu_table(soil_clr)
clr_counts
# vst (function expects samples in columns)
<- DESeq2::varianceStabilizingTransformation(t(otutab_zrepl),
vst_counts fitType = "local")
converting counts to integer mode
# rlog
<- DESeq2::rlog(t(otutab_zrepl), fitType = "local") rlog_counts
rlog() may take a long time with 50 or more samples,
vst() is a much faster transformation
converting counts to integer mode
Create mean-sd plots using the function from vsn
package.
<- meanSdPlot(t(clr_counts), plot = FALSE)$gg +
pclr theme_bw() + theme(text = element_text(size = 16)) + ggtitle("clr")
<- meanSdPlot(vst_counts, plot = FALSE)$gg +
pvst theme_bw() + theme(text = element_text(size = 16)) + ggtitle("vst")
<- meanSdPlot(rlog_counts, plot = FALSE)$gg +
prlog theme_bw() + theme(text = element_text(size = 16)) + ggtitle("rlog")
::grid.arrange(pclr, pvst, prlog,
gridExtrancol = 3, nrow = 1)
Alpha diversity
Another common task is diversity analysis. Alpha diversity summarizes the distribution of species abundances in a given sample.
<- plot_richness(soil,
pAlpha color = "phType",
measures = c("Observed", "Shannon", "InvSimpson", "Chao1"),
title = "Alpha diveristy for 88 soil data")
+ geom_point(size = 2) + theme_bw() + theme(axis.text.x = element_blank()) pAlpha
Low-dimensional representation
In this section, we want to plot the data in the two-dimensional space. We therefore compute distances between all samples and perform multi-dimensional scaling (MDS). Here, we use the non-normalized data (see (McKnight et al. 2019))
Distance functions provided by the phyloseq package:
<- unlist(distanceMethodList)
dist_methods dist_methods
UniFrac1 UniFrac2 DPCoA JSD vegdist1 vegdist2
"unifrac" "wunifrac" "dpcoa" "jsd" "manhattan" "euclidean"
vegdist3 vegdist4 vegdist5 vegdist6 vegdist7 vegdist8
"canberra" "bray" "kulczynski" "jaccard" "gower" "altGower"
vegdist9 vegdist10 vegdist11 vegdist12 vegdist13 vegdist14
"morisita" "horn" "mountford" "raup" "binomial" "chao"
vegdist15 betadiver1 betadiver2 betadiver3 betadiver4 betadiver5
"cao" "w" "-1" "c" "wb" "r"
betadiver6 betadiver7 betadiver8 betadiver9 betadiver10 betadiver11
"I" "e" "t" "me" "j" "sor"
betadiver12 betadiver13 betadiver14 betadiver15 betadiver16 betadiver17
"m" "-2" "co" "cc" "g" "-3"
betadiver18 betadiver19 betadiver20 betadiver21 betadiver22 betadiver23
"l" "19" "hk" "rlb" "sim" "gl"
betadiver24 dist1 dist2 dist3 designdist
"z" "maximum" "binary" "minkowski" "ANY"
MDS plot with Bray Curtis distance
<- ordinate(soil, method = "MDS", distance = "bray") soil_mds_bray
First a plot without coloring metadata.
<- plot_ordination(soil, soil_mds_bray,
mds_soil_bray title = "MDS of 88 soil data")
+
mds_soil_bray theme_bw() +
theme(text = element_text(size = 14)) +
geom_point(size = 3)
One could identify two main clusters, separated by the first axis.
Add the new data frame to the phyloseq object.
sample_data(soil) <- sampdata
<- plot_ordination(soil, soil_mds_bray, color = "phType",
mds_soil_bray title = "")
+
mds_soil_bray theme_bw() +
theme(text = element_text(size = 14)) +
geom_point(size = 3) +
ggtitle("MDS of 88 soil data colored by pH type")
Indeed, pH type separates samples along Axis.1.
MDS plot with Aitchison distance
We generate another MDS plot, but this time using the Aitchison distance, which is simply the Euclidean distance of clr-transformed counts.
<- ordinate(soil_clr, method = "MDS", distance = "euclidean") soil_mds_ait
sample_data(soil_clr) <- sampdata
<- plot_ordination(soil_clr, soil_mds_ait, color = "phType",
mds_soil_ait title = "")
<- mds_soil_bray +
p1 theme_bw() +
theme(text = element_text(size = 12)) +
geom_point(size = 3) +
ggtitle("Bray Curtis dissimilarity")
<- mds_soil_ait +
p2 theme_bw() +
theme(text = element_text(size = 12)) +
geom_point(size = 3) +
ggtitle("Aitchison distance")
::grid.arrange(p1, p2,
gridExtrancol = 2, nrow = 1)
MDS plots for several distance measures
Now, we apply all available distance functions to the data, compute the corresponding MDS embeddings, and plot them.
# Remove some unwanted distances
<- dist_methods[!dist_methods %in% c("unifrac", "wunifrac",
sel_dist "dpcoa", "ANY", "z")]
Now we can apply these distance metrics to our data.
<- function(d_method, phylo_obj){
gather_ordination_data # Calculate the MDS matrix using the distance
<- ordinate(phylo_obj, method = "MDS", distance = d_method)
ordinate_obj # Get the data for plotting
<- plot_ordination(phylo_obj, ordinate_obj)$data
plot_data # Add a column for distance
$distance <- d_method
plot_datareturn(plot_data)
}
<- bind_rows(lapply(sel_dist, gather_ordination_data,
mds_plot_data phylo_obj = soil))
head(mds_plot_data)
Axis.1 Axis.2 sampleID totalReads ph phType distance
103.CA2...1 -0.33649635 0.02831767 103.CA2 205 8.02 basic jsd
103.CO3...2 -0.20666028 0.14139595 103.CO3 233 6.02 acid jsd
103.SR3...3 -0.28623896 -0.06653403 103.SR3 169 6.95 basic jsd
103.IE2...4 0.07986531 0.15001745 103.IE2 244 5.52 acid jsd
103.BP1...5 -0.23627593 0.15166403 103.BP1 297 7.53 basic jsd
103.VC2...6 -0.05674931 0.23499852 103.VC2 256 5.99 acid jsd
We color again according to pH type.
<- ggplot(mds_plot_data, aes(x=Axis.1, y=Axis.2, color=phType)) +
mds_soil geom_point() +
facet_wrap(~ distance, scales = "free") +
labs(x="Axis.1",
y="Axis.2",
title="Separation of 88 soil samples according to different distances")
+ theme_bw() + theme(text = element_text(size = 12)) mds_soil