-
Notifications
You must be signed in to change notification settings - Fork 5
Generating Distance Matrices from Sequence and Omics Data for omeClust
Ali Rahnavard edited this page Apr 9, 2025
·
1 revision
This R code demonstrates how to generate distance matrices from two types of input data: sequencing data and other omics profiles such as relative abundance tables. These distance matrices can then be used as input for omeClust, a community detection tool. Additionally, the script includes an analysis of ecological diversity using HMP data, providing insights into alpha diversity across different body areas.
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "~/Downloads/Lab_metagenomics/")
library(ape) # for sequence distances
library(vegan) # for ecological distance metrics
library(devtools) # for installing GWDBB
library(GWDBB) # for loading example HMP data
library(ggplot2) # for plotting
# Change this to your local working directory
setwd("~/Downloads/Lab_metagenomics/")
# Load FASTA sequence
seq <- read.FASTA('data/Campylobacter_showae.fasta')
# Clean sequence names
names(seq) <- gsub("_metaphlan_bowtie2", "", names(seq))
# Compute K80 distance
D <- ape::dist.dna(seq, model = "K80", gamma = FALSE, variance = TRUE,
pairwise.deletion = TRUE, as.matrix = TRUE)
# Save distance matrix
write.table(D, 'data/Campylobacter_showae_K80.txt', sep = "\t",
quote = FALSE, col.names = NA, row.names = TRUE)
# Run once to install
devtools::install_github("GWCBI/GWDBB")
library(GWDBB)
data("HMP1_II_Metadata")
# Fix column names
colnames(HMP1_II_Metadata) <- c("Person_ID", "VISNO", "Body_area", "Body_site", "SNPRNT", "Gender", "WMSPhase")
# Select useful columns
my_HMP_metadata <- HMP1_II_Metadata[, c("Body_area", "Body_site", "Gender")]
# Save metadata
write.table(my_HMP_metadata, 'data/my_HMP_metadata.txt', sep = "\t",
quote = FALSE, col.names = NA, row.names = TRUE)
data("HMP1_II_Microbial_Species")
HMP1_II_Microbial_Species <- t(HMP1_II_Microbial_Species)
# Bray-Curtis distance between samples
veg_dist <- as.matrix(vegdist(HMP1_II_Microbial_Species, method = "bray"))
# Save distance matrix
write.table(veg_dist, 'data/HMP_disatnce.txt', sep = "\t",
quote = FALSE, col.names = NA, row.names = TRUE)
To run omeClust
, open a terminal and follow these steps:
# 1. Install
$ sudo pip3 install omeClust
# 2. Check version
$ omeClust --version
# Expected output: omeClust 1.1.5
# 3. View help
$ omeClust -h
# 4. Run omeClust
$ omeClust -i data/HMP_disatnce.txt --metadata data/my_HMP_metadata.txt -o HMP_omeClust --plot
π£ Discussion: Find 2 interesting biological insights from your clustering results and bring 1 question to session or journal club.
# Prepare diversity data frame
HMP1_II_Microbial_Species <- as.data.frame(HMP1_II_Microbial_Species)
diversity_data <- setNames(
data.frame(matrix(nrow = nrow(HMP1_II_Microbial_Species), ncol = 4)),
c("Sample", "Group", "Alpha_diversity", "Beta_diversity")
)
rownames(diversity_data) <- rownames(HMP1_II_Microbial_Species)
# Compute common diversity indices
simp <- diversity(HMP1_II_Microbial_Species, "simpson")
invsimp <- diversity(HMP1_II_Microbial_Species, "invsimpson")
shannon <- diversity(HMP1_II_Microbial_Species, "shannon")
hist(simp, main = "Simpson Diversity", xlab = "Index")
hist(invsimp, main = "Inverse Simpson", xlab = "Index")
hist(shannon, main = "Shannon Diversity", xlab = "Index")
barplot(sort(simp), main = "Sorted Simpson Diversity")
barplot(sort(invsimp), main = "Sorted Inverse Simpson")
diversity_data$Alpha_diversity <- shannon
diversity_data$Body_area <- my_HMP_metadata[rownames(HMP1_II_Microbial_Species), "Body_area"]
ggplot(diversity_data, aes(x = factor(Body_area), y = Alpha_diversity)) +
geom_boxplot(aes(fill = Body_area), alpha = 0.25) +
geom_point(aes(fill = Body_area), size = 2, alpha = 0.5, shape = 21,
colour = "black", position = position_jitterdodge(), show.legend = FALSE) +
labs(title = "Shannon Diversity across Body Areas", x = "Body Area", y = "Shannon Index")
- Try this pipeline with your own data.
- Explore different models for sequence distance (
model = "raw"
,"TN93"
, etc.). - Replace Bray-Curtis with other ecological distances (e.g., Jaccard).
- Feed output to
omeClust
and interpret cluster outputs biologically.