Skip to content

Commit dafb2e5

Browse files
committed
added R script for hierarchical clustering
1 parent f10dee0 commit dafb2e5

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed

hclust.R

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
require(igraph)
2+
require(jsonlite)
3+
4+
tn93 <- read.csv('data/clusters.tn93.txt', skip=1, header=F)
5+
info <- read.csv('data/clusters.info.csv')
6+
7+
# read headers from FASTA
8+
headers <- rep(NA, times=nrow(tn93))
9+
con <- file('data/clusters.fa', open='r')
10+
i <- 1
11+
while (length(line <- readLines(con, n=1, warn=FALSE)) > 0) {
12+
if (grepl("^>", line)) {
13+
headers[i] <- gsub("^>(.+)", "\\1", line)
14+
i <- i + 1
15+
}
16+
}
17+
18+
hc <- hclust(as.dist(tn93), method='ward.D')
19+
clusters <- cutree(hc, h=0.002)
20+
#hist(log10(table(clusters)), breaks=20, col='grey', border='white')
21+
22+
dates <- as.Date(sapply(headers, function(x) {
23+
strsplit(x, "\\|")[[1]][3]
24+
}))
25+
26+
# identify the earliest sample (Wuhan, IPBCAMS-WH-01)
27+
root <- which.min(dates)
28+
29+
result <- lapply(1:max(clusters), function(i) {
30+
# find node in cluster that is closest to root
31+
idx <- which(clusters==i)
32+
if (length(idx)==1) {
33+
list(nodes=headers[idx], edges=NA)
34+
}
35+
else {
36+
subroot <- headers[idx[which.min(tn93[root, idx])]]
37+
38+
# generate minimum spanning tree
39+
mx <- as.matrix(tn93[idx, idx])
40+
colnames(mx) <- headers[idx]
41+
g <- graph.adjacency(mx, weighted=TRUE)
42+
g.mst <- igraph::mst(g, algorithm='prim')
43+
44+
# traverse MST and export node and edge lists
45+
el <- get.edgelist(g.mst)
46+
47+
traverse <- function(node, parent, edgelist, edges=c()) {
48+
# get local edges
49+
temp <- el[apply(el, 1, function(e) is.element(node, e)), ]
50+
temp <- unique(as.vector(temp))
51+
children <- temp[!is.element(temp, c(node, parent))]
52+
53+
for (child in children) {
54+
edges <- c(edges, node, child)
55+
edges <- traverse(child, node, edgelist, edges)
56+
}
57+
return(edges)
58+
}
59+
edges <- traverse(subroot, NA, edgelist)
60+
nodes <- unique(edges)
61+
edges <- matrix(edges, ncol=2, byrow=TRUE)
62+
list(nodes=nodes, edges=edges)
63+
}
64+
})
65+
66+
write(toJSON(result, pretty=TRUE), file="cluster.json")
67+
68+
# record subroots

0 commit comments

Comments
 (0)