|
| 1 | +require(igraph) |
| 2 | +require(jsonlite) |
| 3 | + |
| 4 | +tn93 <- read.csv('data/clusters.tn93.txt', skip=1, header=F) |
| 5 | +info <- read.csv('data/clusters.info.csv') |
| 6 | + |
| 7 | +# read headers from FASTA |
| 8 | +headers <- rep(NA, times=nrow(tn93)) |
| 9 | +con <- file('data/clusters.fa', open='r') |
| 10 | +i <- 1 |
| 11 | +while (length(line <- readLines(con, n=1, warn=FALSE)) > 0) { |
| 12 | + if (grepl("^>", line)) { |
| 13 | + headers[i] <- gsub("^>(.+)", "\\1", line) |
| 14 | + i <- i + 1 |
| 15 | + } |
| 16 | +} |
| 17 | + |
| 18 | +hc <- hclust(as.dist(tn93), method='ward.D') |
| 19 | +clusters <- cutree(hc, h=0.002) |
| 20 | +#hist(log10(table(clusters)), breaks=20, col='grey', border='white') |
| 21 | + |
| 22 | +dates <- as.Date(sapply(headers, function(x) { |
| 23 | + strsplit(x, "\\|")[[1]][3] |
| 24 | + })) |
| 25 | + |
| 26 | +# identify the earliest sample (Wuhan, IPBCAMS-WH-01) |
| 27 | +root <- which.min(dates) |
| 28 | + |
| 29 | +result <- lapply(1:max(clusters), function(i) { |
| 30 | + # find node in cluster that is closest to root |
| 31 | + idx <- which(clusters==i) |
| 32 | + if (length(idx)==1) { |
| 33 | + list(nodes=headers[idx], edges=NA) |
| 34 | + } |
| 35 | + else { |
| 36 | + subroot <- headers[idx[which.min(tn93[root, idx])]] |
| 37 | + |
| 38 | + # generate minimum spanning tree |
| 39 | + mx <- as.matrix(tn93[idx, idx]) |
| 40 | + colnames(mx) <- headers[idx] |
| 41 | + g <- graph.adjacency(mx, weighted=TRUE) |
| 42 | + g.mst <- igraph::mst(g, algorithm='prim') |
| 43 | + |
| 44 | + # traverse MST and export node and edge lists |
| 45 | + el <- get.edgelist(g.mst) |
| 46 | + |
| 47 | + traverse <- function(node, parent, edgelist, edges=c()) { |
| 48 | + # get local edges |
| 49 | + temp <- el[apply(el, 1, function(e) is.element(node, e)), ] |
| 50 | + temp <- unique(as.vector(temp)) |
| 51 | + children <- temp[!is.element(temp, c(node, parent))] |
| 52 | + |
| 53 | + for (child in children) { |
| 54 | + edges <- c(edges, node, child) |
| 55 | + edges <- traverse(child, node, edgelist, edges) |
| 56 | + } |
| 57 | + return(edges) |
| 58 | + } |
| 59 | + edges <- traverse(subroot, NA, edgelist) |
| 60 | + nodes <- unique(edges) |
| 61 | + edges <- matrix(edges, ncol=2, byrow=TRUE) |
| 62 | + list(nodes=nodes, edges=edges) |
| 63 | + } |
| 64 | +}) |
| 65 | + |
| 66 | +write(toJSON(result, pretty=TRUE), file="cluster.json") |
| 67 | + |
| 68 | +# record subroots |
0 commit comments