Skip to content

Commit 4ba6b0e

Browse files
committed
got treetime working
1 parent 2374f98 commit 4ba6b0e

File tree

6 files changed

+81
-20
lines changed

6 files changed

+81
-20
lines changed

covizu.sh

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# screen for non-human and low-coverage samples -> gisaid-filtered.fa
2+
python3 filtering.py
3+
4+
# calculate TN93 distances
5+
tn93 -t 0.0001 -o data/gisaid.tn93.csv data/gisaid-filtered.fa
6+
7+
# cluster genomes into variants -> variants.csv, variants.fa
8+
python3 variants.py
9+
10+
# calculate TN93 distances for clusters and output as HyPhy matrix
11+
tn93 -o data/variants.tn93.txt -f hyphy data/variants.fa
12+
13+
# convert HyPhy matrix format to CSV
14+
sed -i 's/[{}]//g' data/variants.tn93.txt
15+
16+
# hierarchical clustering -> data/clusters.json
17+
Rscript hclust.R
18+
19+
# run FastTree and TreeTime
20+
python3 treetime.py

hclust.R

+15-5
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
require(igraph)
22
require(jsonlite)
33

4-
tn93 <- read.csv('data/clusters.tn93.txt', skip=1, header=F)
5-
info <- read.csv('data/clusters.info.csv')
4+
tn93 <- read.csv('data/variants.tn93.txt', skip=1, header=F)
5+
variants <- read.csv('data/variants.csv')
66

77
# read headers from FASTA
88
headers <- rep(NA, times=nrow(tn93))
9-
con <- file('data/clusters.fa', open='r')
9+
con <- file('data/variants.fa', open='r')
1010
i <- 1
1111
while (length(line <- readLines(con, n=1, warn=FALSE)) > 0) {
1212
if (grepl("^>", line)) {
@@ -57,12 +57,22 @@ result <- lapply(1:max(clusters), function(i) {
5757
return(edges)
5858
}
5959
edges <- traverse(subroot, NA, edgelist)
60-
nodes <- unique(edges)
6160
edges <- matrix(edges, ncol=2, byrow=TRUE)
61+
62+
# store variant data
63+
nodes <- list()
64+
for (node in unique(edges)) {
65+
temp <- variants[variants$cluster==node, ]
66+
temp$label1 <- sapply(as.character(temp$label), function(x) {
67+
strsplit(x, "\\|")[[1]][1]
68+
})
69+
nodes[[node]] <- temp[c('label1', 'region', 'country', 'coldate')]
70+
}
71+
6272
list(nodes=nodes, edges=edges)
6373
}
6474
})
6575

66-
write(toJSON(result, pretty=TRUE), file="cluster.json")
76+
write(toJSON(result, pretty=TRUE), file="data/clusters.json")
6777

6878
# record subroots

mst.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def get_edgelist(g):
4848

4949
def parse_clusters(infile):
5050
"""
51-
:param infile: cluster info file from clustering.py
51+
:param infile: cluster info file from variants.py
5252
:return: dict
5353
"""
5454
clusters = {}
@@ -164,7 +164,7 @@ def parse_args():
164164
help='input, path to TN93 CSV file')
165165
parser.add_argument('--info', default='data/clusters.info.csv',
166166
help='input, path to CSV with cluster information, '
167-
'generated by clustering.py')
167+
'generated by variants.py')
168168
parser.add_argument('--outstem', default='mst/component-{}.edgelist.csv',
169169
help='output, stem for output files with Python '
170170
'formatted string syntax with one placeholder '

parse-nexus.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,38 @@
11
import re
22
import argparse
3-
from clustering import date2float
43
from datetime import date
54
import sys
65
from Bio import Phylo
76
from io import StringIO
87

8+
9+
def date2float(dt):
10+
origin = date(dt.year, 1, 1)
11+
td = (dt-origin).days
12+
return dt.year + td/365.25
13+
14+
915
DATE_TOL = 0.1
1016

1117
parser = argparse.ArgumentParser(
1218
description = "Use regular expressions to extract comment fields "
1319
"from NEXUS output of TreeTime and write to a "
1420
"separate CSV file. Remove problematic tips."
1521
)
16-
parser.add_argument('infile', type=argparse.FileType('r'),
22+
parser.add_argument('--infile', type=argparse.FileType('r'),
23+
default=open('treetime/timetree.nexus'),
1724
help="input, TreeTime NEXUS output file")
18-
parser.add_argument('csvfile', type=argparse.FileType('w'),
25+
parser.add_argument('--csvfile', type=argparse.FileType('w'),
26+
default=open('treetime/nodedate.csv', 'w'),
1927
help="output, CSV file with node date estimates")
20-
parser.add_argument('outfile', type=argparse.FileType('w'),
28+
parser.add_argument('--outfile', type=argparse.FileType('w'),
29+
default=open('treetime/timetree.nwk', 'w'),
2130
help="output, cleaned Newick file")
2231
args = parser.parse_args()
2332

24-
handle = open('data/clusters.info.csv')
33+
34+
handle = open('data/variants.csv')
35+
_ = next(handle) # skip header line
2536
coldates = {}
2637
for line in handle:
2738
_, node, dt, _, _ = line.strip().split(',')

treetime.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import json
44
from gotoh2 import iter_fasta
55
from tempfile import NamedTemporaryFile
6+
import sys
67

78

8-
def filter_fasta(fasta_file, json_file):
9+
def filter_fasta(fasta_file, json_file, cutoff=10):
910
"""
1011
:param fasta_file: path to FASTA file containing cluster sequences
1112
:param json_file: path to JSON file with cluster information
@@ -15,8 +16,22 @@ def filter_fasta(fasta_file, json_file):
1516
fasta = dict(list(iter_fasta(fasta_file)))
1617
clusters = json.load(json_file)
1718
for cluster in clusters:
18-
header = cluster['nodes'][0]
19+
# record variant in cluster that is closest to root
20+
if type(cluster['nodes']) is list:
21+
# omit problematic cluster of one
22+
print(cluster['nodes'])
23+
continue
24+
25+
header = list(cluster['nodes'].keys())[0]
1926
result.update({header: fasta[header]})
27+
28+
# extract variants in cluster that have high counts
29+
major = [label for label, samples in
30+
cluster['nodes'].items() if
31+
len(samples) > cutoff and label != header]
32+
for label in major:
33+
result.update({label: fasta[label]})
34+
2035
return result
2136

2237

@@ -30,7 +45,7 @@ def fasttree(fasta):
3045
in_str = ''
3146
for h, s in fasta.items():
3247
in_str += '>{}\n{}\n'.format(h, s)
33-
p = Popen(['fasttree2', '-nt'], stdin=PIPE, stdout=PIPE)
48+
p = Popen(['fasttree2', '-nt', '-quote'], stdin=PIPE, stdout=PIPE)
3449
# TODO: exception handling with stderr?
3550
stdout, stderr = p.communicate(input=in_str.encode('utf-8'))
3651
return stdout.decode('utf-8')
@@ -46,13 +61,15 @@ def treetime(nwk, fasta, outdir):
4661
datefile.write('name,date\n')
4762
alnfile = NamedTemporaryFile('w', delete=False)
4863
for h, s in fasta.items():
64+
# TreeTime seems to have trouble handling labels with spaces
65+
h = h.replace(' ', '')
4966
datefile.write('{},{}\n'.format(h, h.split('|')[-1]))
5067
alnfile.write('>{}\n{}\n'.format(h, s))
5168
datefile.close()
5269
alnfile.close()
5370

5471
with NamedTemporaryFile('w', delete=False) as nwkfile:
55-
nwkfile.write(nwk)
72+
nwkfile.write(nwk.replace(' ', ''))
5673

5774
check_call(['treetime', '--tree', nwkfile.name,
5875
'--aln', alnfile.name, '--dates', datefile.name,
@@ -74,6 +91,9 @@ def parse_args():
7491
default=open('data/clusters.fa'),
7592
help='input, FASTA file with unique variant '
7693
'sequences')
94+
parser.add_argument('--mincount', type=int, default=10,
95+
help='option, minimum count of variant to be '
96+
'added to tree')
7797
parser.add_argument('--outdir', default='treetime/',
7898
help='directory to write TreeTime output files')
7999
return parser.parse_args()

clustering.py renamed to variants.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,12 @@ def parse_args():
175175
parser.add_argument('--country', default='countries.csv',
176176
help='input, path to CSV file linking countries '
177177
'to geographic regions (continents).')
178-
parser.add_argument('--info', default='data/clusters.info.csv',
179-
help='output, path to write CSV containing '
180-
'cluster info')
178+
parser.add_argument('--info', default='data/variants.csv',
179+
help='output, path to write CSV describing '
180+
'composition of variants')
181181
parser.add_argument('--fasta_in', default='data/gisaid-aligned.fa',
182182
help='input, path to FASTA with aligned genomes')
183-
parser.add_argument('--fasta_out', default='data/clusters.fa',
183+
parser.add_argument('--fasta_out', default='data/variants.fa',
184184
help='output, path to write cluster FASTA')
185185

186186
return parser.parse_args()

0 commit comments

Comments
 (0)