Skip to content

Commit 2374f98

Browse files
committed
rewrote treetime.py script - need to add more sequences, clock estimation is bad
1 parent dafb2e5 commit 2374f98

File tree

1 file changed

+93
-0
lines changed

1 file changed

+93
-0
lines changed

treetime.py

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import argparse
2+
from subprocess import Popen, PIPE, check_call
3+
import json
4+
from gotoh2 import iter_fasta
5+
from tempfile import NamedTemporaryFile
6+
7+
8+
def filter_fasta(fasta_file, json_file):
9+
"""
10+
:param fasta_file: path to FASTA file containing cluster sequences
11+
:param json_file: path to JSON file with cluster information
12+
:return: dict, filtered header-sequence pairs
13+
"""
14+
result = {}
15+
fasta = dict(list(iter_fasta(fasta_file)))
16+
clusters = json.load(json_file)
17+
for cluster in clusters:
18+
header = cluster['nodes'][0]
19+
result.update({header: fasta[header]})
20+
return result
21+
22+
23+
def fasttree(fasta):
24+
"""
25+
Wrapper for FastTree2, passing FASTA as stdin and capturing the
26+
resulting Newick tree string as stdout.
27+
:param fasta: dict, header: sequence pairs
28+
:return: str, Newick tree string
29+
"""
30+
in_str = ''
31+
for h, s in fasta.items():
32+
in_str += '>{}\n{}\n'.format(h, s)
33+
p = Popen(['fasttree2', '-nt'], stdin=PIPE, stdout=PIPE)
34+
# TODO: exception handling with stderr?
35+
stdout, stderr = p.communicate(input=in_str.encode('utf-8'))
36+
return stdout.decode('utf-8')
37+
38+
39+
def treetime(nwk, fasta, outdir):
40+
"""
41+
:param nwk: str, Newick tree string from fasttree()
42+
:param fasta: dict, header-sequence pairs
43+
"""
44+
# extract dates from sequence headers
45+
datefile = NamedTemporaryFile('w', delete=False)
46+
datefile.write('name,date\n')
47+
alnfile = NamedTemporaryFile('w', delete=False)
48+
for h, s in fasta.items():
49+
datefile.write('{},{}\n'.format(h, h.split('|')[-1]))
50+
alnfile.write('>{}\n{}\n'.format(h, s))
51+
datefile.close()
52+
alnfile.close()
53+
54+
with NamedTemporaryFile('w', delete=False) as nwkfile:
55+
nwkfile.write(nwk)
56+
57+
check_call(['treetime', '--tree', nwkfile.name,
58+
'--aln', alnfile.name, '--dates', datefile.name,
59+
'--outdir', outdir])
60+
61+
62+
63+
64+
def parse_args():
65+
parser = argparse.ArgumentParser(
66+
description="Generate inputs for TreeTime analysis."
67+
)
68+
parser.add_argument('--json', type=argparse.FileType('r'),
69+
default=open('data/clusters.json'),
70+
help='input, JSON file generated by hclust.R '
71+
'identifying representative cluster '
72+
'sequences')
73+
parser.add_argument('--fasta', type=argparse.FileType('r'),
74+
default=open('data/clusters.fa'),
75+
help='input, FASTA file with unique variant '
76+
'sequences')
77+
parser.add_argument('--outdir', default='treetime/',
78+
help='directory to write TreeTime output files')
79+
return parser.parse_args()
80+
81+
82+
if __name__ == '__main__':
83+
args = parse_args()
84+
fasta = filter_fasta(args.fasta, args.json)
85+
nwk = fasttree(fasta)
86+
treetime(nwk, fasta, args.outdir)
87+
88+
89+
# pass outputs to fasttree2 and treetime
90+
# fasttree2 -nt < clusters.fa > clusters.ft2.nwk
91+
# python3 prune-long-tips.py
92+
# treetime --tree data/clusters.pruned.nwk --aln data/clusters.fa --dates data/clusters.dates.csv
93+
# python3 parse-nexus.py

0 commit comments

Comments
 (0)