rewrote treetime.py script - need to add more sequences, clock estimation is bad

ArtPoon · ArtPoon · commit 2374f989dd0c · 2020-05-11T22:55:47.000-04:00
diff --git a/treetime.py b/treetime.py
@@ -0,0 +1,93 @@
+import argparse
+from subprocess import Popen, PIPE, check_call
+import json
+from gotoh2 import iter_fasta
+from tempfile import NamedTemporaryFile
+
+
+def filter_fasta(fasta_file, json_file):
+    """
+    :param fasta_file:  path to FASTA file containing cluster sequences
+    :param json_file:  path to JSON file with cluster information
+    :return:  dict, filtered header-sequence pairs
+    """
+    result = {}
+    fasta = dict(list(iter_fasta(fasta_file)))
+    clusters = json.load(json_file)
+    for cluster in clusters:
+        header = cluster['nodes'][0]
+        result.update({header: fasta[header]})
+    return result
+
+
+def fasttree(fasta):
+    """
+    Wrapper for FastTree2, passing FASTA as stdin and capturing the
+    resulting Newick tree string as stdout.
+    :param fasta: dict, header: sequence pairs
+    :return: str, Newick tree string
+    """
+    in_str = ''
+    for h, s in fasta.items():
+        in_str += '>{}\n{}\n'.format(h, s)
+    p = Popen(['fasttree2', '-nt'], stdin=PIPE, stdout=PIPE)
+    # TODO: exception handling with stderr?
+    stdout, stderr = p.communicate(input=in_str.encode('utf-8'))
+    return stdout.decode('utf-8')
+
+
+def treetime(nwk, fasta, outdir):
+    """
+    :param nwk: str, Newick tree string from fasttree()
+    :param fasta: dict, header-sequence pairs
+    """
+    # extract dates from sequence headers
+    datefile = NamedTemporaryFile('w', delete=False)
+    datefile.write('name,date\n')
+    alnfile = NamedTemporaryFile('w', delete=False)
+    for h, s in fasta.items():
+        datefile.write('{},{}\n'.format(h, h.split('|')[-1]))
+        alnfile.write('>{}\n{}\n'.format(h, s))
+    datefile.close()
+    alnfile.close()
+
+    with NamedTemporaryFile('w', delete=False) as nwkfile:
+        nwkfile.write(nwk)
+
+    check_call(['treetime', '--tree', nwkfile.name,
+                '--aln', alnfile.name, '--dates', datefile.name,
+                '--outdir', outdir])
+
+
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Generate inputs for TreeTime analysis."
+    )
+    parser.add_argument('--json', type=argparse.FileType('r'),
+                        default=open('data/clusters.json'),
+                        help='input, JSON file generated by hclust.R '
+                             'identifying representative cluster '
+                             'sequences')
+    parser.add_argument('--fasta', type=argparse.FileType('r'),
+                        default=open('data/clusters.fa'),
+                        help='input, FASTA file with unique variant '
+                             'sequences')
+    parser.add_argument('--outdir', default='treetime/',
+                        help='directory to write TreeTime output files')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    fasta = filter_fasta(args.fasta, args.json)
+    nwk = fasttree(fasta)
+    treetime(nwk, fasta, args.outdir)
+
+
+# pass outputs to fasttree2 and treetime
+# fasttree2 -nt < clusters.fa > clusters.ft2.nwk
+# python3 prune-long-tips.py
+# treetime --tree data/clusters.pruned.nwk --aln data/clusters.fa --dates data/clusters.dates.csv
+# python3 parse-nexus.py