|
| 1 | +import argparse |
| 2 | +from subprocess import Popen, PIPE, check_call |
| 3 | +import json |
| 4 | +from gotoh2 import iter_fasta |
| 5 | +from tempfile import NamedTemporaryFile |
| 6 | + |
| 7 | + |
| 8 | +def filter_fasta(fasta_file, json_file): |
| 9 | + """ |
| 10 | + :param fasta_file: path to FASTA file containing cluster sequences |
| 11 | + :param json_file: path to JSON file with cluster information |
| 12 | + :return: dict, filtered header-sequence pairs |
| 13 | + """ |
| 14 | + result = {} |
| 15 | + fasta = dict(list(iter_fasta(fasta_file))) |
| 16 | + clusters = json.load(json_file) |
| 17 | + for cluster in clusters: |
| 18 | + header = cluster['nodes'][0] |
| 19 | + result.update({header: fasta[header]}) |
| 20 | + return result |
| 21 | + |
| 22 | + |
| 23 | +def fasttree(fasta): |
| 24 | + """ |
| 25 | + Wrapper for FastTree2, passing FASTA as stdin and capturing the |
| 26 | + resulting Newick tree string as stdout. |
| 27 | + :param fasta: dict, header: sequence pairs |
| 28 | + :return: str, Newick tree string |
| 29 | + """ |
| 30 | + in_str = '' |
| 31 | + for h, s in fasta.items(): |
| 32 | + in_str += '>{}\n{}\n'.format(h, s) |
| 33 | + p = Popen(['fasttree2', '-nt'], stdin=PIPE, stdout=PIPE) |
| 34 | + # TODO: exception handling with stderr? |
| 35 | + stdout, stderr = p.communicate(input=in_str.encode('utf-8')) |
| 36 | + return stdout.decode('utf-8') |
| 37 | + |
| 38 | + |
| 39 | +def treetime(nwk, fasta, outdir): |
| 40 | + """ |
| 41 | + :param nwk: str, Newick tree string from fasttree() |
| 42 | + :param fasta: dict, header-sequence pairs |
| 43 | + """ |
| 44 | + # extract dates from sequence headers |
| 45 | + datefile = NamedTemporaryFile('w', delete=False) |
| 46 | + datefile.write('name,date\n') |
| 47 | + alnfile = NamedTemporaryFile('w', delete=False) |
| 48 | + for h, s in fasta.items(): |
| 49 | + datefile.write('{},{}\n'.format(h, h.split('|')[-1])) |
| 50 | + alnfile.write('>{}\n{}\n'.format(h, s)) |
| 51 | + datefile.close() |
| 52 | + alnfile.close() |
| 53 | + |
| 54 | + with NamedTemporaryFile('w', delete=False) as nwkfile: |
| 55 | + nwkfile.write(nwk) |
| 56 | + |
| 57 | + check_call(['treetime', '--tree', nwkfile.name, |
| 58 | + '--aln', alnfile.name, '--dates', datefile.name, |
| 59 | + '--outdir', outdir]) |
| 60 | + |
| 61 | + |
| 62 | + |
| 63 | + |
| 64 | +def parse_args(): |
| 65 | + parser = argparse.ArgumentParser( |
| 66 | + description="Generate inputs for TreeTime analysis." |
| 67 | + ) |
| 68 | + parser.add_argument('--json', type=argparse.FileType('r'), |
| 69 | + default=open('data/clusters.json'), |
| 70 | + help='input, JSON file generated by hclust.R ' |
| 71 | + 'identifying representative cluster ' |
| 72 | + 'sequences') |
| 73 | + parser.add_argument('--fasta', type=argparse.FileType('r'), |
| 74 | + default=open('data/clusters.fa'), |
| 75 | + help='input, FASTA file with unique variant ' |
| 76 | + 'sequences') |
| 77 | + parser.add_argument('--outdir', default='treetime/', |
| 78 | + help='directory to write TreeTime output files') |
| 79 | + return parser.parse_args() |
| 80 | + |
| 81 | + |
| 82 | +if __name__ == '__main__': |
| 83 | + args = parse_args() |
| 84 | + fasta = filter_fasta(args.fasta, args.json) |
| 85 | + nwk = fasttree(fasta) |
| 86 | + treetime(nwk, fasta, args.outdir) |
| 87 | + |
| 88 | + |
| 89 | +# pass outputs to fasttree2 and treetime |
| 90 | +# fasttree2 -nt < clusters.fa > clusters.ft2.nwk |
| 91 | +# python3 prune-long-tips.py |
| 92 | +# treetime --tree data/clusters.pruned.nwk --aln data/clusters.fa --dates data/clusters.dates.csv |
| 93 | +# python3 parse-nexus.py |
0 commit comments