got treetime working

ArtPoon · ArtPoon · commit 4ba6b0ead2cd · 2020-05-12T14:06:43.000-04:00
diff --git a/covizu.sh b/covizu.sh
@@ -0,0 +1,23 @@
+# stop at the first failing step: each stage reads the previous stage's output
+set -e
+
+# screen for non-human and low-coverage samples -> gisaid-filtered.fa
+python3 filtering.py
+
+# calculate TN93 distances (-t 0.0001 threshold) -> data/gisaid.tn93.csv
+tn93 -t 0.0001 -o data/gisaid.tn93.csv data/gisaid-filtered.fa
+
+# cluster genomes into variants -> data/variants.csv, data/variants.fa
+python3 variants.py
+
+# calculate TN93 distances for variants and output as HyPhy matrix
+tn93 -o data/variants.tn93.txt -f hyphy data/variants.fa
+
+# strip the curly braces in place so hclust.R can read the matrix as CSV
+sed -i 's/[{}]//g' data/variants.tn93.txt
+
+# hierarchical clustering -> data/clusters.json
+Rscript hclust.R
+
+# run FastTree and TreeTime
+python3 treetime.py
diff --git a/hclust.R b/hclust.R
@@ -1,12 +1,12 @@
 require(igraph)
 require(jsonlite)
 
-tn93 <- read.csv('data/clusters.tn93.txt', skip=1, header=F)
-info <- read.csv('data/clusters.info.csv')
+tn93 <- read.csv('data/variants.tn93.txt', skip=1, header=F)
+variants <- read.csv('data/variants.csv')
 
 # read headers from FASTA
 headers <- rep(NA, times=nrow(tn93))
-con <- file('data/clusters.fa', open='r')
+con <- file('data/variants.fa', open='r')
 i <- 1
 while (length(line <- readLines(con, n=1, warn=FALSE)) > 0) {
   if (grepl("^>", line)) {
@@ -57,12 +57,22 @@ result <- lapply(1:max(clusters), function(i) {
       return(edges)
     }
     edges <- traverse(subroot, NA, edgelist)
-    nodes <- unique(edges)
     edges <- matrix(edges, ncol=2, byrow=TRUE)
+
+    # store variant data once per node; unique() on a matrix only drops rows
+    nodes <- list()
+    for (node in unique(as.vector(edges))) {
+      temp <- variants[variants$cluster==node, ]
+      temp$label1 <- sapply(as.character(temp$label), function(x) {
+        strsplit(x, "\\|")[[1]][1]
+      })
+      nodes[[node]] <- temp[c('label1', 'region', 'country', 'coldate')]
+    }
+
     list(nodes=nodes, edges=edges)  
   }
 })
 
-write(toJSON(result, pretty=TRUE), file="cluster.json")
+write(toJSON(result, pretty=TRUE), file="data/clusters.json")
 
 # record subroots
diff --git a/mst.py b/mst.py
@@ -48,7 +48,7 @@ def get_edgelist(g):
 
 def parse_clusters(infile):
     """
-    :param infile: cluster info file from clustering.py
+    :param infile: cluster info file from variants.py
     :return: dict
     """
     clusters = {}
@@ -164,7 +164,7 @@ def parse_args():
                         help='input, path to TN93 CSV file')
     parser.add_argument('--info', default='data/clusters.info.csv',
                         help='input, path to CSV with cluster information, '
-                             'generated by clustering.py')
+                             'generated by variants.py')
     parser.add_argument('--outstem', default='mst/component-{}.edgelist.csv',
                         help='output, stem for output files with Python '
                              'formatted string syntax with one placeholder '
diff --git a/parse-nexus.py b/parse-nexus.py
@@ -1,27 +1,38 @@
 import re
 import argparse
-from clustering import date2float
 from datetime import date
 import sys
 from Bio import Phylo
 from io import StringIO
 
+
+def date2float(dt):
+    """Return dt as a decimal year: year + days since 1 January / 365.25."""
+    elapsed = dt - date(dt.year, 1, 1)
+    return dt.year + elapsed.days/365.25
+
+
 DATE_TOL = 0.1
 
 parser = argparse.ArgumentParser(
     description = "Use regular expressions to extract comment fields "
                   "from NEXUS output of TreeTime and write to a "
                   "separate CSV file.  Remove problematic tips."
 )
-parser.add_argument('infile', type=argparse.FileType('r'),
+parser.add_argument('--infile', type=argparse.FileType('r'),
+                    default='treetime/timetree.nexus',  # str: opened lazily
                     help="input, TreeTime NEXUS output file")
-parser.add_argument('csvfile', type=argparse.FileType('w'),
+parser.add_argument('--csvfile', type=argparse.FileType('w'),
+                    default='treetime/nodedate.csv',  # str: no eager truncate
                     help="output, CSV file with node date estimates")
-parser.add_argument('outfile', type=argparse.FileType('w'),
+parser.add_argument('--outfile', type=argparse.FileType('w'),
+                    default='treetime/timetree.nwk',  # str: no eager truncate
                     help="output, cleaned Newick file")
 args = parser.parse_args()
 
-handle = open('data/clusters.info.csv')
+
+handle = open('data/variants.csv')
+_ = next(handle)  # skip header line
 coldates = {}
 for line in handle:
     _, node, dt, _, _ = line.strip().split(',')
diff --git a/treetime.py b/treetime.py
@@ -3,9 +3,10 @@
 import json
 from gotoh2 import iter_fasta
 from tempfile import NamedTemporaryFile
+import sys
 
 
-def filter_fasta(fasta_file, json_file):
+def filter_fasta(fasta_file, json_file, cutoff=10):
     """
     :param fasta_file:  path to FASTA file containing cluster sequences
     :param json_file:  path to JSON file with cluster information
@@ -15,8 +16,22 @@ def filter_fasta(fasta_file, json_file):
     fasta = dict(list(iter_fasta(fasta_file)))
     clusters = json.load(json_file)
     for cluster in clusters:
-        header = cluster['nodes'][0]
+        # record variant in cluster that is closest to root
+        if isinstance(cluster['nodes'], list):
+            # omit problematic cluster of one (nodes is a list, not a dict)
+            print(cluster['nodes'])
+            continue
+
+        header = list(cluster['nodes'].keys())[0]
         result.update({header: fasta[header]})
+
+        # extract variants in cluster that have high counts
+        major = [label for label, samples in
+                 cluster['nodes'].items() if
+                 len(samples) > cutoff and label != header]
+        for label in major:
+            result.update({label: fasta[label]})
+
     return result
 
 
@@ -30,7 +45,7 @@ def fasttree(fasta):
     in_str = ''
     for h, s in fasta.items():
         in_str += '>{}\n{}\n'.format(h, s)
-    p = Popen(['fasttree2', '-nt'], stdin=PIPE, stdout=PIPE)
+    p = Popen(['fasttree2', '-nt', '-quote'], stdin=PIPE, stdout=PIPE)
     # TODO: exception handling with stderr?
     stdout, stderr = p.communicate(input=in_str.encode('utf-8'))
     return stdout.decode('utf-8')
@@ -46,13 +61,15 @@ def treetime(nwk, fasta, outdir):
     datefile.write('name,date\n')
     alnfile = NamedTemporaryFile('w', delete=False)
     for h, s in fasta.items():
+        # TreeTime seems to have trouble handling labels with spaces
+        h = h.replace(' ', '')
         datefile.write('{},{}\n'.format(h, h.split('|')[-1]))
         alnfile.write('>{}\n{}\n'.format(h, s))
     datefile.close()
     alnfile.close()
 
     with NamedTemporaryFile('w', delete=False) as nwkfile:
-        nwkfile.write(nwk)
+        nwkfile.write(nwk.replace(' ', ''))
 
     check_call(['treetime', '--tree', nwkfile.name,
                 '--aln', alnfile.name, '--dates', datefile.name,
@@ -74,6 +91,9 @@ def parse_args():
                         default=open('data/clusters.fa'),
                         help='input, FASTA file with unique variant '
                              'sequences')
+    parser.add_argument('--mincount', type=int, default=10,
+                        help='option, minimum count of variant to be '
+                             'added to tree')
     parser.add_argument('--outdir', default='treetime/',
                         help='directory to write TreeTime output files')
     return parser.parse_args()
diff --git a/variants.py b/variants.py
@@ -175,12 +175,12 @@ def parse_args():
     parser.add_argument('--country', default='countries.csv',
                         help='input, path to CSV file linking countries '
                              'to geographic regions (continents).')
-    parser.add_argument('--info', default='data/clusters.info.csv',
-                        help='output, path to write CSV containing '
-                             'cluster info')
+    parser.add_argument('--info', default='data/variants.csv',
+                        help='output, path to write CSV describing '
+                             'composition of variants')
     parser.add_argument('--fasta_in', default='data/gisaid-aligned.fa',
                         help='input, path to FASTA with aligned genomes')
-    parser.add_argument('--fasta_out', default='data/clusters.fa',
+    parser.add_argument('--fasta_out', default='data/variants.fa',
                         help='output, path to write cluster FASTA')
 
     return parser.parse_args()