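Improved mash reference genome parsing; also removed a debug print, defaulted missing MLST profiles to 'NA', and recorded trimmed/corrected FASTQ files after assembly filtering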

adamkoziol · adamkoziol · commit b96d9bac5e58 · 2016-05-31T10:44:00.000-04:00
diff --git a/OLCspades/accessoryFunctions.py b/OLCspades/accessoryFunctions.py
@@ -202,8 +202,6 @@ def __getattr__(self, key):
             return self.datastore[key]
 
     def __setattr__(self, key, value):
-        if key == 'trimmedcorrectedfastqfiles':
-            print value
         if value:
             self.datastore[key] = value
         elif type(value) != int:
diff --git a/OLCspades/mMLST.py b/OLCspades/mMLST.py
@@ -452,10 +452,10 @@ def alleleupdater(self, sample, gene, targetallele):
                                     return gene, allelenumber, 100.0, hsp.score
 
     def sequencetyper(self):
+        """Determines the sequence type of each strain based on comparisons to sequence type profiles"""
         for sample in self.metadata:
             if sample.general.bestassemblyfile != 'NA':
-                if type(sample[self.analysistype].allelenames) == list:
-                    """Determines the sequence type of each strain based on comparisons to sequence type profiles"""
+                if len(sample[self.analysistype].allelenames) > 1:
                     # Initialise variables
                     header = 0
                     # Iterate through the genomes
@@ -465,7 +465,6 @@ def sequencetyper(self):
                     self.bestmatch[genome] = defaultdict(int)
                     if sample[self.analysistype].profile != 'NA':
                         # Create the profiledata variable to avoid writing self.profiledata[self.analysistype]
-                        # profiledata = self.profiledata[self.analysistype]
                         profiledata = sample[self.analysistype].profiledata
                         # For each gene in plusdict[genome]
                         for gene in sample[self.analysistype].allelenames:
@@ -583,7 +582,7 @@ def sequencetyper(self):
                                                 mismatches.append(
                                                     ({gene: ('{} ({})'.format(self.bestdict[sample.name][gene]
                                                                               .keys()[0], sortedrefallele))}))
-                                        if not self.updateprofile:
+                                        if not self.updateprofile or self.analysistype == 'mlst':
                                             sample[self.analysistype].mismatchestosequencetype = mismatches
                                             sample[self.analysistype].sequencetype = sequencetype
                                             sample[self.analysistype].matchestosequencetype = matches
@@ -607,7 +606,6 @@ def sequencetyper(self):
                     sample[self.analysistype].sequencetype = 'NA'
 
     def reprofiler(self, header, genome, sample):
-        # reprofiler(numGenes, profileFile, geneList, genome)
         """
         Creates and appends new profiles as required
         :param header:
@@ -1330,26 +1328,27 @@ def strainer(self):
                     # updatecall, allelefolder = '', '{}rMLST/holding'.format(self.referencefilepath)
                     self.alleles = glob('{}/*.tfa'.format(allelefolder))
                     # self.alleles = glob('{}/*.fas'.format(allelefolder))
-                    self.profile = glob('{}/*.txt'.format(allelefolder))
+                    profile = glob('{}/*.txt'.format(allelefolder))
                     self.supplementalprofile = '{}rMLST/OLC_rMLST_profiles.txt'.format(self.referencefilepath)
                     self.combinedalleles = glob('{}/*.fasta'.format(allelefolder))
                     # Set the metadata file appropriately
                     sample[self.analysistype].alleledir = allelefolder
                     sample[self.analysistype].updatecall = updatecall
                 else:
                     self.alleles = glob('{}MLST/{}/*.tfa'.format(self.referencefilepath, sample.general.referencegenus))
-                    self.profile = glob('{}MLST/{}/*.txt'.format(self.referencefilepath, sample.general.referencegenus))
+                    profile = glob('{}MLST/{}/*.txt'.format(self.referencefilepath, sample.general.referencegenus))
                     self.combinedalleles = glob('{}MLST/{}/*.fasta'.format(self.referencefilepath,
                                                                            sample.general.referencegenus))
                     sample[self.analysistype].alleledir = '{}MLST/{}/'.format(self.referencefilepath,
                                                                               sample.general.referencegenus)
                 sample[self.analysistype].alleles = self.alleles
                 sample[self.analysistype].allelenames = [os.path.split(x)[1].split('.')[0] for x in self.alleles]
-                sample[self.analysistype].profile = self.profile
+                sample[self.analysistype].profile = profile if profile else 'NA'
                 sample[self.analysistype].analysistype = self.analysistype
                 sample[self.analysistype].reportdir = '{}/{}/'.format(sample.general.outputdirectory, self.analysistype)
                 sample[self.analysistype].combinedalleles = self.combinedalleles
-                sample[self.analysistype].supplementalprofile = self.supplementalprofile
+                sample[self.analysistype].supplementalprofile = self.supplementalprofile if self.supplementalprofile \
+                    else 'NA'
             else:
                 # Set the metadata file appropriately
                 sample[self.analysistype].alleles = 'NA'
diff --git a/OLCspades/mash.py b/OLCspades/mash.py
@@ -9,7 +9,7 @@
 
 class Mash(object):
     def sketching(self):
-        printtime('Indexing assemblies', self.starttime)
+        printtime('Indexing assemblies for mash analysis', self.starttime)
         # Create the threads for the analysis
         for sample in self.metadata:
             if sample.general.bestassemblyfile != 'NA':
@@ -18,9 +18,9 @@ def sketching(self):
                 threads.start()
         # Populate threads for each gene, genome combination
         for sample in self.metadata:
+            # Create the analysis type-specific GenObject
+            setattr(sample, self.analysistype, GenObject())
             if sample.general.bestassemblyfile != 'NA':
-                # Create the analysis type-specific GenObject
-                setattr(sample, self.analysistype, GenObject())
                 # Set attributes
                 sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
                 sample[self.analysistype].targetpath = os.path.join(self.referencefilepath, self.analysistype)
@@ -55,7 +55,7 @@ def sketch(self):
             self.sketchqueue.task_done()
 
     def mashing(self):
-        printtime('Determining closest refseq genome', self.starttime)
+        printtime('Performing mash analyses', self.starttime)
         # Create the threads for the analysis
         for sample in self.metadata:
             if sample.general.bestassemblyfile != 'NA':
@@ -85,6 +85,14 @@ def mash(self):
             self.mashqueue.task_done()
 
     def parse(self):
+        import re
+        from csv import DictReader
+        from glob import glob
+        # Set the name of the refseq profile
+        refseqprofile = glob('{}{}/*.txt'.format(self.referencefilepath, self.analysistype))[0]
+        # Open the refseq profile file as a dictionary
+        profile = DictReader(open(refseqprofile), dialect='excel-tab')
+        printtime('Determining closest refseq genome', self.starttime)
         for sample in self.metadata:
             if sample.general.bestassemblyfile != 'NA':
                 # Open the results and extract the first line of data
@@ -95,8 +103,12 @@ def parse(self):
                     pvalue, sample[self.analysistype].nummatches = data
                 # The database is formatted such that the reference file name is preceded by '-.-'
                 # e.g. refseq-NZ-1005511-PRJNA224116-SAMN00794588-GCF_000303935.1-.-Escherichia_coli_PA45.fna
-                sample[self.analysistype].closestrefseq = \
-                    referenceid.split('-.-')[1].split('.fna')[0]
+                try:
+                    sample[self.analysistype].closestrefseq = \
+                        re.search('(?:GCF_.{11}-.-)(.+)\.fna', referenceid).groups()[0]
+                except AttributeError:
+                    sample[self.analysistype].closestrefseq = \
+                        referenceid.split('-.-')[1].split('.fna')[0]
                 sample[self.analysistype].closestrefseqgenus = sample[self.analysistype].closestrefseq.split('_')[0]
             else:
                 # Populate the attribute with negative results
@@ -105,6 +117,7 @@ def parse(self):
         self.reporter()
 
     def reporter(self):
+        make_path(self.reportpath)
         header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
         data = ''
         for sample in self.metadata:
diff --git a/OLCspades/spadesRun.py b/OLCspades/spadesRun.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
+from glob import glob
+
 from accessoryFunctions import *
-import os
+
 __author__ = 'adamkoziol'
 
 
@@ -145,6 +147,9 @@ def filter(self):
                 bestassemblyfile = '{}/{}.fasta'.format(sample.general.bestassembliespath, sample.name)
                 # Add the name and path of the best assembly file to the metadata
                 sample.general.bestassemblyfile = bestassemblyfile
+                # Get the trimmed, corrected fastq files into the object
+                sample.general.trimmedcorrectedfastqfiles = sorted(
+                    glob('{}/corrected/*_trimmed*'.format(sample.general.spadesoutput)))
                 # Copy the filtered file to the BestAssemblies folder
                 if not os.path.isfile(bestassemblyfile):
                     shutil.copyfile(filteredfile, bestassemblyfile)