Skip to content

Commit 5137bc6

Browse files
committed
Added mash functionality to rapidly determine closest refseq genome
1 parent f20f140 commit 5137bc6

File tree

4 files changed

+77
-67
lines changed

4 files changed

+77
-67
lines changed

OLCspades/accessoryFunctions.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,15 +195,22 @@ def __init__(self, x=None):
195195
super(GenObject, self).__setattr__('datastore', start)
196196

197197
def __getattr__(self, key):
198-
return self.datastore[key]
198+
if self.datastore[key] or self.datastore[key] == 0 or self.datastore[key] == False or all(self.datastore[key]):
199+
return self.datastore[key]
200+
else:
201+
self.datastore[key] = 'NA'
202+
return self.datastore[key]
199203

200204
def __setattr__(self, key, value):
205+
if key == 'trimmedcorrectedfastqfiles':
206+
print value
201207
if value:
202208
self.datastore[key] = value
203-
elif value is False:
204-
self.datastore[key] = value
209+
elif type(value) != int:
210+
if value is False or all(value):
211+
self.datastore[key] = value
205212
else:
206-
if value == 0:
213+
if value >= 0:
207214
self.datastore[key] = 0
208215
else:
209216
self.datastore[key] = "NA"

OLCspades/mMLST.py

Lines changed: 57 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -602,9 +602,9 @@ def sequencetyper(self):
602602
sample[self.analysistype].mismatchestosequencetype = 'NA'
603603
dotter()
604604
else:
605-
sample[self.analysistype].sequencetype = 'NA'
606605
sample[self.analysistype].matchestosequencetype = 'NA'
607606
sample[self.analysistype].mismatchestosequencetype = 'NA'
607+
sample[self.analysistype].sequencetype = 'NA'
608608

609609
def reprofiler(self, header, genome, sample):
610610
# reprofiler(numGenes, profileFile, geneList, genome)
@@ -620,54 +620,55 @@ def reprofiler(self, header, genome, sample):
620620
# Find the last profile entry in the dictionary of profiles
621621
# Opens uses the command line tool 'tail' to look at the last line of the file (-1). This last line
622622
# is split on tabs, and only the first entry (the sequence type number) is captured
623-
if os.path.isfile(sample[self.analysistype].supplementalprofile):
624-
try:
625-
lastentry = int(subprocess.check_output(['tail', '-1', sample[self.analysistype].supplementalprofile])
626-
.split("\t")[0]) + 1
627-
except ValueError:
623+
if sample[self.analysistype].supplementalprofile != 'NA':
624+
if os.path.isfile(sample[self.analysistype].supplementalprofile):
625+
try:
626+
lastentry = int(
627+
subprocess.check_output(['tail', '-1', sample[self.analysistype].supplementalprofile])
628+
.split("\t")[0]) + 1
629+
except ValueError:
630+
lastentry = 1000000
631+
else:
632+
open(sample[self.analysistype].supplementalprofile, 'wb').close()
628633
lastentry = 1000000
629-
else:
630-
open(sample[self.analysistype].supplementalprofile, 'wb').close()
631-
lastentry = 1000000
632-
# As there can be multiple profiles in MLSTSeqType, this loop only needs to be performed once.
633-
seqcount = 0
634-
# Go through the sequence types
635-
try:
636-
sequencetype = self.mlstseqtype[genome].keys()[0]
637-
except IndexError:
638-
sequencetype = ''
639-
seqcount = 1
640-
# Only do this once
641-
if seqcount == 0:
642-
# Set the :newprofile string to start with the new profile name (e.g. 1000000_CFIA)
643-
newprofile = str(lastentry)
644-
# The number of matches to the reference profile
645-
nummatches = self.mlstseqtype[genome][sequencetype].keys()[0]
646-
for sample in self.metadata:
647-
if sample.name == genome:
648-
# The genes in geneList - should be in the correct order
649-
for gene in sorted(sample[self.analysistype].allelenames):
650-
# The allele for each gene in the query genome
651-
allele = self.mlstseqtype[genome][sequencetype][nummatches][gene].keys()[0]
652-
# Append the allele to newprofile
653-
newprofile += '\t{}'.format(allele)
654-
# Add the MLST results for the query genome as well as the new profile data
655-
# to resultProfile
656-
self.resultprofile[genome]['{}(new)'.format(str(lastentry))][header][gene][allele] = \
657-
self.mlstseqtype[genome][sequencetype][nummatches][gene][allele].values()[0]
658-
seqcount += 1
659-
sample[self.analysistype].mismatchestosequencetype = 'NA'
660-
# sample[self.analysistype].sequencetype = '{}_CFIA'.format(str(lastentry))
661-
sample[self.analysistype].matchestosequencetype = header
662-
# Only perform the next loop if :newprofile exists
663-
if newprofile:
664-
# Open the profile file to append
665-
print sample[self.analysistype].supplementalprofile
666-
with open(sample[self.analysistype].supplementalprofile, 'ab') as appendfile:
667-
# Append the new profile to the end of the profile file
668-
appendfile.write('{}\n'.format(newprofile))
669-
# Re-run profiler with the updated files
670-
self.profiler()
634+
# As there can be multiple profiles in MLSTSeqType, this loop only needs to be performed once.
635+
seqcount = 0
636+
# Go through the sequence types
637+
try:
638+
sequencetype = self.mlstseqtype[genome].keys()[0]
639+
except IndexError:
640+
sequencetype = ''
641+
seqcount = 1
642+
# Only do this once
643+
if seqcount == 0:
644+
# Set the :newprofile string to start with the new profile name (e.g. 1000000_CFIA)
645+
newprofile = str(lastentry)
646+
# The number of matches to the reference profile
647+
nummatches = self.mlstseqtype[genome][sequencetype].keys()[0]
648+
for sample in self.metadata:
649+
if sample.name == genome:
650+
# The genes in geneList - should be in the correct order
651+
for gene in sorted(sample[self.analysistype].allelenames):
652+
# The allele for each gene in the query genome
653+
allele = self.mlstseqtype[genome][sequencetype][nummatches][gene].keys()[0]
654+
# Append the allele to newprofile
655+
newprofile += '\t{}'.format(allele)
656+
# Add the MLST results for the query genome as well as the new profile data
657+
# to resultProfile
658+
self.resultprofile[genome]['{}(new)'.format(str(lastentry))][header][gene][allele] = \
659+
self.mlstseqtype[genome][sequencetype][nummatches][gene][allele].values()[0]
660+
seqcount += 1
661+
sample[self.analysistype].mismatchestosequencetype = 'NA'
662+
# sample[self.analysistype].sequencetype = '{}_CFIA'.format(str(lastentry))
663+
sample[self.analysistype].matchestosequencetype = header
664+
# Only perform the next loop if :newprofile exists
665+
if newprofile:
666+
# Open the profile file to append
667+
with open(sample[self.analysistype].supplementalprofile, 'ab') as appendfile:
668+
# Append the new profile to the end of the profile file
669+
appendfile.write('{}\n'.format(newprofile))
670+
# Re-run profiler with the updated files
671+
self.profiler()
671672

672673
def reporter(self):
673674
""" Parse the results into a report"""
@@ -1094,13 +1095,13 @@ def organismchooser(self):
10941095
if not self.allelepath:
10951096
# If the name of the organism to analyse was provided
10961097
assert self.organism, 'Need to provide either a path to the alleles or an organism name'
1098+
# If the -g flag was included, download the appropriate MLST scheme for the organism
1099+
if self.getmlst and self.organism:
1100+
self.getmlsthelper()
10971101
self.allelepath = '{}{}'.format(self.path, self.organism)
10981102
assert os.path.isdir(self.allelepath), 'Cannot find {}. Please ensure that the folder exists, or ' \
10991103
'use the -g option to download the {} MLST scheme' \
11001104
.format(self.allelepath, self.organism)
1101-
# If the -g flag was included, download the appropriate MLST scheme for the organism
1102-
if self.getmlst and self.organism:
1103-
self.getmlsthelper()
11041105
# Tries to get the organism name for the folder containing the alleles
11051106
self.organism = self.organism if self.organism else os.path.split(self.allelepath)[-1]
11061107
if self.cleardatabases:
@@ -1134,8 +1135,6 @@ def organismchooser(self):
11341135
self.combinedalleles = glob('{}/*.fasta'.format(self.allelepath))
11351136
# Get the .txt profile file name and path into a variable
11361137
self.profile = glob('{}/*.txt'.format(self.allelepath))
1137-
# Add the appropriate variables to the metadata object for each sample
1138-
self.scheme = self.scheme if self.scheme else self.analysistype
11391138
for sample in self.samples:
11401139
sample[self.analysistype].alleles = [os.path.split(x)[1].split('.')[0] for x in self.alleles]
11411140
sample[self.analysistype].allelenames = [os.path.split(x)[1].split('.')[0] for x in self.alleles]
@@ -1155,7 +1154,8 @@ def getmlsthelper(self):
11551154
# As there are multiple profiles for certain organisms, this dictionary has the schemes I use as values
11561155
organismdictionary = {'Escherichia': 'Escherichia coli#1',
11571156
'Vibrio': 'Vibrio parahaemolyticus',
1158-
'Campylobacter': 'Campylobacter jejuni'}
1157+
'Campylobacter': 'Campylobacter jejuni',
1158+
'Listeria': 'Listeria monocytogenes'}
11591159
# rMLST alleles cannot be fetched in the same way
11601160
if self.scheme.lower() != 'rmlst':
11611161
# Allow for a genus not in the dictionary being specified
@@ -1267,7 +1267,7 @@ def __init__(self):
12671267
assert os.path.isdir(self.referenceprofilepath), 'Cannot find {}. Please double check that you ' \
12681268
'provided the proper path to the reference profile ' \
12691269
'folder'.format(self.referenceprofilepath)
1270-
self.scheme = args.scheme
1270+
self.scheme = args.scheme if args.scheme else args.type
12711271
self.organism = args.organism
12721272
self.updateprofile = args.updateprofilefalse
12731273
self.updateallele = args.updateallelefalse
@@ -1322,8 +1322,8 @@ class PipelineInit(object):
13221322
def strainer(self):
13231323
from accessoryFunctions import GenObject
13241324
for sample in self.runmetadata.samples:
1325+
setattr(sample, self.analysistype, GenObject())
13251326
if sample.general.bestassemblyfile != 'NA':
1326-
setattr(sample, self.analysistype, GenObject())
13271327
if self.analysistype.lower() == 'rmlst':
13281328
# Run the allele updater method
13291329
updatecall, allelefolder = getrmlsthelper(self.referencefilepath, self.updatermlst, self.start)
@@ -1351,14 +1351,14 @@ def strainer(self):
13511351
sample[self.analysistype].combinedalleles = self.combinedalleles
13521352
sample[self.analysistype].supplementalprofile = self.supplementalprofile
13531353
else:
1354-
setattr(sample, self.analysistype, GenObject())
13551354
# Set the metadata file appropriately
13561355
sample[self.analysistype].alleles = 'NA'
13571356
sample[self.analysistype].allelenames = 'NA'
13581357
sample[self.analysistype].profile = 'NA'
13591358
sample[self.analysistype].analysistype = 'NA'
13601359
sample[self.analysistype].reportdir = 'NA'
13611360
sample[self.analysistype].combinedalleles = 'NA'
1361+
sample[self.analysistype].supplementalprofile = 'NA'
13621362

13631363
def __init__(self, inputobject, analysistype):
13641364
self.runmetadata = inputobject.runmetadata

OLCspades/mash.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def sketch(self):
5050
while True:
5151
sample = self.sketchqueue.get()
5252
if not os.path.isfile(sample[self.analysistype].sketchfile):
53+
5354
call(sample.commands.sketchcall, shell=True, stdout=self.fnull, stderr=self.fnull)
5455
self.sketchqueue.task_done()
5556

@@ -90,8 +91,8 @@ def parse(self):
9091
mashdata = open(sample[self.analysistype].mashresults).readline().rstrip()
9192
# Split on tabs
9293
data = mashdata.split('\t')
93-
referenceid, queryid, sample[self.analysistype].mashdistance, sample[self.analysistype].pvalue, \
94-
sample[self.analysistype].nummatches = data
94+
referenceid, queryid, sample[self.analysistype].mashdistance, sample[self.analysistype]. \
95+
pvalue, sample[self.analysistype].nummatches = data
9596
# The database is formatted such that the reference file name is preceded by '-.-'
9697
# e.g. refseq-NZ-1005511-PRJNA224116-SAMN00794588-GCF_000303935.1-.-Escherichia_coli_PA45.fna
9798
sample[self.analysistype].closestrefseq = \
@@ -104,7 +105,7 @@ def parse(self):
104105
self.reporter()
105106

106107
def reporter(self):
107-
header = 'Strain,ReferenceGenus,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
108+
header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
108109
data = ''
109110
for sample in self.metadata:
110111
if sample.general.bestassemblyfile != 'NA':

OLCspades/prodigal.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#!/usr/bin/env python
2-
from accessoryFunctions import *
32
from threading import Thread, Lock
3+
4+
from accessoryFunctions import *
5+
46
__author__ = 'adamkoziol'
57

68
threadlock = Lock()
@@ -17,9 +19,9 @@ def predictthreads(self):
1719
threads.setDaemon(True)
1820
threads.start()
1921
for sample in self.metadata:
22+
# Create the .prodigal attribute
23+
sample.prodigal = GenObject()
2024
if sample.general.bestassemblyfile != 'NA':
21-
# Create the .prodigal attribute
22-
sample.prodigal = GenObject()
2325
self.predictqueue.put(sample)
2426
self.predictqueue.join()
2527

0 commit comments

Comments
 (0)