Skip to content

Commit b96d9ba

Browse files
committed
Improved mash reference genome parsing
1 parent 5137bc6 commit b96d9ba

File tree

4 files changed

+33
-18
lines changed

4 files changed

+33
-18
lines changed

OLCspades/accessoryFunctions.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,6 @@ def __getattr__(self, key):
202202
return self.datastore[key]
203203

204204
def __setattr__(self, key, value):
205-
if key == 'trimmedcorrectedfastqfiles':
206-
print value
207205
if value:
208206
self.datastore[key] = value
209207
elif type(value) != int:

OLCspades/mMLST.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -452,10 +452,10 @@ def alleleupdater(self, sample, gene, targetallele):
452452
return gene, allelenumber, 100.0, hsp.score
453453

454454
def sequencetyper(self):
455+
"""Determines the sequence type of each strain based on comparisons to sequence type profiles"""
455456
for sample in self.metadata:
456457
if sample.general.bestassemblyfile != 'NA':
457-
if type(sample[self.analysistype].allelenames) == list:
458-
"""Determines the sequence type of each strain based on comparisons to sequence type profiles"""
458+
if len(sample[self.analysistype].allelenames) > 1:
459459
# Initialise variables
460460
header = 0
461461
# Iterate through the genomes
@@ -465,7 +465,6 @@ def sequencetyper(self):
465465
self.bestmatch[genome] = defaultdict(int)
466466
if sample[self.analysistype].profile != 'NA':
467467
# Create the profiledata variable to avoid writing self.profiledata[self.analysistype]
468-
# profiledata = self.profiledata[self.analysistype]
469468
profiledata = sample[self.analysistype].profiledata
470469
# For each gene in plusdict[genome]
471470
for gene in sample[self.analysistype].allelenames:
@@ -583,7 +582,7 @@ def sequencetyper(self):
583582
mismatches.append(
584583
({gene: ('{} ({})'.format(self.bestdict[sample.name][gene]
585584
.keys()[0], sortedrefallele))}))
586-
if not self.updateprofile:
585+
if not self.updateprofile or self.analysistype == 'mlst':
587586
sample[self.analysistype].mismatchestosequencetype = mismatches
588587
sample[self.analysistype].sequencetype = sequencetype
589588
sample[self.analysistype].matchestosequencetype = matches
@@ -607,7 +606,6 @@ def sequencetyper(self):
607606
sample[self.analysistype].sequencetype = 'NA'
608607

609608
def reprofiler(self, header, genome, sample):
610-
# reprofiler(numGenes, profileFile, geneList, genome)
611609
"""
612610
Creates and appends new profiles as required
613611
:param header:
@@ -1330,26 +1328,27 @@ def strainer(self):
13301328
# updatecall, allelefolder = '', '{}rMLST/holding'.format(self.referencefilepath)
13311329
self.alleles = glob('{}/*.tfa'.format(allelefolder))
13321330
# self.alleles = glob('{}/*.fas'.format(allelefolder))
1333-
self.profile = glob('{}/*.txt'.format(allelefolder))
1331+
profile = glob('{}/*.txt'.format(allelefolder))
13341332
self.supplementalprofile = '{}rMLST/OLC_rMLST_profiles.txt'.format(self.referencefilepath)
13351333
self.combinedalleles = glob('{}/*.fasta'.format(allelefolder))
13361334
# Set the metadata file appropriately
13371335
sample[self.analysistype].alleledir = allelefolder
13381336
sample[self.analysistype].updatecall = updatecall
13391337
else:
13401338
self.alleles = glob('{}MLST/{}/*.tfa'.format(self.referencefilepath, sample.general.referencegenus))
1341-
self.profile = glob('{}MLST/{}/*.txt'.format(self.referencefilepath, sample.general.referencegenus))
1339+
profile = glob('{}MLST/{}/*.txt'.format(self.referencefilepath, sample.general.referencegenus))
13421340
self.combinedalleles = glob('{}MLST/{}/*.fasta'.format(self.referencefilepath,
13431341
sample.general.referencegenus))
13441342
sample[self.analysistype].alleledir = '{}MLST/{}/'.format(self.referencefilepath,
13451343
sample.general.referencegenus)
13461344
sample[self.analysistype].alleles = self.alleles
13471345
sample[self.analysistype].allelenames = [os.path.split(x)[1].split('.')[0] for x in self.alleles]
1348-
sample[self.analysistype].profile = self.profile
1346+
sample[self.analysistype].profile = profile if profile else 'NA'
13491347
sample[self.analysistype].analysistype = self.analysistype
13501348
sample[self.analysistype].reportdir = '{}/{}/'.format(sample.general.outputdirectory, self.analysistype)
13511349
sample[self.analysistype].combinedalleles = self.combinedalleles
1352-
sample[self.analysistype].supplementalprofile = self.supplementalprofile
1350+
sample[self.analysistype].supplementalprofile = self.supplementalprofile if self.supplementalprofile \
1351+
else 'NA'
13531352
else:
13541353
# Set the metadata file appropriately
13551354
sample[self.analysistype].alleles = 'NA'

OLCspades/mash.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
class Mash(object):
1111
def sketching(self):
12-
printtime('Indexing assemblies', self.starttime)
12+
printtime('Indexing assemblies for mash analysis', self.starttime)
1313
# Create the threads for the analysis
1414
for sample in self.metadata:
1515
if sample.general.bestassemblyfile != 'NA':
@@ -18,9 +18,9 @@ def sketching(self):
1818
threads.start()
1919
# Populate threads for each gene, genome combination
2020
for sample in self.metadata:
21+
# Create the analysis type-specific GenObject
22+
setattr(sample, self.analysistype, GenObject())
2123
if sample.general.bestassemblyfile != 'NA':
22-
# Create the analysis type-specific GenObject
23-
setattr(sample, self.analysistype, GenObject())
2424
# Set attributes
2525
sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
2626
sample[self.analysistype].targetpath = os.path.join(self.referencefilepath, self.analysistype)
@@ -55,7 +55,7 @@ def sketch(self):
5555
self.sketchqueue.task_done()
5656

5757
def mashing(self):
58-
printtime('Determining closest refseq genome', self.starttime)
58+
printtime('Performing mash analyses', self.starttime)
5959
# Create the threads for the analysis
6060
for sample in self.metadata:
6161
if sample.general.bestassemblyfile != 'NA':
@@ -85,6 +85,14 @@ def mash(self):
8585
self.mashqueue.task_done()
8686

8787
def parse(self):
88+
import re
89+
from csv import DictReader
90+
from glob import glob
91+
# Set the name of the refseq profile
92+
refseqprofile = glob('{}{}/*.txt'.format(self.referencefilepath, self.analysistype))[0]
93+
# Open the refseq profile file as a dictionary
94+
profile = DictReader(open(refseqprofile), dialect='excel-tab')
95+
printtime('Determining closest refseq genome', self.starttime)
8896
for sample in self.metadata:
8997
if sample.general.bestassemblyfile != 'NA':
9098
# Open the results and extract the first line of data
@@ -95,8 +103,12 @@ def parse(self):
95103
pvalue, sample[self.analysistype].nummatches = data
96104
# The database is formatted such that the reference file name is preceded by '-.-'
97105
# e.g. refseq-NZ-1005511-PRJNA224116-SAMN00794588-GCF_000303935.1-.-Escherichia_coli_PA45.fna
98-
sample[self.analysistype].closestrefseq = \
99-
referenceid.split('-.-')[1].split('.fna')[0]
106+
try:
107+
sample[self.analysistype].closestrefseq = \
108+
re.search('(?:GCF_.{11}-.-)(.+)\.fna', referenceid).groups()[0]
109+
except AttributeError:
110+
sample[self.analysistype].closestrefseq = \
111+
referenceid.split('-.-')[1].split('.fna')[0]
100112
sample[self.analysistype].closestrefseqgenus = sample[self.analysistype].closestrefseq.split('_')[0]
101113
else:
102114
# Populate the attribute with negative results
@@ -105,6 +117,7 @@ def parse(self):
105117
self.reporter()
106118

107119
def reporter(self):
120+
make_path(self.reportpath)
108121
header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
109122
data = ''
110123
for sample in self.metadata:

OLCspades/spadesRun.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#!/usr/bin/env python
2+
from glob import glob
3+
24
from accessoryFunctions import *
3-
import os
5+
46
__author__ = 'adamkoziol'
57

68

@@ -145,6 +147,9 @@ def filter(self):
145147
bestassemblyfile = '{}/{}.fasta'.format(sample.general.bestassembliespath, sample.name)
146148
# Add the name and path of the best assembly file to the metadata
147149
sample.general.bestassemblyfile = bestassemblyfile
150+
# Get the trimmed, corrected fastq files into the object
151+
sample.general.trimmedcorrectedfastqfiles = sorted(
152+
glob('{}/corrected/*_trimmed*'.format(sample.general.spadesoutput)))
148153
# Copy the filtered file to the BestAssemblies folder
149154
if not os.path.isfile(bestassemblyfile):
150155
shutil.copyfile(filteredfile, bestassemblyfile)

0 commit comments

Comments
 (0)