
Commit a9cebf2

add source code
1 parent 7e487aa commit a9cebf2

File tree

5 files changed (+1321, -0 lines changed)


fcat/__init__.py

Whitespace-only changes.

fcat/assessCompleteness.py

+322
@@ -0,0 +1,322 @@
# -*- coding: utf-8 -*-

#######################################################################
# Copyright (C) 2020 Vinh Tran
#
# Calculate FAS cutoff for each core ortholog group of the core set
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License <http://www.gnu.org/licenses/> for
# more details
#
#
#######################################################################

import sys
import os
import argparse
from pathlib import Path
from Bio import SeqIO
import subprocess
import multiprocessing as mp
import shutil
from tqdm import tqdm
import time
import statistics
import collections

def checkFileExist(file, msg):
    if not os.path.exists(os.path.abspath(file)):
        sys.exit('%s not found! %s' % (file, msg))

def readFile(file):
    if os.path.exists(file):
        with open(file, 'r') as f:
            lines = f.readlines()
            f.close()
            return(lines)
    else:
        return('NA')

def addToDict(dict, groupID, seqID, type):
    if not groupID in dict:
        dict[groupID] = '%s\t%s\t%s' % (groupID, type, seqID)
    else:
        tmp = dict[groupID].split('\t')
        dict[groupID] = '%s\tduplicated (%s)\t%s' % (tmp[0], tmp[1], tmp[2])
        dict[groupID] = '%s\n%s\tduplicated (%s)\t%s' % (dict[groupID], groupID, type, seqID)
    return(dict)

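# Assessment modes implemented below (cf. the --mode help in main()):
#   mode1: mean FAS of the query vs. mean of all-vs-all FAS scores of the group (1.cutoff, 'mean')
#   mode2: mean FAS vs. cutoff of the group's reference species (2.cutoff, refspec entry)
#   mode3: mean FAS vs. lower confidence limit of the all-vs-all FAS scores (1.cutoff, 'LCL')
#   mode4: sequence length vs. mean and stdev of the group's sequence lengths (1.cutoff, 'meanLen'/'stdevLen')
#   mode5: FAS vs. mean consensus cutoff (3.cutoff, 'meanCons')
#   mode6: FAS vs. reference consensus cutoff (4.cutoff, refspec entry)
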
def mode1(ppFile, coreDir, coreSet, queryID):
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        groupID = line.split('\t')[0]
        if queryID in line.split('\t')[2]:
            # meanFas = statistics.mean((float(line.split('\t')[3]), float(line.split('\t')[4].strip())))
            meanFas = float(line.split('\t')[3].strip())
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/1.cutoff' % (coreDir, coreSet, groupID)
            if os.path.exists(scoreFile):
                meanGroup = 0
                for l in readFile(scoreFile):
                    if l.split('\t')[0] == 'mean':
                        meanGroup = float(l.split('\t')[1].strip())
                if meanFas >= meanGroup:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'similar')
                else:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'dissimilar')
            else:
                noCutoff.append(groupID)
    return(assessment, noCutoff)

def mode2(ppFile, coreDir, coreSet, queryID, outDir):
    noCutoff = []
    assessment = {}
    # get refspec for each group
    groupRefspec = {}
    refspecFile = '%s/%s/%s/last_refspec.txt' % (outDir, coreSet, queryID)
    for g in readFile(refspecFile):
        groupRefspec[g.split('\t')[0]] = g.split('\t')[1]
    # do assessment
    for line in readFile(ppFile):
        groupID = line.split('\t')[0]
        if queryID in line.split('\t')[2]:
            meanFas = statistics.mean((float(line.split('\t')[3]), float(line.split('\t')[4].strip())))
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/2.cutoff' % (coreDir, coreSet, groupID)
            if os.path.exists(scoreFile):
                meanRefspec = 0
                for l in readFile(scoreFile):
                    if l.split('\t')[0] == groupRefspec[groupID].strip():
                        meanRefspec = float(l.split('\t')[1].strip())
                if meanFas >= meanRefspec:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'similar')
                else:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'dissimilar')
            else:
                noCutoff.append(groupID)
    return(assessment, noCutoff)

def mode3(ppFile, coreDir, coreSet, queryID):
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        groupID = line.split('\t')[0]
        if queryID in line.split('\t')[2]:
            meanFas = statistics.mean((float(line.split('\t')[3]), float(line.split('\t')[4].strip())))
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/1.cutoff' % (coreDir, coreSet, groupID)
            if os.path.exists(scoreFile):
                LCL = 0
                UCL = 0
                for l in readFile(scoreFile):
                    if l.split('\t')[0] == 'LCL':
                        LCL = float(l.split('\t')[1].strip())
                    if l.split('\t')[0] == 'UCL':
                        UCL = float(l.split('\t')[1].strip())
                # if LCL <= meanFas <= UCL:
                if LCL <= meanFas:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'similar')
                else:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'dissimilar')
            else:
                noCutoff.append(groupID)
    return(assessment, noCutoff)

def mode4(ppFile, coreDir, coreSet, queryID):
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        groupID = line.split('\t')[0]
        if queryID in line.split('\t')[2]:
            length = float(line.split('\t')[3].strip())
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/1.cutoff' % (coreDir, coreSet, groupID)
            if os.path.exists(scoreFile):
                meanLen = 0
                stdevLen = 0
                for l in readFile(scoreFile):
                    if l.split('\t')[0] == 'meanLen':
                        meanLen = float(l.split('\t')[1].strip())
                    if l.split('\t')[0] == 'stdevLen':
                        stdevLen = float(l.split('\t')[1].strip())
                if not stdevLen == 0:
                    check = abs((length - meanLen) / stdevLen)
                    if check <= 1:
                        assessment = addToDict(assessment, groupID, line.split('\t')[2], 'complete')
                    else:
                        assessment = addToDict(assessment, groupID, line.split('\t')[2], 'fragmented')
                else:
                    noCutoff.append(groupID)
            else:
                noCutoff.append(groupID)
    return(assessment, noCutoff)

def mode5(ppFile, coreDir, coreSet, queryID):
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        groupID = line.split('\t')[0]
        # if queryID in line.split('\t')[2]:
        if line.split('\t')[1] == 'ncbi'+str(queryID.split('@')[1]):
            # meanFas = statistics.mean((float(line.split('\t')[3]), float(line.split('\t')[4].strip())))
            meanFas = float(line.split('\t')[3].strip())
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/3.cutoff' % (coreDir, coreSet, groupID)
            if os.path.exists(scoreFile):
                meanGroup = 0
                for l in readFile(scoreFile):
                    if l.split('\t')[0] == 'meanCons':
                        meanGroup = float(l.split('\t')[1].strip())
                if meanFas >= meanGroup:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'similar')
                else:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'dissimilar')
            else:
                noCutoff.append(groupID)
    return(assessment, noCutoff)

def mode6(ppFile, coreDir, coreSet, queryID, outDir):
    noCutoff = []
    assessment = {}
    # get refspec for each group
    groupRefspec = {}
    refspecFile = '%s/%s/%s/last_refspec.txt' % (outDir, coreSet, queryID)
    for g in readFile(refspecFile):
        groupRefspec[g.split('\t')[0]] = g.split('\t')[1]
    # do assessment
    for line in readFile(ppFile):
        groupID = line.split('\t')[0]
        # if queryID in line.split('\t')[2]:
        if line.split('\t')[1] == 'ncbi'+str(queryID.split('@')[1]):
            # meanFas = statistics.mean((float(line.split('\t')[3]), float(line.split('\t')[4].strip())))
            meanFas = float(line.split('\t')[3].strip())
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/4.cutoff' % (coreDir, coreSet, groupID)
            if os.path.exists(scoreFile):
                meanRefspec = 0
                for l in readFile(scoreFile):
                    if l.split('\t')[0] == groupRefspec[groupID].strip():
                        meanRefspec = float(l.split('\t')[1].strip())
                if meanFas >= meanRefspec:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'similar')
                else:
                    assessment = addToDict(assessment, groupID, line.split('\t')[2], 'dissimilar')
            else:
                noCutoff.append(groupID)
    return(assessment, noCutoff)

def doAssessment(ppDir, coreDir, coreSet, queryID, outDir, mode):
    assessment = {}
    noCutoff = []
    # assess completeness
    if mode == 1:
        ppFile = '%s/mode1.phyloprofile' % (ppDir)
        (assessment, noCutoff) = mode1(ppFile, coreDir, coreSet, queryID)
    elif mode == 2:
        ppFile = '%s/mode23.phyloprofile' % (ppDir)
        (assessment, noCutoff) = mode2(ppFile, coreDir, coreSet, queryID, outDir)
    elif mode == 3:
        ppFile = '%s/mode23.phyloprofile' % (ppDir)
        (assessment, noCutoff) = mode3(ppFile, coreDir, coreSet, queryID)
    elif mode == 4:
        ppFile = '%s/length.phyloprofile' % (ppDir)
        (assessment, noCutoff) = mode4(ppFile, coreDir, coreSet, queryID)
    elif mode == 5:
        ppFile = '%s/mode4.phyloprofile' % (ppDir)
        (assessment, noCutoff) = mode5(ppFile, coreDir, coreSet, queryID)
    elif mode == 6:
        ppFile = '%s/mode4.phyloprofile' % (ppDir)
        (assessment, noCutoff) = mode6(ppFile, coreDir, coreSet, queryID, outDir)
    # print full report
    stat = writeReport(assessment, outDir, coreDir, coreSet, queryID, mode)
    return(noCutoff, stat)

def writeReport(assessment, outDir, coreDir, coreSet, queryID, mode):
    missing = '%s/%s/%s/missing.txt' % (outDir, coreSet, queryID)
    ignored = '%s/%s/%s/ignored.txt' % (outDir, coreSet, queryID)
    Path('%s/%s/%s/mode_%s' % (outDir, coreSet, queryID, mode)).mkdir(parents=True, exist_ok=True)
    # write full report
    fullFile = open('%s/%s/%s/mode_%s/full.txt' % (outDir, coreSet, queryID, mode), 'w')
    for group in assessment:
        fullFile.write('%s\n' % assessment[group])
    for m in readFile(missing):
        fullFile.write(m.strip() + '\tmissing\n')
    for i in readFile(ignored):
        fullFile.write(i.strip() + '\tignored\n')
    fullFile.close()
    # write summary report
    summaryFile = open('%s/%s/%s/mode_%s/summary.txt' % (outDir, coreSet, queryID, mode), 'w')
    type = [x.split('\t')[1] for x in open('%s/%s/%s/mode_%s/full.txt' % (outDir, coreSet, queryID, mode)).readlines()]
    groupID = [x.split('\t')[0] for x in open('%s/%s/%s/mode_%s/full.txt' % (outDir, coreSet, queryID, mode)).readlines()]
    dup = [item for item, count in collections.Counter(groupID).items() if count > 1]
    coreGroups = os.listdir(coreDir + '/core_orthologs/' + coreSet)
    header = 'genomeID\tsimilar\tdissimilar\tduplicated\tmissing\tignored\ttotal'
    stat = '%s\n%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (header, queryID, type.count('similar'), type.count('dissimilar'), len(dup), len(readFile(missing)), len(readFile(ignored)), len(coreGroups))
    if mode == 4:
        header = 'genomeID\tcomplete\tfragmented\tduplicated\tmissing\tignored\ttotal'
        stat = '%s\n%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (header, queryID, type.count('complete'), type.count('fragmented'), len(dup), len(readFile(missing)), len(readFile(ignored)), len(coreGroups))
    summaryFile.write(stat)
    summaryFile.close()
    return(stat)

def assessCompleteness(args):
    coreDir = os.path.abspath(args.coreDir)
    coreSet = args.coreSet
    checkFileExist(coreDir + '/core_orthologs/' + coreSet, '')
    mode = args.mode
    queryID = args.queryID
    outDir = os.path.abspath(args.outDir)
    if not 'fcatOutput' in outDir:
        outDir = outDir + '/fcatOutput'
    ppDir = '%s/%s/%s/phyloprofileOutput' % (outDir, coreSet, queryID)
    checkFileExist(os.path.abspath(ppDir), 'No phylogenetic profile folder found!')
    force = args.force
    # do assessment and print report
    if mode == 0:
        for m in range(1,5):
            print('Mode %s:' % m)
            (noCutoff, stat) = doAssessment(ppDir, coreDir, coreSet, queryID, outDir, m)
            if len(noCutoff) > 0:
                print('\033[92mWARNING: No cutoff for %s group(s):\033[0m\n%s\n' % (len(noCutoff), ','.join(noCutoff)))
            if stat:
                print(stat)
    else:
        (noCutoff, stat) = doAssessment(ppDir, coreDir, coreSet, queryID, outDir, mode)
        if len(noCutoff) > 0:
            print('\033[92mWARNING: No cutoff for %s group(s):\033[0m\n%s\n' % (len(noCutoff), ','.join(noCutoff)))
        if stat:
            print(stat)

    # merge all reports (if available)
    modes = ('mode_1', 'mode_2', 'mode_3', 'mode_4')
    mergedStat = open('%s/%s/%s/all_summary.txt' % (outDir, coreSet, queryID), 'w')
    mergedStat.write('mode\tgenomeID\tsimilar\tdissimilar\tduplicated\tmissing\tignored\ttotal\n')
    for m in modes:
        if os.path.exists('%s/%s/%s/%s/summary.txt' % (outDir, coreSet, queryID, m)):
            for line in readFile('%s/%s/%s/%s/summary.txt' % (outDir, coreSet, queryID, m)):
                if not line.split('\t')[0] == 'genomeID':
                    mergedStat.write('%s\t%s\n' % (m, line.strip()))
    mergedStat.close()

def main():
    version = '0.0.1'
    parser = argparse.ArgumentParser(description='You are running assessCompleteness version ' + str(version) + '.')
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    required.add_argument('-d', '--coreDir', help='Path to core set directory, where folder core_orthologs can be found', action='store', default='', required=True)
    required.add_argument('-c', '--coreSet', help='Name of core set, which is subfolder within coreDir/core_orthologs/ directory', action='store', default='', required=True)
    required.add_argument('-o', '--outDir', help='Path to output directory', action='store', default='')
    required.add_argument('--queryID', help='ID of taxon of interest (e.g. HUMAN@9606@3)', action='store', default='', type=str)
    optional.add_argument('-m', '--mode',
                          help='Score cutoff mode. (0) all modes, (1) mean of all-vs-all FAS scores, (2) mean FAS of refspec seed, (3) lower endpoint of CI of all-vs-all FAS scores, (4) mean and stdev of sequence length, (5) mean consensus, (6) reference consensus',
                          action='store', default=0, choices=[0,1,2,3,4,5,6], type=int)
    optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False)
    args = parser.parse_args()

    start = time.time()
    assessCompleteness(args)
    ende = time.time()
    print('Finished in ' + '{:5.3f}s'.format(ende-start))

if __name__ == '__main__':
    main()
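To make the length-based check in mode4 concrete: a query is counted as "complete" when its sequence length lies within one standard deviation of the core group's mean length, and as "fragmented" otherwise. The sketch below is a minimal, self-contained restatement of that rule, with made-up numbers standing in for the meanLen and stdevLen values that the script reads from a group's 1.cutoff file:

# Sketch of the mode-4 length rule (hypothetical values; the script itself reads
# meanLen and stdevLen from core_orthologs/<coreSet>/<group>/fas_dir/cutoff_dir/1.cutoff).
def length_status(length, meanLen, stdevLen):
    if stdevLen == 0:
        return 'no cutoff'                     # mode4 adds such groups to noCutoff
    z = abs((length - meanLen) / stdevLen)     # distance from the group mean, in stdev units
    return 'complete' if z <= 1 else 'fragmented'

print(length_status(480, meanLen=500, stdevLen=30))  # complete
print(length_status(310, meanLen=500, stdevLen=30))  # fragmented

This mirrors the check <= 1 test in mode4; groups whose cutoff file reports a standard deviation of zero end up in the noCutoff list instead.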
