|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +####################################################################### |
| 4 | +# Copyright (C) 2020 Vinh Tran |
| 5 | +# |
| 6 | +# Calculate FAS cutoff for each core ortholog group of the core set |
| 7 | +# |
| 8 | +# This script is distributed in the hope that it will be useful, |
| 9 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | +# GNU General Public License <http://www.gnu.org/licenses/> for |
| 12 | +# more details |
| 13 | +# |
| 14 | + |
| 15 | +# |
| 16 | +####################################################################### |
| 17 | + |
| 18 | +import sys |
| 19 | +import os |
| 20 | +import argparse |
| 21 | +from pathlib import Path |
| 22 | +from Bio import SeqIO |
| 23 | +import subprocess |
| 24 | +import multiprocessing as mp |
| 25 | +import shutil |
| 26 | +from tqdm import tqdm |
| 27 | +import time |
| 28 | +import statistics |
| 29 | +import collections |
| 30 | + |
def checkFileExist(file, msg):
    """Abort the program with an error message when *file* does not exist."""
    resolved = os.path.abspath(file)
    if not os.path.exists(resolved):
        sys.exit('%s not found! %s' % (file, msg))
| 34 | + |
def readFile(file):
    """Return the lines of *file* (trailing newlines kept), or [] if it is absent.

    Fix: the original returned the string 'NA' for a missing file; every
    caller iterates the result (yielding the characters 'N', 'A') or takes
    len() of it (getting 2), silently corrupting reports and counts. An
    empty list keeps all caller loops and length checks correct.
    """
    if not os.path.exists(file):
        return []
    # 'with' closes the handle automatically; the explicit f.close() was redundant
    with open(file, 'r') as f:
        return f.readlines()
| 43 | + |
def addToDict(dict, groupID, seqID, type):
    """Record (groupID, type, seqID) as a tab-joined row in *dict*.

    On a repeated groupID, the stored first row is rewritten with a
    'duplicated (<type>)' label and the new row is appended on a new line.
    Returns the (mutated) dict.
    """
    if groupID in dict:
        first = dict[groupID].split('\t')
        relabeled = '%s\tduplicated (%s)\t%s' % (first[0], first[1], first[2])
        newRow = '%s\tduplicated (%s)\t%s' % (groupID, type, seqID)
        dict[groupID] = '\n'.join((relabeled, newRow))
    else:
        dict[groupID] = '\t'.join((groupID, type, seqID))
    return dict
| 52 | + |
def mode1(ppFile, coreDir, coreSet, queryID):
    """Assess query orthologs against the group mean of all-vs-all FAS scores (1.cutoff)."""
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        cols = line.split('\t')
        groupID = cols[0]
        if not queryID in cols[2]:
            continue
        meanFas = float(cols[3].strip())
        scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/1.cutoff' % (coreDir, coreSet, groupID)
        if not os.path.exists(scoreFile):
            noCutoff.append(groupID)
            continue
        meanGroup = 0
        for l in readFile(scoreFile):
            parts = l.split('\t')
            if parts[0] == 'mean':
                meanGroup = float(parts[1].strip())
        verdict = 'similar' if meanFas >= meanGroup else 'dissimilar'
        assessment = addToDict(assessment, groupID, cols[2], verdict)
    return (assessment, noCutoff)
| 74 | + |
def mode2(ppFile, coreDir, coreSet, queryID, outDir):
    """Assess query orthologs against the mean FAS of each group's reference species (2.cutoff)."""
    noCutoff = []
    assessment = {}
    # map each group to the refspec recorded during the ortholog search
    groupRefspec = {}
    refspecFile = '%s/%s/%s/last_refspec.txt' % (outDir, coreSet, queryID)
    for g in readFile(refspecFile):
        refCols = g.split('\t')
        groupRefspec[refCols[0]] = refCols[1]
    # do assessment
    for line in readFile(ppFile):
        cols = line.split('\t')
        groupID = cols[0]
        if not queryID in cols[2]:
            continue
        meanFas = statistics.mean((float(cols[3]), float(cols[4].strip())))
        scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/2.cutoff' % (coreDir, coreSet, groupID)
        if not os.path.exists(scoreFile):
            noCutoff.append(groupID)
            continue
        meanRefspec = 0
        for l in readFile(scoreFile):
            parts = l.split('\t')
            if parts[0] == groupRefspec[groupID].strip():
                meanRefspec = float(parts[1].strip())
        verdict = 'similar' if meanFas >= meanRefspec else 'dissimilar'
        assessment = addToDict(assessment, groupID, cols[2], verdict)
    return (assessment, noCutoff)
| 101 | + |
def mode3(ppFile, coreDir, coreSet, queryID):
    """Assess query orthologs against the lower confidence limit of the group FAS scores (1.cutoff)."""
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        cols = line.split('\t')
        groupID = cols[0]
        if not queryID in cols[2]:
            continue
        meanFas = statistics.mean((float(cols[3]), float(cols[4].strip())))
        scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/1.cutoff' % (coreDir, coreSet, groupID)
        if not os.path.exists(scoreFile):
            noCutoff.append(groupID)
            continue
        LCL = 0
        UCL = 0
        for l in readFile(scoreFile):
            parts = l.split('\t')
            if parts[0] == 'LCL':
                LCL = float(parts[1].strip())
            if parts[0] == 'UCL':
                UCL = float(parts[1].strip())
        # NOTE(review): only the lower limit is compared; the UCL upper-bound
        # check was disabled upstream (UCL is parsed but intentionally unused)
        verdict = 'similar' if LCL <= meanFas else 'dissimilar'
        assessment = addToDict(assessment, groupID, cols[2], verdict)
    return (assessment, noCutoff)
| 126 | + |
def mode4(ppFile, coreDir, coreSet, queryID):
    """Classify query orthologs as complete/fragmented from the group length mean and stdev (1.cutoff)."""
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        cols = line.split('\t')
        groupID = cols[0]
        if not queryID in cols[2]:
            continue
        length = float(cols[3].strip())
        scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/1.cutoff' % (coreDir, coreSet, groupID)
        if not os.path.exists(scoreFile):
            noCutoff.append(groupID)
            continue
        meanLen = 0
        stdevLen = 0
        for l in readFile(scoreFile):
            parts = l.split('\t')
            if parts[0] == 'meanLen':
                meanLen = float(parts[1].strip())
            if parts[0] == 'stdevLen':
                stdevLen = float(parts[1].strip())
        if stdevLen == 0:
            # zero spread makes the z-score undefined; treat as no usable cutoff
            noCutoff.append(groupID)
        else:
            zScore = abs((length - meanLen) / stdevLen)
            verdict = 'complete' if zScore <= 1 else 'fragmented'
            assessment = addToDict(assessment, groupID, cols[2], verdict)
    return (assessment, noCutoff)
| 154 | + |
def mode5(ppFile, coreDir, coreSet, queryID):
    """Assess query orthologs against the mean consensus FAS score of the group (3.cutoff).

    Matches rows by NCBI taxonomy ID (column 2) rather than by sequence ID.
    """
    noCutoff = []
    assessment = {}
    for line in readFile(ppFile):
        cols = line.split('\t')
        groupID = cols[0]
        if cols[1] == 'ncbi' + str(queryID.split('@')[1]):
            meanFas = float(cols[3].strip())
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/3.cutoff' % (coreDir, coreSet, groupID)
            if not os.path.exists(scoreFile):
                noCutoff.append(groupID)
                continue
            meanGroup = 0
            for l in readFile(scoreFile):
                parts = l.split('\t')
                if parts[0] == 'meanCons':
                    meanGroup = float(parts[1].strip())
            verdict = 'similar' if meanFas >= meanGroup else 'dissimilar'
            assessment = addToDict(assessment, groupID, cols[2], verdict)
    return (assessment, noCutoff)
| 177 | + |
def mode6(ppFile, coreDir, coreSet, queryID, outDir):
    """Assess query orthologs against the refspec consensus FAS score (4.cutoff).

    Matches rows by NCBI taxonomy ID (column 2) rather than by sequence ID.
    """
    noCutoff = []
    assessment = {}
    # map each group to the refspec recorded during the ortholog search
    groupRefspec = {}
    refspecFile = '%s/%s/%s/last_refspec.txt' % (outDir, coreSet, queryID)
    for g in readFile(refspecFile):
        refCols = g.split('\t')
        groupRefspec[refCols[0]] = refCols[1]
    # do assessment
    for line in readFile(ppFile):
        cols = line.split('\t')
        groupID = cols[0]
        if cols[1] == 'ncbi' + str(queryID.split('@')[1]):
            meanFas = float(cols[3].strip())
            scoreFile = '%s/core_orthologs/%s/%s/fas_dir/cutoff_dir/4.cutoff' % (coreDir, coreSet, groupID)
            if not os.path.exists(scoreFile):
                noCutoff.append(groupID)
                continue
            meanRefspec = 0
            for l in readFile(scoreFile):
                parts = l.split('\t')
                if parts[0] == groupRefspec[groupID].strip():
                    meanRefspec = float(parts[1].strip())
            verdict = 'similar' if meanFas >= meanRefspec else 'dissimilar'
            assessment = addToDict(assessment, groupID, cols[2], verdict)
    return (assessment, noCutoff)
| 206 | + |
def doAssessment(ppDir, coreDir, coreSet, queryID, outDir, mode):
    """Dispatch the completeness assessment for one cutoff *mode* and write its report.

    Returns (noCutoff, stat): the groups lacking a cutoff file and the
    summary string produced by writeReport.
    """
    # mode -> (phyloprofile file name, assessment function bound to its arguments)
    dispatch = {
        1: ('mode1.phyloprofile', lambda f: mode1(f, coreDir, coreSet, queryID)),
        2: ('mode23.phyloprofile', lambda f: mode2(f, coreDir, coreSet, queryID, outDir)),
        3: ('mode23.phyloprofile', lambda f: mode3(f, coreDir, coreSet, queryID)),
        4: ('length.phyloprofile', lambda f: mode4(f, coreDir, coreSet, queryID)),
        5: ('mode4.phyloprofile', lambda f: mode5(f, coreDir, coreSet, queryID)),
        6: ('mode4.phyloprofile', lambda f: mode6(f, coreDir, coreSet, queryID, outDir)),
    }
    assessment = {}
    noCutoff = []
    if mode in dispatch:
        ppName, assess = dispatch[mode]
        (assessment, noCutoff) = assess('%s/%s' % (ppDir, ppName))
    # print full report
    stat = writeReport(assessment, outDir, coreDir, coreSet, queryID, mode)
    return (noCutoff, stat)
| 232 | + |
def writeReport(assessment, outDir, coreDir, coreSet, queryID, mode):
    """Write the full and summary reports for one mode and return the summary string.

    Fixes: full.txt and summary.txt are now handled with context managers
    (the original opened full.txt twice for re-reading without ever closing
    those handles), full.txt is read back once instead of twice, the local
    name no longer shadows the builtin `type`, and the mode-4 summary line
    is computed once instead of overwriting a previously built string.
    """
    missing = '%s/%s/%s/missing.txt' % (outDir, coreSet, queryID)
    ignored = '%s/%s/%s/ignored.txt' % (outDir, coreSet, queryID)
    modeDir = '%s/%s/%s/mode_%s' % (outDir, coreSet, queryID, mode)
    Path(modeDir).mkdir(parents=True, exist_ok=True)
    # full report: one row per assessed group plus missing/ignored entries
    fullPath = '%s/full.txt' % modeDir
    with open(fullPath, 'w') as fullFile:
        for group in assessment:
            fullFile.write('%s\n' % assessment[group])
        for m in readFile(missing):
            fullFile.write(m.strip() + '\tmissing\n')
        for i in readFile(ignored):
            fullFile.write(i.strip() + '\tignored\n')
    # summary report: per-category counts over the rows just written
    with open(fullPath, 'r') as f:
        fullLines = f.readlines()
    statusList = [x.split('\t')[1] for x in fullLines]
    groupIDs = [x.split('\t')[0] for x in fullLines]
    dup = [item for item, count in collections.Counter(groupIDs).items() if count > 1]
    coreGroups = os.listdir(coreDir + '/core_orthologs/' + coreSet)
    if mode == 4:
        # mode 4 classifies by length instead of FAS similarity
        header = 'genomeID\tcomplete\tfragmented\tduplicated\tmissing\tignored\ttotal'
        firstCount, secondCount = statusList.count('complete'), statusList.count('fragmented')
    else:
        header = 'genomeID\tsimilar\tdissimilar\tduplicated\tmissing\tignored\ttotal'
        firstCount, secondCount = statusList.count('similar'), statusList.count('dissimilar')
    stat = '%s\n%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (header, queryID, firstCount, secondCount, len(dup), len(readFile(missing)), len(readFile(ignored)), len(coreGroups))
    with open('%s/summary.txt' % modeDir, 'w') as summaryFile:
        summaryFile.write(stat)
    return stat
| 260 | + |
def assessCompteness(args):
    """Run the completeness assessment for the chosen mode(s) and merge per-mode summaries.

    Fixes: the report-printing logic that was duplicated in both branches is
    factored into one helper, the double `mode = args.mode` assignment and
    the unused local `force` are removed, and the merged summary file is
    written through a context manager.
    """
    coreDir = os.path.abspath(args.coreDir)
    coreSet = args.coreSet
    checkFileExist(coreDir + '/core_orthologs/' + coreSet, '')
    queryID = args.queryID
    outDir = os.path.abspath(args.outDir)
    if not 'fcatOutput' in outDir:
        outDir = outDir + '/fcatOutput'
    ppDir = '%s/%s/%s/phyloprofileOutput' % (outDir, coreSet, queryID)
    checkFileExist(os.path.abspath(ppDir), 'No phylogenetic profile folder found!')
    mode = args.mode

    def _assessAndPrint(m):
        # run one mode and print its warning/summary (shared by both branches below)
        (noCutoff, stat) = doAssessment(ppDir, coreDir, coreSet, queryID, outDir, m)
        if len(noCutoff) > 0:
            print('\033[92mWARNING: No cutoff for %s group(s):\033[0m\n%s\n' % (len(noCutoff), ','.join(noCutoff)))
        if stat:
            print(stat)

    # do assessment and print report
    if mode == 0:
        # NOTE(review): 'all modes' runs modes 1-4 only, as before; modes 5
        # and 6 must be requested explicitly -- confirm this is intended
        for m in range(1, 5):
            print('Mode %s:' % m)
            _assessAndPrint(m)
    else:
        _assessAndPrint(mode)

    # merge all report (if available)
    modes = ('mode_1', 'mode_2', 'mode_3', 'mode_4')
    with open('%s/%s/%s/all_summary.txt' % (outDir, coreSet, queryID), 'w') as mergedStat:
        mergedStat.write('mode\tgenomeID\tsimilar\tdissimilar\tduplicated\tmissing\tignored\ttotal\n')
        for m in modes:
            summaryPath = '%s/%s/%s/%s/summary.txt' % (outDir, coreSet, queryID, m)
            if os.path.exists(summaryPath):
                for line in readFile(summaryPath):
                    if not line.split('\t')[0] == 'genomeID':
                        mergedStat.write('%s\t%s\n' % (m, line.strip()))
| 300 | + |
def _build_parser(version):
    """Construct the command-line parser for assessCompleteness."""
    parser = argparse.ArgumentParser(description='You are running assessCompleteness version ' + str(version) + '.')
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    required.add_argument('-d', '--coreDir', help='Path to core set directory, where folder core_orthologs can be found', action='store', default='', required=True)
    required.add_argument('-c', '--coreSet', help='Name of core set, which is subfolder within coreDir/core_orthologs/ directory', action='store', default='', required=True)
    required.add_argument('-o', '--outDir', help='Path to output directory', action='store', default='')
    required.add_argument('--queryID', help='ID of taxon of interest (e.g. HUMAN@9606@3)', action='store', default='', type=str)
    optional.add_argument('-m', '--mode',
                          help='Score cutoff mode. (0) all modes, (1) mean of all-vs-all FAS scores, (2) mean FAS of refspec seed, (3) lower endpoint of CI of all-vs-all FAS scores, (4) mean and stdev of sequence length, (5) mean consensus, (6) reference consensus',
                          action='store', default=0, choices=[0,1,2,3,4,5,6], type=int)
    optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False)
    return parser

def main():
    """Parse arguments, run the completeness assessment, and report elapsed time."""
    args = _build_parser('0.0.1').parse_args()
    begin = time.time()
    assessCompteness(args)
    print('Finished in ' + '{:5.3f}s'.format(time.time() - begin))

if __name__ == '__main__':
    main()
0 commit comments