 # -*- coding: utf-8 -*-
 #######################################################################
-# Copyright (C) 2020 Vinh Tran
+# Copyright (C) 2022 Vinh Tran
 #
 # This script is used to prepare data for fdog.
-# For each given genome FASTA file, It will create a folder within genome_dir
+# For each given genome FASTA file, it will create a folder within searchTaxa_dir
 # with the naming scheme of fdog ([Species acronym]@[NCBI ID]@[Proteome version]
-# e.g HUMAN@9606@3), a annotation file in JSON format in weight_dir and
-# a blast DB in blast_dir folder (optional).
-# For a long header of original FASTA sequence, only the first word
-# will be taken as the ID of new fasta file, everything after the
-# first whitespace will be removed. If this first word is not unique,
-# an automatically increasing index will be added.
+# e.g. HUMAN@9606@3), an annotation file in JSON format in annotation_dir and
+# a BLAST DB in the coreTaxa_dir folder (optional).
 #
 # This script is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
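
For orientation, the on-disk layout created for one taxon looks roughly as follows
(editor's sketch derived from the paths used in the new code below; HUMAN@9606@3 is
the example name from the header comment, not a real run):

    searchTaxa_dir/HUMAN@9606@3/HUMAN@9606@3.fa    <- renamed proteome FASTA
    coreTaxa_dir/HUMAN@9606@3/                     <- BLAST DB, only with -c/--coreTaxa
    annotation_dir/                                <- JSON annotation, unless -a/--noAnno
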
 import sys
 import os
 import argparse
-from os import listdir
-from os.path import isfile, join
 from pathlib import Path
-import subprocess
+from Bio import SeqIO
 import multiprocessing as mp
+from tqdm import tqdm
 from ete3 import NCBITaxa
-import csv
-from io import StringIO
 import re
 import shutil
-from tqdm import tqdm
 from datetime import datetime
+import time
+from pkg_resources import get_distribution
+from collections import OrderedDict
 
-def checkFileExist(file):
-    if not os.path.exists(os.path.abspath(file)):
-        sys.exit('%s not found' % file)
+import fdog.libs.zzz as general_fn
+import fdog.libs.tree as tree_fn
+import fdog.libs.addtaxon as add_taxon_fn
 
-def getTaxName(taxId):
-    ncbi = NCBITaxa()
-    try:
-        ncbiName = ncbi.get_taxid_translator([taxId])[int(taxId)]
-        ncbiName = re.sub('[^a-zA-Z1-9\s]+', '', ncbiName)
-        taxName = ncbiName.split()
-        name = taxName[0][:3].upper()+taxName[1][:2].upper()
-    except:
-        name = "UNK" + taxId
-    return(name)
 
-def parseMapFile(mappingFile):
-    nameDict = {}
-    with open(mappingFile) as f:
+def parse_map_file(mapping_file, folIn):
+    """ Create spec name from mapping file
+    And also check if given input files in mapping file exist
+    """
+    name_dict = {}
+    with open(mapping_file) as f:
         for line in f:
             if not '#' in line:
                 tmp = line.split('\t')
-                fileName = tmp[0]
-                taxId = tmp[1].strip()
+                file_name = tmp[0]
+                file_in = '%s/%s' % (folIn, file_name)
+                general_fn.check_file_exist(file_in)
+                tax_id = tmp[1].strip()
                 try:
-                    taxName = tmp[2].strip()
+                    tax_name = tmp[2].strip()
                 except:
-                    taxName = getTaxName(taxId)
+                    tax_name = ''
                 try:
                     ver = tmp[3].strip()
                 except:
-                    ver = datetime.today().strftime('%y%m%d')
-                # print(taxName+"@"+str(taxId)+"@"+str(ver))
-                nameDict[fileName] = (taxName, str(taxId), str(ver))
-    return(nameDict)
+                    ver = datetime.today().strftime('%y%m%d')
+                spec_name = add_taxon_fn.generate_spec_name(tax_id, tax_name, ver)
+                name_dict[file_in] = spec_name
+    return(name_dict)
 
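
For illustration, a minimal mapping file consumed by parse_map_file could look like
this (tab-separated; file names and values are hypothetical):

    human.fa    9606    HUMAN   3
    yeast.fa    559292

The first line maps <input_folder>/human.fa to the spec name HUMAN@9606@3; in the
second line the taxon name and version are filled in automatically (the version falls
back to today's date in %y%m%d format, and the name is presumably derived from the
NCBI taxonomy ID by add_taxon_fn.generate_spec_name).
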
-def runAddTaxon(args):
-    (f, n, i, o, c, v, a, cpus, replace, delete) = args
-    cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f, n, i, o, v, cpus)
-    if c == True:
-        cmd = cmd + ' -c'
-    if a == True:
-        cmd = cmd + ' -a'
-    if replace == True:
-        cmd = cmd + ' --replace'
-    if delete == True:
-        cmd = cmd + ' --delete'
-    # print(cmd)
-    logFile = o + '/addTaxa2fDog.log'
-    cmd = cmd + ' >> ' + logFile
-    try:
-        subprocess.call([cmd], shell=True)
-    except:
-        sys.exit('Problem running\n%s' % (cmd))
 
 def main():
-    version = '0.0.9'
-    parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
+    version = get_distribution('fdog').version
+    parser = argparse.ArgumentParser(description='You are running fDOG version ' + str(version) + '.')
     required = parser.add_argument_group('required arguments')
     optional = parser.add_argument_group('optional arguments')
     required.add_argument('-i', '--input', help='Path to input folder', action='store', default='', required=True)
     required.add_argument('-m', '--mapping',
-                        help='Tab-delimited text file containing <fasta_filename>tab<taxonID>tab<taxonName>tab<genome_version>. The last 2 columns are optional.',
+                        help='Tab-delimited text file containing <fasta_file_name>tab<taxonID>tab<taxonName>tab<genome_version>. The last 2 columns are optional.',
                         action='store', default='', required=True)
     optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
-    optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
-    optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
+    optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in coreTaxa_dir folder)', action='store_true', default=False)
+    optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False)
     optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
     optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
     optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
-    optional.add_argument('-f', '--force', help='Force overwrite existing data', action='store_true', default=False)
+    optional.add_argument('--force', help='Force overwrite existing data', action='store_true', default=False)
 
-    ### get arguments
     args = parser.parse_args()
     folIn = args.input
+    folIn = os.path.abspath(folIn)
     mapping = args.mapping
-    checkFileExist(mapping)
+    general_fn.check_file_exist(mapping)
     outPath = args.outPath
     if outPath == '':
-        fdogPath = os.path.realpath(__file__).replace('/addTaxa.py','')
+        fdogPath = os.path.realpath(__file__).replace('/addTaxon.py','')
         pathconfigFile = fdogPath + '/bin/pathconfig.txt'
         if not os.path.exists(pathconfigFile):
             sys.exit('No pathconfig.txt found. Please run fdog.setup (https://github.com/BIONF/fDOG/wiki/Installation#setup-fdog).')
@@ -131,61 +103,63 @@ def main():
         cpus = mp.cpu_count()-2
     replace = args.replace
     delete = args.delete
+    add_taxon_fn.check_conflict_opts(replace, delete)
     force = args.force
 
+    start = time.time()
+    ### parse mapping file
+    name_dict = parse_map_file(mapping, folIn)
 
-    ### get existing genomes
-    Path(outPath + "/genome_dir").mkdir(parents=True, exist_ok=True)
-    Path(outPath + "/weight_dir").mkdir(parents=True, exist_ok=True)
-    genomeFiles = listdir(outPath + "/genome_dir")
-
-    ### generate taxon names from mapping file
-    nameDict = parseMapFile(mapping)
-
-    ### read all input fasta files and create addTaxon jobs
-    jobs = []
-    dupList = {}
-    faFiles = [f for f in listdir(folIn) if isfile(join(folIn, f))]
-    for f in faFiles:
-        # tmp = f.split('.')
-        if f in nameDict:
-            # check duplicated taxon name in existing data
-            taxName = '@'.join(nameDict[f])
-            flag = 1
-            if taxName in genomeFiles:
-                if force:
-                    shutil.rmtree(outPath + "/genome_dir/" + taxName)
-                    if not noAnno:
-                        shutil.rmtree(outPath + "/weight_dir/" + taxName)
-                else:
-                    flag = 0
-                    dupList[f] = taxName
+    ### initiate paths
+    Path(outPath + '/searchTaxa_dir').mkdir(parents=True, exist_ok=True)
 
-        if flag == 1:
-            fasta = folIn + '/' + f
-            name = nameDict[f][0]
-            taxid = nameDict[f][1]
-            verProt = nameDict[f][2]
-            jobs.append([
-                folIn + '/' + f, nameDict[f][0], nameDict[f][1],
-                outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete
-            ])
-
-    if len(dupList) > 0:
-        print("These taxa are probably already present in %s:" % (outPath + "/genome_dir"))
-        for f in dupList:
-            print('\t' + f + '\t' + dupList[f])
+    ### create file in searchTaxa_dir [and coreTaxa_dir]
+    genome_jobs = []
+    blast_jobs = []
+    for f in name_dict:
+        spec_name = name_dict[f]
+        ## remove old folder if force is set
         if force:
-            print('They will be deleted and re-compiled!')
-        else:
-            sys.exit("Please remove them from the mapping file or use different Name/ID/Version!")
-
-    print('Parsing...')
-    for job in tqdm(jobs):
-        # print('@'.join([job[1],job[2],job[5]]) + '\t' + job[0])
-        runAddTaxon(job)
-
-    print('Output can be found in %s' % outPath)
+            if os.path.exists(outPath + '/searchTaxa_dir/' + spec_name):
+                shutil.rmtree(outPath + '/searchTaxa_dir/' + spec_name)
+            if os.path.exists(outPath + '/coreTaxa_dir/' + spec_name):
+                shutil.rmtree(outPath + '/coreTaxa_dir/' + spec_name)
+        ## create jobs
+        genome_path = '%s/searchTaxa_dir/%s' % (outPath, spec_name)
+        Path(genome_path).mkdir(parents=True, exist_ok=True)
+        genome_jobs.append([f, genome_path, spec_name, force, replace, delete])
+        if coreTaxa:
+            genome_file = '%s/%s.fa' % (genome_path, spec_name)
+            blast_jobs.append([outPath, spec_name, genome_file, force, True])
+    pool = mp.Pool(cpus)
+
+    print('Parsing genome for %s species...' % len(genome_jobs))
+    genome_out = []
+    for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_genome, genome_jobs),
+                  total=len(genome_jobs)):
+        genome_out.append(_)
+    out_msg = 'Output for %s can be found in %s within searchTaxa_dir' % (spec_name, outPath)
+    if len(blast_jobs) > 0:
+        print('\nCreating Blast DB for %s species...' % len(blast_jobs))
+        blast_out = []
+        for _ in tqdm(pool.imap_unordered(add_taxon_fn.create_blastdb, blast_jobs),
+                      total=len(blast_jobs)):
+            blast_out.append(_)
+        out_msg = '%s, coreTaxa_dir' % out_msg
+
+    ### create annotation
+    if not noAnno:
+        Path(outPath + '/annotation_dir').mkdir(parents=True, exist_ok=True)
+        for f in name_dict:
+            genome_file = '%s/searchTaxa_dir/%s/%s.fa' % (outPath, name_dict[f], name_dict[f])
+            add_taxon_fn.create_annoFile(outPath, genome_file, cpus, force)
+        if os.path.exists('%s/annotation_dir/tmp' % outPath):
+            shutil.rmtree('%s/annotation_dir/tmp' % outPath)
+        out_msg = '%s, annotation_dir' % out_msg
+
+    end = time.time()
+    print('==> Adding %s taxa finished in %s' % (len(name_dict), '{:5.3f}s'.format(end - start)))
+    print('==> %s' % out_msg)
 
 if __name__ == '__main__':
     main()
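
A typical invocation of the reworked script would be (editor's sketch; paths are
placeholders, and the fdog.addTaxa entry point is assumed from the old description
string above):

    fdog.addTaxa -i /path/to/fasta_folder -m mapping.txt -o /path/to/fdog_data -c --cpus 4

With -c/--coreTaxa a BLAST database is additionally built in coreTaxa_dir; unless
-a/--noAnno is set, a JSON annotation is written to annotation_dir next to the
proteomes in searchTaxa_dir.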