Skip to content

Commit a746c8f

Browse files
committed
added check for invalid min-/maxDist; works with fasta36.3.8g
1 parent 48372cf commit a746c8f

File tree

6 files changed

+81
-5
lines changed

6 files changed

+81
-5
lines changed

fdog/data/conda_requirements.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
blast
22
hmmer
3-
fasta3
3+
fasta3=36.3.8i
44
clustalw
55
mafft
6-
muscle
6+
muscle=5.1

fdog/libs/alignment.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,10 @@ def calc_aln_score(fa1, fa2, aln_strategy = 'local', debugCore = False):
140140
Return dictionary {gene_id:aln_score}
141141
"""
142142
fdog_path = os.path.realpath(__file__).replace('/libs/alignment.py','')
143-
fasta36_options = '%s %s -s BP62 -m 9 -d 0 -z -1 -E 100' % (fa1, fa2)
143+
os.symlink(fa1, 'fasta36_1.fa')
144+
os.symlink(fa2, 'fasta36_2.fa')
145+
# fasta36_options = '%s %s -s BP62 -m 9 -d 0 -z -1 -E 100' % (fa1, fa2)
146+
fasta36_options = 'fasta36_1.fa fasta36_2.fa -s BP62 -m 9 -d 0 -z -1 -E 100'
144147
fdog_path = os.path.realpath(__file__).replace('/libs/alignment.py','')
145148
fasta36_bin = check_fasta36_executable(fdog_path)
146149
if aln_strategy == 'global':
@@ -173,4 +176,6 @@ def calc_aln_score(fa1, fa2, aln_strategy = 'local', debugCore = False):
173176
if re.search('\(\s+\d+\)', l):
174177
l = re.sub(r'\(\s+','(', l)
175178
aln_score[gene_id] = aln_score[gene_id] + int(l.split()[2])
179+
os.remove('fasta36_1.fa')
180+
os.remove('fasta36_2.fa')
176181
return(aln_score)

fdog/libs/preparation.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@
2020
from pathlib import Path
2121
from Bio import SeqIO
2222
from Bio.Blast.Applications import NcbiblastpCommandline
23+
from ete3 import NCBITaxa
2324

2425
import fdog.libs.zzz as general_fn
2526
import fdog.libs.fasta as fasta_fn
2627
import fdog.libs.blast as blast_fn
2728
import fdog.libs.output as output_fn
29+
import fdog.libs.tree as tree_fn
2830

2931

3032
##### FUNCTIONS FOR DATA/INPUT PREPARATION #####
@@ -117,6 +119,42 @@ def check_blast_version(corepath, refspec):
117119
'ERROR: Error running blast (probably conflict with BLAST DBs versions)\n%s'
118120
% (NcbiblastpCommandline(query = query, db = blast_db)))
119121

122+
def check_ranks_core_taxa(corepath, minDist, maxDist):
    """ Check if all core taxa have a valid minDist and maxDist rank

    Every sub-directory of corepath is treated as one core taxon. The second
    '@'-separated field of its name is passed to NCBITaxa.get_lineage(), and
    the ranks of that lineage are tested for minDist and maxDist.

    Return a 4-tuple:
    - list of taxon directory names whose lineage has no minDist rank
    - list of taxon directory names whose lineage has no maxDist rank
    - suggested rank name for --minDist
    - suggested rank name for --maxDist
    A suggested rank equals the requested one unless some taxon lacks it; then
    it is the highest "next available rank above the requested one" over all
    taxa lacking it.

    Raise ValueError if minDist or maxDist is not one of the known rank names,
    and IndexError if a sub-directory name contains no '@'.
    """
    rank_list = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom']
    # Look the requested ranks up once; they do not change inside the loop
    requested_minIndex = rank_list.index(minDist)
    requested_maxIndex = rank_list.index(maxDist)
    suggest_minIndex = requested_minIndex
    suggest_maxIndex = requested_maxIndex
    invalid_minDist = []
    invalid_maxDist = []
    ncbi = NCBITaxa()

    def has_rank(ranks, rank_name):
        # A rank counts as present when matching_elements finds at least one hit
        return len(general_fn.matching_elements(ranks, rank_name)) > 0

    def next_valid_index(ranks, requested_index):
        # First rank above the requested one that exists in this lineage,
        # or None when none of the higher ranks is present
        for index in range(requested_index + 1, len(rank_list)):
            if has_rank(ranks, rank_list[index]):
                return index
        return None

    for taxon in os.listdir(corepath):
        if not os.path.isdir(f'{corepath}/{taxon}'):
            continue
        tax_id = taxon.split('@')[1]
        ranks = ncbi.get_rank(ncbi.get_lineage(tax_id))
        if not has_rank(ranks, minDist):
            invalid_minDist.append(taxon)
            found = next_valid_index(ranks, requested_minIndex)
            # Keep the highest fallback so the suggestion is valid for all taxa
            if found is not None and found > suggest_minIndex:
                suggest_minIndex = found
        if not has_rank(ranks, maxDist):
            invalid_maxDist.append(taxon)
            found = next_valid_index(ranks, requested_maxIndex)
            if found is not None and found > suggest_maxIndex:
                suggest_maxIndex = found
    return(invalid_minDist, invalid_maxDist, rank_list[suggest_minIndex], rank_list[suggest_maxIndex])
157+
120158

121159
def get_seed_id_from_fa(core_fa, refspec):
122160
""" Get seed ID from core ortholog fasta file
@@ -147,11 +185,18 @@ def identify_seed_id(seqFile, refspec, corepath, debug, silentOff):
147185
# otherwise, perform blast search
148186
blast_xml = blast_fn.do_blastsearch(seqFile, refspec_db, evalBlast = 0.001)
149187
blast_out = blast_fn.parse_blast_xml(blast_xml)
188+
if len(blast_out['hits']) < 1:
189+
print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!')
190+
print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7')
191+
sys.exit()
150192
for hit in blast_out['hits']:
151193
if blast_out['hits'][hit]['align_len'] == blast_out['query_len']:
194+
print("BEST BLAST HIT")
152195
return(hit)
153196
elif abs(int(blast_out['hits'][hit]['align_len']) - int(blast_out['query_len'])) < 10:
154197
output_fn.print_stdout(silentOff, 'WARNING: Found seed sequence shorter/longer than input!')
155198
return(hit)
156199
else:
157-
sys.exit('ERROR: Cannot find seed sequence in genome of reference species for %s!' % blast_out['query'])
200+
print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!')
201+
print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7')
202+
sys.exit()

fdog/runMulti.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,12 +320,25 @@ def main():
320320

321321
begin = time.time()
322322
##### Check and group parameters
323+
print('Preparing & Checking...')
323324
(inFol, hmmpath, corepath, searchpath, annopath) = prepare_fn.check_input(
324325
[inFol, refspec, outpath, hmmpath,
325326
corepath, searchpath, annopath, pathFile])
326327
pathArgs = [outpath, hmmpath, corepath, searchpath, annopath]
327328
prepare_fn.check_blast_version(corepath, refspec)
328329

330+
(invalid_minDist, invalid_maxDist, suggested_minRank, suggested_maxRank) = prepare_fn.check_ranks_core_taxa(corepath, minDist, maxDist)
331+
if len(invalid_minDist) > 0 or len(invalid_maxDist) > 0:
332+
print(f'Invalid {minDist} (--minDist) for {len(invalid_minDist)} species:\n{invalid_minDist}')
333+
print(f'Invalid {maxDist} (--maxDist) for {len(invalid_maxDist)} species:\n{invalid_maxDist}')
334+
if not minDist == "genus" and not maxDist == "kingdom":
335+
print(f'Please consider setting --minDist and --maxDist with these valid ranks:\n--minDist {suggested_minRank} --maxDist {suggested_maxRank}')
336+
sys.exit()
337+
else:
338+
print(f'WARNING: --minDist and --maxDist will be automatically changed to {suggested_minRank} and {suggested_maxRank}')
339+
minDist = suggested_minRank
340+
maxDist = suggested_maxRank
341+
329342
if not fasOff:
330343
check_fas = fas_fn.check_fas_executable()
331344
if check_fas == 0:

fdog/runSingle.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ def main():
189189

190190
begin = time.time()
191191
##### Check and group parameters
192+
print('Preparing & Checking...')
192193
if seqFile == 'infile.fa':
193194
fdogPath = os.path.realpath(__file__).replace('/runSingle.py','')
194195
seqFile = '%s/data/infile.fa' % fdogPath
@@ -200,6 +201,18 @@ def main():
200201

201202
prepare_fn.check_blast_version(corepath, refspec)
202203

204+
(invalid_minDist, invalid_maxDist, suggested_minRank, suggested_maxRank) = prepare_fn.check_ranks_core_taxa(corepath, minDist, maxDist)
205+
if len(invalid_minDist) > 0 or len(invalid_maxDist) > 0:
206+
print(f'Invalid {minDist} (--minDist) for {len(invalid_minDist)} species:\n{invalid_minDist}')
207+
print(f'Invalid {maxDist} (--maxDist) for {len(invalid_maxDist)} species:\n{invalid_maxDist}')
208+
if not minDist == "genus" and not maxDist == "kingdom":
209+
print(f'Please consider setting --minDist and --maxDist with these valid ranks:\n--minDist {suggested_minRank} --maxDist {suggested_maxRank}')
210+
sys.exit()
211+
else:
212+
print(f'WARNING: --minDist and --maxDist will be automatically changed to {suggested_minRank} and {suggested_maxRank}')
213+
minDist = suggested_minRank
214+
maxDist = suggested_maxRank
215+
203216
if not fasOff:
204217
check_fas = fas_fn.check_fas_executable()
205218
if check_fas == 0:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
setup(
2828
name="fdog",
29-
version="0.1.17",
29+
version="0.1.18",
3030
python_requires='>=3.7.0',
3131
description="Feature-aware Directed OrtholoG search tool",
3232
long_description=long_description,

0 commit comments

Comments
 (0)