|
20 | 20 | from pathlib import Path
|
21 | 21 | from Bio import SeqIO
|
22 | 22 | from Bio.Blast.Applications import NcbiblastpCommandline
|
| 23 | +from ete3 import NCBITaxa |
23 | 24 |
|
24 | 25 | import fdog.libs.zzz as general_fn
|
25 | 26 | import fdog.libs.fasta as fasta_fn
|
26 | 27 | import fdog.libs.blast as blast_fn
|
27 | 28 | import fdog.libs.output as output_fn
|
| 29 | +import fdog.libs.tree as tree_fn |
28 | 30 |
|
29 | 31 |
|
30 | 32 | ##### FUNCTIONS FOR DATA/INPUT PREPARATION #####
|
@@ -117,6 +119,42 @@ def check_blast_version(corepath, refspec):
|
117 | 119 | 'ERROR: Error running blast (probably conflict with BLAST DBs versions)\n%s'
|
118 | 120 | % (NcbiblastpCommandline(query = query, db = blast_db)))
|
119 | 121 |
|
| 122 | +def check_ranks_core_taxa(corepath, minDist, maxDist): |
| 123 | + """ Check if all core taxa have a valid minDist and maxDist tax ID |
| 124 | + Return 2 dictionaries of taxa for invalid minDist and maxDist, where |
| 125 | + keys is taxon name and value is the next valid rank |
| 126 | + """ |
| 127 | + invalid_minDist = [] |
| 128 | + invalid_maxDist = [] |
| 129 | + ncbi = NCBITaxa() |
| 130 | + rank_list = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'] |
| 131 | + suggest_minIndex = rank_list.index(minDist) |
| 132 | + suggest_maxIndex = rank_list.index(maxDist) |
| 133 | + for f in os.listdir(corepath): |
| 134 | + if os.path.isdir(f'{corepath}/{f}'): |
| 135 | + id = f.split('@')[1] |
| 136 | + lineage = ncbi.get_lineage(id) |
| 137 | + ranks = ncbi.get_rank(lineage) |
| 138 | + if len(general_fn.matching_elements(ranks, minDist)) < 1: |
| 139 | + invalid_minDist.append(f) |
| 140 | + index_minDist = rank_list.index(minDist) + 1 |
| 141 | + while index_minDist < len(rank_list): |
| 142 | + if len(general_fn.matching_elements(ranks, rank_list[index_minDist])) > 0: |
| 143 | + if index_minDist > suggest_minIndex: |
| 144 | + suggest_minIndex = index_minDist |
| 145 | + break |
| 146 | + index_minDist += 1 |
| 147 | + if len(general_fn.matching_elements(ranks, maxDist)) < 1: |
| 148 | + invalid_maxDist.append(f) |
| 149 | + index_maxDist = rank_list.index(maxDist) + 1 |
| 150 | + while index_maxDist < len(rank_list): |
| 151 | + if len(general_fn.matching_elements(ranks, rank_list[index_maxDist])) > 0: |
| 152 | + if index_maxDist > suggest_maxIndex: |
| 153 | + suggest_maxIndex = index_maxDist |
| 154 | + break |
| 155 | + index_maxDist += 1 |
| 156 | + return(invalid_minDist, invalid_maxDist, rank_list[suggest_minIndex], rank_list[suggest_maxIndex]) |
| 157 | + |
120 | 158 |
|
121 | 159 | def get_seed_id_from_fa(core_fa, refspec):
|
122 | 160 | """ Get seed ID from core ortholog fasta file
|
@@ -147,11 +185,18 @@ def identify_seed_id(seqFile, refspec, corepath, debug, silentOff):
|
147 | 185 | # otherwise, perform blast search
|
148 | 186 | blast_xml = blast_fn.do_blastsearch(seqFile, refspec_db, evalBlast = 0.001)
|
149 | 187 | blast_out = blast_fn.parse_blast_xml(blast_xml)
|
| 188 | + if len(blast_out['hits']) < 1: |
| 189 | + print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!') |
| 190 | + print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7') |
| 191 | + sys.exit() |
150 | 192 | for hit in blast_out['hits']:
|
151 | 193 | if blast_out['hits'][hit]['align_len'] == blast_out['query_len']:
|
| 194 | + print("BEST BLAST HIT") |
152 | 195 | return(hit)
|
153 | 196 | elif abs(int(blast_out['hits'][hit]['align_len']) - int(blast_out['query_len'])) < 10:
|
154 | 197 | output_fn.print_stdout(silentOff, 'WARNING: Found seed sequence shorter/longer than input!')
|
155 | 198 | return(hit)
|
156 | 199 | else:
|
157 |
| - sys.exit('ERROR: Cannot find seed sequence in genome of reference species for %s!' % blast_out['query']) |
| 200 | + print(f'ERROR: Cannot find seed sequence {blast_out["query"]} in genome of reference species!') |
| 201 | + print(f'You can check it by running:\nblastp -query {seqFile} -db {corepath}/{refspec}/{refspec} -evalue 0.001 -outfmt 7') |
| 202 | + sys.exit() |
0 commit comments