Skip to content

Commit 3b62945

Browse files
committed
further updates to adapt fDOG-Assembly to the new fDOG version. Additionally bugfix in co-ortholog detection if MSA crashed due to too many sequences.
1 parent 41660e6 commit 3b62945

File tree

1 file changed

+51
-31
lines changed

1 file changed

+51
-31
lines changed

fdog/fDOGassembly.py

Lines changed: 51 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import shutil
3131
import multiprocessing as mp
3232
import fdog.libs.alignment as align_fn
33+
from tqdm import tqdm
3334

3435
########################### functions ##########################################
3536
def check_path(path):
@@ -356,7 +357,7 @@ def getSeedInfo(path):
356357
del seq_records
357358
return dic
358359

359-
def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path):
360+
def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidatesOutFile, msaTool, matrix, dataPath, tmp_path, mode='silent'):
360361
###########getting sequences and write all in one file to make msa #########
361362
name_file = candidate_name + ".co"
362363
output_file = tmp_path + name_file + '.fasta'
@@ -384,17 +385,19 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
384385

385386
if msaTool == "muscle":
386387
if align_fn.get_muscle_version(msaTool) == 'v3':
387-
os.system("muscle -quiet -in " + output_file + " -out " + aln_file)
388-
#print("muscle -quiet -in " + output_file + " -out " + aln_file)
388+
cmd = "muscle -quiet -in " + output_file + " -out " + aln_file
389389
else:
390-
os.system("muscle -quiet -align" + output_file + " -out " + aln_file)
390+
cmd = "muscle -align" + output_file + " -output " + aln_file
391+
starting_subprocess(cmd, mode)
391392
if not os.path.exists(aln_file):
392-
print("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi.")
393-
os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
393+
print("Muscle failed for %s. Making MSA with Mafft-linsi." % (candidate_name))
394+
cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file
395+
starting_subprocess(cmd, mode)
394396

395397
elif msaTool == "mafft-linsi":
396398
#print("mafft-linsi")
397-
os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file)
399+
cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file
400+
starting_subprocess(cmd, mode)
398401

399402
try:
400403
distances = get_distance_biopython(aln_file, matrix)
@@ -406,8 +409,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
406409
#print("Failure in distance computation, Candidate %s will be rejected" % candidate_name)
407410
return 0, "NaN", "NaN"
408411

409-
410-
411412
#distance_hit_query = distances[best_hit, candidate_name]
412413
#distance_ref_hit = distances[best_hit, ref]
413414

@@ -428,7 +429,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
428429
#print(seedDic)
429430
blast_dir_path = dataPath + "/coreTaxa_dir/"
430431
if not os.path.exists(blast_dir_path):
431-
blast_dir_path = dataPath + "/blast_dir"
432+
blast_dir_path = dataPath + "/blast_dir/"
432433
if strict != True:
433434
seed = [fdog_ref_species]
434435
try:
@@ -639,8 +640,9 @@ def cleanup(tmp, tmp_path):
639640
if time.time() > timeout:
640641
print("tmp folder could not be removed!")
641642
break
643+
#clean up whole contigs
642644

643-
def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix):
645+
def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_species, msaTool, matrix, mode='silent'):
644646
if len(candidate_names) == 1:
645647
return candidate_names
646648

@@ -671,12 +673,18 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
671673

672674
if msaTool == "muscle":
673675
if align_fn.get_muscle_version(msaTool) == 'v3':
674-
os.system("muscle -quiet -in " + out + " -out " + aln_file)
676+
cmd = "muscle -quiet -in %s -out %s" % (out, aln_file)
675677
#print("muscle -quiet -in " + output_file + " -out " + aln_file)
676678
else:
677-
os.system("muscle -quiet -align" + out + " -out " + aln_file)
679+
cmd = "muscle -align %s -output %s" % (out, aln_file)
680+
starting_subprocess(cmd, mode)
681+
if not os.path.exists(aln_file):
682+
print("Muscle failed for %s. Making MSA with Mafft-linsi." % (aln_file))
683+
cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file
684+
starting_subprocess(cmd, mode)
678685
elif msaTool == "mafft-linsi":
679-
os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file)
686+
cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet %s > %s'% (out, aln_file)
687+
starting_subprocess(cmd, mode)
680688

681689
distances = get_distance_biopython(aln_file, matrix)
682690

@@ -808,18 +816,19 @@ def blockProfiles(core_path, group, mode, out):
808816
check_path(fasta_path)
809817
if msaTool == "muscle":
810818
if align_fn.get_muscle_version(msaTool) == 'v3':
811-
os.system("muscle -quiet -in " + fasta_path + " -out " + msa_path)
819+
cmd= "muscle -quiet -in " + fasta_path + " -out " + msa_path
812820
#print("muscle -quiet -in " + output_file + " -out " + aln_file)
813821
else:
814-
os.system("muscle -quiet -align" + fasta_path + " -out " + msa_path)
822+
cmd = "muscle -quiet -align" + fasta_path + " -out " + msa_path
815823
elif msaTool == "mafft-linsi":
816-
os.system('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path)
824+
cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path
825+
starting_subprocess(cmd, mode)
817826

818827
profile_path = out + "/tmp/" + group + ".prfl"
819828

820829
######################## block profile #####################################
821830

822-
print("Building a block profile ...")
831+
print("Building a block profile ...", flush=True)
823832
cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
824833
starting_subprocess(cmd, 'silent')
825834

@@ -832,7 +841,7 @@ def blockProfiles(core_path, group, mode, out):
832841
starting_subprocess(cmd, mode)
833842
cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
834843
starting_subprocess(cmd, 'silent')
835-
print(" \t ...finished \n")
844+
print(" \t ...finished \n", flush=True)
836845

837846
return profile_path
838847

@@ -1031,7 +1040,8 @@ def main():
10311040
sys.stderr = f
10321041
sys.stdout = f
10331042
else:
1034-
sys.stdout = Logger(f)
1043+
pass
1044+
#sys.stdout = Logger(f)
10351045

10361046
########################### other variables ################################
10371047
if searchTaxa == []:
@@ -1069,8 +1079,8 @@ def main():
10691079
cmd = 'mkdir ' + out + '/tmp'
10701080
starting_subprocess(cmd, 'silent')
10711081

1072-
print("Gene: " + group)
1073-
print("fDOG reference species: " + fdog_ref_species + " \n")
1082+
print("Gene: " + group, flush=True)
1083+
print("fDOG reference species: " + fdog_ref_species + " \n",flush=True)
10741084

10751085
###################### preparations ########################################
10761086

@@ -1103,21 +1113,30 @@ def main():
11031113
for asName in assembly_names:
11041114
calls.append([asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db])
11051115

1106-
results = (pool.imap_unordered(ortholog_search_tblastn, calls))
1107-
pool.close()
1108-
pool.join()
1109-
for i in results:
1116+
1117+
#results = (pool.imap_unordered(ortholog_search_tblastn, calls))
1118+
#pool.close()
1119+
#pool.join()
1120+
print("Searching for orthologs ...", flush=True)
1121+
for i in tqdm(pool.imap_unordered(ortholog_search_tblastn, calls),total=len(calls)):
11101122
ortholog_sequences.append([i[0], i[1]])
1111-
for k in i[2]:
1112-
print(k)
1123+
if mode == 'debug':
1124+
for k in i[2]:
1125+
print(k)
1126+
#for i in results:
1127+
#ortholog_sequences.append([i[0], i[1]])
1128+
#for k in i[2]:
1129+
#print(k)
1130+
print("\t ...finished \n", flush=True)
11131131
else:
11141132
###################### computation species wise ################
1115-
for asName in assembly_names:
1133+
for asName in tqdm(assembly_names):
11161134
args = [asName, out, assemblyDir, consensus_path, augustus_ref_species, group, length_extension, average_intron_length, evalue, strict, fdog_ref_species, msaTool, matrix, dataPath, filter, mode, fasta_path, profile_path, taxa, searchTool, checkCoorthologs, gene_prediction, metaeuk_db]
11171135
reciprocal_sequences, candidatesOutFile, output_ortholog_search = ortholog_search_tblastn(args)
11181136
ortholog_sequences.append([reciprocal_sequences, candidatesOutFile])
1119-
for k in output_ortholog_search:
1120-
print(k)
1137+
if mode == 'debug':
1138+
for k in output_ortholog_search:
1139+
print(k)
11211140

11221141
time_ortholog_end = time.time()
11231142
time_ortholog = time_ortholog_end - time_ortholog_start
@@ -1141,6 +1160,7 @@ def main():
11411160
tmp_path = out + '/tmp/'
11421161
fas_seed_id = createFasInput(orthologsOutFile, mappingFile)
11431162
cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
1163+
#print(cmd)
11441164
starting_subprocess(cmd, 'silent')
11451165
clean_fas(out + group + "_forward.domains", 'domains')
11461166
clean_fas(out + group + "_reverse.domains", 'domains')

0 commit comments

Comments
 (0)