30
30
import shutil
31
31
import multiprocessing as mp
32
32
import fdog .libs .alignment as align_fn
33
+ from tqdm import tqdm
33
34
34
35
########################### functions ##########################################
35
36
def check_path (path ):
@@ -356,7 +357,7 @@ def getSeedInfo(path):
356
357
del seq_records
357
358
return dic
358
359
359
- def checkCoOrthologs (candidate_name , best_hit , ref , fdog_ref_species , candidatesOutFile , msaTool , matrix , dataPath , tmp_path ):
360
+ def checkCoOrthologs (candidate_name , best_hit , ref , fdog_ref_species , candidatesOutFile , msaTool , matrix , dataPath , tmp_path , mode = 'silent' ):
360
361
###########getting sequences and write all in one file to make msa #########
361
362
name_file = candidate_name + ".co"
362
363
output_file = tmp_path + name_file + '.fasta'
@@ -384,17 +385,19 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
384
385
385
386
if msaTool == "muscle" :
386
387
if align_fn .get_muscle_version (msaTool ) == 'v3' :
387
- os .system ("muscle -quiet -in " + output_file + " -out " + aln_file )
388
- #print("muscle -quiet -in " + output_file + " -out " + aln_file)
388
+ cmd = "muscle -quiet -in " + output_file + " -out " + aln_file
389
389
else :
390
- os .system ("muscle -quiet -align" + output_file + " -out " + aln_file )
390
+ cmd = "muscle -align " + output_file + " -output " + aln_file
391
+ starting_subprocess (cmd , mode )
391
392
if not os .path .exists (aln_file ):
392
- print ("Muscle failed for " + candidate_name + ". Making MSA with Mafft-linsi." )
393
- os .system ('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file )
393
+ print ("Muscle failed for %s. Making MSA with Mafft-linsi." % (candidate_name ))
394
+ cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file
395
+ starting_subprocess (cmd , mode )
394
396
395
397
elif msaTool == "mafft-linsi" :
396
398
#print("mafft-linsi")
397
- os .system ('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file )
399
+ cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + output_file + ' > ' + aln_file
400
+ starting_subprocess (cmd , mode )
398
401
399
402
try :
400
403
distances = get_distance_biopython (aln_file , matrix )
@@ -406,8 +409,6 @@ def checkCoOrthologs(candidate_name, best_hit, ref, fdog_ref_species, candidates
406
409
#print("Failure in distance computation, Candidate %s will be rejected" % candidate_name)
407
410
return 0 , "NaN" , "NaN"
408
411
409
-
410
-
411
412
#distance_hit_query = distances[best_hit, candidate_name]
412
413
#distance_ref_hit = distances[best_hit, ref]
413
414
@@ -428,7 +429,7 @@ def backward_search(candidatesOutFile, fasta_path, strict, fdog_ref_species, eva
428
429
#print(seedDic)
429
430
blast_dir_path = dataPath + "/coreTaxa_dir/"
430
431
if not os .path .exists (blast_dir_path ):
431
- blast_dir_path = dataPath + "/blast_dir"
432
+ blast_dir_path = dataPath + "/blast_dir/"
432
433
if strict != True :
433
434
seed = [fdog_ref_species ]
434
435
try :
@@ -639,8 +640,9 @@ def cleanup(tmp, tmp_path):
639
640
if time .time () > timeout :
640
641
print ("tmp folder could not be removed!" )
641
642
break
643
+ #clean up whole contigs
642
644
643
- def coorthologs (candidate_names , tmp_path , candidatesFile , fasta , fdog_ref_species , msaTool , matrix ):
645
+ def coorthologs (candidate_names , tmp_path , candidatesFile , fasta , fdog_ref_species , msaTool , matrix , mode = 'silent' ):
644
646
if len (candidate_names ) == 1 :
645
647
return candidate_names
646
648
@@ -671,12 +673,18 @@ def coorthologs(candidate_names, tmp_path, candidatesFile, fasta, fdog_ref_speci
671
673
672
674
if msaTool == "muscle" :
673
675
if align_fn .get_muscle_version (msaTool ) == 'v3' :
674
- os . system ( "muscle -quiet -in " + out + " -out " + aln_file )
676
+ cmd = "muscle -quiet -in %s -out %s" % ( out , aln_file )
675
677
#print("muscle -quiet -in " + output_file + " -out " + aln_file)
676
678
else :
677
- os .system ("muscle -quiet -align" + out + " -out " + aln_file )
679
+ cmd = "muscle -align %s -output %s" % (out , aln_file )
680
+ starting_subprocess (cmd , mode )
681
+ if not os .path .exists (aln_file ):
682
+ print ("Muscle failed for %s. Making MSA with Mafft-linsi." % (aln_file ))
683
+ cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file
684
+ starting_subprocess (cmd , mode )
678
685
elif msaTool == "mafft-linsi" :
679
- os .system ('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + out + ' > ' + aln_file )
686
+ cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet %s > %s' % (out , aln_file )
687
+ starting_subprocess (cmd , mode )
680
688
681
689
distances = get_distance_biopython (aln_file , matrix )
682
690
@@ -808,18 +816,19 @@ def blockProfiles(core_path, group, mode, out):
808
816
check_path (fasta_path )
809
817
if msaTool == "muscle" :
810
818
if align_fn .get_muscle_version (msaTool ) == 'v3' :
811
- os . system ( "muscle -quiet -in " + fasta_path + " -out " + msa_path )
819
+ cmd = "muscle -quiet -in " + fasta_path + " -out " + msa_path
812
820
#print("muscle -quiet -in " + output_file + " -out " + aln_file)
813
821
else :
814
- os . system ( "muscle -quiet -align" + fasta_path + " -out " + msa_path )
822
+ cmd = "muscle -align " + fasta_path + " -output " + msa_path
815
823
elif msaTool == "mafft-linsi" :
816
- os .system ('mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path )
824
+ cmd = 'mafft --maxiterate 1000 --localpair --anysymbol --quiet ' + fasta_path + ' > ' + msa_path
825
+ starting_subprocess (cmd , mode )
817
826
818
827
profile_path = out + "/tmp/" + group + ".prfl"
819
828
820
829
######################## block profile #####################################
821
830
822
- print ("Building a block profile ..." )
831
+ print ("Building a block profile ..." , flush = True )
823
832
cmd = 'msa2prfl.pl ' + msa_path + ' --setname=' + group + ' >' + profile_path
824
833
starting_subprocess (cmd , 'silent' )
825
834
@@ -832,7 +841,7 @@ def blockProfiles(core_path, group, mode, out):
832
841
starting_subprocess (cmd , mode )
833
842
cmd = 'msa2prfl.pl ' + new_path + ' --setname=' + group + ' >' + profile_path
834
843
starting_subprocess (cmd , 'silent' )
835
- print (" \t ...finished \n " )
844
+ print (" \t ...finished \n " , flush = True )
836
845
837
846
return profile_path
838
847
@@ -1031,7 +1040,8 @@ def main():
1031
1040
sys .stderr = f
1032
1041
sys .stdout = f
1033
1042
else :
1034
- sys .stdout = Logger (f )
1043
+ pass
1044
+ #sys.stdout = Logger(f)
1035
1045
1036
1046
########################### other variables ################################
1037
1047
if searchTaxa == []:
@@ -1069,8 +1079,8 @@ def main():
1069
1079
cmd = 'mkdir ' + out + '/tmp'
1070
1080
starting_subprocess (cmd , 'silent' )
1071
1081
1072
- print ("Gene: " + group )
1073
- print ("fDOG reference species: " + fdog_ref_species + " \n " )
1082
+ print ("Gene: " + group , flush = True )
1083
+ print ("fDOG reference species: " + fdog_ref_species + " \n " , flush = True )
1074
1084
1075
1085
###################### preparations ########################################
1076
1086
@@ -1103,21 +1113,30 @@ def main():
1103
1113
for asName in assembly_names :
1104
1114
calls .append ([asName , out , assemblyDir , consensus_path , augustus_ref_species , group , length_extension , average_intron_length , evalue , strict , fdog_ref_species , msaTool , matrix , dataPath , filter , mode , fasta_path , profile_path , taxa , searchTool , checkCoorthologs , gene_prediction , metaeuk_db ])
1105
1115
1106
- results = (pool .imap_unordered (ortholog_search_tblastn , calls ))
1107
- pool .close ()
1108
- pool .join ()
1109
- for i in results :
1116
+
1117
+ #results = (pool.imap_unordered(ortholog_search_tblastn, calls))
1118
+ #pool.close()
1119
+ #pool.join()
1120
+ print ("Searching for orthologs ..." , flush = True )
1121
+ for i in tqdm (pool .imap_unordered (ortholog_search_tblastn , calls ),total = len (calls )):
1110
1122
ortholog_sequences .append ([i [0 ], i [1 ]])
1111
- for k in i [2 ]:
1112
- print (k )
1123
+ if mode == 'debug' :
1124
+ for k in i [2 ]:
1125
+ print (k )
1126
+ #for i in results:
1127
+ #ortholog_sequences.append([i[0], i[1]])
1128
+ #for k in i[2]:
1129
+ #print(k)
1130
+ print ("\t ...finished \n " , flush = True )
1113
1131
else :
1114
1132
###################### computation species wise ################
1115
- for asName in assembly_names :
1133
+ for asName in tqdm ( assembly_names ) :
1116
1134
args = [asName , out , assemblyDir , consensus_path , augustus_ref_species , group , length_extension , average_intron_length , evalue , strict , fdog_ref_species , msaTool , matrix , dataPath , filter , mode , fasta_path , profile_path , taxa , searchTool , checkCoorthologs , gene_prediction , metaeuk_db ]
1117
1135
reciprocal_sequences , candidatesOutFile , output_ortholog_search = ortholog_search_tblastn (args )
1118
1136
ortholog_sequences .append ([reciprocal_sequences , candidatesOutFile ])
1119
- for k in output_ortholog_search :
1120
- print (k )
1137
+ if mode == 'debug' :
1138
+ for k in output_ortholog_search :
1139
+ print (k )
1121
1140
1122
1141
time_ortholog_end = time .time ()
1123
1142
time_ortholog = time_ortholog_end - time_ortholog_start
@@ -1141,6 +1160,7 @@ def main():
1141
1160
tmp_path = out + '/tmp/'
1142
1161
fas_seed_id = createFasInput (orthologsOutFile , mappingFile )
1143
1162
cmd = 'fas.run --seed ' + fasta_path + ' --query ' + orthologsOutFile + ' --annotation_dir ' + tmp_path + 'anno_dir --bidirectional --tsv --phyloprofile ' + mappingFile + ' --seed_id "' + fas_seed_id + '" --out_dir ' + out + ' --out_name ' + group
1163
+ #print(cmd)
1144
1164
starting_subprocess (cmd , 'silent' )
1145
1165
clean_fas (out + group + "_forward.domains" , 'domains' )
1146
1166
clean_fas (out + group + "_reverse.domains" , 'domains' )
0 commit comments