@@ -140,7 +140,7 @@ def __init__(self, configs: Union[dict, str]=None):
140
140
self .win_length :int = 2048
141
141
self .n_speakers :int = 300
142
142
143
- self .langauges :list = ["auto" , "en" , "zh" , "ja" , "all_zh" , "all_ja" ]
143
+ self .languages :list = ["auto" , "en" , "zh" , "ja" , "all_zh" , "all_ja" ]
144
144
# print(self)
145
145
146
146
def _load_configs (self , configs_path : str )-> dict :
@@ -207,19 +207,19 @@ def __init__(self, configs: Union[dict, str, TTS_Config]):
207
207
208
208
209
209
self .prompt_cache :dict = {
210
- "ref_audio_path" : None ,
211
- "prompt_semantic" :None ,
212
- "refer_spepc" : None ,
213
- "prompt_text" : None ,
214
- "prompt_lang" : None ,
215
- "phones" : None ,
216
- "bert_features" : None ,
217
- "norm_text" : None ,
210
+ "ref_audio_path" : None ,
211
+ "prompt_semantic" : None ,
212
+ "refer_spec" : None ,
213
+ "prompt_text" : None ,
214
+ "prompt_lang" : None ,
215
+ "phones" : None ,
216
+ "bert_features" : None ,
217
+ "norm_text" : None ,
218
218
}
219
219
220
220
221
221
self .stop_flag :bool = False
222
- self .precison :torch .dtype = torch .float16 if self .configs .is_half else torch .float32
222
+ self .precision :torch .dtype = torch .float16 if self .configs .is_half else torch .float32
223
223
224
224
def _init_models (self ,):
225
225
self .init_t2s_weights (self .configs .t2s_weights_path )
@@ -312,7 +312,7 @@ def enable_half_precision(self, enable: bool = True):
312
312
return
313
313
314
314
self .configs .is_half = enable
315
- self .precison = torch .float16 if enable else torch .float32
315
+ self .precision = torch .float16 if enable else torch .float32
316
316
self .configs .save_configs ()
317
317
if enable :
318
318
if self .t2s_model is not None :
@@ -358,9 +358,9 @@ def set_ref_audio(self, ref_audio_path:str):
358
358
ref_audio_path: str, the path of the reference audio.
359
359
'''
360
360
self ._set_prompt_semantic (ref_audio_path )
361
- self ._set_ref_spepc (ref_audio_path )
361
+ self ._set_ref_spec (ref_audio_path )
362
362
363
- def _set_ref_spepc (self , ref_audio_path ):
363
+ def _set_ref_spec (self , ref_audio_path ):
364
364
audio = load_audio (ref_audio_path , int (self .configs .sampling_rate ))
365
365
audio = torch .FloatTensor (audio )
366
366
audio_norm = audio
@@ -376,8 +376,8 @@ def _set_ref_spepc(self, ref_audio_path):
376
376
spec = spec .to (self .configs .device )
377
377
if self .configs .is_half :
378
378
spec = spec .half ()
379
- # self.refer_spepc = spec
380
- self .prompt_cache ["refer_spepc " ] = spec
379
+ # self.refer_spec = spec
380
+ self .prompt_cache ["refer_spec " ] = spec
381
381
382
382
383
383
def _set_prompt_semantic (self , ref_wav_path :str ):
@@ -435,7 +435,7 @@ def to_batch(self, data:list,
435
435
threshold :float = 0.75 ,
436
436
split_bucket :bool = True ,
437
437
device :torch .device = torch .device ("cpu" ),
438
- precison :torch .dtype = torch .float32 ,
438
+ precision :torch .dtype = torch .float32 ,
439
439
):
440
440
441
441
_data :list = []
@@ -488,13 +488,13 @@ def to_batch(self, data:list,
488
488
for item in item_list :
489
489
if prompt_data is not None :
490
490
all_bert_features = torch .cat ([prompt_data ["bert_features" ], item ["bert_features" ]], 1 )\
491
- .to (dtype = precison , device = device )
491
+ .to (dtype = precision , device = device )
492
492
all_phones = torch .LongTensor (prompt_data ["phones" ]+ item ["phones" ]).to (device )
493
493
phones = torch .LongTensor (item ["phones" ]).to (device )
494
494
# norm_text = prompt_data["norm_text"]+item["norm_text"]
495
495
else :
496
496
all_bert_features = item ["bert_features" ]\
497
- .to (dtype = precison , device = device )
497
+ .to (dtype = precision , device = device )
498
498
phones = torch .LongTensor (item ["phones" ]).to (device )
499
499
all_phones = phones
500
500
# norm_text = item["norm_text"]
@@ -519,7 +519,7 @@ def to_batch(self, data:list,
519
519
#### 直接对phones和bert_features进行pad,会增大复读概率。
520
520
# all_phones_batch = self.batch_sequences(all_phones_list, axis=0, pad_value=0, max_length=max_len)
521
521
# all_bert_features_batch = all_bert_features_list
522
- # all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precison , device=device)
522
+ # all_bert_features_batch = torch.zeros(len(item_list), 1024, max_len, dtype=precision , device=device)
523
523
# for idx, item in enumerate(all_bert_features_list):
524
524
# all_bert_features_batch[idx, :, : item.shape[-1]] = item
525
525
@@ -555,8 +555,8 @@ def recovery_order(self, data:list, batch_index_list:list)->list:
555
555
Returns:
556
556
list (List[np.ndarray]): the data in the original order.
557
557
'''
558
- lenght = len (sum (batch_index_list , []))
559
- _data = [None ]* lenght
558
+ length = len (sum (batch_index_list , []))
559
+ _data = [None ]* length
560
560
for i , index_list in enumerate (batch_index_list ):
561
561
for j , index in enumerate (index_list ):
562
562
_data [index ] = data [i ][j ]
@@ -584,7 +584,7 @@ def run(self, inputs:dict):
584
584
"top_k": 5, # int. top k sampling
585
585
"top_p": 1, # float. top p sampling
586
586
"temperature": 1, # float. temperature for sampling
587
- "text_split_method": "cut0", # str. text split method, see text_segmentaion_method .py for details.
587
+ "text_split_method": "cut0", # str. text split method, see text_segmentation_method.py for details.
588
588
"batch_size": 1, # int. batch size for inference
589
589
"batch_threshold": 0.75, # float. threshold for batch splitting.
590
590
"split_bucket": True, # bool. whether to split the batch into multiple buckets.
@@ -594,7 +594,7 @@ def run(self, inputs:dict):
594
594
"seed": -1, # int. random seed for reproducibility.
595
595
}
596
596
returns:
597
- tulpe [int, np.ndarray]: sampling rate and audio data.
597
+ tuple [int, np.ndarray]: sampling rate and audio data.
598
598
"""
599
599
########## variables initialization ###########
600
600
self .stop_flag :bool = False
@@ -635,12 +635,12 @@ def run(self, inputs:dict):
635
635
if prompt_text in [None , "" ]:
636
636
no_prompt_text = True
637
637
638
- assert text_lang in self .configs .langauges
638
+ assert text_lang in self .configs .languages
639
639
if not no_prompt_text :
640
- assert prompt_lang in self .configs .langauges
640
+ assert prompt_lang in self .configs .languages
641
641
642
642
if ref_audio_path in [None , "" ] and \
643
- ((self .prompt_cache ["prompt_semantic" ] is None ) or (self .prompt_cache ["refer_spepc " ] is None )):
643
+ ((self .prompt_cache ["prompt_semantic" ] is None ) or (self .prompt_cache ["refer_spec " ] is None )):
644
644
raise ValueError ("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()" )
645
645
646
646
@@ -682,7 +682,7 @@ def run(self, inputs:dict):
682
682
threshold = batch_threshold ,
683
683
split_bucket = split_bucket ,
684
684
device = self .configs .device ,
685
- precison = self .precison
685
+ precision = self .precision
686
686
)
687
687
else :
688
688
print (i18n ("############ 切分文本 ############" ))
@@ -714,7 +714,7 @@ def make_batch(batch_texts):
714
714
threshold = batch_threshold ,
715
715
split_bucket = False ,
716
716
device = self .configs .device ,
717
- precison = self .precison
717
+ precision = self .precision
718
718
)
719
719
return batch [0 ]
720
720
@@ -760,8 +760,8 @@ def make_batch(batch_texts):
760
760
t4 = ttime ()
761
761
t_34 += t4 - t3
762
762
763
- refer_audio_spepc :torch .Tensor = self .prompt_cache ["refer_spepc " ]\
764
- .to (dtype = self .precison , device = self .configs .device )
763
+ refer_audio_spec :torch .Tensor = self .prompt_cache ["refer_spec " ]\
764
+ .to (dtype = self .precision , device = self .configs .device )
765
765
766
766
batch_audio_fragment = []
767
767
@@ -775,7 +775,7 @@ def make_batch(batch_texts):
775
775
# batch_phones = self.batch_sequences(batch_phones, axis=0, pad_value=0, max_length=max_len)
776
776
# batch_phones = batch_phones.to(self.configs.device)
777
777
# batch_audio_fragment = (self.vits_model.batched_decode(
778
- # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spepc
778
+ # pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec
779
779
# ))
780
780
781
781
# ## vits并行推理 method 2
@@ -786,7 +786,7 @@ def make_batch(batch_texts):
786
786
all_pred_semantic = torch .cat (pred_semantic_list ).unsqueeze (0 ).unsqueeze (0 ).to (self .configs .device )
787
787
_batch_phones = torch .cat (batch_phones ).unsqueeze (0 ).to (self .configs .device )
788
788
_batch_audio_fragment = (self .vits_model .decode (
789
- all_pred_semantic , _batch_phones ,refer_audio_spepc
789
+ all_pred_semantic , _batch_phones , refer_audio_spec
790
790
).detach ()[0 , 0 , :])
791
791
audio_frag_end_idx .insert (0 , 0 )
792
792
batch_audio_fragment = [_batch_audio_fragment [audio_frag_end_idx [i - 1 ]:audio_frag_end_idx [i ]] for i in range (1 , len (audio_frag_end_idx ))]
@@ -797,7 +797,7 @@ def make_batch(batch_texts):
797
797
# phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
798
798
# _pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
799
799
# audio_fragment =(self.vits_model.decode(
800
- # _pred_semantic, phones, refer_audio_spepc
800
+ # _pred_semantic, phones, refer_audio_spec
801
801
# ).detach()[0, 0, :])
802
802
# batch_audio_fragment.append(
803
803
# audio_fragment
@@ -866,7 +866,7 @@ def audio_postprocess(self,
866
866
)-> tuple [int , np .ndarray ]:
867
867
zero_wav = torch .zeros (
868
868
int (self .configs .sampling_rate * fragment_interval ),
869
- dtype = self .precison ,
869
+ dtype = self .precision ,
870
870
device = self .configs .device
871
871
)
872
872
0 commit comments