Open
Description
使用 transformers==4.53.1 时,CosyVoice 生成的语音内容混乱;改用 transformers==4.51.3 后恢复正常。
以下是调用代码:
def speech(
    self,
    input: str,
    voice: Optional[str] = "Chinese Female",
    speed: float = 1,
    reponse_format: str = "mp3",
    **kwargs,
) -> str:
    """Synthesize ``input`` with a preset voice and return the converted file path.

    The chunks yielded by ``self._model.inference_sft`` are written as
    mono 16-bit PCM into a temporary ``.wav`` file, which is then handed to
    ``convert`` together with ``reponse_format`` and ``speed``.

    Args:
        input: Text to synthesize.
        voice: Voice name; must be a member of ``self._voices``.
        speed: Passed to ``inference_sft`` and again to ``convert``.
        reponse_format: Target format passed to ``convert`` (parameter name
            kept as-is, including its spelling, for caller compatibility).
        **kwargs: Accepted but not used by this method.

    Returns:
        Whatever ``convert`` returns (annotated as a path string).

    Raises:
        ValueError: If ``voice`` is not in ``self._voices``.
    """
    if voice not in self._voices:
        raise ValueError(f"Voice {voice} not supported")
    original_voice = self._get_original_voice(voice)

    # Calls the CosyVoice SFT inference method here.
    # NOTE(review): `speed` is also forwarded to `convert` below; confirm the
    # speed change is not being applied twice.
    model_output = self._model.inference_sft(
        input, original_voice, stream=False, speed=speed
    )

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_file:
        wav_file_path = temp_file.name
        with wave.open(wav_file_path, "wb") as wf:
            wf.setnchannels(1)  # single track
            wf.setsampwidth(2)  # 16-bit
            # NOTE(review): the rate is hard-coded; confirm it matches the
            # sample rate the loaded model actually produces.
            wf.setframerate(22050)  # Sample rate
            for chunk in model_output:
                scaled = chunk["tts_speech"].numpy() * (2**15)
                # Clamp before the cast: a sample at +1.0 scales to 32768,
                # which does not fit in int16 and would otherwise wrap.
                pcm = np.clip(scaled, -32768, 32767).astype(np.int16)
                wf.writeframes(pcm.tobytes())
        # Convert while the temporary file still exists; it is deleted when
        # the enclosing `with` block exits.
        output_file_path = convert(wav_file_path, reponse_format, speed)
    return output_file_path
环境:
ubuntu22.04
NVIDIA-GeForce-RTX-4090
CosyVoice版本:6b21f8e
Metadata
Metadata
Assignees
Labels
No labels