Closed
Description
Information
self.encoding = 'utf-8' if self.configs.tokenizer.unit == 'kspon_subword' else 'cp949'
def _parse_manifest_file(self):
    r"""
    Parse the manifest file into audio paths and transcripts.

    The file at ``self.configs.dataset.manifest_file_path`` is opened with
    ``self.encoding``. Every non-empty line must hold exactly three
    tab-separated fields; the first is kept as the audio path, the second
    is ignored, and the third is kept as the transcript. Empty lines
    (for example a trailing blank line) are skipped.

    Returns:
        audio_paths (list): list of audio path
        transcripts (list): list of transcript of audio

    Raises:
        ValueError: if a non-empty line does not contain exactly three
            tab-separated fields.
    """
    audio_paths = list()
    transcripts = list()

    with open(self.configs.dataset.manifest_file_path, encoding=self.encoding) as f:
        # Iterate the file object directly so the manifest is streamed
        # line by line instead of being loaded whole via readlines().
        for line_number, line in enumerate(f, start=1):
            # Only the line terminator is removed; tabs and spaces inside
            # the fields are left untouched.
            line = line.rstrip('\n')
            if not line:
                continue

            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    f"Malformed manifest line {line_number}: expected 3 "
                    f"tab-separated fields, got {len(fields)}"
                )

            # The middle field is present in the manifest but not returned.
            audio_path, _, transcript = fields
            audio_paths.append(audio_path)
            transcripts.append(transcript)

    return audio_paths, transcripts
Expected behavior
Current: self.encoding = 'utf-8' if self.configs.tokenizer.unit == 'kspon_subword' else 'cp949'
Expected: self.encoding = 'cp949' if self.configs.tokenizer.unit == 'kspon_grapheme' else 'utf-8'