Skip to content

添加四则运算的中文格式化处理 #947

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions GPT_SoVITS/text/zh_normalization/num.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,29 @@ def replace_default_num(match):
return verbalize_digit(number, alt_one=True)


# Arithmetic expressions: addition, subtraction, multiplication, division
# (plus the equals sign).
# Capture groups used by replace_asmd: group 1 is the left operand, group 8
# the operator character, group 9 the right operand. Each operand is either
# an optionally negative integer/decimal or a bare fraction such as ".5".
RE_ASMD = re.compile(
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
# Maps every operator character accepted by RE_ASMD to the Chinese word that
# replaces it.
asmd_map = {
'+': '加',
'-': '减',
'×': '乘',
'÷': '除',
'=': '等于'
}


def replace_asmd(match) -> str:
    """Replace the operator of one arithmetic match with its Chinese word.

    Args:
        match (re.Match): a match whose group 1 is the left operand,
            group 8 the operator character and group 9 the right operand.
    Returns:
        str: the left operand, the asmd_map entry for the operator and the
            right operand, concatenated in that order.
    """
    left_operand, operator, right_operand = match.group(1, 8, 9)
    # Operands are passed through untouched; only the operator is rewritten.
    return "".join((left_operand, asmd_map[operator], right_operand))


# Numeric expressions
# Pure decimals: an optionally negative number that has a fractional part,
# or a bare fraction such as ".5".
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
Expand Down Expand Up @@ -155,7 +178,13 @@ def replace_number(match) -> str:
# match.group(1) and match.group(6) are copied from RE_NUMBER

# Numeric ranges such as "3-5" or "1.5~2.5".
# group(1) is the range start and group(6) is the range end; each is an
# optionally negative integer or decimal. The look-behind and look-ahead
# reject a candidate that is immediately preceded or followed by a digit or
# by one of the characters + - × ÷ =.
# Only the verbose pattern is passed to re.compile: an additional raw-string
# literal placed before it would be implicitly concatenated with it, which
# changes the regex and shifts the group numbers.
RE_RANGE = re.compile(
    r"""
(?<![\d\+\-\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
((-?)((\d+)(\.\d+)?)) # 匹配范围起始的负数或正数(整数或小数)
[-~] # 匹配范围分隔符
((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
(?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
""", re.VERBOSE)


def replace_range(match) -> str:
Expand All @@ -165,7 +194,7 @@ def replace_range(match) -> str:
Returns:
str
"""
first, second = match.group(1), match.group(8)
first, second = match.group(1), match.group(6)
first = RE_NUMBER.sub(replace_number, first)
second = RE_NUMBER.sub(replace_number, second)
result = f"{first}到{second}"
Expand Down
9 changes: 8 additions & 1 deletion GPT_SoVITS/text/zh_normalization/text_normlization.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import RE_TO_RANGE
from .num import RE_ASMD
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
Expand All @@ -42,6 +43,7 @@
from .num import replace_positive_quantifier
from .num import replace_range
from .num import replace_to_range
from .num import replace_asmd
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
from .phonecode import RE_TELEPHONE
Expand All @@ -67,7 +69,7 @@ def _split(self, text: str, lang="zh") -> List[str]:
if lang == "zh":
text = text.replace(" ", "")
# 过滤掉特殊字符
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|\\]', '', text)
text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text)
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
Expand Down Expand Up @@ -142,6 +144,11 @@ def normalize_sentence(self, sentence: str) -> str:
sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)

sentence = RE_RANGE.sub(replace_range, sentence)

# 处理加减乘除
while RE_ASMD.search(sentence):
sentence = RE_ASMD.sub(replace_asmd, sentence)

sentence = RE_INTEGER.sub(replace_negative_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
Expand Down