Skip to content

Commit 705d5dd

Browse files
authored
Convert latin_terms to a set (#12995)
1 parent dcd276d commit 705d5dd

File tree

2 files changed: +12 -5 lines changed

CHANGES.rst

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -146,6 +146,9 @@ Bugs fixed
146146
and ensure deterministic resolution of global toctree in parallel builds
147147
by choosing the lexicographically greatest parent document.
148148
Patch by A. Rafey Khan
149+
* #12995: Significantly improve performance when building the search index
150+
for Chinese languages.
151+
Patch by Adam Turner.
149152

150153

151154
Testing

sphinx/search/zh.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage):
227227
js_stemmer_code = js_porter_stemmer
228228
stopwords = english_stopwords
229229
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
230-
latin_terms: list[str] = []
230+
231+
def __init__(self, options: dict[str, str]) -> None:
232+
super().__init__(options)
233+
self.latin_terms: set[str] = set()
231234

232235
def init(self, options: dict[str, str]) -> None:
233236
if JIEBA:
@@ -238,12 +241,13 @@ def init(self, options: dict[str, str]) -> None:
238241
self.stemmer = snowballstemmer.stemmer('english')
239242

240243
def split(self, input: str) -> list[str]:
241-
chinese: list[str] = []
242244
if JIEBA:
243-
chinese = list(jieba.cut_for_search(input))
245+
chinese: list[str] = list(jieba.cut_for_search(input))
246+
else:
247+
chinese = []
244248

245249
latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
246-
self.latin_terms.extend(latin1)
250+
self.latin_terms.update(latin1)
247251
return chinese + latin1
248252

249253
def word_filter(self, stemmed_word: str) -> bool:
@@ -255,7 +259,7 @@ def stem(self, word: str) -> str:
255259
# avoids some issues with acronyms
256260
stemmed = self.stemmer.stemWord(word.lower())
257261
should_not_be_stemmed = (
258-
word in self.latin_terms and len(word) >= 3 > len(stemmed)
262+
len(word) >= 3 > len(stemmed) and word in self.latin_terms
259263
) # fmt: skip
260264
if should_not_be_stemmed:
261265
return word.lower()

0 commit comments

Comments
 (0)