Skip to content

Commit 75400af

Browse files
authored
Update stemming and Snowball (#13561)
1 parent 954839a commit 75400af

34 files changed

+751
-64
lines changed

CHANGES.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ Features added
2020
``linkcheck_allowed_redirects = {}``.
2121
Patch by Adam Turner.
2222
* #13497: Support C domain objects in the table of contents.
23+
* #13535: html search: Update to the latest version of Snowball (v3.0.1).
24+
Patch by Adam Turner.
2325

2426
Bugs fixed
2527
----------

doc/internals/contributing.rst

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -337,13 +337,15 @@ Updating generated files
337337
------------------------
338338

339339
* JavaScript stemming algorithms in :file:`sphinx/search/non-minified-js/*.js`
340-
are generated using `snowball <https://github.com/snowballstem/snowball>`_
341-
by cloning the repository, executing ``make dist_libstemmer_js`` and then
342-
unpacking the tarball which is generated in :file:`dist` directory.
340+
and stopword files in :file:`sphinx/search/_stopwords/`
341+
are generated from the `Snowball project`_
342+
by running :file:`utils/generate_snowball.py`.
343343

344344
Minified files in :file:`sphinx/search/minified-js/*.js` are generated from
345-
non-minified ones using :program:`uglifyjs` (installed via npm), with ``-m``
346-
option to enable mangling.
345+
non-minified ones using :program:`uglifyjs` (installed via npm).
346+
See :file:`sphinx/search/minified-js/README.rst`.
347+
348+
.. _Snowball project: https://snowballstem.org/
347349

348350
* The :file:`searchindex.js` files found in
349351
the :file:`tests/js/fixtures/*` directories

sphinx/search/__init__.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,7 @@ def word_filter(self, word: str) -> bool:
117117
"""Return true if the target word should be registered in the search index.
118118
This method is called after stemming.
119119
"""
120-
return len(word) == 0 or not (
121-
((len(word) < 3) and (12353 < ord(word[0]) < 12436))
122-
or (ord(word[0]) < 256 and (word in self.stopwords))
123-
)
120+
return not word.isdigit() and word not in self.stopwords
124121

125122

126123
# SearchEnglish imported after SearchLanguage is defined due to circular import
@@ -583,17 +580,17 @@ def get_js_stemmer_rawcode(self) -> str | None:
583580

584581
def get_js_stemmer_code(self) -> str:
585582
"""Returns JS code that will be inserted into language_data.js."""
586-
if self.lang.js_stemmer_rawcode:
587-
base_js_path = _NON_MINIFIED_JS_PATH / 'base-stemmer.js'
588-
language_js_path = _NON_MINIFIED_JS_PATH / self.lang.js_stemmer_rawcode
589-
base_js = base_js_path.read_text(encoding='utf-8')
590-
language_js = language_js_path.read_text(encoding='utf-8')
591-
return (
592-
f'{base_js}\n{language_js}\nStemmer = {self.lang.language_name}Stemmer;'
593-
)
594-
else:
583+
if not self.lang.js_stemmer_rawcode:
595584
return self.lang.js_stemmer_code
596585

586+
base_js_path = _MINIFIED_JS_PATH / 'base-stemmer.js'
587+
language_js_path = _MINIFIED_JS_PATH / self.lang.js_stemmer_rawcode
588+
return '\n'.join((
589+
base_js_path.read_text(encoding='utf-8'),
590+
language_js_path.read_text(encoding='utf-8'),
591+
f'window.Stemmer = {self.lang.language_name}Stemmer;',
592+
))
593+
597594

598595
def _feed_visit_nodes(
599596
node: nodes.Node,

sphinx/search/_stopwords/da.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# automatically generated by utils/generate_snowball.py
2+
# from https://snowballstem.org/algorithms/danish/stop.txt
3+
14
from __future__ import annotations
25

36
DANISH_STOPWORDS = frozenset({

sphinx/search/_stopwords/da.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
| source: https://snowballstem.org/algorithms/danish/stop.txt
1+
2+
| A Danish stop word list. Comments begin with vertical bar. Each stop
3+
| word is at the start of a line.
4+
5+
| This is a ranked list (commonest to rarest) of stopwords derived from
6+
| a large text sample.
7+
8+
29
og | and
310
i | in
411
jeg | I

sphinx/search/_stopwords/de.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# automatically generated by utils/generate_snowball.py
2+
# from https://snowballstem.org/algorithms/german/stop.txt
3+
14
from __future__ import annotations
25

36
GERMAN_STOPWORDS = frozenset({

sphinx/search/_stopwords/de.txt

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
|source: https://snowballstem.org/algorithms/german/stop.txt
1+
2+
| A German stop word list. Comments begin with vertical bar. Each stop
3+
| word is at the start of a line.
4+
5+
| The number of forms in this list is reduced significantly by passing it
6+
| through the German stemmer.
7+
8+
29
aber | but
310

411
alle | all

sphinx/search/_stopwords/en.py

Lines changed: 146 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,181 @@
1+
# automatically generated by utils/generate_snowball.py
2+
# from https://snowballstem.org/algorithms/english/stop.txt
3+
14
from __future__ import annotations
25

36
ENGLISH_STOPWORDS = frozenset({
47
'a',
8+
'about',
9+
'above',
10+
'after',
11+
'again',
12+
'against',
13+
'all',
14+
'am',
15+
'an',
516
'and',
17+
'any',
618
'are',
19+
"aren't",
720
'as',
821
'at',
922
'be',
23+
'because',
24+
'been',
25+
'before',
26+
'being',
27+
'below',
28+
'between',
29+
'both',
1030
'but',
1131
'by',
32+
"can't",
33+
'cannot',
34+
'could',
35+
"couldn't",
36+
'did',
37+
"didn't",
38+
'do',
39+
'does',
40+
"doesn't",
41+
'doing',
42+
"don't",
43+
'down',
44+
'during',
45+
'each',
46+
'few',
1247
'for',
48+
'from',
49+
'further',
50+
'had',
51+
"hadn't",
52+
'has',
53+
"hasn't",
54+
'have',
55+
"haven't",
56+
'having',
57+
'he',
58+
"he'd",
59+
"he'll",
60+
"he's",
61+
'her',
62+
'here',
63+
"here's",
64+
'hers',
65+
'herself',
66+
'him',
67+
'himself',
68+
'his',
69+
'how',
70+
"how's",
71+
'i',
72+
"i'd",
73+
"i'll",
74+
"i'm",
75+
"i've",
1376
'if',
1477
'in',
1578
'into',
1679
'is',
80+
"isn't",
1781
'it',
18-
'near',
82+
"it's",
83+
'its',
84+
'itself',
85+
"let's",
86+
'me',
87+
'more',
88+
'most',
89+
"mustn't",
90+
'my',
91+
'myself',
1992
'no',
93+
'nor',
2094
'not',
2195
'of',
96+
'off',
2297
'on',
98+
'once',
99+
'only',
23100
'or',
101+
'other',
102+
'ought',
103+
'our',
104+
'ours',
105+
'ourselves',
106+
'out',
107+
'over',
108+
'own',
109+
'same',
110+
"shan't",
111+
'she',
112+
"she'd",
113+
"she'll",
114+
"she's",
115+
'should',
116+
"shouldn't",
117+
'so',
118+
'some',
24119
'such',
120+
'than',
25121
'that',
122+
"that's",
26123
'the',
27124
'their',
125+
'theirs',
126+
'them',
127+
'themselves',
28128
'then',
29129
'there',
130+
"there's",
30131
'these',
31132
'they',
133+
"they'd",
134+
"they'll",
135+
"they're",
136+
"they've",
32137
'this',
138+
'those',
139+
'through',
33140
'to',
141+
'too',
142+
'under',
143+
'until',
144+
'up',
145+
'very',
34146
'was',
35-
'will',
147+
"wasn't",
148+
'we',
149+
"we'd",
150+
"we'll",
151+
"we're",
152+
"we've",
153+
'were',
154+
"weren't",
155+
'what',
156+
"what's",
157+
'when',
158+
"when's",
159+
'where',
160+
"where's",
161+
'which',
162+
'while',
163+
'who',
164+
"who's",
165+
'whom',
166+
'why',
167+
"why's",
36168
'with',
169+
"won't",
170+
'would',
171+
"wouldn't",
172+
'you',
173+
"you'd",
174+
"you'll",
175+
"you're",
176+
"you've",
177+
'your',
178+
'yours',
179+
'yourself',
180+
'yourselves',
37181
})

0 commit comments

Comments
 (0)