sphinx-doc · AA-Turner · May 19, 2025 · May 15, 2025 · May 15, 2025 · May 15, 2025
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -20,6 +20,8 @@ Features added
   ``linkcheck_allowed_redirects = {}``.
   Patch by Adam Turner.
 * #13497: Support C domain objects in the table of contents.
+* #13535: html search: Update to the latest version of Snowball (v3.0.1).
+  Patch by Adam Turner.
 
 Bugs fixed
 ----------

diff --git a/doc/internals/contributing.rst b/doc/internals/contributing.rst
@@ -337,13 +337,15 @@ Updating generated files
 ------------------------
 
 * JavaScript stemming algorithms in :file:`sphinx/search/non-minified-js/*.js`
-  are generated using `snowball <https://github.com/snowballstem/snowball>`_
-  by cloning the repository, executing ``make dist_libstemmer_js`` and then
-  unpacking the tarball which is generated in :file:`dist` directory.
+  and stopword files in :file:`sphinx/search/_stopwords/`
+  are generated from the `Snowball project`_
+  by running :file:`utils/generate_snowball.py`.
 
   Minified files in :file:`sphinx/search/minified-js/*.js` are generated from
-  non-minified ones using :program:`uglifyjs` (installed via npm), with ``-m``
-  option to enable mangling.
+  non-minified ones using :program:`uglifyjs` (installed via npm).
+  See :file:`sphinx/search/minified-js/README.rst`.
+
+  .. _Snowball project: https://snowballstem.org/
 
 * The :file:`searchindex.js` files found in
   the :file:`tests/js/fixtures/*` directories

diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py
@@ -117,10 +117,7 @@
         """Return true if the target word should be registered in the search index.
         This method is called after stemming.
         """
-        return len(word) == 0 or not (
-            ((len(word) < 3) and (12353 < ord(word[0]) < 12436))
-            or (ord(word[0]) < 256 and (word in self.stopwords))
-        )
+        return word == '' or not (word.isdigit() or word in self.stopwords)
 
 
 # SearchEnglish imported after SearchLanguage is defined due to circular import
@@ -503,32 +500,34 @@
 
         _filter = self.lang.word_filter
         _stem = self.lang.stem
+        _mapping = self._mapping
 
         # memoise self.lang.stem
         @functools.cache
         def stem(word_to_stem: str) -> str:
             return _stem(word_to_stem).lower()
 
+        def add_term(term: str, mapping: dict[str, set[str]], /) -> None:
+            if _filter(term):
+                mapping.setdefault(term, set()).add(docname)
+
         self._all_titles[docname] = word_store.titles
 
         for word in word_store.title_words:
             # add stemmed and unstemmed as the stemmer must not remove words
             # from search index.
             stemmed_word = stem(word)
-            if _filter(stemmed_word):
-                self._title_mapping.setdefault(stemmed_word, set()).add(docname)
-            elif _filter(word):
-                self._title_mapping.setdefault(word, set()).add(docname)
+            add_term(stemmed_word, self._title_mapping)
+            add_term(word, self._title_mapping)
 
         for word in word_store.words:
             # add stemmed and unstemmed as the stemmer must not remove words
             # from search index.
             stemmed_word = stem(word)
-            if not _filter(stemmed_word) and _filter(word):
-                stemmed_word = word
             already_indexed = docname in self._title_mapping.get(stemmed_word, ())
-            if _filter(stemmed_word) and not already_indexed:
-                self._mapping.setdefault(stemmed_word, set()).add(docname)
+            if not already_indexed:
+                add_term(stemmed_word, _mapping)
+                add_term(word, _mapping)
 
         # find explicit entries within index directives
         _index_entries: set[tuple[str, str, str]] = set()
@@ -583,17 +582,18 @@
 
     def get_js_stemmer_code(self) -> str:
         """Returns JS code that will be inserted into language_data.js."""
-        if self.lang.js_stemmer_rawcode:
-            base_js_path = _NON_MINIFIED_JS_PATH / 'base-stemmer.js'
-            language_js_path = _NON_MINIFIED_JS_PATH / self.lang.js_stemmer_rawcode
-            base_js = base_js_path.read_text(encoding='utf-8')
-            language_js = language_js_path.read_text(encoding='utf-8')
-            return (
-                f'{base_js}\n{language_js}\nStemmer = {self.lang.language_name}Stemmer;'
-            )
-        else:
+        if not self.lang.js_stemmer_rawcode:
             return self.lang.js_stemmer_code
 
+        base_js_path = _MINIFIED_JS_PATH / 'base-stemmer.js'
+        language_js_path = _MINIFIED_JS_PATH / self.lang.js_stemmer_rawcode
+        return '\n'.join((
+            base_js_path.read_text(encoding='utf-8'),
+            language_js_path.read_text(encoding='utf-8'),
+            f'const Stemmer = {self.lang.language_name}Stemmer;',
+            f'globalThis.Stemmer = {self.lang.language_name}Stemmer;',
+        ))
+
 
 def _feed_visit_nodes(
     node: nodes.Node,

diff --git a/sphinx/search/_stopwords/da.py b/sphinx/search/_stopwords/da.py
@@ -1,3 +1,6 @@
+# automatically generated by utils/generate_snowball.py
+# from https://snowballstem.org/algorithms/danish/stop.txt
+
 from __future__ import annotations
 
 DANISH_STOPWORDS = frozenset({

diff --git a/sphinx/search/_stopwords/da.txt b/sphinx/search/_stopwords/da.txt
@@ -1,4 +1,11 @@
-| source: https://snowballstem.org/algorithms/danish/stop.txt
+
+ | A Danish stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | This is a ranked list (commonest to rarest) of stopwords derived from
+ | a large text sample.
+
+
 og           | and
 i            | in
 jeg          | I

diff --git a/sphinx/search/_stopwords/de.py b/sphinx/search/_stopwords/de.py
@@ -1,3 +1,6 @@
+# automatically generated by utils/generate_snowball.py
+# from https://snowballstem.org/algorithms/german/stop.txt
+
 from __future__ import annotations
 
 GERMAN_STOPWORDS = frozenset({

diff --git a/sphinx/search/_stopwords/de.txt b/sphinx/search/_stopwords/de.txt
@@ -1,4 +1,11 @@
-|source: https://snowballstem.org/algorithms/german/stop.txt
+
+ | A German stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | The number of forms in this list is reduced significantly by passing it
+ | through the German stemmer.
+
+
 aber           |  but
 
 alle           |  all

diff --git a/sphinx/search/_stopwords/en.py b/sphinx/search/_stopwords/en.py
@@ -1,37 +1,181 @@
+# automatically generated by utils/generate_snowball.py
+# from https://snowballstem.org/algorithms/english/stop.txt
+
 from __future__ import annotations
 
 ENGLISH_STOPWORDS = frozenset({
     'a',
+    'about',
+    'above',
+    'after',
+    'again',
+    'against',
+    'all',
+    'am',
+    'an',
     'and',
+    'any',
     'are',
+    "aren't",
     'as',
     'at',
     'be',
+    'because',
+    'been',
+    'before',
+    'being',
+    'below',
+    'between',
+    'both',
     'but',
     'by',
+    "can't",
+    'cannot',
+    'could',
+    "couldn't",
+    'did',
+    "didn't",
+    'do',
+    'does',
+    "doesn't",
+    'doing',
+    "don't",
+    'down',
+    'during',
+    'each',
+    'few',
     'for',
+    'from',
+    'further',
+    'had',
+    "hadn't",
+    'has',
+    "hasn't",
+    'have',
+    "haven't",
+    'having',
+    'he',
+    "he'd",
+    "he'll",
+    "he's",
+    'her',
+    'here',
+    "here's",
+    'hers',
+    'herself',
+    'him',
+    'himself',
+    'his',
+    'how',
+    "how's",
+    'i',
+    "i'd",
+    "i'll",
+    "i'm",
+    "i've",
     'if',
     'in',
     'into',
     'is',
+    "isn't",
     'it',
-    'near',
+    "it's",
+    'its',
+    'itself',
+    "let's",
+    'me',
+    'more',
+    'most',
+    "mustn't",
+    'my',
+    'myself',
     'no',
+    'nor',
     'not',
     'of',
+    'off',
     'on',
+    'once',
+    'only',
     'or',
+    'other',
+    'ought',
+    'our',
+    'ours',
+    'ourselves',
+    'out',
+    'over',
+    'own',
+    'same',
+    "shan't",
+    'she',
+    "she'd",
+    "she'll",
+    "she's",
+    'should',
+    "shouldn't",
+    'so',
+    'some',
     'such',
+    'than',
     'that',
+    "that's",
     'the',
     'their',
+    'theirs',
+    'them',
+    'themselves',
     'then',
     'there',
+    "there's",
     'these',
     'they',
+    "they'd",
+    "they'll",
+    "they're",
+    "they've",
     'this',
+    'those',
+    'through',
     'to',
+    'too',
+    'under',
+    'until',
+    'up',
+    'very',
     'was',
-    'will',
+    "wasn't",
+    'we',
+    "we'd",
+    "we'll",
+    "we're",
+    "we've",
+    'were',
+    "weren't",
+    'what',
+    "what's",
+    'when',
+    "when's",
+    'where',
+    "where's",
+    'which',
+    'while',
+    'who',
+    "who's",
+    'whom',
+    'why',
+    "why's",
     'with',
+    "won't",
+    'would',
+    "wouldn't",
+    'you',
+    "you'd",
+    "you'll",
+    "you're",
+    "you've",
+    'your',
+    'yours',
+    'yourself',
+    'yourselves',
 })