Skip to content

Pre-parse stopword lists #13572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,5 @@ tests/roots/test-pycode/cp_1251_coded.py working-tree-encoding=windows-1251

tests/js/fixtures/**/*.js generated
sphinx/search/minified-js/*.js generated
sphinx/search/_stopwords/ generated
sphinx/themes/bizstyle/static/css3-mediaqueries.js generated
10 changes: 6 additions & 4 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from sphinx.util.index_entries import split_index_msg

if TYPE_CHECKING:
from collections.abc import Callable, Iterable
from collections.abc import Callable, Iterable, Set
from typing import Any, Protocol, TypeVar

from docutils.nodes import Node
Expand Down Expand Up @@ -74,7 +74,7 @@ class SearchLanguage:

lang: str = ''
language_name: str = ''
stopwords: set[str] = set()
stopwords: Set[str] = frozenset()
js_splitter_code: str = ''
js_stemmer_rawcode: str = ''
js_stemmer_code = """
Expand Down Expand Up @@ -128,9 +128,11 @@ def word_filter(self, word: str) -> bool:


def parse_stop_word(source: str) -> set[str]:
"""Parse snowball style word list like this:
"""Collect the stopwords from a snowball style word list:

* https://snowballstem.org/algorithms/finnish/stop.txt
.. code:: text

list of space separated stop words | optional comment
"""
result: set[str] = set()
for line in source.splitlines():
Expand Down
Empty file.
98 changes: 98 additions & 0 deletions sphinx/search/_stopwords/da.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from __future__ import annotations

# Danish stop words, laid out one row per initial letter for easier scanning.
# The members match the word column of the Snowball Danish stop list
# (https://snowballstem.org/algorithms/danish/stop.txt).
DANISH_STOPWORDS = frozenset({
    # a-b
    'ad', 'af', 'alle', 'alt', 'anden', 'at',
    'blev', 'blive', 'bliver',
    # d
    'da', 'de', 'dem', 'den', 'denne', 'der', 'deres',
    'det', 'dette', 'dig', 'din', 'disse', 'dog', 'du',
    # e-f
    'efter', 'eller', 'en', 'end', 'er', 'et',
    'for', 'fra',
    # h
    'ham', 'han', 'hans', 'har', 'havde', 'have', 'hende',
    'hendes', 'her', 'hos', 'hun', 'hvad', 'hvis', 'hvor',
    # i-k
    'i', 'ikke', 'ind',
    'jeg', 'jer', 'jo',
    'kunne',
    # m-n
    'man', 'mange', 'med', 'meget', 'men',
    'mig', 'min', 'mine', 'mit', 'mod',
    'ned', 'noget', 'nogle', 'nu', 'når',
    # o-p
    'og', 'også', 'om', 'op', 'os', 'over',
    'på',
    # s-t
    'selv', 'sig', 'sin', 'sine', 'sit',
    'skal', 'skulle', 'som', 'sådan',
    'thi', 'til',
    # u-v
    'ud', 'under',
    'var', 'vi', 'vil', 'ville', 'vor', 'være', 'været',
})
95 changes: 95 additions & 0 deletions sphinx/search/_stopwords/da.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
| source: https://snowballstem.org/algorithms/danish/stop.txt
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/yourself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
Loading
Loading