Skip to content

Commit 8f26a64

Browse files
committed
Pre-parse stopwords
1 parent 9ab73b2 commit 8f26a64

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+4706
-2413
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,5 @@ tests/roots/test-pycode/cp_1251_coded.py working-tree-encoding=windows-1251
6262

6363
tests/js/fixtures/**/*.js generated
6464
sphinx/search/minified-js/*.js generated
65+
sphinx/search/_stopwords/ generated
6566
sphinx/themes/bizstyle/static/css3-mediaqueries.js generated

sphinx/search/__init__.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from sphinx.util.index_entries import split_index_msg
2121

2222
if TYPE_CHECKING:
23-
from collections.abc import Callable, Iterable
23+
from collections.abc import Callable, Iterable, Set
2424
from typing import Any, Protocol, TypeVar
2525

2626
from docutils.nodes import Node
@@ -74,7 +74,7 @@ class SearchLanguage:
7474

7575
lang: str = ''
7676
language_name: str = ''
77-
stopwords: set[str] = set()
77+
stopwords: Set[str] = frozenset()
7878
js_splitter_code: str = ''
7979
js_stemmer_rawcode: str = ''
8080
js_stemmer_code = """
@@ -128,9 +128,11 @@ def word_filter(self, word: str) -> bool:
128128

129129

130130
def parse_stop_word(source: str) -> set[str]:
131-
"""Parse snowball style word list like this:
131+
"""Collect the stopwords from a snowball style word list:
132132
133-
* https://snowballstem.org/algorithms/finnish/stop.txt
133+
.. code:: text
134+
135+
list of space separated stop words | optional comment
134136
"""
135137
result: set[str] = set()
136138
for line in source.splitlines():

sphinx/search/_stopwords/__init__.py

Whitespace-only changes.

sphinx/search/_stopwords/da.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
from __future__ import annotations
2+
3+
DANISH_STOPWORDS = frozenset({
4+
'ad',
5+
'af',
6+
'alle',
7+
'alt',
8+
'anden',
9+
'at',
10+
'blev',
11+
'blive',
12+
'bliver',
13+
'da',
14+
'de',
15+
'dem',
16+
'den',
17+
'denne',
18+
'der',
19+
'deres',
20+
'det',
21+
'dette',
22+
'dig',
23+
'din',
24+
'disse',
25+
'dog',
26+
'du',
27+
'efter',
28+
'eller',
29+
'en',
30+
'end',
31+
'er',
32+
'et',
33+
'for',
34+
'fra',
35+
'ham',
36+
'han',
37+
'hans',
38+
'har',
39+
'havde',
40+
'have',
41+
'hende',
42+
'hendes',
43+
'her',
44+
'hos',
45+
'hun',
46+
'hvad',
47+
'hvis',
48+
'hvor',
49+
'i',
50+
'ikke',
51+
'ind',
52+
'jeg',
53+
'jer',
54+
'jo',
55+
'kunne',
56+
'man',
57+
'mange',
58+
'med',
59+
'meget',
60+
'men',
61+
'mig',
62+
'min',
63+
'mine',
64+
'mit',
65+
'mod',
66+
'ned',
67+
'noget',
68+
'nogle',
69+
'nu',
70+
'når',
71+
'og',
72+
'også',
73+
'om',
74+
'op',
75+
'os',
76+
'over',
77+
'på',
78+
'selv',
79+
'sig',
80+
'sin',
81+
'sine',
82+
'sit',
83+
'skal',
84+
'skulle',
85+
'som',
86+
'sådan',
87+
'thi',
88+
'til',
89+
'ud',
90+
'under',
91+
'var',
92+
'vi',
93+
'vil',
94+
'ville',
95+
'vor',
96+
'være',
97+
'været',
98+
})

sphinx/search/_stopwords/da.txt

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
| source: https://snowballstem.org/algorithms/danish/stop.txt
2+
og | and
3+
i | in
4+
jeg | I
5+
det | that (dem. pronoun)/it (pers. pronoun)
6+
at | that (in front of a sentence)/to (with infinitive)
7+
en | a/an
8+
den | it (pers. pronoun)/that (dem. pronoun)
9+
til | to/at/for/until/against/by/of/into, more
10+
er | present tense of "to be"
11+
som | who, as
12+
på | on/upon/in/on/at/to/after/of/with/for, on
13+
de | they
14+
med | with/by/in, along
15+
han | he
16+
af | of/by/from/off/for/in/with/on, off
17+
for | at/for/to/from/by/of/ago, in front/before, because
18+
ikke | not
19+
der | who/which, there/those
20+
var | past tense of "to be"
21+
mig | me/myself
22+
sig | oneself/himself/herself/itself/themselves
23+
men | but
24+
et | a/an/one, one (number), someone/somebody/one
25+
har | present tense of "to have"
26+
om | round/about/for/in/a, about/around/down, if
27+
vi | we
28+
min | my
29+
havde | past tense of "to have"
30+
ham | him
31+
hun | she
32+
nu | now
33+
over | over/above/across/by/beyond/past/on/about, over/past
34+
da | then, when/as/since
35+
fra | from/off/since, off, since
36+
du | you
37+
ud | out
38+
sin | his/her/its/one's
39+
dem | them
40+
os | us/ourselves
41+
op | up
42+
man | you/one
43+
hans | his
44+
hvor | where
45+
eller | or
46+
hvad | what
47+
skal | must/shall etc.
48+
selv | myself/yourself/herself/ourselves etc., even
49+
her | here
50+
alle | all/everyone/everybody etc.
51+
vil | will (verb)
52+
blev | past tense of "to stay/to remain/to get/to become"
53+
kunne | could
54+
ind | in
55+
når | when
56+
være | infinitive of "to be"
57+
dog | however/yet/after all
58+
noget | something
59+
ville | would
60+
jo | you know/you see (adv), yes
61+
deres | their/theirs
62+
efter | after/behind/according to/for/by/from, later/afterwards
63+
ned | down
64+
skulle | should
65+
denne | this
66+
end | than
67+
dette | this
68+
mit | my/mine
69+
også | also
70+
under | under/beneath/below/during, below/underneath
71+
have | have
72+
dig | you
73+
anden | other
74+
hende | her
75+
mine | my
76+
alt | everything
77+
meget | much/very, plenty of
78+
sit | his, her, its, one's
79+
sine | his, her, its, one's
80+
vor | our
81+
mod | against
82+
disse | these
83+
hvis | if
84+
din | your/yours
85+
nogle | some
86+
hos | by/at
87+
blive | be/become
88+
mange | many
89+
ad | by/through
90+
bliver | present tense of "to be/to become"
91+
hendes | her/hers
92+
været | be
93+
thi | for (conj)
94+
jer | you
95+
sådan | such, like this/like that

0 commit comments

Comments
 (0)