Skip to content

Commit 2418f38

Browse files
committed
Pre-parse stopwords
1 parent 9ab73b2 commit 2418f38

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+5261
-2413
lines changed

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,5 @@ tests/roots/test-pycode/cp_1251_coded.py working-tree-encoding=windows-1251
6262

6363
tests/js/fixtures/**/*.js generated
6464
sphinx/search/minified-js/*.js generated
65+
sphinx/search/_stopwords/ generated
6566
sphinx/themes/bizstyle/static/css3-mediaqueries.js generated

sphinx/search/__init__.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from sphinx.util.index_entries import split_index_msg
2121

2222
if TYPE_CHECKING:
23-
from collections.abc import Callable, Iterable
23+
from collections.abc import Callable, Iterable, Set
2424
from typing import Any, Protocol, TypeVar
2525

2626
from docutils.nodes import Node
@@ -74,7 +74,7 @@ class SearchLanguage:
7474

7575
lang: str = ''
7676
language_name: str = ''
77-
stopwords: set[str] = set()
77+
stopwords: Set[str] = frozenset()
7878
js_splitter_code: str = ''
7979
js_stemmer_rawcode: str = ''
8080
js_stemmer_code = """
@@ -128,9 +128,11 @@ def word_filter(self, word: str) -> bool:
128128

129129

130130
def parse_stop_word(source: str) -> set[str]:
131-
"""Parse snowball style word list like this:
131+
"""Collect the stopwords from a snowball style word list:
132132
133-
* https://snowballstem.org/algorithms/finnish/stop.txt
133+
.. code:: text
134+
135+
list of space separated stop words | optional comment
134136
"""
135137
result: set[str] = set()
136138
for line in source.splitlines():

sphinx/search/_stopwords/__init__.py

Whitespace-only changes.

sphinx/search/_stopwords/da.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# automatically generated by utils/generate-snowball.py
2+
# from https://snowballstem.org/algorithms/danish/stop.txt
3+
4+
from __future__ import annotations
5+
6+
DANISH_STOPWORDS = frozenset({
7+
'ad',
8+
'af',
9+
'alle',
10+
'alt',
11+
'anden',
12+
'at',
13+
'blev',
14+
'blive',
15+
'bliver',
16+
'da',
17+
'de',
18+
'dem',
19+
'den',
20+
'denne',
21+
'der',
22+
'deres',
23+
'det',
24+
'dette',
25+
'dig',
26+
'din',
27+
'disse',
28+
'dog',
29+
'du',
30+
'efter',
31+
'eller',
32+
'en',
33+
'end',
34+
'er',
35+
'et',
36+
'for',
37+
'fra',
38+
'ham',
39+
'han',
40+
'hans',
41+
'har',
42+
'havde',
43+
'have',
44+
'hende',
45+
'hendes',
46+
'her',
47+
'hos',
48+
'hun',
49+
'hvad',
50+
'hvis',
51+
'hvor',
52+
'i',
53+
'ikke',
54+
'ind',
55+
'jeg',
56+
'jer',
57+
'jo',
58+
'kunne',
59+
'man',
60+
'mange',
61+
'med',
62+
'meget',
63+
'men',
64+
'mig',
65+
'min',
66+
'mine',
67+
'mit',
68+
'mod',
69+
'ned',
70+
'noget',
71+
'nogle',
72+
'nu',
73+
'når',
74+
'og',
75+
'også',
76+
'om',
77+
'op',
78+
'os',
79+
'over',
80+
'på',
81+
'selv',
82+
'sig',
83+
'sin',
84+
'sine',
85+
'sit',
86+
'skal',
87+
'skulle',
88+
'som',
89+
'sådan',
90+
'thi',
91+
'til',
92+
'ud',
93+
'under',
94+
'var',
95+
'vi',
96+
'vil',
97+
'ville',
98+
'vor',
99+
'være',
100+
'været',
101+
})

sphinx/search/_stopwords/da.txt

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
2+
| A Danish stop word list. Comments begin with a vertical bar. Each stop
3+
| word is at the start of a line.
4+
5+
| This is a ranked list (commonest to rarest) of stopwords derived from
6+
| a large text sample.
7+
8+
9+
og | and
10+
i | in
11+
jeg | I
12+
det | that (dem. pronoun)/it (pers. pronoun)
13+
at | that (in front of a sentence)/to (with infinitive)
14+
en | a/an
15+
den | it (pers. pronoun)/that (dem. pronoun)
16+
til | to/at/for/until/against/by/of/into, more
17+
er | present tense of "to be"
18+
som | who, as
19+
på | on/upon/in/at/to/after/of/with/for, on
20+
de | they
21+
med | with/by/in, along
22+
han | he
23+
af | of/by/from/off/for/in/with/on, off
24+
for | at/for/to/from/by/of/ago, in front/before, because
25+
ikke | not
26+
der | who/which, there/those
27+
var | past tense of "to be"
28+
mig | me/myself
29+
sig | oneself/himself/herself/itself/themselves
30+
men | but
31+
et | a/an/one, one (number), someone/somebody/one
32+
har | present tense of "to have"
33+
om | round/about/for/in/a, about/around/down, if
34+
vi | we
35+
min | my
36+
havde | past tense of "to have"
37+
ham | him
38+
hun | she
39+
nu | now
40+
over | over/above/across/by/beyond/past/on/about, over/past
41+
da | then, when/as/since
42+
fra | from/off/since, off, since
43+
du | you
44+
ud | out
45+
sin | his/her/its/one's
46+
dem | them
47+
os | us/ourselves
48+
op | up
49+
man | you/one
50+
hans | his
51+
hvor | where
52+
eller | or
53+
hvad | what
54+
skal | must/shall etc.
55+
selv | myself/yourself/herself/ourselves etc., even
56+
her | here
57+
alle | all/everyone/everybody etc.
58+
vil | will (verb)
59+
blev | past tense of "to stay/to remain/to get/to become"
60+
kunne | could
61+
ind | in
62+
når | when
63+
være | infinitive of "to be"
64+
dog | however/yet/after all
65+
noget | something
66+
ville | would
67+
jo | you know/you see (adv), yes
68+
deres | their/theirs
69+
efter | after/behind/according to/for/by/from, later/afterwards
70+
ned | down
71+
skulle | should
72+
denne | this
73+
end | than
74+
dette | this
75+
mit | my/mine
76+
også | also
77+
under | under/beneath/below/during, below/underneath
78+
have | have
79+
dig | you
80+
anden | other
81+
hende | her
82+
mine | my
83+
alt | everything
84+
meget | much/very, plenty of
85+
sit | his, her, its, one's
86+
sine | his, her, its, one's
87+
vor | our
88+
mod | against
89+
disse | these
90+
hvis | if
91+
din | your/yours
92+
nogle | some
93+
hos | by/at
94+
blive | be/become
95+
mange | many
96+
ad | by/through
97+
bliver | present tense of "to be/to become"
98+
hendes | her/hers
99+
været | been (past participle of "to be")
100+
thi | for (conj)
101+
jer | you
102+
sådan | such, like this/like that

0 commit comments

Comments
 (0)