diff --git a/CHANGES.rst b/CHANGES.rst index d26a93871a5..c0ed8089a60 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -20,6 +20,8 @@ Features added ``linkcheck_allowed_redirects = {}``. Patch by Adam Turner. * #13497: Support C domain objects in the table of contents. +* #13535: html search: Update to the latest version of Snowball (v3.0.1). + Patch by Adam Turner. Bugs fixed ---------- diff --git a/doc/internals/contributing.rst b/doc/internals/contributing.rst index 4b8ca84a945..de4224d7bc3 100644 --- a/doc/internals/contributing.rst +++ b/doc/internals/contributing.rst @@ -337,13 +337,15 @@ Updating generated files ------------------------ * JavaScript stemming algorithms in :file:`sphinx/search/non-minified-js/*.js` - are generated using `snowball `_ - by cloning the repository, executing ``make dist_libstemmer_js`` and then - unpacking the tarball which is generated in :file:`dist` directory. + and stopword files in :file:`sphinx/search/_stopwords/` + are generated from the `Snowball project`_ + by running :file:`utils/generate_snowball.py`. Minified files in :file:`sphinx/search/minified-js/*.js` are generated from - non-minified ones using :program:`uglifyjs` (installed via npm), with ``-m`` - option to enable mangling. + non-minified ones using :program:`uglifyjs` (installed via npm). + See :file:`sphinx/search/minified-js/README.rst`. + + .. _Snowball project: https://snowballstem.org/ * The :file:`searchindex.js` files found in the :file:`tests/js/fixtures/*` directories diff --git a/sphinx/search/__init__.py b/sphinx/search/__init__.py index 1cb05bea0e2..cc997bf6456 100644 --- a/sphinx/search/__init__.py +++ b/sphinx/search/__init__.py @@ -117,10 +117,7 @@ def word_filter(self, word: str) -> bool: """Return true if the target word should be registered in the search index. This method is called after stemming. 
""" - return len(word) == 0 or not ( - ((len(word) < 3) and (12353 < ord(word[0]) < 12436)) - or (ord(word[0]) < 256 and (word in self.stopwords)) - ) + return not word.isdigit() and word not in self.stopwords # SearchEnglish imported after SearchLanguage is defined due to circular import @@ -583,17 +580,17 @@ def get_js_stemmer_rawcode(self) -> str | None: def get_js_stemmer_code(self) -> str: """Returns JS code that will be inserted into language_data.js.""" - if self.lang.js_stemmer_rawcode: - base_js_path = _NON_MINIFIED_JS_PATH / 'base-stemmer.js' - language_js_path = _NON_MINIFIED_JS_PATH / self.lang.js_stemmer_rawcode - base_js = base_js_path.read_text(encoding='utf-8') - language_js = language_js_path.read_text(encoding='utf-8') - return ( - f'{base_js}\n{language_js}\nStemmer = {self.lang.language_name}Stemmer;' - ) - else: + if not self.lang.js_stemmer_rawcode: return self.lang.js_stemmer_code + base_js_path = _MINIFIED_JS_PATH / 'base-stemmer.js' + language_js_path = _MINIFIED_JS_PATH / self.lang.js_stemmer_rawcode + return '\n'.join(( + base_js_path.read_text(encoding='utf-8'), + language_js_path.read_text(encoding='utf-8'), + f'window.Stemmer = {self.lang.language_name}Stemmer;', + )) + def _feed_visit_nodes( node: nodes.Node, diff --git a/sphinx/search/_stopwords/da.py b/sphinx/search/_stopwords/da.py index c31a51c6df2..de8fa937b8a 100644 --- a/sphinx/search/_stopwords/da.py +++ b/sphinx/search/_stopwords/da.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate-snowball.py +# from https://snowballstem.org/algorithms/danish/stop.txt + from __future__ import annotations DANISH_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/da.txt b/sphinx/search/_stopwords/da.txt index 6f2bd01afc2..37052042642 100644 --- a/sphinx/search/_stopwords/da.txt +++ b/sphinx/search/_stopwords/da.txt @@ -1,4 +1,11 @@ -| source: https://snowballstem.org/algorithms/danish/stop.txt + + | A Danish stop word list. Comments begin with vertical bar. 
Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + og | and i | in jeg | I diff --git a/sphinx/search/_stopwords/de.py b/sphinx/search/_stopwords/de.py index 26ee3322ff3..d37e2105288 100644 --- a/sphinx/search/_stopwords/de.py +++ b/sphinx/search/_stopwords/de.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/german/stop.txt + from __future__ import annotations GERMAN_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/de.txt b/sphinx/search/_stopwords/de.txt index 94c4777bd05..c8935ae61c7 100644 --- a/sphinx/search/_stopwords/de.txt +++ b/sphinx/search/_stopwords/de.txt @@ -1,4 +1,11 @@ -|source: https://snowballstem.org/algorithms/german/stop.txt + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + aber | but alle | all diff --git a/sphinx/search/_stopwords/en.py b/sphinx/search/_stopwords/en.py index 01bac4cf14e..a4d9f800a02 100644 --- a/sphinx/search/_stopwords/en.py +++ b/sphinx/search/_stopwords/en.py @@ -1,37 +1,181 @@ +# automatically generated by utils/generate-snowball.py +# from https://snowballstem.org/algorithms/english/stop.txt + from __future__ import annotations ENGLISH_STOPWORDS = frozenset({ 'a', + 'about', + 'above', + 'after', + 'again', + 'against', + 'all', + 'am', + 'an', 'and', + 'any', 'are', + "aren't", 'as', 'at', 'be', + 'because', + 'been', + 'before', + 'being', + 'below', + 'between', + 'both', 'but', 'by', + "can't", + 'cannot', + 'could', + "couldn't", + 'did', + "didn't", + 'do', + 'does', + "doesn't", + 'doing', + "don't", + 'down', + 'during', + 'each', + 'few', 'for', + 'from', + 'further', + 'had', + "hadn't", + 'has', + "hasn't", + 'have', + "haven't", + 'having', + 'he', + "he'd", + "he'll", + "he's", + 'her', + 'here', + "here's", + 'hers', + 'herself', + 'him', + 'himself', + 'his', + 'how', + "how's", + 'i', + "i'd", + "i'll", + "i'm", + "i've", 'if', 'in', 'into', 'is', + "isn't", 'it', - 'near', + "it's", + 'its', + 'itself', + "let's", + 'me', + 'more', + 'most', + "mustn't", + 'my', + 'myself', 'no', + 'nor', 'not', 'of', + 'off', 'on', + 'once', + 'only', 'or', + 'other', + 'ought', + 'our', + 'ours', + 'ourselves', + 'out', + 'over', + 'own', + 'same', + "shan't", + 'she', + "she'd", + "she'll", + "she's", + 'should', + "shouldn't", + 'so', + 'some', 'such', + 'than', 'that', + "that's", 'the', 'their', + 'theirs', + 'them', + 'themselves', 'then', 'there', + "there's", 'these', 'they', + "they'd", + "they'll", + "they're", + "they've", 'this', + 'those', + 'through', 'to', + 'too', + 'under', + 'until', + 'up', + 'very', 'was', - 'will', + "wasn't", + 'we', + "we'd", + "we'll", + "we're", + "we've", + 'were', + "weren't", + 'what', + "what's", + 'when', + "when's", + 'where', + "where's", + 'which', + 
'while', + 'who', + "who's", + 'whom', + 'why', + "why's", 'with', + "won't", + 'would', + "wouldn't", + 'you', + "you'd", + "you'll", + "you're", + "you've", + 'your', + 'yours', + 'yourself', + 'yourselves', }) diff --git a/sphinx/search/_stopwords/en.txt b/sphinx/search/_stopwords/en.txt new file mode 100644 index 00000000000..e40c8c8cd6e --- /dev/null +++ b/sphinx/search/_stopwords/en.txt @@ -0,0 +1,310 @@ + + | An English stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | Many of the forms below are quite rare (e.g. "yourselves") but included for + | completeness. + + | PRONOUNS FORMS + | 1st person sing + +i | subject, always in upper case of course + +me | object +my | possessive adjective + | the possessive pronoun `mine' is best suppressed, because of the + | sense of coal-mine etc. +myself | reflexive + | 1st person plural +we | subject + +| us | object + | care is required here because US = United States. It is usually + | safe to remove it if it is in lower case. +our | possessive adjective +ours | possessive pronoun +ourselves | reflexive + | second person (archaic `thou' forms not included) +you | subject and object +your | possessive adjective +yours | possessive pronoun +yourself | reflexive (singular) +yourselves | reflexive (plural) + | third person singular +he | subject +him | object +his | possessive adjective and pronoun +himself | reflexive + +she | subject +her | object and possessive adjective +hers | possessive pronoun +herself | reflexive + +it | subject and object +its | possessive adjective +itself | reflexive + | third person plural +they | subject +them | object +their | possessive adjective +theirs | possessive pronoun +themselves | reflexive + | other forms (demonstratives, interrogatives) +what +which +who +whom +this +that +these +those + + | VERB FORMS (using F.R. 
Palmer's nomenclature) + | BE +am | 1st person, present +is | -s form (3rd person, present) +are | present +was | 1st person, past +were | past +be | infinitive +been | past participle +being | -ing form + | HAVE +have | simple +has | -s form +had | past +having | -ing form + | DO +do | simple +does | -s form +did | past +doing | -ing form + + | The forms below are best omitted, because of the significant homonym forms: + + | He made a WILL + | old tin CAN + | merry month of MAY + | a smell of MUST + | fight the good fight with all thy MIGHT + + | would, could, should, ought might however be included + + | | AUXILIARIES + | | WILL + |will + +would + + | | SHALL + |shall + +should + + | | CAN + |can + +could + + | | MAY + |may + |might + | | MUST + |must + | | OUGHT + +ought + + | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing + | pronoun + verb + +i'm +you're +he's +she's +it's +we're +they're +i've +you've +we've +they've +i'd +you'd +he'd +she'd +we'd +they'd +i'll +you'll +he'll +she'll +we'll +they'll + + | verb + negation + +isn't +aren't +wasn't +weren't +hasn't +haven't +hadn't +doesn't +don't +didn't + + | auxiliary + negation + +won't +wouldn't +shan't +shouldn't +can't +cannot +couldn't +mustn't + + | miscellaneous forms + +let's +that's +who's +what's +here's +there's +when's +where's +why's +how's + + | rarer forms + + | daren't needn't + + | doubtful forms + + | oughtn't mightn't + + | ARTICLES +a +an +the + + | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so + | high, that classification is pointless.) 
+and +but +if +or +because +as +until +while + +of +at +by +for +with +about +against +between +into +through +during +before +after +above +below +to +from +up +down +in +out +on +off +over +under + +again +further +then +once + +here +there +when +where +why +how + +all +any +both +each +few +more +most +other +some +such + +no +nor +not +only +own +same +so +than +too +very + + | Just for the record, the following words are among the commonest in English + + | one + | every + | least + | less + | many + | now + | ever + | never + | say + | says + | said + | also + | get + | go + | goes + | just + | made + | make + | put + | see + | seen + | whether + | like + | well + | back + | even + | still + | way + | take + | since + | another + | however + | two + | three + | four + | five + | first + | second + | new + | old + | high + | long diff --git a/sphinx/search/_stopwords/es.py b/sphinx/search/_stopwords/es.py index d70b317d032..5db38b0cd5b 100644 --- a/sphinx/search/_stopwords/es.py +++ b/sphinx/search/_stopwords/es.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate-snowball.py +# from https://snowballstem.org/algorithms/spanish/stop.txt + from __future__ import annotations SPANISH_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/es.txt b/sphinx/search/_stopwords/es.txt index d7047b93164..416c84d225a 100644 --- a/sphinx/search/_stopwords/es.txt +++ b/sphinx/search/_stopwords/es.txt @@ -1,4 +1,13 @@ -|source: https://snowballstem.org/algorithms/spanish/stop.txt + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ de | from, of la | the, her que | who, that diff --git a/sphinx/search/_stopwords/fi.py b/sphinx/search/_stopwords/fi.py index d7586cba227..ed29c8a6f0a 100644 --- a/sphinx/search/_stopwords/fi.py +++ b/sphinx/search/_stopwords/fi.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate-snowball.py +# from https://snowballstem.org/algorithms/finnish/stop.txt + from __future__ import annotations FINNISH_STOPWORDS = frozenset({ @@ -52,6 +55,7 @@ 'jossa', 'josta', 'jota', + 'joten', 'jotka', 'kanssa', 'keiden', diff --git a/sphinx/search/_stopwords/fi.txt b/sphinx/search/_stopwords/fi.txt index 9aff8a79929..5ad14064e58 100644 --- a/sphinx/search/_stopwords/fi.txt +++ b/sphinx/search/_stopwords/fi.txt @@ -1,4 +1,4 @@ -| source: https://snowballstem.org/algorithms/finnish/stop.txt + | forms of BE olla @@ -60,6 +60,7 @@ jotka joiden joita joissa joista joihin joilla joilta joille joina että | that ja | and jos | if +joten | so koska | because kuin | than mutta | but @@ -83,6 +84,5 @@ yli | over, across | other kun | when -niin | so nyt | now itse | self diff --git a/sphinx/search/_stopwords/fr.py b/sphinx/search/_stopwords/fr.py index 7dfd86d7445..cb2e5ef9501 100644 --- a/sphinx/search/_stopwords/fr.py +++ b/sphinx/search/_stopwords/fr.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate-snowball.py +# from https://snowballstem.org/algorithms/french/stop.txt + from __future__ import annotations FRENCH_STOPWORDS = frozenset({ @@ -6,14 +9,11 @@ 'aient', 'aies', 'ait', - 'as', 'au', - 'aura', 'aurai', 'auraient', 'aurais', 'aurait', - 'auras', 'aurez', 'auriez', 'aurions', @@ -26,7 +26,6 @@ 'avec', 'avez', 'aviez', - 'avions', 'avons', 'ayant', 'ayez', @@ -47,7 +46,6 @@ 'elle', 'en', 'es', - 'est', 'et', 'eu', 'eue', @@ -73,7 +71,6 @@ 'fussions', 'fut', 'fûmes', - 'fût', 'fûtes', 'ici', 'il', @@ -133,8 +130,6 @@ 'soient', 'sois', 'soit', - 'sommes', - 'son', 'sont', 'soyez', 'soyons', @@ -160,9 +155,7 @@ 'étant', 'étiez', 'étions', - 'été', 'étée', 
'étées', - 'étés', 'êtes', }) diff --git a/sphinx/search/_stopwords/fr.txt b/sphinx/search/_stopwords/fr.txt index 7839ab57c86..9cb744c3c25 100644 --- a/sphinx/search/_stopwords/fr.txt +++ b/sphinx/search/_stopwords/fr.txt @@ -1,4 +1,7 @@ -| source: https://snowballstem.org/algorithms/french/stop.txt + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + au | a + le aux | a + les avec | with @@ -40,7 +43,7 @@ qui | who sa | his, her (fem) se | oneself ses | his (pl) -son | his, her (masc) + | son | his, her (masc). Omitted because it is homonym of "sound" sur | on ta | thy (fem) te | thee @@ -68,15 +71,15 @@ t | t' y | there | forms of être (not including the infinitive): -été + | été - Omitted because it is homonym of "summer" étée étées -étés + | étés - Omitted because it is homonym of "summers" étant suis es -est -sommes + | est - Omitted because it is homonym of "east" + | sommes - Omitted because it is homonym of "sums" êtes sont serai @@ -107,7 +110,7 @@ soyez soient fusse fusses -fût + | fût - Omitted because it is homonym of "tap", like in "beer on tap" fussions fussiez fussent @@ -119,13 +122,13 @@ eue eues eus ai -as + | as - Omitted because it is homonym of "ace" avons avez ont aurai -auras -aura + | auras - Omitted because it is also the name of a kind of wind + | aura - Omitted because it is also the name of a kind of wind and homonym of "aura" aurons aurez auront @@ -136,7 +139,7 @@ auriez auraient avais avait -avions + | avions - Omitted because it is homonym of "planes" aviez avaient eut diff --git a/sphinx/search/_stopwords/hu.py b/sphinx/search/_stopwords/hu.py index 83bee011b0f..8f41b67b28a 100644 --- a/sphinx/search/_stopwords/hu.py +++ b/sphinx/search/_stopwords/hu.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate-snowball.py +# from https://snowballstem.org/algorithms/hungarian/stop.txt + from __future__ import annotations HUNGARIAN_STOPWORDS = frozenset({ diff --git 
a/sphinx/search/_stopwords/hu.txt b/sphinx/search/_stopwords/hu.txt index 658c6194f27..2599a8d1b96 100644 --- a/sphinx/search/_stopwords/hu.txt +++ b/sphinx/search/_stopwords/hu.txt @@ -1,5 +1,7 @@ -| source: https://snowballstem.org/algorithms/hungarian/stop.txt + +| Hungarian stop word list | prepared by Anna Tordai + a ahogy ahol diff --git a/sphinx/search/_stopwords/it.py b/sphinx/search/_stopwords/it.py index 4b0f522ac94..873a2c1f77b 100644 --- a/sphinx/search/_stopwords/it.py +++ b/sphinx/search/_stopwords/it.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/italian/stop.txt + from __future__ import annotations ITALIAN_STOPWORDS = frozenset({ @@ -213,6 +216,7 @@ 'sono', 'sta', 'stai', + 'stando', 'stanno', 'starai', 'staranno', diff --git a/sphinx/search/_stopwords/it.txt b/sphinx/search/_stopwords/it.txt index c8776836110..a20bb9528a5 100644 --- a/sphinx/search/_stopwords/it.txt +++ b/sphinx/search/_stopwords/it.txt @@ -1,4 +1,7 @@ -| source: https://snowballstem.org/algorithms/italian/stop.txt + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. 
+ ad | a (to) before vowel al | a + il allo | a + lo @@ -289,3 +292,4 @@ stessi stesse stessimo stessero +stando diff --git a/sphinx/search/_stopwords/nl.py b/sphinx/search/_stopwords/nl.py index 1742ec8dad2..b8b9a4dcfcd 100644 --- a/sphinx/search/_stopwords/nl.py +++ b/sphinx/search/_stopwords/nl.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/dutch/stop.txt + from __future__ import annotations DUTCH_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/nl.txt b/sphinx/search/_stopwords/nl.txt index 64336d0623b..edf99730a2c 100644 --- a/sphinx/search/_stopwords/nl.txt +++ b/sphinx/search/_stopwords/nl.txt @@ -1,4 +1,14 @@ -| source: https://snowballstem.org/algorithms/dutch/stop.txt + + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + de | the en | and van | of, from diff --git a/sphinx/search/_stopwords/no.py b/sphinx/search/_stopwords/no.py index 9b9bfbea4c9..d06cfc4d798 100644 --- a/sphinx/search/_stopwords/no.py +++ b/sphinx/search/_stopwords/no.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/norwegian/stop.txt + from __future__ import annotations NORWEGIAN_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/no.txt b/sphinx/search/_stopwords/no.txt index 552ad326a55..c3d5da01e72 100644 --- a/sphinx/search/_stopwords/no.txt +++ b/sphinx/search/_stopwords/no.txt @@ -1,4 +1,12 @@ -| source: https://snowballstem.org/algorithms/norwegian/stop.txt + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. 
Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + og | and i | in jeg | I @@ -9,7 +17,7 @@ et | a/an den | it/this/that til | to er | is/am/are -som | who/that +som | who/which/that på | on de | they / you(formal) med | with @@ -68,7 +76,6 @@ noen | some noe | some ville | would dere | you -som | who/which/that deres | their/theirs kun | only/just ja | yes @@ -113,7 +120,6 @@ mange | many også | also slik | just vært | been -være | to be båe | both * begge | both siden | since @@ -139,7 +145,6 @@ hennar | her/hers hennes | hers hoss | how * hossen | how * -ikkje | not * ingi | noone * inkje | noone * korleis | how * @@ -161,7 +166,6 @@ noka | some (fem.) * nokor | some * noko | some * nokre | some * -si | his/hers * sia | since * sidan | since * so | so * diff --git a/sphinx/search/_stopwords/pt.py b/sphinx/search/_stopwords/pt.py index b79799d42a6..17b7f8ec733 100644 --- a/sphinx/search/_stopwords/pt.py +++ b/sphinx/search/_stopwords/pt.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/portuguese/stop.txt + from __future__ import annotations PORTUGUESE_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/pt.txt b/sphinx/search/_stopwords/pt.txt index 5ef15633d81..9c3c9ac76d7 100644 --- a/sphinx/search/_stopwords/pt.txt +++ b/sphinx/search/_stopwords/pt.txt @@ -1,4 +1,13 @@ -| source: https://snowballstem.org/algorithms/portuguese/stop.txt + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ de | of, from a | the; to, at; her o | the; him diff --git a/sphinx/search/_stopwords/ru.py b/sphinx/search/_stopwords/ru.py index cc275d5184a..ccd0be5badd 100644 --- a/sphinx/search/_stopwords/ru.py +++ b/sphinx/search/_stopwords/ru.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/russian/stop.txt + from __future__ import annotations RUSSIAN_STOPWORDS = frozenset({ diff --git a/sphinx/search/_stopwords/ru.txt b/sphinx/search/_stopwords/ru.txt index 43a73af0b55..96abb77073e 100644 --- a/sphinx/search/_stopwords/ru.txt +++ b/sphinx/search/_stopwords/ru.txt @@ -1,4 +1,13 @@ -| source: https://snowballstem.org/algorithms/russian/stop.txt + + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + и | and в | in/into во | alternative form diff --git a/sphinx/search/_stopwords/sv.py b/sphinx/search/_stopwords/sv.py index c1f10635e0b..0c6f48d2703 100644 --- a/sphinx/search/_stopwords/sv.py +++ b/sphinx/search/_stopwords/sv.py @@ -1,3 +1,6 @@ +# automatically generated by utils/generate_snowball.py +# from https://snowballstem.org/algorithms/swedish/stop.txt + from __future__ import annotations SWEDISH_STOPWORDS = frozenset({ @@ -80,7 +83,7 @@ 'sig', 'sin', 'sina', - 'sitta', + 'sitt', 'själv', 'skulle', 'som', diff --git a/sphinx/search/_stopwords/sv.txt b/sphinx/search/_stopwords/sv.txt index 850ae7474d6..77924c68dfe 100644 --- a/sphinx/search/_stopwords/sv.txt +++ b/sphinx/search/_stopwords/sv.txt @@ -1,4 +1,13 @@ -| source: https://snowballstem.org/algorithms/swedish/stop.txt + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. 
+ + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + och | and det | it, this/that att | to (with infinitive) @@ -103,7 +112,7 @@ vilka | who, that ditt | thy vem | who vilket | who, that -sitta | his +sitt | his sådana | such a vart | each dina | thy diff --git a/tests/js/fixtures/cpp/searchindex.js b/tests/js/fixtures/cpp/searchindex.js index 81f14cc1895..6c50cc9d99d 100644 --- a/tests/js/fixtures/cpp/searchindex.js +++ b/tests/js/fixtures/cpp/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles":{},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{"sphinx (c++ class)":[[0,"_CPPv46Sphinx",false]]},"objects":{"":[[0,0,1,"_CPPv46Sphinx","Sphinx"]]},"objnames":{"0":["cpp","class","C++ class"]},"objtypes":{"0":"cpp:class"},"terms":{"The":0,"This":0,"becaus":0,"c":0,"can":0,"cardin":0,"challeng":0,"charact":0,"class":0,"descript":0,"drop":0,"engin":0,"fixtur":0,"frequent":0,"generat":0,"index":0,"inflat":0,"mathemat":0,"occur":0,"often":0,"project":0,"punctuat":0,"queri":0,"relat":0,"sampl":0,"search":0,"size":0,"sphinx":0,"term":0,"token":0,"use":0,"web":0,"would":0},"titles":["<no title>"],"titleterms":{}}) \ No newline at end of file +Search.setIndex({"alltitles":{},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{"sphinx (c++ 
class)":[[0,"_CPPv46Sphinx",false]]},"objects":{"":[[0,0,1,"_CPPv46Sphinx","Sphinx"]]},"objnames":{"0":["cpp","class","C++ class"]},"objtypes":{"0":"cpp:class"},"terms":{"The":0,"This":0,"becaus":0,"c":0,"can":0,"cardin":0,"challeng":0,"charact":0,"class":0,"descript":0,"drop":0,"engin":0,"fixtur":0,"frequent":0,"generat":0,"index":0,"inflat":0,"mathemat":0,"occur":0,"often":0,"project":0,"punctuat":0,"queri":0,"relat":0,"sampl":0,"search":0,"size":0,"sphinx":0,"term":0,"token":0,"use":0,"web":0},"titles":["<no title>"],"titleterms":{}}) \ No newline at end of file diff --git a/tests/js/fixtures/multiterm/searchindex.js b/tests/js/fixtures/multiterm/searchindex.js index 2f3f5ec39a1..a3a52b8cf14 100644 --- a/tests/js/fixtures/multiterm/searchindex.js +++ b/tests/js/fixtures/multiterm/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles":{"Main Page":[[0,null]]},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{},"objects":{},"objnames":{},"objtypes":{},"terms":{"At":0,"This":0,"adjac":0,"all":0,"an":0,"appear":0,"applic":0,"built":0,"can":0,"check":0,"contain":0,"do":0,"document":0,"doesn":0,"each":0,"fixtur":0,"format":0,"function":0,"futur":0,"html":0,"includ":0,"match":0,"messag":0,"multipl":0,"multiterm":0,"order":0,"other":0,"output":0,"perform":0,"perhap":0,"phrase":0,"project":0,"queri":0,"requir":0,"same":0,"search":0,"success":0,"support":0,"t":0,"term":0,"test":0,"time":0,"use":0,"when":0,"write":0},"titles":["Main Page"],"titleterms":{"main":0,"page":0}}) \ No newline at end of file +Search.setIndex({"alltitles":{"Main 
Page":[[0,null]]},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{},"objects":{},"objnames":{},"objtypes":{},"terms":{"At":0,"This":0,"adjac":0,"appear":0,"applic":0,"built":0,"can":0,"check":0,"contain":0,"document":0,"doesn":0,"fixtur":0,"format":0,"function":0,"futur":0,"html":0,"includ":0,"match":0,"messag":0,"multipl":0,"multiterm":0,"order":0,"output":0,"perform":0,"perhap":0,"phrase":0,"project":0,"queri":0,"requir":0,"search":0,"success":0,"support":0,"t":0,"term":0,"test":0,"time":0,"use":0,"will":0,"write":0},"titles":["Main Page"],"titleterms":{"main":0,"page":0}}) \ No newline at end of file diff --git a/tests/js/fixtures/partial/searchindex.js b/tests/js/fixtures/partial/searchindex.js index 5eb299eea63..02863d73d83 100644 --- a/tests/js/fixtures/partial/searchindex.js +++ b/tests/js/fixtures/partial/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles":{"sphinx_utils module":[[0,null]]},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{},"objects":{},"objnames":{},"objtypes":{},"terms":{"This":0,"both":0,"built":0,"confirm":0,"document":0,"function":0,"html":0,"includ":0,"input":0,"javascript":0,"match":0,"partial":0,"possibl":0,"project":0,"provid":0,"restructuredtext":0,"sampl":0,"search":0,"should":0,"term":0,"titl":0,"use":0,"when":0},"titles":["sphinx_utils module"],"titleterms":{"modul":0,"sphinx_util":0}}) \ No newline at end of file 
+Search.setIndex({"alltitles":{"sphinx_utils module":[[0,null]]},"docnames":["index"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst"],"indexentries":{},"objects":{},"objnames":{},"objtypes":{},"terms":{"This":0,"built":0,"confirm":0,"document":0,"function":0,"html":0,"includ":0,"input":0,"javascript":0,"match":0,"partial":0,"possibl":0,"project":0,"provid":0,"restructuredtext":0,"sampl":0,"search":0,"term":0,"titl":0,"use":0},"titles":["sphinx_utils module"],"titleterms":{"modul":0,"sphinx_util":0}}) \ No newline at end of file diff --git a/tests/js/fixtures/titles/searchindex.js b/tests/js/fixtures/titles/searchindex.js index fa59e11c884..9faeadf76c6 100644 --- a/tests/js/fixtures/titles/searchindex.js +++ b/tests/js/fixtures/titles/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles":{"Main Page":[[0,null]],"Relevance":[[0,"relevance"],[1,null]],"Result Scoring":[[0,"result-scoring"]]},"docnames":["index","relevance"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst","relevance.rst"],"indexentries":{"example (class in relevance)":[[0,"relevance.Example",false]],"module":[[0,"module-relevance",false]],"relevance":[[0,"index-1",false],[0,"module-relevance",false]],"relevance (relevance.example attribute)":[[0,"relevance.Example.relevance",false]],"scoring":[[0,"index-0",true]]},"objects":{"":[[0,0,0,"-","relevance"]],"relevance":[[0,1,1,"","Example"]],"relevance.Example":[[0,2,1,"","relevance"]]},"objnames":{"0":["py","module","Python 
module"],"1":["py","class","Python class"],"2":["py","attribute","Python attribute"]},"objtypes":{"0":"py:module","1":"py:class","2":"py:attribute"},"terms":{"A":1,"By":0,"For":[0,1],"In":[0,1],"This":0,"against":0,"align":0,"also":1,"an":0,"answer":0,"appear":1,"area":0,"ask":0,"assign":0,"attempt":0,"attribut":0,"both":0,"built":1,"can":[0,1],"class":0,"code":[0,1],"collect":0,"consid":1,"contain":0,"context":0,"corpus":1,"could":1,"demonstr":0,"describ":1,"detail":1,"determin":[0,1],"docstr":0,"document":[0,1],"domain":1,"dure":0,"engin":0,"evalu":0,"exampl":[0,1],"extract":0,"feedback":0,"find":0,"found":0,"from":0,"function":1,"handl":0,"happen":1,"has":1,"head":0,"help":0,"high":[0,1],"how":0,"improv":0,"inform":0,"intend":0,"issu":[0,1],"itself":1,"knowledg":0,"languag":1,"less":1,"like":[0,1],"mani":0,"match":0,"mention":1,"more":0,"name":[0,1],"numer":0,"object":0,"often":0,"one":[0,1],"onli":[0,1],"order":0,"other":0,"over":0,"page":1,"part":1,"particular":0,"present":0,"printf":1,"program":1,"project":0,"queri":[0,1],"question":0,"re":0,"relat":0,"research":0,"result":1,"retriev":0,"s":[0,1],"same":1,"say":0,"search":[0,1],"seem":0,"softwar":1,"some":1,"sphinx":0,"straightforward":1,"subject":0,"subsect":0,"term":[0,1],"test":0,"text":0,"than":[0,1],"them":0,"time":0,"titl":0,"two":0,"typic":0,"use":0,"user":[0,1],"we":[0,1],"when":0,"whether":1,"which":0,"within":0,"word":0,"would":[0,1]},"titles":["Main Page","Relevance"],"titleterms":{"main":0,"page":0,"relev":[0,1],"result":0,"score":0}}) \ No newline at end of file +Search.setIndex({"alltitles":{"Main Page":[[0,null]],"Relevance":[[0,"relevance"],[1,null]],"Result 
Scoring":[[0,"result-scoring"]]},"docnames":["index","relevance"],"envversion":{"sphinx":65,"sphinx.domains.c":3,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":9,"sphinx.domains.index":1,"sphinx.domains.javascript":3,"sphinx.domains.math":2,"sphinx.domains.python":4,"sphinx.domains.rst":2,"sphinx.domains.std":2},"filenames":["index.rst","relevance.rst"],"indexentries":{"example (class in relevance)":[[0,"relevance.Example",false]],"module":[[0,"module-relevance",false]],"relevance":[[0,"index-1",false],[0,"module-relevance",false]],"relevance (relevance.example attribute)":[[0,"relevance.Example.relevance",false]],"scoring":[[0,"index-0",true]]},"objects":{"":[[0,0,0,"-","relevance"]],"relevance":[[0,1,1,"","Example"]],"relevance.Example":[[0,2,1,"","relevance"]]},"objnames":{"0":["py","module","Python module"],"1":["py","class","Python class"],"2":["py","attribute","Python attribute"]},"objtypes":{"0":"py:module","1":"py:class","2":"py:attribute"},"terms":{"A":1,"By":0,"For":[0,1],"In":[0,1],"This":0,"align":0,"also":1,"answer":0,"appear":1,"area":0,"ask":0,"assign":0,"attempt":0,"attribut":0,"built":1,"can":[0,1],"class":0,"code":[0,1],"collect":0,"consid":1,"contain":0,"context":0,"corpus":1,"demonstr":0,"describ":1,"detail":1,"determin":[0,1],"docstr":0,"document":[0,1],"domain":1,"dure":0,"engin":0,"evalu":0,"exampl":[0,1],"extract":0,"feedback":0,"find":0,"found":0,"function":1,"handl":0,"happen":1,"head":0,"help":0,"high":[0,1],"improv":0,"inform":0,"intend":0,"issu":[0,1],"knowledg":0,"languag":1,"less":1,"like":[0,1],"mani":0,"match":0,"mention":1,"name":[0,1],"numer":0,"object":0,"often":0,"one":[0,1],"onli":[0,1],"order":0,"page":1,"part":1,"particular":0,"present":0,"printf":1,"program":1,"project":0,"queri":[0,1],"question":0,"re":0,"relat":0,"research":0,"result":1,"retriev":0,"s":[0,1],"say":0,"search":[0,1],"seem":0,"softwar":1,"sphinx":0,"straightforward":1,"subject":0,"subsect":0,"term":[0,1],"test":0,"text":0,"time":
0,"titl":0,"two":0,"typic":0,"use":0,"user":[0,1],"whether":1,"will":0,"within":0,"word":0},"titles":["Main Page","Relevance"],"titleterms":{"main":0,"page":0,"relev":[0,1],"result":0,"score":0}}) \ No newline at end of file diff --git a/utils/generate_snowball.py b/utils/generate_snowball.py new file mode 100755 index 00000000000..f59183d7f21 --- /dev/null +++ b/utils/generate_snowball.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +"""Refresh and update language stemming data from the Snowball project.""" + +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "requests>=2.30", +# ] +# /// + +from __future__ import annotations + +import hashlib +import shutil +import subprocess +import sys +import tarfile +import tempfile +from io import BytesIO +from pathlib import Path + +import requests + +SNOWBALL_VERSION = '3.0.1' +SNOWBALL_URL = f'https://github.com/snowballstem/snowball/archive/refs/tags/v{SNOWBALL_VERSION}.tar.gz' +SNOWBALL_SHA256 = '80ac10ce40dc4fcfbfed8d085c457b5613da0e86a73611a3d5527d044a142d60' + +ROOT = Path(__file__).resolve().parent.parent +SEARCH_DIR = ROOT / 'sphinx' / 'search' +STOPWORDS_DIR = SEARCH_DIR / '_stopwords' +NON_MINIFIED_JS_DIR = SEARCH_DIR / 'non-minified-js' + +STOPWORD_URLS = ( + ('da', 'danish', 'https://snowballstem.org/algorithms/danish/stop.txt'), + ('de', 'german', 'https://snowballstem.org/algorithms/german/stop.txt'), + ('en', 'english', 'https://snowballstem.org/algorithms/english/stop.txt'), + ('es', 'spanish', 'https://snowballstem.org/algorithms/spanish/stop.txt'), + ('fi', 'finnish', 'https://snowballstem.org/algorithms/finnish/stop.txt'), + ('fr', 'french', 'https://snowballstem.org/algorithms/french/stop.txt'), + ('hu', 'hungarian', 'https://snowballstem.org/algorithms/hungarian/stop.txt'), + ('it', 'italian', 'https://snowballstem.org/algorithms/italian/stop.txt'), + ('nl', 'dutch', 'https://snowballstem.org/algorithms/dutch/stop.txt'), + ('no', 'norwegian', 
# (language code, Snowball language name, URL of the upstream stop word list)
STOPWORD_URLS = (
    ('da', 'danish', 'https://snowballstem.org/algorithms/danish/stop.txt'),
    ('de', 'german', 'https://snowballstem.org/algorithms/german/stop.txt'),
    ('en', 'english', 'https://snowballstem.org/algorithms/english/stop.txt'),
    ('es', 'spanish', 'https://snowballstem.org/algorithms/spanish/stop.txt'),
    ('fi', 'finnish', 'https://snowballstem.org/algorithms/finnish/stop.txt'),
    ('fr', 'french', 'https://snowballstem.org/algorithms/french/stop.txt'),
    ('hu', 'hungarian', 'https://snowballstem.org/algorithms/hungarian/stop.txt'),
    ('it', 'italian', 'https://snowballstem.org/algorithms/italian/stop.txt'),
    ('nl', 'dutch', 'https://snowballstem.org/algorithms/dutch/stop.txt'),
    ('no', 'norwegian', 'https://snowballstem.org/algorithms/norwegian/stop.txt'),
    ('pt', 'portuguese', 'https://snowballstem.org/algorithms/portuguese/stop.txt'),
    # ('ro', 'romanian', ''),
    ('ru', 'russian', 'https://snowballstem.org/algorithms/russian/stop.txt'),
    ('sv', 'swedish', 'https://snowballstem.org/algorithms/swedish/stop.txt'),
    # ('tr', 'turkish', ''),
)


def regenerate_stopwords() -> None:
    """Download every stop word list and regenerate the stopwords package.

    For each entry in ``STOPWORD_URLS`` two files are written to
    ``STOPWORDS_DIR``: ``<code>.txt``, a copy of the upstream list, and
    ``<code>.py``, a module defining ``<LANGUAGE>_STOPWORDS``.

    :raises requests.HTTPError: if a download returns an error status.
    """
    STOPWORDS_DIR.mkdir(parents=True, exist_ok=True)
    STOPWORDS_DIR.joinpath('__init__.py').touch()

    for lang_code, lang_name, url in STOPWORD_URLS:
        response = requests.get(url, timeout=5)
        # Fail loudly rather than recording an HTTP error page as stop words.
        response.raise_for_status()
        data = response.content.decode('utf-8')

        # record the original source of the stopwords list
        txt_path = STOPWORDS_DIR / f'{lang_code}.txt'
        txt_path.write_text(data.rstrip() + '\n', encoding='utf-8')

        # generate the Python stopwords set
        module_source = _format_stopwords_module(
            lang_name, url, parse_stop_word(data)
        )
        py_path = STOPWORDS_DIR / f'{lang_code}.py'
        py_path.write_text(module_source, encoding='utf-8')


def _format_stopwords_module(
    lang_name: str, url: str, stopwords: frozenset[str]
) -> str:
    """Return the source text of the generated stopwords module."""
    constant = f'{lang_name.upper()}_STOPWORDS'
    lines = [
        '# automatically generated by utils/generate_snowball.py',
        f'# from {url}',
        '',
        'from __future__ import annotations',
        '',
    ]
    if stopwords:
        # Break case-insensitive ties on the word itself: set iteration order
        # is not stable between runs, and the output should be reproducible.
        ordered = sorted(stopwords, key=lambda word: (word.casefold(), word))
        lines.append(f'{constant} = frozenset({{')
        lines.extend(f'    {word!r},' for word in ordered)
        lines.append('})')
    else:
        lines.append(f'{constant} = frozenset()')
    return '\n'.join(lines) + '\n'


def parse_stop_word(source: str) -> frozenset[str]:
    """Collect the stopwords from a snowball style word list:

    .. code:: text

        list of space separated stop words | optional comment

    Everything from the first ``|`` on a line is discarded; blank and
    comment-only lines contribute nothing.

    :param source: The full text of the word list.
    :returns: The set of distinct stop words found in *source*.
    """
    stop_words: set[str] = set()
    for line in source.splitlines():
        words, _sep, _comment = line.partition('|')  # remove comment
        stop_words.update(words.split())
    return frozenset(stop_words)


def regenerate_javascript() -> None:
    """Rebuild the JavaScript stemmers from the pinned Snowball release.

    The release archive is downloaded, checked against ``SNOWBALL_SHA256``,
    built with ``make dist_libstemmer_js`` in a temporary directory, and the
    generated JavaScript is copied into ``NON_MINIFIED_JS_DIR``.

    :raises requests.HTTPError: if the download returns an error status.
    :raises RuntimeError: if the archive does not match the expected checksum.
    :raises subprocess.CalledProcessError: if the ``make`` invocation fails.
    """
    # Download and verify the snowball release
    response = requests.get(SNOWBALL_URL, timeout=60)
    response.raise_for_status()
    archive = response.content
    digest = hashlib.sha256(archive).hexdigest()
    if digest != SNOWBALL_SHA256:
        msg = (
            f'data does not match expected checksum '
            f'(expected {SNOWBALL_SHA256}, saw {digest}).'
        )
        raise RuntimeError(msg)

    # The context manager removes the whole build tree, even when extraction
    # or the build fails part-way through.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_root = Path(tmp_dir)

        # Extract the release archive
        with tarfile.open(fileobj=BytesIO(archive), mode='r:gz') as tar:
            tar.extractall(tmp_root, filter='data')
        snowball_root = tmp_root / f'snowball-{SNOWBALL_VERSION}'
        snowball_dist = snowball_root / 'dist'

        # Generate JS stemmer files
        cmd = ('make', '--jobs=8', 'dist_libstemmer_js')
        subprocess.run(cmd, check=True, cwd=snowball_root)
        js_archive = snowball_dist / f'jsstemmer-{SNOWBALL_VERSION}.tar.gz'
        with tarfile.open(js_archive) as tar:
            tar.extractall(snowball_dist, filter='data')

        # Copy generated JS to sphinx/search/
        NON_MINIFIED_JS_DIR.mkdir(exist_ok=True)
        js_dir = snowball_dist / f'jsstemmer-{SNOWBALL_VERSION}' / 'javascript'
        shutil.copytree(js_dir, NON_MINIFIED_JS_DIR, dirs_exist_ok=True)


if __name__ == '__main__':
    regenerate_stopwords()
    # The JavaScript build shells out to ``make`` and is skipped on Windows.
    if sys.platform != 'win32':
        regenerate_javascript()