Skip to content

Use the more modern English stemmer #13574

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sphinx/search/da.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Danish search language: includes the JS Danish stemmer."""
"""Danish search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/de.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""German search language: includes the JS German stemmer."""
"""German search language."""

from __future__ import annotations

Expand Down
192 changes: 3 additions & 189 deletions sphinx/search/en.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""English search language: includes the JS porter stemmer."""
"""English search language."""

from __future__ import annotations

Expand All @@ -7,202 +7,16 @@
from sphinx.search import SearchLanguage
from sphinx.search._stopwords.en import ENGLISH_STOPWORDS

# JavaScript source of a Porter stemmer, held as a Python string.  The JS
# defines a ``Stemmer`` constructor whose instances expose ``stemWord(w)``.
#
# NOTE: this is a regular (non-raw) Python string literal, so Python processes
# the escapes before the text is ever seen by a JavaScript engine:
#   * ``\\\\1`` is emitted as ``\\1`` in the JS source, which the JS string
#     literal in turn evaluates to the regex back-reference ``\1``;
#   * a backslash at the very end of a line (inside the Step 2 and Step 4
#     regex literals) is a Python line continuation, so each of those regexes
#     is emitted on a single line with no embedded newline.
js_porter_stemmer = """
/**
* Porter Stemmer
*/
var Stemmer = function() {

var step2list = {
ational: 'ate',
tional: 'tion',
enci: 'ence',
anci: 'ance',
izer: 'ize',
bli: 'ble',
alli: 'al',
entli: 'ent',
eli: 'e',
ousli: 'ous',
ization: 'ize',
ation: 'ate',
ator: 'ate',
alism: 'al',
iveness: 'ive',
fulness: 'ful',
ousness: 'ous',
aliti: 'al',
iviti: 'ive',
biliti: 'ble',
logi: 'log'
};

var step3list = {
icate: 'ic',
ative: '',
alize: 'al',
iciti: 'ic',
ical: 'ic',
ful: '',
ness: ''
};

var c = "[^aeiou]"; // consonant
var v = "[aeiouy]"; // vowel
var C = c + "[^aeiouy]*"; // consonant sequence
var V = v + "[aeiou]*"; // vowel sequence

var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
var s_v = "^(" + C + ")?" + v; // vowel in stem

this.stemWord = function (w) {
var stem;
var suffix;
var firstch;
var origword = w;

if (w.length < 3)
return w;

var re;
var re2;
var re3;
var re4;

firstch = w.substr(0,1);
if (firstch == "y")
w = firstch.toUpperCase() + w.substr(1);

// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;

if (re.test(w))
w = w.replace(re,"$1$2");
else if (re2.test(w))
w = w.replace(re2,"$1$2");

// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w))
w = w + "e";
else if (re3.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
else if (re4.test(w))
w = w + "e";
}
}

// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem))
w = stem + "i";
}

// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step2list[suffix];
}

// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step3list[suffix];
}

// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem))
w = stem;
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem))
w = stem;
}

// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
w = stem;
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}

// and turn initial Y back to y
if (firstch == "y")
w = firstch.toLowerCase() + w.substr(1);
return w;
}
}
"""


class SearchEnglish(SearchLanguage):
    """English search language using the Snowball ``english`` stemmer."""

    lang = 'en'
    language_name = 'English'
    # File name of the JavaScript stemmer used on the client side.  It names
    # the same algorithm ("english") as the Python stemmer created in
    # ``__init__`` so that index-time and query-time stemming agree.
    # NOTE(review): how this name is resolved to a file is decided by
    # ``SearchLanguage`` / the search builder, not visible here -- confirm.
    js_stemmer_rawcode = 'english-stemmer.js'
    stopwords = ENGLISH_STOPWORDS

    def __init__(self, options: dict[str, str]) -> None:
        """Initialise the base language and create the English stemmer.

        :param options: forwarded unchanged to ``SearchLanguage.__init__``.
        """
        super().__init__(options)
        # A single stemmer instance; the legacy 'porter' stemmer that used to
        # be created first was immediately overwritten and has been dropped.
        self.stemmer = snowballstemmer.stemmer('english')

    def stem(self, word: str) -> str:
        """Return the stem of *word*; the word is lower-cased first."""
        return self.stemmer.stemWord(word.lower())
2 changes: 1 addition & 1 deletion sphinx/search/es.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Spanish search language: includes the JS Spanish stemmer."""
"""Spanish search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/fi.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Finnish search language: includes the JS Finnish stemmer."""
"""Finnish search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/fr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""French search language: includes the JS French stemmer."""
"""French search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/hu.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Hungarian search language: includes the JS Hungarian stemmer."""
"""Hungarian search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/it.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Italian search language: includes the JS Italian stemmer."""
"""Italian search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/nl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Dutch search language: includes the JS porter stemmer."""
"""Dutch search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/no.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Norwegian search language: includes the JS Norwegian stemmer."""
"""Norwegian search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/pt.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Portuguese search language: includes the JS Portuguese stemmer."""
"""Portuguese search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/ro.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Romanian search language: includes the JS Romanian stemmer."""
"""Romanian search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/ru.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Russian search language: includes the JS Russian stemmer."""
"""Russian search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/sv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Swedish search language: includes the JS Swedish stemmer."""
"""Swedish search language."""

from __future__ import annotations

Expand Down
2 changes: 1 addition & 1 deletion sphinx/search/tr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Turkish search language: includes the JS Turkish stemmer."""
"""Turkish search language."""

from __future__ import annotations

Expand Down
Loading
Loading