Skip to content

Commit 954839a

Browse files
authored
Use the more modern English stemmer (#13574)
The 'Porter' stemmer is considered frozen, so the English search language now uses the Snowball 'english' stemmer instead.
1 parent 4532958 commit 954839a

File tree

21 files changed, with 26 additions (+26) and 402 deletions (-402).

sphinx/search/da.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Danish search language: includes the JS Danish stemmer."""
1+
"""Danish search language."""
22

33
from __future__ import annotations
44

sphinx/search/de.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""German search language: includes the JS German stemmer."""
1+
"""German search language."""
22

33
from __future__ import annotations
44

sphinx/search/en.py

Lines changed: 3 additions & 189 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""English search language: includes the JS porter stemmer."""
1+
"""English search language."""
22

33
from __future__ import annotations
44

@@ -7,202 +7,16 @@
77
from sphinx.search import SearchLanguage
88
from sphinx.search._stopwords.en import ENGLISH_STOPWORDS
99

10-
js_porter_stemmer = """
11-
/**
12-
* Porter Stemmer
13-
*/
14-
var Stemmer = function() {
15-
16-
var step2list = {
17-
ational: 'ate',
18-
tional: 'tion',
19-
enci: 'ence',
20-
anci: 'ance',
21-
izer: 'ize',
22-
bli: 'ble',
23-
alli: 'al',
24-
entli: 'ent',
25-
eli: 'e',
26-
ousli: 'ous',
27-
ization: 'ize',
28-
ation: 'ate',
29-
ator: 'ate',
30-
alism: 'al',
31-
iveness: 'ive',
32-
fulness: 'ful',
33-
ousness: 'ous',
34-
aliti: 'al',
35-
iviti: 'ive',
36-
biliti: 'ble',
37-
logi: 'log'
38-
};
39-
40-
var step3list = {
41-
icate: 'ic',
42-
ative: '',
43-
alize: 'al',
44-
iciti: 'ic',
45-
ical: 'ic',
46-
ful: '',
47-
ness: ''
48-
};
49-
50-
var c = "[^aeiou]"; // consonant
51-
var v = "[aeiouy]"; // vowel
52-
var C = c + "[^aeiouy]*"; // consonant sequence
53-
var V = v + "[aeiou]*"; // vowel sequence
54-
55-
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
56-
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
57-
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
58-
var s_v = "^(" + C + ")?" + v; // vowel in stem
59-
60-
this.stemWord = function (w) {
61-
var stem;
62-
var suffix;
63-
var firstch;
64-
var origword = w;
65-
66-
if (w.length < 3)
67-
return w;
68-
69-
var re;
70-
var re2;
71-
var re3;
72-
var re4;
73-
74-
firstch = w.substr(0,1);
75-
if (firstch == "y")
76-
w = firstch.toUpperCase() + w.substr(1);
77-
78-
// Step 1a
79-
re = /^(.+?)(ss|i)es$/;
80-
re2 = /^(.+?)([^s])s$/;
81-
82-
if (re.test(w))
83-
w = w.replace(re,"$1$2");
84-
else if (re2.test(w))
85-
w = w.replace(re2,"$1$2");
86-
87-
// Step 1b
88-
re = /^(.+?)eed$/;
89-
re2 = /^(.+?)(ed|ing)$/;
90-
if (re.test(w)) {
91-
var fp = re.exec(w);
92-
re = new RegExp(mgr0);
93-
if (re.test(fp[1])) {
94-
re = /.$/;
95-
w = w.replace(re,"");
96-
}
97-
}
98-
else if (re2.test(w)) {
99-
var fp = re2.exec(w);
100-
stem = fp[1];
101-
re2 = new RegExp(s_v);
102-
if (re2.test(stem)) {
103-
w = stem;
104-
re2 = /(at|bl|iz)$/;
105-
re3 = new RegExp("([^aeiouylsz])\\\\1$");
106-
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
107-
if (re2.test(w))
108-
w = w + "e";
109-
else if (re3.test(w)) {
110-
re = /.$/;
111-
w = w.replace(re,"");
112-
}
113-
else if (re4.test(w))
114-
w = w + "e";
115-
}
116-
}
117-
118-
// Step 1c
119-
re = /^(.+?)y$/;
120-
if (re.test(w)) {
121-
var fp = re.exec(w);
122-
stem = fp[1];
123-
re = new RegExp(s_v);
124-
if (re.test(stem))
125-
w = stem + "i";
126-
}
127-
128-
// Step 2
129-
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
130-
ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
131-
if (re.test(w)) {
132-
var fp = re.exec(w);
133-
stem = fp[1];
134-
suffix = fp[2];
135-
re = new RegExp(mgr0);
136-
if (re.test(stem))
137-
w = stem + step2list[suffix];
138-
}
139-
140-
// Step 3
141-
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
142-
if (re.test(w)) {
143-
var fp = re.exec(w);
144-
stem = fp[1];
145-
suffix = fp[2];
146-
re = new RegExp(mgr0);
147-
if (re.test(stem))
148-
w = stem + step3list[suffix];
149-
}
150-
151-
// Step 4
152-
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
153-
iti|ous|ive|ize)$/;
154-
re2 = /^(.+?)(s|t)(ion)$/;
155-
if (re.test(w)) {
156-
var fp = re.exec(w);
157-
stem = fp[1];
158-
re = new RegExp(mgr1);
159-
if (re.test(stem))
160-
w = stem;
161-
}
162-
else if (re2.test(w)) {
163-
var fp = re2.exec(w);
164-
stem = fp[1] + fp[2];
165-
re2 = new RegExp(mgr1);
166-
if (re2.test(stem))
167-
w = stem;
168-
}
169-
170-
// Step 5
171-
re = /^(.+?)e$/;
172-
if (re.test(w)) {
173-
var fp = re.exec(w);
174-
stem = fp[1];
175-
re = new RegExp(mgr1);
176-
re2 = new RegExp(meq1);
177-
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
178-
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
179-
w = stem;
180-
}
181-
re = /ll$/;
182-
re2 = new RegExp(mgr1);
183-
if (re.test(w) && re2.test(w)) {
184-
re = /.$/;
185-
w = w.replace(re,"");
186-
}
187-
188-
// and turn initial Y back to y
189-
if (firstch == "y")
190-
w = firstch.toLowerCase() + w.substr(1);
191-
return w;
192-
}
193-
}
194-
"""
195-
19610

19711
class SearchEnglish(SearchLanguage):
19812
lang = 'en'
19913
language_name = 'English'
200-
js_stemmer_code = js_porter_stemmer
14+
js_stemmer_rawcode = 'english-stemmer.js'
20115
stopwords = ENGLISH_STOPWORDS
20216

20317
def __init__(self, options: dict[str, str]) -> None:
20418
super().__init__(options)
205-
self.stemmer = snowballstemmer.stemmer('porter')
19+
self.stemmer = snowballstemmer.stemmer('english')
20620

20721
def stem(self, word: str) -> str:
20822
return self.stemmer.stemWord(word.lower())

sphinx/search/es.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Spanish search language: includes the JS Spanish stemmer."""
1+
"""Spanish search language."""
22

33
from __future__ import annotations
44

sphinx/search/fi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Finnish search language: includes the JS Finnish stemmer."""
1+
"""Finnish search language."""
22

33
from __future__ import annotations
44

sphinx/search/fr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""French search language: includes the JS French stemmer."""
1+
"""French search language."""
22

33
from __future__ import annotations
44

sphinx/search/hu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Hungarian search language: includes the JS Hungarian stemmer."""
1+
"""Hungarian search language."""
22

33
from __future__ import annotations
44

sphinx/search/it.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Italian search language: includes the JS Italian stemmer."""
1+
"""Italian search language."""
22

33
from __future__ import annotations
44

sphinx/search/nl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Dutch search language: includes the JS porter stemmer."""
1+
"""Dutch search language."""
22

33
from __future__ import annotations
44

sphinx/search/no.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Norwegian search language: includes the JS Norwegian stemmer."""
1+
"""Norwegian search language."""
22

33
from __future__ import annotations
44

sphinx/search/pt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Portuguese search language: includes the JS Portuguese stemmer."""
1+
"""Portuguese search language."""
22

33
from __future__ import annotations
44

sphinx/search/ro.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Romanian search language: includes the JS Romanian stemmer."""
1+
"""Romanian search language."""
22

33
from __future__ import annotations
44

sphinx/search/ru.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Russian search language: includes the JS Russian stemmer."""
1+
"""Russian search language."""
22

33
from __future__ import annotations
44

sphinx/search/sv.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Swedish search language: includes the JS Swedish stemmer."""
1+
"""Swedish search language."""
22

33
from __future__ import annotations
44

sphinx/search/tr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Turkish search language: includes the JS Turkish stemmer."""
1+
"""Turkish search language."""
22

33
from __future__ import annotations
44

0 commit comments

Comments
 (0)