Skip to content

Commit afd6c0b

Browse files
committed
Merged in shibu/sphinx/add_stemmer (pull request #214)
2 parents 85ddf03 + a130bb9 commit afd6c0b

23 files changed

+2920
-11
lines changed

CHANGES

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ New features
1818

1919
* Added ``sphinx.ext.napoleon`` extension for NumPy and Google style docstring
2020
support.
21+
* PR#214: Added stemming support for 14 languages, so that the built-in document
22+
search can now handle these. Thanks to Shibukawa Yoshiki.
2123
* PR#202: Allow "." and "~" prefixed references in ``:param:`` doc fields
2224
for Python.
2325
* PR#184: Add :confval:`autodoc_mock_imports`, allowing to mock imports of

doc/config.rst

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,10 +748,37 @@ that use Sphinx' HTMLWriter class.
748748

749749
Support is present for these languages:
750750

751+
* ``da`` -- Danish
752+
* ``nl`` -- Dutch
751753
* ``en`` -- English
754+
* ``fi`` -- Finnish
755+
* ``fr`` -- French
756+
* ``de`` -- German
757+
* ``hu`` -- Hungarian
758+
* ``it`` -- Italian
752759
* ``ja`` -- Japanese
760+
* ``no`` -- Norwegian
761+
* ``pt`` -- Portuguese
762+
* ``ro`` -- Romanian
763+
* ``ru`` -- Russian
764+
* ``es`` -- Spanish
765+
* ``sv`` -- Swedish
766+
* ``tr`` -- Turkish
767+
768+
.. admonition:: Accelerate build speed
769+
770+
Each language (except Japanese) provides its own stemming algorithm.
771+
Sphinx uses a Python implementation by default. You can use a C
772+
implementation to accelerate building the index file.
773+
774+
* `PorterStemmer <https://pypi.python.org/pypi/PorterStemmer>`_ (``en``)
775+
* `PyStemmer <https://pypi.python.org/pypi/PyStemmer>`_ (all languages)
753776

754777
.. versionadded:: 1.1
778+
With support for ``en`` and ``ja``.
779+
780+
.. versionchanged:: 1.3
781+
Added additional languages.
755782

756783
.. confval:: html_search_options
757784

doc/devguide.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,3 +243,15 @@ Debugging Tips
243243

244244
* Set the debugging options in the `Docutils configuration file
245245
<http://docutils.sourceforge.net/docs/user/config.html>`_.
246+
247+
* JavaScript stemming algorithms in `sphinx/search/*.py` (except `en.py`) are
248+
generated by
249+
a `modified snowball code generator <https://github.com/shibukawa/snowball>`_.
250+
Generated `JSX <http://jsx.github.io/>`_ files are
251+
in `this repository <https://github.com/shibukawa/snowball-stemmer.jsx>`_.
252+
You can get the resulting JavaScript files by running the following commands:
253+
254+
.. code-block:: bash
255+
256+
$ npm install
257+
$ node_modules/.bin/grunt build # -> dest/*.global.js

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
print('ERROR: Sphinx requires at least Python 2.6 or 3.2 to run.')
4646
sys.exit(1)
4747

48-
requires = ['Pygments>=1.2', 'docutils>=0.10']
48+
requires = ['Pygments>=1.2', 'docutils>=0.10', 'snowballstemmer>=1.1']
4949

5050
if (3, 0) <= sys.version_info < (3, 3):
5151
requires.append('Jinja2>=2.3,<2.7')

sphinx/builders/html.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
from sphinx.util.nodes import inline_all_toctrees
3333
from sphinx.util.matching import patmatch, compile_matchers
3434
from sphinx.util.pycompat import b
35-
from sphinx.errors import SphinxError
3635
from sphinx.locale import _
3736
from sphinx.search import js_index
3837
from sphinx.theming import Theme
@@ -818,7 +817,7 @@ def dump_inventory(self):
818817
self.info('done')
819818

820819
def dump_search_index(self):
821-
self.info(bold('dumping search index... '), nonl=True)
820+
self.info(bold('dumping search index in %s ... ' % self.indexer.label()), nonl=True)
822821
self.indexer.prune(self.env.all_docs)
823822
searchindexfn = path.join(self.outdir, self.searchindex_filename)
824823
# first write to a temporary file, so that if dumping fails,

sphinx/quickstart.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,23 @@
231231
# This is the file name suffix for HTML files (e.g. ".xhtml").
232232
#html_file_suffix = None
233233
234+
# Language to be used for generating the HTML full-text search index.
235+
# Sphinx supports the following languages:
236+
# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja',
237+
# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
238+
#html_search_language = 'en'
239+
240+
# A dictionary with options for the search language support, empty by default.
241+
# Now only 'ja' uses this config value
242+
#html_search_options = {'type': 'default'}
243+
244+
# The name of a javascript file (relative to the configuration directory) that
245+
# implements a search results scorer. If empty, the default will be used.
246+
#html_search_scorer = 'scorer.js'
247+
234248
# Output file base name for HTML help builder.
235249
htmlhelp_basename = '%(project_fn)sdoc'
236250
237-
238251
# -- Options for LaTeX output ---------------------------------------------
239252
240253
latex_elements = {

sphinx/search/__init__.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
:license: BSD, see LICENSE for details.
1010
"""
1111
from __future__ import with_statement
12+
1213
import re
1314
import cPickle as pickle
1415

1516
from docutils.nodes import raw, comment, title, Text, NodeVisitor, SkipNode
1617

1718
from sphinx.util import jsdump, rpartition
19+
from sphinx.search.en import SearchEnglish
1820

1921

2022
class SearchLanguage(object):
@@ -40,6 +42,7 @@ class SearchLanguage(object):
4042
type, before searching index. Default implementation does nothing.
4143
"""
4244
lang = None
45+
language_name = None
4346
stopwords = set()
4447
js_stemmer_code = """
4548
/**
@@ -89,16 +92,42 @@ def word_filter(self, word):
8992
Return true if the target word should be registered in the search index.
9093
This method is called after stemming.
9194
"""
92-
return not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
95+
return len(word) == 0 or not (((len(word) < 3) and (12353 < ord(word[0]) < 12436)) or
9396
(ord(word[0]) < 256 and (len(word) < 3 or word in self.stopwords or
9497
word.isdigit())))
9598

9699

97-
from sphinx.search import en, ja
100+
def parse_stop_word(source):
101+
"""
102+
parse snowball style word list like this:
103+
104+
* http://snowball.tartarus.org/algorithms/finnish/stop.txt
105+
"""
106+
result = set()
107+
for line in source.splitlines():
108+
line = line.split('|')[0] # remove comment
109+
result.update(line.split())
110+
return result
98111

112+
113+
# maps language name to module.class or directly a class
99114
languages = {
100-
'en': en.SearchEnglish,
101-
'ja': ja.SearchJapanese,
115+
'da': 'sphinx.search.da.SearchDanish',
116+
'de': 'sphinx.search.de.SearchGerman',
117+
'en': SearchEnglish,
118+
'es': 'sphinx.search.es.SearchSpanish',
119+
'fi': 'sphinx.search.fi.SearchFinnish',
120+
'fr': 'sphinx.search.fr.SearchFrench',
121+
'hu': 'sphinx.search.hu.SearchHungarian',
122+
'it': 'sphinx.search.it.SearchItalian',
123+
'ja': 'sphinx.search.ja.SearchJapanese',
124+
'nl': 'sphinx.search.nl.SearchDutch',
125+
'no': 'sphinx.search.no.SearchNorwegian',
126+
'pt': 'sphinx.search.pt.SearchPortuguese',
127+
'ro': 'sphinx.search.ro.SearchRomanian',
128+
'ru': 'sphinx.search.ru.SearchRussian',
129+
'sv': 'sphinx.search.sv.SearchSwedish',
130+
'tr': 'sphinx.search.tr.SearchTurkish',
102131
}
103132

104133

@@ -185,7 +214,17 @@ def __init__(self, env, lang, options, scoring):
185214
# objtype index -> (domain, type, objname (localized))
186215
self._objnames = {}
187216
# add language-specific SearchLanguage instance
188-
self.lang = languages[lang](options)
217+
lang_class = languages.get(lang)
218+
if lang_class is None:
219+
self.lang = SearchEnglish(options)
220+
elif isinstance(lang_class, str):
221+
module, classname = lang_class.rsplit('.', 1)
222+
lang_class = getattr(__import__(module, None, None, [classname]),
223+
classname)
224+
self.lang = lang_class(options)
225+
else:
226+
# it's directly a class (e.g. added by app.add_search_language)
227+
self.lang = lang_class(options)
189228

190229
if scoring:
191230
with open(scoring, 'rb') as fp:
@@ -286,6 +325,9 @@ def freeze(self):
286325
objects=objects, objtypes=objtypes, objnames=objnames,
287326
titleterms=title_terms, envversion=self.env.version)
288327

328+
def label(self):
329+
return "%s (code: %s)" % (self.lang.language_name, self.lang.lang)
330+
289331
def prune(self, filenames):
290332
"""Remove data for all filenames not in the list."""
291333
new_titles = {}

0 commit comments

Comments
 (0)