Skip to content

Commit 3a1367b

Browse files
committed
feat: Implement spell check plugin
1 parent 2dc1633 commit 3a1367b

File tree

4 files changed

+327
-0
lines changed

4 files changed

+327
-0
lines changed

src/mkdocs_spellcheck/loggers.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Logging functions."""
2+
3+
from __future__ import annotations
4+
5+
import logging
6+
from typing import Any
7+
8+
from mkdocs.utils import warning_filter
9+
10+
11+
class LoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that puts a fixed prefix in front of each message."""

    def __init__(self, prefix: str, logger):
        """Wrap a logger and remember the prefix.

        Arguments:
            prefix: The string to insert in front of every message.
            logger: The logger instance to wrap.
        """
        self.prefix = prefix
        # No extra contextual data is attached, hence the empty mapping.
        super().__init__(logger, {})

    def process(self, msg: str, kwargs) -> tuple[str, Any]:
        """Prepend the prefix to a message.

        Arguments:
            msg: The message.
            kwargs: Remaining arguments, handed back untouched.

        Returns:
            The prefixed message and the unchanged keyword arguments.
        """
        prefixed = f"{self.prefix}: {msg}"
        return prefixed, kwargs
35+
36+
37+
def get_logger(name: str) -> LoggerAdapter:
    """Build a prefixed logger living under the `mkdocs.plugins` namespace.

    Arguments:
        name: The name appended to `mkdocs.plugins.` for `logging.getLogger`;
            it is also used as the message prefix.

    Returns:
        A logger adapter with the MkDocs warning filter attached to its logger.
    """
    base_logger = logging.getLogger(f"mkdocs.plugins.{name}")
    base_logger.addFilter(warning_filter)
    adapter = LoggerAdapter(name, base_logger)
    return adapter

src/mkdocs_spellcheck/plugin.py

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""
2+
MkDocs SpellCheck package.
3+
4+
A spell checker plugin for MkDocs.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from importlib import resources
10+
from pathlib import Path
11+
12+
from mkdocs.config import Config
13+
from mkdocs.config.config_options import Type as MkType
14+
from mkdocs.plugins import BasePlugin
15+
from mkdocs.structure.pages import Page
16+
from symspellpy import SymSpell, Verbosity
17+
18+
from mkdocs_spellcheck.loggers import get_logger
19+
from mkdocs_spellcheck.words import get_words
20+
21+
# Module-level logger, named after this module and obtained through `get_logger`.
logger = get_logger(__name__)
22+
23+
24+
class SpellCheckPlugin(BasePlugin):
    """A `mkdocs` plugin that logs a warning for each probable misspelling.

    This plugin defines the following event hooks:

    - `on_config`
    - `on_page_content`

    Check the [Developing Plugins](https://www.mkdocs.org/user-guide/plugins/#developing-plugins) page of `mkdocs`
    for more information about its plugin system.
    """

    config_scheme: tuple[tuple[str, MkType], ...] = (
        # Either a list of words, or the path (relative to `docs_dir`) of a file with one word per line.
        ("known_words", MkType((str, list), default=[])),
        ("skip_files", MkType(list, default=[])),
        ("min_length", MkType(int, default=2)),
        ("ignore_code", MkType(bool, default=True)),
        ("allow_unicode", MkType(bool, default=False)),
    )

    def __init__(self) -> None:  # noqa: D107
        self.known_words: set[str] = set()
        # Stays `None` until `on_config` has loaded the dictionary.
        self.spell: SymSpell | None = None
        super().__init__()

    def on_config(self, config: Config, **kwargs) -> Config:
        """Load the plugin options, the words to ignore and the dictionary.

        Hook for the [`on_config` event](https://www.mkdocs.org/user-guide/plugins/#on_config).

        Arguments:
            config: The MkDocs config object.
            kwargs: Additional arguments passed by MkDocs.

        Returns:
            The config, returned as received.
        """
        self.skip_files = self.config["skip_files"]
        self.min_length = self.config["min_length"]
        self.ignore_code = self.config["ignore_code"]
        self.allow_unicode = self.config["allow_unicode"]

        known_words = self.config["known_words"]
        if isinstance(known_words, str):
            # A string is a file path relative to the docs directory. Read it as
            # UTF-8 explicitly so the result does not depend on the platform locale.
            known_words = Path(config["docs_dir"], known_words).read_text(encoding="utf-8").splitlines()

        # `get_words` returns lower-cased words, so known words must be lower-cased
        # too or entries such as "MkDocs" would never match. Blank entries are dropped.
        stripped = (str(word).strip() for word in known_words)
        self.known_words |= {word.lower() for word in stripped if word}

        self.spell = SymSpell()
        with resources.path("symspellpy", "frequency_dictionary_en_82_765.txt") as dictionary_path:
            # Presumably 0 and 1 are the term and count column indexes -- confirm against symspellpy.
            self.spell.load_dictionary(dictionary_path, 0, 1)
        return config

    def on_page_content(self, html: str, page: Page, **kwargs) -> None:
        """Spell check the words of a rendered page and log the suspicious ones.

        Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).

        Arguments:
            html: The HTML text.
            page: The page instance.
            kwargs: Additional arguments passed by MkDocs.
        """
        if page.file.src_path in self.skip_files:
            return

        words = get_words(html, self.known_words, self.min_length, self.ignore_code, self.allow_unicode)
        for word in words:
            suggestions = self.spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            # The word itself is removed from the suggestions; a warning is only
            # emitted when at least one *different* candidate remains.
            candidates = "', '".join(suggestion.term for suggestion in suggestions if suggestion.term != word)
            if candidates:
                logger.warning(f"{page.file.src_path}: Misspelled '{word}', did you mean '{candidates}'?")

src/mkdocs_spellcheck/words.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""This module contains a function to retrieve words from HTML text."""
2+
3+
from __future__ import annotations
4+
5+
import re
6+
import unicodedata
7+
from functools import partial
8+
from html.parser import HTMLParser
9+
from io import StringIO
10+
11+
12+
class _MLStripper(HTMLParser):
    """HTML parser that accumulates text content, optionally skipping `<code>` elements."""

    def __init__(self, ignore_code=True):
        """Initialize the parser.

        Arguments:
            ignore_code: Whether to discard text found inside `<code>` elements.
        """
        # `HTMLParser.__init__` already calls `reset()`; character references
        # are converted so entities reach `handle_data` as plain text.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()
        self.ignore_code = ignore_code
        self.in_code_tag = False

    def handle_starttag(self, tag, attrs):
        """Remember that a `<code>` element was opened."""
        if tag == "code":
            self.in_code_tag = True

    def handle_endtag(self, tag):
        """Remember that a `<code>` element was closed.

        This must be named `handle_endtag` (taking only the tag name) for
        `HTMLParser` to call it; otherwise the flag is never reset and all
        text following the first `<code>` element is dropped.
        """
        if tag == "code":
            self.in_code_tag = False

    def handle_data(self, data):
        """Store text, unless it is inside an ignored `<code>` element."""
        if not (self.ignore_code and self.in_code_tag):
            self.text.write(data)

    def get_data(self):
        """Return all the text collected so far."""
        return self.text.getvalue()
36+
37+
38+
def _strip_tags(html, ignore_code):
    """Return the text content of some HTML, without the markup.

    Arguments:
        html: The HTML text.
        ignore_code: Whether to drop the text found inside `<code>` elements.

    Returns:
        The concatenated text nodes.
    """
    stripper = _MLStripper(ignore_code)
    stripper.feed(html)
    # `feed` may hold back trailing text (for example text ending with a bare
    # `&`, which could be a truncated character reference). Closing the parser
    # forces that buffered data through `handle_data`.
    stripper.close()
    return stripper.get_data()
42+
43+
44+
# Any character that is not a word character, whitespace or a dash.
not_letters_nor_spaces: re.Pattern[str] = re.compile(r"[^\w\s-]")
# One or more consecutive dashes and/or whitespace characters.
dashes_or_spaces: re.Pattern[str] = re.compile(r"[-\s]+")
# One or more consecutive ASCII digits.
number: re.Pattern[str] = re.compile("[0-9]+")
47+
48+
49+
def _slugify(value, allow_unicode=False):
    """Turn a value into a lower-case, dash-separated string.

    Arguments:
        value: The value to convert; it is passed through `str` first.
        allow_unicode: Whether to keep non-ASCII characters (NFKC normalization)
            or to drop them after NFKD decomposition.

    Returns:
        The text with punctuation replaced by spaces, runs of dashes and
        whitespace collapsed to one dash, and leading/trailing `-`/`_` removed.
    """
    text = unicodedata.normalize("NFKC" if allow_unicode else "NFKD", str(value))
    if not allow_unicode:
        # Anything that cannot be encoded as ASCII is silently dropped.
        text = text.encode("ascii", "ignore").decode("ascii")
    spaced = not_letters_nor_spaces.sub(" ", text.lower())
    dashed = dashes_or_spaces.sub("-", spaced)
    return dashed.strip("-_")
57+
58+
59+
def _keep_word(word, min_length):
    """Tell whether a word is worth spell checking.

    Arguments:
        word: The candidate word.
        min_length: The minimum number of characters a word must have.

    Returns:
        False for all-digit words and for words shorter than `min_length`,
        True otherwise.
    """
    if word.isdigit():
        return False
    return len(word) >= min_length
61+
62+
63+
def get_words(
    html: str,
    known_words: set[str] | None = None,
    min_length: int = 2,
    ignore_code: bool = True,
    allow_unicode: bool = True,
) -> list[str]:
    """Get words in HTML text.

    Parameters:
        html: The HTML text.
        known_words: Words to exclude. Compared against the lower-cased words
            extracted from the text.
        min_length: Words minimum length.
        ignore_code: Ignore words in code tags.
        allow_unicode: Keep unicode characters.

    Returns:
        A sorted list of unique, lower-cased words.
    """
    text = _strip_tags(html, ignore_code)
    candidates = {
        word
        for word in _slugify(text, allow_unicode).split("-")
        # Splitting an empty slug yields "", which must never be reported as
        # a word, even when `min_length` is 0.
        if word and _keep_word(word, min_length)
    }
    # `difference` accepts any iterable, so lists or tuples of known words
    # work as well as sets.
    return sorted(candidates.difference(known_words or ()))

tests/test_words.py

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""Tests for the `words` module."""
2+
3+
import pytest
4+
5+
from mkdocs_spellcheck.words import get_words
6+
7+
8+
@pytest.mark.parametrize("tag", ["p", "em", "div", "article"])
def test_remove_tags(tag: str):
    """Check that tag names do not show up among the extracted words.

    Parameters:
        tag: Some HTML tag (parametrized).
    """
    markup = f"<{tag}>Some text.</{tag}><br><hr/>"
    extracted = get_words(markup, min_length=1)
    assert tag not in extracted
18+
19+
20+
def test_remove_single_tags():
    """Check that void tags such as `br` and `img` are not extracted as words."""
    markup = "Some text.<br><br/><br /><img /></br>"
    extracted = get_words(markup, min_length=1)
    for tag_name in ("br", "img"):
        assert tag_name not in extracted
26+
27+
28+
@pytest.mark.parametrize(
    ("text", "known_words", "expected"),
    [
        # `set()` rather than `{}`: the latter is an empty dict, not an empty set.
        ("hello", set(), ["hello"]),
        ("hello", {"hello"}, []),
        ("hello", {"world"}, ["hello"]),
    ],
)
def test_ignore_known_words(text, known_words, expected):
    """Assert known words are correctly removed.

    Parameters:
        text: Some text (parametrized).
        known_words: Some known words (parametrized).
        expected: Expected list result (parametrized).
    """
    assert get_words(text, known_words=known_words) == expected
45+
46+
47+
@pytest.mark.parametrize(
    ("text", "min_length", "expected"),
    [
        ("a bb ccc", 0, ["a", "bb", "ccc"]),
        ("a bb ccc", 1, ["a", "bb", "ccc"]),
        ("a bb ccc", 2, ["bb", "ccc"]),
        ("a bb ccc", 3, ["ccc"]),
        ("a bb ccc", 4, []),
    ],
)
def test_ignore_too_short_words(text, min_length, expected):
    """Assert words shorter than the minimum length are dropped.

    Parameters:
        text: Some text (parametrized).
        min_length: Minimum word length (parametrized).
        expected: Expected list result (parametrized).
    """
    actual = get_words(text, min_length=min_length)
    assert actual == expected
66+
67+
68+
@pytest.mark.parametrize(
    ("text", "ignore_code", "expected"),
    [
        # The element is properly closed with `</code>`.
        ("Hello <code>world!</code>", True, ["hello"]),
        ("Hello <code>world!</code>", False, ["hello", "world"]),
    ],
)
def test_ignore_text_in_code_tags(text, ignore_code, expected):
    """Assert text inside code tags is ignored only when requested.

    Parameters:
        text: Some text (parametrized).
        ignore_code: Whether to ignore words in code tags (parametrized).
        expected: Expected list result (parametrized).
    """
    assert get_words(text, ignore_code=ignore_code) == expected
84+
85+
86+
@pytest.mark.parametrize(
    ("text", "allow_unicode", "expected"),
    [
        ("Hello world! ハローワールド!", True, ["hello", "world", "ハローワールド"]),
        ("Hello world! ハローワールド!", False, ["hello", "world"]),
    ],
)
def test_allow_unicode_characters(text, allow_unicode, expected):
    """Assert non-ASCII words are kept or dropped depending on `allow_unicode`.

    Parameters:
        text: Some text (parametrized).
        allow_unicode: Whether to allow unicode characters in words (parametrized).
        expected: Expected list result (parametrized).
    """
    actual = get_words(text, allow_unicode=allow_unicode)
    assert actual == expected

0 commit comments

Comments
 (0)