feat: Implement spell check plugin

pawamoy · pawamoy · commit 3a1367b9a5eb · 2022-01-25T18:26:23.000+01:00
diff --git a/src/mkdocs_spellcheck/loggers.py b/src/mkdocs_spellcheck/loggers.py
@@ -0,0 +1,48 @@
+"""Logging functions."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from mkdocs.utils import warning_filter
+
+
+class LoggerAdapter(logging.LoggerAdapter):
+    """Logger adapter that prepends a fixed prefix to every message."""
+
+    def __init__(self, prefix: str, logger):
+        """Store the prefix and wrap the given logger.
+
+        Arguments:
+            prefix: The string to insert in front of every message.
+            logger: The logger instance to wrap.
+        """
+        self.prefix = prefix
+        # No contextual `extra` data is attached, hence the empty dict.
+        super().__init__(logger, {})
+
+    def process(self, msg: str, kwargs) -> tuple[str, Any]:
+        """Prefix the message and pass the keyword arguments through.
+
+        Arguments:
+            msg: The message.
+            kwargs: Remaining arguments, returned unchanged.
+
+        Returns:
+            A `(message, kwargs)` pair whose message is `"<prefix>: <msg>"`.
+        """
+        prefixed = f"{self.prefix}: {msg}"
+        return prefixed, kwargs
+
+
+def get_logger(name: str) -> LoggerAdapter:
+    """Build a prefixed logger living under the `mkdocs.plugins` namespace.
+
+    Arguments:
+        name: The name appended to `mkdocs.plugins.` when calling `logging.getLogger`;
+            it is also used as the message prefix.
+
+    Returns:
+        A logger adapter, with MkDocs' `warning_filter` attached to the wrapped logger.
+    """
+    mkdocs_logger = logging.getLogger(f"mkdocs.plugins.{name}")
+    mkdocs_logger.addFilter(warning_filter)
+    return LoggerAdapter(prefix=name, logger=mkdocs_logger)
diff --git a/src/mkdocs_spellcheck/plugin.py b/src/mkdocs_spellcheck/plugin.py
@@ -0,0 +1,93 @@
+"""
+MkDocs SpellCheck package.
+
+A spell checker plugin for MkDocs.
+"""
+
+from __future__ import annotations
+
+from importlib import resources
+from pathlib import Path
+
+from mkdocs.config import Config
+from mkdocs.config.config_options import Type as MkType
+from mkdocs.plugins import BasePlugin
+from mkdocs.structure.pages import Page
+from symspellpy import SymSpell, Verbosity
+
+from mkdocs_spellcheck.loggers import get_logger
+from mkdocs_spellcheck.words import get_words
+
+# Module-level logger; `get_logger` prefixes every message with this module's name.
+logger = get_logger(__name__)
+
+
+class SpellCheckPlugin(BasePlugin):
+    """A `mkdocs` plugin that reports probable spelling mistakes as warnings.
+
+    This plugin defines the following event hooks:
+
+    - `on_config`
+    - `on_page_content`
+
+    Check the [Developing Plugins](https://www.mkdocs.org/user-guide/plugins/#developing-plugins) page of `mkdocs`
+    for more information about its plugin system.
+    """
+
+    config_scheme: tuple[tuple[str, MkType], ...] = (
+        # Either a list of words, or the path (relative to `docs_dir`) of a file with one word per line.
+        ("known_words", MkType((str, list), default=[])),
+        # Source paths of the pages that must not be checked.
+        ("skip_files", MkType(list, default=[])),
+        ("min_length", MkType(int, default=2)),
+        ("ignore_code", MkType(bool, default=True)),
+        ("allow_unicode", MkType(bool, default=False)),
+    )
+
+    def __init__(self) -> None:  # noqa: D107
+        self.known_words: set[str] = set()
+        # Created in `on_config`, once the plugin configuration is available.
+        self.spell: SymSpell | None = None
+        super().__init__()
+
+    @staticmethod
+    def _normalize_words(words) -> set[str]:
+        """Strip and lowercase the given words, dropping blank entries.
+
+        `get_words` lowercases every word it extracts, so known words must be
+        lowercased as well, otherwise entries such as `MkDocs` never match.
+        """
+        stripped = (str(word).strip() for word in words)
+        return {word.lower() for word in stripped if word}
+
+    def on_config(self, config: Config, **kwargs) -> Config:
+        """Load the plugin options, the words to ignore and the dictionary.
+
+        Hook for the [`on_config` event](https://www.mkdocs.org/user-guide/plugins/#on_config).
+
+        Arguments:
+            config: The MkDocs config object.
+            kwargs: Additional arguments passed by MkDocs.
+
+        Returns:
+            The config, unchanged.
+        """
+        self.skip_files = self.config["skip_files"]
+        self.min_length = self.config["min_length"]
+        self.ignore_code = self.config["ignore_code"]
+        self.allow_unicode = self.config["allow_unicode"]
+
+        known_words = self.config["known_words"]
+        if isinstance(known_words, str):
+            # A string is the path of a words file, relative to the docs directory.
+            # Read it as UTF-8 explicitly rather than with the platform default encoding.
+            words_file = Path(config["docs_dir"], known_words)
+            known_words = words_file.read_text(encoding="utf-8").splitlines()
+        self.known_words |= self._normalize_words(known_words)
+
+        self.spell = SymSpell()
+        with resources.path("symspellpy", "frequency_dictionary_en_82_765.txt") as dictionary_path:
+            # The term is read from column 0 and its count from column 1.
+            if not self.spell.load_dictionary(dictionary_path, 0, 1):
+                logger.error(f"Could not load dictionary '{dictionary_path}'")
+        return config
+
+    def on_page_content(self, html: str, page: Page, **kwargs) -> None:
+        """Spell check the rendered page and log a warning per misspelled word.
+
+        Hook for the [`on_page_content` event](https://www.mkdocs.org/user-guide/plugins/#on_page_content).
+
+        Arguments:
+            html: The HTML text.
+            page: The page instance.
+            kwargs: Additional arguments passed by MkDocs.
+        """
+        if page.file.src_path in self.skip_files:
+            return
+
+        words = get_words(html, self.known_words, self.min_length, self.ignore_code, self.allow_unicode)
+        for word in words:
+            suggestions = self.spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
+            # A word is reported only when the lookup proposes something other than the word itself.
+            candidates = "', '".join(suggestion.term for suggestion in suggestions if suggestion.term != word)
+            if candidates:
+                logger.warning(f"{page.file.src_path}: Misspelled '{word}', did you mean '{candidates}'?")
diff --git a/src/mkdocs_spellcheck/words.py b/src/mkdocs_spellcheck/words.py
@@ -0,0 +1,85 @@
+"""This module contains a function to retrieve words from HTML text."""
+
+from __future__ import annotations
+
+import re
+import unicodedata
+from functools import partial
+from html.parser import HTMLParser
+from io import StringIO
+
+
+class _MLStripper(HTMLParser):
+    """HTML parser that accumulates the text content of a document.
+
+    When `ignore_code` is true, text found inside `<code>` elements is skipped.
+    """
+
+    def __init__(self, ignore_code=True):
+        # `HTMLParser.__init__` already calls `reset()`, no need to call it again.
+        super().__init__(convert_charrefs=True)
+        self.text = StringIO()
+        self.ignore_code = ignore_code
+        self.in_code_tag = False
+
+    def handle_starttag(self, tag, attrs):
+        """Record that a `<code>` element was opened."""
+        if tag == "code":
+            self.in_code_tag = True
+
+    def handle_endtag(self, tag):
+        """Record that a `<code>` element was closed.
+
+        This must be named `handle_endtag`: it is the callback `HTMLParser`
+        invokes for closing tags. Under any other name the flag is never
+        reset and all text following the first `<code>` gets dropped.
+        """
+        if tag == "code":
+            self.in_code_tag = False
+
+    def handle_data(self, data):
+        """Keep the text, unless it is inside an ignored `<code>` element."""
+        if not (self.ignore_code and self.in_code_tag):
+            self.text.write(data)
+
+    def get_data(self):
+        """Return all the text collected so far."""
+        return self.text.getvalue()
+
+
+def _strip_tags(html, ignore_code):
+    """Return the text content of `html`, skipping `<code>` elements if `ignore_code` is true."""
+    stripper = _MLStripper(ignore_code)
+    stripper.feed(html)
+    # `feed` may hold back trailing text (e.g. after an unterminated `&`)
+    # while waiting for more input; `close` forces it to be processed.
+    stripper.close()
+    return stripper.get_data()
+
+
+# Matches any single character that is not a word character, whitespace or a hyphen.
+not_letters_nor_spaces = re.compile(r"[^\w\s-]")
+# Matches a run of hyphens and/or whitespace characters.
+dashes_or_spaces = re.compile(r"[-\s]+")
+# Matches a run of ASCII digits. NOTE(review): not referenced in the visible code.
+number = re.compile("[0-9]+")
+
+
+def _slugify(value, allow_unicode=False):
+    """Turn text into a lowercase, hyphen-separated sequence of words.
+
+    Without `allow_unicode`, characters that cannot be represented in ASCII
+    after NFKD normalization are dropped.
+    """
+    text = str(value)
+    if not allow_unicode:
+        decomposed = unicodedata.normalize("NFKD", text)
+        text = decomposed.encode("ascii", "ignore").decode("ascii")
+    else:
+        text = unicodedata.normalize("NFKC", text)
+    # Punctuation becomes spaces, then every run of spaces/hyphens becomes one hyphen.
+    spaced = not_letters_nor_spaces.sub(" ", text.lower())
+    slug = dashes_or_spaces.sub("-", spaced)
+    return slug.strip("-_")
+
+
+def _keep_word(word, min_length):
+    """Tell whether `word` is worth spell checking.
+
+    Words made only of digits and words shorter than `min_length` are rejected.
+    The empty string is always rejected, even when `min_length` is zero or
+    negative, since splitting empty text yields one empty "word".
+    """
+    return len(word) >= max(min_length, 1) and not word.isdigit()
+
+
+def get_words(
+    html: str,
+    known_words: set[str] | None = None,
+    min_length: int = 2,
+    ignore_code: bool = True,
+    allow_unicode: bool = True,
+) -> list[str]:
+    """Extract the unique, lowercased words of some HTML text.
+
+    Parameters:
+        html: The HTML text.
+        known_words: Words to exclude from the result.
+        min_length: Minimum length a word must have to be kept.
+        ignore_code: Whether to skip the text found in code tags.
+        allow_unicode: Whether to keep non-ASCII characters.
+
+    Returns:
+        The sorted list of remaining words, without duplicates.
+    """
+    excluded = known_words if known_words else set()
+    text = _strip_tags(html, ignore_code)
+    candidates = _slugify(text, allow_unicode).split("-")
+    unique_words = {word for word in candidates if _keep_word(word, min_length)}
+    return sorted(unique_words - excluded)
diff --git a/tests/test_words.py b/tests/test_words.py
@@ -0,0 +1,101 @@
+"""Tests for the `words` module."""
+
+import pytest
+
+from mkdocs_spellcheck.words import get_words
+
+
+@pytest.mark.parametrize("tag", ["p", "em", "div", "article"])
+def test_remove_tags(tag: str):
+    """Check that tag names never end up in the extracted words.
+
+    Parameters:
+        tag: Some HTML tag (parametrized).
+    """
+    markup = f"<{tag}>Some text.</{tag}><br><hr/>"
+    assert tag not in get_words(markup, min_length=1)
+
+
+def test_remove_single_tags():
+    """Check that void tags such as `br` or `img` are not reported as words."""
+    extracted = get_words("Some text.<br><br/><br /><img /></br>", min_length=1)
+    for tag_name in ("br", "img"):
+        assert tag_name not in extracted
+
+
+@pytest.mark.parametrize(
+    ("text", "known_words", "expected"),
+    [
+        # `set()`, not `{}`: the latter is an empty dict, while `get_words` expects a set.
+        ("hello", set(), ["hello"]),
+        ("hello", {"hello"}, []),
+        ("hello", {"world"}, ["hello"]),
+    ],
+)
+def test_ignore_known_words(text, known_words, expected):
+    """Assert known words are correctly removed.
+
+    Parameters:
+        text: Some text (parametrized).
+        known_words: Some known words (parametrized).
+        expected: Expected list result (parametrized).
+    """
+    assert get_words(text, known_words=known_words) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "min_length", "expected"),
+    [
+        ("a bb ccc", 0, ["a", "bb", "ccc"]),
+        ("a bb ccc", 1, ["a", "bb", "ccc"]),
+        ("a bb ccc", 2, ["bb", "ccc"]),
+        ("a bb ccc", 3, ["ccc"]),
+        ("a bb ccc", 4, []),
+    ],
+)
+def test_ignore_too_short_words(text, min_length, expected):
+    """Check that words shorter than the minimum length are dropped.
+
+    Parameters:
+        text: Some text (parametrized).
+        min_length: Minimum word length (parametrized).
+        expected: Expected list result (parametrized).
+    """
+    kept_words = get_words(text, min_length=min_length)
+    assert kept_words == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "ignore_code", "expected"),
+    [
+        # The code element is properly closed with `</code>`, so the input is well-formed HTML.
+        ("Hello <code>world!</code>", True, ["hello"]),
+        ("Hello <code>world!</code>", False, ["hello", "world"]),
+    ],
+)
+def test_ignore_text_in_code_tags(text, ignore_code, expected):
+    """Assert text in code tags is ignored only when requested.
+
+    Parameters:
+        text: Some text (parametrized).
+        ignore_code: Whether to ignore words in code tags (parametrized).
+        expected: Expected list result (parametrized).
+    """
+    assert get_words(text, ignore_code=ignore_code) == expected
+
+
+@pytest.mark.parametrize(
+    ("text", "allow_unicode", "expected"),
+    [
+        ("Hello world! ハローワールド!", True, ["hello", "world", "ハローワールド"]),
+        ("Hello world! ハローワールド!", False, ["hello", "world"]),
+    ],
+)
+def test_allow_unicode_characters(text, allow_unicode, expected):
+    """Check that non-ASCII words are kept only when unicode is allowed.
+
+    Parameters:
+        text: Some text (parametrized).
+        allow_unicode: Whether to allow unicode characters in words (parametrized).
+        expected: Expected list result (parametrized).
+    """
+    extracted = get_words(text, allow_unicode=allow_unicode)
+    assert extracted == expected