perf: monkey-patch feedparser to use lxml for performant URI resolution

Rongronggg9 · Rongronggg9 · commit 6930188024d1 · 2025-03-22T01:33:53.000+08:00
Signed-off-by: Rongrong <i@rong.moe>
diff --git a/src/compat/__init__.py b/src/compat/__init__.py
@@ -21,8 +21,11 @@
 if sys.version_info < (3, 9):
     raise RuntimeError("This bot requires Python 3.9 or later")
 
+import feedparser.mixin
+import feedparser.urls
 import listparser.opml
 
+from .feedparser_lxml_uri_resolver import resolve_relative_uris
 from .listparser_opml_mixin import OpmlMixin
 from .utils import (
     INT64_T_MAX,
@@ -44,6 +47,10 @@
     "bozo_exception_removal_wrapper",
 ]
 
-# Monkey-patching `listparser.opml.OpmlMixin` to support `text` and `title_orig`
+# Monkey-patch `feedparser` to use lxml for performant URI resolution.
+feedparser.urls.resolve_relative_uris = resolve_relative_uris
+feedparser.mixin.resolve_relative_uris = resolve_relative_uris  # NOTE(review): presumably `mixin` holds its own imported binding - verify
+
+# Monkey-patch `listparser.opml.OpmlMixin` to support `text` and `title_orig`.
 # https://github.com/kurtmckee/listparser/issues/71
 listparser.opml.OpmlMixin.start_opml_outline = OpmlMixin.start_opml_outline
diff --git a/src/compat/feedparser_lxml_uri_resolver.py b/src/compat/feedparser_lxml_uri_resolver.py
@@ -0,0 +1,159 @@
+#  RSS to Telegram Bot
+#  Copyright (C) 2025  Rongrong <i@rong.moe>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Affero General Public License as
+#  published by the Free Software Foundation, either version 3 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU Affero General Public License for more details.
+#
+#  You should have received a copy of the GNU Affero General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import re
+from typing import Optional, AbstractSet, Callable, Any
+
+import lxml.html
+from yarl import URL
+
+# URI schemes allowed by default when resolving URIs; every scheme is listed exactly once.
+ACCEPTABLE_URI_SCHEMES = frozenset((
+    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
+    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
+    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet', 'wais',
+    # Additional common-but-unofficial schemes
+    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
+    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
+))
+
+TAG_ATTR_MAP: dict[str, set[str]] = {  # Default map: tag name -> attributes of that tag whose value is a URI to resolve.
+    'a': {'href'},
+    'applet': {'codebase'},
+    'area': {'href'},
+    'audio': {'src'},
+    'blockquote': {'cite'},
+    'body': {'background'},
+    'del': {'cite'},
+    'form': {'action'},
+    'frame': {'longdesc', 'src'},
+    'head': {'profile'},
+    'iframe': {'longdesc', 'src'},
+    'img': {'longdesc', 'src', 'usemap'},
+    'input': {'src', 'usemap'},
+    'ins': {'cite'},
+    'link': {'href'},
+    'object': {'classid', 'codebase', 'data', 'usemap'},
+    'q': {'cite'},
+    'script': {'src'},
+    'source': {'src'},
+    'video': {'poster', 'src'},
+}
+
+
+def _always_true(*_args, **_kwargs):  # Stand-in for `Pattern.match`: takes (and ignores) the candidate string.
+    return True
+
+
+class UriResolver:
+    """Make relative URIs in HTML fragments absolute, using lxml for parsing.
+
+    :param allowed_schemes: Schemes a URI may use. Omitted: `ACCEPTABLE_URI_SCHEMES`;
+        `None` or empty: any scheme is accepted.
+    :param tag_attr_map: Tag name -> names of its URI attributes. Omitted:
+        `TAG_ATTR_MAP`; `None` or empty: nothing is resolved.
+    """
+
+    def __init__(
+            self,
+            allowed_schemes: Optional[AbstractSet[str]] = ...,
+            tag_attr_map: Optional[dict[str, AbstractSet[str]]] = ...,
+    ):
+        self._allowed_schemes: AbstractSet[str] = (
+            ACCEPTABLE_URI_SCHEMES if allowed_schemes is ... else allowed_schemes or set()
+        )
+        self._tag_attr_map: dict[str, AbstractSet[str]] = (
+            TAG_ATTR_MAP if tag_attr_map is ... else tag_attr_map or {}
+        )
+        # Escape each scheme so that e.g. `svn+ssh` is matched literally (`+`
+        # is a regex quantifier). Without a whitelist, any syntactically valid
+        # scheme (RFC 3986: a letter, then letters, digits, `+`, `.`, `-`) matches.
+        scheme_pattern = (
+            '|'.join(map(re.escape, self._allowed_schemes))
+            if self._allowed_schemes
+            else r'[a-z][a-z0-9+.-]*'
+        )
+        scheme_regex = re.compile(f'^({scheme_pattern}):', re.IGNORECASE)
+        self._scheme_matcher: Callable[[str], Optional[Any]] = scheme_regex.match
+        # XPath union selecting every element that carries at least one of the
+        # attributes mapped to its tag; tags mapped to no attribute are skipped.
+        self._xpath: str = '|'.join(
+            '//{}[{}]'.format(tag_name, ' or '.join(f'@{attr_name}' for attr_name in attr_names))
+            for tag_name, attr_names in self._tag_attr_map.items()
+            if attr_names
+        )
+
+    def resolve(self, html: str, base: str, type_: str) -> str:
+        """Return `html` with the mapped URI attributes resolved against `base`.
+
+        `html` is returned untouched if it has no markup, if no attribute is
+        mapped, or if `base` is malformed or lacks an allowed scheme. A URI that
+        is malformed or ends up with a disallowed scheme is replaced by `''`.
+        A `type_` of `application/xhtml+xml` selects XML serialization.
+        """
+        xpath = self._xpath
+        scheme_matcher = self._scheme_matcher
+        if '<' not in html or not xpath or not scheme_matcher(base):
+            return html
+        try:
+            base_url = URL(base)
+        except ValueError:
+            # Malformed base, nothing can be resolved against it.
+            return html
+
+        tag_attr_map = self._tag_attr_map
+        allowed_schemes = self._allowed_schemes
+        html_tree = lxml.html.fragment_fromstring(html, create_parent='FEEDPARSER_URI_RESOLVER')
+
+        element: lxml.html.HtmlElement
+        for element in html_tree.xpath(xpath):
+            for attr_name in tag_attr_map[element.tag]:
+                relative = element.attrib.get(attr_name)
+                if relative is None:
+                    continue
+
+                relative = relative.strip()
+                if not relative:
+                    element.attrib[attr_name] = base
+                    continue
+
+                if scheme_matcher(relative):
+                    # Absolute URL with an allowed scheme, happy path.
+                    continue
+
+                try:
+                    relative_url = URL(relative)
+                    # An explicit scheme here is a disallowed one: erase. Scheme-relative
+                    # URLs (`//host/path`) carry none and are resolved like any other.
+                    resolved = None if relative_url.scheme else base_url.join(relative_url)
+                except ValueError:
+                    resolved = None  # Malformed URL: erase it rather than abort the whole document.
+                if resolved is not None and allowed_schemes and resolved.scheme not in allowed_schemes:
+                    resolved = None
+                element.attrib[attr_name] = '' if resolved is None else resolved.human_repr()
+
+        return lxml.html.tostring(
+            html_tree,
+            encoding='unicode',
+            method='xml' if type_ == 'application/xhtml+xml' else 'html',
+        ).partition('<FEEDPARSER_URI_RESOLVER>')[2].rpartition('</FEEDPARSER_URI_RESOLVER>')[0]
+
+
+uri_resolver = UriResolver()  # Shared module-level instance built with the default schemes and tag/attribute map.
+
+
+def resolve_relative_uris(html_source, base_uri, encoding, type_):  # Installed over feedparser's function; `encoding` is unused.
+    return uri_resolver.resolve(html_source, base_uri, type_)