perf: monkey-patch feedparser for performant URI resolution

Rongronggg9 · Rongronggg9 · commit 7a80519256bb · 2025-03-24T01:15:31.000+08:00
Relative URIs are now resolved using lxml and yarl, cutting down the
overhead of feed parsing by more than 50%.

The resolver will not pass all test cases in feedparser due to the
difference between lxml and sgmllib when handling malformed HTML. This
is mostly harmless, so let's monkey-patch feedparser to benefit from
lxml's performant HTML tree parsing.

Signed-off-by: Rongrong &lt;i@rong.moe&gt;
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## Unreleased
+
+### Enhancements
+
+- **Improve feed parsing performance**: Relative URIs are now resolved using `lxml` and `yarl`, cutting down the overhead of feed parsing by more than 50%.
+
 ## v2.10.0: Container health check, chat-specific #hashtags, and more
 
 ### Highlights
diff --git a/docs/CHANGELOG.zh.md b/docs/CHANGELOG.zh.md
@@ -1,5 +1,11 @@
 # 更新日志
 
+## 未发布
+
+### 增强
+
+- **提高 feed 解析性能**：现在使用 `lxml` 和 `yarl` 解析相对 URI，将 feed 解析的开销减少了超过 50%。
+
 ## v2.10.0: 容器健康检查、特定于聊天的 #hashtag 和更多
 
 ### 亮点
diff --git a/src/compat/__init__.py b/src/compat/__init__.py
@@ -21,8 +21,11 @@
 if sys.version_info < (3, 9):
     raise RuntimeError("This bot requires Python 3.9 or later")
 
+import feedparser.mixin
+import feedparser.urls
 import listparser.opml
 
+from .lxml_uri_resolver import resolve_relative_uris
 from .listparser_opml_mixin import OpmlMixin
 from .utils import (
     INT64_T_MAX,
@@ -44,6 +47,10 @@
     "bozo_exception_removal_wrapper",
 ]
 
-# Monkey-patching `listparser.opml.OpmlMixin` to support `text` and `title_orig`
+# Monkey-patch `feedparser` to use lxml for performant URI resolution.
+feedparser.urls.resolve_relative_uris = resolve_relative_uris
+feedparser.mixin.resolve_relative_uris = resolve_relative_uris
+
+# Monkey-patch `listparser.opml.OpmlMixin` to support `text` and `title_orig`.
 # https://github.com/kurtmckee/listparser/issues/71
 listparser.opml.OpmlMixin.start_opml_outline = OpmlMixin.start_opml_outline
diff --git a/src/compat/lxml_uri_resolver/__init__.py b/src/compat/lxml_uri_resolver/__init__.py
@@ -0,0 +1,17 @@
+#  RSS to Telegram Bot
+#  Copyright (C) 2025  Rongrong <i@rong.moe>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Affero General Public License as
+#  published by the Free Software Foundation, either version 3 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU Affero General Public License for more details.
+#
+#  You should have received a copy of the GNU Affero General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from .uri_resolver import resolve_relative_uris
diff --git a/src/compat/lxml_uri_resolver/presets.py b/src/compat/lxml_uri_resolver/presets.py
@@ -0,0 +1,104 @@
+#  RSS to Telegram Bot
+#  Copyright (C) 2025  Rongrong <i@rong.moe>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Affero General Public License as
+#  published by the Free Software Foundation, either version 3 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU Affero General Public License for more details.
+#
+#  You should have received a copy of the GNU Affero General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+# URI schemes accepted by default. Collected from:
+# - https://github.com/kurtmckee/feedparser/blob/6cdc20849a66c29e2d08b0334fceb22f210bdb26/feedparser/urls.py#L39-L47
+# - https://docs.python.org/3.12/library/urllib.parse.html
+ACCEPTABLE_URI_SCHEMES: set[str] = {
+    'acap',
+    'aim',
+    'callto',
+    'cvs',
+    'facetime',
+    'feed',
+    'file',
+    'ftp',
+    'git',
+    'gopher',
+    'gtalk',
+    'h323',
+    'hdl',
+    'http',
+    'https',
+    'icap',
+    'imap',
+    'irc',
+    'irc6',
+    'ircs',
+    'itms',
+    'magnet',
+    'mailto',
+    'mms',
+    'msnim',
+    'mtqp',
+    'news',
+    'nntp',
+    'prospero',
+    'rsync',
+    'rtsp',
+    'rtsps',
+    'rtspu',
+    'sftp',
+    'shttp',
+    'sip',
+    'sips',
+    'skype',
+    'smb',
+    'snews',
+    'ssh',
+    'svn',
+    'svn+ssh',
+    'telnet',
+    'wais',
+    'ws',
+    'wss',
+    'ymsg',
+}
+
+# Maps a tag name to its attributes that may carry a URI. Collected from:
+# - https://github.com/kurtmckee/feedparser/blob/6cdc20849a66c29e2d08b0334fceb22f210bdb26/feedparser/urls.py#L107-L137
+TAG_ATTR_MAP: dict[str, set[str]] = {
+    'a': {'href'},
+    'applet': {'codebase'},
+    'area': {'href'},
+    'audio': {'src'},
+    'blockquote': {'cite'},
+    'body': {'background'},
+    'del': {'cite'},
+    'form': {'action'},
+    'frame': {'longdesc', 'src'},
+    'head': {'profile'},
+    'iframe': {'longdesc', 'src'},
+    'img': {'longdesc', 'src', 'usemap'},
+    'input': {'src', 'usemap'},
+    'ins': {'cite'},
+    'link': {'href'},
+    'object': {'classid', 'codebase', 'data', 'usemap'},
+    'q': {'cite'},
+    'script': {'src'},
+    'source': {'src'},
+    'video': {'poster', 'src'},
+}
+
+TAG_ATTR_MAP_RSSTT: dict[str, set[str]] = {  # Subset of TAG_ATTR_MAP (tag name -> URI attributes).
+    'a': {'href'},
+    'audio': {'src'},
+    'iframe': {'src'},
+    'img': {'src'},
+    'q': {'cite'},
+    'source': {'src'},
+    'video': {'poster', 'src'},
+}
diff --git a/src/compat/lxml_uri_resolver/uri_resolver.py b/src/compat/lxml_uri_resolver/uri_resolver.py
@@ -0,0 +1,131 @@
+#  RSS to Telegram Bot
+#  Copyright (C) 2025  Rongrong <i@rong.moe>
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Affero General Public License as
+#  published by the Free Software Foundation, either version 3 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU Affero General Public License for more details.
+#
+#  You should have received a copy of the GNU Affero General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from typing import Optional, AbstractSet, Callable, Any
+
+import lxml.html
+import re
+from yarl import URL
+
+from .presets import ACCEPTABLE_URI_SCHEMES, TAG_ATTR_MAP_RSSTT
+
+
+def _always_true(*_args: Any) -> bool:  # Fallback scheme matcher; ignores the URI it is given.
+    return True
+
+
+class UriResolver:
+    """Make relative URIs in the mapped attributes of an HTML fragment absolute (lxml + yarl)."""
+
+    def __init__(
+            self,
+            allowed_schemes: Optional[AbstractSet[str]] = ...,
+            tag_attr_map: Optional[dict[str, AbstractSet[str]]] = ...,
+    ):
+        """
+        :param allowed_schemes: schemes a URI may keep or resolve to; ``...`` (default) selects
+            ``ACCEPTABLE_URI_SCHEMES``, ``None``/empty installs the ``_always_true`` fallback matcher.
+        :param tag_attr_map: tag name -> attribute names holding URIs; ``...`` (default) selects
+            ``TAG_ATTR_MAP_RSSTT``, ``None``/empty leaves nothing to resolve.
+        """
+        self._allowed_schemes: AbstractSet[str] = (
+            ACCEPTABLE_URI_SCHEMES if allowed_schemes is ... else allowed_schemes or set()
+        )
+        self._tag_attr_map: dict[str, AbstractSet[str]] = (
+            TAG_ATTR_MAP_RSSTT if tag_attr_map is ... else tag_attr_map or {}
+        )
+        # Escape each scheme: names such as `svn+ssh` contain regex metacharacters and would
+        # otherwise never match, so those absolute URIs would be erased as "not allowed".
+        pattern = f'^({"|".join(map(re.escape, self._allowed_schemes))}):'
+        self._scheme_matcher: Callable[[str], Optional[Any]] = (
+            re.compile(pattern, re.IGNORECASE).match if self._allowed_schemes else _always_true
+        )
+        # A single XPath union: every mapped tag carrying at least one of its mapped attributes.
+        self._xpath: str = '|'.join(
+            '//{}[{}]'.format(tag_name, ' or '.join(f'@{attr_name}' for attr_name in attr_names))
+            for tag_name, attr_names in self._tag_attr_map.items()
+        )
+
+    def resolve(self, html: str, base: str, type_: str) -> str:
+        """
+        Return `html` with relative URIs in the mapped attributes resolved against `base`.
+
+        :param html: HTML fragment to process; returned as is when it contains no ``<``.
+        :param base: base URI; `html` is returned as is when it is empty or fails the scheme matcher.
+        :param type_: ``application/xhtml+xml`` selects XML serialization, anything else HTML.
+        :return: the re-serialized fragment; disallowed URIs are replaced with an empty string.
+        """
+        if not base or '<' not in html:
+            # No base to resolve against, or not an HTML.
+            return html
+
+        xpath = self._xpath
+        if not xpath:
+            # Nothing to resolve.
+            return html
+
+        scheme_matcher = self._scheme_matcher
+        if not scheme_matcher(base):
+            # The base is relative or without an allowed scheme.
+            return html
+
+        tag_attr_map = self._tag_attr_map
+        base_url = URL(base)
+        # A throwaway parent lets the fragment hold several top-level nodes or leading text;
+        # it is stripped again after serialization.
+        html_tree = lxml.html.fragment_fromstring(html, create_parent='URI_RESOLVER')
+
+        allowed_schemes = self._allowed_schemes
+        element: lxml.html.HtmlElement
+        for element in html_tree.xpath(xpath):
+            for attr_name in tag_attr_map[element.tag]:
+                relative = element.attrib.get(attr_name)
+                if relative is None:
+                    continue
+
+                relative = relative.strip()
+                if not relative:
+                    element.attrib[attr_name] = base
+                    continue
+
+                if scheme_matcher(relative):
+                    # Absolute URL with an allowed scheme, happy path.
+                    continue
+
+                relative_url = URL(relative)
+                if relative_url.absolute:
+                    # Absolute URL without an allowed scheme, erase it.
+                    element.attrib[attr_name] = ''
+                    continue
+
+                absolute_url = base_url.join(relative_url)
+                element.attrib[attr_name] = (
+                    absolute_url.human_repr() if absolute_url.scheme in allowed_schemes else ''
+                )
+
+        # Serialize, then cut away the throwaway parent's opening and closing tags.
+        return lxml.html.tostring(
+            html_tree,
+            encoding='unicode',
+            method='xml' if type_ == 'application/xhtml+xml' else 'html',
+        ).partition('<URI_RESOLVER>')[2].rpartition('</URI_RESOLVER>')[0]
+
+
+uri_resolver = UriResolver()  # Module-level instance built with the default schemes and tag map.
+
+
+def resolve_relative_uris(html_source, base_uri, encoding, type_):  # `encoding` is accepted but unused.
+    return uri_resolver.resolve(html_source, base_uri, type_)