Skip to content

Commit 6930188

Browse files
committed
perf: monkey-patch feedparser to use lxml for performant URI resolution
Signed-off-by: Rongrong <[email protected]>
1 parent e987bf6 commit 6930188

File tree

2 files changed

+167
-1
lines changed

2 files changed

+167
-1
lines changed

src/compat/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@
2121
if sys.version_info < (3, 9):
2222
raise RuntimeError("This bot requires Python 3.9 or later")
2323

24+
import feedparser.mixin
25+
import feedparser.urls
2426
import listparser.opml
2527

28+
from .feedparser_lxml_uri_resolver import resolve_relative_uris
2629
from .listparser_opml_mixin import OpmlMixin
2730
from .utils import (
2831
INT64_T_MAX,
@@ -44,6 +47,10 @@
4447
"bozo_exception_removal_wrapper",
4548
]
4649

47-
# Monkey-patching `listparser.opml.OpmlMixin` to support `text` and `title_orig`
50+
# Monkey-patch `feedparser` to use lxml for performant URI resolution.
51+
feedparser.urls.resolve_relative_uris = resolve_relative_uris
52+
feedparser.mixin.resolve_relative_uris = resolve_relative_uris
53+
54+
# Monkey-patch `listparser.opml.OpmlMixin` to support `text` and `title_orig`.
4855
# https://github.com/kurtmckee/listparser/issues/71
4956
listparser.opml.OpmlMixin.start_opml_outline = OpmlMixin.start_opml_outline
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
# RSS to Telegram Bot
2+
# Copyright (C) 2025 Rongrong <[email protected]>
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU Affero General Public License as
6+
# published by the Free Software Foundation, either version 3 of the
7+
# License, or (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU Affero General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU Affero General Public License
15+
# along with this program. If not, see <https://www.gnu.org/licenses/>.
16+
17+
import re
18+
from typing import Optional, AbstractSet, Callable, Any
19+
20+
import lxml.html
21+
from yarl import URL
22+
23+
# URI schemes that may appear in (or result from resolving) an HTML attribute.
# Used as the default allow-list of `UriResolver`; a URI whose scheme is not
# listed here is blanked instead of being kept.
ACCEPTABLE_URI_SCHEMES = frozenset((
    'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
    'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
    'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    # ('mms' and 'svn' are already listed above, so they are not repeated).
    'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
    'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
))
32+
33+
# HTML tag name -> names of the attributes on that tag that hold a URI.
# Written as a compact "tag, space-separated attributes" table and expanded
# into sets; the key order is the order of the rows below.
TAG_ATTR_MAP: dict[str, set[str]] = {
    _tag: set(_attrs.split())
    for _tag, _attrs in (
        ('a', 'href'),
        ('applet', 'codebase'),
        ('area', 'href'),
        ('audio', 'src'),
        ('blockquote', 'cite'),
        ('body', 'background'),
        ('del', 'cite'),
        ('form', 'action'),
        ('frame', 'longdesc src'),
        ('head', 'profile'),
        ('iframe', 'longdesc src'),
        ('img', 'longdesc src usemap'),
        ('input', 'src usemap'),
        ('ins', 'cite'),
        ('link', 'href'),
        ('object', 'classid codebase data usemap'),
        ('q', 'cite'),
        ('script', 'src'),
        ('source', 'src'),
        ('video', 'poster src'),
    )
}
55+
56+
57+
def _always_true():
58+
return True
59+
60+
61+
class UriResolver:
    """Resolve relative URIs inside an HTML fragment against a base URI.

    The fragment is parsed once with lxml, every URI-carrying attribute
    selected by a precomputed XPath expression is rewritten in place, and the
    fragment is serialized back to a string.
    """

    def __init__(
        self,
        allowed_schemes: Optional[AbstractSet[str]] = ...,
        tag_attr_map: Optional[dict[str, AbstractSet[str]]] = ...,
    ):
        """Precompute the scheme matcher and the XPath selector.

        :param allowed_schemes: URI schemes that are kept. Omit it (``...``)
            to use ``ACCEPTABLE_URI_SCHEMES``. ``None`` or an empty set
            disables scheme filtering: any URI with a syntactically valid
            scheme is treated as absolute and left untouched.
        :param tag_attr_map: Mapping of tag name to the attribute names that
            hold URIs. Omit it (``...``) to use ``TAG_ATTR_MAP``. ``None`` or
            an empty mapping makes :meth:`resolve` a no-op.
        """
        self._allowed_schemes: AbstractSet[str] = (
            ACCEPTABLE_URI_SCHEMES
            if allowed_schemes is ...
            else allowed_schemes or set()
        )
        self._tag_attr_map: dict[str, AbstractSet[str]] = (
            TAG_ATTR_MAP
            if tag_attr_map is ...
            else tag_attr_map or {}
        )

        if self._allowed_schemes:
            # Scheme names are literals, not regex fragments: without
            # escaping, the '+' in 'svn+ssh' is a quantifier and the pattern
            # never matches an actual 'svn+ssh:' URI. Sorting only makes the
            # generated pattern reproducible.
            scheme_pattern = '^(?:{}):'.format(
                '|'.join(map(re.escape, sorted(self._allowed_schemes)))
            )
        else:
            # No allow-list: accept any RFC 3986 style scheme prefix.
            scheme_pattern = r'^[a-z][a-z0-9+.\-]*:'
        self._scheme_matcher: Callable[[str], Optional[Any]] = re.compile(
            scheme_pattern,
            re.IGNORECASE,
        ).match

        # One union expression such as '//a[@href]|//img[@longdesc or @src]'.
        # Tags mapped to an empty attribute set are skipped, because
        # '//tag[]' is not valid XPath.
        self._xpath: str = '|'.join(
            '//{tag_name}[{attrs}]'.format(
                tag_name=tag_name,
                attrs=' or '.join(f'@{attr_name}' for attr_name in attr_names),
            )
            for tag_name, attr_names in self._tag_attr_map.items()
            if attr_names
        )

    def _join(self, base_url: 'URL', relative: str) -> str:
        """Return `relative` resolved against `base_url`, or ``''`` if unusable.

        `relative` is a stripped, non-empty attribute value that did not
        match the scheme matcher.
        """
        try:
            relative_url = URL(relative)
            if relative_url.scheme:
                # Carries an explicit scheme that the matcher rejected.
                return ''
            # Scheme-less references, including scheme-relative ones such as
            # '//host/path', are joined so that they inherit the base scheme.
            # (Testing `URL.absolute` here would wrongly erase '//host/path',
            # since yarl reports any URL with an authority as absolute.)
            absolute_url = base_url.join(relative_url)
        except ValueError:
            # Malformed attribute value; drop it instead of failing the
            # whole fragment.
            return ''

        allowed_schemes = self._allowed_schemes
        if allowed_schemes and absolute_url.scheme not in allowed_schemes:
            return ''
        return absolute_url.human_repr()

    def resolve(self, html: str, base: str, type_: str) -> str:
        """Return `html` with the configured URI attributes made absolute.

        :param html: HTML (or XHTML) fragment to process.
        :param base: Base URI. It must start with an accepted scheme,
            otherwise `html` is returned unchanged.
        :param type_: Content type of the fragment;
            ``'application/xhtml+xml'`` selects XML serialization, anything
            else selects HTML serialization.
        :return: The re-serialized fragment, or `html` itself when there is
            nothing to do.
        """
        if '<' not in html:
            # No markup at all, so there is nothing to resolve.
            return html

        xpath = self._xpath
        if not xpath:
            # Nothing to resolve.
            return html

        scheme_matcher = self._scheme_matcher
        if not scheme_matcher(base):
            # The base is relative or without an allowed scheme.
            return html

        try:
            base_url = URL(base)
        except ValueError:
            # The base cannot be parsed, so nothing can be resolved.
            return html

        tag_attr_map = self._tag_attr_map

        # Wrap the fragment in a synthetic parent so that fragments with
        # several top-level nodes or leading text can be parsed; the wrapper
        # is stripped again after serialization.
        html_tree = lxml.html.fragment_fromstring(html, create_parent='FEEDPARSER_URI_RESOLVER')

        element: lxml.html.HtmlElement
        for element in html_tree.xpath(xpath):
            attrib = element.attrib
            for attr_name in tag_attr_map[element.tag]:
                relative = attrib.get(attr_name)
                if relative is None:
                    continue

                relative = relative.strip()

                if not relative:
                    # An empty reference points at the base itself.
                    attrib[attr_name] = base
                    continue

                if scheme_matcher(relative):
                    # Absolute URL with an allowed scheme, happy path.
                    continue

                attrib[attr_name] = self._join(base_url, relative)

        return lxml.html.tostring(
            html_tree,
            encoding='unicode',
            method='xml' if type_ == 'application/xhtml+xml' else 'html',
        ).partition('<FEEDPARSER_URI_RESOLVER>')[2].rpartition('</FEEDPARSER_URI_RESOLVER>')[0]
153+
154+
155+
uri_resolver = UriResolver()
156+
157+
158+
def resolve_relative_uris(html_source, base_uri, encoding, type_):
    """Resolve relative URIs in `html_source` via the shared `uri_resolver`.

    The four-parameter signature is kept so the function can be called with
    the same positional arguments as the function it replaces. `encoding` is
    accepted but not used.

    :param html_source: HTML fragment to process.
    :param base_uri: Base URI the relative references are resolved against.
    :param encoding: Ignored.
    :param type_: Content type of the fragment, forwarded to the resolver.
    :return: Whatever `uri_resolver.resolve` returns for these arguments.
    """
    return uri_resolver.resolve(html=html_source, base=base_uri, type_=type_)

0 commit comments

Comments
 (0)