|
| 1 | +# RSS to Telegram Bot |
| 2 | +# Copyright (C) 2025 Rongrong <[email protected]> |
| 3 | +# |
| 4 | +# This program is free software: you can redistribute it and/or modify |
| 5 | +# it under the terms of the GNU Affero General Public License as |
| 6 | +# published by the Free Software Foundation, either version 3 of the |
| 7 | +# License, or (at your option) any later version. |
| 8 | +# |
| 9 | +# This program is distributed in the hope that it will be useful, |
| 10 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | +# GNU Affero General Public License for more details. |
| 13 | +# |
| 14 | +# You should have received a copy of the GNU Affero General Public License |
| 15 | +# along with this program. If not, see <https://www.gnu.org/licenses/>. |
| 16 | + |
| 17 | +import re |
| 18 | +from typing import Optional, AbstractSet, Callable, Any |
| 19 | + |
| 20 | +import lxml.html |
| 21 | +from yarl import URL |
| 22 | + |
# URI schemes that UriResolver accepts when no explicit scheme set is supplied.
ACCEPTABLE_URI_SCHEMES = frozenset({
    'file',
    'ftp',
    'gopher',
    'h323',
    'hdl',
    'http',
    'https',
    'imap',
    'magnet',
    'mailto',
    'mms',
    'news',
    'nntp',
    'prospero',
    'rsync',
    'rtsp',
    'rtspu',
    'sftp',
    'shttp',
    'sip',
    'sips',
    'snews',
    'svn',
    'svn+ssh',
    'telnet',
    'wais',
    # Additional common-but-unofficial schemes
    'aim',
    'callto',
    'cvs',
    'facetime',
    'feed',
    'git',
    'gtalk',
    'irc',
    'ircs',
    'irc6',
    'itms',
    'msnim',
    'skype',
    'ssh',
    'smb',
    'ymsg',
})
| 32 | + |
# Tag name -> names of the attributes on that tag whose values are URIs.
# Written as (tag, space-separated attribute names) pairs and expanded into sets below.
TAG_ATTR_MAP: dict[str, set[str]] = {
    tag: set(attrs.split())
    for tag, attrs in (
        ('a', 'href'),
        ('applet', 'codebase'),
        ('area', 'href'),
        ('audio', 'src'),
        ('blockquote', 'cite'),
        ('body', 'background'),
        ('del', 'cite'),
        ('form', 'action'),
        ('frame', 'longdesc src'),
        ('head', 'profile'),
        ('iframe', 'longdesc src'),
        ('img', 'longdesc src usemap'),
        ('input', 'src usemap'),
        ('ins', 'cite'),
        ('link', 'href'),
        ('object', 'classid codebase data usemap'),
        ('q', 'cite'),
        ('script', 'src'),
        ('source', 'src'),
        ('video', 'poster src'),
    )
}
| 55 | + |
| 56 | + |
| 57 | +def _always_true(): |
| 58 | + return True |
| 59 | + |
| 60 | + |
class UriResolver:
    """Rewrite URI-bearing HTML attributes so that they hold absolute URIs.

    For every configured tag/attribute pair found in an HTML fragment, the
    attribute value is joined against a base URI. Values whose scheme is not
    in the allowed set are replaced with an empty string.
    """

    def __init__(
        self,
        allowed_schemes: Optional[AbstractSet[str]] = ...,
        tag_attr_map: Optional[dict[str, AbstractSet[str]]] = ...,
    ):
        """Precompute the scheme matcher and the XPath used by ``resolve``.

        :param allowed_schemes: URI schemes that may be kept. When omitted
            (``...``), ``ACCEPTABLE_URI_SCHEMES`` is used; ``None`` or an
            empty set is stored as an empty set.
        :param tag_attr_map: Mapping of tag name to the attribute names that
            carry URIs. When omitted (``...``), ``TAG_ATTR_MAP`` is used;
            ``None`` or an empty mapping is stored as an empty dict, which
            makes ``resolve`` return its input unchanged.
        """
        self._allowed_schemes: AbstractSet[str] = (
            ACCEPTABLE_URI_SCHEMES
            if allowed_schemes is ...
            else allowed_schemes or set()
        )
        self._tag_attr_map: dict[str, AbstractSet[str]] = (
            TAG_ATTR_MAP
            if tag_attr_map is ...
            else tag_attr_map or {}
        )

        if self._allowed_schemes:
            # Scheme names may contain regex metacharacters ('svn+ssh'), so
            # each one is escaped before being placed in the alternation.
            scheme_alternatives = '|'.join(map(re.escape, self._allowed_schemes))
            self._scheme_matcher: Callable[[str], Optional[Any]] = re.compile(
                f'^({scheme_alternatives}):',
                re.IGNORECASE,
            ).match
        else:
            # No scheme list to build a pattern from: `_always_true` stands in
            # for the compiled pattern's `match`.
            self._scheme_matcher = _always_true

        # One `//tag[@attr1 or @attr2]` term per tag, unioned with `|`.
        # Tags mapped to no attributes are skipped because `//tag[]` is not a
        # valid XPath expression.
        self._xpath: str = '|'.join(
            '//{tag_name}[{attrs}]'.format(
                tag_name=tag_name,
                attrs=' or '.join(f'@{attr_name}' for attr_name in attr_names),
            )
            for tag_name, attr_names in self._tag_attr_map.items()
            if attr_names
        )

    def resolve(self, html: str, base: str, type_: str) -> str:
        """Return ``html`` with the configured URI attributes made absolute.

        The input is returned untouched when it contains no ``<``, when there
        is nothing configured to resolve, or when ``base`` does not start with
        an allowed scheme. Otherwise the fragment is parsed with lxml, the
        attributes are rewritten, and the re-serialized markup is returned.

        :param html: HTML (or XHTML) fragment to process.
        :param base: Base URI that relative values are joined against.
        :param type_: Content type of ``html``; ``'application/xhtml+xml'``
            selects XML serialization, anything else selects HTML.
        :return: The processed markup, or ``html`` itself on an early exit.
        """
        if '<' not in html:
            # Not an HTML.
            return html

        xpath = self._xpath
        if not xpath:
            # Nothing to resolve.
            return html

        scheme_matcher = self._scheme_matcher
        if not scheme_matcher(base):
            # The base is relative or without an allowed scheme.
            return html

        tag_attr_map = self._tag_attr_map

        base_url = URL(base)

        # Wrap the fragment in a synthetic parent element so that any number
        # of top-level nodes can be parsed; the wrapper is stripped again
        # after serialization.
        html_tree = lxml.html.fragment_fromstring(html, create_parent='FEEDPARSER_URI_RESOLVER')

        allowed_schemes = self._allowed_schemes
        element: lxml.html.HtmlElement
        for element in html_tree.xpath(xpath):
            for attr_name in tag_attr_map[element.tag]:
                relative = element.attrib.get(attr_name)
                if relative is None:
                    # The XPath matches on any one of the tag's attributes, so
                    # the others may be absent.
                    continue

                relative = relative.strip()

                if not relative:
                    # An empty reference is replaced by the base itself.
                    element.attrib[attr_name] = base
                    continue

                if scheme_matcher(relative):
                    # Absolute URL with an allowed scheme, happy path.
                    continue

                relative_url = URL(relative)
                if relative_url.absolute:
                    # Absolute URL without an allowed scheme, erase it.
                    element.attrib[attr_name] = ''
                    continue

                absolute_url = base_url.join(relative_url)
                # Keep the joined URL only if its scheme is allowed.
                element.attrib[attr_name] = (
                    absolute_url.human_repr()
                    if absolute_url.scheme in allowed_schemes
                    else ''
                )

        # Serialize, then cut away the synthetic wrapper's opening and
        # closing tags to get back just the fragment.
        return lxml.html.tostring(
            html_tree,
            encoding='unicode',
            method='xml' if type_ == 'application/xhtml+xml' else 'html',
        ).partition('<FEEDPARSER_URI_RESOLVER>')[2].rpartition('</FEEDPARSER_URI_RESOLVER>')[0]
| 153 | + |
| 154 | + |
# Module-level resolver constructed with UriResolver's default arguments.
uri_resolver = UriResolver()


def resolve_relative_uris(html_source, base_uri, encoding, type_):
    """Resolve the URIs in ``html_source`` against ``base_uri``.

    Delegates to the module-level ``uri_resolver``. ``encoding`` is accepted
    but not used by this function.
    NOTE(review): the unused parameter is presumably kept for signature
    compatibility with an existing caller -- confirm.
    """
    return uri_resolver.resolve(html=html_source, base=base_uri, type_=type_)
0 commit comments