Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit b3e843b

Browse files
authored
Fix URL preview errors when previewing XML documents. (#11196)
1 parent e0ef8fe commit b3e843b

File tree

3 files changed

+22
-3
lines changed

3 files changed

+22
-3
lines changed

changelog.d/11196.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.

synapse/rest/media/v1/preview_url_resource.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -718,9 +718,12 @@ def decode_body(
718718
if not body:
719719
return None
720720

721+
# The idea here is that multiple encodings are tried until one works.
722+
# Unfortunately the decoded result is discarded, and LXML then decodes the body
723+
# again with the found encoding.
721724
for encoding in get_html_media_encodings(body, content_type):
722725
try:
723-
body_str = body.decode(encoding)
726+
body.decode(encoding)
724727
except Exception:
725728
pass
726729
else:
@@ -732,11 +735,11 @@ def decode_body(
732735
from lxml import etree
733736

734737
# Create an HTML parser.
735-
parser = etree.HTMLParser(recover=True, encoding="utf-8")
738+
parser = etree.HTMLParser(recover=True, encoding=encoding)
736739

737740
# Attempt to parse the body. Returns None if the body was successfully
738741
# parsed, but no tree was found.
739-
return etree.fromstring(body_str, parser)
742+
return etree.fromstring(body, parser)
740743

741744

742745
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:

tests/test_preview.py

+15
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,21 @@ def test_no_tree(self):
277277
tree = decode_body(html, "http://example.com/test.html")
278278
self.assertIsNone(tree)
279279

280+
def test_xml(self):
281+
"""Test decoding XML and ensure it works properly."""
282+
# Note that the strip() call is important to ensure the xml tag starts
283+
# at the initial byte.
284+
html = b"""
285+
<?xml version="1.0" encoding="UTF-8"?>
286+
287+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
288+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
289+
<head><title>Foo</title></head><body>Some text.</body></html>
290+
""".strip()
291+
tree = decode_body(html, "http://example.com/test.html")
292+
og = _calc_og(tree, "http://example.com/test.html")
293+
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
294+
280295
def test_invalid_encoding(self):
281296
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
282297
html = b"""

0 commit comments

Comments
 (0)