This repository was archived by the owner on Apr 26, 2024. It is now read-only.
File tree: 3 files changed, +22 -3 lines changed
Original file line number Diff line number Diff line change
1
+ Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.
Original file line number Diff line number Diff line change @@ -718,9 +718,12 @@ def decode_body(
718
718
if not body :
719
719
return None
720
720
721
+ # The idea here is that multiple encodings are tried until one works.
722
+ # Unfortunately the result is never used and then LXML will decode the string
723
+ # again with the found encoding.
721
724
for encoding in get_html_media_encodings (body , content_type ):
722
725
try :
723
- body_str = body .decode (encoding )
726
+ body .decode (encoding )
724
727
except Exception :
725
728
pass
726
729
else :
@@ -732,11 +735,11 @@ def decode_body(
732
735
from lxml import etree
733
736
734
737
# Create an HTML parser.
735
- parser = etree .HTMLParser (recover = True , encoding = "utf-8" )
738
+ parser = etree .HTMLParser (recover = True , encoding = encoding )
736
739
737
740
# Attempt to parse the body. Returns None if the body was successfully
738
741
# parsed, but no tree was found.
739
- return etree .fromstring (body_str , parser )
742
+ return etree .fromstring (body , parser )
740
743
741
744
742
745
def _calc_og (tree : "etree.Element" , media_uri : str ) -> Dict [str , Optional [str ]]:
Original file line number Diff line number Diff line change @@ -277,6 +277,21 @@ def test_no_tree(self):
277
277
tree = decode_body (html , "http://example.com/test.html" )
278
278
self .assertIsNone (tree )
279
279
280
+ def test_xml (self ):
281
+ """Test decoding XML and ensure it works properly."""
282
+ # Note that the strip() call is important to ensure the xml tag starts
283
+ # at the initial byte.
284
+ html = b"""
285
+ <?xml version="1.0" encoding="UTF-8"?>
286
+
287
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
288
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
289
+ <head><title>Foo</title></head><body>Some text.</body></html>
290
+ """ .strip ()
291
+ tree = decode_body (html , "http://example.com/test.html" )
292
+ og = _calc_og (tree , "http://example.com/test.html" )
293
+ self .assertEqual (og , {"og:title" : "Foo" , "og:description" : "Some text." })
294
+
280
295
def test_invalid_encoding (self ):
281
296
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
282
297
html = b"""
You can’t perform that action at this time.
0 commit comments