Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit efd0074

Browse files
authored
Ensure each charset is attempted only once during media preview. (#11089)
There's no point in trying more than once since it is guaranteed to continually fail.
1 parent e2f0b49 commit efd0074

File tree

3 files changed

+64
-14
lines changed

3 files changed

+64
-14
lines changed

changelog.d/11089.bugfix

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix a long-standing bug when attempting to preview URLs which are in the `windows-1252` character encoding.

synapse/rest/media/v1/preview_url_resource.py

+28-6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15+
import codecs
1516
import datetime
1617
import errno
1718
import fnmatch
@@ -22,7 +23,7 @@
2223
import shutil
2324
import sys
2425
import traceback
25-
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
26+
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Tuple, Union
2627
from urllib import parse as urlparse
2728

2829
import attr
@@ -631,6 +632,14 @@ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
631632
logger.debug("No media removed from url cache")
632633

633634

635+
def _normalise_encoding(encoding: str) -> Optional[str]:
636+
"""Use the Python codec's name as the normalised entry."""
637+
try:
638+
return codecs.lookup(encoding).name
639+
except LookupError:
640+
return None
641+
642+
634643
def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]:
635644
"""
636645
Get potential encoding of the body based on the (presumably) HTML body or the content-type header.
@@ -652,30 +661,43 @@ def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterab
652661
Returns:
653662
The character encoding of the body, as a string.
654663
"""
664+
# There's no point in returning an encoding more than once.
665+
attempted_encodings: Set[str] = set()
666+
655667
# Limit searches to the first 1kb, since it ought to be at the top.
656668
body_start = body[:1024]
657669

658670
# Check if it has an encoding set in a meta tag.
659671
match = _charset_match.search(body_start)
660672
if match:
661-
yield match.group(1).decode("ascii")
673+
encoding = _normalise_encoding(match.group(1).decode("ascii"))
674+
if encoding:
675+
attempted_encodings.add(encoding)
676+
yield encoding
662677

663678
# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
664679

665680
# Check if it has an XML document with an encoding.
666681
match = _xml_encoding_match.match(body_start)
667682
if match:
668-
yield match.group(1).decode("ascii")
683+
encoding = _normalise_encoding(match.group(1).decode("ascii"))
684+
if encoding and encoding not in attempted_encodings:
685+
attempted_encodings.add(encoding)
686+
yield encoding
669687

670688
# Check the HTTP Content-Type header for a character set.
671689
if content_type:
672690
content_match = _content_type_match.match(content_type)
673691
if content_match:
674-
yield content_match.group(1)
692+
encoding = _normalise_encoding(content_match.group(1))
693+
if encoding and encoding not in attempted_encodings:
694+
attempted_encodings.add(encoding)
695+
yield encoding
675696

676697
# Finally, fallback to UTF-8, then windows-1252.
677-
yield "utf-8"
678-
yield "windows-1252"
698+
for fallback in ("utf-8", "cp1252"):
699+
if fallback not in attempted_encodings:
700+
yield fallback
679701

680702

681703
def decode_body(

tests/test_preview.py

+35-8
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@ def test_invalid_encoding2(self):
307307
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
308308

309309
def test_windows_1252(self):
310-
"""A body which uses windows-1252, but doesn't declare that."""
310+
"""A body which uses cp1252, but doesn't declare that."""
311311
html = b"""
312312
<html>
313313
<head><title>\xf3</title></head>
@@ -333,7 +333,7 @@ def test_meta_charset(self):
333333
""",
334334
"text/html",
335335
)
336-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
336+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
337337

338338
# A less well-formed version.
339339
encodings = get_html_media_encodings(
@@ -345,7 +345,7 @@ def test_meta_charset(self):
345345
""",
346346
"text/html",
347347
)
348-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
348+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
349349

350350
def test_meta_charset_underscores(self):
351351
"""A character encoding contains underscore."""
@@ -358,7 +358,7 @@ def test_meta_charset_underscores(self):
358358
""",
359359
"text/html",
360360
)
361-
self.assertEqual(list(encodings), ["Shift_JIS", "utf-8", "windows-1252"])
361+
self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
362362

363363
def test_xml_encoding(self):
364364
"""A character encoding is found via the meta tag."""
@@ -370,7 +370,7 @@ def test_xml_encoding(self):
370370
""",
371371
"text/html",
372372
)
373-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
373+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
374374

375375
def test_meta_xml_encoding(self):
376376
"""Meta tags take precedence over XML encoding."""
@@ -384,7 +384,7 @@ def test_meta_xml_encoding(self):
384384
""",
385385
"text/html",
386386
)
387-
self.assertEqual(list(encodings), ["UTF-16", "ascii", "utf-8", "windows-1252"])
387+
self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
388388

389389
def test_content_type(self):
390390
"""A character encoding is found via the Content-Type header."""
@@ -399,9 +399,36 @@ def test_content_type(self):
399399
)
400400
for header in headers:
401401
encodings = get_html_media_encodings(b"", header)
402-
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])
402+
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
403403

404404
def test_fallback(self):
405405
"""A character encoding cannot be found in the body or header."""
406406
encodings = get_html_media_encodings(b"", "text/html")
407-
self.assertEqual(list(encodings), ["utf-8", "windows-1252"])
407+
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
408+
409+
def test_duplicates(self):
410+
"""Ensure each encoding is only attempted once."""
411+
encodings = get_html_media_encodings(
412+
b"""
413+
<?xml version="1.0" encoding="utf8"?>
414+
<html>
415+
<head><meta charset="UTF-8">
416+
</head>
417+
</html>
418+
""",
419+
'text/html; charset="UTF_8"',
420+
)
421+
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
422+
423+
def test_unknown_invalid(self):
424+
"""A character encoding should be ignored if it is unknown or invalid."""
425+
encodings = get_html_media_encodings(
426+
b"""
427+
<html>
428+
<head><meta charset="invalid">
429+
</head>
430+
</html>
431+
""",
432+
'text/html; charset="invalid"',
433+
)
434+
self.assertEqual(list(encodings), ["utf-8", "cp1252"])

0 commit comments

Comments
 (0)