Skip to content

Commit adc4f99

Browse files
Reorder requirements file decoding (#12795)
This changes the decoding process to more closely match the encoding rules in the requirements file format specification. The `auto_decode` function was removed and all decoding logic moved to the `pip._internal.req.req_file` module because:

* the function was only ever used to decode requirements files;
* it was never really a generic 'util' function — it was always tied to the idiosyncrasies of decoding requirements files;
* the module lived under `_internal`, so I felt comfortable removing it.

A warning was added for when we _do_ fall back to using the locale-defined encoding, to encourage users to move to an explicit encoding definition via a coding-style comment.

This fixes two existing bugs. Firstly, when:

* a requirements file is encoded as UTF-8, and
* some bytes in the file are incompatible with the system locale,

previously, assuming no BOM or PEP-263 style comment, we would default to using the encoding from the system locale, which would then fail (see issue #12771). Now UTF-8 is tried first, before the system locale.

Secondly, when decoding a file starting with a UTF-32 little-endian Byte Order Mark. Previously this would always fail, since `codecs.BOM_UTF32_LE` is `codecs.BOM_UTF16_LE` followed by two null bytes, and because of the ordering of the list of BOMs the UTF-16 case would be run first and match the file prefix, so we would incorrectly deduce that the file was UTF-16 little-endian encoded. I can't imagine this is a popular encoding for a requirements file.

Fixes: #12771
1 parent 6b0fb90 commit adc4f99

File tree

6 files changed

+171
-86
lines changed

6 files changed

+171
-86
lines changed

docs/html/reference/requirements-file-format.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ examples of all these forms, see {ref}`pip install Examples`.
5656

5757
### Encoding
5858

59-
Requirements files are `utf-8` encoding by default and also support
60-
{pep}`263` style comments to change the encoding (i.e.
61-
`# -*- coding: <encoding name> -*-`).
59+
The default encoding for requirement files is `UTF-8` unless a different
60+
encoding is specified using a {pep}`263` style comment (e.g. `# -*- coding:
61+
<encoding name> -*-`).
6262

6363
### Line continuations
6464

news/12771.feature.rst

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Reorder the encoding detection when decoding a requirements file, relying on
2+
UTF-8 over the locale encoding by default.

src/pip/_internal/req/req_file.py

+51-2
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
Requirements file parsing
33
"""
44

5+
import codecs
6+
import locale
57
import logging
68
import optparse
79
import os
810
import re
911
import shlex
12+
import sys
1013
import urllib.parse
1114
from dataclasses import dataclass
1215
from optparse import Values
@@ -26,7 +29,6 @@
2629
from pip._internal.cli import cmdoptions
2730
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
2831
from pip._internal.models.search_scope import SearchScope
29-
from pip._internal.utils.encoding import auto_decode
3032

3133
if TYPE_CHECKING:
3234
from pip._internal.index.package_finder import PackageFinder
@@ -82,6 +84,21 @@
8284
str(o().dest) for o in SUPPORTED_OPTIONS_EDITABLE_REQ
8385
]
8486

87+
# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
BOMS: List[Tuple[bytes, str]] = [
    (codecs.BOM_UTF8, "utf-8"),
    (codecs.BOM_UTF32, "utf-32"),
    (codecs.BOM_UTF32_BE, "utf-32-be"),
    (codecs.BOM_UTF32_LE, "utf-32-le"),
    (codecs.BOM_UTF16, "utf-16"),
    (codecs.BOM_UTF16_BE, "utf-16-be"),
    (codecs.BOM_UTF16_LE, "utf-16-le"),
]

# PEP-263 style encoding declaration, e.g. "# -*- coding: <name> -*-".
# Only the "coding[:=] <name>" marker is significant; the decorative "-*-"
# parts are optional per PEP 263.
PEP263_ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
# Encoding tried before the locale fallback when no BOM or PEP-263
# comment is present (per the requirements file format specification).
DEFAULT_ENCODING = "utf-8"
101+
85102
logger = logging.getLogger(__name__)
86103

87104

@@ -568,7 +585,39 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
568585
# Assume this is a bare path.
569586
try:
570587
with open(url, "rb") as f:
571-
content = auto_decode(f.read())
588+
raw_content = f.read()
572589
except OSError as exc:
573590
raise InstallationError(f"Could not open requirements file: {exc}")
591+
592+
content = _decode_req_file(raw_content, url)
593+
574594
return url, content
595+
596+
597+
def _decode_req_file(data: bytes, url: str) -> str:
    """Decode raw requirements-file bytes into text.

    Detection order (mirroring the requirements file format specification):

    1. A Unicode BOM prefix (stripped before decoding).
    2. A PEP-263 style ``# -*- coding: <name> -*-`` comment in one of the
       first two lines.
    3. ``DEFAULT_ENCODING`` (UTF-8).
    4. As a last resort, the locale's preferred encoding, with a warning
       nudging users towards an explicit PEP-263 comment.

    :param data: raw bytes read from the requirements file
    :param url: the file's URL or path, used only in the fallback warning
    :raises UnicodeDecodeError: if even the locale fallback cannot decode
    """
    # BOMS is ordered so that longer BOMs are tested before their prefixes
    # (e.g. UTF-32-LE before UTF-16-LE), otherwise the prefix would match first.
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return data[len(bom) :].decode(encoding)

    # PEP 263 allows the coding comment on either of the first two lines.
    for line in data.split(b"\n")[:2]:
        if line[0:1] == b"#":
            result = PEP263_ENCODING_RE.search(line)
            if result is not None:
                encoding = result.groups()[0].decode("ascii")
                return data.decode(encoding)

    try:
        return data.decode(DEFAULT_ENCODING)
    except UnicodeDecodeError:
        locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
        # Use the module-level logger (not logging.warning on the root logger)
        # so the message goes through pip's configured logging, consistent
        # with the rest of this module.
        logger.warning(
            "unable to decode data from %s with default encoding %s, "
            "falling back to encoding from locale: %s. "
            "If this is intentional you should specify the encoding with a "
            "PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
            url,
            DEFAULT_ENCODING,
            locale_encoding,
            locale_encoding,
        )
        return data.decode(locale_encoding)

src/pip/_internal/utils/encoding.py

-36
This file was deleted.

tests/unit/test_req_file.py

+114
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import codecs
12
import collections
23
import logging
34
import os
@@ -955,3 +956,116 @@ def test_install_requirements_with_options(
955956
)
956957

957958
assert req.global_options == [global_option]
959+
960+
@pytest.mark.parametrize(
    "raw_req_file,expected_name,expected_spec",
    [
        pytest.param(
            b"Django==1.4.2",
            "Django",
            "==1.4.2",
            id="defaults to UTF-8",
        ),
        pytest.param(
            # latin-1 bytes for "café" are not valid UTF-8, so this only
            # parses if the PEP-263 header is honoured.
            "# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
            "Django",
            "==1.4.2",
            id="decodes based on PEP-263 style headers",
        ),
    ],
)
def test_general_decoding(
    self,
    raw_req_file: bytes,
    expected_name: str,
    expected_spec: str,
    tmpdir: Path,
    session: PipSession,
) -> None:
    """Requirements bytes decode via the UTF-8 default or a PEP-263 comment."""
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(raw_req_file)

    reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

    # Each fixture holds exactly one requirement; comments are stripped.
    assert len(reqs) == 1
    assert reqs[0].name == expected_name
    assert reqs[0].specifier == expected_spec
993+
994+
@pytest.mark.parametrize(
    "bom,encoding",
    [
        (codecs.BOM_UTF8, "utf-8"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
        (codecs.BOM_UTF32_BE, "utf-32-be"),
        (codecs.BOM_UTF32_LE, "utf-32-le"),
        # BOM automatically added when encoding byte-order dependent encodings
        (b"", "utf-16"),
        (b"", "utf-32"),
    ],
)
def test_decoding_with_BOM(
    self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
) -> None:
    """A leading BOM selects the decoding, overriding the UTF-8 default."""
    req_name = "Django"
    req_specifier = "==1.4.2"
    # Prefix the encoded requirement with the BOM under test (empty for
    # the generic utf-16/utf-32 codecs, which emit their own BOM).
    encoded_contents = bom + f"{req_name}{req_specifier}".encode(encoding)
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(encoded_contents)

    reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

    assert len(reqs) == 1
    assert reqs[0].name == req_name
    assert reqs[0].specifier == req_specifier
1021+
1022+
def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
    self,
    tmpdir: Path,
    session: PipSession,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """When UTF-8 decoding fails, pip falls back to the locale encoding
    and emits exactly one warning telling the user to add a PEP-263 comment.
    """
    # \xff is valid in latin-1 but not UTF-8
    data = b"pip<=24.0 # some comment\xff\n"
    locale_encoding = "latin-1"
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(data)

    # it's hard to rely on a locale definitely existing for testing
    # so patch things out for simplicity
    with caplog.at_level(logging.WARNING), mock.patch(
        "locale.getpreferredencoding", return_value=locale_encoding
    ):
        reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

    # Exactly one warning, with lazy %-style args left unformatted on the
    # record (msg and args are checked separately).
    assert len(caplog.records) == 1
    assert (
        caplog.records[0].msg
        == "unable to decode data from %s with default encoding %s, "
        "falling back to encoding from locale: %s. "
        "If this is intentional you should specify the encoding with a "
        "PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
    )
    assert caplog.records[0].args == (
        str(req_file),
        "utf-8",
        locale_encoding,
        locale_encoding,
    )

    # Despite the fallback, the requirement still parses correctly.
    assert len(reqs) == 1
    assert reqs[0].name == "pip"
    assert str(reqs[0].specifier) == "<=24.0"
1059+
1060+
@pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
def test_errors_on_non_decodable_data(
    self, encoding: str, tmpdir: Path, session: PipSession
) -> None:
    """If neither UTF-8 nor the (patched) locale encoding can decode the
    file, the UnicodeDecodeError propagates to the caller.
    """
    # A lone \xff byte is invalid in both parametrized locale encodings.
    data = b"\xff"
    req_file = tmpdir / "requirements.txt"
    req_file.write_bytes(data)

    with pytest.raises(UnicodeDecodeError), mock.patch(
        "locale.getpreferredencoding", return_value=encoding
    ):
        next(parse_reqfile(req_file.resolve(), session=session))

tests/unit/test_utils.py

+1-45
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
44
"""
55

6-
import codecs
76
import os
87
import shutil
98
import stat
@@ -12,7 +11,7 @@
1211
from io import BytesIO
1312
from pathlib import Path
1413
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
15-
from unittest.mock import Mock, patch
14+
from unittest.mock import Mock
1615

1716
import pytest
1817

@@ -21,7 +20,6 @@
2120
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
2221
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
2322
from pip._internal.utils.egg_link import egg_link_path_from_location
24-
from pip._internal.utils.encoding import BOMS, auto_decode
2523
from pip._internal.utils.glibc import (
2624
glibc_version_string,
2725
glibc_version_string_confstr,
@@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
445443
assert not empty_hashes.has_one_of({"sha256": "xyzt"})
446444

447445

448-
class TestEncoding:
449-
"""Tests for pip._internal.utils.encoding"""
450-
451-
def test_auto_decode_utf_16_le(self) -> None:
452-
data = (
453-
b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
454-
b"=\x001\x00.\x004\x00.\x002\x00"
455-
)
456-
assert data.startswith(codecs.BOM_UTF16_LE)
457-
assert auto_decode(data) == "Django==1.4.2"
458-
459-
def test_auto_decode_utf_16_be(self) -> None:
460-
data = (
461-
b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
462-
b"\x00=\x001\x00.\x004\x00.\x002"
463-
)
464-
assert data.startswith(codecs.BOM_UTF16_BE)
465-
assert auto_decode(data) == "Django==1.4.2"
466-
467-
def test_auto_decode_no_bom(self) -> None:
468-
assert auto_decode(b"foobar") == "foobar"
469-
470-
def test_auto_decode_pep263_headers(self) -> None:
471-
latin1_req = "# coding=latin1\n# Pas trop de café"
472-
assert auto_decode(latin1_req.encode("latin1")) == latin1_req
473-
474-
def test_auto_decode_no_preferred_encoding(self) -> None:
475-
om, em = Mock(), Mock()
476-
om.return_value = "ascii"
477-
em.return_value = None
478-
data = "data"
479-
with patch("sys.getdefaultencoding", om):
480-
with patch("locale.getpreferredencoding", em):
481-
ret = auto_decode(data.encode(sys.getdefaultencoding()))
482-
assert ret == data
483-
484-
@pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
485-
def test_all_encodings_are_valid(self, encoding: str) -> None:
486-
# we really only care that there is no LookupError
487-
assert "".encode(encoding).decode(encoding) == ""
488-
489-
490446
def raises(error: Type[Exception]) -> NoReturn:
491447
raise error
492448

0 commit comments

Comments
 (0)