Skip to content

Bugfix/fs encode implementation #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 96 additions & 45 deletions src/backports/os.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,23 @@
"""
from __future__ import unicode_literals

import abc
import sys

# XXX backport: unicode on Python 2
_str = unicode if sys.version_info < (3,) else str
# XXX backport: string and binary types differ between python 2 and 3
string_types = basestring if sys.version_info[0] == 2 else str
binary_type = str if sys.version_info[0] == 2 else bytes

# XXX backport: Use backported surrogateescape for Python 2
# TODO backport: Find a way to do this without pulling in the entire future package?
if sys.version_info < (3,):
from future.utils.surrogateescape import register_surrogateescape
register_surrogateescape()

# XXX This is a compatibility shim for the PathLike backport which gets us fspath access
ABC = abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()})

# XXX backport: This invalid_utf8_indexes() helper is shamelessly copied from
# Bob Ippolito's pyutf8 package (pyutf8/ref.py), in order to help support the
Expand All @@ -42,9 +48,9 @@ def _invalid_utf8_indexes(bytes):
# U+0080 - U+07FF - 11 bits
c = (((c1 & 0x1F) << 6) |
(c2 & 0x3F))
if c < 0x80:
if c < 0x80: # pragma: no cover
# Overlong encoding
skips.extend([i, i + 1])
skips.extend([i, i + 1]) # pragma: no cover
i += 2
continue
c3 = bytes[i + 2]
Expand All @@ -70,7 +76,7 @@ def _invalid_utf8_indexes(bytes):
(c2 & 0x3F)) << 6) |
(c3 & 0x3F)) << 6) |
(c4 & 0x3F))
if (c < 0x10000) or (c > 0x10FFFF):
if (c < 0x10000) or (c > 0x10FFFF): # pragma: no cover
# Overlong encoding or invalid code point.
skips.extend([i, i + 1, i + 2, i + 3])
i += 4
Expand All @@ -92,34 +98,69 @@ def _chunks(b, indexes):
yield b[i:]


def _fscodec():
encoding = sys.getfilesystemencoding()
if encoding == 'mbcs':
errors = 'strict'
def fspath(path):
"""
Fetch the string value from a path-like object

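Strings and bytes are returned unchanged. Raises **TypeError** if the
object is not a string, bytes, or path-like object, or if its
``__fspath__()`` returns something other than a string or bytes.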
"""

if isinstance(path, (string_types, binary_type)):
return path
path_type = type(path)
expect = "unicode" if sys.version_info[0] == 2 else "str"
try:
path_repr = path_type.__fspath__(path)
except AttributeError:
if hasattr(path_type, '__fspath__'):
raise
else:
raise TypeError("expected {0}, bytes or os.PathLike object, "
"not ".format(expect) + path_type.__name__)
if isinstance(path_repr, (string_types, binary_type)):
return path_repr
else:
errors = 'surrogateescape'
raise TypeError("expected {}.__fspath__() to return {} or bytes, "
"not {}".format(path_type.__name__, expect,
type(path_repr).__name__))

# XXX backport: Do we need to hack around Python 2's UTF-8 codec?
import codecs # Use codecs.lookup() for name normalisation.
_HACK_AROUND_PY2_UTF8 = (sys.version_info < (3,) and
codecs.lookup(encoding) == codecs.lookup('utf-8'))
# Do we need to hack around Python 2's ASCII codec error handler behaviour?
_HACK_AROUND_PY2_ASCII = (sys.version_info < (3,) and
codecs.lookup(encoding) == codecs.lookup('ascii'))

# XXX backport: chr(octet) became bytes([octet])
def _fscodec():
# XXX Backport: The following section attempts to use utf-8 encoders to
# roundtrip to the filesystem, and also attempts to force windows to use
# a "surrogate pass" error handling strategy to ignore the bad surrogate
# pairs sometimes generated by python 2 encoders
if sys.version_info[0] < 3:
_fs_encode_errors = "surrogateescape"
_fs_decode_errors = "surrogateescape"
_fs_encoding = "utf-8"
else:
_fs_encoding = "utf-8"
if sys.platform.startswith("win"):
_fs_error_fn = None
alt_strategy = "surrogatepass"
else:
if sys.version_info >= (3, 3):
_fs_encoding = sys.getfilesystemencoding()
if not _fs_encoding:
_fs_encoding = sys.getdefaultencoding()
alt_strategy = "surrogateescape"
_fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None)
_fs_encode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy
_fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy

_byte = chr if sys.version_info < (3,) else lambda i: bytes([i])


def fsencode(filename):
"""Encode filename (an os.PathLike, bytes, or str) to the filesystem
encoding with 'surrogateescape' error handler, return bytes unchanged.
On Windows, use 'strict' error handler if the file system encoding is
'mbcs' (which is the default encoding).
"""
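Encode filename (an os.PathLike, bytes, or str) to the filesystem
encoding; return bytes unchanged. On Python 2 this is UTF-8 with the
'surrogateescape' error handler. On Python 3 under Windows it is UTF-8
with 'surrogatepass'; elsewhere the error handler is
sys.getfilesystemencodeerrors() when available, else 'surrogateescape'.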
"""
if isinstance(filename, bytes):
return filename
elif isinstance(filename, _str):
if _HACK_AROUND_PY2_UTF8 or _HACK_AROUND_PY2_ASCII:
path = fspath(filename)
if isinstance(path, _str):
if sys.version_info[0] < 3:
# XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not
# consider surrogate codepoints invalid, so the surrogateescape
# error handler never gets invoked to encode them back into high
Expand All @@ -140,25 +181,21 @@ def fsencode(filename):
#
return b''.join(
(_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else
c.encode(encoding))
for c in filename)
else:
return filename.encode(encoding, errors)
c.encode(_fs_encoding, _fs_encode_errors))
for c in path)
return path.encode(_fs_encoding, _fs_encode_errors)
else:
# XXX backport: unicode instead of str for Python 2
raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__,
_str=_str.__name__, ))
return path

def fsdecode(filename):
"""Decode filename (an os.PathLike, bytes, or str) from the filesystem
encoding with 'surrogateescape' error handler, return str unchanged. On
Windows, use 'strict' error handler if the file system encoding is
'mbcs' (which is the default encoding).
"""
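Decode filename (an os.PathLike, bytes, or str) from the filesystem
encoding; return text (str, or unicode on Python 2) unchanged. On
Python 2 this is UTF-8 with the 'surrogateescape' error handler. On
Python 3 under Windows it is UTF-8 with 'surrogatepass'; elsewhere the
error handler is sys.getfilesystemencodeerrors() when available, else
'surrogateescape'.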
"""
if isinstance(filename, _str):
return filename
elif isinstance(filename, bytes):
if _HACK_AROUND_PY2_UTF8:
path = fspath(filename)
if isinstance(path, bytes):
if sys.version_info[0] < 3:
# XXX backport: See the remarks in fsencode() above.
#
# This case is slightly trickier: Python 2 will invoke the
Expand All @@ -179,16 +216,30 @@ def fsdecode(filename):
#
from array import array
indexes = _invalid_utf8_indexes(array(str('B'), filename))
return ''.join(chunk.decode(encoding, errors)
return ''.join(chunk.decode(_fs_encoding, _fs_decode_errors)
for chunk in _chunks(filename, indexes))
else:
return filename.decode(encoding, errors)
return path.decode(_fs_encoding, _fs_decode_errors)
else:
# XXX backport: unicode instead of str for Python 2
raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__,
_str=_str.__name__, ))
return path

return fsencode, fsdecode


fsencode, fsdecode = _fscodec()
del _fscodec


# If there is no C implementation, make the pure Python version the
# implementation as transparently as possible.
class PathLike(ABC):

    """Abstract base class for implementing the file system path protocol.

    Any class that defines ``__fspath__`` is recognised as a virtual
    subclass, so it does not have to inherit from ``PathLike`` explicitly.
    """

    @abc.abstractmethod
    def __fspath__(self):
        """Return the file system path representation of the object."""
        raise NotImplementedError

    @classmethod
    def __subclasshook__(cls, subclass):
        """Treat any class exposing ``__fspath__`` as a ``PathLike``.

        Returns True when *cls* is ``PathLike`` itself and *subclass* has an
        ``__fspath__`` attribute; otherwise returns NotImplemented so the
        regular ABC checks (inheritance and ``register()``) decide.
        """
        # The hook is inherited by concrete subclasses. Without the
        # ``cls is PathLike`` guard, every class that has ``__fspath__``
        # would also be reported as a subclass of each of those subclasses.
        if cls is PathLike and hasattr(subclass, '__fspath__'):
            return True
        # Returning False here would override PathLike.register(); defer to
        # the default machinery instead.
        return NotImplemented
98 changes: 91 additions & 7 deletions tests/test_extra.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@
from backports import os

import unittest
from hypothesis import given, example
from hypothesis import assume, given, example
from hypothesis.strategies import text, binary

# SKIP_CONDITIONS:
IS_WIN = sys.platform.startswith("win")
IS_PY3 = sys.version_info[0] == 3

# Example data:

HIGH_BYTES = (
SURROGATE_ESCAPE_HIGH_BYTES = (
b'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f'
b'\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f'
b'\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
Expand All @@ -39,6 +43,41 @@
'\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff'
)

SURROGATE_PASS_HIGH_BYTES = (
b'\xed\xb2\x80\xed\xb2\x81\xed\xb2\x82\xed\xb2\x83\xed\xb2\x84\xed'
b'\xb2\x85\xed\xb2\x86\xed\xb2\x87\xed\xb2\x88\xed\xb2\x89\xed\xb2'
b'\x8a\xed\xb2\x8b\xed\xb2\x8c\xed\xb2\x8d\xed\xb2\x8e\xed\xb2\x8f'
b'\xed\xb2\x90\xed\xb2\x91\xed\xb2\x92\xed\xb2\x93\xed\xb2\x94\xed'
b'\xb2\x95\xed\xb2\x96\xed\xb2\x97\xed\xb2\x98\xed\xb2\x99\xed\xb2'
b'\x9a\xed\xb2\x9b\xed\xb2\x9c\xed\xb2\x9d\xed\xb2\x9e\xed\xb2\x9f'
b'\xed\xb2\xa0\xed\xb2\xa1\xed\xb2\xa2\xed\xb2\xa3\xed\xb2\xa4\xed'
b'\xb2\xa5\xed\xb2\xa6\xed\xb2\xa7\xed\xb2\xa8\xed\xb2\xa9\xed\xb2'
b'\xaa\xed\xb2\xab\xed\xb2\xac\xed\xb2\xad\xed\xb2\xae\xed\xb2\xaf'
b'\xed\xb2\xb0\xed\xb2\xb1\xed\xb2\xb2\xed\xb2\xb3\xed\xb2\xb4\xed'
b'\xb2\xb5\xed\xb2\xb6\xed\xb2\xb7\xed\xb2\xb8\xed\xb2\xb9\xed\xb2'
b'\xba\xed\xb2\xbb\xed\xb2\xbc\xed\xb2\xbd\xed\xb2\xbe\xed\xb2\xbf'
b'\xed\xb3\x80\xed\xb3\x81\xed\xb3\x82\xed\xb3\x83\xed\xb3\x84\xed'
b'\xb3\x85\xed\xb3\x86\xed\xb3\x87\xed\xb3\x88\xed\xb3\x89\xed\xb3'
b'\x8a\xed\xb3\x8b\xed\xb3\x8c\xed\xb3\x8d\xed\xb3\x8e\xed\xb3\x8f'
b'\xed\xb3\x90\xed\xb3\x91\xed\xb3\x92\xed\xb3\x93\xed\xb3\x94\xed'
b'\xb3\x95\xed\xb3\x96\xed\xb3\x97\xed\xb3\x98\xed\xb3\x99\xed\xb3'
b'\x9a\xed\xb3\x9b\xed\xb3\x9c\xed\xb3\x9d\xed\xb3\x9e\xed\xb3\x9f'
b'\xed\xb3\xa0\xed\xb3\xa1\xed\xb3\xa2\xed\xb3\xa3\xed\xb3\xa4\xed'
b'\xb3\xa5\xed\xb3\xa6\xed\xb3\xa7\xed\xb3\xa8\xed\xb3\xa9\xed\xb3'
b'\xaa\xed\xb3\xab\xed\xb3\xac\xed\xb3\xad\xed\xb3\xae\xed\xb3\xaf'
b'\xed\xb3\xb0\xed\xb3\xb1\xed\xb3\xb2\xed\xb3\xb3\xed\xb3\xb4\xed'
b'\xb3\xb5\xed\xb3\xb6\xed\xb3\xb7\xed\xb3\xb8\xed\xb3\xb9\xed\xb3'
b'\xba\xed\xb3\xbb\xed\xb3\xbc\xed\xb3\xbd\xed\xb3\xbe\xed\xb3\xbf'
)


# Use surrogate pass for encoding on windows on python 3+ to ensure
# we can decode them as the native decoder uses surrogate escape
if IS_WIN and IS_PY3:
HIGH_BYTES = SURROGATE_PASS_HIGH_BYTES
else:
HIGH_BYTES = SURROGATE_ESCAPE_HIGH_BYTES

# A U+DC80 surrogate encoded as (invalid) UTF-8.
#
# Python 3 correctly rejects this when encoding to or from UTF-8, but
Expand Down Expand Up @@ -75,21 +114,48 @@ def test_decode_surrogates(self):
def test_text_roundtrip(self, s):
self.assertEqual(os.fsdecode(os.fsencode(s)), s)

@unittest.skipIf(
IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN,
"Backport doesn't align with native implementation on win on or before python 3.5"
)
@given(binary())
@example(HIGH_BYTES)
@example(UTF8_ENCODED_SURROGATE)
def test_binary_roundtrip(self, b):
self.assertEqual(os.fsencode(os.fsdecode(b)), b)
# in python 3 on windows, the native implementation of os.fsdecode
# always relies on `surrogatepass` as the error handler, which means
# it will fail on surrogates (which are not unicode compatible)
# so if we fail to decode something under those circumstances we should
# verify that the native implementation also fails.
rt1 = None
try:
rt1 = os.fsdecode(b)
except Exception as e:
if IS_WIN and IS_PY3:
self.assertRaises(type(e), real_os.fsdecode, b)
else:
raise
else:
try:
roundtripped = os.fsencode(rt1)
except Exception as e:
if IS_WIN and IS_PY3:
self.assertRaises(type(e), real_os.fsencode, rt1)
else:
raise
else:
self.assertEqual(roundtripped, b)

def test_TypeError(self):
def assertTypeError(value, expected_message):
for f in [os.fsencode, os.fsdecode]:

with self.assertRaises(TypeError) as cm:
f(value)
self.assertEqual(str(cm.exception), expected_message)

pre = 'expect bytes or {}, not '.format(
'unicode' if sys.version_info < (3,) else 'str')
pre = 'expected {0}, bytes or os.PathLike object, not '.format(
'unicode' if sys.version_info < (3,) else 'str'
)
assertTypeError(None, pre + 'NoneType')
assertTypeError(5, pre + 'int')
assertTypeError([], pre + 'list')
Expand All @@ -102,16 +168,34 @@ class TestAgainstPython3(unittest.TestCase):
On Python 3, the backported implementations should match the standard library.
"""

@unittest.skipIf(
IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN,
"Backport doesn't align with native implementation on win on or before python 3.5"
)
@given(encodable_text())
@example(HIGH_SURROGATES)
def test_encode_text(self, s):
self.assertEqual(os.fsencode(s), real_os.fsencode(s))

@unittest.skipIf(
IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN,
"Backport doesn't align with native implementation on win on or before python 3.5"
)
@given(binary())
@example(HIGH_BYTES)
@example(UTF8_ENCODED_SURROGATE)
def test_decode_binary(self, b):
self.assertEqual(os.fsdecode(b), real_os.fsdecode(b))
# On Python 3 (notably on Windows) the backport should never be able
# to decode anything that the native os module cannot decode, so
# catch whatever exception the native os.fsdecode raises and expect
# the backport to raise an exception of the same type.
try:
real_os_val = real_os.fsdecode(b)
except Exception as e:
self.assertRaises(type(e), os.fsdecode, b)
else:
self.assertEqual(os.fsdecode(b), real_os_val)

@given(binary())
@example(HIGH_BYTES)
Expand Down