PiDelport · techalchemy · Apr 4, 2019 · Apr 4, 2019 · Apr 4, 2019 · Apr 4, 2019
diff --git a/src/backports/os.py b/src/backports/os.py
@@ -8,17 +8,23 @@
 """
 from __future__ import unicode_literals
 
+import abc
 import sys
 
 # XXX backport: unicode on Python 2
 _str = unicode if sys.version_info < (3,) else str
+# XXX backport: string and binary types differ between python 2 and 3
+string_types = basestring if sys.version_info[0] == 2 else str
+binary_type = str if sys.version_info[0] == 2 else bytes
 
 # XXX backport: Use backported surrogateescape for Python 2
 # TODO backport: Find a way to do this without pulling in the entire future package?
 if sys.version_info < (3,):
     from future.utils.surrogateescape import register_surrogateescape
     register_surrogateescape()
 
+# XXX This is a compatibility shim for the PathLike backport which gets us fspath access
+ABC = abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()})
 
 # XXX backport: This invalid_utf8_indexes() helper is shamelessly copied from
 # Bob Ippolito's pyutf8 package (pyutf8/ref.py), in order to help support the
@@ -42,9 +48,9 @@ def _invalid_utf8_indexes(bytes):
                 # U+0080 - U+07FF - 11 bits
                 c = (((c1 & 0x1F) << 6) |
                      (c2 & 0x3F))
-                if c < 0x80:
+                if c < 0x80:  # pragma: no cover
                     # Overlong encoding
-                    skips.extend([i, i + 1])
+                    skips.extend([i, i + 1])  # pragma: no cover
                 i += 2
                 continue
             c3 = bytes[i + 2]
@@ -70,7 +76,7 @@ def _invalid_utf8_indexes(bytes):
                          (c2 & 0x3F)) << 6) |
                        (c3 & 0x3F)) << 6) |
                      (c4 & 0x3F))
-                if (c < 0x10000) or (c > 0x10FFFF):
+                if (c < 0x10000) or (c > 0x10FFFF):  # pragma: no cover
                     # Overlong encoding or invalid code point.
                     skips.extend([i, i + 1, i + 2, i + 3])
                 i += 4
@@ -92,34 +98,69 @@ def _chunks(b, indexes):
     yield b[i:]
 
 
-def _fscodec():
-    encoding = sys.getfilesystemencoding()
-    if encoding == 'mbcs':
-        errors = 'strict'
+def fspath(path):
+    """
+    Fetch the string value from a path-like object
+
+    Raises **TypeError** if no string or bytes value can be obtained.
+    """
+
+    if isinstance(path, (string_types, binary_type)):
+        return path
+    path_type = type(path)
+    expect = "unicode" if sys.version_info[0] == 2 else "str"
+    try:
+        path_repr = path_type.__fspath__(path)
+    except AttributeError:
+        if hasattr(path_type, '__fspath__'):
+            raise
+        else:
+            raise TypeError("expected {0}, bytes or os.PathLike object, "
+                            "not ".format(expect) + path_type.__name__)
+    if isinstance(path_repr, (string_types, binary_type)):
+        return path_repr
     else:
-        errors = 'surrogateescape'
+        raise TypeError("expected {}.__fspath__() to return {} or bytes, "
+                        "not {}".format(path_type.__name__, expect,
+                                        type(path_repr).__name__))
 
-    # XXX backport: Do we need to hack around Python 2's UTF-8 codec?
-    import codecs  # Use codecs.lookup() for name normalisation.
-    _HACK_AROUND_PY2_UTF8 = (sys.version_info < (3,) and
-                             codecs.lookup(encoding) == codecs.lookup('utf-8'))
-    # Do we need to hack around Python 2's ASCII codec error handler behaviour?
-    _HACK_AROUND_PY2_ASCII = (sys.version_info < (3,) and
-                              codecs.lookup(encoding) == codecs.lookup('ascii'))
 
-    # XXX backport: chr(octet) became bytes([octet])
+def _fscodec():
+    # XXX Backport: The following section attempts to use utf-8 encoders to
+    # roundtrip to the filesystem, and also attempts to force windows to use
+    # a "surrogate pass" error handling strategy to ignore the bad surrogate
+    # pairs sometimes generated by python 2 encoders
+    if sys.version_info[0] < 3:
+        _fs_encode_errors = "surrogateescape"
+        _fs_decode_errors = "surrogateescape"
+        _fs_encoding = "utf-8"
+    else:
+        _fs_encoding = "utf-8"
+        if sys.platform.startswith("win"):
+            _fs_error_fn = None
+            alt_strategy = "surrogatepass"
+        else:
+            if sys.version_info >= (3, 3):
+                _fs_encoding = sys.getfilesystemencoding()
+                if not _fs_encoding:
+                    _fs_encoding = sys.getdefaultencoding()
+            alt_strategy = "surrogateescape"
+            _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None)
+        _fs_encode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy
+        _fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy
+
     _byte = chr if sys.version_info < (3,) else lambda i: bytes([i])
 
+
     def fsencode(filename):
+        """Encode filename (an os.PathLike, bytes, or str) to bytes and
+        return bytes unchanged. The codec is chosen in _fscodec(): UTF-8 with
+        'surrogatepass' on Windows under Python 3; elsewhere 'surrogateescape'
+        unless sys.getfilesystemencodeerrors() reports another handler.
         """
-        Encode filename to the filesystem encoding with 'surrogateescape' error
-        handler, return bytes unchanged. On Windows, use 'strict' error handler if
-        the file system encoding is 'mbcs' (which is the default encoding).
-        """
-        if isinstance(filename, bytes):
-            return filename
-        elif isinstance(filename, _str):
-            if _HACK_AROUND_PY2_UTF8 or _HACK_AROUND_PY2_ASCII:
+        path = fspath(filename)
+        if isinstance(path, _str):
+            if sys.version_info[0] < 3:
                 # XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not
                 # consider surrogate codepoints invalid, so the surrogateescape
                 # error handler never gets invoked to encode them back into high
@@ -140,25 +181,21 @@ def fsencode(filename):
                 #
                 return b''.join(
                     (_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else
-                     c.encode(encoding))
-                    for c in filename)
-            else:
-                return filename.encode(encoding, errors)
+                     c.encode(_fs_encoding, _fs_encode_errors))
+                    for c in path)
+            return path.encode(_fs_encoding, _fs_encode_errors)
         else:
-            # XXX backport: unicode instead of str for Python 2
-            raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__,
-                                                                    _str=_str.__name__, ))
+            return path
 
     def fsdecode(filename):
+        """Decode filename (an os.PathLike, bytes, or str) to text and
+        return text unchanged. The codec is chosen in _fscodec(): UTF-8 with
+        'surrogatepass' on Windows under Python 3; elsewhere 'surrogateescape'
+        unless sys.getfilesystemencodeerrors() reports another handler.
         """
-        Decode filename from the filesystem encoding with 'surrogateescape' error
-        handler, return str unchanged. On Windows, use 'strict' error handler if
-        the file system encoding is 'mbcs' (which is the default encoding).
-        """
-        if isinstance(filename, _str):
-            return filename
-        elif isinstance(filename, bytes):
-            if _HACK_AROUND_PY2_UTF8:
+        path = fspath(filename)
+        if isinstance(path, bytes):
+            if sys.version_info[0] < 3:
                 # XXX backport: See the remarks in fsencode() above.
                 #
                 # This case is slightly trickier: Python 2 will invoke the
@@ -179,16 +216,30 @@ def fsdecode(filename):
                 #
                 from array import array
                 indexes = _invalid_utf8_indexes(array(str('B'), filename))
-                return ''.join(chunk.decode(encoding, errors)
+                return ''.join(chunk.decode(_fs_encoding, _fs_decode_errors)
                                for chunk in _chunks(filename, indexes))
-            else:
-                return filename.decode(encoding, errors)
+            return path.decode(_fs_encoding, _fs_decode_errors)
         else:
-            # XXX backport: unicode instead of str for Python 2
-            raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__,
-                                                                    _str=_str.__name__, ))
+            return path
 
     return fsencode, fsdecode
 
+
 fsencode, fsdecode = _fscodec()
 del _fscodec
+
+
+# XXX backport: Pure-Python PathLike, defined unconditionally here (this
+# module does not look for a C implementation to defer to).
+class PathLike(ABC):
+
+    """Abstract base class for implementing the file system path protocol."""
+
+    @abc.abstractmethod
+    def __fspath__(self):
+        """Return the file system path representation of the object."""
+        raise NotImplementedError
+
+    @classmethod
+    def __subclasshook__(cls, subclass):
+        return hasattr(subclass, '__fspath__')
diff --git a/tests/test_extra.py b/tests/test_extra.py
@@ -12,12 +12,16 @@
 from backports import os
 
 import unittest
-from hypothesis import given, example
+from hypothesis import assume, given, example
 from hypothesis.strategies import text, binary
 
+# SKIP_CONDITIONS:
+IS_WIN = sys.platform.startswith("win")
+IS_PY3 = sys.version_info[0] == 3
+
 # Example data:
 
-HIGH_BYTES = (
+SURROGATE_ESCAPE_HIGH_BYTES = (
     b'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f'
     b'\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f'
     b'\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
@@ -39,6 +43,41 @@
     '\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff'
 )
 
+SURROGATE_PASS_HIGH_BYTES = (
+    b'\xed\xb2\x80\xed\xb2\x81\xed\xb2\x82\xed\xb2\x83\xed\xb2\x84\xed'
+    b'\xb2\x85\xed\xb2\x86\xed\xb2\x87\xed\xb2\x88\xed\xb2\x89\xed\xb2'
+    b'\x8a\xed\xb2\x8b\xed\xb2\x8c\xed\xb2\x8d\xed\xb2\x8e\xed\xb2\x8f'
+    b'\xed\xb2\x90\xed\xb2\x91\xed\xb2\x92\xed\xb2\x93\xed\xb2\x94\xed'
+    b'\xb2\x95\xed\xb2\x96\xed\xb2\x97\xed\xb2\x98\xed\xb2\x99\xed\xb2'
+    b'\x9a\xed\xb2\x9b\xed\xb2\x9c\xed\xb2\x9d\xed\xb2\x9e\xed\xb2\x9f'
+    b'\xed\xb2\xa0\xed\xb2\xa1\xed\xb2\xa2\xed\xb2\xa3\xed\xb2\xa4\xed'
+    b'\xb2\xa5\xed\xb2\xa6\xed\xb2\xa7\xed\xb2\xa8\xed\xb2\xa9\xed\xb2'
+    b'\xaa\xed\xb2\xab\xed\xb2\xac\xed\xb2\xad\xed\xb2\xae\xed\xb2\xaf'
+    b'\xed\xb2\xb0\xed\xb2\xb1\xed\xb2\xb2\xed\xb2\xb3\xed\xb2\xb4\xed'
+    b'\xb2\xb5\xed\xb2\xb6\xed\xb2\xb7\xed\xb2\xb8\xed\xb2\xb9\xed\xb2'
+    b'\xba\xed\xb2\xbb\xed\xb2\xbc\xed\xb2\xbd\xed\xb2\xbe\xed\xb2\xbf'
+    b'\xed\xb3\x80\xed\xb3\x81\xed\xb3\x82\xed\xb3\x83\xed\xb3\x84\xed'
+    b'\xb3\x85\xed\xb3\x86\xed\xb3\x87\xed\xb3\x88\xed\xb3\x89\xed\xb3'
+    b'\x8a\xed\xb3\x8b\xed\xb3\x8c\xed\xb3\x8d\xed\xb3\x8e\xed\xb3\x8f'
+    b'\xed\xb3\x90\xed\xb3\x91\xed\xb3\x92\xed\xb3\x93\xed\xb3\x94\xed'
+    b'\xb3\x95\xed\xb3\x96\xed\xb3\x97\xed\xb3\x98\xed\xb3\x99\xed\xb3'
+    b'\x9a\xed\xb3\x9b\xed\xb3\x9c\xed\xb3\x9d\xed\xb3\x9e\xed\xb3\x9f'
+    b'\xed\xb3\xa0\xed\xb3\xa1\xed\xb3\xa2\xed\xb3\xa3\xed\xb3\xa4\xed'
+    b'\xb3\xa5\xed\xb3\xa6\xed\xb3\xa7\xed\xb3\xa8\xed\xb3\xa9\xed\xb3'
+    b'\xaa\xed\xb3\xab\xed\xb3\xac\xed\xb3\xad\xed\xb3\xae\xed\xb3\xaf'
+    b'\xed\xb3\xb0\xed\xb3\xb1\xed\xb3\xb2\xed\xb3\xb3\xed\xb3\xb4\xed'
+    b'\xb3\xb5\xed\xb3\xb6\xed\xb3\xb7\xed\xb3\xb8\xed\xb3\xb9\xed\xb3'
+    b'\xba\xed\xb3\xbb\xed\xb3\xbc\xed\xb3\xbd\xed\xb3\xbe\xed\xb3\xbf'
+)
+
+
+# Use surrogate pass for encoding on windows on python 3+ to ensure
+# we can decode them as the native decoder uses surrogate escape
+if IS_WIN and IS_PY3:
+    HIGH_BYTES = SURROGATE_PASS_HIGH_BYTES
+else:
+    HIGH_BYTES = SURROGATE_ESCAPE_HIGH_BYTES
+
 # A U+DC80 surrogate encoded as (invalid) UTF-8.
 #
 # Python 3 correctly rejects this when encoding to or from UTF-8, but
@@ -75,21 +114,48 @@ def test_decode_surrogates(self):
     def test_text_roundtrip(self, s):
         self.assertEqual(os.fsdecode(os.fsencode(s)), s)
 
+    @unittest.skipIf(
+        IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN,
+        "Backport doesn't align with native implementation on win on or before python 3.5"
+    )
     @given(binary())
     @example(HIGH_BYTES)
     @example(UTF8_ENCODED_SURROGATE)
     def test_binary_roundtrip(self, b):
-        self.assertEqual(os.fsencode(os.fsdecode(b)), b)
+        # in python 3 on windows, the native implementation of os.fsdecode
+        # always relies on `surrogatepass` as the error handler, which means
+        # it will fail on surrogates (which are not unicode compatible)
+        # so if we fail to decode something under those circumstances we should
+        # verify that the native implementation also fails.
+        rt1 = None
+        try:
+            rt1 = os.fsdecode(b)
+        except Exception as e:
+            if IS_WIN and IS_PY3:
+                self.assertRaises(type(e), real_os.fsdecode, b)
+            else:
+                raise
+        else:
+            try:
+                roundtripped = os.fsencode(rt1)
+            except Exception as e:
+                if IS_WIN and IS_PY3:
+                    self.assertRaises(type(e), real_os.fsencode, rt1)
+                else:
+                    raise
+            else:
+                self.assertEqual(roundtripped, b)
 
     def test_TypeError(self):
         def assertTypeError(value, expected_message):
             for f in [os.fsencode, os.fsdecode]:
+
                 with self.assertRaises(TypeError) as cm:
                     f(value)
                 self.assertEqual(str(cm.exception), expected_message)
-
-        pre = 'expect bytes or {}, not '.format(
-            'unicode' if sys.version_info < (3,) else 'str')
+        pre = 'expected {0}, bytes or os.PathLike object, not '.format(
+            'unicode' if sys.version_info < (3,) else 'str'
+        )
         assertTypeError(None, pre + 'NoneType')
         assertTypeError(5, pre + 'int')
         assertTypeError([], pre + 'list')
@@ -102,16 +168,34 @@ class TestAgainstPython3(unittest.TestCase):
     On Python 3, the backported implementations should match the standard library.
     """
 
+    @unittest.skipIf(
+        IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN,
+        "Backport doesn't align with native implementation on win on or before python 3.5"
+    )
     @given(encodable_text())
     @example(HIGH_SURROGATES)
     def test_encode_text(self, s):
         self.assertEqual(os.fsencode(s), real_os.fsencode(s))
 
+    @unittest.skipIf(
+        IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN,
+        "Backport doesn't align with native implementation on win on or before python 3.5"
+    )
     @given(binary())
     @example(HIGH_BYTES)
     @example(UTF8_ENCODED_SURROGATE)
     def test_decode_binary(self, b):
-        self.assertEqual(os.fsdecode(b), real_os.fsdecode(b))
+        # Python 3 on windows will never be able to decode things
+        # in the backported library that it can't also decode
+        # in the original OS module implementation, so let's just catch
+        # the exceptions thrown by the os module and expect them
+        # to be raised by the backport
+        try:
+            real_os_val = real_os.fsdecode(b)
+        except Exception as e:
+            self.assertRaises(type(e), os.fsdecode, b)
+        else:
+            self.assertEqual(os.fsdecode(b), real_os_val)
 
     @given(binary())
     @example(HIGH_BYTES)