orestis
diff --git a/‎Doc/library/codecs.rst‎
Lines changed: 3 additions & 1 deletion b/‎Doc/library/codecs.rst‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎Doc/library/os.rst‎
Lines changed: 28 additions & 10 deletions b/‎Doc/library/os.rst‎
Lines changed: 28 additions & 10 deletions
diff --git a/‎Include/unicodeobject.h‎
Lines changed: 29 additions & 19 deletions b/‎Include/unicodeobject.h‎
Lines changed: 29 additions & 19 deletions
diff --git a/‎Lib/test/test_codecs.py‎
Lines changed: 29 additions & 0 deletions b/‎Lib/test/test_codecs.py‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎Lib/test/test_os.py‎
Lines changed: 38 additions & 1 deletion b/‎Lib/test/test_os.py‎
Lines changed: 38 additions & 1 deletion
diff --git a/‎Misc/NEWS‎
Lines changed: 2 additions & 0 deletions b/‎Misc/NEWS‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Modules/_io/fileio.c‎
Lines changed: 1 addition & 1 deletion b/‎Modules/_io/fileio.c‎
Lines changed: 1 addition & 1 deletion
@@ -322,6 +322,8 @@ and implemented by all standard Python codecs:
 | ``'backslashreplace'``  | Replace with backslashed escape sequences     |
 |                         | (only for encoding).                          |
 +-------------------------+-----------------------------------------------+
+| ``'utf8b'``             | Replace byte with surrogate U+DCxx.           |
++-------------------------+-----------------------------------------------+
 
 In addition, the following error handlers are specific to a single codec:
 
@@ -333,7 +335,7 @@ In addition, the following error handlers are specific to a single codec:
 +------------------+---------+--------------------------------------------+
 
 .. versionadded:: 3.1
-   The ``'surrogates'`` error handler.
+   The ``'utf8b'`` and ``'surrogates'`` error handlers.
 
 The set of allowed values can be extended via :meth:`register_error`.
 
 
@@ -51,6 +51,30 @@ the :mod:`os` module, but using them is of course a threat to portability!
    ``'ce'``, ``'java'``.
 
 
+.. _os-filenames:
+
+File Names, Command Line Arguments, and Environment Variables
+-------------------------------------------------------------
+
+In Python, file names, command line arguments, and environment
+variables are represented using the string type. On some systems,
+converting these strings to and from bytes is necessary before passing
+them to the operating system. Python uses the file system encoding to
+perform this conversion (see :func:`sys.getfilesystemencoding`).
+
+.. versionchanged:: 3.1
+   On some systems, conversion using the file system encoding may
+   fail. In this case, Python uses the ``utf8b`` encoding error
+   handler, which means that each undecodable byte is replaced by a
+   lone surrogate U+DCxx (xx being the byte's hex value) on decoding,
+   and these are translated back to the original bytes on encoding.
+
+
+The file system encoding must guarantee to successfully decode all
+bytes below 128. If the file system encoding fails to provide this
+guarantee, API functions may raise UnicodeErrors.
+
+
 .. _os-procinfo:
 
 Process Parameters
@@ -688,12 +712,8 @@ Files and Directories
 
 .. function:: getcwd()
 
-   Return a string representing the current working directory.  On Unix
-   platforms, this function may raise :exc:`UnicodeDecodeError` if the name of
-   the current directory is not decodable in the file system encoding.  Use
-   :func:`getcwdb` if you need the call to never fail. Availability: Unix,
-   Windows.
-
+   Return a string representing the current working directory.
+   Availability: Unix, Windows.
 
 .. function:: getcwdb()
 
@@ -800,10 +820,8 @@ Files and Directories
    entries ``'.'`` and ``'..'`` even if they are present in the directory.
    Availability: Unix, Windows.
 
-   This function can be called with a bytes or string argument.  In the bytes
-   case, all filenames will be listed as returned by the underlying API.  In the
-   string case, filenames will be decoded using the file system encoding, and
-   skipped if a decoding error occurs.
+   This function can be called with a bytes or string argument, and returns
+   filenames of the same datatype.
 
 
 .. function:: lstat(path)
 
@@ -198,6 +198,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
+# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
@@ -296,6 +297,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
+# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
@@ -693,25 +695,6 @@ PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
     PyObject *unicode,
     const char *errors);
 
-/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
-
-   If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
-   UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace
-   invalid characters with '?'.
-
-   The function is intended to be used for paths and file names only
-   during bootstrapping process where the codecs are not set up.
-*/
-
-PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
-    const char *s               /* encoded string */
-    );
-
-PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
-    const char *s,               /* encoded string */
-    Py_ssize_t size              /* size */
-    );
-
 /* Returns a pointer to the default encoding (normally, UTF-8) of the
    Unicode object unicode and the size of the encoded representation
    in bytes stored in *size.
@@ -1252,6 +1235,33 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
     const char *errors		/* error handling */
     );
 
+/* --- File system encoding ---------------------------------------------- */
+
+/* ParseTuple converter which converts a Unicode object into the file
+   system encoding, using the PEP 383 error handler; bytes objects are
+   output as-is. */
+
+PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
+
+/* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
+
+   If the encoding is supported by one of the built-in codecs (i.e., UTF-8,
+   UTF-16, UTF-32, Latin-1 or MBCS), that codec is used; otherwise fall back
+   to UTF-8 and replace invalid characters with '?'.
+
+   The function is intended to be used for paths and file names only
+   during the bootstrapping process, when the codecs are not yet set up.
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
+    const char *s               /* encoded string */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
+    const char *s,               /* encoded string */
+    Py_ssize_t size              /* size */
+    );
+
 /* --- Methods & Slots ----------------------------------------------------
 
    These are capable of handling Unicode objects and strings on input
 
@@ -1516,6 +1516,34 @@ def test_unicode_escape(self):
         self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6))
         self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6))
 
+class Utf8bTest(unittest.TestCase):
+
+    def test_utf8(self):
+        # Bad byte
+        self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"),
+                         "foo\udc80bar")
+        self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"),
+                         b"foo\x80bar")
+        # bad-utf-8 encoded surrogate
+        self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"),
+                         "\udced\udcb0\udc80")
+        self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"),
+                         b"\xed\xb0\x80")
+
+    def test_ascii(self):
+        # bad byte
+        self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"),
+                         "foo\udc80bar")
+        self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"),
+                         b"foo\x80bar")
+
+    def test_charmap(self):
+        # bad byte: \xa5 is unmapped in iso-8859-3
+        self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"),
+                         "foo\udca5bar")
+        self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"),
+                         b"foo\xa5bar")
+
 
 def test_main():
     support.run_unittest(
@@ -1543,6 +1571,7 @@ def test_main():
         CharmapTest,
         WithStmtTest,
         TypesTest,
+        Utf8bTest,
     )
 
 
 
@@ -7,6 +7,7 @@
 import unittest
 import warnings
 import sys
+import shutil
 from test import support
 
 # Tests creating TESTFN
@@ -698,9 +699,44 @@ def test_setregid(self):
                     self.assertRaises(os.error, os.setregid, 0, 0)
                 self.assertRaises(OverflowError, os.setregid, 1<<32, 0)
                 self.assertRaises(OverflowError, os.setregid, 0, 1<<32)
+
+    class Pep383Tests(unittest.TestCase):
+        filenames = [b'foo\xf6bar', 'foo\xf6bar'.encode("utf-8")]
+
+        def setUp(self):
+            self.fsencoding = sys.getfilesystemencoding()
+            sys.setfilesystemencoding("utf-8")
+            self.dir = support.TESTFN
+            self.bdir = self.dir.encode("utf-8", "utf8b")
+            os.mkdir(self.dir)
+            self.unicodefn = []
+            for fn in self.filenames:
+                f = open(os.path.join(self.bdir, fn), "w")
+                f.close()
+                self.unicodefn.append(fn.decode("utf-8", "utf8b"))
+
+        def tearDown(self):
+            shutil.rmtree(self.dir)
+            sys.setfilesystemencoding(self.fsencoding)
+
+        def test_listdir(self):
+            expected = set(self.unicodefn)
+            found = set(os.listdir(support.TESTFN))
+            self.assertEquals(found, expected)
+
+        def test_open(self):
+            for fn in self.unicodefn:
+                f = open(os.path.join(self.dir, fn))
+                f.close()
+
+        def test_stat(self):
+            for fn in self.unicodefn:
+                os.stat(os.path.join(self.dir, fn))
 else:
     class PosixUidGidTests(unittest.TestCase):
         pass
+    class Pep383Tests(unittest.TestCase):
+        pass
 
 def test_main():
     support.run_unittest(
@@ -714,7 +750,8 @@ def test_main():
         ExecTests,
         Win32ErrorTests,
         TestInvalidFD,
-        PosixUidGidTests
+        PosixUidGidTests,
+        Pep383Tests
     )
 
 if __name__ == "__main__":
 
@@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1?
 Core and Builtins
 -----------------
 
+- Implement PEP 383, Non-decodable Bytes in System Character Interfaces.
+
 - Issue #5890: in subclasses of 'property' the __doc__ attribute was
   shadowed by classtype's, even if it was None.  property now
   inserts the __doc__ into the subclass instance __dict__.
 
@@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds)
 				return -1;
 
 			stringobj = PyUnicode_AsEncodedString(
-				u, Py_FileSystemDefaultEncoding, NULL);
+				u, Py_FileSystemDefaultEncoding, "utf8b");
 			Py_DECREF(u);
 			if (stringobj == NULL)
 				return -1;