Python on Solaris doesn't handle non UTF-8 locales because of the way they are encoded. The wchar_t encoding used for stored symbols is not standardized. While on Linux symbols from all encodings will be mapped to their UTF-8 values, this is not the case on Solaris, where only UTF-8 locales work like that; other encodings can use any arbitrary value. Since Python expects no value to be higher than the maximum valid code point in Unicode (which is U+10FFFF), it breaks on Solaris when non UTF-8 locale is used. See bug 31790476. To fix this, we have to convert given wchar_t to utf32 each time locale is not UTF-8 encoded (or ASCII, which can safely be ignored). --- Python-3.9.1/Include/unicodeobject.h +++ Python-3.9.1/Include/unicodeobject.h @@ -97,6 +97,11 @@ Copyright (c) Corporation for National R # include #endif +#if defined(__sun) && defined(__SVR4) +# include +# include +#endif + /* Py_UCS4 and Py_UCS2 are typedefs for the respective unicode representations. */ typedef uint32_t Py_UCS4; --- Python-3.9.1/Objects/unicodeobject.c +++ Python-3.9.1/Objects/unicodeobject.c @@ -2187,6 +2187,15 @@ PyUnicode_FromUnicode(const Py_UNICODE * return PyUnicode_FromWideChar(u, size); } +#if defined(__sun) && defined(__SVR4) +/* Detect whether currently used locale uses UTF compatible encoding. */ +int codeset_is_utf8_compatible() +{ + char* res = nl_langinfo(CODESET); + return !(strcmp(res, "UTF-8") && strcmp(res, "646")); +} +#endif + PyObject * PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) { @@ -2210,6 +2219,58 @@ PyUnicode_FromWideChar(const wchar_t *u, if (size == 0) _Py_RETURN_UNICODE_EMPTY(); +#if defined(__sun) && defined(__SVR4) + /* Check whether current locale uses UTF to encode symbols */ + if (!codeset_is_utf8_compatible()) { + + /* Given 'u' might not be NULL terminated (size smaller than its + length); copy and terminate part we are interested in. */ + wchar_t* substr = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); + memcpy(substr, u, size * sizeof(wchar_t)); + substr[size] = 0; + + /* Convert given wide-character string to a character string */ + size_t buffsize = wcstombs(NULL, substr, 0) + 1; + if (buffsize == (size_t)-1) { + PyMem_RawFree(substr); + PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed"); + return NULL; + } + + char* buffer = PyMem_RawMalloc(buffsize * sizeof(char)); + size_t res = wcstombs(buffer, substr, buffsize); + assert(res == buffsize - 1); + + /* Convert character string to UTF32 encoded char32_t string. + Since wchar_t and char32_t have the same size on Solaris and one + wchar_t symbol corresponds to one UTF32 value, we can safely + reuse this buffer and skip additional allocation. */ + char32_t* c32 = (char32_t*) substr; + mbstate_t state = {0}; + + int i = 0; + char* ptr = buffer; + char* end = ptr + res + 1; + while (res = mbrtoc32(&(c32[i]), ptr, end - ptr, &state)) { + if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) { + PyMem_RawFree(c32); + PyMem_RawFree(buffer); + PyErr_Format(PyExc_ValueError, + "mbrtoc32() conversion failed with error code: %d", + res); + return NULL; + } + ptr += res; + i ++; + } + PyMem_RawFree(buffer); + + PyObject *unicode = _PyUnicode_FromUCS4(c32, size); + PyMem_RawFree(c32); + return unicode; + } +#endif + /* Single character Unicode objects in the Latin-1 range are shared when using this constructor */ if (size == 1 && (Py_UCS4)*u < 256)