Skip to content

Commit 19b9628

Browse files
methane authored and ebonnal committed
pythongh-126024: optimize UTF-8 decoder for short non-ASCII string (python#126025)
1 parent 244015b commit 19b9628

File tree

2 files changed

+261
-45
lines changed

2 files changed

+261
-45
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Optimize decoding of short UTF-8 sequences containing non-ASCII characters
2+
by approximately 15%.

Objects/unicodeobject.c

+259-45
Original file line numberDiff line numberDiff line change
@@ -4979,39 +4979,228 @@ PyUnicode_DecodeUTF8(const char *s,
49794979
#include "stringlib/codecs.h"
49804980
#include "stringlib/undef.h"
49814981

4982+
#if (SIZEOF_SIZE_T == 8)
49824983
/* Mask to quickly check whether a C 'size_t' contains a
49834984
non-ASCII, UTF8-encoded char. */
4984-
#if (SIZEOF_SIZE_T == 8)
49854985
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4986+
// Used to count code points in a UTF-8 string.
4987+
# define VECTOR_0101 0x0101010101010101ULL
4988+
# define VECTOR_00FF 0x00ff00ff00ff00ffULL
49864989
#elif (SIZEOF_SIZE_T == 4)
49874990
# define ASCII_CHAR_MASK 0x80808080U
4991+
# define VECTOR_0101 0x01010101U
4992+
# define VECTOR_00FF 0x00ff00ffU
49884993
#else
49894994
# error C 'size_t' size should be either 4 or 8!
49904995
#endif
49914996

4997+
/*
 * ctz(v): number of trailing zero bits in `v`, i.e. the index of the lowest
 * set bit.
 *
 * `v` must be non-zero: both the GCC/Clang builtin and the MSVC intrinsic
 * leave the result undefined for 0.
 *
 * HAVE_CTZ is always defined (1 when ctz() is available, 0 otherwise) so that
 * `#if HAVE_CTZ` never tests an undefined macro.
 */
#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    return (unsigned int)__builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long pos;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
    return (unsigned int)pos;
}
#else
#define HAVE_CTZ 0
#endif
5018+
5019+
#if HAVE_CTZ
5020+
// Load p[0]..p[size-1] as a little-endian size_t: p[0] becomes the least
// significant byte and all bytes at index >= size are zero.
//
// Only the `size` requested bytes are read, one byte at a time, so there is
// neither an unaligned word access nor a read past the given range.
// `size` must not exceed sizeof(size_t).
//
// The value is assembled with shifts rather than through a union so that the
// result really is little-endian on every host, and so that no byte index
// beyond sizeof(size_t) is ever referenced on 32-bit builds.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    assert(size <= sizeof(size_t));
    size_t value = 0;
    for (size_t i = 0; i < size; i++) {
        value |= (size_t)p[i] << (8 * i);
    }
    return value;
}
5063+
#endif
5064+
5065+
/*
5066+
* Find the first non-ASCII character in a byte sequence.
5067+
*
5068+
* This function scans a range of bytes from `start` to `end` and returns the
5069+
* index of the first byte that is not an ASCII character (i.e., has the most
5070+
* significant bit set). If all characters in the range are ASCII, it returns
5071+
* `end - start`.
5072+
*/
49925073
static Py_ssize_t
4993-
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5074+
find_first_nonascii(const unsigned char *start, const unsigned char *end)
49945075
{
4995-
const char *p = start;
5076+
const unsigned char *p = start;
49965077

5078+
if (end - start >= SIZEOF_SIZE_T) {
5079+
const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5080+
if (p < p2) {
5081+
#if HAVE_CTZ
5082+
#if defined(_M_AMD64) || defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
5083+
// x86 and amd64 are little endian and can load unaligned memory.
5084+
size_t u = *(const size_t*)p & ASCII_CHAR_MASK;
5085+
#else
5086+
size_t u = load_unaligned(p, p2 - p) & ASCII_CHAR_MASK;
5087+
#endif
5088+
if (u) {
5089+
return p - start + (ctz(u) - 7) / 8;
5090+
}
5091+
p = p2;
5092+
}
5093+
#else
5094+
while (p < p2) {
5095+
if (*p & 0x80) {
5096+
return p - start;
5097+
}
5098+
p++;
5099+
}
5100+
#endif
5101+
const unsigned char *e = end - SIZEOF_SIZE_T;
5102+
while (p <= e) {
5103+
size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5104+
if (u) {
5105+
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5106+
return p - start + (ctz(u) - 7) / 8;
5107+
#else
5108+
// big endian and minor compilers are difficult to test.
5109+
// fallback to per byte check.
5110+
break;
5111+
#endif
5112+
}
5113+
p += SIZEOF_SIZE_T;
5114+
}
5115+
}
5116+
#if HAVE_CTZ
5117+
// we can not use *(const size_t*)p to avoid buffer overrun.
5118+
size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5119+
if (u) {
5120+
return p - start + (ctz(u) - 7) / 8;
5121+
}
5122+
return end - start;
5123+
#else
5124+
while (p < end) {
5125+
if (*p & 0x80) {
5126+
break;
5127+
}
5128+
p++;
5129+
}
5130+
return p - start;
5131+
#endif
5132+
}
5133+
5134+
// Return 1 if `ch` starts a UTF-8 sequence, 0 if it is a continuation byte.
// A byte is a first byte when it looks like 0xxxxxxx (bit 7 clear) or
// 11xxxxxx (bit 6 set); only 10xxxxxx yields 0.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    const unsigned int bit7_clear = (~ch >> 7) & 1u;
    const unsigned int bit6_set = (ch >> 6) & 1u;
    return (int)(bit7_clear | bit6_set);
}
5140+
5141+
static inline size_t
5142+
vector_utf8_start_chars(size_t v)
5143+
{
5144+
return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5145+
}
5146+
5147+
5148+
// Count the number of UTF-8 code points in a given byte sequence.
5149+
static Py_ssize_t
5150+
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5151+
{
5152+
Py_ssize_t len = 0;
5153+
5154+
if (end - s >= SIZEOF_SIZE_T) {
5155+
while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5156+
len += scalar_utf8_start_char(*s++);
5157+
}
5158+
5159+
while (s + SIZEOF_SIZE_T <= end) {
5160+
const unsigned char *e = end;
5161+
if (e - s > SIZEOF_SIZE_T * 255) {
5162+
e = s + SIZEOF_SIZE_T * 255;
5163+
}
5164+
Py_ssize_t vstart = 0;
5165+
while (s + SIZEOF_SIZE_T <= e) {
5166+
size_t v = *(size_t*)s;
5167+
size_t vs = vector_utf8_start_chars(v);
5168+
vstart += vs;
5169+
s += SIZEOF_SIZE_T;
5170+
}
5171+
vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5172+
vstart += vstart >> 16;
5173+
#if SIZEOF_SIZE_T == 8
5174+
vstart += vstart >> 32;
5175+
#endif
5176+
len += vstart & 0x7ff;
5177+
}
5178+
}
5179+
while (s < end) {
5180+
len += scalar_utf8_start_char(*s++);
5181+
}
5182+
return len;
5183+
}
5184+
5185+
static Py_ssize_t
5186+
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5187+
{
49975188
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4998-
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
5189+
if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
49995190
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
50005191
{
50015192
/* Fast path, see in STRINGLIB(utf8_decode) for
50025193
an explanation. */
5003-
/* Help allocation */
5004-
const char *_p = p;
5005-
Py_UCS1 * q = dest;
5006-
while (_p + SIZEOF_SIZE_T <= end) {
5007-
size_t value = *(const size_t *) _p;
5194+
const char *p = start;
5195+
Py_UCS1 *q = dest;
5196+
while (p + SIZEOF_SIZE_T <= end) {
5197+
size_t value = *(const size_t *) p;
50085198
if (value & ASCII_CHAR_MASK)
50095199
break;
50105200
*((size_t *)q) = value;
5011-
_p += SIZEOF_SIZE_T;
5201+
p += SIZEOF_SIZE_T;
50125202
q += SIZEOF_SIZE_T;
50135203
}
5014-
p = _p;
50155204
while (p < end) {
50165205
if ((unsigned char)*p & 0x80)
50175206
break;
@@ -5020,31 +5209,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
50205209
return p - start;
50215210
}
50225211
#endif
5023-
while (p < end) {
5024-
/* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5025-
for an explanation. */
5026-
if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5027-
/* Help allocation */
5028-
const char *_p = p;
5029-
while (_p + SIZEOF_SIZE_T <= end) {
5030-
size_t value = *(const size_t *) _p;
5031-
if (value & ASCII_CHAR_MASK)
5032-
break;
5033-
_p += SIZEOF_SIZE_T;
5034-
}
5035-
p = _p;
5036-
if (_p == end)
5037-
break;
5038-
}
5039-
if ((unsigned char)*p & 0x80)
5040-
break;
5041-
++p;
5042-
}
5043-
memcpy(dest, start, p - start);
5044-
return p - start;
5212+
Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5213+
(const unsigned char*)end);
5214+
memcpy(dest, start, pos);
5215+
return pos;
50455216
}
50465217

5047-
50485218
static int
50495219
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
50505220
const char *starts, const char *s, const char *end,
@@ -5188,27 +5358,69 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
51885358
return get_latin1_char((unsigned char)s[0]);
51895359
}
51905360

5191-
// fast path: try ASCII string.
5192-
const char *starts = s;
5193-
const char *end = s + size;
5194-
PyObject *u = PyUnicode_New(size, 127);
5195-
if (u == NULL) {
5361+
// It is unclear whether this check is strictly necessary, but there is a test
5362+
// case that requires size=PY_SSIZE_T_MAX to cause MemoryError.
5363+
if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5364+
PyErr_NoMemory();
51965365
return NULL;
51975366
}
5198-
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5199-
if (decoded == size) {
5367+
5368+
const char *starts = s;
5369+
const char *end = s + size;
5370+
5371+
Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5372+
if (pos == size) { // fast path: ASCII string.
5373+
PyObject *u = PyUnicode_New(size, 127);
5374+
if (u == NULL) {
5375+
return NULL;
5376+
}
5377+
memcpy(PyUnicode_1BYTE_DATA(u), s, size);
52005378
if (consumed) {
52015379
*consumed = size;
52025380
}
52035381
return u;
52045382
}
5205-
s += decoded;
5206-
size -= decoded;
5383+
5384+
int maxchr = 127;
5385+
Py_ssize_t maxsize = size;
5386+
5387+
unsigned char ch = (unsigned char)(s[pos]);
5388+
// An error handler other than strict may remove or replace the invalid byte.
5389+
// consumed != NULL allows 1 to 3 trailing bytes to remain undecoded.
5390+
// 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.
5391+
// Otherwise: check the input and decide the maxchr and maxsize to reduce
5392+
// reallocation and copying.
5393+
if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5394+
// we only calculate the number of codepoints and don't determine the exact maxchr.
5395+
// This is because writing fast and portable SIMD code to find maxchr is difficult.
5396+
// If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5397+
// means that it is no longer necessary to allocate several times the required amount
5398+
// of memory.
5399+
maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5400+
if (ch < 0xc4) { // latin1
5401+
maxchr = 0xff;
5402+
}
5403+
else if (ch < 0xf0) { // ucs2
5404+
maxchr = 0xffff;
5405+
}
5406+
else { // ucs4
5407+
maxchr = 0x10ffff;
5408+
}
5409+
}
5410+
PyObject *u = PyUnicode_New(maxsize, maxchr);
5411+
if (!u) {
5412+
return NULL;
5413+
}
52075414

52085415
// Use _PyUnicodeWriter after fast path is failed.
52095416
_PyUnicodeWriter writer;
52105417
_PyUnicodeWriter_InitWithBuffer(&writer, u);
5211-
writer.pos = decoded;
5418+
if (maxchr <= 255) {
5419+
memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5420+
s += pos;
5421+
size -= pos;
5422+
writer.pos = pos;
5423+
}
52125424

52135425
if (unicode_decode_utf8_impl(&writer, starts, s, end,
52145426
error_handler, errors,
@@ -5268,7 +5480,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
52685480
const char *errors,
52695481
Py_ssize_t *consumed)
52705482
{
5271-
return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5483+
return unicode_decode_utf8(s, size,
5484+
errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5485+
errors, consumed);
52725486
}
52735487

52745488

0 commit comments

Comments
 (0)