@@ -4979,39 +4979,228 @@ PyUnicode_DecodeUTF8(const char *s,
4979
4979
#include "stringlib/codecs.h"
4980
4980
#include "stringlib/undef.h"
4981
4981
4982
/* Word-sized bit patterns used by the ASCII / UTF-8 fast paths below.
 *
 *   ASCII_CHAR_MASK  high bit of every byte.  `word & ASCII_CHAR_MASK` is
 *                    non-zero iff the C 'size_t' contains a non-ASCII,
 *                    UTF8-encoded char.
 *   VECTOR_0101      lowest bit of every byte; selects one flag per byte
 *                    lane when counting code points in a UTF-8 string.
 *   VECTOR_00FF      low byte of every 16-bit lane; used to fold the
 *                    per-byte counters into wider lanes.
 */
#if SIZEOF_SIZE_T == 8
#  define ASCII_CHAR_MASK 0x8080808080808080ULL
#  define VECTOR_0101     0x0101010101010101ULL
#  define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
#  define ASCII_CHAR_MASK 0x80808080U
#  define VECTOR_0101     0x01010101U
#  define VECTOR_00FF     0x00ff00ffU
#else
#  error C 'size_t' size should be either 4 or 8!
#endif
4991
4996
4997
/* ctz(v): count the trailing zero bits of `v`.
 *
 * `v` must be non-zero: the result of the underlying compiler intrinsics is
 * undefined for 0.
 *
 * HAVE_CTZ is always defined after this block: 1 when an intrinsic is
 * available (and `ctz` exists), 0 otherwise.  Defining it explicitly keeps
 * later `#if HAVE_CTZ` tests from evaluating an undefined macro.
 */
#if (defined(__clang__) || defined(__GNUC__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    /* Widening to unsigned long long zero-extends, so the trailing-zero
       count is unchanged for a 4-byte size_t. */
    return (unsigned int)__builtin_ctzll((unsigned long long)v);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long pos;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif  /* SIZEOF_SIZE_T */
    return pos;
}
#else
#define HAVE_CTZ 0
#endif
5018
+
5019
/* Assemble p[0]..p[size-1] into a size_t, little-endian: p[0] lands in the
 * least significant byte, p[1] in the next one, and so on.  Bytes of the
 * result beyond `size` are zero.
 *
 * Only the `size` bytes starting at `p` are read, one byte at a time, so
 * there is no unaligned word access and no read past p[size-1].
 *
 * The value is built with shifts rather than through a union, so the byte
 * order of the result does not depend on the host: ctz() on the result
 * always identifies the lowest-indexed byte of interest.  This also avoids
 * indexing past a 4-byte array when size_t is 32 bits wide.
 *
 * `size` must not exceed sizeof(size_t).  This is asserted in debug builds;
 * release builds clamp so that the shift count stays in range.
 *
 * The helper is `static inline` and compiled unconditionally, so it causes
 * no unused-function warning where no caller is enabled.
 */
static inline size_t
load_unaligned(const unsigned char *p, size_t size)
{
    assert(size <= sizeof(size_t));
    if (size > sizeof(size_t)) {
        size = sizeof(size_t);
    }

    size_t word = 0;
    for (size_t i = 0; i < size; i++) {
        word |= (size_t)p[i] << (8 * i);
    }
    return word;
}
5064
+
5065
+ /*
5066
+ * Find the first non-ASCII character in a byte sequence.
5067
+ *
5068
+ * This function scans a range of bytes from `start` to `end` and returns the
5069
+ * index of the first byte that is not an ASCII character (i.e., has the most
5070
+ * significant bit set). If all characters in the range are ASCII, it returns
5071
+ * `end - start`.
5072
+ */
4992
5073
static Py_ssize_t
4993
- ascii_decode (const char * start , const char * end , Py_UCS1 * dest )
5074
+ find_first_nonascii (const unsigned char * start , const unsigned char * end )
4994
5075
{
4995
- const char * p = start ;
5076
+ const unsigned char * p = start ;
4996
5077
5078
+ if (end - start >= SIZEOF_SIZE_T ) {
5079
+ const unsigned char * p2 = _Py_ALIGN_UP (p , SIZEOF_SIZE_T );
5080
+ if (p < p2 ) {
5081
+ #if HAVE_CTZ
5082
+ #if defined(_M_AMD64 ) || defined(_M_IX86 ) || defined(__x86_64__ ) || defined(__i386__ )
5083
+ // x86 and amd64 are little endian and can load unaligned memory.
5084
+ size_t u = * (const size_t * )p & ASCII_CHAR_MASK ;
5085
+ #else
5086
+ size_t u = load_unaligned (p , p2 - p ) & ASCII_CHAR_MASK ;
5087
+ #endif
5088
+ if (u ) {
5089
+ return p - start + (ctz (u ) - 7 ) / 8 ;
5090
+ }
5091
+ p = p2 ;
5092
+ }
5093
+ #else
5094
+ while (p < p2 ) {
5095
+ if (* p & 0x80 ) {
5096
+ return p - start ;
5097
+ }
5098
+ p ++ ;
5099
+ }
5100
+ #endif
5101
+ const unsigned char * e = end - SIZEOF_SIZE_T ;
5102
+ while (p <= e ) {
5103
+ size_t u = (* (const size_t * )p ) & ASCII_CHAR_MASK ;
5104
+ if (u ) {
5105
+ #if PY_LITTLE_ENDIAN && HAVE_CTZ
5106
+ return p - start + (ctz (u ) - 7 ) / 8 ;
5107
+ #else
5108
+ // big endian and minor compilers are difficult to test.
5109
+ // fallback to per byte check.
5110
+ break ;
5111
+ #endif
5112
+ }
5113
+ p += SIZEOF_SIZE_T ;
5114
+ }
5115
+ }
5116
+ #if HAVE_CTZ
5117
+ // we can not use *(const size_t*)p to avoid buffer overrun.
5118
+ size_t u = load_unaligned (p , end - p ) & ASCII_CHAR_MASK ;
5119
+ if (u ) {
5120
+ return p - start + (ctz (u ) - 7 ) / 8 ;
5121
+ }
5122
+ return end - start ;
5123
+ #else
5124
+ while (p < end ) {
5125
+ if (* p & 0x80 ) {
5126
+ break ;
5127
+ }
5128
+ p ++ ;
5129
+ }
5130
+ return p - start ;
5131
+ #endif
5132
+ }
5133
+
5134
/* Return 1 if `ch` starts a UTF-8 sequence, 0 if it is a continuation byte.
 *
 * A byte is a start byte when it looks like 0xxxxxxx (bit 7 clear) or
 * 11xxxxxx (bit 6 set); only 10xxxxxx yields 0.
 */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    unsigned int bit7_clear = (~ch >> 7) & 1u;
    unsigned int bit6_set = (ch >> 6) & 1u;
    return (int)(bit7_clear | bit6_set);
}
5140
+
5141
+ static inline size_t
5142
+ vector_utf8_start_chars (size_t v )
5143
+ {
5144
+ return ((~v >> 7 ) | (v >> 6 )) & VECTOR_0101 ;
5145
+ }
5146
+
5147
+
5148
+ // Count the number of UTF-8 code points in a given byte sequence.
5149
+ static Py_ssize_t
5150
+ utf8_count_codepoints (const unsigned char * s , const unsigned char * end )
5151
+ {
5152
+ Py_ssize_t len = 0 ;
5153
+
5154
+ if (end - s >= SIZEOF_SIZE_T ) {
5155
+ while (!_Py_IS_ALIGNED (s , ALIGNOF_SIZE_T )) {
5156
+ len += scalar_utf8_start_char (* s ++ );
5157
+ }
5158
+
5159
+ while (s + SIZEOF_SIZE_T <= end ) {
5160
+ const unsigned char * e = end ;
5161
+ if (e - s > SIZEOF_SIZE_T * 255 ) {
5162
+ e = s + SIZEOF_SIZE_T * 255 ;
5163
+ }
5164
+ Py_ssize_t vstart = 0 ;
5165
+ while (s + SIZEOF_SIZE_T <= e ) {
5166
+ size_t v = * (size_t * )s ;
5167
+ size_t vs = vector_utf8_start_chars (v );
5168
+ vstart += vs ;
5169
+ s += SIZEOF_SIZE_T ;
5170
+ }
5171
+ vstart = (vstart & VECTOR_00FF ) + ((vstart >> 8 ) & VECTOR_00FF );
5172
+ vstart += vstart >> 16 ;
5173
+ #if SIZEOF_SIZE_T == 8
5174
+ vstart += vstart >> 32 ;
5175
+ #endif
5176
+ len += vstart & 0x7ff ;
5177
+ }
5178
+ }
5179
+ while (s < end ) {
5180
+ len += scalar_utf8_start_char (* s ++ );
5181
+ }
5182
+ return len ;
5183
+ }
5184
+
5185
+ static Py_ssize_t
5186
+ ascii_decode (const char * start , const char * end , Py_UCS1 * dest )
5187
+ {
4997
5188
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
4998
- if (_Py_IS_ALIGNED (p , ALIGNOF_SIZE_T )
5189
+ if (_Py_IS_ALIGNED (start , ALIGNOF_SIZE_T )
4999
5190
&& _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
5000
5191
{
5001
5192
/* Fast path, see in STRINGLIB(utf8_decode) for
5002
5193
an explanation. */
5003
- /* Help allocation */
5004
- const char * _p = p ;
5005
- Py_UCS1 * q = dest ;
5006
- while (_p + SIZEOF_SIZE_T <= end ) {
5007
- size_t value = * (const size_t * ) _p ;
5194
+ const char * p = start ;
5195
+ Py_UCS1 * q = dest ;
5196
+ while (p + SIZEOF_SIZE_T <= end ) {
5197
+ size_t value = * (const size_t * ) p ;
5008
5198
if (value & ASCII_CHAR_MASK )
5009
5199
break ;
5010
5200
* ((size_t * )q ) = value ;
5011
- _p += SIZEOF_SIZE_T ;
5201
+ p += SIZEOF_SIZE_T ;
5012
5202
q += SIZEOF_SIZE_T ;
5013
5203
}
5014
- p = _p ;
5015
5204
while (p < end ) {
5016
5205
if ((unsigned char )* p & 0x80 )
5017
5206
break ;
@@ -5020,31 +5209,12 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5020
5209
return p - start ;
5021
5210
}
5022
5211
#endif
5023
- while (p < end ) {
5024
- /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5025
- for an explanation. */
5026
- if (_Py_IS_ALIGNED (p , ALIGNOF_SIZE_T )) {
5027
- /* Help allocation */
5028
- const char * _p = p ;
5029
- while (_p + SIZEOF_SIZE_T <= end ) {
5030
- size_t value = * (const size_t * ) _p ;
5031
- if (value & ASCII_CHAR_MASK )
5032
- break ;
5033
- _p += SIZEOF_SIZE_T ;
5034
- }
5035
- p = _p ;
5036
- if (_p == end )
5037
- break ;
5038
- }
5039
- if ((unsigned char )* p & 0x80 )
5040
- break ;
5041
- ++ p ;
5042
- }
5043
- memcpy (dest , start , p - start );
5044
- return p - start ;
5212
+ Py_ssize_t pos = find_first_nonascii ((const unsigned char * )start ,
5213
+ (const unsigned char * )end );
5214
+ memcpy (dest , start , pos );
5215
+ return pos ;
5045
5216
}
5046
5217
5047
-
5048
5218
static int
5049
5219
unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
5050
5220
const char * starts , const char * s , const char * end ,
@@ -5188,27 +5358,69 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
5188
5358
return get_latin1_char ((unsigned char )s [0 ]);
5189
5359
}
5190
5360
5191
- // fast path: try ASCII string.
5192
- const char * starts = s ;
5193
- const char * end = s + size ;
5194
- PyObject * u = PyUnicode_New (size , 127 );
5195
- if (u == NULL ) {
5361
+ // It is unclear whether this check is strictly necessary, but there is a
5362
+ // test case that requires size=PY_SSIZE_T_MAX to raise MemoryError.
5363
+ if (PY_SSIZE_T_MAX - sizeof (PyCompactUnicodeObject ) < (size_t )size ) {
5364
+ PyErr_NoMemory ();
5196
5365
return NULL ;
5197
5366
}
5198
- Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
5199
- if (decoded == size ) {
5367
+
5368
+ const char * starts = s ;
5369
+ const char * end = s + size ;
5370
+
5371
+ Py_ssize_t pos = find_first_nonascii ((const unsigned char * )starts , (const unsigned char * )end );
5372
+ if (pos == size ) { // fast path: ASCII string.
5373
+ PyObject * u = PyUnicode_New (size , 127 );
5374
+ if (u == NULL ) {
5375
+ return NULL ;
5376
+ }
5377
+ memcpy (PyUnicode_1BYTE_DATA (u ), s , size );
5200
5378
if (consumed ) {
5201
5379
* consumed = size ;
5202
5380
}
5203
5381
return u ;
5204
5382
}
5205
- s += decoded ;
5206
- size -= decoded ;
5383
+
5384
+ int maxchr = 127 ;
5385
+ Py_ssize_t maxsize = size ;
5386
+
5387
+ unsigned char ch = (unsigned char )(s [pos ]);
5388
+ // error handler other than strict may remove/replace the invalid byte.
5389
+ // consumed != NULL allows 1~3 bytes to remain.
5390
+ // 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.
5391
+ // otherwise: check the input and decide the maxchr and maxsize to reduce
5392
+ // reallocation and copy.
5393
+ if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2 ) {
5394
+ // we only calculate the number of codepoints and don't determine the exact maxchr.
5395
+ // This is because writing fast and portable SIMD code to find maxchr is difficult.
5396
+ // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5397
+ // means that it is no longer necessary to allocate several times the required amount
5398
+ // of memory.
5399
+ maxsize = utf8_count_codepoints ((const unsigned char * )s , (const unsigned char * )end );
5400
+ if (ch < 0xc4 ) { // latin1
5401
+ maxchr = 0xff ;
5402
+ }
5403
+ else if (ch < 0xf0 ) { // ucs2
5404
+ maxchr = 0xffff ;
5405
+ }
5406
+ else { // ucs4
5407
+ maxchr = 0x10ffff ;
5408
+ }
5409
+ }
5410
+ PyObject * u = PyUnicode_New (maxsize , maxchr );
5411
+ if (!u ) {
5412
+ return NULL ;
5413
+ }
5207
5414
5208
5415
// Use _PyUnicodeWriter after fast path is failed.
5209
5416
_PyUnicodeWriter writer ;
5210
5417
_PyUnicodeWriter_InitWithBuffer (& writer , u );
5211
- writer .pos = decoded ;
5418
+ if (maxchr <= 255 ) {
5419
+ memcpy (PyUnicode_1BYTE_DATA (u ), s , pos );
5420
+ s += pos ;
5421
+ size -= pos ;
5422
+ writer .pos = pos ;
5423
+ }
5212
5424
5213
5425
if (unicode_decode_utf8_impl (& writer , starts , s , end ,
5214
5426
error_handler , errors ,
@@ -5268,7 +5480,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
5268
5480
const char * errors ,
5269
5481
Py_ssize_t * consumed )
5270
5482
{
5271
- return unicode_decode_utf8 (s , size , _Py_ERROR_UNKNOWN , errors , consumed );
5483
+ return unicode_decode_utf8 (s , size ,
5484
+ errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT ,
5485
+ errors , consumed );
5272
5486
}
5273
5487
5274
5488
0 commit comments