Skip to content

Commit bd5d8c1

Browse files
gh-126997: Fix support of non-ASCII strings in pickletools
* Fix support of STRING and GLOBAL opcodes with non-ASCII arguments.
* dis() now outputs non-ASCII bytes in STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (\xXX).
1 parent 9d2a879 commit bd5d8c1

File tree

3 files changed

+92
-4
lines changed

3 files changed

+92
-4
lines changed

Lib/pickletools.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ def read_uint8(f):
312312
doc="Eight-byte unsigned integer, little-endian.")
313313

314314

315-
def read_stringnl(f, decode=True, stripquotes=True):
315+
def read_stringnl(f, decode=True, stripquotes=True, *, encoding='latin-1'):
316316
r"""
317317
>>> import io
318318
>>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n"))
@@ -356,7 +356,7 @@ def read_stringnl(f, decode=True, stripquotes=True):
356356
raise ValueError("no string quotes around %r" % data)
357357

358358
if decode:
359-
data = codecs.escape_decode(data)[0].decode("ascii")
359+
data = codecs.escape_decode(data)[0].decode(encoding)
360360
return data
361361

362362
stringnl = ArgumentDescriptor(
@@ -370,7 +370,7 @@ def read_stringnl(f, decode=True, stripquotes=True):
370370
""")
371371

372372
def read_stringnl_noescape(f):
373-
return read_stringnl(f, stripquotes=False)
373+
return read_stringnl(f, stripquotes=False, encoding='utf-8')
374374

375375
stringnl_noescape = ArgumentDescriptor(
376376
name='stringnl_noescape',
@@ -2509,7 +2509,10 @@ def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0):
25092509
# make a mild effort to align arguments
25102510
line += ' ' * (10 - len(opcode.name))
25112511
if arg is not None:
2512-
line += ' ' + repr(arg)
2512+
if opcode.name in ("STRING", "BINSTRING", "SHORT_BINSTRING"):
2513+
line += ' ' + ascii(arg)
2514+
else:
2515+
line += ' ' + repr(arg)
25132516
if markmsg:
25142517
line += ' ' + markmsg
25152518
if annotate:

Lib/test/test_pickletools.py

+82
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,88 @@ def test_annotate(self):
361361
highest protocol among opcodes = 0
362362
''', annotate=20)
363363

364+
def test_string(self):
365+
self.check_dis(b"S'abc'\n.", '''\
366+
0: S STRING 'abc'
367+
7: . STOP
368+
highest protocol among opcodes = 0
369+
''')
370+
self.check_dis(b'S"abc"\n.', '''\
371+
0: S STRING 'abc'
372+
7: . STOP
373+
highest protocol among opcodes = 0
374+
''')
375+
self.check_dis(b"S'\xc3\xb5'\n.", '''\
376+
0: S STRING '\\xc3\\xb5'
377+
6: . STOP
378+
highest protocol among opcodes = 0
379+
''')
380+
381+
def test_string_without_quotes(self):
382+
self.check_dis_error(b"Sabc'\n.", '',
383+
'no string quotes around b"abc\'"')
384+
self.check_dis_error(b'Sabc"\n.', '',
385+
"no string quotes around b'abc\"'")
386+
self.check_dis_error(b"S'abc\n.", '',
387+
'''strinq quote b"'" not found at both ends of b"'abc"''')
388+
self.check_dis_error(b'S"abc\n.', '',
389+
r"""strinq quote b'"' not found at both ends of b'"abc'""")
390+
self.check_dis_error(b"S'abc\"\n.", '',
391+
r"""strinq quote b"'" not found at both ends of b'\\'abc"'""")
392+
self.check_dis_error(b"S\"abc'\n.", '',
393+
r"""strinq quote b'"' not found at both ends of b'"abc\\''""")
394+
395+
def test_binstring(self):
396+
self.check_dis(b"T\x03\x00\x00\x00abc.", '''\
397+
0: T BINSTRING 'abc'
398+
8: . STOP
399+
highest protocol among opcodes = 1
400+
''')
401+
self.check_dis(b"T\x02\x00\x00\x00\xc3\xb5.", '''\
402+
0: T BINSTRING '\\xc3\\xb5'
403+
7: . STOP
404+
highest protocol among opcodes = 1
405+
''')
406+
407+
def test_short_binstring(self):
408+
self.check_dis(b"U\x03abc.", '''\
409+
0: U SHORT_BINSTRING 'abc'
410+
5: . STOP
411+
highest protocol among opcodes = 1
412+
''')
413+
self.check_dis(b"U\x02\xc3\xb5.", '''\
414+
0: U SHORT_BINSTRING '\\xc3\\xb5'
415+
4: . STOP
416+
highest protocol among opcodes = 1
417+
''')
418+
419+
def test_global(self):
420+
self.check_dis(b"cmodule\nname\n.", '''\
421+
0: c GLOBAL 'module name'
422+
13: . STOP
423+
highest protocol among opcodes = 0
424+
''')
425+
self.check_dis(b"cm\xc3\xb6dule\nn\xc3\xa4me\n.", '''\
426+
0: c GLOBAL 'm\xf6dule n\xe4me'
427+
15: . STOP
428+
highest protocol among opcodes = 0
429+
''')
430+
431+
def test_inst(self):
432+
self.check_dis(b"(imodule\nname\n.", '''\
433+
0: ( MARK
434+
1: i INST 'module name' (MARK at 0)
435+
14: . STOP
436+
highest protocol among opcodes = 0
437+
''')
438+
439+
def test_persid(self):
440+
self.check_dis(b"Pabc\n.", '''\
441+
0: P PERSID 'abc'
442+
5: . STOP
443+
highest protocol among opcodes = 0
444+
''')
445+
364446

365447
class MiscTestCase(unittest.TestCase):
366448
def test__all__(self):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix support of STRING and GLOBAL opcodes with non-ASCII arguments in
2+
:mod:`pickletools`. :func:`pickletools.dis` now outputs non-ASCII bytes in
3+
STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (``\xXX``).

0 commit comments

Comments
 (0)