From 88f80193a9c01d3f3e35896126abf13c4daf94cb Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 6 Aug 2020 13:16:14 -0400 Subject: [PATCH 1/6] Add the option to iteratively encode JSON. --- README.rst | 11 ++++++++++- canonicaljson.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index c3c1b2c..a6887d6 100644 --- a/README.rst +++ b/README.rst @@ -34,12 +34,21 @@ Installing Using ----- +To encode an object into the canonicaljson: + .. code:: python import canonicaljson assert canonicaljson.encode_canonical_json({}) == b'{}' -The underlying JSON implementation can be choosen with the following: +There's also an iterator version: + +.. code:: python + + import canonicaljson + assert b''.join(canonicaljson.iterencode_canonical_json({})) == b'{}' + +The underlying JSON implementation can be chosen with the following: .. code:: python diff --git a/canonicaljson.py b/canonicaljson.py index afab92a..fa3da50 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -160,12 +160,45 @@ def encode_canonical_json(json_object): return _unascii(s) +def iterencode_canonical_json(json_object): + """Encodes the shortest UTF-8 JSON encoding with dictionary keys + lexicographically sorted by unicode code point. + + Args: + json_object (dict): The JSON object to encode. + + Returns: + generator which yields bytes encoding the JSON object""" + for chunk in _canonical_encoder.iterencode(json_object): + yield _unascii(chunk) + + def encode_pretty_printed_json(json_object): - """Encodes the JSON object dict as human readable ascii bytes.""" + """ + Encodes the JSON object dict as human readable ascii bytes. + + Args: + json_object (dict): The JSON object to encode. + + Returns: + bytes encoding the JSON object""" return _pretty_encoder.encode(json_object).encode("ascii") +def iterencode_pretty_printed_json(json_object): + """Encodes the JSON object dict as human readable ascii bytes. + + Args: + json_object (dict): The JSON object to encode. + + Returns: + generator which yields bytes encoding the JSON object""" + + for chunk in _pretty_encoder.iterencode(json_object): + yield chunk.encode("ascii") + + if platform.python_implementation() == "PyPy": # pragma: no cover # pypy ships with an optimised JSON encoder/decoder that is faster than # simplejson's C extension. From 5388bf27382a90c3aa9e772de43fdd3e1b96edaf Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 6 Aug 2020 14:19:23 -0400 Subject: [PATCH 2/6] Add tests. --- test_canonicaljson.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test_canonicaljson.py b/test_canonicaljson.py index b190aaa..64b942a 100644 --- a/test_canonicaljson.py +++ b/test_canonicaljson.py @@ -18,6 +18,8 @@ from canonicaljson import ( encode_canonical_json, encode_pretty_printed_json, + iterencode_canonical_json, + iterencode_pretty_printed_json, set_json_library, ) @@ -62,8 +64,12 @@ def test_encode_canonical(self): b'"\\\\u1234"', ) + # Iteratively encoding should work. + self.assertEqual(list(iterencode_canonical_json({})), [b'{}']) + def test_encode_pretty_printed(self): self.assertEqual(encode_pretty_printed_json({}), b'{}') + self.assertEqual(list(iterencode_pretty_printed_json({})), [b'{}']) def test_frozen_dict(self): self.assertEqual( From e29fbd04707a130eb6863f6c7720fd47bc0b05cf Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 6 Aug 2020 14:51:51 -0400 Subject: [PATCH 3/6] Move a comment closer to implementation. --- canonicaljson.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/canonicaljson.py b/canonicaljson.py index fa3da50..ca498f4 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -31,15 +31,6 @@ def _default(obj): raise TypeError('Object of type %s is not JSON serializable' % obj.__class__.__name__) - -# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so -# much quicker (assuming c speedups are enabled) that it's actually much -# quicker to let it do that and then substitute back (it's about 2.5x faster). -# -# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, -# as per https://github.com/simplejson/simplejson/issues/206). -# - # Declare these in the module scope, but they get configured in # set_json_library. _canonical_encoder = None @@ -54,6 +45,15 @@ def set_json_library(json_lib): json_lib: The module to use for JSON encoding. Must have a `JSONEncoder` property. """ + + # ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so + # much quicker (assuming c speedups are enabled) that it's actually much + # quicker to let it do that and then substitute back (it's about 2.5x faster). + # + # (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, + # as per https://github.com/simplejson/simplejson/issues/206). + # + global _canonical_encoder _canonical_encoder = json_lib.JSONEncoder( ensure_ascii=True, From bca38f7f9fa5d708eaaf0c0900ecfda67f71c1c6 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Thu, 6 Aug 2020 15:28:05 -0400 Subject: [PATCH 4/6] Lint. --- canonicaljson.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/canonicaljson.py b/canonicaljson.py index ca498f4..750f3fa 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -31,6 +31,7 @@ def _default(obj): raise TypeError('Object of type %s is not JSON serializable' % obj.__class__.__name__) + # Declare these in the module scope, but they get configured in # set_json_library. _canonical_encoder = None @@ -48,10 +49,11 @@ def set_json_library(json_lib): # ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so # much quicker (assuming c speedups are enabled) that it's actually much - # quicker to let it do that and then substitute back (it's about 2.5x faster). + # quicker to let it do that and then substitute back (it's about 2.5x + # faster). # - # (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right, - # as per https://github.com/simplejson/simplejson/issues/206). + # (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 + # right, as per https://github.com/simplejson/simplejson/issues/206). # global _canonical_encoder From 825a031a889ad2bc73b82268ebfde1e33c92c163 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 10 Aug 2020 08:40:49 -0400 Subject: [PATCH 5/6] Remove obsolete comment. --- canonicaljson.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/canonicaljson.py b/canonicaljson.py index e8e2dfe..63413a5 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -45,16 +45,6 @@ def set_json_library(json_lib): json_lib: The module to use for JSON encoding. Must have a `JSONEncoder` property. """ - - # ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so - # much quicker (assuming c speedups are enabled) that it's actually much - # quicker to let it do that and then substitute back (it's about 2.5x - # faster). - # - # (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 - # right, as per https://github.com/simplejson/simplejson/issues/206). - # - global _canonical_encoder _canonical_encoder = json_lib.JSONEncoder( ensure_ascii=False, From dd5b6e63a7cdc3beff63609c3b0ef49b66f9acce Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 10 Aug 2020 13:35:55 -0400 Subject: [PATCH 6/6] Fix bug from merging master. --- canonicaljson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/canonicaljson.py b/canonicaljson.py index 63413a5..ff594ab 100644 --- a/canonicaljson.py +++ b/canonicaljson.py @@ -85,7 +85,7 @@ def iterencode_canonical_json(json_object): Returns: generator which yields bytes encoding the JSON object""" for chunk in _canonical_encoder.iterencode(json_object): - yield _unascii(chunk) + yield chunk.encode("utf-8") def encode_pretty_printed_json(json_object):