Skip to content
This repository was archived by the owner on Feb 16, 2023. It is now read-only.

Commit a56645e

Browse files
tuliren, Jordan Scott
authored and
Jordan Scott
committed
Use cheaper operation to estimate json data byte size (airbytehq#13240)
* Simplify byte size estimation
* Format code
* Update comment
1 parent 77bf848 commit a56645e

File tree

4 files changed

+23
-8
lines changed

4 files changed

+23
-8
lines changed

airbyte-commons/src/main/java/io/airbyte/commons/json/Jsons.java

+11
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,17 @@ public static byte[] toBytes(final JsonNode jsonNode) {
129129
return serialize(jsonNode).getBytes(Charsets.UTF_8);
130130
}
131131

132+
/**
133+
* Use string length as an estimation for byte size, because all ASCII characters are one byte long
134+
* in UTF-8, and ASCII characters cover most of the use cases. To be more precise, we can convert
135+
* the string to byte[] and use the length of the byte[]. However, this conversion is expensive in
136+
* memory consumption. Given that the byte size of the serialized JSON is already an estimation of
137+
* the actual size of the JSON object, using a cheap operation seems an acceptable compromise.
138+
*/
139+
public static int getEstimatedByteSize(final JsonNode jsonNode) {
140+
return serialize(jsonNode).length();
141+
}
142+
132143
public static Set<String> keys(final JsonNode jsonNode) {
133144
if (jsonNode.isObject()) {
134145
return Jsons.object(jsonNode, new TypeReference<Map<String, Object>>() {}).keySet();

airbyte-commons/src/test/java/io/airbyte/commons/json/JsonsTest.java

+6
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,12 @@ void testGetStringOrNull() {
246246
assertNull(Jsons.getStringOrNull(json, "xyz"));
247247
}
248248

249+
@Test
250+
void testGetEstimatedByteSize() {
251+
final JsonNode json = Jsons.deserialize("{\"string_key\":\"abc\",\"array_key\":[\"item1\", \"item2\"]}");
252+
assertEquals(Jsons.toBytes(json).length, Jsons.getEstimatedByteSize(json));
253+
}
254+
249255
private static class ToClass {
250256

251257
@JsonProperty("str")

airbyte-workers/src/main/java/io/airbyte/workers/internal/AirbyteMessageTracker.java

+2 -3
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,9 @@ private void handleSourceEmittedRecord(final AirbyteRecordMessage recordMessage)
111111
final long currentTotalCount = streamToTotalRecordsEmitted.getOrDefault(streamIndex, 0L);
112112
streamToTotalRecordsEmitted.put(streamIndex, currentTotalCount + 1);
113113

114-
// todo (cgardens) - pretty wasteful to do an extra serialization just to get size.
115-
final int numBytes = Jsons.serialize(recordMessage.getData()).getBytes(Charsets.UTF_8).length;
114+
final int estimatedNumBytes = Jsons.getEstimatedByteSize(recordMessage.getData());
116115
final long currentTotalStreamBytes = streamToTotalBytesEmitted.getOrDefault(streamIndex, 0L);
117-
streamToTotalBytesEmitted.put(streamIndex, currentTotalStreamBytes + numBytes);
116+
streamToTotalBytesEmitted.put(streamIndex, currentTotalStreamBytes + estimatedNumBytes);
118117
}
119118

120119
/**

airbyte-workers/src/test/java/io/airbyte/workers/internal/AirbyteMessageTrackerTest.java

+4 -5
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import static org.junit.jupiter.api.Assertions.assertEquals;
88
import static org.junit.jupiter.api.Assertions.assertTrue;
99

10-
import com.google.common.base.Charsets;
1110
import io.airbyte.commons.json.Jsons;
1211
import io.airbyte.config.FailureReason;
1312
import io.airbyte.config.State;
@@ -53,7 +52,7 @@ public void testGetTotalRecordsStatesAndBytesEmitted() {
5352
messageTracker.acceptFromSource(s2);
5453

5554
assertEquals(3, messageTracker.getTotalRecordsEmitted());
56-
assertEquals(3 * Jsons.serialize(r1.getRecord().getData()).getBytes(Charsets.UTF_8).length, messageTracker.getTotalBytesEmitted());
55+
assertEquals(3L * Jsons.getEstimatedByteSize(r1.getRecord().getData()), messageTracker.getTotalBytesEmitted());
5756
assertEquals(2, messageTracker.getTotalStateMessagesEmitted());
5857
}
5958

@@ -112,9 +111,9 @@ public void testEmittedBytesByStream() {
112111
final AirbyteMessage r2 = AirbyteMessageUtils.createRecordMessage(STREAM_2, 2);
113112
final AirbyteMessage r3 = AirbyteMessageUtils.createRecordMessage(STREAM_3, 3);
114113

115-
final long r1Bytes = Jsons.serialize(r1.getRecord().getData()).getBytes(Charsets.UTF_8).length;
116-
final long r2Bytes = Jsons.serialize(r2.getRecord().getData()).getBytes(Charsets.UTF_8).length;
117-
final long r3Bytes = Jsons.serialize(r3.getRecord().getData()).getBytes(Charsets.UTF_8).length;
114+
final long r1Bytes = Jsons.getEstimatedByteSize(r1.getRecord().getData());
115+
final long r2Bytes = Jsons.getEstimatedByteSize(r2.getRecord().getData());
116+
final long r3Bytes = Jsons.getEstimatedByteSize(r3.getRecord().getData());
118117

119118
messageTracker.acceptFromSource(r1);
120119
messageTracker.acceptFromSource(r2);

0 commit comments

Comments
 (0)