Skip to content

feat!: Enable reading JSON data with dbjson extension dtype #1139

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bigframes/bigquery/_operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def json_set(
>>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
>>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
0 {"a":100,"b":"hi"}
Name: data, dtype: large_string[pyarrow]
Name: data, dtype: dbjson

Args:
input (bigframes.series.Series):
Expand Down Expand Up @@ -253,7 +253,7 @@ def parse_json(
dtype: string
>>> bbq.parse_json(s)
0 {"class":{"students":[{"id":5},{"id":12}]}}
dtype: large_string[pyarrow]
dtype: dbjson

Args:
input (bigframes.series.Series):
Expand Down
4 changes: 2 additions & 2 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ def from_table(
raise ValueError("must set at most one of 'offsets', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
msg = (
"Interpreting JSON column(s) as pyarrow.large_string. "
"This behavior may change in future versions."
"Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is "
"in preview; this behavior may change in future versions."
)
warnings.warn(msg, bfe.PreviewWarning)
# define data source only for needed columns, this makes row-hashing cheaper
Expand Down
11 changes: 2 additions & 9 deletions bigframes/core/compile/ibis_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import textwrap
import typing
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
import warnings

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis
Expand All @@ -26,14 +25,14 @@
dtype as python_type_to_ibis_type,
)
import bigframes_vendored.ibis.expr.types as ibis_types
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import numpy as np
import pandas as pd
import pyarrow as pa

import bigframes.dtypes
import bigframes.exceptions as bfe

# Type hints for Ibis data types supported by BigQuery DataFrame
IbisDtype = Union[
Expand Down Expand Up @@ -76,7 +75,7 @@
ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True),
gpd.array.GeometryDtype(),
),
(ibis_dtypes.json, pd.ArrowDtype(pa.large_string())),
(ibis_dtypes.json, db_dtypes.JSONDtype()),
)

BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
Expand Down Expand Up @@ -305,13 +304,7 @@ def ibis_dtype_to_bigframes_dtype(
if isinstance(ibis_dtype, ibis_dtypes.Integer):
return pd.Int64Dtype()

# Temporary: Will eventually support an explicit json type instead of casting to string.
if isinstance(ibis_dtype, ibis_dtypes.JSON):
msg = (
"Interpreting JSON column(s) as pyarrow.large_string. This behavior may change "
"in future versions."
)
warnings.warn(msg, category=bfe.PreviewWarning)
return bigframes.dtypes.JSON_DTYPE

if ibis_dtype in IBIS_TO_BIGFRAMES:
Expand Down
60 changes: 19 additions & 41 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1188,34 +1188,33 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp):
# JSON Ops
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
if x.type().is_json():
return json_set(
json_obj=x,
json_path=op.json_path,
json_value=y,
)
else:
# Enabling JSON type eliminates the need for less efficient string conversions.
return to_json_string(
json_set( # type: ignore
json_obj=parse_json(json_str=x),
json_path=op.json_path,
json_value=y,
)
)
return json_set(json_obj=x, json_path=op.json_path, json_value=y)


@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
if x.type().is_json():
return json_extract(json_obj=x, json_path=op.json_path)
# json string
return json_extract_string(json_obj=x, json_path=op.json_path)
# Define a user-defined function whose returned type is dynamically matching the input.
def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
...

return_type = x.type()
json_extract.__annotations__["return"] = return_type
json_extract_op = ibis_udf.scalar.builtin(json_extract)
return json_extract_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
return json_extract_array(json_obj=x, json_path=op.json_path)
# Define a user-defined function whose returned type is dynamically matching the input.
def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
"""Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""
...

return_type = x.type()
json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore
json_extract_op = ibis_udf.scalar.builtin(json_extract_array)
return json_extract_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
Expand Down Expand Up @@ -1937,27 +1936,6 @@ def json_set( # type: ignore[empty-body]
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


@ibis_udf.scalar.builtin(name="json_extract")
def json_extract( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
) -> ibis_dtypes.JSON:
"""Extracts a JSON value and converts it to a JSON value."""


@ibis_udf.scalar.builtin(name="json_extract")
def json_extract_string( # type: ignore[empty-body]
json_obj: ibis_dtypes.String, json_path: ibis_dtypes.String
) -> ibis_dtypes.String:
"""Extracts a JSON SRING value and converts it to a SQL JSON-formatted STRING."""


@ibis_udf.scalar.builtin(name="json_extract_array")
def json_extract_array( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
) -> ibis_dtypes.Array[ibis_dtypes.String]:
"""Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""


@ibis_udf.scalar.builtin(name="json_extract_string_array")
def json_extract_string_array( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
Expand Down
6 changes: 3 additions & 3 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from typing import Any, Dict, List, Literal, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery
import numpy as np
Expand Down Expand Up @@ -59,7 +60,7 @@
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
# JSON
JSON_DTYPE = pd.ArrowDtype(pa.large_string())
JSON_DTYPE = db_dtypes.JSONDtype()
OBJ_REF_DTYPE = pd.ArrowDtype(
pa.struct(
(
Expand Down Expand Up @@ -161,7 +162,7 @@ class SimpleDtypeInfo:
),
SimpleDtypeInfo(
dtype=JSON_DTYPE,
arrow_dtype=pa.large_string(),
arrow_dtype=db_dtypes.JSONArrowType(),
type_kind=("JSON",),
orderable=False,
clusterable=False,
Expand Down Expand Up @@ -320,7 +321,6 @@ def is_struct_like(type_: ExpressionType) -> bool:


def is_json_like(type_: ExpressionType) -> bool:
# TODO: Add JSON type support
return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string


Expand Down
5 changes: 2 additions & 3 deletions bigframes/operations/json_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def output_type(self, *input_types):
+ f" Received type: {input_type}"
)
return pd.ArrowDtype(
pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type))
)


Expand Down Expand Up @@ -118,8 +118,7 @@ def output_type(self, *input_types):
+ f"Received type: {right_type}"
)

# After JSON type implementation, ONLY return JSON data.
return left_type
return dtypes.JSON_DTYPE


@dataclasses.dataclass(frozen=True)
Expand Down
3 changes: 3 additions & 0 deletions bigframes/session/_io/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from typing import Collection, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas # type: ignore
import numpy as np
import pandas
Expand Down Expand Up @@ -122,6 +123,8 @@ def arrow_to_pandas(
)
elif isinstance(dtype, pandas.ArrowDtype):
series = _arrow_to_pandas_arrowdtype(column, dtype)
elif isinstance(dtype, db_dtypes.JSONDtype):
series = db_dtypes.JSONArray(column)
else:
series = column.to_pandas(types_mapper=lambda _: dtype)

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"ipywidgets >=7.7.1",
"humanize >=4.6.0",
"matplotlib >=3.7.1",
"db-dtypes >=1.4.0",
# For vendored ibis-framework.
"atpublic>=2.3,<6",
"parsy>=2,<3",
Expand Down
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
db-dtypes==1.4.0
# For vendored ibis-framework.
atpublic==2.3
parsy==2.0
Expand Down
64 changes: 46 additions & 18 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type():
def test_json_extract_from_json():
s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}])
actual = bbq.json_extract(s, "$.a.b").to_pandas()
# After the introduction of the JSON type, the output should be a JSON-formatted series.
expected = _get_series_from_json([[1, 2], None, 0]).to_pandas()
pd.testing.assert_series_equal(
actual,
Expand All @@ -129,12 +128,10 @@ def test_json_extract_from_json():
def test_json_extract_from_string():
s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'])
actual = bbq.json_extract(s, "$.a.b")
expected = _get_series_from_json([[1, 2], None, 0])
expected = bpd.Series(["[1,2]", None, "0"])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
check_names=False,
check_dtype=False, # json_extract returns string type. While _get_series_from_json gives a JSON series (pa.large_string).
)


Expand All @@ -143,29 +140,68 @@ def test_json_extract_w_invalid_series_type():
bbq.json_extract(bpd.Series([1, 2]), "$.a")


def test_json_extract_array_from_json():
s = _get_series_from_json(
[{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}]
)
actual = bbq.json_extract_array(s, "$.a")

# This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
# which currently prevents constructing a series using the pa.list_(db_dtypes.JSONArrowType())
sql = """
SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data,
UNION ALL
SELECT 1, [],
UNION ALL
SELECT 2, [JSON '"4"', JSON '"5"'],
UNION ALL
SELECT 3, null,
"""
df = bpd.read_gbq(sql).set_index("id").sort_index()
expected = df["data"]

pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_from_json_strings():
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
s = bpd.Series(
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"],
dtype=pd.StringDtype(storage="pyarrow"),
)
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
expected = bpd.Series(
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
dtype=pd.StringDtype(storage="pyarrow"),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_from_array_strings():
s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
def test_json_extract_array_from_json_array_strings():
s = bpd.Series(
["[1, 2, 3]", "[]", "[4,5]"],
dtype=pd.StringDtype(storage="pyarrow"),
)
actual = bbq.json_extract_array(s)
expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
expected = bpd.Series(
[["1", "2", "3"], [], ["4", "5"]],
dtype=pd.StringDtype(storage="pyarrow"),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_w_invalid_series_type():
s = bpd.Series([1, 2])
with pytest.raises(TypeError):
bbq.json_extract_array(bpd.Series([1, 2]))
bbq.json_extract_array(s)


def test_json_extract_string_array_from_json_strings():
Expand Down Expand Up @@ -203,14 +239,6 @@ def test_json_extract_string_array_w_invalid_series_type():
bbq.json_extract_string_array(bpd.Series([1, 2]))


# b/381148539
def test_json_in_struct():
df = bpd.read_gbq(
"SELECT STRUCT(JSON '{\\\"a\\\": 1}' AS data, 1 AS number) as struct_col"
)
assert df["struct_col"].struct.field("data")[0] == '{"a":1}'
Comment on lines -206 to -211
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we keep / update this test, instead? I'd like to make sure we avoid regressions since I believe this was added to make sure we can work with some AI/ML/ObjectRef features.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I moved this test to test_dataframe_io.py. Also add similar tests for both struct and array



def test_parse_json_w_invalid_series_type():
with pytest.raises(TypeError):
bbq.parse_json(bpd.Series([1, 2]))
Loading
Loading