Skip to content

feat!: Enable reading JSON data with dbjson extension dtype #1139

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bigframes/bigquery/_operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def json_set(
>>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
>>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
0 {"a":100,"b":"hi"}
Name: data, dtype: large_string[pyarrow]
Name: data, dtype: dbjson

Args:
input (bigframes.series.Series):
Expand Down Expand Up @@ -253,7 +253,7 @@ def parse_json(
dtype: string
>>> bbq.parse_json(s)
0 {"class":{"students":[{"id":5},{"id":12}]}}
dtype: large_string[pyarrow]
dtype: dbjson

Args:
input (bigframes.series.Series):
Expand Down
4 changes: 2 additions & 2 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ def from_table(
raise ValueError("must set at most one of 'offsets', 'primary_key'")
if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
msg = (
"Interpreting JSON column(s) as pyarrow.large_string. "
"This behavior may change in future versions."
"Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is "
"in preview; this behavior may change in future versions."
)
warnings.warn(msg, bfe.PreviewWarning)
# define data source only for needed columns, this makes row-hashing cheaper
Expand Down
11 changes: 2 additions & 9 deletions bigframes/core/compile/ibis_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import textwrap
import typing
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
import warnings

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis
Expand All @@ -26,14 +25,14 @@
dtype as python_type_to_ibis_type,
)
import bigframes_vendored.ibis.expr.types as ibis_types
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import numpy as np
import pandas as pd
import pyarrow as pa

import bigframes.dtypes
import bigframes.exceptions as bfe

# Type hints for Ibis data types supported by BigQuery DataFrame
IbisDtype = Union[
Expand Down Expand Up @@ -76,7 +75,7 @@
ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True),
gpd.array.GeometryDtype(),
),
(ibis_dtypes.json, pd.ArrowDtype(pa.large_string())),
(ibis_dtypes.json, db_dtypes.JSONDtype()),
)

BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
Expand Down Expand Up @@ -305,13 +304,7 @@ def ibis_dtype_to_bigframes_dtype(
if isinstance(ibis_dtype, ibis_dtypes.Integer):
return pd.Int64Dtype()

# Temporary: Will eventually support an explicit json type instead of casting to string.
if isinstance(ibis_dtype, ibis_dtypes.JSON):
msg = (
"Interpreting JSON column(s) as pyarrow.large_string. This behavior may change "
"in future versions."
)
warnings.warn(msg, category=bfe.PreviewWarning)
return bigframes.dtypes.JSON_DTYPE

if ibis_dtype in IBIS_TO_BIGFRAMES:
Expand Down
60 changes: 19 additions & 41 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1188,34 +1188,33 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp):
# JSON Ops
@scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
if x.type().is_json():
return json_set(
json_obj=x,
json_path=op.json_path,
json_value=y,
)
else:
# Enabling JSON type eliminates the need for less efficient string conversions.
return to_json_string(
json_set( # type: ignore
json_obj=parse_json(json_str=x),
json_path=op.json_path,
json_value=y,
)
)
return json_set(json_obj=x, json_path=op.json_path, json_value=y)


@scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
if x.type().is_json():
return json_extract(json_obj=x, json_path=op.json_path)
# json string
return json_extract_string(json_obj=x, json_path=op.json_path)
# Define a user-defined function whose returned type is dynamically matching the input.
def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
...

return_type = x.type()
json_extract.__annotations__["return"] = return_type
json_extract_op = ibis_udf.scalar.builtin(json_extract)
return json_extract_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
return json_extract_array(json_obj=x, json_path=op.json_path)
# Define a user-defined function whose returned type is dynamically matching the input.
def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
"""Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""
...

return_type = x.type()
json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore
json_extract_op = ibis_udf.scalar.builtin(json_extract_array)
return json_extract_op(json_or_json_string=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
Expand Down Expand Up @@ -1937,27 +1936,6 @@ def json_set( # type: ignore[empty-body]
"""Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


@ibis_udf.scalar.builtin(name="json_extract")
def json_extract( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
) -> ibis_dtypes.JSON:
"""Extracts a JSON value and converts it to a JSON value."""


@ibis_udf.scalar.builtin(name="json_extract")
def json_extract_string( # type: ignore[empty-body]
json_obj: ibis_dtypes.String, json_path: ibis_dtypes.String
) -> ibis_dtypes.String:
"""Extracts a JSON SRING value and converts it to a SQL JSON-formatted STRING."""


@ibis_udf.scalar.builtin(name="json_extract_array")
def json_extract_array( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
) -> ibis_dtypes.Array[ibis_dtypes.String]:
"""Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""


@ibis_udf.scalar.builtin(name="json_extract_string_array")
def json_extract_string_array( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
Expand Down
6 changes: 3 additions & 3 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from typing import Any, Dict, List, Literal, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery
import numpy as np
Expand Down Expand Up @@ -59,7 +60,7 @@
# No arrow equivalent
GEO_DTYPE = gpd.array.GeometryDtype()
# JSON
JSON_DTYPE = pd.ArrowDtype(pa.large_string())
JSON_DTYPE = db_dtypes.JSONDtype()
OBJ_REF_DTYPE = pd.ArrowDtype(
pa.struct(
(
Expand Down Expand Up @@ -161,7 +162,7 @@ class SimpleDtypeInfo:
),
SimpleDtypeInfo(
dtype=JSON_DTYPE,
arrow_dtype=pa.large_string(),
arrow_dtype=db_dtypes.JSONArrowType(),
type_kind=("JSON",),
orderable=False,
clusterable=False,
Expand Down Expand Up @@ -320,7 +321,6 @@ def is_struct_like(type_: ExpressionType) -> bool:


def is_json_like(type_: ExpressionType) -> bool:
# TODO: Add JSON type support
return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string


Expand Down
5 changes: 2 additions & 3 deletions bigframes/operations/json_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def output_type(self, *input_types):
+ f" Received type: {input_type}"
)
return pd.ArrowDtype(
pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type))
)


Expand Down Expand Up @@ -118,8 +118,7 @@ def output_type(self, *input_types):
+ f"Received type: {right_type}"
)

# After JSON type implementation, ONLY return JSON data.
return left_type
return dtypes.JSON_DTYPE


@dataclasses.dataclass(frozen=True)
Expand Down
3 changes: 3 additions & 0 deletions bigframes/session/_io/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from typing import Collection, Union

import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
import geopandas # type: ignore
import numpy as np
import pandas
Expand Down Expand Up @@ -122,6 +123,8 @@ def arrow_to_pandas(
)
elif isinstance(dtype, pandas.ArrowDtype):
series = _arrow_to_pandas_arrowdtype(column, dtype)
elif isinstance(dtype, db_dtypes.JSONDtype):
series = db_dtypes.JSONArray(column)
else:
series = column.to_pandas(types_mapper=lambda _: dtype)

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
"ipywidgets >=7.7.1",
"humanize >=4.6.0",
"matplotlib >=3.7.1",
"db-dtypes >=1.4.0",
# For vendored ibis-framework.
"atpublic>=2.3,<6",
"parsy>=2,<3",
Expand Down
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
db-dtypes==1.4.0
# For vendored ibis-framework.
atpublic==2.3
parsy==2.0
Expand Down
64 changes: 46 additions & 18 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type():
def test_json_extract_from_json():
s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}])
actual = bbq.json_extract(s, "$.a.b").to_pandas()
# After the introduction of the JSON type, the output should be a JSON-formatted series.
expected = _get_series_from_json([[1, 2], None, 0]).to_pandas()
pd.testing.assert_series_equal(
actual,
Expand All @@ -129,12 +128,10 @@ def test_json_extract_from_json():
def test_json_extract_from_string():
s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'])
actual = bbq.json_extract(s, "$.a.b")
expected = _get_series_from_json([[1, 2], None, 0])
expected = bpd.Series(["[1,2]", None, "0"])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
check_names=False,
check_dtype=False, # json_extract returns string type. While _get_series_from_json gives a JSON series (pa.large_string).
)


Expand All @@ -143,29 +140,68 @@ def test_json_extract_w_invalid_series_type():
bbq.json_extract(bpd.Series([1, 2]), "$.a")


def test_json_extract_array_from_json():
s = _get_series_from_json(
[{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}]
)
actual = bbq.json_extract_array(s, "$.a")

# This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
# which currently prevents constructing a series using the pa.list_(db_dtypes.JSONArrowType())
sql = """
SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data,
UNION ALL
SELECT 1, [],
UNION ALL
SELECT 2, [JSON '"4"', JSON '"5"'],
UNION ALL
SELECT 3, null,
"""
df = bpd.read_gbq(sql).set_index("id").sort_index()
expected = df["data"]

pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_from_json_strings():
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
s = bpd.Series(
['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"],
dtype=pd.StringDtype(storage="pyarrow"),
)
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
expected = bpd.Series(
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
dtype=pd.StringDtype(storage="pyarrow"),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_from_array_strings():
s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
def test_json_extract_array_from_json_array_strings():
s = bpd.Series(
["[1, 2, 3]", "[]", "[4,5]"],
dtype=pd.StringDtype(storage="pyarrow"),
)
actual = bbq.json_extract_array(s)
expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
expected = bpd.Series(
[["1", "2", "3"], [], ["4", "5"]],
dtype=pd.StringDtype(storage="pyarrow"),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
)


def test_json_extract_array_w_invalid_series_type():
s = bpd.Series([1, 2])
with pytest.raises(TypeError):
bbq.json_extract_array(bpd.Series([1, 2]))
bbq.json_extract_array(s)


def test_json_extract_string_array_from_json_strings():
Expand Down Expand Up @@ -203,14 +239,6 @@ def test_json_extract_string_array_w_invalid_series_type():
bbq.json_extract_string_array(bpd.Series([1, 2]))


# b/381148539
def test_json_in_struct():
df = bpd.read_gbq(
"SELECT STRUCT(JSON '{\\\"a\\\": 1}' AS data, 1 AS number) as struct_col"
)
assert df["struct_col"].struct.field("data")[0] == '{"a":1}'
Comment on lines -206 to -211
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we keep / update this test, instead? I'd like to make sure we avoid regressions since I believe this was added to make sure we can work with some AI/ML/ObjectRef features.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I moved this test to test_dataframe_io.py. Also add similar tests for both struct and array



def test_parse_json_w_invalid_series_type():
with pytest.raises(TypeError):
bbq.parse_json(bpd.Series([1, 2]))
Loading
Loading