fix: avoid "Unable to determine type" warning with JSON columns in to_dataframe (#1876)

tswast · web-flow · commit 968020d5be9d · 2025-03-20T11:08:48.000-05:00
* add regression tests for empty dataframe
* fix arrow test to be compatible with old pyarrow
diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py
@@ -387,6 +387,21 @@ def range_to_py(self, value, field):
 CELL_DATA_PARSER = CellDataParser()
 
 
+class DataFrameCellDataParser(CellDataParser):
+    """Override of CellDataParser to handle differences in the values expected by DataFrame-like outputs.
+
+    This is used to turn the output of the REST API into a pyarrow Table,
+    emulating the serialized arrow from the BigQuery Storage Read API.
+    """
+
+    def json_to_py(self, value, _):
+        """No-op because DataFrame expects string for JSON output."""
+        return value
+
+
+DATA_FRAME_CELL_DATA_PARSER = DataFrameCellDataParser()
+
+
 class ScalarQueryParamParser(CellDataParser):
     """Override of CellDataParser to handle the differences in the response from query params.
 
diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
@@ -158,6 +158,7 @@ def finish(self):
         b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
     },
     "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
+    "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
 }
 
 
diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py
@@ -15,7 +15,9 @@
 """Shared helper functions for connecting BigQuery and pyarrow.
 
 NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
-instead. See: go/pandas-gbq-and-bigframes-redundancy and
+instead. See: go/pandas-gbq-and-bigframes-redundancy,
+https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
+and
 https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
 """
 
@@ -26,6 +28,14 @@
 except ImportError:
     pyarrow = None
 
+try:
+    import db_dtypes  # type: ignore
+
+    db_dtypes_import_exception = None
+except ImportError as exc:
+    db_dtypes = None
+    db_dtypes_import_exception = exc
+
 
 def pyarrow_datetime():
     return pyarrow.timestamp("us", tz=None)
@@ -67,12 +77,18 @@ def pyarrow_timestamp():
         "GEOGRAPHY": pyarrow.string,
         "INT64": pyarrow.int64,
         "INTEGER": pyarrow.int64,
+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        "JSON": pyarrow.string,
         "NUMERIC": pyarrow_numeric,
         "STRING": pyarrow.string,
         "TIME": pyarrow_time,
         "TIMESTAMP": pyarrow_timestamp,
     }
 
+    # DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
     _ARROW_SCALAR_IDS_TO_BQ = {
         # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
         pyarrow.bool_().id: "BOOL",
@@ -97,6 +113,9 @@ def pyarrow_timestamp():
         pyarrow.large_string().id: "STRING",
         # The exact scale and precision don't matter, see below.
         pyarrow.decimal128(38, scale=9).id: "NUMERIC",
+        # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
+        # have the same id (31 as of version 19.0.1), so these should not be
+        # matched by id.
     }
 
     _BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
@@ -107,6 +126,9 @@ def pyarrow_timestamp():
 
 def bq_to_arrow_scalars(bq_scalar: str):
     """
+    DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
+    to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.
+
     Returns:
         The Arrow scalar type that the input BigQuery scalar type maps to.
         If it cannot find the BigQuery scalar, return None.
@@ -116,6 +138,8 @@ def bq_to_arrow_scalars(bq_scalar: str):
 
 def arrow_scalar_ids_to_bq(arrow_scalar: Any):
     """
+    DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
+
     Returns:
         The BigQuery scalar type that the input arrow scalar type maps to.
         If it cannot find the arrow scalar, return None.
diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py
@@ -3533,7 +3533,9 @@ def _row_iterator_page_columns(schema, response):
 
     def get_column_data(field_index, field):
         for row in rows:
-            yield _helpers.CELL_DATA_PARSER.to_py(row["f"][field_index]["v"], field)
+            yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py(
+                row["f"][field_index]["v"], field
+            )
 
     for field_index, field in enumerate(schema):
         columns.append(get_column_data(field_index, field))
diff --git a/tests/system/test_arrow.py b/tests/system/test_arrow.py
@@ -194,3 +194,32 @@ def test_list_rows_range_csv(
 
     range_type = schema.field("range_date").type
     assert range_type == expected_type
+
+
+def test_to_arrow_query_with_empty_results(bigquery_client):
+    """
+    JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580.
+    """
+    job = bigquery_client.query(
+        """
+        select
+        123 as int_col,
+        '' as string_col,
+        to_json('{}') as json_col,
+        struct(to_json('[]') as json_field, -1 as int_field) as struct_col,
+        [to_json('null')] as json_array_col,
+        from unnest([])
+        """
+    )
+    table = job.to_arrow()
+    assert list(table.column_names) == [
+        "int_col",
+        "string_col",
+        "json_col",
+        "struct_col",
+        "json_array_col",
+    ]
+    assert table.shape == (0, 5)
+    struct_type = table.field("struct_col").type
+    assert struct_type.get_field_index("json_field") == 0
+    assert struct_type.get_field_index("int_field") == 1
diff --git a/tests/system/test_pandas.py b/tests/system/test_pandas.py
@@ -1304,6 +1304,32 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
     ]
 
 
+def test_to_dataframe_query_with_empty_results(bigquery_client):
+    """
+    JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580.
+    """
+    job = bigquery_client.query(
+        """
+        select
+        123 as int_col,
+        '' as string_col,
+        to_json('{}') as json_col,
+        struct(to_json('[]') as json_field, -1 as int_field) as struct_col,
+        [to_json('null')] as json_array_col,
+        from unnest([])
+        """
+    )
+    df = job.to_dataframe()
+    assert list(df.columns) == [
+        "int_col",
+        "string_col",
+        "json_col",
+        "struct_col",
+        "json_array_col",
+    ]
+    assert len(df.index) == 0
+
+
 def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id):
     wkt = pytest.importorskip("shapely.wkt")
     bigquery_client.query(
diff --git a/tests/unit/_helpers/test_data_frame_cell_data_parser.py b/tests/unit/_helpers/test_data_frame_cell_data_parser.py
@@ -0,0 +1,71 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import google.cloud.bigquery.schema
+
+
+def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs):
+    return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs)
+
+
+@pytest.fixture
+def mut():
+    from google.cloud.bigquery import _helpers
+
+    return _helpers
+
+
+@pytest.fixture
+def object_under_test(mut):
+    return mut.DATA_FRAME_CELL_DATA_PARSER
+
+
+def test_json_to_py_doesnt_parse_json(object_under_test):
+    coerced = object_under_test.json_to_py('{"key":"value"}', create_field())
+    assert coerced == '{"key":"value"}'
+
+
+def test_json_to_py_repeated_doesnt_parse_json(object_under_test):
+    coerced = object_under_test.json_to_py('{"key":"value"}', create_field("REPEATED"))
+    assert coerced == '{"key":"value"}'
+
+
+def test_record_to_py_doesnt_parse_json(object_under_test):
+    subfield = create_field(type_="JSON", name="json")
+    field = create_field(fields=[subfield])
+    value = {"f": [{"v": '{"key":"value"}'}]}
+    coerced = object_under_test.record_to_py(value, field)
+    assert coerced == {"json": '{"key":"value"}'}
+
+
+def test_record_to_py_doesnt_parse_repeated_json(object_under_test):
+    subfield = create_field("REPEATED", "JSON", name="json")
+    field = create_field("REQUIRED", fields=[subfield])
+    value = {
+        "f": [
+            {
+                "v": [
+                    {"v": '{"key":"value0"}'},
+                    {"v": '{"key":"value1"}'},
+                    {"v": '{"key":"value2"}'},
+                ]
+            }
+        ]
+    }
+    coerced = object_under_test.record_to_py(value, field)
+    assert coerced == {
+        "json": ['{"key":"value0"}', '{"key":"value1"}', '{"key":"value2"}']
+    }
diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py
@@ -27,8 +27,16 @@ def module_under_test():
 
 def test_bq_to_arrow_scalars(module_under_test):
     assert (
-        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
-        == module_under_test.pyarrow_bignumeric
+        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")()
+        == module_under_test.pyarrow_bignumeric()
+    )
+    assert (
+        # Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
+        # but we'd like this to map as closely to the BQ Storage API as
+        # possible, which uses the string() dtype, as JSON support in Arrow
+        # predates JSON support in BigQuery by several years.
+        module_under_test.bq_to_arrow_scalars("JSON")()
+        == pyarrow.string()
     )
     assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
 
diff --git a/tests/unit/test_table_arrow.py b/tests/unit/test_table_arrow.py
@@ -28,6 +28,7 @@ def test_to_arrow_with_jobs_query_response():
             "fields": [
                 {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                 {"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
+                {"name": "json", "type": "JSON", "mode": "NULLABLE"},
             ]
         },
         "jobReference": {
@@ -37,15 +38,21 @@ def test_to_arrow_with_jobs_query_response():
         },
         "totalRows": "9",
         "rows": [
-            {"f": [{"v": "Tiarra"}, {"v": "6"}]},
-            {"f": [{"v": "Timothy"}, {"v": "325"}]},
-            {"f": [{"v": "Tina"}, {"v": "26"}]},
-            {"f": [{"v": "Tierra"}, {"v": "10"}]},
-            {"f": [{"v": "Tia"}, {"v": "17"}]},
-            {"f": [{"v": "Tiara"}, {"v": "22"}]},
-            {"f": [{"v": "Tiana"}, {"v": "6"}]},
-            {"f": [{"v": "Tiffany"}, {"v": "229"}]},
-            {"f": [{"v": "Tiffani"}, {"v": "8"}]},
+            {"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
+            {"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
+            {"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
+            {
+                "f": [
+                    {"v": "Tierra"},
+                    {"v": "10"},
+                    {"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
+                ]
+            },
+            {"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
+            {"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
+            {"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
+            {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
+            {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
         ],
         "totalBytesProcessed": "154775150",
         "jobComplete": True,
@@ -65,7 +72,7 @@ def test_to_arrow_with_jobs_query_response():
     )
     records = rows.to_arrow()
 
-    assert records.column_names == ["name", "number"]
+    assert records.column_names == ["name", "number", "json"]
     assert records["name"].to_pylist() == [
         "Tiarra",
         "Timothy",
@@ -78,6 +85,17 @@ def test_to_arrow_with_jobs_query_response():
         "Tiffani",
     ]
     assert records["number"].to_pylist() == [6, 325, 26, 10, 17, 22, 6, 229, 8]
+    assert records["json"].to_pylist() == [
+        "123",
+        '{"key":"value"}',
+        "[1,2,3]",
+        '{"aKey": {"bKey": {"cKey": -123}}}',
+        None,
+        '"some-json-string"',
+        '{"nullKey":null}',
+        '""',
+        "[]",
+    ]
 
 
 def test_to_arrow_with_jobs_query_response_and_max_results():
@@ -87,6 +105,7 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
             "fields": [
                 {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                 {"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
+                {"name": "json", "type": "JSON", "mode": "NULLABLE"},
             ]
         },
         "jobReference": {
@@ -96,15 +115,21 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
         },
         "totalRows": "9",
         "rows": [
-            {"f": [{"v": "Tiarra"}, {"v": "6"}]},
-            {"f": [{"v": "Timothy"}, {"v": "325"}]},
-            {"f": [{"v": "Tina"}, {"v": "26"}]},
-            {"f": [{"v": "Tierra"}, {"v": "10"}]},
-            {"f": [{"v": "Tia"}, {"v": "17"}]},
-            {"f": [{"v": "Tiara"}, {"v": "22"}]},
-            {"f": [{"v": "Tiana"}, {"v": "6"}]},
-            {"f": [{"v": "Tiffany"}, {"v": "229"}]},
-            {"f": [{"v": "Tiffani"}, {"v": "8"}]},
+            {"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
+            {"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
+            {"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
+            {
+                "f": [
+                    {"v": "Tierra"},
+                    {"v": "10"},
+                    {"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
+                ]
+            },
+            {"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
+            {"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
+            {"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
+            {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
+            {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
         ],
         "totalBytesProcessed": "154775150",
         "jobComplete": True,
@@ -125,10 +150,11 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
     )
     records = rows.to_arrow()
 
-    assert records.column_names == ["name", "number"]
+    assert records.column_names == ["name", "number", "json"]
     assert records["name"].to_pylist() == [
         "Tiarra",
         "Timothy",
         "Tina",
     ]
     assert records["number"].to_pylist() == [6, 325, 26]
+    assert records["json"].to_pylist() == ["123", '{"key":"value"}', "[1,2,3]"]
diff --git a/tests/unit/test_table_pandas.py b/tests/unit/test_table_pandas.py

Original file line number	Diff line number	Diff line change
`@@ -158,6 +158,7 @@ def finish(self):`
`158`	`158`	`b"ARROW:extension:metadata": b'{"encoding": "WKT"}',`
`159`	`159`	`},`
`160`	`160`	`"DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},`
	`161`	`+ "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},`
`161`	`162`	`}`
`162`	`163`
`163`	`164`