Skip to content

Commit 968020d

Browse files
authored
fix: avoid "Unable to determine type" warning with JSON columns in to_dataframe (#1876)
* add regression tests for empty dataframe * fix arrow test to be compatible with old pyarrow
1 parent 9acd9c1 commit 968020d

10 files changed

+230
-24
lines changed

google/cloud/bigquery/_helpers.py

+15
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,21 @@ def range_to_py(self, value, field):
387387
CELL_DATA_PARSER = CellDataParser()
388388

389389

390+
class DataFrameCellDataParser(CellDataParser):
    """CellDataParser variant matching the expectations of DataFrame-like outputs.

    Used when turning the output of the REST API into a pyarrow Table, so
    that the result emulates the serialized arrow produced by the BigQuery
    Storage Read API. Only ``json_to_py`` is overridden here.
    """

    def json_to_py(self, value, _):
        """Return the JSON cell value untouched.

        This is a deliberate no-op: DataFrame output expects JSON as a
        string. The second positional argument is ignored.
        """
        return value
400+
401+
402+
DATA_FRAME_CELL_DATA_PARSER = DataFrameCellDataParser()
403+
404+
390405
class ScalarQueryParamParser(CellDataParser):
391406
"""Override of CellDataParser to handle the differences in the response from query params.
392407

google/cloud/bigquery/_pandas_helpers.py

+1
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ def finish(self):
158158
b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
159159
},
160160
"DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
161+
"JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
161162
}
162163

163164

google/cloud/bigquery/_pyarrow_helpers.py

+25-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
"""Shared helper functions for connecting BigQuery and pyarrow.
1616
1717
NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
18-
instead. See: go/pandas-gbq-and-bigframes-redundancy and
18+
instead. See: go/pandas-gbq-and-bigframes-redundancy,
19+
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
20+
and
1921
https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
2022
"""
2123

@@ -26,6 +28,14 @@
2628
except ImportError:
2729
pyarrow = None
2830

31+
try:
32+
import db_dtypes # type: ignore
33+
34+
db_dtypes_import_exception = None
35+
except ImportError as exc:
36+
db_dtypes = None
37+
db_dtypes_import_exception = exc
38+
2939

3040
def pyarrow_datetime():
3141
return pyarrow.timestamp("us", tz=None)
@@ -67,12 +77,18 @@ def pyarrow_timestamp():
6777
"GEOGRAPHY": pyarrow.string,
6878
"INT64": pyarrow.int64,
6979
"INTEGER": pyarrow.int64,
80+
# Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
81+
# but we'd like this to map as closely to the BQ Storage API as
82+
# possible, which uses the string() dtype, as JSON support in BigQuery
83+
# predates JSON support in Arrow by several years.
84+
"JSON": pyarrow.string,
7085
"NUMERIC": pyarrow_numeric,
7186
"STRING": pyarrow.string,
7287
"TIME": pyarrow_time,
7388
"TIMESTAMP": pyarrow_timestamp,
7489
}
7590

91+
# DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
7692
_ARROW_SCALAR_IDS_TO_BQ = {
7793
# https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
7894
pyarrow.bool_().id: "BOOL",
@@ -97,6 +113,9 @@ def pyarrow_timestamp():
97113
pyarrow.large_string().id: "STRING",
98114
# The exact scale and precision don't matter, see below.
99115
pyarrow.decimal128(38, scale=9).id: "NUMERIC",
116+
# NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
117+
# have the same id (31 as of version 19.0.1), so these should not be
118+
# matched by id.
100119
}
101120

102121
_BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
@@ -107,6 +126,9 @@ def pyarrow_timestamp():
107126

108127
def bq_to_arrow_scalars(bq_scalar: str):
109128
"""
129+
DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
130+
to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.
131+
110132
Returns:
111133
The Arrow scalar type that the input BigQuery scalar type maps to.
112134
If it cannot find the BigQuery scalar, return None.
@@ -116,6 +138,8 @@ def bq_to_arrow_scalars(bq_scalar: str):
116138

117139
def arrow_scalar_ids_to_bq(arrow_scalar: Any):
118140
"""
141+
DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
142+
119143
Returns:
120144
The BigQuery scalar type that the input arrow scalar type maps to.
121145
If it cannot find the arrow scalar, return None.

google/cloud/bigquery/table.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -3533,7 +3533,9 @@ def _row_iterator_page_columns(schema, response):
35333533

35343534
def get_column_data(field_index, field):
35353535
for row in rows:
3536-
yield _helpers.CELL_DATA_PARSER.to_py(row["f"][field_index]["v"], field)
3536+
yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py(
3537+
row["f"][field_index]["v"], field
3538+
)
35373539

35383540
for field_index, field in enumerate(schema):
35393541
columns.append(get_column_data(field_index, field))

tests/system/test_arrow.py

+29
Original file line numberDiff line numberDiff line change
@@ -194,3 +194,32 @@ def test_list_rows_range_csv(
194194

195195
range_type = schema.field("range_date").type
196196
assert range_type == expected_type
197+
198+
199+
def test_to_arrow_query_with_empty_results(bigquery_client):
    """Check ``to_arrow()`` on a zero-row query whose schema contains JSON.

    JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580.
    """
    expected_columns = [
        "int_col",
        "string_col",
        "json_col",
        "struct_col",
        "json_array_col",
    ]

    # `from unnest([])` yields no rows, so only the schema reaches the client.
    query_job = bigquery_client.query(
        """
        select
            123 as int_col,
            '' as string_col,
            to_json('{}') as json_col,
            struct(to_json('[]') as json_field, -1 as int_field) as struct_col,
            [to_json('null')] as json_array_col,
        from unnest([])
        """
    )
    arrow_table = query_job.to_arrow()

    assert list(arrow_table.column_names) == expected_columns
    assert arrow_table.shape == (0, 5)

    # Sub-fields of the struct column must keep their declared order.
    struct_type = arrow_table.field("struct_col").type
    for position, subfield_name in enumerate(("json_field", "int_field")):
        assert struct_type.get_field_index(subfield_name) == position

tests/system/test_pandas.py

+26
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,32 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
13041304
]
13051305

13061306

1307+
def test_to_dataframe_query_with_empty_results(bigquery_client):
    """Check ``to_dataframe()`` on a zero-row query whose schema contains JSON.

    JSON regression test for https://github.com/googleapis/python-bigquery/issues/1580.
    """
    expected_columns = [
        "int_col",
        "string_col",
        "json_col",
        "struct_col",
        "json_array_col",
    ]

    # `from unnest([])` yields no rows, so only the schema reaches the client.
    query_job = bigquery_client.query(
        """
        select
            123 as int_col,
            '' as string_col,
            to_json('{}') as json_col,
            struct(to_json('[]') as json_field, -1 as int_field) as struct_col,
            [to_json('null')] as json_array_col,
        from unnest([])
        """
    )
    frame = query_job.to_dataframe()

    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 0
1331+
1332+
13071333
def test_to_dataframe_geography_as_objects(bigquery_client, dataset_id):
13081334
wkt = pytest.importorskip("shapely.wkt")
13091335
bigquery_client.query(
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
import google.cloud.bigquery.schema
18+
19+
20+
def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs):
    """Build a ``SchemaField`` for tests, with placeholder defaults.

    ``name`` and ``type_`` are passed positionally and ``mode`` by keyword;
    any extra keyword arguments (for example ``fields``) are forwarded as-is.
    """
    schema_field_cls = google.cloud.bigquery.schema.SchemaField
    return schema_field_cls(name, type_, mode=mode, **kwargs)
22+
23+
24+
@pytest.fixture
def mut():
    """Provide the module under test, ``google.cloud.bigquery._helpers``."""
    # Imported inside the fixture rather than at module import time.
    import google.cloud.bigquery._helpers as helpers_module

    return helpers_module
29+
30+
31+
@pytest.fixture
def object_under_test(mut):
    """Provide the module-level ``DATA_FRAME_CELL_DATA_PARSER`` instance."""
    parser = mut.DATA_FRAME_CELL_DATA_PARSER
    return parser
34+
35+
36+
def test_json_to_py_doesnt_parse_json(object_under_test):
    """``json_to_py`` must hand back the JSON text exactly as received."""
    raw_json = '{"key":"value"}'
    result = object_under_test.json_to_py(raw_json, create_field())
    assert result == raw_json
39+
40+
41+
def test_json_to_py_repeated_doesnt_parse_json(object_under_test):
    """``json_to_py`` leaves the text unparsed even for a REPEATED field."""
    raw_json = '{"key":"value"}'
    repeated_field = create_field("REPEATED")
    result = object_under_test.json_to_py(raw_json, repeated_field)
    assert result == raw_json
44+
45+
46+
def test_record_to_py_doesnt_parse_json(object_under_test):
    """A JSON sub-field inside a record stays a string after ``record_to_py``."""
    raw_json = '{"key":"value"}'
    json_subfield = create_field(type_="JSON", name="json")
    record_field = create_field(fields=[json_subfield])

    # REST-style record payload: one cell per sub-field under "f".
    record_value = {"f": [{"v": raw_json}]}

    result = object_under_test.record_to_py(record_value, record_field)
    assert result == {"json": raw_json}
52+
53+
54+
def test_record_to_py_doesnt_parse_repeated_json(object_under_test):
    """A REPEATED JSON sub-field inside a record yields a list of raw strings."""
    raw_items = ['{"key":"value0"}', '{"key":"value1"}', '{"key":"value2"}']
    json_subfield = create_field("REPEATED", "JSON", name="json")
    record_field = create_field("REQUIRED", fields=[json_subfield])

    # REST-style payload: the single sub-field cell holds a list of {"v": ...}
    # items, one per repeated element.
    record_value = {"f": [{"v": [{"v": item} for item in raw_items]}]}

    result = object_under_test.record_to_py(record_value, record_field)
    assert result == {"json": raw_items}

tests/unit/test__pyarrow_helpers.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,16 @@ def module_under_test():
2727

2828
def test_bq_to_arrow_scalars(module_under_test):
2929
assert (
30-
module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
31-
== module_under_test.pyarrow_bignumeric
30+
module_under_test.bq_to_arrow_scalars("BIGNUMERIC")()
31+
== module_under_test.pyarrow_bignumeric()
32+
)
33+
assert (
34+
# Normally, we'd prefer JSON type built-in to pyarrow (added in 19.0.0),
35+
# but we'd like this to map as closely to the BQ Storage API as
36+
# possible, which uses the string() dtype, as JSON support in BigQuery
37+
# predates JSON support in Arrow by several years.
38+
module_under_test.bq_to_arrow_scalars("JSON")()
39+
== pyarrow.string()
3240
)
3341
assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
3442

tests/unit/test_table_arrow.py

+46-20
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def test_to_arrow_with_jobs_query_response():
2828
"fields": [
2929
{"name": "name", "type": "STRING", "mode": "NULLABLE"},
3030
{"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
31+
{"name": "json", "type": "JSON", "mode": "NULLABLE"},
3132
]
3233
},
3334
"jobReference": {
@@ -37,15 +38,21 @@ def test_to_arrow_with_jobs_query_response():
3738
},
3839
"totalRows": "9",
3940
"rows": [
40-
{"f": [{"v": "Tiarra"}, {"v": "6"}]},
41-
{"f": [{"v": "Timothy"}, {"v": "325"}]},
42-
{"f": [{"v": "Tina"}, {"v": "26"}]},
43-
{"f": [{"v": "Tierra"}, {"v": "10"}]},
44-
{"f": [{"v": "Tia"}, {"v": "17"}]},
45-
{"f": [{"v": "Tiara"}, {"v": "22"}]},
46-
{"f": [{"v": "Tiana"}, {"v": "6"}]},
47-
{"f": [{"v": "Tiffany"}, {"v": "229"}]},
48-
{"f": [{"v": "Tiffani"}, {"v": "8"}]},
41+
{"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
42+
{"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
43+
{"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
44+
{
45+
"f": [
46+
{"v": "Tierra"},
47+
{"v": "10"},
48+
{"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
49+
]
50+
},
51+
{"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
52+
{"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
53+
{"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
54+
{"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
55+
{"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
4956
],
5057
"totalBytesProcessed": "154775150",
5158
"jobComplete": True,
@@ -65,7 +72,7 @@ def test_to_arrow_with_jobs_query_response():
6572
)
6673
records = rows.to_arrow()
6774

68-
assert records.column_names == ["name", "number"]
75+
assert records.column_names == ["name", "number", "json"]
6976
assert records["name"].to_pylist() == [
7077
"Tiarra",
7178
"Timothy",
@@ -78,6 +85,17 @@ def test_to_arrow_with_jobs_query_response():
7885
"Tiffani",
7986
]
8087
assert records["number"].to_pylist() == [6, 325, 26, 10, 17, 22, 6, 229, 8]
88+
assert records["json"].to_pylist() == [
89+
"123",
90+
'{"key":"value"}',
91+
"[1,2,3]",
92+
'{"aKey": {"bKey": {"cKey": -123}}}',
93+
None,
94+
'"some-json-string"',
95+
'{"nullKey":null}',
96+
'""',
97+
"[]",
98+
]
8199

82100

83101
def test_to_arrow_with_jobs_query_response_and_max_results():
@@ -87,6 +105,7 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
87105
"fields": [
88106
{"name": "name", "type": "STRING", "mode": "NULLABLE"},
89107
{"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
108+
{"name": "json", "type": "JSON", "mode": "NULLABLE"},
90109
]
91110
},
92111
"jobReference": {
@@ -96,15 +115,21 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
96115
},
97116
"totalRows": "9",
98117
"rows": [
99-
{"f": [{"v": "Tiarra"}, {"v": "6"}]},
100-
{"f": [{"v": "Timothy"}, {"v": "325"}]},
101-
{"f": [{"v": "Tina"}, {"v": "26"}]},
102-
{"f": [{"v": "Tierra"}, {"v": "10"}]},
103-
{"f": [{"v": "Tia"}, {"v": "17"}]},
104-
{"f": [{"v": "Tiara"}, {"v": "22"}]},
105-
{"f": [{"v": "Tiana"}, {"v": "6"}]},
106-
{"f": [{"v": "Tiffany"}, {"v": "229"}]},
107-
{"f": [{"v": "Tiffani"}, {"v": "8"}]},
118+
{"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
119+
{"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
120+
{"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
121+
{
122+
"f": [
123+
{"v": "Tierra"},
124+
{"v": "10"},
125+
{"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
126+
]
127+
},
128+
{"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
129+
{"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
130+
{"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
131+
{"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
132+
{"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
108133
],
109134
"totalBytesProcessed": "154775150",
110135
"jobComplete": True,
@@ -125,10 +150,11 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
125150
)
126151
records = rows.to_arrow()
127152

128-
assert records.column_names == ["name", "number"]
153+
assert records.column_names == ["name", "number", "json"]
129154
assert records["name"].to_pylist() == [
130155
"Tiarra",
131156
"Timothy",
132157
"Tina",
133158
]
134159
assert records["number"].to_pylist() == [6, 325, 26]
160+
assert records["json"].to_pylist() == ["123", '{"key":"value"}', "[1,2,3]"]

0 commit comments

Comments
 (0)