
Commit 2580ef9

fix: avoid "Unable to determine type" warning with JSON columns in to_dataframe
1 parent 56de5c4 commit 2580ef9

File tree: 7 files changed, +173 -24 lines

google/cloud/bigquery/_helpers.py (+15)

@@ -353,6 +353,21 @@ def range_to_py(self, value, field):
 CELL_DATA_PARSER = CellDataParser()
 
 
+class DataFrameCellDataParser(CellDataParser):
+    """Override of CellDataParser to handle differences in the expected values for DataFrame-like outputs.
+
+    This is used to turn the output of the REST API into a pyarrow Table,
+    emulating the serialized arrow from the BigQuery Storage Read API.
+    """
+
+    def json_to_py(self, value, _):
+        """No-op because DataFrame expects a string for JSON output."""
+        return value
+
+
+DATA_FRAME_CELL_DATA_PARSER = DataFrameCellDataParser()
+
+
 class ScalarQueryParamParser(CellDataParser):
     """Override of CellDataParser to handle the differences in the response from query params.
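
For context, a minimal sketch (not part of the commit) contrasting the two parsers on a JSON cell value; the field name "payload" and the sample value are illustrative, and the base parser is assumed to decode JSON cells into Python objects:

    from google.cloud.bigquery import _helpers
    from google.cloud.bigquery.schema import SchemaField

    field = SchemaField("payload", "JSON", mode="NULLABLE")  # hypothetical field
    raw = '{"key":"value"}'

    # Base parser: decodes the REST cell value (presumably into a dict here).
    decoded = _helpers.CELL_DATA_PARSER.to_py(raw, field)

    # DataFrame parser: leaves the JSON cell as its original string, matching
    # the serialized Arrow produced by the BigQuery Storage Read API.
    assert _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py(raw, field) == raw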

google/cloud/bigquery/_pandas_helpers.py (+1)

@@ -158,6 +158,7 @@ def finish(self):
         b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
     },
     "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
+    "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
 }
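
As an illustrative aside (not from the commit), the metadata entry added above is the kind of annotation that ends up on the Arrow field for a JSON column; a small sketch assuming only pyarrow, with a hypothetical field name "payload":

    import pyarrow

    # Arrow field carrying the BigQuery JSON extension annotation in its
    # metadata, mirroring the "JSON" entry added in the diff above.
    json_field = pyarrow.field(
        "payload",
        pyarrow.string(),
        metadata={b"ARROW:extension:name": b"google:sqlType:json"},
    )
    assert json_field.metadata[b"ARROW:extension:name"] == b"google:sqlType:json"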

google/cloud/bigquery/_pyarrow_helpers.py (+31 -1)

@@ -15,7 +15,9 @@
 """Shared helper functions for connecting BigQuery and pyarrow.
 
 NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
-instead. See: go/pandas-gbq-and-bigframes-redundancy and
+instead. See: go/pandas-gbq-and-bigframes-redundancy,
+https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
+and
 https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
 """

@@ -26,6 +28,14 @@
 except ImportError:
     pyarrow = None
 
+try:
+    import db_dtypes  # type: ignore
+
+    db_dtypes_import_exception = None
+except ImportError as exc:
+    db_dtypes = None
+    db_dtypes_import_exception = exc
+
 
 def pyarrow_datetime():
     return pyarrow.timestamp("us", tz=None)

@@ -53,6 +63,16 @@ def pyarrow_timestamp():
 _ARROW_SCALAR_IDS_TO_BQ = {}
 
 if pyarrow:
+    # Prefer the JSON type built in to pyarrow (added in 19.0.0), if available.
+    # Otherwise, fall back to db-dtypes, where JSONArrowType was added in 1.4.0;
+    # if an older db-dtypes is installed, fall back to a plain string type.
+    # TODO(https://github.com/pandas-dev/pandas/issues/60958): switch to
+    # pyarrow.json_(pyarrow.string()) if available and supported by pandas.
+    if hasattr(db_dtypes, "JSONArrowType"):
+        json_arrow_type = db_dtypes.JSONArrowType()
+    else:
+        json_arrow_type = pyarrow.string()
+
     # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
     # When modifying it be sure to update it there as well.
     # Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py

@@ -67,12 +87,14 @@ def pyarrow_timestamp():
         "GEOGRAPHY": pyarrow.string,
         "INT64": pyarrow.int64,
         "INTEGER": pyarrow.int64,
+        "JSON": lambda: json_arrow_type,
         "NUMERIC": pyarrow_numeric,
         "STRING": pyarrow.string,
         "TIME": pyarrow_time,
         "TIMESTAMP": pyarrow_timestamp,
     }
 
+    # DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
     _ARROW_SCALAR_IDS_TO_BQ = {
         # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
         pyarrow.bool_().id: "BOOL",

@@ -97,6 +119,9 @@ def pyarrow_timestamp():
         pyarrow.large_string().id: "STRING",
         # The exact scale and precision don't matter, see below.
         pyarrow.decimal128(38, scale=9).id: "NUMERIC",
+        # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
+        # have the same id (31 as of version 19.0.1), so these should not be
+        # matched by id.
     }
 
     _BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric

@@ -107,6 +132,9 @@
 
 def bq_to_arrow_scalars(bq_scalar: str):
     """
+    DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
+    to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.
+
     Returns:
         The Arrow scalar type that the input BigQuery scalar type maps to.
         If it cannot find the BigQuery scalar, return None.

@@ -116,6 +144,8 @@ def bq_to_arrow_scalars(bq_scalar: str):
 
 def arrow_scalar_ids_to_bq(arrow_scalar: Any):
     """
+    DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
+
     Returns:
         The BigQuery scalar type that the input arrow scalar type maps to.
         If it cannot find the arrow scalar, return None.
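
A brief usage sketch (not from the commit) of the new "JSON" mapping; it assumes pyarrow is installed and, optionally, db-dtypes 1.4.0 or newer:

    from google.cloud.bigquery import _pyarrow_helpers

    # The mapping stores zero-argument factories, so call the result to get the
    # concrete Arrow type: db_dtypes.JSONArrowType() when db-dtypes >= 1.4.0 is
    # available, otherwise plain pyarrow.string().
    json_type = _pyarrow_helpers.bq_to_arrow_scalars("JSON")()
    print(json_type)

    # Unknown type names still return None rather than raising.
    assert _pyarrow_helpers.bq_to_arrow_scalars("UNKNOWN_TYPE") is None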

google/cloud/bigquery/table.py (+3 -1)

@@ -3533,7 +3533,9 @@ def _row_iterator_page_columns(schema, response):
 
     def get_column_data(field_index, field):
        for row in rows:
-            yield _helpers.CELL_DATA_PARSER.to_py(row["f"][field_index]["v"], field)
+            yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py(
+                row["f"][field_index]["v"], field
+            )
 
     for field_index, field in enumerate(schema):
         columns.append(get_column_data(field_index, field))
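
From the user's perspective, this change matters on the REST download path used by to_dataframe and to_arrow. An end-to-end sketch (hypothetical query; requires credentials, a project, and the pandas/pyarrow/db-dtypes extras):

    from google.cloud import bigquery

    client = bigquery.Client()
    df = client.query(
        """SELECT JSON '{"key": "value"}' AS payload"""
    ).to_dataframe(create_bqstorage_client=False)  # force the REST path

    # With this fix the JSON column arrives as JSON strings and the
    # "Unable to determine type" warning is no longer emitted.
    print(df["payload"].iloc[0])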
New test file (+71)

@@ -0,0 +1,71 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import google.cloud.bigquery.schema
+
+
+def create_field(mode="NULLABLE", type_="IGNORED", name="test_field", **kwargs):
+    return google.cloud.bigquery.schema.SchemaField(name, type_, mode=mode, **kwargs)
+
+
+@pytest.fixture
+def mut():
+    from google.cloud.bigquery import _helpers
+
+    return _helpers
+
+
+@pytest.fixture
+def object_under_test(mut):
+    return mut.DATA_FRAME_CELL_DATA_PARSER
+
+
+def test_json_to_py_doesnt_parse_json(object_under_test):
+    coerced = object_under_test.json_to_py('{"key":"value"}', create_field())
+    assert coerced == '{"key":"value"}'
+
+
+def test_json_to_py_repeated_doesnt_parse_json(object_under_test):
+    coerced = object_under_test.json_to_py('{"key":"value"}', create_field("REPEATED"))
+    assert coerced == '{"key":"value"}'
+
+
+def test_record_to_py_doesnt_parse_json(object_under_test):
+    subfield = create_field(type_="JSON", name="json")
+    field = create_field(fields=[subfield])
+    value = {"f": [{"v": '{"key":"value"}'}]}
+    coerced = object_under_test.record_to_py(value, field)
+    assert coerced == {"json": '{"key":"value"}'}
+
+
+def test_record_to_py_doesnt_parse_repeated_json(object_under_test):
+    subfield = create_field("REPEATED", "JSON", name="json")
+    field = create_field("REQUIRED", fields=[subfield])
+    value = {
+        "f": [
+            {
+                "v": [
+                    {"v": '{"key":"value0"}'},
+                    {"v": '{"key":"value1"}'},
+                    {"v": '{"key":"value2"}'},
+                ]
+            }
+        ]
+    }
+    coerced = object_under_test.record_to_py(value, field)
+    assert coerced == {
+        "json": ['{"key":"value0"}', '{"key":"value1"}', '{"key":"value2"}']
+    }

tests/unit/test__pyarrow_helpers.py (+6 -2)

@@ -27,8 +27,12 @@ def module_under_test():
 
 def test_bq_to_arrow_scalars(module_under_test):
     assert (
-        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
-        == module_under_test.pyarrow_bignumeric
+        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")()
+        == module_under_test.pyarrow_bignumeric()
+    )
+    assert (
+        module_under_test.bq_to_arrow_scalars("JSON")()
+        == module_under_test.json_arrow_type
     )
     assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None

tests/unit/test_table_arrow.py (+46 -20)

@@ -28,6 +28,7 @@ def test_to_arrow_with_jobs_query_response():
             "fields": [
                 {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                 {"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
+                {"name": "json", "type": "JSON", "mode": "NULLABLE"},
             ]
         },
         "jobReference": {

@@ -37,15 +38,21 @@
         },
         "totalRows": "9",
         "rows": [
-            {"f": [{"v": "Tiarra"}, {"v": "6"}]},
-            {"f": [{"v": "Timothy"}, {"v": "325"}]},
-            {"f": [{"v": "Tina"}, {"v": "26"}]},
-            {"f": [{"v": "Tierra"}, {"v": "10"}]},
-            {"f": [{"v": "Tia"}, {"v": "17"}]},
-            {"f": [{"v": "Tiara"}, {"v": "22"}]},
-            {"f": [{"v": "Tiana"}, {"v": "6"}]},
-            {"f": [{"v": "Tiffany"}, {"v": "229"}]},
-            {"f": [{"v": "Tiffani"}, {"v": "8"}]},
+            {"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
+            {"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
+            {"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
+            {
+                "f": [
+                    {"v": "Tierra"},
+                    {"v": "10"},
+                    {"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
+                ]
+            },
+            {"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
+            {"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
+            {"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
+            {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
+            {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
         ],
         "totalBytesProcessed": "154775150",
         "jobComplete": True,

@@ -65,7 +72,7 @@ def test_to_arrow_with_jobs_query_response():
     )
     records = rows.to_arrow()
 
-    assert records.column_names == ["name", "number"]
+    assert records.column_names == ["name", "number", "json"]
     assert records["name"].to_pylist() == [
         "Tiarra",
         "Timothy",

@@ -78,6 +85,17 @@ def test_to_arrow_with_jobs_query_response():
         "Tiffani",
     ]
     assert records["number"].to_pylist() == [6, 325, 26, 10, 17, 22, 6, 229, 8]
+    assert records["json"].to_pylist() == [
+        "123",
+        '{"key":"value"}',
+        "[1,2,3]",
+        '{"aKey": {"bKey": {"cKey": -123}}}',
+        None,
+        '"some-json-string"',
+        '{"nullKey":null}',
+        '""',
+        "[]",
+    ]
 
 
 def test_to_arrow_with_jobs_query_response_and_max_results():

@@ -87,6 +105,7 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
             "fields": [
                 {"name": "name", "type": "STRING", "mode": "NULLABLE"},
                 {"name": "number", "type": "INTEGER", "mode": "NULLABLE"},
+                {"name": "json", "type": "JSON", "mode": "NULLABLE"},
             ]
         },
         "jobReference": {

@@ -96,15 +115,21 @@
         },
         "totalRows": "9",
         "rows": [
-            {"f": [{"v": "Tiarra"}, {"v": "6"}]},
-            {"f": [{"v": "Timothy"}, {"v": "325"}]},
-            {"f": [{"v": "Tina"}, {"v": "26"}]},
-            {"f": [{"v": "Tierra"}, {"v": "10"}]},
-            {"f": [{"v": "Tia"}, {"v": "17"}]},
-            {"f": [{"v": "Tiara"}, {"v": "22"}]},
-            {"f": [{"v": "Tiana"}, {"v": "6"}]},
-            {"f": [{"v": "Tiffany"}, {"v": "229"}]},
-            {"f": [{"v": "Tiffani"}, {"v": "8"}]},
+            {"f": [{"v": "Tiarra"}, {"v": "6"}, {"v": "123"}]},
+            {"f": [{"v": "Timothy"}, {"v": "325"}, {"v": '{"key":"value"}'}]},
+            {"f": [{"v": "Tina"}, {"v": "26"}, {"v": "[1,2,3]"}]},
+            {
+                "f": [
+                    {"v": "Tierra"},
+                    {"v": "10"},
+                    {"v": '{"aKey": {"bKey": {"cKey": -123}}}'},
+                ]
+            },
+            {"f": [{"v": "Tia"}, {"v": "17"}, {"v": None}]},
+            {"f": [{"v": "Tiara"}, {"v": "22"}, {"v": '"some-json-string"'}]},
+            {"f": [{"v": "Tiana"}, {"v": "6"}, {"v": '{"nullKey":null}'}]},
+            {"f": [{"v": "Tiffany"}, {"v": "229"}, {"v": '""'}]},
+            {"f": [{"v": "Tiffani"}, {"v": "8"}, {"v": "[]"}]},
         ],
         "totalBytesProcessed": "154775150",
         "jobComplete": True,

@@ -125,10 +150,11 @@ def test_to_arrow_with_jobs_query_response_and_max_results():
     )
     records = rows.to_arrow()
 
-    assert records.column_names == ["name", "number"]
+    assert records.column_names == ["name", "number", "json"]
     assert records["name"].to_pylist() == [
         "Tiarra",
         "Timothy",
         "Tina",
     ]
     assert records["number"].to_pylist() == [6, 325, 26]
+    assert records["json"].to_pylist() == ["123", '{"key":"value"}', "[1,2,3]"]
