
Commit 7603bd7

tswast, gcf-owl-bot[bot], and chalmerlowe authored
deps: use pandas-gbq to determine schema in load_table_from_dataframe (#2095)
* feat: use pandas-gbq to determine schema in `load_table_from_dataframe`
* 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* fix some unit tests
* 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* bump minimum pandas-gbq to 0.26.1
* 🦉 Updates from OwlBot post-processor. See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* drop pandas-gbq from python 3.7 extras
* relax warning message text assertion
* use consistent time zone presence/absence in time datetime system test
* Update google/cloud/bigquery/_pandas_helpers.py
* Update google/cloud/bigquery/_pandas_helpers.py
  Co-authored-by: Chalmer Lowe <[email protected]>
* remove pandas-gbq from at least 1 unit test and system test session

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Chalmer Lowe <[email protected]>
1 parent b03a2af commit 7603bd7
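
For orientation, here is a minimal sketch of the user-facing path this commit changes. The project, dataset, and table IDs below are placeholders and application default credentials are assumed. When no explicit schema is passed, `load_table_from_dataframe` delegates column-type detection to pandas-gbq if it is installed, and otherwise falls back to the older in-library detection with a `FutureWarning`.

import datetime

import pandas
from google.cloud import bigquery

client = bigquery.Client()  # assumes application default credentials

df = pandas.DataFrame(
    {
        "name": ["abc", "def"],
        "created_at": [
            datetime.datetime(2020, 1, 8, tzinfo=datetime.timezone.utc),
            datetime.datetime(2020, 1, 9, tzinfo=datetime.timezone.utc),
        ],
    }
)

# No schema given in the job config: with pandas-gbq >= 0.26.1 installed, the
# column types are determined via pandas_gbq.schema.pandas_to_bigquery; without
# it, the legacy in-library detection still runs and a FutureWarning is emitted.
job = client.load_table_from_dataframe(df, "my-project.my_dataset.my_table")  # placeholder table ID
job.result()  # wait for the load job to complete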

File tree

8 files changed: +147 -22 lines


google/cloud/bigquery/_pandas_helpers.py
+34 -1

@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Shared helper functions for connecting BigQuery and pandas."""
+"""Shared helper functions for connecting BigQuery and pandas.
+
+NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
+instead. See: go/pandas-gbq-and-bigframes-redundancy and
+https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pandas_to_bigquery.py
+"""
 
 import concurrent.futures
 from datetime import datetime
@@ -40,6 +45,16 @@
 else:
     import numpy
 
+
+try:
+    import pandas_gbq.schema.pandas_to_bigquery  # type: ignore
+
+    pandas_gbq_import_exception = None
+except ImportError as exc:
+    pandas_gbq = None
+    pandas_gbq_import_exception = exc
+
+
 try:
     import db_dtypes  # type: ignore
 
@@ -445,6 +460,10 @@ def _first_array_valid(series):
 def dataframe_to_bq_schema(dataframe, bq_schema):
     """Convert a pandas DataFrame schema to a BigQuery schema.
 
+    DEPRECATED: Use
+    pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(),
+    instead. See: go/pandas-gbq-and-bigframes-redundancy.
+
     Args:
         dataframe (pandas.DataFrame):
             DataFrame for which the client determines the BigQuery schema.
@@ -460,6 +479,20 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
            The automatically determined schema. Returns None if the type of
            any column cannot be determined.
     """
+    if pandas_gbq is None:
+        warnings.warn(
+            "Loading pandas DataFrame into BigQuery will require pandas-gbq "
+            "package version 0.26.1 or greater in the future. "
+            f"Tried to import pandas-gbq and got: {pandas_gbq_import_exception}",
+            category=FutureWarning,
+        )
+    else:
+        return pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(
+            dataframe,
+            override_bigquery_fields=bq_schema,
+            index=True,
+        )
+
     if bq_schema:
         bq_schema = schema._to_schema_fields(bq_schema)
         bq_schema_index = {field.name: field for field in bq_schema}
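
To make the new branch above concrete, here is a small sketch that calls the helper directly. `_pandas_helpers` is a private, now-deprecated module, so this is only an illustration of the control flow, not a supported API.

import pandas

from google.cloud.bigquery import _pandas_helpers

df = pandas.DataFrame({"name": ["hello", "world"], "score": [1.5, 2.5]})

# With pandas-gbq installed, this returns fields computed by
# pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields();
# otherwise it emits a FutureWarning and falls back to the legacy detection.
fields = _pandas_helpers.dataframe_to_bq_schema(df, [])
print(fields)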

google/cloud/bigquery/_pyarrow_helpers.py
+6 -1

@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Shared helper functions for connecting BigQuery and pyarrow."""
+"""Shared helper functions for connecting BigQuery and pyarrow.
+
+NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
+instead. See: go/pandas-gbq-and-bigframes-redundancy and
+https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
+"""
 
 from typing import Any

noxfile.py
+15

@@ -110,6 +110,14 @@ def default(session, install_extras=True):
     else:
         install_target = "."
     session.install("-e", install_target, "-c", constraints_path)
+
+    # Test with some broken "extras" in case the user didn't install the extra
+    # directly. For example, pandas-gbq is recommended for pandas features, but
+    # we want to test that we fallback to the previous behavior. For context,
+    # see internal document go/pandas-gbq-and-bigframes-redundancy.
+    if session.python == UNIT_TEST_PYTHON_VERSIONS[0]:
+        session.run("python", "-m", "pip", "uninstall", "pandas-gbq", "-y")
+
     session.run("python", "-m", "pip", "freeze")
 
     # Run py.test against the unit tests.
@@ -228,6 +236,13 @@ def system(session):
         extras = "[all]"
     session.install("-e", f".{extras}", "-c", constraints_path)
 
+    # Test with some broken "extras" in case the user didn't install the extra
+    # directly. For example, pandas-gbq is recommended for pandas features, but
+    # we want to test that we fallback to the previous behavior. For context,
+    # see internal document go/pandas-gbq-and-bigframes-redundancy.
+    if session.python == SYSTEM_TEST_PYTHON_VERSIONS[0]:
+        session.run("python", "-m", "pip", "uninstall", "pandas-gbq", "-y")
+
     # print versions of all dependencies
     session.run("python", "-m", "pip", "freeze")

pyproject.toml
+3

@@ -74,6 +74,9 @@ bqstorage = [
 ]
 pandas = [
     "pandas >= 1.1.0",
+    "pandas-gbq >= 0.26.1; python_version >= '3.8'",
+    "grpcio >= 1.47.0, < 2.0dev",
+    "grpcio >= 1.49.1, < 2.0dev; python_version >= '3.11'",
     "pyarrow >= 3.0.0",
     "db-dtypes >= 0.3.0, < 2.0.0dev",
     "importlib_metadata >= 1.0.0; python_version < '3.8'",

testing/constraints-3.8.txt
+9

@@ -1,2 +1,11 @@
 grpcio==1.47.0
 pandas==1.2.0
+
+# This constraints file is used to check that lower bounds
+# are correct in setup.py
+#
+# Pin the version to the lower bound.
+#
+# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
+# Then this file should have foo==1.14.0
+pandas-gbq==0.26.1

tests/system/test_pandas.py
+1 -1

@@ -1259,7 +1259,7 @@ def test_upload_time_and_datetime_56(bigquery_client, dataset_id):
     df = pandas.DataFrame(
         dict(
             dt=[
-                datetime.datetime(2020, 1, 8, 8, 0, 0),
+                datetime.datetime(2020, 1, 8, 8, 0, 0, tzinfo=datetime.timezone.utc),
                 datetime.datetime(
                     2020,
                     1,
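
The likely reason for this one-line change (my reading of the commit message's "use consistent time zone presence/absence", not something stated explicitly): mixing timezone-aware and naive values in the same column makes the inferred BigQuery type ambiguous, since tz-aware datetimes generally map to TIMESTAMP while naive datetimes map to DATETIME. Keeping the column uniformly tz-aware keeps detection deterministic.

import datetime

import pandas

# Same wall-clock time, once tz-aware and once naive.
aware = datetime.datetime(2020, 1, 8, 8, 0, 0, tzinfo=datetime.timezone.utc)
naive = datetime.datetime(2020, 1, 8, 8, 0, 0)

# A uniformly tz-aware column gets a timezone-aware dtype, which schema
# detection can map to TIMESTAMP without guessing.
df = pandas.DataFrame({"dt": [aware, aware.replace(hour=9)]})
print(df.dtypes)  # dt    datetime64[ns, UTC]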

tests/unit/test__pandas_helpers.py
+54 -11

@@ -35,6 +35,11 @@
 except ImportError:
     pandas = None
 
+try:
+    import pandas_gbq.schema.pandas_to_bigquery
+except ImportError:
+    pandas_gbq = None
+
 try:
     import geopandas
 except ImportError:
@@ -1281,7 +1286,21 @@ def test_dataframe_to_parquet_compression_method(module_under_test):
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
-def test_dataframe_to_bq_schema_w_named_index(module_under_test):
+@pytest.mark.skipif(pandas_gbq is None, reason="Requires `pandas-gbq`")
+def test_dataframe_to_bq_schema_returns_schema_with_pandas_gbq(
+    module_under_test, monkeypatch
+):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+    dataframe = pandas.DataFrame({"field00": ["foo", "bar"]})
+    got = module_under_test.dataframe_to_bq_schema(dataframe, [])
+    # Don't assert beyond this, since pandas-gbq is now source of truth.
+    assert got is not None
+
+
+@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
+def test_dataframe_to_bq_schema_w_named_index(module_under_test, monkeypatch):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     df_data = collections.OrderedDict(
         [
             ("str_column", ["hello", "world"]),
@@ -1292,7 +1311,8 @@ def test_dataframe_to_bq_schema_w_named_index(module_under_test):
     index = pandas.Index(["a", "b"], name="str_index")
     dataframe = pandas.DataFrame(df_data, index=index)
 
-    returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
+    with pytest.warns(FutureWarning, match="pandas-gbq"):
+        returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
 
     expected_schema = (
         schema.SchemaField("str_index", "STRING", "NULLABLE"),
@@ -1304,7 +1324,9 @@ def test_dataframe_to_bq_schema_w_named_index(module_under_test):
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
-def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
+def test_dataframe_to_bq_schema_w_multiindex(module_under_test, monkeypatch):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     df_data = collections.OrderedDict(
         [
             ("str_column", ["hello", "world"]),
@@ -1321,7 +1343,8 @@ def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
     )
     dataframe = pandas.DataFrame(df_data, index=index)
 
-    returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
+    with pytest.warns(FutureWarning, match="pandas-gbq"):
+        returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
 
     expected_schema = (
         schema.SchemaField("str_index", "STRING", "NULLABLE"),
@@ -1335,7 +1358,9 @@ def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
-def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
+def test_dataframe_to_bq_schema_w_bq_schema(module_under_test, monkeypatch):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     df_data = collections.OrderedDict(
         [
             ("str_column", ["hello", "world"]),
@@ -1350,7 +1375,10 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
         {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"},
     ]
 
-    returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, dict_schema)
+    with pytest.warns(FutureWarning, match="pandas-gbq"):
+        returned_schema = module_under_test.dataframe_to_bq_schema(
+            dataframe, dict_schema
+        )
 
     expected_schema = (
         schema.SchemaField("str_column", "STRING", "NULLABLE"),
@@ -1361,7 +1389,11 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
 
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
-def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test):
+def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(
+    module_under_test, monkeypatch
+):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     dataframe = pandas.DataFrame(
         data=[
             {"id": 10, "status": "FOO", "execution_date": datetime.date(2019, 5, 10)},
@@ -1389,7 +1421,11 @@ def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test):
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
-def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test):
+def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(
+    module_under_test, monkeypatch
+):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     dataframe = pandas.DataFrame(
         data=[
             {"id": 10, "status": "FOO", "created_at": datetime.date(2019, 5, 10)},
@@ -1419,7 +1455,9 @@ def test_dataframe_to_bq_schema_fallback_needed_w_pyarrow(module_under_test):
 
 @pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
 @pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
-def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test):
+def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test, monkeypatch):
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     dataframe = pandas.DataFrame(
         data=[
             {"struct_field": {"one": 2}, "status": "FOO"},
@@ -1443,9 +1481,11 @@ def test_dataframe_to_bq_schema_pyarrow_fallback_fails(module_under_test):
 
 
 @pytest.mark.skipif(geopandas is None, reason="Requires `geopandas`")
-def test_dataframe_to_bq_schema_geography(module_under_test):
+def test_dataframe_to_bq_schema_geography(module_under_test, monkeypatch):
     from shapely import wkt
 
+    monkeypatch.setattr(module_under_test, "pandas_gbq", None)
+
     df = geopandas.GeoDataFrame(
         pandas.DataFrame(
             dict(
@@ -1456,7 +1496,10 @@ def test_dataframe_to_bq_schema_geography(module_under_test):
         ),
         geometry="geo1",
     )
-    bq_schema = module_under_test.dataframe_to_bq_schema(df, [])
+
+    with pytest.warns(FutureWarning, match="pandas-gbq"):
+        bq_schema = module_under_test.dataframe_to_bq_schema(df, [])
+
     assert bq_schema == (
         schema.SchemaField("name", "STRING"),
         schema.SchemaField("geo1", "GEOGRAPHY"),

tests/unit/test_client.py
+25 -8

@@ -8391,8 +8391,12 @@ def test_load_table_from_dataframe_w_automatic_schema_detection_fails(self):
             autospec=True,
             side_effect=google.api_core.exceptions.NotFound("Table not found"),
         )
+        pandas_gbq_patch = mock.patch(
+            "google.cloud.bigquery._pandas_helpers.pandas_gbq",
+            new=None,
+        )
 
-        with load_patch as load_table_from_file, get_table_patch:
+        with load_patch as load_table_from_file, get_table_patch, pandas_gbq_patch:
             with warnings.catch_warnings(record=True) as warned:
                 client.load_table_from_dataframe(
                     dataframe, self.TABLE_REF, location=self.LOCATION
@@ -8448,7 +8452,6 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self):
        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
        )
-
        get_table_patch = mock.patch(
            "google.cloud.bigquery.client.Client.get_table",
            autospec=True,
@@ -8460,6 +8463,7 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self):
                ]
            ),
        )
+
        with load_patch as load_table_from_file, get_table_patch:
            client.load_table_from_dataframe(
                dataframe, self.TABLE_REF, location=self.LOCATION
@@ -8580,10 +8584,10 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se
 
        client = self._make_client()
        dataframe = pandas.DataFrame({"x": [1, 2, None, 4]}, dtype="Int64")
+
        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
        )
-
        get_table_patch = mock.patch(
            "google.cloud.bigquery.client.Client.get_table",
            autospec=True,
@@ -8612,8 +8616,11 @@
 
        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
        assert sent_config.source_format == job.SourceFormat.PARQUET
-        assert tuple(sent_config.schema) == (
-            SchemaField("x", "INT64", "NULLABLE", None),
+        assert (
+            # Accept either the GoogleSQL or legacy SQL type name from pandas-gbq.
+            tuple(sent_config.schema) == (SchemaField("x", "INT64", "NULLABLE", None),)
+            or tuple(sent_config.schema)
+            == (SchemaField("x", "INTEGER", "NULLABLE", None),)
        )
 
    def test_load_table_from_dataframe_struct_fields(self):
@@ -8759,14 +8766,22 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self):
            data=records, columns=["float_column", "array_column"]
        )
 
-        expected_schema = [
+        expected_schema_googlesql = [
            SchemaField("float_column", "FLOAT"),
            SchemaField(
                "array_column",
                "INT64",
                mode="REPEATED",
            ),
        ]
+        expected_schema_legacy_sql = [
+            SchemaField("float_column", "FLOAT"),
+            SchemaField(
+                "array_column",
+                "INTEGER",
+                mode="REPEATED",
+            ),
+        ]
 
        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
@@ -8802,7 +8817,10 @@ def test_load_table_from_dataframe_array_fields_w_auto_schema(self):
 
        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
        assert sent_config.source_format == job.SourceFormat.PARQUET
-        assert sent_config.schema == expected_schema
+        assert (
+            sent_config.schema == expected_schema_googlesql
+            or sent_config.schema == expected_schema_legacy_sql
+        )
 
    def test_load_table_from_dataframe_w_partial_schema(self):
        pandas = pytest.importorskip("pandas")
@@ -8922,7 +8940,6 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self):
 
        load_table_from_file.assert_not_called()
        message = str(exc_context.value)
-        assert "bq_schema contains fields not present in dataframe" in message
        assert "unknown_col" in message
 
    def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self):
