Skip to content

Commit f05dc69

Browse files
authored
fix: load_table_from_dataframe now assumes there may be local null values (#1735)
Even if the remote schema is REQUIRED. [PR-template checklist follows:] Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #1692 🦕
1 parent 5573579 commit f05dc69

File tree

3 files changed

+81
-23
lines changed

3 files changed

+81
-23
lines changed

google/cloud/bigquery/_pandas_helpers.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,18 @@ def bq_to_arrow_field(bq_field, array_type=None):
178178
if arrow_type is not None:
179179
if array_type is not None:
180180
arrow_type = array_type # For GEOGRAPHY, at least initially
181-
is_nullable = bq_field.mode.upper() == "NULLABLE"
182181
metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
183182
bq_field.field_type.upper() if bq_field.field_type else ""
184183
)
185184
return pyarrow.field(
186-
bq_field.name, arrow_type, nullable=is_nullable, metadata=metadata
185+
bq_field.name,
186+
arrow_type,
187+
# Even if the remote schema is REQUIRED, there's a chance there's
188+
# local NULL values. Arrow will gladly interpret these NULL values
189+
# as non-NULL and give you an arbitrary value. See:
190+
# https://github.com/googleapis/python-bigquery/issues/1692
191+
nullable=True,
192+
metadata=metadata,
187193
)
188194

189195
warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name))

tests/system/test_pandas.py

+40-7
Original file line numberDiff line numberDiff line change
@@ -428,8 +428,7 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id):
428428

429429

430430
def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
431-
"""Test that a DataFrame with required columns can be uploaded if a
432-
BigQuery schema is specified.
431+
"""Test that a DataFrame can be uploaded to a table with required columns.
433432
434433
See: https://github.com/googleapis/google-cloud-python/issues/8093
435434
"""
@@ -440,7 +439,6 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
440439

441440
records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}]
442441
dataframe = pandas.DataFrame(records, columns=["name", "age"])
443-
job_config = bigquery.LoadJobConfig(schema=table_schema)
444442
table_id = "{}.{}.load_table_from_dataframe_w_required".format(
445443
bigquery_client.project, dataset_id
446444
)
@@ -451,15 +449,50 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
451449
bigquery.Table(table_id, schema=table_schema)
452450
)
453451

454-
job_config = bigquery.LoadJobConfig(schema=table_schema)
455-
load_job = bigquery_client.load_table_from_dataframe(
456-
dataframe, table_id, job_config=job_config
457-
)
452+
load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
458453
load_job.result()
459454

460455
table = bigquery_client.get_table(table)
461456
assert tuple(table.schema) == table_schema
462457
assert table.num_rows == 2
458+
for field in table.schema:
459+
assert field.mode == "REQUIRED"
460+
461+
def test_load_table_from_dataframe_w_required_but_local_nulls_fails(
    bigquery_client, dataset_id
):
    """Verify the server rejects a DataFrame containing nulls when the
    destination table's columns are REQUIRED.

    See: https://github.com/googleapis/python-bigquery/issues/1692
    """
    table_schema = (
        bigquery.SchemaField("name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    )

    # One row (the third) carries nulls in both columns; the surrounding
    # valid rows ensure the failure is about the nulls, not the schema.
    names = ["Chip", "Dale", None, "Alvin"]
    ages = [2, 3, None, 4]
    df = pandas.DataFrame({"name": names, "age": ages}, columns=["name", "age"])

    table_id = (
        "{}.{}.load_table_from_dataframe_w_required_but_local_nulls_fails".format(
            bigquery_client.project, dataset_id
        )
    )

    # Create the table before loading so that schema mismatch errors are
    # identified.
    create_table = helpers.retry_403(bigquery_client.create_table)
    create_table(bigquery.Table(table_id, schema=table_schema))

    # The load itself must fail server-side with a BadRequest mentioning
    # the null values.
    with pytest.raises(google.api_core.exceptions.BadRequest, match="null"):
        bigquery_client.load_table_from_dataframe(df, table_id).result()
463496

464497

465498
def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id):

tests/unit/test__pandas_helpers.py

+33-14
Original file line numberDiff line numberDiff line change
@@ -1017,30 +1017,41 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
10171017
)
10181018

10191019
data = {
1020-
"field01": ["hello", "world"],
1021-
"field02": [b"abd", b"efg"],
1022-
"field03": [1, 2],
1023-
"field04": [3, 4],
1024-
"field05": [1.25, 9.75],
1025-
"field06": [-1.75, -3.5],
1026-
"field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")],
1020+
"field01": ["hello", None, "world"],
1021+
"field02": [b"abd", b"efg", b"hij"],
1022+
"field03": [1, 2, 3],
1023+
"field04": [4, None, 5],
1024+
"field05": [1.25, 0.0, 9.75],
1025+
"field06": [-1.75, None, -3.5],
1026+
"field07": [
1027+
decimal.Decimal("1.2345"),
1028+
decimal.Decimal("6.7891"),
1029+
-decimal.Decimal("10.111213"),
1030+
],
10271031
"field08": [
10281032
decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)),
1033+
None,
10291034
decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)),
10301035
],
1031-
"field09": [True, False],
1032-
"field10": [False, True],
1036+
"field09": [True, False, True],
1037+
"field10": [False, True, None],
10331038
"field11": [
10341039
datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
10351040
datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=datetime.timezone.utc),
1041+
datetime.datetime(2022, 7, 14, 23, 59, 59, tzinfo=datetime.timezone.utc),
10361042
],
1037-
"field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)],
1038-
"field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)],
1043+
"field12": [datetime.date(9999, 12, 31), None, datetime.date(1970, 1, 1)],
1044+
"field13": [datetime.time(23, 59, 59, 999999), None, datetime.time(12, 0, 0)],
10391045
"field14": [
10401046
datetime.datetime(1970, 1, 1, 0, 0, 0),
1047+
None,
10411048
datetime.datetime(2012, 12, 21, 9, 7, 42),
10421049
],
1043-
"field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"],
1050+
"field15": [
1051+
None,
1052+
"POINT(30 10)",
1053+
"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
1054+
],
10441055
}
10451056
dataframe = pandas.DataFrame(data)
10461057

@@ -1049,7 +1060,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
10491060

10501061
assert len(arrow_schema) == len(bq_schema)
10511062
for arrow_field in arrow_schema:
1052-
assert not arrow_field.nullable
1063+
# Even if the remote schema is REQUIRED, there's a chance there's
1064+
# local NULL values. Arrow will gladly interpret these NULL values
1065+
# as non-NULL and give you an arbitrary value. See:
1066+
# https://github.com/googleapis/python-bigquery/issues/1692
1067+
assert arrow_field.nullable
10531068

10541069

10551070
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1101,7 +1116,11 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
11011116
arrow_schema = arrow_table.schema
11021117

11031118
expected_fields = [
1104-
pyarrow.field("field01", "string", nullable=False),
1119+
# Even if the remote schema is REQUIRED, there's a chance there's
1120+
# local NULL values. Arrow will gladly interpret these NULL values
1121+
# as non-NULL and give you an arbitrary value. See:
1122+
# https://github.com/googleapis/python-bigquery/issues/1692
1123+
pyarrow.field("field01", "string", nullable=True),
11051124
pyarrow.field("field02", "bool", nullable=True),
11061125
]
11071126
assert list(arrow_schema) == expected_fields

0 commit comments

Comments (0)