Skip to content

Commit f05dc69

Browse files
authored
fix: load_table_from_dataframe now assumes there may be local null values (#1735)
Even if the remote schema is REQUIRED. [PR-template checklist follows:] Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #1692 🦕
1 parent 5573579 commit f05dc69

File tree

3 files changed

+81
-23
lines changed

3 files changed

+81
-23
lines changed

google/cloud/bigquery/_pandas_helpers.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -178,12 +178,18 @@ def bq_to_arrow_field(bq_field, array_type=None):
178178
if arrow_type is not None:
179179
if array_type is not None:
180180
arrow_type = array_type # For GEOGRAPHY, at least initially
181-
is_nullable = bq_field.mode.upper() == "NULLABLE"
182181
metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
183182
bq_field.field_type.upper() if bq_field.field_type else ""
184183
)
185184
return pyarrow.field(
186-
bq_field.name, arrow_type, nullable=is_nullable, metadata=metadata
185+
bq_field.name,
186+
arrow_type,
187+
# Even if the remote schema is REQUIRED, there's a chance there's
188+
# local NULL values. Arrow will gladly interpret these NULL values
189+
# as non-NULL and give you an arbitrary value. See:
190+
# https://github.com/googleapis/python-bigquery/issues/1692
191+
nullable=True,
192+
metadata=metadata,
187193
)
188194

189195
warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name))

tests/system/test_pandas.py

+40-7
Original file line numberDiff line numberDiff line change
@@ -428,8 +428,7 @@ def test_load_table_from_dataframe_w_nulls(bigquery_client, dataset_id):
428428

429429

430430
def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
431-
"""Test that a DataFrame with required columns can be uploaded if a
432-
BigQuery schema is specified.
431+
"""Test that a DataFrame can be uploaded to a table with required columns.
433432
434433
See: https://github.com/googleapis/google-cloud-python/issues/8093
435434
"""
@@ -440,7 +439,6 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
440439

441440
records = [{"name": "Chip", "age": 2}, {"name": "Dale", "age": 3}]
442441
dataframe = pandas.DataFrame(records, columns=["name", "age"])
443-
job_config = bigquery.LoadJobConfig(schema=table_schema)
444442
table_id = "{}.{}.load_table_from_dataframe_w_required".format(
445443
bigquery_client.project, dataset_id
446444
)
@@ -451,15 +449,50 @@ def test_load_table_from_dataframe_w_required(bigquery_client, dataset_id):
451449
bigquery.Table(table_id, schema=table_schema)
452450
)
453451

454-
job_config = bigquery.LoadJobConfig(schema=table_schema)
455-
load_job = bigquery_client.load_table_from_dataframe(
456-
dataframe, table_id, job_config=job_config
457-
)
452+
load_job = bigquery_client.load_table_from_dataframe(dataframe, table_id)
458453
load_job.result()
459454

460455
table = bigquery_client.get_table(table)
461456
assert tuple(table.schema) == table_schema
462457
assert table.num_rows == 2
458+
for field in table.schema:
459+
assert field.mode == "REQUIRED"
460+
461+
def test_load_table_from_dataframe_w_required_but_local_nulls_fails(
    bigquery_client, dataset_id
):
    """Verify the server rejects a DataFrame containing nulls when the
    destination table's columns are REQUIRED.

    See: https://github.com/googleapis/python-bigquery/issues/1692
    """
    table_schema = (
        bigquery.SchemaField("name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    )

    # One row (the third) carries nulls in both columns; the surrounding
    # valid rows ensure the failure is about the nulls, not the schema.
    names = ["Chip", "Dale", None, "Alvin"]
    ages = [2, 3, None, 4]
    df = pandas.DataFrame({"name": names, "age": ages}, columns=["name", "age"])

    table_id = (
        "{}.{}.load_table_from_dataframe_w_required_but_local_nulls_fails".format(
            bigquery_client.project, dataset_id
        )
    )

    # Create the table before loading so that schema mismatch errors are
    # identified.
    create_table = helpers.retry_403(bigquery_client.create_table)
    create_table(bigquery.Table(table_id, schema=table_schema))

    # The load itself must fail server-side with a BadRequest mentioning
    # the null values.
    with pytest.raises(google.api_core.exceptions.BadRequest, match="null"):
        bigquery_client.load_table_from_dataframe(df, table_id).result()
463496

464497

465498
def test_load_table_from_dataframe_w_explicit_schema(bigquery_client, dataset_id):

tests/unit/test__pandas_helpers.py

+33-14
Original file line numberDiff line numberDiff line change
@@ -1017,30 +1017,41 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
10171017
)
10181018

10191019
data = {
1020-
"field01": ["hello", "world"],
1021-
"field02": [b"abd", b"efg"],
1022-
"field03": [1, 2],
1023-
"field04": [3, 4],
1024-
"field05": [1.25, 9.75],
1025-
"field06": [-1.75, -3.5],
1026-
"field07": [decimal.Decimal("1.2345"), decimal.Decimal("6.7891")],
1020+
"field01": ["hello", None, "world"],
1021+
"field02": [b"abd", b"efg", b"hij"],
1022+
"field03": [1, 2, 3],
1023+
"field04": [4, None, 5],
1024+
"field05": [1.25, 0.0, 9.75],
1025+
"field06": [-1.75, None, -3.5],
1026+
"field07": [
1027+
decimal.Decimal("1.2345"),
1028+
decimal.Decimal("6.7891"),
1029+
-decimal.Decimal("10.111213"),
1030+
],
10271031
"field08": [
10281032
decimal.Decimal("-{d38}.{d38}".format(d38="9" * 38)),
1033+
None,
10291034
decimal.Decimal("{d38}.{d38}".format(d38="9" * 38)),
10301035
],
1031-
"field09": [True, False],
1032-
"field10": [False, True],
1036+
"field09": [True, False, True],
1037+
"field10": [False, True, None],
10331038
"field11": [
10341039
datetime.datetime(1970, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
10351040
datetime.datetime(2012, 12, 21, 9, 7, 42, tzinfo=datetime.timezone.utc),
1041+
datetime.datetime(2022, 7, 14, 23, 59, 59, tzinfo=datetime.timezone.utc),
10361042
],
1037-
"field12": [datetime.date(9999, 12, 31), datetime.date(1970, 1, 1)],
1038-
"field13": [datetime.time(23, 59, 59, 999999), datetime.time(12, 0, 0)],
1043+
"field12": [datetime.date(9999, 12, 31), None, datetime.date(1970, 1, 1)],
1044+
"field13": [datetime.time(23, 59, 59, 999999), None, datetime.time(12, 0, 0)],
10391045
"field14": [
10401046
datetime.datetime(1970, 1, 1, 0, 0, 0),
1047+
None,
10411048
datetime.datetime(2012, 12, 21, 9, 7, 42),
10421049
],
1043-
"field15": ["POINT(30 10)", "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"],
1050+
"field15": [
1051+
None,
1052+
"POINT(30 10)",
1053+
"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
1054+
],
10441055
}
10451056
dataframe = pandas.DataFrame(data)
10461057

@@ -1049,7 +1060,11 @@ def test_dataframe_to_arrow_with_required_fields(module_under_test):
10491060

10501061
assert len(arrow_schema) == len(bq_schema)
10511062
for arrow_field in arrow_schema:
1052-
assert not arrow_field.nullable
1063+
# Even if the remote schema is REQUIRED, there's a chance there's
1064+
# local NULL values. Arrow will gladly interpret these NULL values
1065+
# as non-NULL and give you an arbitrary value. See:
1066+
# https://github.com/googleapis/python-bigquery/issues/1692
1067+
assert arrow_field.nullable
10531068

10541069

10551070
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@@ -1101,7 +1116,11 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
11011116
arrow_schema = arrow_table.schema
11021117

11031118
expected_fields = [
1104-
pyarrow.field("field01", "string", nullable=False),
1119+
# Even if the remote schema is REQUIRED, there's a chance there's
1120+
# local NULL values. Arrow will gladly interpret these NULL values
1121+
# as non-NULL and give you an arbitrary value. See:
1122+
# https://github.com/googleapis/python-bigquery/issues/1692
1123+
pyarrow.field("field01", "string", nullable=True),
11051124
pyarrow.field("field02", "bool", nullable=True),
11061125
]
11071126
assert list(arrow_schema) == expected_fields

0 commit comments

Comments (0)