Skip to content

Commit ab0cf4c

Browse files
authored
feat: support nullable boolean and Int64 dtypes in insert_rows_from_dataframe (#1816)
1 parent 57be031 commit ab0cf4c

File tree

3 files changed

+79
-18
lines changed

3 files changed

+79
-18
lines changed

google/cloud/bigquery/_pandas_helpers.py

+19
Original file line number | Diff line number | Diff line change
@@ -958,6 +958,25 @@ def dataframe_to_json_generator(dataframe):
958958
# considered a NaN, however.
959959
if isinstance(is_nan, bool) and is_nan:
960960
continue
961+
962+
# Convert numpy types to corresponding Python types.
963+
# https://stackoverflow.com/a/60441783/101923
964+
if isinstance(value, numpy.bool_):
965+
value = bool(value)
966+
elif isinstance(
967+
value,
968+
(
969+
numpy.int64,
970+
numpy.int32,
971+
numpy.int16,
972+
numpy.int8,
973+
numpy.uint64,
974+
numpy.uint32,
975+
numpy.uint16,
976+
numpy.uint8,
977+
),
978+
):
979+
value = int(value)
961980
output[column] = value
962981

963982
yield output

tests/system/test_pandas.py

+12 -1
Original file line number | Diff line number | Diff line change
@@ -835,7 +835,9 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
835835
schema = [
836836
SF("float_col", "FLOAT", mode="REQUIRED"),
837837
SF("int_col", "INTEGER", mode="REQUIRED"),
838+
SF("int64_col", "INTEGER", mode="NULLABLE"),
838839
SF("bool_col", "BOOLEAN", mode="REQUIRED"),
840+
SF("boolean_col", "BOOLEAN", mode="NULLABLE"),
839841
SF("string_col", "STRING", mode="NULLABLE"),
840842
SF("date_col", "DATE", mode="NULLABLE"),
841843
SF("time_col", "TIME", mode="NULLABLE"),
@@ -898,6 +900,15 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
898900
dataframe["date_col"] = dataframe["date_col"].astype("dbdate")
899901
dataframe["time_col"] = dataframe["time_col"].astype("dbtime")
900902

903+
# Support nullable integer and boolean dtypes.
904+
# https://github.com/googleapis/python-bigquery/issues/1815
905+
dataframe["int64_col"] = pandas.Series(
906+
[-11, -22, pandas.NA, -44, -55, -66], dtype="Int64"
907+
)
908+
dataframe["boolean_col"] = pandas.Series(
909+
[True, False, True, pandas.NA, True, False], dtype="boolean"
910+
)
911+
901912
table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe"
902913
table_arg = bigquery.Table(table_id, schema=schema)
903914
table = helpers.retry_403(bigquery_client.create_table)(table_arg)
@@ -910,7 +921,7 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
910921
expected = [
911922
# Pandas often represents NULL values as NaN. Convert to None for
912923
# easier comparison.
913-
tuple(None if col != col else col for col in data_row)
924+
tuple(None if pandas.isna(col) else col for col in data_row)
914925
for data_row in dataframe.itertuples(index=False)
915926
]
916927

tests/unit/test__pandas_helpers.py

+48 -17
Original file line number | Diff line number | Diff line change
@@ -808,29 +808,60 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name(
808808
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
809809
def test_dataframe_to_json_generator(module_under_test):
810810
utcnow = datetime.datetime.utcnow()
811-
df_data = collections.OrderedDict(
812-
[
813-
("a_series", [pandas.NA, 2, 3, 4]),
814-
("b_series", [0.1, float("NaN"), 0.3, 0.4]),
815-
("c_series", ["a", "b", pandas.NA, "d"]),
816-
("d_series", [utcnow, utcnow, utcnow, pandas.NaT]),
817-
("e_series", [True, False, True, None]),
818-
]
819-
)
820811
dataframe = pandas.DataFrame(
821-
df_data, index=pandas.Index([4, 5, 6, 7], name="a_index")
812+
{
813+
"a_series": [1, 2, 3, 4],
814+
"b_series": [0.1, float("NaN"), 0.3, 0.4],
815+
"c_series": ["a", "b", pandas.NA, "d"],
816+
"d_series": [utcnow, utcnow, utcnow, pandas.NaT],
817+
"e_series": [True, False, True, None],
818+
# Support nullable dtypes.
819+
# https://github.com/googleapis/python-bigquery/issues/1815
820+
"boolean_series": pandas.Series(
821+
[True, False, pandas.NA, False], dtype="boolean"
822+
),
823+
"int64_series": pandas.Series([-1, pandas.NA, -3, -4], dtype="Int64"),
824+
}
822825
)
823826

824-
dataframe = dataframe.astype({"a_series": pandas.Int64Dtype()})
827+
# Index is not included, even if it is not the default and has a name.
828+
dataframe = dataframe.rename(index=lambda idx: idx + 4)
829+
dataframe.index.name = "a_index"
825830

826-
rows = module_under_test.dataframe_to_json_generator(dataframe)
831+
rows = list(module_under_test.dataframe_to_json_generator(dataframe))
827832
expected = [
828-
{"b_series": 0.1, "c_series": "a", "d_series": utcnow, "e_series": True},
829-
{"a_series": 2, "c_series": "b", "d_series": utcnow, "e_series": False},
830-
{"a_series": 3, "b_series": 0.3, "d_series": utcnow, "e_series": True},
831-
{"a_series": 4, "b_series": 0.4, "c_series": "d"},
833+
{
834+
"a_series": 1,
835+
"b_series": 0.1,
836+
"c_series": "a",
837+
"d_series": utcnow,
838+
"e_series": True,
839+
"boolean_series": True,
840+
"int64_series": -1,
841+
},
842+
{
843+
"a_series": 2,
844+
"c_series": "b",
845+
"d_series": utcnow,
846+
"e_series": False,
847+
"boolean_series": False,
848+
},
849+
{
850+
"a_series": 3,
851+
"b_series": 0.3,
852+
"d_series": utcnow,
853+
"e_series": True,
854+
"int64_series": -3,
855+
},
856+
{
857+
"a_series": 4,
858+
"b_series": 0.4,
859+
"c_series": "d",
860+
"boolean_series": False,
861+
"int64_series": -4,
862+
},
832863
]
833-
assert list(rows) == expected
864+
assert rows == expected
834865

835866

836867
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")

0 commit comments

Comments
 (0)