Skip to content

Commit 1fef2fc

Browse files
chelsea-lin, chalmerlowe
authored and committed
fix: keyerror when the load_table_from_dataframe accesses a unmapped dtype dataframe index (#1535)
1 parent 202b603 commit 1fef2fc

File tree

2 files changed

+82
-28
lines changed

2 files changed

+82
-28
lines changed

google/cloud/bigquery/_pandas_helpers.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -481,7 +481,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
481481
# pandas dtype.
482482
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
483483
if bq_type is None:
484-
sample_data = _first_valid(dataframe[column])
484+
sample_data = _first_valid(dataframe.reset_index()[column])
485485
if (
486486
isinstance(sample_data, _BaseGeometry)
487487
and sample_data is not None # Paranoia
@@ -544,7 +544,7 @@ def augment_schema(dataframe, current_bq_schema):
544544
augmented_schema.append(field)
545545
continue
546546

547-
arrow_table = pyarrow.array(dataframe[field.name])
547+
arrow_table = pyarrow.array(dataframe.reset_index()[field.name])
548548

549549
if pyarrow.types.is_list(arrow_table.type):
550550
# `pyarrow.ListType`

tests/unit/test__pandas_helpers.py

+80 -26
Original file line number | Diff line number | Diff line change
@@ -930,32 +930,6 @@ def test_list_columns_and_indexes_with_multiindex(module_under_test):
930930
assert columns_and_indexes == expected
931931

932932

933-
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
934-
def test_dataframe_to_bq_schema_dict_sequence(module_under_test):
935-
df_data = collections.OrderedDict(
936-
[
937-
("str_column", ["hello", "world"]),
938-
("int_column", [42, 8]),
939-
("bool_column", [True, False]),
940-
]
941-
)
942-
dataframe = pandas.DataFrame(df_data)
943-
944-
dict_schema = [
945-
{"name": "str_column", "type": "STRING", "mode": "NULLABLE"},
946-
{"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"},
947-
]
948-
949-
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, dict_schema)
950-
951-
expected_schema = (
952-
schema.SchemaField("str_column", "STRING", "NULLABLE"),
953-
schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
954-
schema.SchemaField("bool_column", "BOOL", "REQUIRED"),
955-
)
956-
assert returned_schema == expected_schema
957-
958-
959933
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
960934
def test_dataframe_to_arrow_with_multiindex(module_under_test):
961935
bq_schema = (
@@ -1190,6 +1164,86 @@ def test_dataframe_to_parquet_compression_method(module_under_test):
11901164
assert call_args.kwargs.get("compression") == "ZSTD"
11911165

11921166

1167+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1168+
def test_dataframe_to_bq_schema_w_named_index(module_under_test):
1169+
df_data = collections.OrderedDict(
1170+
[
1171+
("str_column", ["hello", "world"]),
1172+
("int_column", [42, 8]),
1173+
("bool_column", [True, False]),
1174+
]
1175+
)
1176+
index = pandas.Index(["a", "b"], name="str_index")
1177+
dataframe = pandas.DataFrame(df_data, index=index)
1178+
1179+
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
1180+
1181+
expected_schema = (
1182+
schema.SchemaField("str_index", "STRING", "NULLABLE"),
1183+
schema.SchemaField("str_column", "STRING", "NULLABLE"),
1184+
schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
1185+
schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"),
1186+
)
1187+
assert returned_schema == expected_schema
1188+
1189+
1190+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1191+
def test_dataframe_to_bq_schema_w_multiindex(module_under_test):
1192+
df_data = collections.OrderedDict(
1193+
[
1194+
("str_column", ["hello", "world"]),
1195+
("int_column", [42, 8]),
1196+
("bool_column", [True, False]),
1197+
]
1198+
)
1199+
index = pandas.MultiIndex.from_tuples(
1200+
[
1201+
("a", 0, datetime.datetime(1999, 12, 31, 23, 59, 59, 999999)),
1202+
("a", 0, datetime.datetime(2000, 1, 1, 0, 0, 0)),
1203+
],
1204+
names=["str_index", "int_index", "dt_index"],
1205+
)
1206+
dataframe = pandas.DataFrame(df_data, index=index)
1207+
1208+
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, [])
1209+
1210+
expected_schema = (
1211+
schema.SchemaField("str_index", "STRING", "NULLABLE"),
1212+
schema.SchemaField("int_index", "INTEGER", "NULLABLE"),
1213+
schema.SchemaField("dt_index", "DATETIME", "NULLABLE"),
1214+
schema.SchemaField("str_column", "STRING", "NULLABLE"),
1215+
schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
1216+
schema.SchemaField("bool_column", "BOOLEAN", "NULLABLE"),
1217+
)
1218+
assert returned_schema == expected_schema
1219+
1220+
1221+
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
1222+
def test_dataframe_to_bq_schema_w_bq_schema(module_under_test):
1223+
df_data = collections.OrderedDict(
1224+
[
1225+
("str_column", ["hello", "world"]),
1226+
("int_column", [42, 8]),
1227+
("bool_column", [True, False]),
1228+
]
1229+
)
1230+
dataframe = pandas.DataFrame(df_data)
1231+
1232+
dict_schema = [
1233+
{"name": "str_column", "type": "STRING", "mode": "NULLABLE"},
1234+
{"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"},
1235+
]
1236+
1237+
returned_schema = module_under_test.dataframe_to_bq_schema(dataframe, dict_schema)
1238+
1239+
expected_schema = (
1240+
schema.SchemaField("str_column", "STRING", "NULLABLE"),
1241+
schema.SchemaField("int_column", "INTEGER", "NULLABLE"),
1242+
schema.SchemaField("bool_column", "BOOL", "REQUIRED"),
1243+
)
1244+
assert returned_schema == expected_schema
1245+
1246+
11931247
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
11941248
def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow(module_under_test):
11951249
dataframe = pandas.DataFrame(

0 commit comments

Comments
 (0)