Skip to content

Commit be3e354

Browse files
perf: reimplement unpivot to use cross join rather than union
1 parent 5e199ec commit be3e354

File tree

1 file changed

+56
-41
lines changed

1 file changed

+56
-41
lines changed

bigframes/core/__init__.py

+56 -41
Original file line number | Diff line number | Diff line change
@@ -982,61 +982,76 @@ def unpivot(
982982
ArrayValue: The unpivoted ArrayValue
983983
"""
984984
table = self._to_ibis_expr(ordering_mode="offset_col")
985-
sub_expressions = []
986-
987-
# Use ibis memtable to infer type of rowlabels (if possible)
988-
# TODO: Allow caller to specify dtype
989-
labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type()
990-
labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type)
991-
992985
row_n = len(row_labels)
993986
if not all(
994987
len(source_columns) == row_n for _, source_columns in unpivot_columns
995988
):
996989
raise ValueError("Columns and row labels must all be same length.")
997990

998-
for i in range(row_n):
999-
values = []
1000-
for j in range(len(unpivot_columns)):
1001-
result_col, source_cols = unpivot_columns[j]
1002-
col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype
1003-
if source_cols[i] is not None:
1004-
values.append(
1005-
ops.AsTypeOp(col_dtype)
1006-
._as_ibis(table[source_cols[i]])
1007-
.name(result_col)
1008-
)
1009-
else:
1010-
values.append(
1011-
bigframes.dtypes.literal_to_ibis_scalar(
1012-
None, force_dtype=col_dtype
1013-
).name(result_col)
1014-
)
1015-
offsets_value = (
1016-
((table[ORDER_ID_COLUMN] * row_n) + i)
1017-
.cast(ibis_dtypes.int64)
1018-
.name(ORDER_ID_COLUMN),
991+
unpivot_offset_id = bigframes.core.guid.generate_guid("unpivot_offsets_")
992+
unpivot_table = table.cross_join(
993+
ibis.memtable({unpivot_offset_id: range(row_n)})
994+
)
995+
unpivot_offsets_value = (
996+
(
997+
(unpivot_table[ORDER_ID_COLUMN] * row_n)
998+
+ unpivot_table[unpivot_offset_id]
1019999
)
1020-
sub_expr = table.select(
1021-
passthrough_columns,
1000+
.cast(ibis_dtypes.int64)
1001+
.name(ORDER_ID_COLUMN),
1002+
)
1003+
1004+
# Use ibis memtable to infer type of rowlabels (if possible)
1005+
# TODO: Allow caller to specify dtype
1006+
labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type()
1007+
labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type)
1008+
cases = [
1009+
(
1010+
i,
10221011
bigframes.dtypes.literal_to_ibis_scalar(
1023-
row_labels[i], force_dtype=labels_dtype # type:ignore
1024-
).name(index_col_id),
1025-
*values,
1026-
offsets_value,
1012+
row_labels[i], force_dtype=labels_dtype
1013+
),
10271014
)
1028-
sub_expressions.append(sub_expr)
1029-
rotated_table = ibis.union(*sub_expressions)
1015+
for i in range(len(row_labels))
1016+
]
1017+
labels_value = (
1018+
typing.cast(ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id])
1019+
.cases(cases, default=None)
1020+
.name(index_col_id)
1021+
)
1022+
1023+
unpivot_values = []
1024+
for j in range(len(unpivot_columns)):
1025+
col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype
1026+
result_col, source_cols = unpivot_columns[j]
1027+
null_value = bigframes.dtypes.literal_to_ibis_scalar(
1028+
None, force_dtype=col_dtype
1029+
)
1030+
ibis_values = [
1031+
ops.AsTypeOp(col_dtype)._as_ibis(unpivot_table[col])
1032+
if col is not None
1033+
else null_value
1034+
for col in source_cols
1035+
]
1036+
cases = [(i, ibis_values[i]) for i in range(len(ibis_values))]
1037+
unpivot_value = typing.cast(
1038+
ibis_types.IntegerColumn, unpivot_table[unpivot_offset_id]
1039+
).cases(cases, default=null_value)
1040+
unpivot_values.append(unpivot_value.name(result_col))
1041+
1042+
unpivot_table = unpivot_table.select(
1043+
passthrough_columns, labels_value, *unpivot_values, unpivot_offsets_value
1044+
)
10301045

10311046
value_columns = [
1032-
rotated_table[value_col_id] for value_col_id, _ in unpivot_columns
1047+
unpivot_table[value_col_id] for value_col_id, _ in unpivot_columns
10331048
]
1034-
passthrough_values = [rotated_table[col] for col in passthrough_columns]
1049+
passthrough_values = [unpivot_table[col] for col in passthrough_columns]
10351050
return ArrayValue(
10361051
session=self._session,
1037-
table=rotated_table,
1038-
columns=[rotated_table[index_col_id], *value_columns, *passthrough_values],
1039-
hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]],
1052+
table=unpivot_table,
1053+
columns=[unpivot_table[index_col_id], *value_columns, *passthrough_values],
1054+
hidden_ordering_columns=[unpivot_table[ORDER_ID_COLUMN]],
10401055
ordering=ExpressionOrdering(
10411056
ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
10421057
integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),

0 commit comments

Comments
 (0)