Skip to content

fix: Fix list-like indexers in partial ordering mode #1456

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2312,13 +2312,15 @@ def _apply_binop(

return self.project_exprs(exprs, labels=labels, drop=True)

# TODO: Re-implement join in terms of merge (requires also adding remaining merge args)
def join(
self,
other: Block,
*,
how="left",
sort: bool = False,
block_identity_join: bool = False,
always_order: bool = False,
) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]:
"""
Join two block objects together, and provide mappings between source columns and output columns.
Expand All @@ -2332,6 +2334,8 @@ def join(
if true will sort result by index
block_identity_join (bool):
If true, will not convert join to a projection (implicitly assuming unique indices)
always_order (bool):
If true, will always preserve input ordering, even if ordering mode is partial

Returns:
Block, (left_mapping, right_mapping): Result block and mappers from input column ids to result column ids.
Expand Down Expand Up @@ -2377,10 +2381,14 @@ def join(
self._throw_if_null_index("join")
other._throw_if_null_index("join")
if self.index.nlevels == other.index.nlevels == 1:
return join_mono_indexed(self, other, how=how, sort=sort)
return join_mono_indexed(
self, other, how=how, sort=sort, propogate_order=always_order
)
else: # Handles cases where one or both sides are multi-indexed
# Always sort multi-index join
return join_multi_indexed(self, other, how=how, sort=sort)
return join_multi_indexed(
self, other, how=how, sort=sort, propogate_order=always_order
)

def is_monotonic_increasing(
self, column_id: typing.Union[str, Sequence[str]]
Expand Down Expand Up @@ -2837,7 +2845,8 @@ def join_mono_indexed(
right: Block,
*,
how="left",
sort=False,
sort: bool = False,
propogate_order: bool = False,
) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]:
left_expr = left.expr
right_expr = right.expr
Expand All @@ -2848,6 +2857,7 @@ def join_mono_indexed(
conditions=(
join_defs.JoinCondition(left.index_columns[0], right.index_columns[0]),
),
propogate_order=propogate_order,
)

left_index = get_column_left[left.index_columns[0]]
Expand Down Expand Up @@ -2882,7 +2892,8 @@ def join_multi_indexed(
right: Block,
*,
how="left",
sort=False,
sort: bool = False,
propogate_order: bool = False,
) -> Tuple[Block, Tuple[Mapping[str, str], Mapping[str, str]],]:
if not (left.index.is_uniquely_named() and right.index.is_uniquely_named()):
raise ValueError("Joins not supported on indices with non-unique level names")
Expand Down Expand Up @@ -2911,6 +2922,7 @@ def join_multi_indexed(
join_defs.JoinCondition(left, right)
for left, right in zip(left_join_ids, right_join_ids)
),
propogate_order=propogate_order,
)

left_ids_post_join = [get_column_left[id] for id in left_join_ids]
Expand Down
12 changes: 10 additions & 2 deletions bigframes/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,12 +379,14 @@ def _perform_loc_list_join(
result = typing.cast(
bigframes.series.Series,
series_or_dataframe.to_frame()._perform_join_by_index(
keys_index, how="right"
keys_index, how="right", always_order=True
)[name],
)
result = result.rename(original_name)
else:
result = series_or_dataframe._perform_join_by_index(keys_index, how="right")
result = series_or_dataframe._perform_join_by_index(
keys_index, how="right", always_order=True
)

if drop_levels and series_or_dataframe.index.nlevels > keys_index.nlevels:
# drop common levels
Expand Down Expand Up @@ -492,6 +494,12 @@ def _iloc_getitem_series_or_dataframe(

# set to offset index and use regular loc, then restore index
df = df.reset_index(drop=False)
block = df._block
# Explicitly set the index to offsets; reset_index may not generate offsets in some modes.
block, offsets_id = block.promote_offsets("temp_iloc_offsets_")
block = block.set_index([offsets_id])
df = bigframes.dataframe.DataFrame(block)

result = df.loc[key]
result = result.set_index(temporary_index_names)
result = result.rename_axis(original_index_names)
Expand Down
10 changes: 8 additions & 2 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3226,9 +3226,15 @@ def join(
return left._perform_join_by_index(right, how=how)

def _perform_join_by_index(
self, other: Union[DataFrame, indexes.Index], *, how: str = "left"
self,
other: Union[DataFrame, indexes.Index],
*,
how: str = "left",
always_order: bool = False,
):
block, _ = self._block.join(other._block, how=how, block_identity_join=True)
block, _ = self._block.join(
other._block, how=how, block_identity_join=True, always_order=always_order
)
return DataFrame(block)

@validations.requires_ordering()
Expand Down
10 changes: 10 additions & 0 deletions tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,16 @@ def scalars_df_index(
return session.read_gbq(scalars_table_id, index_col="rowindex")


@pytest.fixture(scope="session")
def scalars_df_partial_ordering(
    scalars_table_id: str, unordered_session: bigframes.Session
) -> bigframes.dataframe.DataFrame:
    """Session-scoped DataFrame over the scalars test table.

    The table is read through the ``unordered_session`` fixture with
    ``rowindex`` as the index column, and the result is sorted by that index
    before being returned.
    """
    unsorted_df = unordered_session.read_gbq(scalars_table_id, index_col="rowindex")
    return unsorted_df.sort_index()


@pytest.fixture(scope="session")
def scalars_df_null_index(
scalars_table_id: str, session: bigframes.Session
Expand Down
14 changes: 14 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4400,6 +4400,20 @@ def test_iloc_list(scalars_df_index, scalars_pandas_df_index):
)


def test_iloc_list_partial_ordering(
    scalars_df_partial_ordering, scalars_pandas_df_index
):
    """Check list-based ``iloc`` on the partial-ordering fixture against pandas.

    The position list deliberately contains repeated and out-of-order entries,
    so the result must reproduce both the duplicates and the requested order.
    """
    positions = [0, 0, 0, 5, 4, 7]

    bf_selected = scalars_df_partial_ordering.iloc[positions]
    expected = scalars_pandas_df_index.iloc[positions]

    pd.testing.assert_frame_equal(bf_selected.to_pandas(), expected)


def test_iloc_list_multiindex(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
scalars_df = scalars_df.copy()
Expand Down