Skip to content

feat: add subset parameter to DataFrame.dropna to select which columns to consider #981

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import functools
import typing
from typing import Sequence
from typing import Optional, Sequence

import bigframes_vendored.constants as constants
import pandas as pd
Expand Down Expand Up @@ -488,11 +488,19 @@ def dropna(
block: blocks.Block,
column_ids: typing.Sequence[str],
how: typing.Literal["all", "any"] = "any",
subset: Optional[typing.Sequence[str]] = None,
):
"""
Drop na entries from block
"""
predicates = [ops.notnull_op.as_expr(column_id) for column_id in column_ids]
if subset is None:
subset = column_ids

predicates = [
ops.notnull_op.as_expr(column_id)
for column_id in column_ids
if column_id in subset
]
if len(predicates) == 0:
return block
if how == "any":
Expand Down
22 changes: 20 additions & 2 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2027,8 +2027,9 @@ def dropna(
self,
*,
axis: int | str = 0,
inplace: bool = False,
how: str = "any",
subset: typing.Union[None, blocks.Label, Sequence[blocks.Label]] = None,
inplace: bool = False,
ignore_index=False,
) -> DataFrame:
if inplace:
Expand All @@ -2040,8 +2041,25 @@ def dropna(

axis_n = utils.get_axis_number(axis)

if subset is not None and axis_n != 0:
raise NotImplementedError(
f"subset only supported when axis=0. {constants.FEEDBACK_LINK}"
)

if axis_n == 0:
result = block_ops.dropna(self._block, self._block.value_columns, how=how) # type: ignore
# subset needs to be converted into column IDs, not column labels.
if subset is None:
subset_ids = None
elif not utils.is_list_like(subset):
subset_ids = [id_ for id_ in self._block.label_to_col_id[subset]]
else:
subset_ids = [
id_
for label in subset
for id_ in self._block.label_to_col_id[label]
]

result = block_ops.dropna(self._block, self._block.value_columns, how=how, subset=subset_ids) # type: ignore
if ignore_index:
result = result.reset_index()
return DataFrame(result)
Expand Down
21 changes: 13 additions & 8 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,19 +936,24 @@ def test_assign_callable_lambda(scalars_dfs):

@skip_legacy_pandas
@pytest.mark.parametrize(
("axis", "how", "ignore_index"),
("axis", "how", "ignore_index", "subset"),
[
(0, "any", False),
(0, "any", True),
(1, "any", False),
(1, "all", False),
(0, "any", False, None),
(0, "any", True, None),
(0, "all", False, ["bool_col", "time_col"]),
(0, "any", False, ["bool_col", "time_col"]),
(0, "all", False, "time_col"),
(1, "any", False, None),
(1, "all", False, None),
],
)
def test_df_dropna(scalars_dfs, axis, how, ignore_index):
def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset):
scalars_df, scalars_pandas_df = scalars_dfs
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset)
bf_result = df.to_pandas()
pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
pd_result = scalars_pandas_df.dropna(
axis=axis, how=how, ignore_index=ignore_index, subset=subset
)

# Pandas uses int64 instead of Int64 (nullable) dtype.
pd_result.index = pd_result.index.astype(pd.Int64Dtype())
Expand Down
9 changes: 9 additions & 0 deletions tests/unit/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@
from . import resources


def test_dataframe_dropna_axis_1_subset_not_implememented(
    monkeypatch: pytest.MonkeyPatch,
):
    """Check that ``dropna`` raises when ``subset`` is combined with ``axis=1``."""
    df = resources.create_dataframe(monkeypatch)
    subset_labels = ["col1", "col2"]

    # The raised NotImplementedError must mention "subset" in its message.
    with pytest.raises(NotImplementedError, match="subset"):
        df.dropna(axis=1, subset=subset_labels)


def test_dataframe_repr_with_uninitialized_object():
"""Ensures DataFrame.__init__ can be paused in a visual debugger without crashing.

Expand Down
17 changes: 17 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1614,6 +1614,8 @@ def dropna(
*,
axis: int | str = 0,
how: str = "any",
subset=None,
inplace: bool = False,
ignore_index=False,
) -> DataFrame:
"""Remove missing values.
Expand Down Expand Up @@ -1662,6 +1664,15 @@ def dropna(
<BLANKLINE>
[3 rows x 3 columns]

Define in which columns to look for missing values.
Copy link
Collaborator

@Genesis929 Genesis929 Sep 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From doctest.
@@ -1,5 +1,5 @@
- name toy born
-1 Batman Batmobile 1940-04-25
-2 Catwoman Bullwhip NaT
+ name toy born
+1 Batman Batmobile 1940-04-25
+2 Catwoman Bullwhip <NA>

[2 rows x 3 columns]

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, fixed! The diff got a little big because I pulled in #987. It should be easier to review again once that is merged.


>>> df.dropna(subset=['name', 'toy'])
name toy born
1 Batman Batmobile 1940-04-25
2 Catwoman Bullwhip <NA>
<BLANKLINE>
[2 rows x 3 columns]

Args:
axis ({0 or 'index', 1 or 'columns'}, default 0):
Determine if rows or columns which contain missing values are
Expand All @@ -1675,6 +1686,12 @@ def dropna(

* 'any' : If any NA values are present, drop that row or column.
* 'all' : If all values are NA, drop that row or column.
subset (column label or sequence of labels, optional):
Labels along other axis to consider, e.g. if you are dropping
rows these would be a list of columns to include.
Only supports axis=0.
inplace (bool, default ``False``):
Not supported.
ignore_index (bool, default ``False``):
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

Expand Down