Skip to content

feat: support describe for non-numerical type string #973

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 70 additions & 18 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import datetime
import inspect
import itertools
import re
import sys
import textwrap
Expand Down Expand Up @@ -70,6 +71,7 @@
import bigframes.exceptions
import bigframes.formatting_helpers as formatter
import bigframes.operations as ops
import bigframes.operations.aggregations
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.plotting as plotting
import bigframes.operations.structs
Expand Down Expand Up @@ -2207,14 +2209,17 @@ def agg(
self, func: str | typing.Sequence[str]
) -> DataFrame | bigframes.series.Series:
if utils.is_list_like(func):
if any(
dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
for dtype in self.dtypes
):
raise NotImplementedError(
f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}"
)
aggregations = [agg_ops.lookup_agg_func(f) for f in func]

for dtype, agg in itertools.product(self.dtypes, aggregations):
if not bigframes.operations.aggregations.is_agg_op_supported(
dtype, agg
):
raise NotImplementedError(
f"Type {dtype} does not support aggregation {agg}. "
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
)

return DataFrame(
self._block.summarize(
self._block.value_columns,
Expand Down Expand Up @@ -2280,16 +2285,55 @@ def melt(
self._block.melt(id_col_ids, val_col_ids, var_name, value_name)
)

def describe(self) -> DataFrame:
df_numeric = self._drop_non_numeric(permissive=False)
if len(df_numeric.columns) == 0:
raise NotImplementedError(
f"df.describe() currently only supports numeric values. {constants.FEEDBACK_LINK}"
_NUMERICAL_DISCRIBE_AGGS = (
"count",
"mean",
"std",
"min",
"25%",
"50%",
"75%",
"max",
)
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
if len(numeric_df.columns) == 0:
# Describe eligible non-numerical columns
result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
else:
# Otherwise, only describe numerical columns
result = numeric_df.agg(self._NUMERICAL_DISCRIBE_AGGS)
return typing.cast(DataFrame, result)

elif include == "all":
numeric_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._NUMERICAL_DISCRIBE_AGGS
),
)
string_result = typing.cast(
DataFrame,
self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
)
result = df_numeric.agg(
["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
)
return typing.cast(DataFrame, result)

if len(numeric_result.columns) == 0:
return string_result
elif len(string_result.columns) == 0:
return numeric_result
else:
import bigframes.core.reshape as rs

# Use reindex after join to preserve the original column order.
return rs.concat(
[numeric_result, string_result], axis=1
)._reindex_columns(self.columns)

else:
raise ValueError(f"Unsupported include type: {include}")

def skew(self, *, numeric_only: bool = False):
if not numeric_only:
Expand Down Expand Up @@ -2487,18 +2531,26 @@ def unstack(self, level: LevelsType = -1):
return DataFrame(pivot_block)

def _drop_non_numeric(self, permissive=True) -> DataFrame:
types_to_keep = (
numerical_types = (
set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
if permissive
else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)
)
non_numeric_cols = [
col_id
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
if dtype not in types_to_keep
if dtype not in numerical_types
]
return DataFrame(self._block.drop_columns(non_numeric_cols))

def _drop_non_string(self) -> DataFrame:
    """Return a DataFrame holding only this frame's string-typed columns.

    A value column is kept when its dtype equals
    ``bigframes.dtypes.STRING_DTYPE``; every other value column is left out.
    """
    block = self._block
    kept_ids = []
    for column_id, column_dtype in zip(block.value_columns, block.dtypes):
        if column_dtype == bigframes.dtypes.STRING_DTYPE:
            kept_ids.append(column_id)
    return DataFrame(block.select_columns(kept_ids))

def _drop_non_bool(self) -> DataFrame:
non_bool_cols = [
col_id
Expand Down
11 changes: 11 additions & 0 deletions bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,3 +562,14 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
return _AGGREGATIONS_LOOKUP[key]
else:
raise ValueError(f"Unrecognize aggregate function: {key}")


def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
    """Report whether aggregation ``op`` may be applied to a column of ``dtype``.

    Args:
        dtype: The column dtype to check.
        op: The aggregation operation to check.

    Returns:
        True for any dtype in ``NUMERIC_BIGFRAMES_TYPES_PERMISSIVE``; for the
        string dtype, True only when ``op`` is a ``CountOp`` or ``NuniqueOp``;
        False for every other dtype.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
        # Numeric types accept every aggregation.
        supported = True
    elif dtype == dtypes.STRING_DTYPE:
        # Strings can only be counted (total or distinct).
        supported = isinstance(op, (CountOp, NuniqueOp))
    else:
        # For all other types, support no aggregation.
        supported = False
    return supported
81 changes: 81 additions & 0 deletions tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2612,6 +2612,87 @@ def test_df_describe(scalars_dfs):
).all()


@skip_legacy_pandas
@pytest.mark.parametrize("include", [None, "all"])
def test_df_describe_non_numerical(scalars_dfs, include):
    """describe() on a string-only frame agrees with pandas on count/unique."""
    scalars_df, scalars_pandas_df = scalars_dfs
    non_numerical_columns = ["string_col"]

    bf_described = (
        scalars_df[non_numerical_columns].describe(include=include).to_pandas()
    )
    pd_described = scalars_pandas_df[non_numerical_columns].describe(include=include)

    # Compare only the listed statistics, in a fixed order, because the
    # relative order of the rows is not important.
    actual = bf_described.reindex(["count", "nunique"])
    # The BigQuery DataFrames counterpart of pandas' "unique" row is "nunique".
    expected = pd_described.reindex(["count", "unique"]).rename(
        index={"unique": "nunique"}
    )

    pd.testing.assert_frame_equal(
        expected[non_numerical_columns].astype("Int64"),
        actual[non_numerical_columns],
        check_index_type=False,
    )


@skip_legacy_pandas
def test_df_describe_mixed_types_include_all(scalars_dfs):
    """describe(include="all") on numeric and string columns agrees with pandas.

    Exact statistics (count, nunique, mean, std, min, max) are compared against
    pandas. The quantile rows are approximate, so they are only checked for a
    plausible ordering: min <= 25% <= 50% <= 75% <= max.
    """
    scalars_df, scalars_pandas_df = scalars_dfs

    numerical_columns = [
        "int64_col",
        "float64_col",
    ]
    non_numerical_columns = ["string_col"]
    supported_columns = numerical_columns + non_numerical_columns

    modified_bf = scalars_df[supported_columns]
    bf_result = modified_bf.describe(include="all").to_pandas()

    modified_pd_df = scalars_pandas_df[supported_columns]
    pd_result = modified_pd_df.describe(include="all")

    # Save the min/quantile/max rows before the reindex below drops the
    # quartiles; quartiles are approximate, so they are not compared to pandas.
    bf_min = bf_result.loc["min", :]
    bf_p25 = bf_result.loc["25%", :]
    bf_p50 = bf_result.loc["50%", :]
    bf_p75 = bf_result.loc["75%", :]
    bf_max = bf_result.loc["max", :]

    # Reindex results with the specified keys and their order, because
    # the relative order is not important.
    bf_result = bf_result.reindex(["count", "nunique", "mean", "std", "min", "max"])
    pd_result = pd_result.reindex(
        ["count", "unique", "mean", "std", "min", "max"]
        # The BigQuery DataFrames counterpart of "unique" is called "nunique".
    ).rename(index={"unique": "nunique"})

    pd.testing.assert_frame_equal(
        pd_result[numerical_columns].astype("Float64"),
        bf_result[numerical_columns],
        check_index_type=False,
    )

    pd.testing.assert_frame_equal(
        pd_result[non_numerical_columns].astype("Int64"),
        bf_result[non_numerical_columns],
        check_index_type=False,
    )

    # Double-check that quantiles are at least plausible. Each link of the
    # chain compares two different rows; comparing a row with itself would
    # always pass and leave the 50%/75% ordering unverified.
    assert (
        (bf_min <= bf_p25)
        & (bf_p25 <= bf_p50)
        & (bf_p50 <= bf_p75)
        & (bf_p75 <= bf_max)
    ).all()


def test_df_transpose():
# Include some floats to ensure type coercion
values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]]
Expand Down
13 changes: 13 additions & 0 deletions tests/unit/operations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
92 changes: 92 additions & 0 deletions tests/unit/operations/test_aggregations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

import bigframes.dtypes as dtypes
from bigframes.operations.aggregations import (
all_op,
any_op,
count_op,
dense_rank_op,
first_op,
is_agg_op_supported,
max_op,
mean_op,
median_op,
min_op,
nunique_op,
product_op,
rank_op,
size_op,
std_op,
sum_op,
var_op,
)

# Aggregation ops used to parametrize the support checks in this module.
_ALL_OPS = {
    size_op,
    sum_op,
    mean_op,
    median_op,
    product_op,
    max_op,
    min_op,
    std_op,
    var_op,
    count_op,
    nunique_op,
    rank_op,
    dense_rank_op,
    all_op,
    any_op,
    first_op,
}
# The subset of ops expected to be supported on string columns.
_STRING_SUPPORTED_OPS = {count_op, nunique_op}


@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_numerical_support_all(dtype, op):
    """Every op in _ALL_OPS is reported as supported for permissive numeric dtypes."""
    supported = is_agg_op_supported(dtype, op)

    assert supported is True


@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_support_ops(dtype, op):
    """The string dtype reports support for each op in _STRING_SUPPORTED_OPS."""
    supported = is_agg_op_supported(dtype, op)

    assert supported is True


@pytest.mark.parametrize("dtype", [dtypes.STRING_DTYPE])
@pytest.mark.parametrize("op", _ALL_OPS - _STRING_SUPPORTED_OPS)
def test_is_agg_op_supported_string_not_support_ops(dtype, op):
    """The string dtype reports no support for ops outside _STRING_SUPPORTED_OPS."""
    supported = is_agg_op_supported(dtype, op)

    assert supported is False


@pytest.mark.parametrize(
    "dtype",
    [
        dtypes.BYTES_DTYPE,
        dtypes.DATE_DTYPE,
        dtypes.TIME_DTYPE,
        dtypes.DATETIME_DTYPE,
        dtypes.TIMESTAMP_DTYPE,
        dtypes.GEO_DTYPE,
    ],
)
@pytest.mark.parametrize("op", _ALL_OPS)
def test_is_agg_op_supported_non_numerical_no_support(dtype, op):
    """The listed non-numeric, non-string dtypes report no supported op."""
    supported = is_agg_op_supported(dtype, op)

    assert supported is False