Skip to content

Commit 329ad7c

Browse files
NickCrews and cpcloud authored
feat(api): make topk() and value_counts() more flexible (#10928)
Co-authored-by: Phillip Cloud <[email protected]>
1 parent 07cc176 commit 329ad7c

File tree

5 files changed

+146
-30
lines changed

5 files changed

+146
-30
lines changed

ibis/backends/polars/compiler.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,11 @@ def operation(op, **_):
5555
raise com.OperationNotDefinedError(f"No translation rule for {type(op)}")
5656

5757

58+
@translate.register(ops.Alias)
59+
def alias(op, **kw):
60+
return translate(op.arg, **kw).alias(op.name)
61+
62+
5863
@translate.register(ops.DatabaseTable)
5964
def table(op, **_):
6065
return op.source._tables[op.name]

ibis/backends/tests/sql/snapshots/test_select_sql/test_chain_limit_doesnt_collapse/result.sql

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ FROM (
66
FROM (
77
SELECT
88
"t0"."city",
9-
COUNT(*) AS "CountStar(tbl)"
9+
COUNT(*) AS "city_count"
1010
FROM "tbl" AS "t0"
1111
GROUP BY
1212
1
1313
) AS "t1"
1414
ORDER BY
15-
"t1"."CountStar(tbl)" DESC
15+
"t1"."city_count" DESC
1616
LIMIT 10
1717
) AS "t3"
1818
LIMIT 5
@@ -25,13 +25,13 @@ OFFSET (
2525
FROM (
2626
SELECT
2727
"t0"."city",
28-
COUNT(*) AS "CountStar(tbl)"
28+
COUNT(*) AS "city_count"
2929
FROM "tbl" AS "t0"
3030
GROUP BY
3131
1
3232
) AS "t1"
3333
ORDER BY
34-
"t1"."CountStar(tbl)" DESC
34+
"t1"."city_count" DESC
3535
LIMIT 10
3636
) AS "t3"
3737
)

ibis/backends/tests/sql/snapshots/test_select_sql/test_topk_operation/e2.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ SEMI JOIN (
1111
FROM (
1212
SELECT
1313
"t0"."city",
14-
COUNT(*) AS "CountStar(tbl)"
14+
COUNT(*) AS "city_count"
1515
FROM "tbl" AS "t0"
1616
GROUP BY
1717
1
1818
) AS "t2"
1919
ORDER BY
20-
"t2"."CountStar(tbl)" DESC
20+
"t2"."city_count" DESC
2121
LIMIT 10
2222
) AS "t5"
2323
ON "t1"."city" = "t5"."city"

ibis/expr/types/generic.py

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2114,12 +2114,13 @@ def nunique(self, *, where: ir.BooleanValue | None = None) -> ir.IntegerScalar:
21142114
).to_expr()
21152115

21162116
def topk(
2117-
self, k: int, by: ir.Value | None = None, *, name: str | None = None
2117+
self,
2118+
k: int | None = None,
2119+
by: ir.Value | None = None,
2120+
*,
2121+
name: str | None = None,
21182122
) -> ir.Table:
2119-
"""Return a "top k" expression.
2120-
2121-
Computes a Table containing the top `k` values by a certain metric
2122-
(defaults to count).
2123+
"""Computes a Table of the top values by a metric (defaults to count).
21232124
21242125
::: {.callout-note title="Changed in version 9.5.0"}
21252126
Added `name` parameter.
@@ -2129,17 +2130,24 @@ def topk(
21292130
----------
21302131
k
21312132
The number of rows to return.
2133+
If `None`, all values are returned in descending order.
21322134
by
21332135
The metric to compute "top" by. Defaults to `count`.
21342136
name
2135-
The name to use for the metric column. A suitable name will be
2136-
automatically generated if not provided.
2137+
The name to use for the metric column.
2138+
If not provided, a suitable name will be generated.
21372139
21382140
Returns
21392141
-------
21402142
Table
21412143
The top `k` values.
21422144
2145+
See Also
2146+
--------
2147+
[`Column.value_counts`](./expression-generic.qmd#ibis.expr.types.generic.Column.value_counts)
2148+
[`Table.topk`](./expression-tables.qmd#ibis.expr.types.relations.Table.topk)
2149+
[`Table.value_counts`](./expression-tables.qmd#ibis.expr.types.relations.Table.value_counts)
2150+
21432151
Examples
21442152
--------
21452153
>>> import ibis
@@ -2149,15 +2157,15 @@ def topk(
21492157
Compute the top 3 diamond colors by frequency:
21502158
21512159
>>> t.color.topk(3)
2152-
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
2153-
┃ color ┃ CountStar(diamonds) ┃
2154-
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
2155-
│ string │ int64 │
2156-
├────────┼─────────────────────┤
2157-
│ G │ 11292 │
2158-
│ E │ 9797 │
2159-
│ F │ 9542 │
2160-
└────────┴─────────────────────┘
2160+
┏━━━━━━━━┳━━━━━━━━━━━━━┓
2161+
┃ color ┃ color_count ┃
2162+
┡━━━━━━━━╇━━━━━━━━━━━━━┩
2163+
│ string │ int64 │
2164+
├────────┼─────────────┤
2165+
│ G │ 11292 │
2166+
│ E │ 9797 │
2167+
│ F │ 9542 │
2168+
└────────┴─────────────┘
21612169
21622170
Compute the top 3 diamond colors by mean price:
21632171
@@ -2172,16 +2180,21 @@ def topk(
21722180
│ H │ 4486.669196 │
21732181
└────────┴─────────────┘
21742182
2175-
Compute the top 2 diamond colors by max carat:
2183+
Rank all the colors by max carat:
21762184
2177-
>>> t.color.topk(2, by=t.carat.max(), name="max_carat")
2185+
>>> t.color.topk(by=t.carat.max(), name="max_carat")
21782186
┏━━━━━━━━┳━━━━━━━━━━━┓
21792187
┃ color ┃ max_carat ┃
21802188
┡━━━━━━━━╇━━━━━━━━━━━┩
21812189
│ string │ float64 │
21822190
├────────┼───────────┤
21832191
│ J │ 5.01 │
21842192
│ H │ 4.13 │
2193+
│ I │ 4.01 │
2194+
│ D │ 3.40 │
2195+
│ E │ 3.05 │
2196+
│ F │ 3.01 │
2197+
│ G │ 3.01 │
21852198
└────────┴───────────┘
21862199
"""
21872200
from ibis.expr.types.relations import bind
@@ -2193,15 +2206,20 @@ def topk(
21932206

21942207
table = table.to_expr()
21952208

2209+
if by is None and name is None:
2210+
# if `by` is something more complex, the _count doesn't make sense.
2211+
name = f"{self.get_name()}_count"
21962212
if by is None:
21972213
by = lambda t: t.count()
21982214

21992215
(metric,) = bind(table, by)
2200-
22012216
if name is not None:
22022217
metric = metric.name(name)
22032218

2204-
return table.aggregate(metric, by=[self]).order_by(metric.desc()).limit(k)
2219+
in_desc = table.aggregate(metric, by=[self]).order_by(metric.desc())
2220+
if k is not None:
2221+
in_desc = in_desc.limit(k)
2222+
return in_desc
22052223

22062224
def arbitrary(self, *, where: ir.BooleanValue | None = None) -> Scalar:
22072225
"""Select an arbitrary value in a column.
@@ -2287,14 +2305,20 @@ def value_counts(self, *, name: str | None = None) -> ir.Table:
22872305
Parameters
22882306
----------
22892307
name
2290-
The name to use for the frequency column. A suitable name will be
2291-
automatically generated if not provided.
2308+
The name to use for the frequency column.
2309+
If not provided, a suitable name will be generated.
22922310
22932311
Returns
22942312
-------
22952313
Table
22962314
The frequency table.
22972315
2316+
See Also
2317+
--------
2318+
[`Column.topk`](./expression-generic.qmd#ibis.expr.types.generic.Column.topk)
2319+
[`Table.value_counts`](./expression-tables.qmd#ibis.expr.types.relations.Table.value_counts)
2320+
[`Table.topk`](./expression-tables.qmd#ibis.expr.types.relations.Table.topk)
2321+
22982322
Examples
22992323
--------
23002324
>>> import ibis

ibis/expr/types/relations.py

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4716,14 +4716,20 @@ def value_counts(self, *, name: str | None = None) -> ir.Table:
47164716
Parameters
47174717
----------
47184718
name
4719-
The name to use for the frequency column. A suitable name will be
4720-
automatically generated if not provided.
4719+
The name to use for the frequency column.
4720+
If not provided, a suitable name will be generated.
47214721
47224722
Returns
47234723
-------
47244724
Table
47254725
Frequency table of this table's values.
47264726
4727+
See Also
4728+
--------
4729+
[`Table.topk`](./expression-tables.qmd#ibis.expr.types.relations.Table.topk)
4730+
[`Column.value_counts`](./expression-generic.qmd#ibis.expr.types.generic.Column.value_counts)
4731+
[`Column.topk`](./expression-generic.qmd#ibis.expr.types.generic.Column.topk)
4732+
47274733
Examples
47284734
--------
47294735
>>> from ibis import examples
@@ -4773,6 +4779,87 @@ def value_counts(self, *, name: str | None = None) -> ir.Table:
47734779
name = "_".join(columns) + "_count"
47744780
return self.group_by(columns).agg(lambda t: t.count().name(name))
47754781

4782+
def topk(self, k: int | None = None, *, name: str | None = None) -> ir.Table:
4783+
"""Get the most frequent values of this table.
4784+
4785+
Parameters
4786+
----------
4787+
k
4788+
Number of top values to return.
4789+
If `None`, all values are returned in descending order.
4790+
name
4791+
The name to use for the frequency column.
4792+
If not provided, a suitable name will be generated.
4793+
4794+
Returns
4795+
-------
4796+
Table
4797+
Frequency table of this table's values.
4798+
4799+
See Also
4800+
--------
4801+
[`Table.value_counts`](./expression-tables.qmd#ibis.expr.types.relations.Table.value_counts)
4802+
[`Column.topk`](./expression-generic.qmd#ibis.expr.types.generic.Column.topk)
4803+
[`Column.value_counts`](./expression-generic.qmd#ibis.expr.types.generic.Column.value_counts)
4804+
4805+
Examples
4806+
--------
4807+
>>> from ibis import examples, selectors as s
4808+
>>> ibis.options.interactive = True
4809+
>>> t = examples.penguins.fetch().select("species", "island", "sex", "year")
4810+
>>> t.head()
4811+
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
4812+
┃ species ┃ island ┃ sex ┃ year ┃
4813+
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
4814+
│ string │ string │ string │ int64 │
4815+
├─────────┼───────────┼────────┼───────┤
4816+
│ Adelie │ Torgersen │ male │ 2007 │
4817+
│ Adelie │ Torgersen │ female │ 2007 │
4818+
│ Adelie │ Torgersen │ female │ 2007 │
4819+
│ Adelie │ Torgersen │ NULL │ 2007 │
4820+
│ Adelie │ Torgersen │ female │ 2007 │
4821+
└─────────┴───────────┴────────┴───────┘
4822+
>>> t.topk().order_by(ibis.desc("species_island_sex_year_count"), s.all() & ~s.index[-1])
4823+
┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
4824+
┃ species ┃ island ┃ sex ┃ year ┃ species_island_sex_year_count ┃
4825+
┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
4826+
│ string │ string │ string │ int64 │ int64 │
4827+
├───────────┼────────┼────────┼───────┼───────────────────────────────┤
4828+
│ Gentoo │ Biscoe │ male │ 2008 │ 23 │
4829+
│ Gentoo │ Biscoe │ female │ 2008 │ 22 │
4830+
│ Gentoo │ Biscoe │ male │ 2009 │ 21 │
4831+
│ Gentoo │ Biscoe │ female │ 2009 │ 20 │
4832+
│ Gentoo │ Biscoe │ male │ 2007 │ 17 │
4833+
│ Gentoo │ Biscoe │ female │ 2007 │ 16 │
4834+
│ Chinstrap │ Dream │ female │ 2007 │ 13 │
4835+
│ Chinstrap │ Dream │ male │ 2007 │ 13 │
4836+
│ Chinstrap │ Dream │ female │ 2009 │ 12 │
4837+
│ Chinstrap │ Dream │ male │ 2009 │ 12 │
4838+
│ … │ … │ … │ … │ … │
4839+
└───────────┴────────┴────────┴───────┴───────────────────────────────┘
4840+
>>> t.topk(3, name="n")
4841+
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━━┓
4842+
┃ species ┃ island ┃ sex ┃ year ┃ n ┃
4843+
┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━━┩
4844+
│ string │ string │ string │ int64 │ int64 │
4845+
├─────────┼────────┼────────┼───────┼───────┤
4846+
│ Gentoo │ Biscoe │ male │ 2008 │ 23 │
4847+
│ Gentoo │ Biscoe │ female │ 2008 │ 22 │
4848+
│ Gentoo │ Biscoe │ male │ 2009 │ 21 │
4849+
└─────────┴────────┴────────┴───────┴───────┘
4850+
"""
4851+
columns = self.columns
4852+
if name is None:
4853+
name = "_".join(columns) + "_count"
4854+
in_desc = (
4855+
self.group_by(columns)
4856+
.agg(lambda t: t.count().name(name))
4857+
.order_by(ibis.desc(name))
4858+
)
4859+
if k is not None:
4860+
in_desc = in_desc.limit(k)
4861+
return in_desc
4862+
47764863
def unnest(
47774864
self, column, /, *, offset: str | None = None, keep_empty: bool = False
47784865
) -> Table:

0 commit comments

Comments
 (0)