Commit b6b12a2

Add h_polars.with_columns

Supports using with_columns for both polars eager and lazy execution. Inherits from with_columns_base. Some functionality is shared with h_pandas.with_columns -- TODO: refactor out common logic when appropriate.

1 parent debb559 commit b6b12a2

16 files changed: +2402 −123 lines

examples/polars/notebook.ipynb

Lines changed: 121 additions & 119 deletions
Large diffs are not rendered by default.
examples/polars/with_columns/DAG_DataFrame.png, examples/polars/with_columns/DAG_lazy.png

Binary image files added (116 KB each; not rendered).

examples/polars/with_columns/README

Lines changed: 8 additions & 0 deletions
# Using with_columns with Polars

We show the ability to use the familiar `with_columns` from `polars`. Supported for both `pl.DataFrame` and `pl.LazyFrame`.

To see the example, look at the notebook.

![image info](./DAG_DataFrame.png)
![image info](./DAG_lazy.png)
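
A minimal sketch of the eager usage (the function and column names here are illustrative, not the exact ones from the notebook):

```python
import polars as pl

from hamilton.plugins.h_polars import with_columns


def a_plus_b(a: pl.Series, b: pl.Series) -> pl.Series:
    """A simple map operation to run over the dataframe's columns."""
    return a + b


# Eager mode: the decorated function receives a pl.DataFrame, and the
# operations above run on pl.Series columns extracted from it.
@with_columns(
    a_plus_b,  # operations can be listed directly, or loaded from modules
    columns_to_pass=["a", "b"],  # columns the subdag takes from the dataframe
    select=["a_plus_b"],  # results appended back onto the dataframe
)
def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
    return initial_df
```

The lazy variant works the same way with `pl.LazyFrame` (see the second DAG image).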
Lines changed: 51 additions & 0 deletions
```python
import polars as pl

from hamilton.function_modifiers import config

"""
Notes:
1. This file is used for all the [ray|dask|spark]/hello_world examples.
2. It therefore showcases how you can write something once and not only scale it, but port it
   to different frameworks with ease!
"""


@config.when(case="millions")
def avg_3wk_spend__millions(spend: pl.Series) -> pl.Series:
    """Rolling 3 week average spend."""
    return (
        spend.to_frame("spend").select(pl.col("spend").rolling_mean(window_size=3) / 1e6)
    ).to_series(0)


@config.when(case="thousands")
def avg_3wk_spend__thousands(spend: pl.Series) -> pl.Series:
    """Rolling 3 week average spend."""
    return (
        spend.to_frame("spend").select(pl.col("spend").rolling_mean(window_size=3) / 1e3)
    ).to_series(0)


def spend_per_signup(spend: pl.Series, signups: pl.Series) -> pl.Series:
    """The cost per signup in relation to spend."""
    return spend / signups


def spend_mean(spend: pl.Series) -> float:
    """Shows function creating a scalar. In this case it computes the mean of the entire column."""
    return spend.mean()


def spend_zero_mean(spend: pl.Series, spend_mean: float) -> pl.Series:
    """Shows function that takes a scalar. In this case to zero mean spend."""
    return spend - spend_mean


def spend_std_dev(spend: pl.Series) -> float:
    """Function that computes the standard deviation of the spend column."""
    return spend.std()


def spend_zero_mean_unit_variance(spend_zero_mean: pl.Series, spend_std_dev: float) -> pl.Series:
    """Function showing one way to make spend have zero mean and unit variance."""
    return spend_zero_mean / spend_std_dev
```
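
As a standalone sanity check of the rolling-mean logic above (plain polars, no Hamilton; the sample numbers are made up):

```python
import polars as pl

spend = pl.Series("spend", [10_000.0, 10_000.0, 20_000.0, 40_000.0, 40_000.0])

# Mirrors avg_3wk_spend__thousands: a 3-row rolling mean, rescaled to thousands.
avg = (
    spend.to_frame("spend").select(pl.col("spend").rolling_mean(window_size=3) / 1e3)
).to_series(0)

print(avg.to_list())  # [None, None, 13.33..., 23.33..., 33.33...]
```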
Lines changed: 47 additions & 0 deletions
```python
import polars as pl

from hamilton.function_modifiers import config

"""
Notes:
1. This file is used for all the [ray|dask|spark]/hello_world examples.
2. It therefore showcases how you can write something once and not only scale it, but port it
   to different frameworks with ease!
"""


@config.when(case="millions")
def avg_3wk_spend__millions(spend: pl.Expr) -> pl.Expr:
    """Rolling 3 week average spend."""
    return spend.rolling_mean(window_size=3) / 1e6


@config.when(case="thousands")
def avg_3wk_spend__thousands(spend: pl.Expr) -> pl.Expr:
    """Rolling 3 week average spend."""
    return spend.rolling_mean(window_size=3) / 1e3


def spend_per_signup(spend: pl.Expr, signups: pl.Expr) -> pl.Expr:
    """The cost per signup in relation to spend."""
    return spend / signups


def spend_mean(spend: pl.Expr) -> float:
    """Shows function creating a scalar. In this case it computes the mean of the entire column."""
    return spend.mean()


def spend_zero_mean(spend: pl.Expr, spend_mean: float) -> pl.Expr:
    """Shows function that takes a scalar. In this case to zero mean spend."""
    return spend - spend_mean


def spend_std_dev(spend: pl.Expr) -> float:
    """Function that computes the standard deviation of the spend column."""
    return spend.std()


def spend_zero_mean_unit_variance(spend_zero_mean: pl.Expr, spend_std_dev: float) -> pl.Expr:
    """Function showing one way to make spend have zero mean and unit variance."""
    return spend_zero_mean / spend_std_dev
```
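
A hedged sketch of how these expression-based functions plug into the lazy path. Per the commit message, `with_columns` supports lazy execution; the exact import path for the lazy variant isn't shown in this diff, so treat it as an assumption:

```python
import polars as pl

# Assumption: the same decorator handles lazy frames, per the commit message.
from hamilton.plugins.h_polars import with_columns


def spend_per_signup(spend: pl.Expr, signups: pl.Expr) -> pl.Expr:
    """In lazy mode, the map operations are pl.Expr -> pl.Expr."""
    return spend / signups


@with_columns(
    spend_per_signup,
    columns_to_pass=["spend", "signups"],
    select=["spend_per_signup"],
)
def final_lazy_df(initial_df: pl.LazyFrame) -> pl.LazyFrame:
    # Still lazy here: nothing executes until the caller calls .collect().
    return initial_df
```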

examples/polars/with_columns/notebook.ipynb

Lines changed: 1239 additions & 0 deletions
Large diffs are not rendered by default.
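
Since the notebook diff isn't rendered, here is a hedged sketch of how such an example is typically driven. The module name `pipeline` is hypothetical; the Builder/execute calls are standard Hamilton API:

```python
import polars as pl

from hamilton import driver

import pipeline  # hypothetical module holding the @with_columns-decorated final_df

dr = (
    driver.Builder()
    .with_modules(pipeline)
    .with_config({"case": "thousands"})  # picks the @config.when variants above
    .build()
)

initial_df = pl.DataFrame(
    {"signups": [1, 10, 50, 100, 200], "spend": [10.0, 10.0, 20.0, 40.0, 40.0]}
)
result = dr.execute(["final_df"], inputs={"initial_df": initial_df})["final_df"]
print(result)
```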

hamilton/plugins/h_polars.py

Lines changed: 234 additions & 2 deletions
```diff
@@ -1,8 +1,27 @@
-from typing import Any, Dict, Type, Union
+import sys
+from types import ModuleType
+from typing import Any, Callable, Collection, Dict, List, Tuple, Type, Union, get_type_hints
 
 import polars as pl
 
-from hamilton import base
+_sys_version_info = sys.version_info
+_version_tuple = (_sys_version_info.major, _sys_version_info.minor, _sys_version_info.micro)
+
+if _version_tuple < (3, 11, 0):
+    pass
+else:
+    pass
+
+# Copied this over from function_graph
+# TODO -- determine the best place to put this code
+from hamilton import base, node, registry
+from hamilton.function_modifiers.expanders import extract_columns
+from hamilton.function_modifiers.recursive import (
+    _default_inject_parameter,
+    subdag,
+    with_columns_base,
+)
+from hamilton.plugins.polars_extensions import DATAFRAME_TYPE
 
 
 class PolarsDataFrameResult(base.ResultMixin):
```
The second hunk (@@ -54,3 +73,216 @@) appends the new decorator class after the existing PolarsDataFrameResult (whose output_type() returns pl.DataFrame):

```python
# Do we need this here?
class with_columns(with_columns_base):
    """Initializes a with_columns decorator for polars.

    This allows you to efficiently run groups of map operations on a dataframe. We support
    both eager and lazy mode in polars. When using eager mode, the type should be
    pl.DataFrame and the subsequent operations run on columns with type pl.Series.

    Here's an example of calling it in eager mode -- if you've seen ``@subdag``, you should be familiar with
    the concepts:

    .. code-block:: python

        # my_module.py
        def a_b_average(a: pl.Series, b: pl.Series) -> pl.Series:
            return (a + b) / 2


    .. code-block:: python

        # with_columns_module.py
        def a_plus_b(a: pl.Series, b: pl.Series) -> pl.Series:
            return a + b


        # the with_columns call
        @with_columns(
            *[my_module],  # Load from any module
            *[a_plus_b],  # or list operations directly
            columns_to_pass=["a", "b"],  # The columns to pass from the dataframe to
            # the subdag
            select=["a_plus_b", "a_b_average"],  # The columns to append to the dataframe
        )
        def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
            # process, or just return unprocessed
            ...

    In this instance the ``initial_df`` would get two columns added: ``a_plus_b`` and ``a_b_average``.

    Note that the operation is "append", meaning that the columns that are selected are appended
    onto the dataframe.

    If the function takes multiple dataframes, the dataframe input to process will always be
    the first argument. This will be passed to the subdag, transformed, and passed back to the function.
    This follows the hamilton rule of reference by parameter name. To demonstrate this, in the code
    above, the dataframe that is passed to the subdag is `initial_df`. That is transformed
    by the subdag, and then returned as the final dataframe.

    You can read it as:

    "final_df is a function that transforms the upstream dataframe initial_df, running the transformations
    from my_module. It starts with the columns a and b, appends the columns a_plus_b and a_b_average
    to the dataframe, and then returns the dataframe after doing some processing on it."

    In case you need more flexibility you can alternatively use ``on_input``, for example,

    .. code-block:: python

        # with_columns_module.py
        def a_from_df() -> pl.Expr:
            return pl.col("a").alias("a") / 100

        def b_from_df() -> pl.Expr:
            return pl.col("b").alias("b") / 100


        # the with_columns call
        @with_columns(
            *[my_module],
            on_input="initial_df",
            select=["a_from_df", "b_from_df", "a_plus_b", "a_b_average"],
        )
        def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
            # process, or just return unprocessed
            ...

    the above would output a dataframe where the two columns ``a`` and ``b`` get
    overwritten.
    """

    def __init__(
        self,
        *load_from: Union[Callable, ModuleType],
        columns_to_pass: List[str] = None,
        pass_dataframe_as: str = None,
        on_input: str = None,
        select: List[str] = None,
        namespace: str = None,
        config_required: List[str] = None,
    ):
        """Instantiates a ``@with_columns`` decorator.

        :param load_from: The functions or modules that will be used to generate the group of map operations.
        :param columns_to_pass: The initial schema of the dataframe. This is used to determine which
            upstream inputs should be taken from the dataframe, and which shouldn't. Note that, if this is
            left empty (and on_input is as well), we will assume that all dependencies come
            from the dataframe. This cannot be used in conjunction with on_input.
        :param on_input: The name of the dataframe that we're modifying, as known to the subdag.
            If you pass this in, you are responsible for extracting columns out. If not provided, you have
            to pass columns_to_pass in, and we will extract the columns out of the first parameter for you.
        :param select: The end nodes that represent columns to be appended to the original dataframe
            via with_columns. Existing columns will be overridden. The selected nodes need to have the
            corresponding column type, in this case pl.Series, to be appended to the original dataframe.
        :param namespace: The namespace of the nodes, so they don't clash with the global namespace
            and so this can be reused. If it's left out, there will be no namespace (in which case you'll want
            to be careful about repeating it/reusing the nodes in other parts of the DAG.)
        :param config_required: the list of config keys that are required to resolve any functions. Pass in None
            if you want the functions/modules to have access to all possible config.
        """

        if pass_dataframe_as is not None:
            raise NotImplementedError(
                "We currently do not support pass_dataframe_as for polars. Please reach out if you need this "
                "functionality."
            )

        super().__init__(
            *load_from,
            columns_to_pass=columns_to_pass,
            on_input=on_input,
            select=select,
            namespace=namespace,
            config_required=config_required,
            dataframe_type=DATAFRAME_TYPE,
        )

    def _create_column_nodes(
        self, fn: Callable, inject_parameter: str, params: Dict[str, Type[Type]]
    ) -> List[node.Node]:
        output_type = params[inject_parameter]

        def temp_fn(**kwargs) -> Any:
            return kwargs[inject_parameter]

        # We recreate the df node to use extract columns
        temp_node = node.Node(
            name=inject_parameter,
            typ=output_type,
            callabl=temp_fn,
            input_types={inject_parameter: output_type},
        )

        extract_columns_decorator = extract_columns(*self.initial_schema)

        out_nodes = extract_columns_decorator.transform_node(temp_node, config={}, fn=temp_fn)
        return out_nodes[1:]

    def get_initial_nodes(
        self, fn: Callable, params: Dict[str, Type[Type]]
    ) -> Tuple[str, Collection[node.Node]]:
        """Selects the correct dataframe and optionally extracts out columns."""
        inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe)
        with_columns_base.validate_dataframe(
            fn=fn,
            inject_parameter=inject_parameter,
            params=params,
            required_type=self.dataframe_type,
        )

        initial_nodes = (
            []
            if self.target_dataframe is not None
            else self._create_column_nodes(fn=fn, inject_parameter=inject_parameter, params=params)
        )

        return inject_parameter, initial_nodes

    def get_subdag_nodes(self, fn: Callable, config: Dict[str, Any]) -> Collection[node.Node]:
        return subdag.collect_nodes(config, self.subdag_functions)

    def chain_subdag_nodes(
        self, fn: Callable, inject_parameter: str, generated_nodes: Collection[node.Node]
    ) -> Tuple[Collection[node.Node], str]:
        """Adds a node that appends to / overrides columns of the original dataframe based on the selected output."""

        if self.select is None:
            self.select = [
                sink_node.name
                for sink_node in generated_nodes
                if sink_node.type == registry.get_column_type_from_df_type(self.dataframe_type)
            ]

        def new_callable(**kwargs) -> Any:
            df = kwargs[inject_parameter]
            columns_to_append = {}
            for column in self.select:
                columns_to_append[column] = kwargs[column]

            return df.with_columns(**columns_to_append)

        column_type = registry.get_column_type_from_df_type(self.dataframe_type)
        input_map = {column: column_type for column in self.select}
        input_map[inject_parameter] = self.dataframe_type
        merge_node = node.Node(
            name="__append",
            typ=self.dataframe_type,
            callabl=new_callable,
            input_types=input_map,
        )
        output_nodes = generated_nodes + [merge_node]
        return output_nodes, merge_node.name

    def validate(self, fn: Callable):
        inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe)
        params = get_type_hints(fn)
        with_columns_base.validate_dataframe(
            fn=fn,
            inject_parameter=inject_parameter,
            params=params,
            required_type=self.dataframe_type,
        )
```
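
To make the generated __append node concrete, here is what its callable boils down to in plain polars (a standalone illustration, not code from the commit):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2], "b": [10, 20]})

# For select=["a_plus_b"], the merge node receives the dataframe plus one
# pl.Series per selected column, and appends them via keyword arguments.
columns_to_append = {"a_plus_b": df["a"] + df["b"]}
result = df.with_columns(**columns_to_append)

print(result.columns)  # ['a', 'b', 'a_plus_b']
```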
