|
1 |
| -from typing import Any, Dict, Type, Union |
| 1 | +import sys |
| 2 | +from types import ModuleType |
| 3 | +from typing import Any, Callable, Collection, Dict, List, Tuple, Type, Union, get_type_hints |
2 | 4 |
|
3 | 5 | import polars as pl
|
4 | 6 |
|
5 |
| -from hamilton import base |
# Interpreter-version probe. NOTE(review): this previously guarded an empty
# `if _version_tuple < (3, 11, 0): pass / else: pass` — both branches were
# no-ops, presumably a leftover from a removed conditional import — so the
# dead gate has been dropped. The module-level names are kept in case other
# modules import them.
_sys_version_info = sys.version_info
_version_tuple = (_sys_version_info.major, _sys_version_info.minor, _sys_version_info.micro)
| 14 | + |
| 15 | +# Copied this over from function_graph |
| 16 | +# TODO -- determine the best place to put this code |
| 17 | +from hamilton import base, node, registry |
| 18 | +from hamilton.function_modifiers.expanders import extract_columns |
| 19 | +from hamilton.function_modifiers.recursive import ( |
| 20 | + _default_inject_parameter, |
| 21 | + subdag, |
| 22 | + with_columns_base, |
| 23 | +) |
| 24 | +from hamilton.plugins.polars_extensions import DATAFRAME_TYPE |
6 | 25 |
|
7 | 26 |
|
8 | 27 | class PolarsDataFrameResult(base.ResultMixin):
|
@@ -54,3 +73,216 @@ def build_result(
|
54 | 73 |
|
55 | 74 | def output_type(self) -> Type:
|
56 | 75 | return pl.DataFrame
|
| 76 | + |
| 77 | + |
# TODO -- decide whether this decorator should live here or in a shared location.
class with_columns(with_columns_base):
    """Initializes a with_columns decorator for polars.

    This allows you to efficiently run groups of map operations on a dataframe. We support
    both eager and lazy mode in polars. In case of using eager mode the type should be
    pl.DataFrame and the subsequent operations run on columns with type pl.Series.

    Here's an example of calling in eager mode -- if you've seen ``@subdag``, you should be familiar with
    the concepts:

    .. code-block:: python

        # my_module.py
        def a_b_average(a: pl.Series, b: pl.Series) -> pl.Series:
            return (a + b) / 2


    .. code-block:: python

        # with_columns_module.py
        def a_plus_b(a: pl.Series, b: pl.Series) -> pl.Series:
            return a + b


        # the with_columns call
        @with_columns(
            *[my_module],  # Load from any module
            *[a_plus_b],  # or list operations directly
            columns_to_pass=["a", "b"],  # The columns to pass from the dataframe to
            # the subdag
            select=["a_plus_b", "a_b_average"],  # The columns to append to the dataframe
        )
        def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
            # process, or just return unprocessed
            ...

    In this instance the ``initial_df`` would get two columns added: ``a_plus_b`` and ``a_b_average``.

    Note that the operation is "append", meaning that the columns that are selected are appended
    onto the dataframe.

    If the function takes multiple dataframes, the dataframe input to process will always be
    the first argument. This will be passed to the subdag, transformed, and passed back to the function.
    This follows the hamilton rule of reference by parameter name. To demonstrate this, in the code
    above, the dataframe that is passed to the subdag is `initial_df`. That is transformed
    by the subdag, and then returned as the final dataframe.

    You can read it as:

    "final_df is a function that transforms the upstream dataframe initial_df, running the transformations
    from my_module. It starts with the columns a_from_df and b_from_df, and then adds the columns
    a, b, and a_plus_b to the dataframe. It then returns the dataframe, and does some processing on it."

    In case you need more flexibility you can alternatively use ``on_input``, for example,

    .. code-block:: python

        # with_columns_module.py
        def a_from_df() -> pl.Expr:
            return pl.col(a).alias("a") / 100

        def b_from_df() -> pl.Expr:
            return pl.col(b).alias("b") / 100


        # the with_columns call
        @with_columns(
            *[my_module],
            on_input="initial_df",
            select=["a_from_df", "b_from_df", "a_plus_b", "a_b_average"],
        )
        def final_df(initial_df: pl.DataFrame) -> pl.DataFrame:
            # process, or just return unprocessed
            ...

    the above would output a dataframe where the two columns ``a`` and ``b`` get
    overwritten.
    """

    def __init__(
        self,
        *load_from: Union[Callable, ModuleType],
        columns_to_pass: List[str] = None,
        pass_dataframe_as: str = None,
        on_input: str = None,
        select: List[str] = None,
        namespace: str = None,
        config_required: List[str] = None,
    ):
        """Instantiates a ``@with_columns`` decorator.

        :param load_from: The functions or modules that will be used to generate the group of map operations.
        :param columns_to_pass: The initial schema of the dataframe. This is used to determine which
            upstream inputs should be taken from the dataframe, and which shouldn't. Note that, if this is
            left empty (and external_inputs is as well), we will assume that all dependencies come
            from the dataframe. This cannot be used in conjunction with on_input.
        :param pass_dataframe_as: Not supported for polars -- passing a value raises
            ``NotImplementedError``. Use ``on_input`` instead.
        :param on_input: The name of the dataframe that we're modifying, as known to the subdag.
            If you pass this in, you are responsible for extracting columns out. If not provided, you have
            to pass columns_to_pass in, and we will extract the columns out on the first parameter for you.
        :param select: The end nodes that represent columns to be appended to the original dataframe
            via with_columns. Existing columns will be overridden. The selected nodes need to have the
            corresponding column type, in this case pl.Series, to be appended to the original dataframe.
        :param namespace: The namespace of the nodes, so they don't clash with the global namespace
            and so this can be reused. If its left out, there will be no namespace (in which case you'll want
            to be careful about repeating it/reusing the nodes in other parts of the DAG.)
        :param config_required: the list of config keys that are required to resolve any functions. Pass in None\
            if you want the functions/modules to have access to all possible config.
        """

        if pass_dataframe_as is not None:
            # BUGFIX: the message previously said "pandas" -- this is the polars decorator.
            raise NotImplementedError(
                "We currently do not support pass_dataframe_as for polars. Please reach out if you need this "
                "functionality."
            )

        super().__init__(
            *load_from,
            columns_to_pass=columns_to_pass,
            on_input=on_input,
            select=select,
            namespace=namespace,
            config_required=config_required,
            dataframe_type=DATAFRAME_TYPE,
        )

    def _create_column_nodes(
        self, fn: Callable, inject_parameter: str, params: Dict[str, Type[Type]]
    ) -> List[node.Node]:
        """Builds one node per column in ``self.initial_schema`` by re-emitting the
        injected dataframe through ``extract_columns``.

        :param fn: The decorated function (used for node generation context).
        :param inject_parameter: Name of the dataframe parameter to extract from.
        :param params: Mapping of parameter name -> type for the decorated function.
        :return: The extracted column nodes (the dataframe node itself is excluded).
        """
        output_type = params[inject_parameter]

        def temp_fn(**kwargs) -> Any:
            # Pass-through: re-emits the dataframe so extract_columns has a node to split.
            return kwargs[inject_parameter]

        # We recreate the df node to use extract columns
        temp_node = node.Node(
            name=inject_parameter,
            typ=output_type,
            callabl=temp_fn,
            input_types={inject_parameter: output_type},
        )

        extract_columns_decorator = extract_columns(*self.initial_schema)

        out_nodes = extract_columns_decorator.transform_node(temp_node, config={}, fn=temp_fn)
        # out_nodes[0] is the recreated dataframe node; only the column nodes are needed.
        return out_nodes[1:]

    def get_initial_nodes(
        self, fn: Callable, params: Dict[str, Type[Type]]
    ) -> Tuple[str, Collection[node.Node]]:
        """Selects the correct dataframe and optionally extracts out columns.

        :param fn: The decorated function.
        :param params: Mapping of parameter name -> type for the decorated function.
        :return: Tuple of (name of the injected dataframe parameter, initial column nodes).
            When ``on_input`` (target_dataframe) is set, no columns are extracted and the
            node list is empty.
        """
        inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe)
        with_columns_base.validate_dataframe(
            fn=fn,
            inject_parameter=inject_parameter,
            params=params,
            required_type=self.dataframe_type,
        )

        initial_nodes = (
            []
            if self.target_dataframe is not None
            else self._create_column_nodes(fn=fn, inject_parameter=inject_parameter, params=params)
        )

        return inject_parameter, initial_nodes

    def get_subdag_nodes(self, fn: Callable, config: Dict[str, Any]) -> Collection[node.Node]:
        """Collects the nodes defined by the subdag functions/modules passed to the decorator."""
        return subdag.collect_nodes(config, self.subdag_functions)

    def chain_subdag_nodes(
        self, fn: Callable, inject_parameter: str, generated_nodes: Collection[node.Node]
    ) -> Tuple[Collection[node.Node], str]:
        """Appends a merge node that adds to / overrides columns on the original dataframe
        based on the selected output.

        :param fn: The decorated function.
        :param inject_parameter: Name of the dataframe parameter to merge onto.
        :param generated_nodes: The subdag's nodes.
        :return: Tuple of (all nodes including the merge node, name of the merge node).

        BUGFIX: the return annotation previously claimed ``node.Node`` although a
        ``(nodes, name)`` tuple is returned.
        """

        if self.select is None:
            # Default to every sink node whose type matches this dataframe's column type.
            self.select = [
                sink_node.name
                for sink_node in generated_nodes
                if sink_node.type == registry.get_column_type_from_df_type(self.dataframe_type)
            ]

        def new_callable(**kwargs) -> Any:
            df = kwargs[inject_parameter]
            columns_to_append = {}
            for column in self.select:
                columns_to_append[column] = kwargs[column]

            return df.with_columns(**columns_to_append)

        column_type = registry.get_column_type_from_df_type(self.dataframe_type)
        input_map = {column: column_type for column in self.select}
        input_map[inject_parameter] = self.dataframe_type
        merge_node = node.Node(
            name="__append",
            typ=self.dataframe_type,
            callabl=new_callable,
            input_types=input_map,
        )
        # list() guards against non-list Collections; `+` would fail on e.g. a tuple.
        output_nodes = list(generated_nodes) + [merge_node]
        return output_nodes, merge_node.name

    def validate(self, fn: Callable):
        """Validates that the decorated function's injected parameter is a polars dataframe.

        :param fn: The decorated function.
        :raises: whatever ``with_columns_base.validate_dataframe`` raises on a type mismatch.
        """
        inject_parameter = _default_inject_parameter(fn=fn, target_dataframe=self.target_dataframe)
        params = get_type_hints(fn)
        with_columns_base.validate_dataframe(
            fn=fn,
            inject_parameter=inject_parameter,
            params=params,
            required_type=self.dataframe_type,
        )
0 commit comments