
Commit d534e72
WIP
1 parent 2d1a929

8 files changed (+325, -8 lines)


examples/materialization/README.md

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
# Materialization

Hamilton's driver allows for ad-hoc materialization. This enables you to take a DAG you already have
and save your data to a set of custom locations/URLs.

Note that these materializers are _isomorphic_ in nature to the
[@save_to](https://hamilton.dagworks.io/en/latest/reference/decorators/save_to/)
decorator. Materializers inject an additional node at runtime, modifying the
DAG to include a data saver node, and return the metadata around materialization.
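
For intuition, the following is a rough sketch of the decorator-based equivalent of one
materialization in this example. The exact keyword arguments of `@save_to` are an
assumption here; see the linked docs for the authoritative API:

```python
from hamilton.function_modifiers import save_to, value


# Hypothetical decorator-based equivalent of the to.json(...) materializer used in run.py
@save_to.json(path=value("./params.json"), output_name_="model_params_to_json")
def model_parameters(fit_clf) -> dict:
    """Returns a dictionary of model parameters; saved to JSON via the decorator."""
    return fit_clf.get_params()
```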

This framework is meant to be highly pluggable. While the set of available data savers is currently
limited, we expect folks to build their own materializers (and, hopefully, contribute them back to the community!).

## Example

In this example we take the scikit-learn iris_loader pipeline and materialize outputs to specific
locations through a driver call. We demonstrate:

1. Saving model parameters to a JSON file (using the default JSON materializer)
2. Writing custom data adapters for:
   1. Pickling a model to an object file
   2. Saving confusion matrices to a CSV file

See [run.py](run.py) for the full example.

## `driver.materialize`

This will be a high-level overview. For more details,
see the [documentation](https://hamilton.dagworks.io/en/latest/reference/drivers/Driver/#hamilton.driver.Driver.materialize).

`driver.materialize()` does the following (a condensed sketch follows this list):

1. Processes a list of materializers to create a new DAG
2. Alters the output to include the materializer nodes
3. Processes a list of "additional variables" (for debugging) to return intermediary data
4. Executes the DAG, including the materializers
5. Returns a tuple of (`materialization metadata`, `additional variables`)
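
The sketch below condenses the call in [run.py](run.py); `dag_config` is assumed to be
defined as in that script:

```python
from hamilton import driver
from hamilton.io.materialization import to

# dr is a driver built over the example modules (see run.py for the full setup)
dr = driver.Builder().with_config(dag_config).with_modules(model_training).build()

metadata, additional_vars = dr.materialize(
    to.json(
        id="model_params_to_json",          # name of the injected saver node
        dependencies=["model_parameters"],  # upstream node whose output is saved
        path="./params.json",
    ),
    additional_vars=["classification_report"],  # intermediary data to return
)
```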

Materializers each consume:

1. A `dependencies` list to materialize
2. An (optional) `combine` parameter to combine the outputs of the dependencies
   (this is required if there are multiple dependencies). This is a [ResultMixin](https://hamilton.dagworks.io/en/latest/concepts/customizing-execution/#result-builders) object (see the sketch after this list)
3. An `id` parameter to identify the materializer, which serves as the node name in the DAG
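
A minimal sketch of `combine` with multiple dependencies. The node names here are
hypothetical, and `PandasDataFrameResult` is one `ResultMixin` that joins outputs into a
dataframe:

```python
from hamilton import base
from hamilton.io.materialization import to

joined_saver = to.csv(
    id="joined_outputs_to_csv",              # node name in the DAG
    dependencies=["predictions", "labels"],  # hypothetical upstream nodes
    combine=base.PandasDataFrameResult(),    # joins the two outputs into one frame
    path="./joined_outputs.csv",
)
```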

Materializers are referenced by the `to` object in `hamilton.io.materialization`, which utilizes
dynamic dispatch to create the appropriate materializer.

These refer to `DataSaver` classes, which are keyed by a string (e.g., `csv`).
Multiple data adapters can share the same key, with each applying to a specific type
(e.g., a pandas dataframe, a numpy matrix, or a polars dataframe). New
data adapters are registered by calling `hamilton.registry.register_adapter`.

## Custom Materializers

To define a custom materializer, all you have to do is implement the `DataSaver` class
(which allows use in `save_to` as well). A minimal sketch, modeled on the adapters in this example:
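
```python
import dataclasses
from typing import Any, Collection, Dict, Type

from hamilton import registry
from hamilton.io import utils
from hamilton.io.data_adapters import DataSaver


@dataclasses.dataclass
class TextSaver(DataSaver):
    """Hypothetical adapter that writes a string to a text file."""

    path: str

    def save_data(self, data: str) -> Dict[str, Any]:
        with open(self.path, "w") as f:
            f.write(data)
        return utils.get_file_metadata(self.path)

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        return [str]

    @classmethod
    def name(cls) -> str:
        return "text"  # would be referenced as to.text(...)


registry.register_adapter(TextSaver)
```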
## `driver.materialize` vs `@save_to`
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
import dataclasses
import pickle
from typing import Any, Collection, Dict, Type

import numpy as np
from sklearn import base

from hamilton import registry
from hamilton.io import utils
from hamilton.io.data_adapters import DataSaver

# TODO -- put this back in the standard library


@dataclasses.dataclass
class NumpyMatrixToCSV(DataSaver):
    """Saves a numpy matrix to a CSV file at the given path."""

    path: str
    sep: str = ","

    def __post_init__(self):
        if not self.path.endswith(".csv"):
            raise ValueError(f"CSV files must end with .csv, got {self.path}")

    def save_data(self, data: np.ndarray) -> Dict[str, Any]:
        np.savetxt(self.path, data, delimiter=self.sep)
        return utils.get_file_metadata(self.path)

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        return [np.ndarray]

    @classmethod
    def name(cls) -> str:
        return "csv"


@dataclasses.dataclass
class SKLearnPickler(DataSaver):
    """Pickles a fit scikit-learn classifier to the given path."""

    path: str

    def save_data(self, data: base.ClassifierMixin) -> Dict[str, Any]:
        with open(self.path, "wb") as f:  # context manager so the handle is closed
            pickle.dump(data, f)
        return utils.get_file_metadata(self.path)

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        return [base.ClassifierMixin]

    @classmethod
    def name(cls) -> str:
        return "pickle"


for adapter in [NumpyMatrixToCSV, SKLearnPickler]:
    registry.register_adapter(adapter)
examples/materialization/data_loaders.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
"""
Module to load iris/digit data.
"""

import numpy as np
from sklearn import datasets, utils

from hamilton.function_modifiers import config


@config.when(data_loader="iris")
def data__iris() -> utils.Bunch:
    return datasets.load_iris()


@config.when(data_loader="digits")
def data__digits() -> utils.Bunch:
    return datasets.load_digits()


def target(data: utils.Bunch) -> np.ndarray:
    return data.target


def target_names(data: utils.Bunch) -> np.ndarray:
    return data.target_names


def feature_matrix(data: utils.Bunch) -> np.ndarray:
    return data.data
examples/materialization/model_training.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
from typing import Dict

import numpy as np
from sklearn import base, linear_model, metrics, svm
from sklearn.model_selection import train_test_split

from hamilton import function_modifiers


@function_modifiers.config.when(clf="svm")
def prefit_clf__svm(gamma: float = 0.001) -> base.ClassifierMixin:
    """Returns an unfitted SVM classifier object.

    :param gamma: kernel coefficient for the SVM
    :return: unfitted SVM classifier
    """
    return svm.SVC(gamma=gamma)


@function_modifiers.config.when(clf="logistic")
def prefit_clf__logreg(penalty: str) -> base.ClassifierMixin:
    """Returns an unfitted Logistic Regression classifier object.

    :param penalty: regularization penalty to use (e.g. "l2")
    :return: unfitted logistic regression classifier
    """
    return linear_model.LogisticRegression(penalty)


@function_modifiers.extract_fields(
    {"X_train": np.ndarray, "X_test": np.ndarray, "y_train": np.ndarray, "y_test": np.ndarray}
)
def train_test_split_func(
    feature_matrix: np.ndarray,
    target: np.ndarray,
    test_size_fraction: float,
    shuffle_train_test_split: bool,
) -> Dict[str, np.ndarray]:
    """Function that creates the training & test splits.

    This is then extracted out into constituent components and used downstream.

    :param feature_matrix: feature matrix to split
    :param target: target vector to split
    :param test_size_fraction: fraction of the data to hold out for testing
    :param shuffle_train_test_split: whether to shuffle before splitting
    :return: dictionary of the four splits
    """
    X_train, X_test, y_train, y_test = train_test_split(
        feature_matrix, target, test_size=test_size_fraction, shuffle=shuffle_train_test_split
    )
    return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}


def y_test_with_labels(y_test: np.ndarray, target_names: np.ndarray) -> np.ndarray:
    """Adds labels to the target output."""
    return np.array([target_names[idx] for idx in y_test])


def fit_clf(
    prefit_clf: base.ClassifierMixin, X_train: np.ndarray, y_train: np.ndarray
) -> base.ClassifierMixin:
    """Calls fit on the classifier object; it mutates it."""
    prefit_clf.fit(X_train, y_train)
    return prefit_clf


def predicted_output(fit_clf: base.ClassifierMixin, X_test: np.ndarray) -> np.ndarray:
    """Exercises the fit classifier to perform a prediction."""
    return fit_clf.predict(X_test)


def predicted_output_with_labels(
    predicted_output: np.ndarray, target_names: np.ndarray
) -> np.ndarray:
    """Replaces the predictions with the desired labels."""
    return np.array([target_names[idx] for idx in predicted_output])


def classification_report(
    predicted_output_with_labels: np.ndarray, y_test_with_labels: np.ndarray
) -> str:
    """Returns a classification report."""
    return metrics.classification_report(y_test_with_labels, predicted_output_with_labels)


def confusion_matrix(
    predicted_output_with_labels: np.ndarray, y_test_with_labels: np.ndarray
) -> np.ndarray:
    """Returns a confusion matrix report."""
    return metrics.confusion_matrix(y_test_with_labels, predicted_output_with_labels)


def model_parameters(fit_clf: base.ClassifierMixin) -> dict:
    """Returns a dictionary of model parameters."""
    return fit_clf.get_params()
examples/materialization/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
scikit-learn
sf-hamilton

examples/materialization/run.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
"""
Example script showing how one might set up a generic model training pipeline that is quickly configurable.
"""

# Required import to register adapters
import data_loaders
import model_training

from hamilton import base, driver
from hamilton.io.materialization import to


def get_model_config(model_type: str) -> dict:
    """Returns model type specific configuration"""
    if model_type == "svm":
        return {"clf": "svm", "gamma": 0.001}
    elif model_type == "logistic":
        return {"clf": "logistic", "penalty": "l2"}
    else:
        raise ValueError(f"Unsupported model {model_type}.")


if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        print("Error: required arguments are [iris|digits] [svm|logistic]")
        sys.exit(1)
    _data_set = sys.argv[1]  # the data set to load
    _model_type = sys.argv[2]  # the model type to fit and evaluate with

    dag_config = {
        "test_size_fraction": 0.5,
        "shuffle_train_test_split": True,
    }
    # augment config
    dag_config.update(get_model_config(_model_type))
    dag_config["data_loader"] = _data_set
    dr = (
        driver.Builder()
        .with_adapter(base.DefaultAdapter())
        .with_config(dag_config)
        .with_modules(data_loaders, model_training)
        .build()
    )
    materializers = [
        # materialize model parameters to json
        to.json(dependencies=["model_parameters"], id="model_params_to_json", path="./params.json"),
        # classification report to .txt file
        to.file(
            dependencies=["classification_report"],
            id="classification_report_to_txt",
            path="./classification_report.txt",
        ),
        # materialize the model to a pickle file
        to.pickle(dependencies=["prefit_clf"], id="prefit_clf_to_pickle", path="./prefit_clf.pkl"),
        # materialize the predictions we made to a csv file
        to.csv(
            dependencies=["predicted_output_with_labels"],
            id="predicted_output_with_labels_to_csv",
            path="./predicted_output_with_labels.csv",
        ),
    ]
    dr.visualize_materialization(
        *materializers,
        additional_vars=["classification_report"],
        output_file_path="./dag",
        render_kwargs={},
    )
    materialization_results, additional_vars = dr.materialize(
        *materializers,
        additional_vars=["classification_report"],
    )
    print(additional_vars["classification_report"])
    print(materialization_results)
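
Usage, assuming the requirements above are installed: run `python run.py digits svm` (or any combination of `[iris|digits]` and `[svm|logistic]`) from this directory to materialize the outputs and print the classification report.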

hamilton/function_modifiers/dependencies.py

Lines changed: 2 additions & 2 deletions
@@ -29,15 +29,15 @@ class SingleDependency(ParametrizedDependency, abc.ABC):


 @dataclasses.dataclass
-class LiteralDependency(ParametrizedDependency):
+class LiteralDependency(SingleDependency):
     value: Any

     def get_dependency_type(self) -> ParametrizedDependencySource:
         return ParametrizedDependencySource.LITERAL


 @dataclasses.dataclass
-class UpstreamDependency(ParametrizedDependency):
+class UpstreamDependency(SingleDependency):
     source: str

     def get_dependency_type(self) -> ParametrizedDependencySource:

hamilton/io/materialization.py

Lines changed: 7 additions & 6 deletions
@@ -7,7 +7,7 @@
 from hamilton.function_modifiers.dependencies import SingleDependency, value
 from hamilton.graph import FunctionGraph
 from hamilton.io.data_adapters import DataSaver
-from hamilton.registry import LOADER_REGISTRY
+from hamilton.registry import SAVER_REGISTRY


 class materialization_meta__(type):
@@ -19,17 +19,17 @@ class in registry, or make it a function that just proxies to the decorator. We
     """

     def __getattr__(cls, item: str):
-        if item in LOADER_REGISTRY:
-            potential_loaders = LOADER_REGISTRY[item]
+        if item in SAVER_REGISTRY:
+            potential_loaders = SAVER_REGISTRY[item]
             savers = [loader for loader in potential_loaders if issubclass(loader, DataSaver)]
             if len(savers) > 0:
-                return Materialize.partial(LOADER_REGISTRY[item])
+                return Materialize.partial(SAVER_REGISTRY[item])
         try:
             return super().__getattribute__(item)
         except AttributeError as e:
             raise AttributeError(
-                f"No loader named: {item} available for {cls.__name__}. "
-                f"Available loaders are: {LOADER_REGISTRY.keys()}. "
+                f"No data materializer named: {item}. "
+                f"Available materializers are: {SAVER_REGISTRY.keys()}. "
                 f"If you've gotten to this point, you either (1) spelled the "
                 f"loader name wrong, (2) are trying to use a loader that does"
                 f"not exist (yet)"
@@ -76,6 +76,7 @@ def _process_kwargs(
         """
         processed_kwargs = {}
         for kwarg, kwarg_val in data_saver_kwargs.items():
+
             if not isinstance(kwarg_val, SingleDependency):
                 processed_kwargs[kwarg] = value(kwarg_val)
             else: