Progress on type hinting

amol- · amol- · commit d192e64524c3 · 2025-03-26T15:57:42.000+01:00
diff --git a/examples/pipeline_boosted_tree_classifier.py b/examples/pipeline_boosted_tree_classifier.py
@@ -16,7 +16,7 @@
 
 PRINT_SQL = False
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 # Load Ames Housing for classification
 ames = fetch_openml(name="house_prices", as_frame=True)
@@ -91,7 +91,6 @@ def categorize_price(price: float) -> str:
 
 # Convert types from numpy to mustela types
 features = mustela.types.guess_datatypes(X)
-print("Mustela Features:", features)
 
 # Target only 5 rows, so that it's easier for a human to understand
 data_sample = X.head(5)
diff --git a/examples/pipeline_boosted_tree_regressor.py b/examples/pipeline_boosted_tree_regressor.py
@@ -17,7 +17,7 @@
 PRINT_SQL = False
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 ames = fetch_openml(name="house_prices", as_frame=True)
 ames = ames.frame
@@ -81,13 +81,11 @@
 )
 model.fit(X, y)
 
-features = mustela.types.guess_datatypes(X)
-print("Mustela Features:", features)
-
 # Create a small set of data for the prediction
 # It's easier to understand if it's small
 data_sample = X.head(5)
 
+features = mustela.types.guess_datatypes(X)
 mustela_pipeline = mustela.parse_pipeline(model, features=features)
 print(mustela_pipeline)
 
diff --git a/examples/pipeline_decision_tree_classifier.py b/examples/pipeline_decision_tree_classifier.py
@@ -17,7 +17,7 @@
 PRINT_SQL = False
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Change to DEBUG to see each translation step.
 
 iris = load_iris()
 df = pd.DataFrame(
diff --git a/examples/pipeline_decision_tree_regressor.py b/examples/pipeline_decision_tree_regressor.py
@@ -16,7 +16,7 @@
 
 PRINT_SQL = False
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 # Carica il dataset
 iris = load_iris()
diff --git a/examples/pipeline_elasticnet.py b/examples/pipeline_elasticnet.py
@@ -13,40 +13,38 @@
 PRINT_SQL = False
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 # Load the Iris dataset
 iris = load_iris(as_frame=True)
+iris_x = iris.data
 
-# Define column names for consistency
-names = ["sepal.length", "sepal.width", "petal.length", "petal.width"]
+# SQL and Mustela don't like dots in column names, replace them with underscores
+iris_x.columns = [cname.replace(".", "_") for cname in iris_x.columns]
 
-iris_x = iris.data.set_axis(names, axis=1)
+numeric_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
+iris_x = iris_x.set_axis(numeric_cols, axis=1)
 
 # Create a pipeline with ElasticNet instead of LinearRegression
 pipeline = Pipeline(
     [
         (
             "preprocess",
             ColumnTransformer(
-                [("scaler", StandardScaler(with_std=False), names)],
+                [("scaler", StandardScaler(with_std=False), numeric_cols)],
                 remainder="passthrough",
             ),
         ),
         ("elastic_net", ElasticNet(alpha=0.1, l1_ratio=0.5)),  # ElasticNet with L1/L2 regularization
     ]
 )
 
-# Train the pipeline
 pipeline.fit(iris_x, iris.target)
 
-print(iris_x.columns)
-
-# Identify feature types for Mustela
+# Convenience for this example to avoid repeating the schema;
+# in real cases, the user would know the schema of their database.
 features = mustela.types.guess_datatypes(iris_x)
-print("Mustela Features:", features)
 
-# Convert the pipeline into SQL with Mustela
 mustela_pipeline = mustela.parse_pipeline(pipeline, features=features)
 print(mustela_pipeline)
 
@@ -60,20 +58,23 @@
     }
 )
 
-# Generate an SQL query using Mustela
+# Generate a query expression using Mustela
 ibis_expression = mustela.translate(ibis.memtable(example_data), mustela_pipeline)
 
+con = ibis.duckdb.connect()
+
 if PRINT_SQL:
+    sql = mustela.export_sql("DATA_TABLE", mustela_pipeline, dialect="duckdb")
     print("\nGenerated Query for DuckDB:")
-    con = ibis.duckdb.connect()
-    print(con.compile(ibis_expression))
+    print(sql)
+    print("\nPrediction with SQL")
+    # We need to create the table for the SQL to query it.
+    con.create_table(ibis_table.get_name(), obj=ibis_table)
+    print(con.raw_sql(sql).df())
 
-# Predictions using Ibis
 print("\nPrediction with Ibis")
 print(ibis_expression.execute())
 
-# Predictions using SKLearn
-new_column_names = [name.replace("_", ".") for name in example_data.column_names]  # SkLearn uses dots in column names
-renamed_example_data = example_data.rename_columns(new_column_names).to_pandas()
-predictions = pipeline.predict(renamed_example_data)
+print("\nPrediction with SKLearn")
+predictions = pipeline.predict(example_data.to_pandas())
 print(predictions)
diff --git a/examples/pipeline_lasso.py b/examples/pipeline_lasso.py
@@ -14,7 +14,7 @@
 PRINT_SQL = False
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 iris = load_iris(as_frame=True)
 
diff --git a/examples/pipeline_lineareg.py b/examples/pipeline_lineareg.py
@@ -14,7 +14,7 @@
 PRINT_SQL = True
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.INFO)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 iris = load_iris(as_frame=True)
 iris_x = iris.data
@@ -39,9 +39,9 @@
 )
 pipeline.fit(iris_x, iris.target)
 
-
+# Convenience for this example to avoid repeating the schema;
+# in real cases, the user would know the schema of their database.
 features = mustela.types.guess_datatypes(iris_x)
-print("Mustela Features:", features)
 
 mustela_pipeline = mustela.parse_pipeline(pipeline, features=features)
 print(mustela_pipeline)
diff --git a/examples/pipeline_logisticreg.py b/examples/pipeline_logisticreg.py
@@ -16,7 +16,7 @@
 PRINT_SQL = False
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 # Carichiamo il dataset iris e creiamo un DataFrame
 iris = load_iris()
diff --git a/examples/pipeline_randforest_classifier.py b/examples/pipeline_randforest_classifier.py
@@ -16,7 +16,7 @@
 PRINT_SQL = False
 
 logging.basicConfig(level=logging.INFO)
-logging.getLogger("mustela").setLevel(logging.DEBUG)
+logging.getLogger("mustela").setLevel(logging.INFO)  # Set DEBUG to see translation process.
 
 iris = load_iris()
 df = pd.DataFrame(iris.data, columns=["sepal_length", "sepal_width", "petal_length", "petal_width"])
diff --git a/src/mustela/_utils/repr_pipeline.py b/src/mustela/_utils/repr_pipeline.py
@@ -65,7 +65,8 @@ def _attributes(self, attributes: typing.Iterable[_onnx.AttributeProto]) -> str:
         def _attr_value(attr: _onnx.AttributeProto) -> str:
             return self._shorten(str(get_attr_value(attr)))
 
-        return ", ".join((f"{attr.name}={_attr_value(attr)}" for attr in attributes))
+        indent = "\n        "
+        return indent + indent.join((f"{attr.name}={_attr_value(attr)}" for attr in attributes))
 
     def _shorten(self, value: str) -> str:
         """Shorten a string to maxlen characters."""
diff --git a/src/mustela/translation/steps/argmax.py b/src/mustela/translation/steps/argmax.py
@@ -29,10 +29,9 @@ class ArgMaxTranslator(Translator):
     on which to perform a prediction/classification (row).
     """
 
-    # https://onnx.ai/onnx/operators/onnx__ArgMax.html
-
     def process(self) -> None:
         """Performs the translation and set the output variable."""
+        # https://onnx.ai/onnx/operators/onnx__ArgMax.html
         data = self._variables.consume(self.inputs[0])
         axis = self._attributes.get("axis", 1)
         keepdims = self._attributes.get("keepdims", 1)
diff --git a/src/mustela/translation/steps/arrayfeatureextractor.py b/src/mustela/translation/steps/arrayfeatureextractor.py
@@ -6,35 +6,50 @@
 
 
 class ArrayFeatureExtractorTranslator(Translator):
-    """Processes an ArgMax node and updates the variables with the output expression."""
-
-
-    def process(self):
+    """Processes an ArrayFeatureExtractor node and updates the variables with the output expression.
+    
+    ArrayFeatureExtractor can be considered the opposite of :class:`ConcatTranslator`, as
+    in most cases it will be used to pick one or more features out of a group of columns
+    previously concatenated, or to pick a specific feature out of the result of an ArgMax operation.
+
+    The provided indices always refer to the **last** axis of the input tensor.
+    If the input is a 2D tensor, the last axis is the column axis. So an index
+    of ``0`` would mean the first column. If the input is a 1D tensor instead the
+    last axis is the row axis. So an index of ``0`` would mean the first row.
+
+    This could be confusing because axes are inverted between tensors and mustela column groups.
+    In the case of tensors, axis=0 means row=0, while in the case of mustela
+    column groups (by virtue of being a group of columns), axis=0 means
+    the first column.
+
+    We have to consider that the indices we receive, in the case of column groups,
+    are actually column indices, not row indices: as with a 2D tensor,
+    the last axis is the column axis. In the case of a single column,
+    the index is instead the index of a row, like it would be with a 1D tensor.
+    """
+    def process(self) -> None:
+        """Performs the translation and set the output variable."""
         # https://onnx.ai/onnx/operators/onnx_aionnxml_ArrayFeatureExtractor.html
 
-        # Given an array of features, grab only one of them
-        # This probably is used to extract a single feature from a list of features
-        # Previously made by Concat.
-        # Or to pick the right feature from the result of ArgMax
         data = self._variables.consume(self.inputs[0])
         indices = self._variables.consume(self.inputs[1])
 
-        data_keys = None
-        if isinstance(data, dict):
-            # This expects that dictionaries are sorted by insertion order
-            # AND that all values of the dictionary are featues with dim_value: 1
-            # TODO: Implement a class for Concatenaed values
-            #       that implements support based on dimensions
-            data_keys = list(data.keys())
-            data = list(data.values())
+        if not isinstance(data, dict):
+            # TODO: Implement support for selecting rows from a 1D tensor
+            raise NotImplementedError("ArrayFeatureExtractor only supports column groups as inputs")
+
+        # This expects that dictionaries are sorted by insertion order
+        # AND that all values of the dictionary are columns.
+        data_keys = list(data.keys())
+        data = list(data.values())
 
         if isinstance(indices, (list, tuple)):
-            # We only work with dictionaries of faturename: feature
-            # So when we are expected to output a list of features
-            # we should output a dictionary of features as they are just sorted.
+            if data_keys is None:
+                raise ValueError("ArrayFeatureExtractor expects a group of columns as input when receiving a list of indices")
+            if len(indices) > len(data_keys):
+                raise ValueError("Indices requested are more than the available numer of columns.")
+            # Pick only the columns that are in the list of indices.
             result = {data_keys[i]: data[i] for i in indices}
-        elif isinstance(indices, int):
-            result = data[indices]
         elif isinstance(indices, ibis.expr.types.Column):
             # The indices that we need to pick are contained in
             # another column of the table.