Fix binary trees (#37)

amol- · web-flow · commit c0b9d0fcc0e8 · 2025-04-01T18:14:12.000+02:00
diff --git a/src/mustela/translation/steps/trees/classifier.py b/src/mustela/translation/steps/trees/classifier.py
@@ -81,7 +81,22 @@ def build_classifier(
         ) or self._attributes.get("classlabels_int64s")
         if classlabels is None:
             raise ValueError("Unable to detect classlabels for classification")
-        classlabels = typing.cast(list[str] | list[int], classlabels)
+        output_classlabels = classlabels = typing.cast(
+            list[str] | list[int], classlabels
+        )
+
+        # ONNX treats binary classification as a special case:
+        # https://github.com/microsoft/onnxruntime/blob/5982430af66f52a288cb8b2181e0b5b2e09118c8/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h#L854C1-L871C4
+        # https://github.com/microsoft/onnxruntime/blob/5982430af66f52a288cb8b2181e0b5b2e09118c8/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h#L469-L494
+        # In this case there is only one weight and it's the probability of the positive class.
+        # So we need to check if we are in a binary classification case.
+        weights_classid = typing.cast(list[int], self._attributes["class_ids"])
+        is_binary = len(classlabels) == 2 and len(set(weights_classid)) == 1
+        if is_binary:
+            # In this case there is only one label, the first one,
+            # which actually acts as the score of the prediction.
+            # When > 0.5 then class 1, otherwise (<= 0.5) class 0.
+            classlabels = typing.cast(list[str] | list[int], [classlabels[0]])
 
         if isinstance(input_expr, VariablesGroup):
             ordered_features = input_expr.values_value()
@@ -134,39 +149,53 @@ def build_tree_case(node: dict) -> dict[str | int, ibis.Expr]:
                 )
 
         # Compute prediction of class itself.
-        candidate_cls = classlabels[0]
-        candidate_vote = total_votes[candidate_cls]
-        for clslabel in classlabels[1:]:
-            candidate_cls = optimizer.fold_case(
+        if is_binary:
+            total_score = total_votes[classlabels[0]]
+            label_expr = optimizer.fold_case(
                 ibis.case()
-                .when(total_votes[clslabel] > candidate_vote, clslabel)
-                .else_(candidate_cls)
+                .when(total_score > 0.5, output_classlabels[1])
+                .else_(output_classlabels[0])
                 .end()
             )
-            candidate_vote = optimizer.fold_case(
-                ibis.case()
-                .when(total_votes[clslabel] > candidate_vote, total_votes[clslabel])
-                .else_(candidate_vote)
-                .end()
+            # The order matters: for ONNX the VariablesGroup is a list of subvariables;
+            # the names are not important.
+            prob_dict = VariablesGroup(
+                {
+                    str(output_classlabels[0]): 1.0 - total_score,
+                    str(output_classlabels[1]): total_score,
+                }
             )
+        else:
+            candidate_cls = classlabels[0]
+            candidate_vote = total_votes[candidate_cls]
+            for clslabel in classlabels[1:]:
+                candidate_cls = optimizer.fold_case(
+                    ibis.case()
+                    .when(total_votes[clslabel] > candidate_vote, clslabel)
+                    .else_(candidate_cls)
+                    .end()
+                )
+                candidate_vote = optimizer.fold_case(
+                    ibis.case()
+                    .when(total_votes[clslabel] > candidate_vote, total_votes[clslabel])
+                    .else_(candidate_vote)
+                    .end()
+                )
 
-        label_expr = ibis.case()
-        for clslabel in classlabels:
-            label_expr = label_expr.when(candidate_cls == clslabel, clslabel)
-        label_expr = label_expr.else_(ibis.null()).end()
-        label_expr = optimizer.fold_case(label_expr)
+            label_expr = ibis.case()
+            for clslabel in classlabels:
+                label_expr = label_expr.when(candidate_cls == clslabel, clslabel)
+            label_expr = label_expr.else_(ibis.null()).end()
+            label_expr = optimizer.fold_case(label_expr)
 
-        # Compute probability to return it too.
-        sum_votes = None
-        for clslabel in classlabels:
-            if sum_votes is None:
-                sum_votes = total_votes[clslabel]
-            else:
+            # Compute probability to return it too.
+            sum_votes = ibis.literal(0.0)
+            for clslabel in classlabels:
                 sum_votes = optimizer.fold_operation(sum_votes + total_votes[clslabel])
 
-        # FIXME: Probabilities are currently broken for gradient boosted trees.
-        prob_dict = VariablesGroup()
-        for clslabel in classlabels:
-            prob_dict[str(clslabel)] = total_votes[clslabel] / sum_votes
+            # FIXME: Probabilities are currently broken for gradient boosted trees.
+            prob_dict = VariablesGroup()
+            for clslabel in classlabels:
+                prob_dict[str(clslabel)] = total_votes[clslabel] / sum_votes
 
         return label_expr, prob_dict
diff --git a/src/mustela/translation/steps/trees/tree.py b/src/mustela/translation/steps/trees/tree.py
@@ -67,7 +67,6 @@ def build_tree(translator: Translator) -> dict[int, dict[int, dict]]:
         if not classlabels:
             raise ValueError("Missing class labels when building tree")
 
-        is_binary = len(classlabels) == 2 and len(set(weights_classid)) == 1
         for tree_id, node_id, weight, weight_classid in zip(
             class_treeids, class_nodeids, class_weights, weights_classid
         ):
@@ -76,23 +75,6 @@ def build_tree(translator: Translator) -> dict[int, dict[int, dict]]:
             )
             node_weights[classlabels[weight_classid]] = weight
 
-        if is_binary:
-            # ONNX treats binary classification as a special case:
-            # https://github.com/microsoft/onnxruntime/blob/5982430af66f52a288cb8b2181e0b5b2e09118c8/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h#L854C1-L871C4
-            # https://github.com/microsoft/onnxruntime/blob/5982430af66f52a288cb8b2181e0b5b2e09118c8/onnxruntime/core/providers/cpu/ml/tree_ensemble_aggregator.h#L469-L494
-            # In this case there is only one weight and it's the probability of the positive class.
-            for node_weights in weights.values():
-                assert len(node_weights) == 1, (
-                    f"Binary classification expected to have only one class, got: {node_weights}"
-                )
-                score = list(node_weights.values())[0]
-                if score > 0.5:
-                    node_weights[classlabels[1]] = 1.0
-                    node_weights[classlabels[0]] = 0.0
-                else:
-                    node_weights[classlabels[1]] = 0.0
-                    node_weights[classlabels[0]] = 1.0
-
     elif node.op_type == "TreeEnsembleRegressor":
         # Weights for the regressor, in this case leaf nodes have only 1 weight
         weights = typing.cast(dict[tuple[int, int], float], weights)
diff --git a/tests/test_pipeline_e2e.py b/tests/test_pipeline_e2e.py
@@ -463,7 +463,6 @@ def assign_region(width):
 
     def test_binary_random_forest_classifier(self, iris_data, db_connection):
         """Test a binary random forest classifier with mixed preprocessing."""
-        pytest.skip("Binary classification on trees is currently not implemented.")
         df, feature_names = iris_data
         conn, dialect = db_connection
 
@@ -504,9 +503,22 @@ def test_binary_random_forest_classifier(self, iris_data, db_connection):
         )
         parsed_pipeline = mustela.parse_pipeline(sklearn_pipeline, features=features)
 
+        # Test prediction
         sql = mustela.export_sql("data", parsed_pipeline, dialect=dialect)
         sql_results = self.execute_sql(sql, conn, dialect, binary_df)
-
         np.testing.assert_allclose(
             sql_results["output_label"].to_numpy(), sklearn_class
         )
+
+        # Test probabilities
+        sklearn_proba = sklearn_pipeline.predict_proba(X)
+        sklearn_proba_df = pd.DataFrame(
+            sklearn_proba, columns=sklearn_pipeline.classes_
+        )
+        for class_label in sklearn_pipeline.classes_:
+            np.testing.assert_allclose(
+                sql_results[f"output_probability.{class_label}"].to_numpy(),
+                sklearn_proba_df[class_label].values.flatten(),
+                rtol=1e-4,
+                atol=1e-4,
+            )