
Commit c8cbc6f

Fix TabPFNRegressor preprocessing failing on bigger datasets (#255)
Co-authored-by: noahho <[email protected]>
1 parent a83b20c

File tree

5 files changed: +116 -33 lines

scripts/get_max_dependencies.py
scripts/get_min_dependencies.py
src/tabpfn/model/loading.py
src/tabpfn/model/preprocessing.py
tests/test_preprocessing.py

scripts/get_max_dependencies.py

Lines changed: 1 addition & 1 deletion

@@ -37,4 +37,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()

scripts/get_min_dependencies.py

Lines changed: 1 addition & 1 deletion

@@ -31,4 +31,4 @@ def main() -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()

src/tabpfn/model/loading.py

Lines changed: 29 additions & 29 deletions

@@ -13,7 +13,7 @@
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Literal, overload
+from typing import Literal, cast, overload
 from urllib.error import URLError
 
 import torch
@@ -101,7 +101,7 @@ def _get_model_source(version: ModelVersion, model_type: ModelType) -> ModelSource:
 )
 
 
-def _suppress_hf_token_warning():
+def _suppress_hf_token_warning() -> None:
     """Suppress warning about missing HuggingFace token."""
     import warnings
 
@@ -287,7 +287,7 @@ def download_all_models(to: Path) -> None:
         download_model(
             to=to / ckpt_name,
             version="v2",
-            which=model_type,
+            which=cast(Literal["classifier", "regressor"], model_type),
             model_name=ckpt_name,
         )
 
@@ -370,31 +370,6 @@ def load_model_criterion_config(
 ) -> tuple[PerFeatureTransformer, FullSupportBarDistribution, InferenceConfig]: ...
 
 
-def resolve_model_path(
-    model_path: None | str | Path,
-    which: Literal["regressor", "classifier"],
-    version: Literal["v2"] = "v2",
-) -> tuple[Path, Path, str, str]:
-    if model_path is None:
-        USER_TABPFN_CACHE_DIR_LOCATION = os.environ.get("TABPFN_MODEL_CACHE_DIR", "")
-        if USER_TABPFN_CACHE_DIR_LOCATION.strip() != "":
-            model_dir = Path(USER_TABPFN_CACHE_DIR_LOCATION)
-        else:
-            model_dir = _user_cache_dir(platform=sys.platform, appname="tabpfn")
-
-        model_name = f"tabpfn-{version}-{which}.ckpt"
-        model_path = model_dir / model_name
-    else:
-        if not isinstance(model_path, (str, Path)):
-            raise ValueError(f"Invalid model_path: {model_path}")
-
-        model_path = Path(model_path)
-        model_dir = model_path.parent
-        model_name = model_path.name
-
-    return model_path, model_dir, model_name, which
-
-
 def load_model_criterion_config(
     model_path: None | str | Path,
     *,
@@ -452,7 +427,7 @@ def load_model_criterion_config(
         res = download_model(
             model_path,
             version=version,
-            which=which,
+            which=cast(Literal["classifier", "regressor"], which),
             model_name=model_name,
         )
         if res != "ok":
@@ -478,6 +453,31 @@ def load_model_criterion_config(
     return loaded_model, criterion, config
 
 
+def resolve_model_path(
+    model_path: None | str | Path,
+    which: Literal["regressor", "classifier"],
+    version: Literal["v2"] = "v2",
+) -> tuple[Path, Path, str, str]:
+    if model_path is None:
+        USER_TABPFN_CACHE_DIR_LOCATION = os.environ.get("TABPFN_MODEL_CACHE_DIR", "")
+        if USER_TABPFN_CACHE_DIR_LOCATION.strip() != "":
+            model_dir = Path(USER_TABPFN_CACHE_DIR_LOCATION)
+        else:
+            model_dir = _user_cache_dir(platform=sys.platform, appname="tabpfn")
+
+        model_name = f"tabpfn-{version}-{which}.ckpt"
+        model_path = model_dir / model_name
+    else:
+        if not isinstance(model_path, (str, Path)):
+            raise ValueError(f"Invalid model_path: {model_path}")
+
+        model_path = Path(model_path)
+        model_dir = model_path.parent
+        model_name = model_path.name
+
+    return model_path, model_dir, model_name, which
+
+
 def get_loss_criterion(
     config: InferenceConfig,
 ) -> nn.BCEWithLogitsLoss | nn.CrossEntropyLoss | FullSupportBarDistribution:
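
The two cast(...) edits only narrow the static type of which for the type checker; typing.cast has no runtime effect. The resolve_model_path helper itself is unchanged, only moved below load_model_criterion_config. As a minimal usage sketch (not part of the commit; the paths are illustrative) of how it resolves model locations:

import os
from pathlib import Path

from tabpfn.model.loading import resolve_model_path

# An explicit model_path is split into its directory and file name.
path, model_dir, name, which = resolve_model_path(
    "/tmp/models/tabpfn-v2-regressor.ckpt", which="regressor"
)
assert (model_dir, name) == (Path("/tmp/models"), "tabpfn-v2-regressor.ckpt")

# With model_path=None, a non-empty TABPFN_MODEL_CACHE_DIR takes precedence
# over the platform-specific user cache directory.
os.environ["TABPFN_MODEL_CACHE_DIR"] = "/tmp/tabpfn-cache"
path, model_dir, name, which = resolve_model_path(None, which="classifier")
assert path == Path("/tmp/tabpfn-cache") / "tabpfn-v2-classifier.ckpt"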

src/tabpfn/model/preprocessing.py

Lines changed: 53 additions & 2 deletions

@@ -82,6 +82,57 @@ def transform(self, X: torch.Tensor | np.ndarray) -> np.ndarray:
         return X  # type: ignore
 
 
+class AdaptiveQuantileTransformer(QuantileTransformer):
+    """A QuantileTransformer that automatically adapts the 'n_quantiles' parameter
+    based on the number of samples provided during the 'fit' method.
+
+    This prevents errors that occur when the requested 'n_quantiles' is
+    greater than the number of available samples in the input data (X).
+    This situation can arise because we first initialize the transformer
+    based on total samples and then subsample.
+    """
+
+    def __init__(self, *, n_quantiles: int = 1000, **kwargs: Any) -> None:
+        # Store the user's desired n_quantiles to use as an upper bound
+        self._user_n_quantiles = n_quantiles
+        # Initialize the parent with this value; it will be adapted in fit
+        super().__init__(n_quantiles=n_quantiles, **kwargs)
+
+    def fit(
+        self, X: np.ndarray, y: np.ndarray | None = None
+    ) -> AdaptiveQuantileTransformer:
+        X = self._validate_data(
+            X, copy=self.copy, estimator=self, dtype=float, force_all_finite="allow-nan"
+        )
+        n_samples = X.shape[0]
+
+        # Adapt n_quantiles for this fit: min of the user's preference and the
+        # available samples. Ensure n_quantiles is at least 1.
+        effective_n_quantiles = max(1, min(self._user_n_quantiles, n_samples))
+
+        # Set self.n_quantiles to the effective value BEFORE calling super().fit()
+        # so the parent class uses the adapted value for fitting and
+        # self.n_quantiles reflects the value used for the fit.
+        self.n_quantiles = effective_n_quantiles
+
+        return super().fit(X, y)
+
+    # For scikit-learn compatibility, get_params reports the original user
+    # setting if desired, though self.n_quantiles will show the fitted
+    # effective value.
+    def get_params(self, *, deep: bool = True) -> dict:
+        params = super().get_params(deep)
+        # Report the original user n_quantiles
+        if "_user_n_quantiles" in self.__dict__:  # Check if it was set
+            params["n_quantiles"] = self._user_n_quantiles
+        return params
+
+    def set_params(self, **params: Any) -> AdaptiveQuantileTransformer:
+        if "n_quantiles" in params:
+            self._user_n_quantiles = params["n_quantiles"]
+        return super().set_params(**params)
+
+
 ALPHAS = (
     0.05,
     0.1,
@@ -656,9 +707,9 @@ def get_adaptive_preprocessors(
         ),
         (
             "other",
-            QuantileTransformer(
+            AdaptiveQuantileTransformer(
                 output_distribution="normal",
-                n_quantiles=num_examples // 10,
+                n_quantiles=max(num_examples // 10, 2),
                 random_state=random_state,
             ),
             # "other" or "ordinal"

tests/test_preprocessing.py

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import numpy as np
+
+from tabpfn.model.preprocessing import ReshapeFeatureDistributionsStep
+
+
+def test_preprocessing_large_dataset():
+    # Generate a synthetic dataset with more than 10,000 samples
+    num_samples = 15000
+    num_features = 10
+    rng = np.random.default_rng()
+    X = rng.random((num_samples, num_features))
+
+    # Create an instance of ReshapeFeatureDistributionsStep
+    preprocessing_step = ReshapeFeatureDistributionsStep(
+        transform_name="quantile_norm",
+        apply_to_categorical=False,
+        append_to_original=False,
+        subsample_features=-1,
+        global_transformer_name=None,
+        random_state=42,
+    )
+
+    # Define categorical features (empty in this case)
+    categorical_features = []
+
+    # Run the preprocessing step
+    result = preprocessing_step.fit_transform(X, categorical_features)
+
+    # Assert the result is not None
+    assert result is not None
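
With 15,000 samples, num_examples // 10 requests 1,500 quantiles, so this test exercises exactly the path fixed above whenever the transformer is fitted on a smaller subsample. It can be run directly with, e.g., python -m pytest tests/test_preprocessing.py.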
