Rework allow_cpu_override to be usable without environment variables (#275)

LennartPurucker · noahho · web-flow · commit 0a3ba43f9b3b · 2025-04-15T13:25:42.000+02:00
Co-authored-by: noahho <Noah.homa@gmail.com>
diff --git a/src/tabpfn/base.py b/src/tabpfn/base.py
@@ -245,15 +245,25 @@ def create_inference_engine(  # noqa: PLR0913
 
 
 def check_cpu_warning(
-    device: str | torch.device, X: np.ndarray | torch.Tensor | pd.DataFrame
+    device: str | torch.device,
+    X: np.ndarray | torch.Tensor | pd.DataFrame,
+    *,
+    allow_cpu_override: bool = False,
 ) -> None:
     """Check if using CPU with large datasets and warn or error appropriately.
 
     Args:
         device: The torch device being used
         X: The input data (NumPy array, Pandas DataFrame, or Torch Tensor)
+        allow_cpu_override: If True, allow CPU usage with large datasets.
     """
-    allow_cpu_override = os.getenv("TABPFN_ALLOW_CPU_LARGE_DATASET", "0") == "1"
+    allow_cpu_override = allow_cpu_override or (
+        os.getenv("TABPFN_ALLOW_CPU_LARGE_DATASET", "0") == "1"
+    )
+
+    if allow_cpu_override:
+        return
+
     device_mapped = infer_device_and_type(device)
 
     # Determine number of samples
@@ -264,16 +274,16 @@ def check_cpu_warning(
 
     if torch.device(device_mapped).type == "cpu":
         if num_samples > 1000:
-            if not allow_cpu_override:
-                raise RuntimeError(
-                    "Running on CPU with more than 1000 samples is not allowed "
-                    "by default due to slow performance.\n"
-                    "To override this behavior, set the environment variable "
-                    "TABPFN_ALLOW_CPU_LARGE_DATASET=1.\n"
-                    "Alternatively, consider using a GPU or the tabpfn-client API: "
-                    "https://github.com/PriorLabs/tabpfn-client"
-                )
-        elif num_samples > 200:
+            raise RuntimeError(
+                "Running on CPU with more than 1000 samples is not allowed "
+                "by default due to slow performance.\n"
+                "To override this behavior, set the environment variable "
+                "TABPFN_ALLOW_CPU_LARGE_DATASET=1 or "
+                "set ignore_pretraining_limits=True.\n"
+                "Alternatively, consider using a GPU or the tabpfn-client API: "
+                "https://github.com/PriorLabs/tabpfn-client"
+            )
+        if num_samples > 200:
             warnings.warn(
                 "Running on CPU with more than 200 samples may be slow.\n"
                 "Consider using a GPU or the tabpfn-client API: "
diff --git a/src/tabpfn/classifier.py b/src/tabpfn/classifier.py
@@ -228,7 +228,8 @@ def __init__(  # noqa: PLR0913
                 pre-training range.
 
                 - If `True`, the model will not raise an error if the input data is
-                  outside the pre-training range.
+                  outside the pre-training range. Also suppresses error when using
+                  the model with more than 1000 samples on CPU.
                 - If `False`, you can use the model outside the pre-training range, but
                   the model could perform worse.
 
@@ -428,7 +429,9 @@ def fit(self, X: XType, y: YType) -> Self:
             ignore_pretraining_limits=self.ignore_pretraining_limits,
         )
 
-        check_cpu_warning(self.device, X)
+        check_cpu_warning(
+            self.device, X, allow_cpu_override=self.ignore_pretraining_limits
+        )
 
         if feature_names_in is not None:
             self.feature_names_in_ = feature_names_in
diff --git a/src/tabpfn/regressor.py b/src/tabpfn/regressor.py
@@ -251,7 +251,8 @@ def __init__(  # noqa: PLR0913
                 pre-training range.
 
                 - If `True`, the model will not raise an error if the input data is
-                  outside the pre-training range.
+                  outside the pre-training range. Also suppresses error when using
+                  the model with more than 1000 samples on CPU.
                 - If `False`, you can use the model outside the pre-training range, but
                   the model could perform worse.
 
@@ -456,7 +457,9 @@ def fit(self, X: XType, y: YType) -> Self:
             ignore_pretraining_limits=self.ignore_pretraining_limits,
         )
         assert isinstance(X, np.ndarray)
-        check_cpu_warning(self.device, X)
+        check_cpu_warning(
+            self.device, X, allow_cpu_override=self.ignore_pretraining_limits
+        )
 
         if feature_names_in is not None:
             self.feature_names_in_ = feature_names_in
diff --git a/tests/test_regressor_interface.py b/tests/test_regressor_interface.py
@@ -355,13 +355,35 @@ def test_cpu_large_dataset_warning():
     with pytest.warns(
         UserWarning, match="Running on CPU with more than 200 samples may be slow"
     ):
-        # Set environment variable to allow large datasets to avoid RuntimeError
-        os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "1"
-        try:
-            model.fit(X_large, y_large)
-        finally:
-            # Clean up environment variable
-            os.environ.pop("TABPFN_ALLOW_CPU_LARGE_DATASET")
+        model.fit(X_large, y_large)
+
+
+def test_cpu_large_dataset_warning_override():
+    """Test that runtime error is raised when using CPU with large datasets
+    and that we can disable the error with ignore_pretraining_limits.
+    """
+    rng = np.random.default_rng(seed=42)
+    X_large = rng.random((1001, 10))
+    y_large = rng.random(1001)
+
+    model = TabPFNRegressor(device="cpu")
+    with pytest.raises(
+        RuntimeError, match="Running on CPU with more than 1000 samples is not"
+    ):
+        model.fit(X_large, y_large)
+
+    # -- Test overrides
+    model = TabPFNRegressor(device="cpu", ignore_pretraining_limits=True)
+    model.fit(X_large, y_large)
+
+    # Set environment variable to allow large datasets to avoid RuntimeError
+    os.environ["TABPFN_ALLOW_CPU_LARGE_DATASET"] = "1"
+    try:
+        model = TabPFNRegressor(device="cpu", ignore_pretraining_limits=False)
+        model.fit(X_large, y_large)
+    finally:
+        # Clean up environment variable
+        os.environ.pop("TABPFN_ALLOW_CPU_LARGE_DATASET")
 
 
 def test_cpu_large_dataset_error():