Skip to content

Commit 7100860

Browse files
committed
Update ens builder (#1434)
* Move ensemble_builder test data to named folder * Update backend to take a template to copy from * Update tests to use new cases system * Update tests to be documented and cleaned up * Switch to using cached automl backends * Re-add missing file which failed test for `case_3_models` * Separate out tests that rely on old toy data and those that don't * Set up test framework for ensemble builder on real situations * Formatting * Remove `unit_test` arg * Remove SAVE2DISC * Split builder and manager into separate files * Tidy up init of EnsembleBuilder * Moved to cached properties * Change List to list * Move to solely using cached properties * Add disk util file with `sizeof` * Update tests to use cached mechanism * Switch `sizeof` for disk consumption * Remove disk consumption * Remove unneeded function * Add type hints and documentation * Simplify _read_np_fn * Update get_valid_test_preds to use Pathlib * Add intersection to functional * Make functional take *args * Further simplifications * Add a dataclass to represent run information for builder * Rename to Run * Change to Run objects * Formatting * Reduce side effects of `compute_loss_per_model` To make testing easier and changes easier, the targets are now passed to the method. This also reduces its complexity by removing the checking from the method as we can assume the parameters coming in are correct. 
* Change Tuple to tuple * Forcibly add data files for tests * Fix: Can now load pickled numpy arrays w/ test * Add test for checking ensemble builder output * Fix bug with using list instead of set * Making debugging message a little clearer * Fix typing and case name * Rename test file to reflect what it tests * Make pynisher context optional * Fix loaded models test * Updates to Run dataclass * Add method to `Run` to allow recording of last modified * Change Run mtimes to dictionary * Change `compute_loss_per_model` to use new Run dataclass * Factor out run loss into main loop * Simplify get_nbest and compute_losses * Major rewrite of ensemble builder main loop * Change to simpler hashing * Start value split * Add `value_split` * Reworked Builder * Add some docstring * Formatting * Fix type signature * Fix typing for `loss` * Removed Literal * Mypy fixes for ensemble builder * Mypy fixes * Tests for `Runs` * Move `make_run` to fixtures * Fix run deletion * Test candidates * Made delete its own function * Further simplifications * Fixup test with simplification * Test: `max_models` for `requires_deletion` * Test: `memory_limit` for `requires_deletion` * Test: Loss of runs * Test: Delete runs * Test: `fit_ensemble` of ensemble builder * Add test for run time parameter * Remove parameter `return_predictions` * Add note about pickled arrays should not be supported * Make cached automl instances copy backend * Add valid static method to run * Remove old test data * Add filter for bad run dirs * Made `main` args optional * Fix check for updated runs * Make `main` raise errors * Fix default value for ensemble builder `main` * Test valid ensemble with real runs * Rename parameter for manager * Add defaults and reorder parameters for EnsembleBuilderManager * Fixup parameters in `fit_and_return_ensemble` * Typing fixes * Make `fit_and_return_ensemble` a staticmethod * Add: `make_ensemble_builder_manager` * Add: Test files for manager * Add atomic rmtree * Add: atomic 
rmtree now accepts where mv should go * Make builder use atomic rmtree * Fix import bugs, remove valid preds in builder * Remove `np.inf` as valid arg for `read_at_most` * Possible reproducible num_run, no predictions error * Make automl caching robust to `pytest-xdist` * Test fixes * Extend interval for test on run caching * Use pickle for resetting cache * Fix test for caching mechanism to not rely on `stat` * Move run deletion to the end of the builder `main` * Remove `getattr` version of tae.client * Remove `normalize` * Extend not for `Run` * Fix `__init__` of `Run` * Parameter and comment fixes from feedback * Change to `min(...)` instead of `sorted(...)[0]` * Make default time `np.inf` * Add test for safe deletion in builder * Update docstring of `loss` for a run * Remove stray print * Minor feedback fixes * Fix `_metric` to `_metrics` * Fix `make_ensemble_builder` * One more fix for multiple metrics
1 parent 8947890 commit 7100860

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+3274
-2780
lines changed

autosklearn/automl.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
convert_if_sparse,
6363
)
6464
from autosklearn.data.xy_data_manager import XYDataManager
65-
from autosklearn.ensemble_builder import EnsembleBuilderManager
65+
from autosklearn.ensemble_building import EnsembleBuilderManager
6666
from autosklearn.ensembles.singlebest_ensemble import SingleBest
6767
from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
6868
from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
@@ -303,6 +303,8 @@ def __init__(
303303
self._label_num = None
304304
self._parser = None
305305
self._can_predict = False
306+
self._read_at_most = None
307+
self._max_ensemble_build_iterations = None
306308
self.models_: Optional[dict] = None
307309
self.cv_models_: Optional[dict] = None
308310
self.ensemble_ = None
@@ -808,9 +810,9 @@ def fit(
808810
max_models_on_disc=self._max_models_on_disc,
809811
seed=self._seed,
810812
precision=self.precision,
811-
max_iterations=None,
812-
read_at_most=np.inf,
813-
ensemble_memory_limit=self._memory_limit,
813+
max_iterations=self._max_ensemble_build_iterations,
814+
read_at_most=self._read_at_most,
815+
memory_limit=self._memory_limit,
814816
random_state=self._seed,
815817
logger_port=self._logger_port,
816818
pynisher_context=self._multiprocessing_context,
@@ -923,7 +925,7 @@ def fit(
923925
)
924926
result = proc_ensemble.futures.pop().result()
925927
if result:
926-
ensemble_history, _, _, _, _ = result
928+
ensemble_history, _ = result
927929
self.ensemble_performance_history.extend(ensemble_history)
928930
self._logger.info("Ensemble script finished, continue shutdown.")
929931

@@ -1524,8 +1526,8 @@ def fit_ensemble(
15241526
seed=self._seed,
15251527
precision=precision if precision else self.precision,
15261528
max_iterations=1,
1527-
read_at_most=np.inf,
1528-
ensemble_memory_limit=self._memory_limit,
1529+
read_at_most=None,
1530+
memory_limit=self._memory_limit,
15291531
random_state=self._seed,
15301532
logger_port=self._logger_port,
15311533
pynisher_context=self._multiprocessing_context,
@@ -1538,7 +1540,7 @@ def fit_ensemble(
15381540
"Error building the ensemble - please check the log file and command "
15391541
"line output for error messages."
15401542
)
1541-
self.ensemble_performance_history, _, _, _, _ = result
1543+
self.ensemble_performance_history, _ = result
15421544
self._ensemble_size = ensemble_size
15431545

15441546
self._load_models()
@@ -2096,6 +2098,15 @@ def has_key(rv, key):
20962098

20972099
return ensemble_dict
20982100

2101+
def has_ensemble(self) -> bool:
2102+
"""
2103+
Returns
2104+
-------
2105+
bool
2106+
Whether this AutoML instance has an ensemble
2107+
"""
2108+
return self.ensemble_ is not None
2109+
20992110
def _create_search_space(
21002111
self,
21012112
tmp_dir: str,
@@ -2154,7 +2165,7 @@ def fit(
21542165
y: SUPPORTED_TARGET_TYPES | spmatrix,
21552166
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
21562167
y_test: Optional[SUPPORTED_TARGET_TYPES | spmatrix] = None,
2157-
feat_type: Optional[list[bool]] = None,
2168+
feat_type: Optional[list[str]] = None,
21582169
dataset_name: Optional[str] = None,
21592170
only_return_configuration_space: bool = False,
21602171
load_models: bool = True,
@@ -2244,7 +2255,7 @@ def fit(
22442255
y: SUPPORTED_TARGET_TYPES | spmatrix,
22452256
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
22462257
y_test: Optional[SUPPORTED_TARGET_TYPES | spmatrix] = None,
2247-
feat_type: Optional[list[bool]] = None,
2258+
feat_type: Optional[list[str]] = None,
22482259
dataset_name: Optional[str] = None,
22492260
only_return_configuration_space: bool = False,
22502261
load_models: bool = True,

0 commit comments

Comments
 (0)