From 08f30aef0a58f55339c4f5337d4ebd333206d621 Mon Sep 17 00:00:00 2001 From: Eddie Bergman Date: Fri, 2 Aug 2024 18:23:41 +0200 Subject: [PATCH] refactor: modularize file state (#126) --- .gitignore | 3 + .pre-commit-config.yaml | 6 +- neps/api.py | 64 +- neps/env.py | 89 ++ neps/exceptions.py | 47 + neps/optimizers/__init__.py | 5 +- neps/optimizers/base_optimizer.py | 148 +- .../acquisition_samplers/base_acq_sampler.py | 14 +- .../acquisition_samplers/mutation_sampler.py | 14 +- .../bayesian_optimization/cost_cooling.py | 48 +- .../bayesian_optimization/kernels/encoding.py | 10 +- .../bayesian_optimization/mf_tpe.py | 11 +- .../bayesian_optimization/models/deepGP.py | 53 +- .../models/gp_hierarchy.py | 49 +- .../bayesian_optimization/optimizer.py | 7 +- neps/optimizers/grid_search/optimizer.py | 8 +- neps/optimizers/multi_fidelity/_dyhpo.py | 9 +- neps/optimizers/multi_fidelity/dyhpo.py | 7 +- neps/optimizers/multi_fidelity/hyperband.py | 15 +- .../multi_fidelity/successive_halving.py | 21 +- .../multi_fidelity_prior/async_priorband.py | 15 +- .../prototype_optimizer.py | 7 +- neps/optimizers/random_search/optimizer.py | 8 +- .../regularized_evolution/optimizer.py | 17 +- neps/plot/tensorboard_eval.py | 20 +- neps/runtime.py | 1372 ++++++----------- neps/search_spaces/architecture/graph.py | 22 +- neps/search_spaces/search_space.py | 36 +- neps/search_spaces/yaml_search_space_utils.py | 84 +- neps/state/__init__.py | 19 + neps/state/_eval.py | 195 +++ neps/state/err_dump.py | 77 + neps/state/filebased.py | 672 ++++++++ neps/state/neps_state.py | 231 +++ neps/state/optimizer.py | 57 + neps/state/protocols.py | 560 +++++++ neps/state/seed_snapshot.py | 115 ++ neps/state/settings.py | 171 ++ neps/state/trial.py | 289 ++++ neps/status/status.py | 37 +- neps/utils/_rng.py | 176 --- neps/utils/common.py | 62 +- neps/utils/data_loading.py | 93 +- neps/utils/files.py | 34 +- neps/utils/types.py | 9 +- neps/utils/validation.py | 31 + neps_examples/basic_usage/hyperparameters.py | 2 +- neps_examples/efficiency/multi_fidelity.py | 4 +- pyproject.toml | 4 +- tests/test_neps_api/test_api.py | 10 +- tests/test_runtime/__init__.py | 0 .../test_default_report_values.py | 170 ++ .../test_error_handling_strategies.py | 200 +++ tests/test_runtime/test_locking.py | 105 -- tests/test_runtime/test_stopping_criterion.py | 481 ++++++ tests/test_state/__init__.py | 0 tests/test_state/test_filebased_neps_state.py | 123 ++ tests/test_state/test_neps_state.py | 205 +++ tests/{ => test_state}/test_rng.py | 34 +- tests/test_state/test_synced.py | 432 ++++++ tests/test_state/test_trial.py | 301 ++++ .../test_declarative_usage_docs.py | 35 +- .../test_neps_run.py | 76 +- 63 files changed, 5523 insertions(+), 1696 deletions(-) create mode 100644 neps/env.py create mode 100644 neps/exceptions.py create mode 100644 neps/state/__init__.py create mode 100644 neps/state/_eval.py create mode 100644 neps/state/err_dump.py create mode 100644 neps/state/filebased.py create mode 100644 neps/state/neps_state.py create mode 100644 neps/state/optimizer.py create mode 100644 neps/state/protocols.py create mode 100644 neps/state/seed_snapshot.py create mode 100644 neps/state/settings.py create mode 100644 neps/state/trial.py delete mode 100644 neps/utils/_rng.py create mode 100644 neps/utils/validation.py create mode 100644 tests/test_runtime/__init__.py create mode 100644 tests/test_runtime/test_default_report_values.py create mode 100644 tests/test_runtime/test_error_handling_strategies.py delete mode 100644 
tests/test_runtime/test_locking.py create mode 100644 tests/test_runtime/test_stopping_criterion.py create mode 100644 tests/test_state/__init__.py create mode 100644 tests/test_state/test_filebased_neps_state.py create mode 100644 tests/test_state/test_neps_state.py rename tests/{ => test_state}/test_rng.py (52%) create mode 100644 tests/test_state/test_synced.py create mode 100644 tests/test_state/test_trial.py diff --git a/.gitignore b/.gitignore index 1471b0e8..e8be93e7 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,6 @@ jahs_bench_data/ # MacOS *.DS_Store + +# Yaml tests +path diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7228325e..92ff2356 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: files: '^src/.*\.py$' - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.10.0 + rev: v1.11.1 hooks: - id: mypy files: | @@ -42,7 +42,7 @@ repos: - "--show-traceback" - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.2 + rev: 0.29.1 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' @@ -51,7 +51,7 @@ repos: files: '^\.github/dependabot\.ya?ml$' - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.4.2 + rev: v0.5.5 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --no-cache] diff --git a/neps/api.py b/neps/api.py index e523fe44..196e371d 100644 --- a/neps/api.py +++ b/neps/api.py @@ -1,5 +1,5 @@ -"""API for the neps package. -""" +"""API for the neps package.""" + from __future__ import annotations import inspect @@ -12,7 +12,7 @@ from neps.utils.run_args import Settings, Default from neps.utils.common import instance_from_map -from neps.runtime import launch_runtime +from neps.runtime import _launch_runtime from neps.optimizers import BaseOptimizer, SearcherMapping from neps.search_spaces.parameter import Parameter from neps.search_spaces.search_space import ( @@ -24,6 +24,8 @@ from neps.utils.common import get_searcher_data, get_value from neps.optimizers.info import SearcherConfigs +logger = logging.getLogger(__name__) + def run( run_pipeline: Callable | None = Default(None), @@ -59,7 +61,8 @@ def run( "asha", "regularized_evolution", ] - | BaseOptimizer | Path + | BaseOptimizer + | Path ) = Default("default"), **searcher_kwargs, ) -> None: @@ -144,13 +147,11 @@ def run( ) max_cost_total = searcher_kwargs["budget"] del searcher_kwargs["budget"] + settings = Settings(locals(), run_args) # TODO: check_essentials, - logger = logging.getLogger("neps") - # DO NOT use any neps arguments directly; instead, access them via the Settings class. 
- if settings.pre_load_hooks is None: settings.pre_load_hooks = [] @@ -175,8 +176,9 @@ def run( # TODO habe hier searcher kwargs gedroppt, sprich das merging muss davor statt # finden searcher_info["searcher_args"] = settings.searcher_kwargs - settings.searcher = settings.searcher(search_space, - **settings.searcher_kwargs) + settings.searcher = settings.searcher( + search_space, **settings.searcher_kwargs + ) else: # Raise an error if searcher is not a subclass of BaseOptimizer raise TypeError( @@ -200,7 +202,6 @@ def run( ignore_errors=settings.ignore_errors, loss_value_on_error=settings.loss_value_on_error, cost_value_on_error=settings.cost_value_on_error, - logger=logger, searcher=settings.searcher, **settings.searcher_kwargs, ) @@ -220,23 +221,25 @@ def run( ) if settings.task_id is not None: - settings.root_directory = Path(settings.root_directory) / (f"task_" - f"{settings.task_id}") + settings.root_directory = Path(settings.root_directory) / ( + f"task_" f"{settings.task_id}" + ) if settings.development_stage_id is not None: - settings.root_directory = (Path(settings.root_directory) / - f"dev_{settings.development_stage_id}") + settings.root_directory = ( + Path(settings.root_directory) / f"dev_{settings.development_stage_id}" + ) - launch_runtime( + _launch_runtime( evaluation_fn=settings.run_pipeline, - sampler=searcher_instance, + optimizer=searcher_instance, optimizer_info=searcher_info, - optimization_dir=settings.root_directory, + max_cost_total=settings.max_cost_total, + optimization_dir=Path(settings.root_directory), max_evaluations_total=settings.max_evaluations_total, - max_evaluations_per_run=settings.max_evaluations_per_run, - continue_until_max_evaluation_completed - =settings.continue_until_max_evaluation_completed, - logger=logger, + max_evaluations_for_worker=settings.max_evaluations_per_run, + continue_until_max_evaluation_completed=settings.continue_until_max_evaluation_completed, loss_value_on_error=settings.loss_value_on_error, + cost_value_on_error=settings.cost_value_on_error, ignore_errors=settings.ignore_errors, overwrite_optimization_dir=settings.overwrite_working_directory, pre_load_hooks=settings.pre_load_hooks, @@ -260,7 +263,6 @@ def _run_args( ignore_errors: bool = False, loss_value_on_error: None | float = None, cost_value_on_error: None | float = None, - logger=None, searcher: ( Literal[ "default", @@ -306,13 +308,17 @@ def _run_args( raise TypeError(message) from e # Load the information of the optimizer - if isinstance(searcher, (str, Path)) and searcher not in \ - SearcherConfigs.get_searchers() and searcher != "default": + if ( + isinstance(searcher, (str, Path)) + and searcher not in SearcherConfigs.get_searchers() + and searcher != "default" + ): # The users have their own custom searcher provided via yaml. logging.info("Preparing to run user created searcher") - searcher_config, file_name = get_searcher_data(searcher, - loading_custom_searcher=True) + searcher_config, file_name = get_searcher_data( + searcher, loading_custom_searcher=True + ) # name defined via key or the filename of the yaml searcher_name = searcher_config.pop("name", file_name) searcher_info["searcher_selection"] = "user-yaml" @@ -351,21 +357,19 @@ def _run_args( warnings.warn( "The 'algorithm' argument is deprecated and will be removed in " "future versions. 
Please use 'strategy' instead.", - DeprecationWarning + DeprecationWarning, ) # Map the old 'algorithm' argument to 'strategy' - searcher_config['strategy'] = searcher_config.pop("algorithm") + searcher_config["strategy"] = searcher_config.pop("algorithm") if "strategy" in searcher_config: searcher_alg = searcher_config.pop("strategy") else: raise KeyError(f"Missing key strategy in searcher config:{searcher_config}") - logger.info(f"Running {searcher_name} as the searcher") logger.info(f"Strategy: {searcher_alg}") - # Used to create the yaml holding information about the searcher. # Also important for testing and debugging the api. searcher_info["searcher_name"] = searcher_name diff --git a/neps/env.py b/neps/env.py new file mode 100644 index 00000000..7cb8eada --- /dev/null +++ b/neps/env.py @@ -0,0 +1,89 @@ +"""Environment variable parsing for the state.""" + +from __future__ import annotations + +import os +from typing import Callable, TypeVar + +T = TypeVar("T") +V = TypeVar("V") + + +def get_env(key: str, parse: Callable[[str], T], default: V) -> T | V: + """Get an environment variable or return a default value.""" + if (e := os.environ.get(key)) is not None: + return parse(e) + + return default + + +def is_nullable(e: str) -> bool: + """Check if an environment variable is nullable.""" + return e.lower() in ("none", "n", "null") + + +TRIAL_FILELOCK_POLL = get_env( + "NEPS_TRIAL_FILELOCK_POLL", + parse=float, + default=0.05, +) +TRIAL_FILELOCK_TIMEOUT = get_env( + "NEPS_TRIAL_FILELOCK_TIMEOUT", + parse=lambda e: None if is_nullable(e) else float(e), + default=None, +) + +JOBQUEUE_FILELOCK_POLL = get_env( + "NEPS_JOBQUEUE_FILELOCK_POLL", + parse=float, + default=0.05, +) +JOBQUEUE_FILELOCK_TIMEOUT = get_env( + "NEPS_JOBQUEUE_FILELOCK_TIMEOUT", + parse=lambda e: None if is_nullable(e) else float(e), + default=None, +) + +SEED_SNAPSHOT_FILELOCK_POLL = get_env( + "NEPS_SEED_SNAPSHOT_FILELOCK_POLL", + parse=float, + default=0.05, +) +SEED_SNAPSHOT_FILELOCK_TIMEOUT = get_env( + "NEPS_SEED_SNAPSHOT_FILELOCK_TIMEOUT", + parse=lambda e: None if is_nullable(e) else float(e), + default=None, +) + +OPTIMIZER_INFO_FILELOCK_POLL = get_env( + "NEPS_OPTIMIZER_INFO_FILELOCK_POLL", + parse=float, + default=0.05, +) +OPTIMIZER_INFO_FILELOCK_TIMEOUT = get_env( + "NEPS_OPTIMIZER_INFO_FILELOCK_TIMEOUT", + parse=lambda e: None if is_nullable(e) else float(e), + default=None, +) + +OPTIMIZER_STATE_FILELOCK_POLL = get_env( + "NEPS_OPTIMIZER_STATE_FILELOCK_POLL", + parse=float, + default=0.05, +) +OPTIMIZER_STATE_FILELOCK_TIMEOUT = get_env( + "NEPS_OPTIMIZER_STATE_FILELOCK_TIMEOUT", + parse=lambda e: None if is_nullable(e) else float(e), + default=None, +) + +GLOBAL_ERR_FILELOCK_POLL = get_env( + "NEPS_GLOBAL_ERR_FILELOCK_POLL", + parse=float, + default=0.05, +) +GLOBAL_ERR_FILELOCK_TIMEOUT = get_env( + "NEPS_GLOBAL_ERR_FILELOCK_TIMEOUT", + parse=lambda e: None if is_nullable(e) else float(e), + default=None, +) diff --git a/neps/exceptions.py b/neps/exceptions.py new file mode 100644 index 00000000..597dfb1f --- /dev/null +++ b/neps/exceptions.py @@ -0,0 +1,47 @@ +"""Exceptions for NePS that don't belong in a specific module.""" + +from __future__ import annotations + + +class NePSError(Exception): + """Base class for all NePS exceptions. + + This allows an easier way to catch all NePS exceptions + if we inherit all exceptions from this class. 
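Because every exception above and below derives from `NePSError`, callers can catch all NePS-specific failures with a single handler. A minimal usage sketch, not part of this commit; the pipeline function and search space are made up for illustration:

    import logging

    import neps
    from neps.exceptions import NePSError

    def my_pipeline(lr: float) -> float:
        # Hypothetical objective; anything that returns a loss works here.
        return (lr - 0.01) ** 2

    try:
        neps.run(
            run_pipeline=my_pipeline,
            pipeline_space={"lr": neps.FloatParameter(lower=1e-4, upper=1e-1, log=True)},
            root_directory="results/example",
            max_evaluations_total=5,
        )
    except NePSError as err:
        # Version mismatches, lock failures, missing trials, etc. all land here;
        # unrelated exceptions propagate unchanged.
        logging.exception("NePS run failed: %s", err)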
+ """ + + +class VersionMismatchError(NePSError): + """Raised when the version of a resource does not match the expected version.""" + + +class VersionedResourceAlreadyExistsError(NePSError): + """Raised when a version already exists when trying to create a new versioned + data. + """ + + +class VersionedResourceRemovedError(NePSError): + """Raised when a version already exists when trying to create a new versioned + data. + """ + + +class VersionedResourceDoesNotExistsError(NePSError): + """Raised when a versioned resource does not exist at a location.""" + + +class LockFailedError(NePSError): + """Raised when a lock cannot be acquired.""" + + +class TrialAlreadyExistsError(VersionedResourceAlreadyExistsError): + """Raised when a trial already exists in the store.""" + + +class TrialNotFoundError(VersionedResourceDoesNotExistsError): + """Raised when a trial already exists in the store.""" + + +class WorkerFailedToGetPendingTrialsError(NePSError): + """Raised when a worker failed to get pending trials.""" diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index 0493078a..31cb4c4a 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import partial -from typing import Callable +from typing import Callable, Mapping from .base_optimizer import BaseOptimizer from .bayesian_optimization.cost_cooling import CostCooling @@ -26,7 +26,8 @@ from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution -SearcherMapping: dict[str, Callable] = { +# TODO: Rename Searcher to Optimizer... +SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { "bayesian_optimization": BayesianOptimization, "pibo": partial(BayesianOptimization, disable_priors=False), "cost_cooling_bayesian_optimization": CostCooling, diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index 33f1a804..34804626 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -2,15 +2,21 @@ import logging from abc import abstractmethod -from typing import Any, Iterator, Mapping -from typing_extensions import Self -from contextlib import contextmanager -from pathlib import Path +from typing import Any, Mapping +from dataclasses import asdict, dataclass +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult, RawConfig, ERROR, ResultDict -from neps.utils.files import serialize, deserialize from neps.search_spaces.search_space import SearchSpace from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss +from neps.state.trial import Trial + + +@dataclass +class SampledConfig: + id: Trial.ID + config: Mapping[str, Any] + previous_config_id: Trial.ID | None class BaseOptimizer: @@ -41,10 +47,12 @@ def __init__( self.ignore_errors = ignore_errors @abstractmethod - def load_results( + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: raise NotImplementedError @@ -60,45 +68,125 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: """ raise NotImplementedError - def get_state(self) -> Any: - _state = {"used_budget": self.used_budget} - if self.budget is not None: - # TODO(eddiebergman): Seems like this isn't used anywhere, - # A fuzzy find search for `remaining_budget` shows this as the - # only use point. 
- _state["remaining_budget"] = self.budget - self.used_budget + def ask( + self, + trials: Mapping[str, Trial], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], + ) -> tuple[SampledConfig, dict[str, Any]]: + """Sample a new configuration + + !!! note - return _state + The plan is this method replaces the two-step procedure of `load_optimization_state` + and `get_config_and_ids` in the future, replacing both with a single method `ask` + which would be easier for developer of NePS optimizers to implement. - def load_state(self, state: Any) -> None: - self.used_budget = state["used_budget"] + !!! note - def load_config(self, config_dict: Mapping[str, Any]) -> SearchSpace: - config = self.pipeline_space.clone() - config.load_from(config_dict) - return config + The `optimizer_state` right now is just a `dict` that optimizers are free to mutate + as desired. A `dict` is not ideal as its _stringly_ typed but this was the least + invasive way to add this at the moment. It's actually an existing feature no + optimizer uses except _cost-cooling_ which basically just took a value from + `budget_info`. - def get_loss(self, result: ERROR | ResultDict | float) -> float | Any: + Ideally an optimizer overwriting this can decide what to return instead of having + to rely on them mutating it, however this is the best work-around I could come up with + for now. + + Args: + trials: All of the trials that are known about. + budget_info: information about the budget + optimizer_state: extra state the optimizer would like to keep between calls + + Returns: + SampledConfig: a sampled configuration + dict: state the optimizer would like to keep between calls + """ + completed: dict[Trial.ID, ConfigResult] = {} + pending: dict[Trial.ID, SearchSpace] = {} + for trial_id, trial in trials.items(): + if trial.report is not None: + completed[trial_id] = ConfigResult( + id=trial_id, + config=self.pipeline_space.from_dict(trial.config), + result=trial.report, + # TODO: Better if we could just pass around this metadata + # object instead of converting to a dict each time. + metadata=asdict(trial.metadata), + ) + elif trial.state in ( + Trial.State.PENDING, + Trial.State.SUBMITTED, + Trial.State.EVALUATING, + ): + pending[trial_id] = self.pipeline_space.from_dict(trial.config) + + self.load_optimization_state( + previous_results=completed, + pending_evaluations=pending, + budget_info=budget_info, + optimizer_state=optimizer_state, + ) + config, config_id, previous_config_id = self.get_config_and_ids() + return SampledConfig( + id=config_id, config=config, previous_config_id=previous_config_id + ), optimizer_state + + def update_state_post_evaluation( + self, state: dict[str, Any], report: Trial.Report + ) -> dict[str, Any]: + # TODO: There's a slot in `OptimizerState` to store extra things + # required for the optimizer but is currently not used + # state["key"] = "value" + return state + + def get_loss( + self, result: ERROR | ResultDict | float | Trial.Report + ) -> float | ERROR: """Calls result.utils.get_loss() and passes the error handling through. Please use self.get_loss() instead of get_loss() in all optimizer classes.""" + + # TODO(eddiebergman): This is a forward change for whenever we can have optimizers + # use `Trial` and `Report`, they already take care of this and save having to do this + # `_get_loss` at every call. 
We can also then just use `None` instead of the string `"error"` + if isinstance(result, Trial.Report): + return result.loss if result.loss is not None else "error" + return _get_loss( result, loss_value_on_error=self.loss_value_on_error, ignore_errors=self.ignore_errors, ) - def get_cost(self, result: ERROR | ResultDict | float) -> float | Any: + def get_cost( + self, result: ERROR | ResultDict | float | Trial.Report + ) -> float | ERROR: """Calls result.utils.get_cost() and passes the error handling through. Please use self.get_cost() instead of get_cost() in all optimizer classes.""" + # TODO(eddiebergman): This is a forward change for whenever we can have optimizers + # use `Trial` and `Report`, they already take care of this and save having to do this + # `_get_loss` at every call + if isinstance(result, Trial.Report): + return result.loss if result.loss is not None else "error" + return _get_cost( result, cost_value_on_error=self.cost_value_on_error, ignore_errors=self.ignore_errors, ) - def get_learning_curve(self, result: str | dict | float) -> float | Any: + def get_learning_curve( + self, result: str | dict | float | Trial.Report + ) -> list[float] | Any: """Calls result.utils.get_loss() and passes the error handling through. Please use self.get_loss() instead of get_loss() in all optimizer classes.""" + # TODO(eddiebergman): This is a forward change for whenever we can have optimizers + # use `Trial` and `Report`, they already take care of this and save having to do this + # `_get_loss` at every call + if isinstance(result, Trial.Report): + return result.learning_curve + return _get_learning_curve( result, learning_curve_on_error=self.learning_curve_on_error, @@ -107,17 +195,3 @@ def get_learning_curve(self, result: str | dict | float) -> float | Any: def whoami(self) -> str: return type(self).__name__ - - @contextmanager - def using_state(self, state_file: Path) -> Iterator[Self]: - if state_file.exists(): - optimizer_state = deserialize(state_file) - self.load_state(optimizer_state) - - yield self - - serialize(self.get_state(), path=state_file) - - def is_out_of_budget(self) -> bool: - """Check if the optimizer has used all of its budget, if any.""" - return self.budget is not None and self.used_budget >= self.budget diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py index 8fbc572a..adf47b82 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py @@ -3,9 +3,9 @@ from abc import abstractmethod from typing import TYPE_CHECKING, Sequence, Callable -from neps.utils.types import Array - if TYPE_CHECKING: + import numpy as np + import torch from neps.search_spaces.search_space import SearchSpace @@ -17,16 +17,20 @@ def __init__(self, pipeline_space: SearchSpace, patience: int = 50): self.pipeline_space = pipeline_space self.acquisition_function = None self.x: list[SearchSpace] = [] - self.y: Sequence[float] | Array = [] + self.y: Sequence[float] | np.ndarray | torch.Tensor = [] self.patience = patience @abstractmethod def sample(self, acquisition_function: Callable) -> SearchSpace: raise NotImplementedError - def sample_batch(self, acquisition_function: Callable, batch: int) -> list[SearchSpace]: + def sample_batch( + self, acquisition_function: Callable, batch: int + ) -> list[SearchSpace]: return [self.sample(acquisition_function) for _ in 
range(batch)] - def set_state(self, x: list[SearchSpace], y: Sequence[float] | Array) -> None: + def set_state( + self, x: list[SearchSpace], y: Sequence[float] | np.ndarray | torch.Tensor + ) -> None: self.x = x self.y = y diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py index 869d5e91..4c6b17df 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py @@ -7,11 +7,14 @@ from more_itertools import first from typing_extensions import override -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import AcquisitionSampler -from neps.optimizers.bayesian_optimization.acquisition_samplers.random_sampler import RandomSampler +from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, +) +from neps.optimizers.bayesian_optimization.acquisition_samplers.random_sampler import ( + RandomSampler, +) if TYPE_CHECKING: - from neps.utils.types import Array from neps.search_spaces.search_space import SearchSpace @@ -64,7 +67,9 @@ def __init__( ) @override - def set_state(self, x: list[SearchSpace], y: Sequence[float] | Array) -> None: + def set_state( + self, x: list[SearchSpace], y: Sequence[float] | np.ndarray | torch.Tensor + ) -> None: super().set_state(x, y) self.random_sampling.set_state(x, y) @@ -108,6 +113,7 @@ def create_pool( ][:n_best] seen: set[int] = set() + def _hash(_config: SearchSpace) -> int: return hash(_config.hp_values().values()) diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index ea6dffff..f2878fe9 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -1,7 +1,9 @@ from __future__ import annotations from typing import Any +from typing_extensions import override +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult from neps.utils.common import instance_from_map from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( @@ -9,11 +11,19 @@ ) from neps.search_spaces.search_space import SearchSpace from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import BaseAcquisition -from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import DecayingPriorWeightedAcquisition -from neps.optimizers.bayesian_optimization.acquisition_samplers import AcquisitionSamplerMapping -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import AcquisitionSampler -from neps.optimizers.bayesian_optimization.kernels import get_kernels +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) +from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( + DecayingPriorWeightedAcquisition, +) +from neps.optimizers.bayesian_optimization.acquisition_samplers import ( + AcquisitionSamplerMapping, +) +from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, +) +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels from 
neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization @@ -133,9 +143,9 @@ def __init__( raise ValueError("No kernels are provided!") if "vectorial_features" not in surrogate_model_args: - surrogate_model_args[ - "vectorial_features" - ] = self.pipeline_space.get_vectorial_dim() + surrogate_model_args["vectorial_features"] = ( + self.pipeline_space.get_vectorial_dim() + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, @@ -153,9 +163,9 @@ def __init__( raise ValueError("No kernels are provided!") if "vectorial_features" not in cost_model_args: - cost_model_args[ - "vectorial_features" - ] = self.pipeline_space.get_vectorial_dim() + cost_model_args["vectorial_features"] = ( + self.pipeline_space.get_vectorial_dim() + ) self.cost_model = instance_from_map( SurrogateModelMapping, @@ -184,12 +194,23 @@ def __init__( kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, ) - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: # TODO(Jan): read out cost and fit cost model + if budget_info is None: + raise ValueError( + "Used budget is not set in the optimizer state but is required" + " for cost cooling, please return a `'cost'` when you return results" + " and/or a `max_cost_budget` when running NePS!" + ) + self.used_budget = budget_info.used_cost_budget + train_x = [el.config for el in previous_results.values()] train_y = [self.get_loss(el.result) for el in previous_results.values()] train_cost = [self.get_cost(el.result) for el in previous_results.values()] @@ -214,7 +235,8 @@ def load_results( # TODO: set acquisition state self.acquisition.set_state( self.surrogate_model, - alpha=1 - (self.used_budget / self.budget), + alpha=1 + - (budget_info.used_cost_budget / budget_info.max_cost_budget), cost_model=self.cost_model, ) self.acquisition_sampler.set_state(x=train_x, y=train_y) diff --git a/neps/optimizers/bayesian_optimization/kernels/encoding.py b/neps/optimizers/bayesian_optimization/kernels/encoding.py index 0e7e35df..419b6926 100644 --- a/neps/optimizers/bayesian_optimization/kernels/encoding.py +++ b/neps/optimizers/bayesian_optimization/kernels/encoding.py @@ -65,7 +65,6 @@ def _compute_kernel(self, dist, l=None): return 0.0 if l is None: l = self.lengthscale - # print(dist) return np.exp(-dist / (l**2)) def _compute_dist( @@ -141,10 +140,7 @@ def fit_transform( save_gram_matrix: bool = False, **kwargs, ): - if ( - not rebuild_model - and self._gram is not None - ): + if not rebuild_model and self._gram is not None: return self._gram K = self.forward(*gr, l=l) if save_gram_matrix: @@ -152,9 +148,7 @@ def fit_transform( self._train_x = gr[:] return K - def transform( - self, gr: list, l: float = None, **kwargs - ): + def transform(self, gr: list, l: float = None, **kwargs): if self._gram is None: raise ValueError("The kernel has not been fitted. 
Run fit_transform first") n = len(gr) diff --git a/neps/optimizers/bayesian_optimization/mf_tpe.py b/neps/optimizers/bayesian_optimization/mf_tpe.py index 4c38352e..45e4adc4 100644 --- a/neps/optimizers/bayesian_optimization/mf_tpe.py +++ b/neps/optimizers/bayesian_optimization/mf_tpe.py @@ -2,13 +2,14 @@ import random from copy import deepcopy -from typing import Iterable +from typing import Any, Iterable import numpy as np import torch from scipy.stats import spearmanr -from typing_extensions import Literal +from typing_extensions import Literal, override +from neps.state.optimizer import BudgetInfo, OptimizationState from neps.utils.types import ConfigResult, RawConfig from neps.utils.common import instance_from_map from neps.search_spaces import ( @@ -457,10 +458,13 @@ def is_init_phase(self) -> bool: return False return True - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: # TODO remove doubles from previous results train_y = [self.get_loss(el.result) for el in previous_results.values()] @@ -637,7 +641,6 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: else: config = self.acquisition_sampler.sample(self.acquisition) - print([hp.value for hp in config.hyperparameters.values()]) config.fidelity.set_value(self.rung_map[self.min_rung]) config_id = str(self._num_train_x + len(self._pending_evaluations) + 1) diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py index 06745184..d5145043 100644 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -317,9 +317,7 @@ def __reset_xy( normalize_y: bool = False, normalize_budget: bool = True, ): - self.normalize_budget = ( - normalize_budget - ) + self.normalize_budget = normalize_budget self.normalize_y = normalize_y x_train, train_budgets, learning_curves = self._preprocess_input( @@ -329,12 +327,8 @@ def __reset_xy( y_train = self._preprocess_y(y_train, normalize_y) self.x_train = x_train - self.train_budgets = ( - train_budgets - ) - self.learning_curves = ( - learning_curves - ) + self.train_budgets = train_budgets + self.learning_curves = learning_curves self.y_train = y_train def _preprocess_input( @@ -446,13 +440,11 @@ def __train_model( self.model.train() self.likelihood.train() self.nn.train() - self.optimizer = ( - torch.optim.Adam( - [ - dict({"params": self.model.parameters()}, **optimizer_args), - dict({"params": self.nn.parameters()}, **optimizer_args), - ] - ) + self.optimizer = torch.optim.Adam( + [ + dict({"params": self.model.parameters()}, **optimizer_args), + dict({"params": self.nn.parameters()}, **optimizer_args), + ] ) count_down = patience @@ -547,10 +539,8 @@ def __train_model( # break def set_prediction_learning_curves(self, learning_curves: list[list[float]]): - self.prediction_learning_curves = learning_curves - def predict( self, x: list[SearchSpace], learning_curves: list[list[float]] | None = None ): @@ -642,30 +632,3 @@ def get_state(self) -> dict[str, dict]: def delete_checkpoint(self): self.checkpoint_path.unlink(missing_ok=True) - - -if __name__ == "__main__": - print(torch.version.__version__) - - pipe_space = SearchSpace( - float_=FloatParameter(lower=0.0, upper=5.0), - e=IntegerParameter(lower=0, upper=10, is_fidelity=True), - ) - - configs = 
[pipe_space.sample(ignore_fidelity=False) for _ in range(100)] - - y = np.random.random(100).tolist() - - lcs = [ - np.random.random(size=np.random.randint(low=1, high=50)).tolist() - for _ in range(100) - ] - - deep_gp = DeepGP(pipe_space, neural_network_args={}) - - deep_gp.fit(x_train=configs, learning_curves=lcs, y_train=y) - - means, stds = deep_gp.predict(configs, lcs) - - print(list(zip(means, y))) - print(stds) diff --git a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py index 14f5cda5..a359b937 100644 --- a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py @@ -15,6 +15,10 @@ from ..kernels.vectorial_kernels import Stationary from ..kernels.weisfilerlehman import WeisfilerLehman +import logging + +logger = logging.getLogger(__name__) + # Code for psd_safe_cholesky from gypytorch class _value_context: @@ -69,20 +73,12 @@ def _set_value(cls, float_value, double_value, half_value): if half_value is not None: cls._global_half_value = half_value - def __init__( - self, float=None, double=None, half=None - ): - self._orig_float_value = ( - self.__class__.value() - ) + def __init__(self, float=None, double=None, half=None): + self._orig_float_value = self.__class__.value() self._instance_float_value = float - self._orig_double_value = ( - self.__class__.value() - ) + self._orig_double_value = self.__class__.value() self._instance_double_value = double - self._orig_half_value = ( - self.__class__.value() - ) + self._orig_half_value = self.__class__.value() self._instance_half_value = half def __enter__( @@ -459,7 +455,6 @@ def _optimize_graph_kernels(self, h_: int, lengthscale_): K, self.likelihood, self.gpytorch_kinv ) nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - # print(i, nlml) if nlml < best_nlml: best_nlml = nlml best_subtree_depth = h_i @@ -468,9 +463,7 @@ def _optimize_graph_kernels(self, h_: int, lengthscale_): self.combined_kernel.kernels[0].change_kernel_params( {"h": best_subtree_depth} ) - self.combined_kernel._gram = ( - best_K - ) + self.combined_kernel._gram = best_K def fit(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): self._fit(train_x, train_y, **self.surrogate_model_fit_args) @@ -483,7 +476,8 @@ def _fit( optimizer: str = "adam", wl_subtree_candidates: tuple = tuple(range(5)), wl_lengthscales: tuple = tuple( - np.e**i for i in range(-2, 3) # type: ignore[name-defined] + np.e**i + for i in range(-2, 3) # type: ignore[name-defined] ), optimize_lik: bool = True, max_lik: float = 0.5, @@ -589,7 +583,7 @@ def _fit( nlml = -compute_log_marginal_likelihood(K_i, logDetK, self.y) nlml.backward(create_graph=True) if self.verbose and i % 10 == 0: - print( + logger.info( "Iteration:", i, "/", @@ -646,20 +640,6 @@ def _fit( k.update_hyperparameters(lengthscale=torch.exp(theta_vector)) self.combined_kernel.weights = weights.clone() - if self.verbose: - print("Optimisation summary: ") - print("Optimal NLML: ", nlml) - print("Lengthscales: ", torch.exp(theta_vector)) - try: - print( - "Optimal h: ", - self.domain_kernels[0]._h, - ) - except AttributeError: - pass - print("Weights: ", self.weights) - print("Lik:", self.likelihood) - print("Optimal layer weights", layer_weights) def predict(self, x_configs, preserve_comp_graph: bool = False): """Kriging predictions""" @@ -993,17 +973,12 @@ def _grid_search_wl_kernel( k.change_se_params({"lengthscale": i[1]}) k.change_kernel_params({"h": i[0]}) K = 
k.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - # print(K) K_i, logDetK = compute_pd_inverse(K, lik, gpytorch_kinv) - # print(train_y) nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - # print(i, nlml) if nlml < best_nlml: best_nlml = nlml best_subtree_depth, best_lengthscale = i best_K = torch.clone(K) - # print("h: ", best_subtree_depth, "theta: ", best_lengthscale) - # print(best_subtree_depth) k.change_kernel_params({"h": best_subtree_depth}) if k.se is not None: k.change_se_params({"lengthscale": best_lengthscale}) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 9c9ab5fe..9fc3aeae 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -2,7 +2,9 @@ import random from typing import Any, TYPE_CHECKING, Literal +from typing_extensions import override +from neps.state.optimizer import BudgetInfo, OptimizationState from neps.utils.types import ConfigResult, RawConfig from neps.utils.common import instance_from_map from neps.search_spaces import ( @@ -228,10 +230,13 @@ def is_init_phase(self) -> bool: return False return True - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: train_x = [el.config for el in previous_results.values()] train_y = [self.get_loss(el.result) for el in previous_results.values()] diff --git a/neps/optimizers/grid_search/optimizer.py b/neps/optimizers/grid_search/optimizer.py index fc082df1..4f5ff24e 100644 --- a/neps/optimizers/grid_search/optimizer.py +++ b/neps/optimizers/grid_search/optimizer.py @@ -1,7 +1,10 @@ from __future__ import annotations import random +from typing import Any +from typing_extensions import override +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer @@ -19,10 +22,13 @@ def __init__( ) random.shuffle(self.configs_list) - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: self._num_previous_configs = len(previous_results) + len(pending_evaluations) diff --git a/neps/optimizers/multi_fidelity/_dyhpo.py b/neps/optimizers/multi_fidelity/_dyhpo.py index e61e9d33..da3e36bf 100644 --- a/neps/optimizers/multi_fidelity/_dyhpo.py +++ b/neps/optimizers/multi_fidelity/_dyhpo.py @@ -1,9 +1,11 @@ from __future__ import annotations from typing import Any, List, Union +from typing_extensions import override import numpy as np +from neps.state.optimizer import BudgetInfo, OptimizationState from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer @@ -206,10 +208,13 @@ def is_init_phase(self) -> bool: def num_train_configs(self): return len(self.observed_configs.completed_runs) - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: """This is 
basically the fit method. @@ -386,7 +391,6 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, Union[str, None]]: # if the returned config already observed, # set the fidelity to the next budget level if not max already # else set the fidelity to the minimum budget level - # print(config_condition) else: config = self.pipeline_space.sample( patience=self.patience, user_priors=True, ignore_fidelity=False @@ -402,5 +406,4 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, Union[str, None]]: else 0 ) config_id = f"{_config_id}_{self.get_budget_level(config)}" - # print(self.observed_configs) return config.hp_values(), config_id, None diff --git a/neps/optimizers/multi_fidelity/dyhpo.py b/neps/optimizers/multi_fidelity/dyhpo.py index 7176c2d5..59804637 100755 --- a/neps/optimizers/multi_fidelity/dyhpo.py +++ b/neps/optimizers/multi_fidelity/dyhpo.py @@ -1,9 +1,11 @@ from __future__ import annotations from typing import Any +from typing_extensions import override import numpy as np +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult, RawConfig from neps.utils.common import instance_from_map from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace @@ -282,10 +284,13 @@ def is_init_phase(self, budget_based: bool = True) -> bool: def num_train_configs(self): return len(self.observed_configs.completed_runs) - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: """This is basically the fit method. diff --git a/neps/optimizers/multi_fidelity/hyperband.py b/neps/optimizers/multi_fidelity/hyperband.py index 8823e21a..dde96c56 100644 --- a/neps/optimizers/multi_fidelity/hyperband.py +++ b/neps/optimizers/multi_fidelity/hyperband.py @@ -5,8 +5,9 @@ from typing import Any import numpy as np -from typing_extensions import Literal +from typing_extensions import Literal, override +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import SearchSpace from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -123,12 +124,20 @@ def _handle_promotions(self): # overloaded function disables the need for retrieving promotions for HB overall return - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: - super().load_results(previous_results, pending_evaluations) + super().load_optimization_state( + previous_results=previous_results, + pending_evaluations=pending_evaluations, + budget_info=budget_info, + optimizer_state=optimizer_state + ) # important for the global HB to run the right SH self._update_sh_bracket_state() diff --git a/neps/optimizers/multi_fidelity/successive_halving.py b/neps/optimizers/multi_fidelity/successive_halving.py index 16c93fb0..a936b0a2 100644 --- a/neps/optimizers/multi_fidelity/successive_halving.py +++ b/neps/optimizers/multi_fidelity/successive_halving.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from typing_extensions import Literal +from typing_extensions import Literal, override from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces import ( @@ -16,7 +16,7 @@ ConstantParameter, FloatParameter, 
IntegerParameter, - SearchSpace + SearchSpace, ) from neps.optimizers.base_optimizer import BaseOptimizer from neps.optimizers.multi_fidelity.promotion_policy import ( @@ -31,7 +31,9 @@ CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES = dict(CategoricalParameter.DEFAULT_CONFIDENCE_SCORES) +CUSTOM_CATEGORICAL_CONFIDENCE_SCORES = dict( + CategoricalParameter.DEFAULT_CONFIDENCE_SCORES +) CUSTOM_CATEGORICAL_CONFIDENCE_SCORES.update({"ultra": 8}) @@ -317,18 +319,15 @@ def _fit_models(self): # if adding model-based search to the basic multi-fidelity algorithm return - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: - """This is basically the fit method. - - Args: - previous_results (dict[str, ConfigResult]): [description] - pending_evaluations (dict[str, ConfigResult]): [description] - """ - + """This is basically the fit method.""" self.rung_histories = { rung: {"config": [], "perf": []} for rung in range(self.min_rung, self.max_rung + 1) diff --git a/neps/optimizers/multi_fidelity_prior/async_priorband.py b/neps/optimizers/multi_fidelity_prior/async_priorband.py index 5ab55139..40f6cb29 100644 --- a/neps/optimizers/multi_fidelity_prior/async_priorband.py +++ b/neps/optimizers/multi_fidelity_prior/async_priorband.py @@ -3,8 +3,9 @@ import typing import numpy as np -from typing_extensions import Literal +from typing_extensions import Literal, override +from neps.state.optimizer import BudgetInfo, OptimizationState from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import SearchSpace from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -245,12 +246,20 @@ def _update_sh_bracket_state(self) -> None: bracket.observed_configs = self.observed_configs.copy() bracket.rung_histories = self.rung_histories - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, typing.Any], ) -> None: - super().load_results(previous_results, pending_evaluations) + super().load_optimization_state( + previous_results=previous_results, + pending_evaluations=pending_evaluations, + budget_info=budget_info, + optimizer_state=optimizer_state + ) # important for the global HB to run the right SH self._update_sh_bracket_state() diff --git a/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py b/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py index f0bd46df..845552ea 100644 --- a/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py +++ b/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py @@ -2,7 +2,9 @@ import logging from typing import Any +from typing_extensions import override +from neps.state.optimizer import BudgetInfo, OptimizationState from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import SearchSpace from neps.utils.data_loading import read_tasks_and_dev_stages_from_disk @@ -43,10 +45,13 @@ def calculate_defaults(self): hp_values, delete_previous_defaults=True, delete_previous_values=True ) - def load_results( + @override + def load_optimization_state( self, previous_results: 
dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: self._num_previous_configs = len(previous_results) + len(pending_evaluations) diff --git a/neps/optimizers/random_search/optimizer.py b/neps/optimizers/random_search/optimizer.py index fbd5b7be..5aeaff33 100644 --- a/neps/optimizers/random_search/optimizer.py +++ b/neps/optimizers/random_search/optimizer.py @@ -1,5 +1,8 @@ from __future__ import annotations +from typing import Any +from typing_extensions import override +from neps.state.optimizer import BudgetInfo, OptimizationState from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer @@ -12,10 +15,13 @@ def __init__(self, use_priors=False, ignore_fidelity=True, **optimizer_kwargs): self.use_priors = use_priors self.ignore_fidelity = ignore_fidelity - def load_results( + @override + def load_optimization_state( self, previous_results: dict[str, ConfigResult], pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], ) -> None: self._num_previous_configs = len(previous_results) + len(pending_evaluations) diff --git a/neps/optimizers/regularized_evolution/optimizer.py b/neps/optimizers/regularized_evolution/optimizer.py index 2117a226..0860ba1c 100644 --- a/neps/optimizers/regularized_evolution/optimizer.py +++ b/neps/optimizers/regularized_evolution/optimizer.py @@ -4,12 +4,14 @@ import os import random from pathlib import Path -from typing import Callable +from typing import Any, Callable +from typing_extensions import override import numpy as np import yaml -from neps.utils.types import RawConfig +from neps.state.optimizer import BudgetInfo, OptimizationState +from neps.utils.types import ConfigResult, RawConfig from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer @@ -52,7 +54,14 @@ def __init__( self.assisted_init_population_dir = Path(assisted_init_population_dir) self.assisted_init_population_dir.mkdir(exist_ok=True) - def load_results(self, previous_results: dict, pending_evaluations: dict) -> None: + @override + def load_optimization_state( + self, + previous_results: dict[str, ConfigResult], + pending_evaluations: dict[str, SearchSpace], + budget_info: BudgetInfo | None, + optimizer_state: dict[str, Any], + ) -> None: train_x = [el.config for el in previous_results.values()] train_y = [self.get_loss(el.result) for el in previous_results.values()] self.num_train_x = len(train_x) @@ -68,7 +77,6 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: if len(self.population) < self.population_size: if self.assisted: if 0 == len(os.listdir(self.assisted_init_population_dir)): - print("Generate initial design with assistance") cur_population_size = self.population_size - len(self.population) configs = [ self.pipeline_space.sample( @@ -93,7 +101,6 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: encoding="utf-8", ) as f: yaml.dump(configs[config_idx].serialize(), f) - print("Pick config from pre-computed population") config_yaml = sorted(os.listdir(self.assisted_init_population_dir))[0] with open( self.assisted_init_population_dir / config_yaml, encoding="utf-8" diff --git a/neps/plot/tensorboard_eval.py b/neps/plot/tensorboard_eval.py index 9259952c..e77329b4 100644 --- a/neps/plot/tensorboard_eval.py +++ 
b/neps/plot/tensorboard_eval.py @@ -12,7 +12,7 @@ from torch.utils.tensorboard.summary import hparams from torch.utils.tensorboard.writer import SummaryWriter -from neps.runtime import get_in_progress_trial +from neps.runtime import get_in_progress_trial, get_workers_neps_state from neps.status.status import get_summary_dict from neps.utils.common import get_initial_directory @@ -85,13 +85,19 @@ def _initiate_internal_configurations() -> None: operating on. """ trial = get_in_progress_trial() - assert trial is not None + neps_state = get_workers_neps_state() - # TODO(eddiebergman): We could just save the instance of the trial - # on this object, OR even just use `get_in_process_trial()` in each call directly. - tblogger.config_working_directory = trial.pipeline_dir - tblogger.config_previous_directory = trial.disk.previous_pipeline_dir - tblogger.optimizer_dir = trial.disk.optimization_dir.parent + # We are assuming that neps state is all filebased here + root_dir = Path(neps_state.location) + assert root_dir.exists() + + tblogger.config_working_directory = Path(trial.metadata.location) + tblogger.config_previous_directory = ( + Path(trial.metadata.previous_trial_location) + if trial.metadata.previous_trial_location is not None + else None + ) + tblogger.optimizer_dir = root_dir tblogger.config = trial.config @staticmethod diff --git a/neps/runtime.py b/neps/runtime.py index 6d76ef28..c9988f70 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -1,1042 +1,534 @@ -"""Module for the runtime of a single instance of NePS running. - -An important advantage of NePS with a running instance per worker and no -multiprocessing is that we can reliably use globals to store information such -as the currently running configuration, without interfering with other -workers which have launched. - -This allows us to have a global `Trial` object which can be accessed -using `import neps.runtime; neps.get_in_progress_trial()`. - ---- - -This module primarily handles the worker loop where important concepts are: -* **State**: The state of optimization is all of the configurations, their results and - the current state of the optimizer. -* **Shared State**: Whenever a worker wishes to read or write any state, they will _lock_ -the shared state, declaring themselves as operating on it. At this point, no other worker -can access the shared state. -* **Optimizer Hydration**: This is the process through which an optimizer instance is -_hydrated_ with the Shared State so it can make a decision, i.e. for sampling. -Equally we _serialize_ the optimizer when writing it back to Shared State -* **Trial Lock**: When evaluating a configuration, a worker must _lock_ it to declared -itself as evaluating it. This communicates to other workers that this configuration is -in progress. - -### Loop -We mark lines with `+` as the worker having locked the Shared State and `~` as the worker -having locked the Trial. The trial lock `~` is allowed to fail, in which case all steps -with a `~` are skipped and the loop continues. - -1. + Check exit conditions -2. + Hydrate the optimizer -3. + Sample a new Trial -3. Unlock the Shared State -4. ~ Obtain a Trial Lock -5. ~ Set the global trial for this work to the current trial -6. ~ Evaluate the trial -7. ~+ Lock the shared state -8. ~+ Write the results of the config to disk -9. ~+ Update the optimizer if required (used budget for evaluating trial) -10. ~ Unlock the shared state -11. 
Unlock Trial Lock -""" +"""TODO.""" from __future__ import annotations -import inspect +import datetime import logging import os import shutil import time -import traceback -import warnings from contextlib import contextmanager -from dataclasses import dataclass, field -from enum import Enum +from dataclasses import dataclass from pathlib import Path from typing import ( TYPE_CHECKING, Any, Callable, + Generic, Iterable, Iterator, + Literal, Mapping, - Union, + TypeVar, ) -from typing_extensions import Self, TypeAlias - -import numpy as np - -from neps.utils._locker import Locker -from neps.utils._rng import SeedState -from neps.utils.files import deserialize, empty_file, serialize -from neps.utils.types import ( - ERROR, - ConfigID, - ConfigResult, - RawConfig, + +from neps.exceptions import ( + NePSError, + VersionMismatchError, + WorkerFailedToGetPendingTrialsError, ) +from neps.state._eval import evaluate_trial +from neps.state.filebased import create_or_load_filebased_neps_state +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo +from neps.state.settings import DefaultReportValues, OnErrorPossibilities, WorkerSettings +from neps.state.trial import Trial if TYPE_CHECKING: from neps.optimizers.base_optimizer import BaseOptimizer - from neps.search_spaces.search_space import SearchSpace + from neps.state.neps_state import NePSState logger = logging.getLogger(__name__) -# Wait time between each successive poll to see if state can be grabbed -DEFAULT_STATE_POLL: float = 0.1 -ENVIRON_STATE_POLL_KEY = "NEPS_STATE_POLL" -# Timeout before giving up on trying to grab the state, raising an error -DEFAULT_STATE_TIMEOUT: float | None = None -ENVIRON_STATE_TIMEOUT_KEY = "NEPS_STATE_TIMEOUT" +def _default_worker_name() -> str: + isoformat = datetime.datetime.now(datetime.timezone.utc).isoformat() + return f"{os.getpid()}-{isoformat}" -# TODO(eddiebergman): We should not do this... -warnings.simplefilter("always", DeprecationWarning) +N_FAILED_GET_NEXT_PENDING_ATTEMPTS_BEFORE_ERROR = 10 +N_FAILED_TO_SET_TRIAL_STATE = 10 +Loc = TypeVar("Loc") -# NOTE: As each NEPS process is only ever evaluating a single trial, -# this global can be retrieved in NePS and refers to what this process -# is currently evaluating. -_CURRENTLY_RUNNING_TRIAL_IN_PROCESS: Trial | None = None +# NOTE: As each NEPS process is only ever evaluating a single trial, this global can +# be retrieved in NePS and refers to what this process is currently evaluating. +# Note that before `_set_in_progress_trial` is called, this should be cleared +# with `_clear_in_progress_trial` to ensure that we are not in some erroneuous state. +# Prefer to call `_clear_in_progress_trial` after a trial has finished evaluating and +# not just before `_set_in_progress_trial`, as the latter defeats the purpose of this +# assertion. +_CURRENTLY_RUNNING_TRIAL_IN_PROCESS: Trial | None = None +_WORKER_NEPS_STATE: NePSState | None = None -def get_in_progress_trial() -> Trial | None: - """Get the currently running trial in this process.""" - return _CURRENTLY_RUNNING_TRIAL_IN_PROCESS +# TODO: This only works with a filebased nepsstate +def get_workers_neps_state() -> NePSState[Path]: + """Get the worker's NePS state.""" + if _WORKER_NEPS_STATE is None: + raise RuntimeError( + "The worker's NePS state has not been set! This should only be called" + " from within a `run_pipeline` context. 
If you are not running a pipeline" + " and you did not call this function (`get_workers_neps_state`) yourself," + " this is a bug and should be reported to NePS." + ) + return _WORKER_NEPS_STATE -def _set_in_progress_trial(trial: Trial | None) -> None: - global _CURRENTLY_RUNNING_TRIAL_IN_PROCESS # noqa: PLW0603 - _CURRENTLY_RUNNING_TRIAL_IN_PROCESS = trial +def _set_workers_neps_state(state: NePSState[Path]) -> None: + global _WORKER_NEPS_STATE # noqa: PLW0603 + _WORKER_NEPS_STATE = state -def get_shared_state_poll_and_timeout() -> tuple[float, float | None]: - """Get the poll and timeout for the shared state.""" - poll = float(os.environ.get(ENVIRON_STATE_POLL_KEY, DEFAULT_STATE_POLL)) - timeout = os.environ.get(ENVIRON_STATE_TIMEOUT_KEY, DEFAULT_STATE_TIMEOUT) - timeout = float(timeout) if timeout is not None else None - return poll, timeout +def get_in_progress_trial() -> Trial: + """Get the currently running trial in this process.""" + if _CURRENTLY_RUNNING_TRIAL_IN_PROCESS is None: + raise RuntimeError( + "The worker's NePS state has not been set! This should only be called" + " from within a `run_pipeline` context. If you are not running a pipeline" + " and you did not call this function (`get_workers_neps_state`) yourself," + " this is a bug and should be reported to NePS." + ) + return _CURRENTLY_RUNNING_TRIAL_IN_PROCESS -@dataclass -class SuccessReport: - """A successful report of the evaluation of a configuration.""" - loss: float - cost: float | None - account_for_cost: bool - results: Mapping[str, Any] +@contextmanager +def _set_global_trial(trial: Trial) -> Iterator[None]: + global _CURRENTLY_RUNNING_TRIAL_IN_PROCESS # noqa: PLW0603 + if _CURRENTLY_RUNNING_TRIAL_IN_PROCESS is not None: + raise NePSError( + "A trial was already set to run in this process, yet some other trial was" + " attempted to be set as the global trial in progress." + " We assume that each process will only ever have one trial running at a time" + " to allow functionality like `neps.get_in_progress_trial()`," + " `load_checkpoint()` and `save_checkpoint()` to work." + "\n\nThis is most likely a bug and should be reported to NePS!" + ) + _CURRENTLY_RUNNING_TRIAL_IN_PROCESS = trial + yield + _CURRENTLY_RUNNING_TRIAL_IN_PROCESS = None +# NOTE: This class is quite stateful and has been split up quite a bit to make testing +# interleaving of workers easier. This comes at the cost of more fragmented code. @dataclass -class ErrorReport: - """A failed report of the evaluation of a configuration.""" - - err: Exception - tb: str | None - loss: float | None - cost: float | None - account_for_cost: bool - results: Mapping[str, Any] +class DefaultWorker(Generic[Loc]): + """A default worker for the NePS system. - -Report: TypeAlias = Union[SuccessReport, ErrorReport] - - -@dataclass -class Trial: - """A trial is a configuration and it's associated data. - - The object is considered mutable and the global trial currently being - evaluated can be access using `get_in_progress_trial()`. - - Attributes: - id: Unique identifier for the configuration - config: The configuration to evaluate - pipeline_dir: Directory where the configuration is evaluated - previous: The previous trial before this trial. - time_sampled: The time the configuration was sampled - metadata: Additional metadata about the configuration + This is the worker that is used by default in the neps.run() loop. 
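+
+    A rough usage sketch (illustrative only; `neps.run()` normally builds all of this
+    for you via `_launch_runtime`, and the `neps_state`, `optimizer`, `settings` and
+    `run_pipeline` objects below are assumed to exist):
+
+        worker = DefaultWorker.new(
+            state=neps_state,            # a NePSState, e.g. a file-based one
+            optimizer=optimizer,         # a BaseOptimizer instance
+            settings=settings,           # WorkerSettings with the stopping criteria
+            evaluation_fn=run_pipeline,  # the user's evaluation function
+        )
+        worker.run()                     # loops until a stopping criterion is met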
""" - id: ConfigID - config: Mapping[str, Any] - pipeline_dir: Path - previous: Trial | None - report: Report | None - time_sampled: float - metadata: dict[str, Any] - _lock: Locker = field(init=False) - disk: Trial.Disk = field(init=False) - - def to_config_result( - self, - config_to_search_space: Callable[[RawConfig], SearchSpace], - ) -> ConfigResult: - """Convert the report to a `ConfigResult` object.""" - result: ERROR | Mapping[str, Any] = ( - "error" - if self.report is None or isinstance(self.report, ErrorReport) - else self.report.results - ) - return ConfigResult( - self.id, - config=config_to_search_space(self.config), - result=result, - metadata=self.metadata, - ) + state: NePSState + """The state of the NePS system.""" - class State(Enum): - """The state of a trial.""" + settings: WorkerSettings + """The settings for the worker.""" - PENDING = "pending" - IN_PROGRESS = "in_progress" - SUCCESS = "success" - ERROR = "error" - CORRUPTED = "corrupted" + evaluation_fn: Callable[..., float | Mapping[str, Any]] + """The evaluation function to use for the worker.""" - def __post_init__(self) -> None: - if "time_sampled" not in self.metadata: - self.metadata["time_sampled"] = self.time_sampled - self.pipeline_dir.mkdir(exist_ok=True, parents=True) - self._lock = Locker(self.pipeline_dir / ".config_lock") - self.disk = Trial.Disk(pipeline_dir=self.pipeline_dir) + optimizer: BaseOptimizer + """The optimizer that is in use by the worker.""" - @property - def config_file(self) -> Path: - """The path to the configuration file.""" - return self.pipeline_dir / "config.yaml" + worker_id: str + """The id of the worker.""" - @property - def metadata_file(self) -> Path: - """The path to the metadata file.""" - return self.pipeline_dir / "metadata.yaml" + _pre_sample_hooks: list[Callable[[BaseOptimizer], BaseOptimizer]] | None = None + """Hooks to run before sampling a new trial.""" - @classmethod - def from_dir(cls, pipeline_dir: Path, *, previous: Trial | None = None) -> Self: - """Create a `Trial` object from a directory. + worker_cumulative_eval_count: int = 0 + """The number of evaluations done by this worker.""" - Args: - pipeline_dir: The directory where the trial is stored - previous: The previous trial before this trial. - You can use this to prevent loading the previous trial from disk, - if it exists, i.e. a caching shortcut. + worker_cumulative_eval_cost: float = 0.0 + """The cost of the evaluations done by this worker.""" - Returns: - The trial object. - """ - return cls.from_disk( - Trial.Disk.from_dir(pipeline_dir), - previous=previous, - ) + worker_cumulative_evaluation_time_seconds: float = 0.0 + """The time spent evaluating configurations by this worker.""" @classmethod - def from_disk(cls, disk: Trial.Disk, *, previous: Trial | None = None) -> Self: - """Create a `Trial` object from a disk. - - Args: - disk: The disk information of the trial. - previous: The previous trial before this trial. - You can use this to prevent loading the previous trial from disk, - if it exists, i.e. a caching shortcut. - - Returns: - The trial object. 
- """ - try: - config = deserialize(disk.config_file) - except Exception as e: - logger.error( - f"Error loading config from {disk.config_file}: {e}", - exc_info=True, - ) - config = {} - - try: - metadata = deserialize(disk.metadata_file) - time_sampled = metadata["time_sampled"] - except Exception as e: - logger.error( - f"Error loading metadata from {disk.metadata_file}: {e}", - exc_info=True, - ) - metadata = {} - time_sampled = float("nan") - - try: - result: dict[str, Any] | tuple[Exception, str | None] | None - report: Report | None - if not empty_file(disk.result_file): - result = deserialize(disk.result_file) - - assert isinstance(result, dict) - report = SuccessReport( - loss=result["loss"], - cost=result.get("cost", None), - account_for_cost=result.get("account_for_cost", True), - results=result, - ) - elif not empty_file(disk.error_file): - error_tb = deserialize(disk.error_file) - result = deserialize(disk.result_file) - report = ErrorReport( - # NOTE: Not sure we can easily get the original exception type, - # once serialized - err=Exception(error_tb["err"]), - tb=error_tb.get("tb"), - loss=result.get("loss", None), - cost=result.get("cost", None), - account_for_cost=result.get("account_for_cost", True), - results=result, - ) - else: - report = None - except Exception as e: - logger.error( - f"Error loading result from {disk.result_file}: {e}", - exc_info=True, - ) - report = None - - try: - if previous is None and disk.previous_pipeline_dir is not None: - previous = Trial.from_dir(disk.previous_pipeline_dir) - except Exception as e: - logger.error( - f"Error loading previous from {disk.previous_pipeline_dir}: {e}", - exc_info=True, - ) - previous = None - - return cls( - id=disk.config_id, - config=config, - pipeline_dir=disk.pipeline_dir, - report=report, - previous=previous, - time_sampled=time_sampled, - metadata=metadata, + def new( + cls, + *, + state: NePSState, + optimizer: BaseOptimizer, + settings: WorkerSettings, + evaluation_fn: Callable[..., float | Mapping[str, Any]], + _pre_sample_hooks: list[Callable[[BaseOptimizer], BaseOptimizer]] | None = None, + worker_id: str | None = None, + ) -> DefaultWorker: + """Create a new worker.""" + return DefaultWorker( + state=state, + optimizer=optimizer, + settings=settings, + evaluation_fn=evaluation_fn, + worker_id=worker_id if worker_id is not None else _default_worker_name(), + _pre_sample_hooks=_pre_sample_hooks, ) - @property - def previous_config_id_file(self) -> Path: - """The path to the previous configuration id file.""" - return self.pipeline_dir / "previous_config.id" - - def create_error_report(self, err: Exception, tb: str | None = None) -> ErrorReport: - """Create a [`Report`][neps.runtime.Report] object with an error.""" - # TODO(eddiebergman): For now we assume the loss and cost for an error is None - # and that we don't account for cost and there are no possible results. 
- return ErrorReport( - loss=None, - cost=None, - account_for_cost=False, - results={}, - err=err, - tb=tb, - ) + def _get_next_trial_from_state(self) -> Trial: + nxt_trial = self.state.get_next_pending_trial() - def create_success_report(self, result: float | Mapping[str, Any]) -> SuccessReport: - """Check if the trial has succeeded.""" - _result: dict[str, Any] = {} - if isinstance(result, Mapping): - if "loss" not in result: - raise KeyError("The 'loss' should be provided in the evaluation result") + # If we have a trial, we will use it + if nxt_trial is not None: + logger.info( + f"Worker '{self.worker_id}' got previosly sampled trial: {nxt_trial}" + ) - _result = dict(result) - loss = _result["loss"] + # Otherwise sample a new one else: - loss = result - - try: - _result["loss"] = float(loss) - except (TypeError, ValueError) as e: - raise ValueError( - "The evaluation result should be a dictionnary or a float but got" - f" a `{type(loss)}` with value of {loss}", - ) from e - - # TODO(eddiebergman): For now we have no access to the cost for crash - # so we just set it to None. - _cost: float | None = _result.get("cost", None) - if _cost is not None: - try: - _result["cost"] = float(_cost) - except (TypeError, ValueError) as e: - raise ValueError( - "The evaluation result should be a dictionnary or a float but got" - f" a `{type(_cost)}` with value of {_cost}", - ) from e - - # TODO(eddiebergman): Should probably be a global user setting for this. - _account_for_cost = _result.get("account_for_cost", True) - - return SuccessReport( - loss=_result["loss"], - cost=_cost, - account_for_cost=_account_for_cost, - results=_result, - ) - - @dataclass - class Disk: - """The disk information of a trial. - - Attributes: - pipeline_dir: The directory where the trial is stored - id: The unique identifier of the trial - config_file: The path to the configuration file - result_file: The path to the result file - metadata_file: The path to the metadata file - optimization_dir: The directory from which optimization is running - previous_config_id_file: The path to the previous config id file - previous_pipeline_dir: The directory of the previous configuration - lock: The lock for the trial. Obtaining this lock indicates the worker - is evaluating this trial. - """ - - pipeline_dir: Path - - config_id: ConfigID = field(init=False) - config_file: Path = field(init=False) - result_file: Path = field(init=False) - error_file: Path = field(init=False) - metadata_file: Path = field(init=False) - optimization_dir: Path = field(init=False) - previous_config_id_file: Path = field(init=False) - previous_config_id: ConfigID | None = field(init=False) - previous_pipeline_dir: Path | None = field(init=False) - lock: Locker = field(init=False) - - def __post_init__(self) -> None: - self.config_id = self.pipeline_dir.name[len("config_") :] - self.config_file = self.pipeline_dir / "config.yaml" - self.result_file = self.pipeline_dir / "result.yaml" - self.error_file = self.pipeline_dir / "error.yaml" - self.metadata_file = self.pipeline_dir / "metadata.yaml" - - # NOTE: This is a bit of an assumption! 
- self.optimization_dir = self.pipeline_dir.parent - - self.previous_config_id_file = self.pipeline_dir / "previous_config.id" - if not empty_file(self.previous_config_id_file): - with self.previous_config_id_file.open("r") as f: - self.previous_config_id = f.read().strip() - - self.previous_pipeline_dir = ( - self.pipeline_dir.parent / f"config_{self.previous_config_id}" - ) - else: - self.previous_pipeline_dir = None - self.previous_config_id = None - - self.pipeline_dir.mkdir(exist_ok=True, parents=True) - self.lock = Locker(self.pipeline_dir / ".config_lock") - - def raw_config(self) -> dict[str, Any]: - """Deserialize the configuration from disk.""" - return deserialize(self.config_file) - - def state(self) -> Trial.State: # noqa: PLR0911 - """The state of the trial.""" - result_file_exists = not empty_file(self.result_file) - error_file_exists = not empty_file(self.error_file) - config_file_exists = not empty_file(self.config_file) - - # NOTE: We don't handle the case where it's locked and there is a result - # or error file existing, namely as this might introduce a race condition, - # where the result/error is being written to while the lock still exists. - - if error_file_exists: - # Should not have a results file if there is an error file - if result_file_exists: - return Trial.State.CORRUPTED - - # Should have a config file if there is an error file - if not config_file_exists: - return Trial.State.CORRUPTED - - return Trial.State.ERROR - - if result_file_exists: - # Should have a config file if there is a results file - if not config_file_exists: - return Trial.State.CORRUPTED - - return Trial.State.SUCCESS - - if self.lock.is_locked(): - # Should have a config to evaluate if it's locked - if not config_file_exists: - return Trial.State.CORRUPTED - - return Trial.State.IN_PROGRESS - - return Trial.State.PENDING - - @classmethod - def from_dir(cls, pipeline_dir: Path) -> Trial.Disk: - """Create a `Trial.Disk` object from a directory.""" - return cls(pipeline_dir=pipeline_dir) - - -@dataclass -class StatePaths: - """The paths used for the state of the optimization process. - - Most important method is [`config_dir`][neps.runtime.StatePaths.config_dir], - which gives the directory to use for a configuration. - - Attributes: - root: The root directory of the optimization process. - create_dirs: Whether to create the directories if they do not exist. - optimizer_state_file: The path to the optimizer state file. - optimizer_info_file: The path to the optimizer info file. - seed_state_dir: The directory where the seed state is stored. - results_dir: The directory where results are stored. - """ - - root: Path - create_dirs: bool = False - - optimizer_state_file: Path = field(init=False) - optimizer_info_file: Path = field(init=False) - seed_state_dir: Path = field(init=False) - results_dir: Path = field(init=False) - - def __post_init__(self) -> None: - if self.create_dirs: - self.root.mkdir(parents=True, exist_ok=True) - - self.results_dir = self.root / "results" - - if self.create_dirs: - self.results_dir.mkdir(exist_ok=True) - - self.optimizer_state_file = self.root / ".optimizer_state.yaml" - self.optimizer_info_file = self.root / ".optimizer_info.yaml" - self.seed_state_dir = self.root / ".seed_state" - - def config_dir(self, config_id: ConfigID) -> Path: - """Get the directory for a configuration.""" - return self.results_dir / f"config_{config_id}" - - -@dataclass -class SharedState: - """The shared state of the optimization process that workers communicate through. 
- - Attributes: - base_dir: The base directory from which the optimization is running. - create_dirs: Whether to create the directories if they do not exist. - lock: The lock to signify that a worker is operating on the shared state. - optimizer_state_file: The path to the optimizers state. - optimizer_info_file: The path to the file containing information about the - optimizer's setup. - seed_state_dir: Directory where the seed state is stored. - results_dir: Directory where results for configurations are stored. - """ - - base_dir: Path - paths: StatePaths = field(init=False) - create_dirs: bool = False - lock: Locker = field(init=False) - - trials: dict[ConfigID, tuple[Trial, Trial.State]] = field(default_factory=dict) - """Mapping from a configid to the trial and it's last known state, including if - it's been evaluated.""" - - def __post_init__(self) -> None: - self.paths = StatePaths(root=self.base_dir, create_dirs=self.create_dirs) - self.lock = Locker(self.base_dir / ".decision_lock") + nxt_trial = self.state.sample_trial( + worker_id=self.worker_id, + optimizer=self.optimizer, + _sample_hooks=self._pre_sample_hooks, + ) + logger.info(f"Worker '{self.worker_id}' sampled a new trial: {nxt_trial}") - def trials_by_state(self) -> dict[Trial.State, list[Trial]]: - """Get the trials grouped by their state.""" - _dict: dict[Trial.State, list[Trial]] = {state: [] for state in Trial.State} - for trial, state in self.trials.values(): - _dict[state].append(trial) - return _dict + return nxt_trial - def check_optimizer_info_on_disk_matches( + def _check_if_should_stop( # noqa: C901, PLR0912, PLR0911 self, - optimizer_info: dict[str, Any], *, - excluded_keys: Iterable[str] = ("searcher_name",), - ) -> None: - """Sanity check that the provided info matches the one on disk (if any). - - Args: - optimizer_info: The optimizer info to check. - excluded_keys: Any keys to exclude during the comparison. - - Raises: - ValueError: If there is optimizer info on disk and it does not match the - provided info. - """ - optimizer_info_copy = optimizer_info.copy() - loaded_info = deserialize(self.paths.optimizer_info_file) - - for key in excluded_keys: - optimizer_info_copy.pop(key, None) - loaded_info.pop(key, None) - - if optimizer_info_copy != loaded_info: - raise ValueError( - f"The sampler_info in the file {self.paths.optimizer_info_file} is not" - f" valid. Expected: {optimizer_info_copy}, Found: {loaded_info}", + time_monotonic_start: float, + error_from_this_worker: Exception | None, + ) -> str | Literal[False]: + # NOTE: Sorry this code is kind of ugly but it's pretty straightforward, just a + # lot of conditional checking and making sure to check cheaper conditions first. + # It would look a little nicer with a match statement but we've got to wait + # for python 3.10 for that. + + # First check for stopping criterion for this worker in particular as it's + # cheaper and doesn't require anything from the state. + if error_from_this_worker and self.settings.on_error in ( + OnErrorPossibilities.RAISE_WORKER_ERROR, + OnErrorPossibilities.RAISE_ANY_ERROR, + OnErrorPossibilities.STOP_WORKER_ERROR, + OnErrorPossibilities.STOP_ANY_ERROR, + ): + if self.settings.on_error in ( + OnErrorPossibilities.RAISE_WORKER_ERROR, + OnErrorPossibilities.RAISE_ANY_ERROR, + ): + raise error_from_this_worker + return ( + "Error occurred while evaluating a configuration with this worker and" + f" the worker is set to stop with {self.settings.on_error}." 
) - @contextmanager - def use_sampler( - self, - sampler: BaseOptimizer, - *, - serialize_seed: bool = True, - ) -> Iterator[BaseOptimizer]: - """Use the sampler with the shared state.""" - if serialize_seed: - with SeedState.use(self.paths.seed_state_dir), sampler.using_state( - self.paths.optimizer_state_file - ): - yield sampler - else: - with sampler.using_state(self.paths.optimizer_state_file): - yield sampler - - def update_from_disk(self) -> None: - """Update the shared state from disk.""" - trial_dirs = (p for p in self.paths.results_dir.iterdir() if p.is_dir()) - _disks = [Trial.Disk.from_dir(p) for p in trial_dirs] - _disk_lookup = {disk.config_id: disk for disk in _disks} - - # NOTE: We sort all trials such that we process previous trials first, i.e. - # if trial_3 has trial_2 as previous, we process trial_2 first, which - # requires trial_1 to have been processed first. - def _depth(trial: Trial.Disk) -> int: - depth = 0 - previous = trial.previous_config_id - while previous is not None: - depth += 1 - previous_trial = _disk_lookup.get(previous) - if previous_trial is None: - raise RuntimeError( - "Previous trial not found on disk when processing a trial." - " This should not happen as if a tria has a previous trial," - " then it should be present and evaluated on disk.", - ) - previous = previous_trial.previous_config_id + if ( + self.settings.max_evaluations_for_worker is not None + and self.worker_cumulative_eval_count + >= self.settings.max_evaluations_for_worker + ): + return ( + "Worker has reached the maximum number of evaluations it is allowed to do" + f" as given by `{self.settings.max_evaluations_for_worker=}`." + "\nTo allow more evaluations, increase this value or use a different" + " stopping criterion." + ) - return depth + if ( + self.settings.max_cost_for_worker is not None + and self.worker_cumulative_eval_cost >= self.settings.max_cost_for_worker + ): + return ( + "Worker has reached the maximum cost it is allowed to spend" + f" which is given by `{self.settings.max_cost_for_worker=}`." + f" This worker has spend '{self.worker_cumulative_eval_cost}'." + "\n To allow more evaluations, increase this value or use a different" + " stopping criterion." + ) - # This allows is to traverse linearly and used cached values of previous - # trial data loading, as done below. - _disks.sort(key=_depth) + if self.settings.max_wallclock_time_for_worker_seconds is not None and ( + time.monotonic() - time_monotonic_start + >= self.settings.max_wallclock_time_for_worker_seconds + ): + return ( + "Worker has reached the maximum wallclock time it is allowed to spend" + f", given by `{self.settings.max_wallclock_time_for_worker_seconds=}`." + ) - for disk in _disks: - config_id = disk.config_id - state = disk.state() + if self.settings.max_evaluation_time_for_worker_seconds is not None and ( + self.worker_cumulative_evaluation_time_seconds + >= self.settings.max_evaluation_time_for_worker_seconds + ): + return ( + "Worker has reached the maximum evaluation time it is allowed to spend" + f", given by `{self.settings.max_evaluation_time_for_worker_seconds=}`." + ) - if state is Trial.State.CORRUPTED: - logger.warning(f"Trial {config_id} was corrupted somehow!") + # We check this global error stopping criterion as it's much + # cheaper than sweeping the state from all trials. 
+ if self.settings.on_error in ( + OnErrorPossibilities.RAISE_ANY_ERROR, + OnErrorPossibilities.STOP_ANY_ERROR, + ): + err = self.state._shared_errors.synced().latest_err_as_raisable() + if err is not None: + if self.settings.on_error == OnErrorPossibilities.RAISE_ANY_ERROR: + raise err + + return ( + "An error occurred in another worker and this worker is set to stop" + f" with {self.settings.on_error}." + "\n To allow more evaluations, use a different stopping criterion." + ) - previous: Trial | None = None - if disk.previous_config_id is not None: - previous, _ = self.trials.get(disk.previous_config_id, (None, None)) - if previous is None: - raise RuntimeError( - "Previous trial not found in memory when processing a trial." - " This should not happen as if a trial has a previous trial," - " then it should be present and evaluated in memory.", - ) + # If there are no global stopping criterion, we can no just return early. + if ( + self.settings.max_evaluations_total is None + and self.settings.max_cost_total is None + and self.settings.max_evaluation_time_total_seconds is None + ): + return False + + # At this point, if we have some global stopping criterion, we need to sweep + # the current state of trials to determine if we should stop + # NOTE: If these `sum` turn out to somehow be a bottleneck, these could + # be precomputed and accumulated over time. This would have to be handled + # in the `NePSState` class. + trials = self.state.get_all_trials() + if self.settings.max_evaluations_total is not None: + if self.settings.include_in_progress_evaluations_towards_maximum: + count = sum( + 1 + for _, trial in trials.items() + if trial.report is not None + or trial.state in (Trial.State.EVALUATING, Trial.State.SUBMITTED) + ) + else: + count = sum(1 for _, trial in trials.items() if trial.report is not None) + + if count >= self.settings.max_evaluations_total: + return ( + "The total number of evaluations has reached the maximum allowed of" + f" `{self.settings.max_evaluations_total=}`." + " To allow more evaluations, increase this value or use a different" + " stopping criterion." + ) - cached_trial = self.trials.get(config_id, None) - - # If not currently cached or it was and had a state change - if cached_trial is None or cached_trial[1] != state: - trial = Trial.from_disk(disk, previous=previous) - self.trials[config_id] = (trial, state) - - @contextmanager - def sync(self, *, lock: bool = True) -> Iterator[None]: - """Sync up with what's on disk.""" - if lock: - _poll, _timeout = get_shared_state_poll_and_timeout() - with self.lock(poll=_poll, timeout=_timeout): - self.update_from_disk() - yield - else: - yield + if self.settings.max_cost_total is not None: + cost = sum( + trial.report.cost + for _, trial in trials.items() + if trial.report is not None and trial.report.cost is not None + ) + if cost >= self.settings.max_cost_total: + return ( + f"The maximum cost `{self.settings.max_cost_total=}` has been" + " reached by all of the evaluated trials. To allow more evaluations," + " increase this value or use a different stopping criterion." + ) + if self.settings.max_evaluation_time_total_seconds is not None: + time_spent = sum( + trial.report.evaluation_duration + for _, trial in trials.items() + if trial.report is not None + if trial.report.evaluation_duration is not None + ) + if time_spent >= self.settings.max_evaluation_time_total_seconds: + return ( + "The maximum evaluation time of" + f" `{self.settings.max_evaluation_time_total_seconds=}` has been" + " reached. 
To allow more evaluations, increase this value or use" + " a different stopping criterion." + ) -def _evaluate_config( - trial: Trial, - evaluation_fn: Callable[..., float | Mapping[str, Any]], - logger: logging.Logger, -) -> float | Mapping[str, Any]: - config = trial.config - config_id = trial.id - pipeline_directory = trial.pipeline_dir - previous_pipeline_directory = ( - None if trial.previous is None else trial.previous.pipeline_dir - ) + return False - logger.info(f"Start evaluating config {config_id}") + def run(self) -> None: # noqa: C901, PLR0915 + """Run the worker. - # If pipeline_directory and previous_pipeline_directory are included in the - # signature we supply their values, otherwise we simply do nothing. - directory_params: list[Path | None] = [] + Will keep running until one of the criterion defined by the `WorkerSettings` + is met. + """ + _set_workers_neps_state(self.state) - evaluation_fn_params = inspect.signature(evaluation_fn).parameters - if "pipeline_directory" in evaluation_fn_params: - directory_params.append(pipeline_directory) - if "previous_pipeline_directory" in evaluation_fn_params: - directory_params.append(previous_pipeline_directory) + logger.info("Launching NePS") - return evaluation_fn(*directory_params, **config) + _time_monotonic_start = time.monotonic() + _error_from_evaluation: Exception | None = None + _repeated_fail_get_next_trial_count = 0 + while True: + # NOTE: We rely on this function to do logging and raising errors if it should + should_stop = self._check_if_should_stop( + time_monotonic_start=_time_monotonic_start, + error_from_this_worker=_error_from_evaluation, + ) + if should_stop is not False: + logger.info(should_stop) + break -def _worker_should_continue( - max_evaluations_total: int | None, - *, - n_inprogress: int, - n_evaluated: int, - continue_until_max_evaluation_completed: bool, -) -> bool: - # Check if we have reached the total amount of configurations to evaluated - # (including pending evaluations possibly) - if max_evaluations_total is None: - return True - - n_counter = ( - n_evaluated - if continue_until_max_evaluation_completed - else n_evaluated + n_inprogress - ) - return n_counter < max_evaluations_total + try: + trial_to_eval = self._get_next_trial_from_state() + _repeated_fail_get_next_trial_count = 0 + except Exception as e: + _repeated_fail_get_next_trial_count += 1 + logger.error( + "Error while trying to get the next trial to evaluate.", exc_info=True + ) + # NOTE: This is to prevent any infinite loops if we can't get a trial + if ( + _repeated_fail_get_next_trial_count + >= N_FAILED_GET_NEXT_PENDING_ATTEMPTS_BEFORE_ERROR + ): + raise WorkerFailedToGetPendingTrialsError( + "Worker '%s' failed to get pending trials %d times in a row." + " Bailing!" 
+ ) from e -def _sample_trial_from_optimizer( - optimizer: BaseOptimizer, - config_dir_f: Callable[[ConfigID], Path], - evaluated_trials: Mapping[ConfigID, Trial], - pending_trials: Mapping[ConfigID, Trial], -) -> Trial: - optimizer.load_results( - previous_results={ - config_id: report.to_config_result(optimizer.load_config) - for config_id, report in evaluated_trials.items() - }, - pending_evaluations={ - config_id: optimizer.load_config(trial.config) - for config_id, trial in pending_trials.items() - }, - ) - config, config_id, prev_config_id = optimizer.get_config_and_ids() - previous = None - if prev_config_id is not None: - previous = evaluated_trials[prev_config_id] - - time_sampled = time.time() - return Trial( - id=config_id, - config=config, - report=None, - time_sampled=time_sampled, - pipeline_dir=config_dir_f(config_id), - previous=previous, - metadata={"time_sampled": time_sampled}, - ) + continue + # If we can't set this working to evaluating, then just retry the loop + try: + trial_to_eval.set_evaluating( + time_started=time.time(), + worker_id=self.worker_id, + ) + self.state.put_updated_trial(trial_to_eval) + n_failed_set_trial_state = 0 + except VersionMismatchError: + n_failed_set_trial_state += 1 + logger.debug( + f"Another worker has managed to change trial '{trial_to_eval.id}'" + " to evaluate and put back into state. This is fine and likely means" + " the other worker is evaluating it.", + exc_info=True, + ) + except Exception: + n_failed_set_trial_state += 1 + logger.error( + f"Error trying to set trial '{trial_to_eval.id}' to evaluating.", + exc_info=True, + ) -def _post_evaluation_hook( # type: ignore - trial: Trial, - result: ERROR | dict[str, Any], - logger: logging.Logger, - loss_value_on_error: float | None, - ignore_errors, -) -> None: - # We import here to avoid circular imports - from neps.plot.tensorboard_eval import tblogger - from neps.utils.data_loading import _get_loss - - working_directory = Path(trial.pipeline_dir, "../../") - loss = _get_loss(result, loss_value_on_error, ignore_errors=ignore_errors) - - # 1. Write all configs and losses - all_configs_losses = Path(working_directory, "all_losses_and_configs.txt") - - def write_loss_and_config(file_handle, loss_, config_id_, config_): # type: ignore - file_handle.write(f"Loss: {loss_}\n") - file_handle.write(f"Config ID: {config_id_}\n") - file_handle.write(f"Config: {config_}\n") - file_handle.write(79 * "-" + "\n") - - with all_configs_losses.open("a", encoding="utf-8") as f: - write_loss_and_config(f, loss, trial.id, trial.config) - - # no need to handle best loss cases if an error occurred - if result == "error": - return - - # The "best" loss exists only in the pareto sense for multi-objective - is_multi_objective = isinstance(loss, dict) - if is_multi_objective: - logger.info(f"Finished evaluating config {trial.id}") - return - - # 2. Write best losses/configs - best_loss_trajectory_file = Path(working_directory, "best_loss_trajectory.txt") - best_loss_config_trajectory_file = Path( - working_directory, "best_loss_with_config_trajectory.txt" - ) + # NOTE: This is to prevent infinite looping if it somehow keeps getting + # the same trial and can't set it to evaluating. + if n_failed_set_trial_state != 0: + if n_failed_set_trial_state >= N_FAILED_TO_SET_TRIAL_STATE: + raise WorkerFailedToGetPendingTrialsError( + "Worker '%s' failed to set trial to evaluating %d times in a row." + " Bailing!" 
+ ) + continue - if not best_loss_trajectory_file.exists(): - is_new_best = result != "error" - else: - best_loss_trajectory: str | list[str] - best_loss_trajectory = best_loss_trajectory_file.read_text(encoding="utf-8") - best_loss_trajectory = list(best_loss_trajectory.rstrip("\n").split("\n")) - best_loss = best_loss_trajectory[-1] - is_new_best = float(best_loss) > loss # type: ignore + # We (this worker) has managed to set it to evaluating, now we can evaluate it + with _set_global_trial(trial_to_eval): + evaluated_trial, report = evaluate_trial( + trial=trial_to_eval, + evaluation_fn=self.evaluation_fn, + default_report_values=self.settings.default_report_values, + ) + evaluation_duration = evaluated_trial.metadata.evaluation_duration + assert evaluation_duration is not None + self.worker_cumulative_evaluation_time_seconds += evaluation_duration - if is_new_best: - with best_loss_trajectory_file.open("a", encoding="utf-8") as f: - f.write(f"{loss}\n") + self.worker_cumulative_eval_count += 1 - with best_loss_config_trajectory_file.open("a", encoding="utf-8") as f: - write_loss_and_config(f, loss, trial.id, trial.config) + logger.info( + "Worker '%s' evaluated trial: %s as %s.", + self.worker_id, + evaluated_trial.id, + evaluated_trial.state, + ) - logger.info( - f"Finished evaluating config {trial.id}" - f" -- new best with loss {float(loss) :.6f}" - ) + if report.cost is not None: + self.worker_cumulative_eval_cost += report.cost - else: - logger.info(f"Finished evaluating config {trial.id}") + if report.err is not None: + logger.error( + f"Error during evaluation of '{evaluated_trial.id}'" + f" : {evaluated_trial.config}." + ) + logger.exception(report.err) + _error_from_evaluation = report.err + + self.state.report_trial_evaluation( + optimizer=self.optimizer, + trial=evaluated_trial, + report=report, + worker_id=self.worker_id, + ) - tblogger.end_of_config() + logger.debug("Config %s: %s", evaluated_trial.id, evaluated_trial.config) + logger.debug("Loss %s: %s", evaluated_trial.id, report.loss) + logger.debug("Cost %s: %s", evaluated_trial.id, report.loss) + logger.debug( + "Learning Curve %s: %s", evaluated_trial.id, report.learning_curve + ) -def launch_runtime( # noqa: PLR0913, C901, PLR0915 +# TODO: This should be done directly in `api.run` at some point to make it clearer at an +# entryy point how the woerer is set up to run if someone reads the entry point code. +def _launch_runtime( # noqa: PLR0913 *, evaluation_fn: Callable[..., float | Mapping[str, Any]], - sampler: BaseOptimizer, + optimizer: BaseOptimizer, optimizer_info: dict, - optimization_dir: Path | str, - max_evaluations_total: int | None = None, - max_evaluations_per_run: int | None = None, - continue_until_max_evaluation_completed: bool = False, - logger: logging.Logger | None = None, + optimization_dir: Path, + max_cost_total: float | None, ignore_errors: bool = False, - loss_value_on_error: None | float = None, - overwrite_optimization_dir: bool = False, - pre_load_hooks: Iterable[Callable[[BaseOptimizer], BaseOptimizer]] | None = None, + loss_value_on_error: float | None, + cost_value_on_error: float | None, + continue_until_max_evaluation_completed: bool, + overwrite_optimization_dir: bool, + max_evaluations_total: int | None, + max_evaluations_for_worker: int | None, + pre_load_hooks: Iterable[Callable[[BaseOptimizer], BaseOptimizer]] | None, ) -> None: - """Launch the runtime of a single instance of NePS. - - Please refer to the module docstring for a detailed explanation of the runtime. 
- Runs until some exit condition is met. - - Args: - evaluation_fn: The evaluation function to use. - sampler: The optimizer to use for sampling configurations. - optimizer_info: Information about the optimizer. - optimization_dir: The directory where the optimization is running. - max_evaluations_total: The maximum number of evaluations to run. - max_evaluations_per_run: The maximum number of evaluations to run in a single run. - continue_until_max_evaluation_completed: Whether to continue until the maximum - evaluations are completed. - logger: The logger to use. - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error and will use given loss value instead. default: None - ignore_errors: Ignore hyperparameter settings that threw an error and do not raise - an error. Error configs still count towards max_evaluations_total. - overwrite_optimization_dir: Whether to overwrite the optimization directory. - pre_load_hooks: Hooks to run before loading the results. - """ - # NOTE(eddiebergman): This was deprecated a while ago and called at - # evaluate, now we just crash immediatly instead. Should probably - # promote this check closer to the user, i.e. `neps.run()` - evaluation_fn_params = inspect.signature(evaluation_fn).parameters - if "previous_working_directory" in evaluation_fn_params: - raise RuntimeError( - "the argument: 'previous_working_directory' was deprecated. " - f"In the function: '{evaluation_fn.__name__}', please, " - "use 'previous_pipeline_directory' instead. ", - ) - if "working_directory" in evaluation_fn_params: - raise RuntimeError( - "the argument: 'working_directory' was deprecated. " - f"In the function: '{evaluation_fn.__name__}', please, " - "use 'pipeline_directory' instead. ", - ) - - if logger is None: - logger = logging.getLogger("neps") - - optimization_dir = Path(optimization_dir) - - # TODO(eddiebergman): Not sure how overwriting works with multiple workers.... if overwrite_optimization_dir and optimization_dir.exists(): - logger.warning("Overwriting working_directory") + logger.info( + f"Overwriting optimization directory '{optimization_dir}' as" + " `overwrite_optimization_dir=True`." + ) shutil.rmtree(optimization_dir) - shared_state = SharedState(optimization_dir, create_dirs=True) - - _poll, _timeout = get_shared_state_poll_and_timeout() - with shared_state.sync(lock=True): - if not shared_state.paths.optimizer_info_file.exists(): - serialize( - optimizer_info, - shared_state.paths.optimizer_info_file, - sort_keys=False, - ) - else: - shared_state.check_optimizer_info_on_disk_matches(optimizer_info) - - _max_evals_this_run = ( - max_evaluations_per_run if max_evaluations_per_run is not None else np.inf + neps_state = create_or_load_filebased_neps_state( + directory=optimization_dir, + optimizer_info=OptimizerInfo(optimizer_info), + optimizer_state=OptimizationState( + budget=( + BudgetInfo(max_cost_budget=max_cost_total, used_cost_budget=0) + if max_cost_total is not None + else None + ), + shared_state={}, # TODO: Unused for the time being... 
+ ), ) - evaluations_in_this_run = 0 - while True: - if evaluations_in_this_run >= _max_evals_this_run: - logger.info("Maximum evaluations per run is reached, shutting down") - break - - with shared_state.sync(lock=True): - trials_by_state = shared_state.trials_by_state() - if not _worker_should_continue( - max_evaluations_total, - n_inprogress=len(trials_by_state[Trial.State.IN_PROGRESS]), - n_evaluated=( - len(trials_by_state[Trial.State.SUCCESS]) - + len(trials_by_state[Trial.State.ERROR]) - ), - continue_until_max_evaluation_completed=continue_until_max_evaluation_completed, - ): - logger.info("Maximum total evaluations is reached, shutting down") - break - - # While we have the decision lock, we will now sample - # with the optimizer in this process - with shared_state.use_sampler(sampler) as sampler: - if sampler.is_out_of_budget(): - logger.info("Maximum budget reached, shutting down") - break - - if pre_load_hooks is not None: - for hook in pre_load_hooks: - sampler = hook(sampler) # noqa: PLW2901 - - logger.debug("Sampling a new configuration") - - evaluated = ( - trials_by_state[Trial.State.SUCCESS] - + trials_by_state[Trial.State.ERROR] - ) - pending = ( - trials_by_state[Trial.State.PENDING] - + trials_by_state[Trial.State.IN_PROGRESS] - ) - trial = _sample_trial_from_optimizer( - sampler, - shared_state.paths.config_dir, - evaluated_trials={trial.id: trial for trial in evaluated}, - pending_trials={trial.id: trial for trial in pending}, - ) - serialize(trial.config, trial.config_file) - serialize(trial.metadata, trial.metadata_file) - if trial.previous is not None: - trial.previous_config_id_file.write_text(trial.previous.id) - - logger.debug(f"Sampled config {trial.id}") - - # Obtain the lock on this trial and evaluate it, - # otherwise continue back to waiting to sampling - with trial._lock.try_lock() as acquired: - if not acquired: - continue - - # Inform the global state that this trial is being evaluated - _set_in_progress_trial(trial) - - # TODO(eddiebergman): Right now if a trial crashes, it's cost is not accounted - # for, this should probably removed from BaseOptimizer as it does not need - # to know this and the runtime can fill this in for it. - try: - user_result = _evaluate_config(trial, evaluation_fn, logger) - except Exception as e: # noqa: BLE001 - # TODO(eddiebergman): Right now this never accounts for cost! - # NOTE: It's important to lock the shared state such that any - # sampling done is with taking this result into account - # accidentally reads this config as un-evaluated - with shared_state.lock(poll=_poll, timeout=_timeout): - # TODO(eddiebergman): We should add an option to just crash here - # if something goes wrong and raise up this error to the top. - logger.error( - f"Error during evaluation of '{trial.id}': {trial.config}." - ) - logger.exception(e) - tb = traceback.format_exc() - - trial.report = trial.create_error_report(e, tb=tb) - trial.metadata["time_end"] = time.time() - - shared_state.trials[trial.id] = (trial, Trial.State.ERROR) - - serialize({"err": str(e), "tb": tb}, trial.disk.error_file) - serialize(trial.metadata, trial.disk.metadata_file) - else: - trial.report = trial.create_success_report(user_result) - trial.metadata["time_end"] = time.time() - if sampler.budget is not None and trial.report.cost is None: - raise ValueError( - "The evaluation function result should contain a 'cost'" - f"field when used with a budget. 
Got {trial.report.results}", - ) - - with shared_state.lock(poll=_poll, timeout=_timeout): - shared_state.trials[trial.id] = (trial, Trial.State.SUCCESS) - - eval_cost = trial.report.cost - account_for_cost = False - if eval_cost is not None: - account_for_cost = trial.report.account_for_cost - budget_metadata = { - "max": sampler.budget, - "used": sampler.used_budget, - "eval_cost": eval_cost, - "account_for_cost": account_for_cost, - } - trial.metadata.update(budget_metadata) - - serialize(trial.metadata, trial.disk.metadata_file) - serialize(trial.report.results, trial.disk.result_file) - if account_for_cost: - assert eval_cost is not None - with shared_state.use_sampler(sampler, serialize_seed=False): - sampler.used_budget += eval_cost - - _result: ERROR | dict[str, Any] - report = trial.report - if isinstance(report, ErrorReport): - _result = "error" - elif isinstance(report, SuccessReport): - _result = dict(report.results) - else: - _type = type(report) - raise TypeError(f"Unknown result type '{_type}' for report: {report}") - - _post_evaluation_hook( - trial, - _result, - logger, - loss_value_on_error, - ignore_errors, - ) + settings = WorkerSettings( + on_error=( + OnErrorPossibilities.IGNORE + if ignore_errors + else OnErrorPossibilities.RAISE_ANY_ERROR + ), + default_report_values=DefaultReportValues( + loss_value_on_error=loss_value_on_error, + cost_value_on_error=cost_value_on_error, + cost_if_not_provided=None, # TODO: User can't specify yet + learning_curve_on_error=None, # TODO: User can't specify yet + learning_curve_if_not_provided="loss", # report the loss as single value LC + ), + max_evaluations_total=max_evaluations_total, + include_in_progress_evaluations_towards_maximum=( + not continue_until_max_evaluation_completed + ), + max_cost_total=max_cost_total, + max_evaluations_for_worker=max_evaluations_for_worker, + max_evaluation_time_total_seconds=None, # TODO: User can't specify yet + max_wallclock_time_for_worker_seconds=None, # TODO: User can't specify yet + max_evaluation_time_for_worker_seconds=None, # TODO: User can't specify yet + max_cost_for_worker=None, # TODO: User can't specify yet + ) - evaluations_in_this_run += 1 + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=evaluation_fn, + settings=settings, + _pre_sample_hooks=list(pre_load_hooks) if pre_load_hooks is not None else None, + ) + worker.run() diff --git a/neps/search_spaces/architecture/graph.py b/neps/search_spaces/architecture/graph.py index b1cd2e8b..f776b231 100644 --- a/neps/search_spaces/architecture/graph.py +++ b/neps/search_spaces/architecture/graph.py @@ -300,9 +300,7 @@ def copy_dict(d): if isinstance(v, Graph): copied_dict[k] = v.copy() elif isinstance(v, list): - copied_dict[k] = [ - i.copy() if isinstance(i, Graph) else i for i in v - ] + copied_dict[k] = [i.copy() if isinstance(i, Graph) else i for i in v] elif isinstance(v, torch.nn.Module) or isinstance(v, AbstractPrimitive): copied_dict[k] = copy.deepcopy(v) return copied_dict @@ -634,6 +632,7 @@ def parse(self): f"{self.name}-comb_op_at({node_idx})", self.nodes[node_idx]["comb_op"], ) + for neigbor_idx in self.neighbors(node_idx): edge_data = self.get_edge_data(node_idx, neigbor_idx) if isinstance(edge_data.op, Graph): @@ -642,6 +641,7 @@ def parse(self): for primitive in edge_data.op.get_embedded_ops(): if isinstance(primitive, Graph): primitive.parse() + self.add_module( f"{self.name}-edge({node_idx},{neigbor_idx})", edge_data.op, @@ -705,9 +705,7 @@ def _get_child_graphs(self, 
single_instances: bool = False) -> list: node_data = self.nodes[node_idx] if "subgraph" in node_data: graphs.append(node_data["subgraph"]) - graphs.append( - node_data["subgraph"]._get_child_graphs() - ) + graphs.append(node_data["subgraph"]._get_child_graphs()) for _, _, edge_data in self.edges.data(): if isinstance(edge_data.op, Graph): @@ -724,16 +722,12 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: if embedded_ops is not None: if isinstance(embedded_ops, Graph): graphs.append(embedded_ops) - graphs.append( - embedded_ops._get_child_graphs() - ) + graphs.append(embedded_ops._get_child_graphs()) elif isinstance(embedded_ops, list): for child_op in edge_data.op.get_embedded_ops(): if isinstance(child_op, Graph): graphs.append(child_op) - graphs.append( - child_op._get_child_graphs() - ) + graphs.append(child_op._get_child_graphs()) else: logger.debug( "Got embedded op, but is neither a graph nor a list: {}".format( @@ -971,9 +965,7 @@ def update_nodes( in_edges = [ (v, data) for v, u, data in in_edges if not data.is_final() ] # u is same for all - out_edges = list( - graph.out_edges(node_idx, data=True) - ) # (v, u, data) + out_edges = list(graph.out_edges(node_idx, data=True)) # (v, u, data) out_edges = [ (u, data) for v, u, data in out_edges if not data.is_final() ] # v is same for all diff --git a/neps/search_spaces/search_space.py b/neps/search_spaces/search_space.py index b02a7662..1b990802 100644 --- a/neps/search_spaces/search_space.py +++ b/neps/search_spaces/search_space.py @@ -94,23 +94,21 @@ def pipeline_space_from_configspace( return pipeline_space -def pipeline_space_from_yaml( # noqa: C901, PLR0912 +def pipeline_space_from_yaml( # noqa: C901 config: str | Path | dict, ) -> dict[str, Parameter]: """Reads configuration details from a YAML file or a dictionary and constructs a pipeline space dictionary. Args: - config (str | Path | dict): Path to the YAML file or a dictionary containing - parameter configurations. + config: Path to the YAML file or a dictionary containing parameter configurations. Returns: - dict[str, Parameter]: A dictionary where keys are parameter names and values - are parameter objects. + A dictionary where keys are parameter names and values are parameter objects. Raises: SearchSpaceFromYamlFileError: Raised if there are issues with the YAML file's - format, contents, or if the dictionary is invalid. + format, contents, or if the dictionary is invalid. 
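+
+    A small illustrative example (hypothetical parameter names and values), which
+    works the same whether the mapping comes from a YAML file or a plain dict:
+
+        space = pipeline_space_from_yaml(
+            {
+                "lr": {"type": "float", "lower": 1e-5, "upper": 1e-1},
+                "epochs": {"type": "int", "lower": 1, "upper": 10},
+                "optimizer": {"type": "cat", "choices": ["adam", "sgd"]},
+                "batch_size": 64,  # plain values become a ConstantParameter
+            }
+        )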
""" try: if isinstance(config, (str, Path)): @@ -134,31 +132,23 @@ def pipeline_space_from_yaml( # noqa: C901, PLR0912 except yaml.YAMLError as e: raise ValueError(f"The file at {config} is not a valid YAML file.") from e - # Initialize the pipeline space pipeline_space: dict[str, Parameter] = {} - # Iterate over the items in the YAML configuration for name, details in config.items(): - # get parameter type param_type = deduce_type(name, details) - # init parameter by checking type if param_type in ("int", "integer"): - # Integer Parameter formatted_details = formatting_int(name, details) pipeline_space[name] = IntegerParameter(**formatted_details) elif param_type == "float": - # Float Parameter formatted_details = formatting_float(name, details) pipeline_space[name] = FloatParameter(**formatted_details) elif param_type in ("cat", "categorical"): - # Categorical parameter formatted_details = formatting_cat(name, details) pipeline_space[name] = CategoricalParameter(**formatted_details) elif param_type == "const": - # Constant parameter - formatted_details = formatting_const(details) # type: ignore - pipeline_space[name] = ConstantParameter(formatted_details) + const_details = formatting_const(details) + pipeline_space[name] = ConstantParameter(const_details) else: # Handle unknown parameter type raise TypeError( @@ -408,7 +398,6 @@ def _smbo_mutation(self, *, patience: int = 5, **kwargs: Any) -> Self: mutated_param = hp.mutate(**kwargs) except Exception as e: # noqa: BLE001 logger.warning(f"{chosen_hp_name} failed to mutate! Error: {e}, {kwargs}") - # !- print(traceback.format_exc()) # noq-a: T201 continue new_params = { @@ -683,10 +672,17 @@ def serialize(self) -> dict[str, Hashable]: serialized_config[name] = hp.serialize_value(hp.value) return serialized_config - def load_from(self, config: Mapping[str, Any | GraphParameter]) -> None: - """Load a configuration from a dictionary, setting all the values.""" + def from_dict(self, config: Mapping[str, Any | GraphParameter]) -> SearchSpace: + """Create a new instance of this search space with parameters set from the config. + + Args: + config: The dictionary of hyperparameters to set with values. + """ + new = self.clone() for name, val in config.items(): - self.hyperparameters[name].load_from(val) + new.hyperparameters[name].load_from(val) + + return new def clone(self, *, _with_tabular: bool = False) -> SearchSpace: """Create a copy of the search space.""" diff --git a/neps/search_spaces/yaml_search_space_utils.py b/neps/search_spaces/yaml_search_space_utils.py index 9bcfcb11..8b25b1b0 100644 --- a/neps/search_spaces/yaml_search_space_utils.py +++ b/neps/search_spaces/yaml_search_space_utils.py @@ -1,12 +1,27 @@ from __future__ import annotations + import logging import re +from typing import Literal, overload logger = logging.getLogger("neps") -def convert_scientific_notation(value: str | int | float, show_usage_flag=False) \ - -> float | (float, bool): +@overload +def convert_scientific_notation( + value: str | int | float, show_usage_flag: Literal[False] = False +) -> float: ... + + +@overload +def convert_scientific_notation( + value: str | int | float, show_usage_flag: Literal[True] +) -> tuple[float, bool]: ... + + +def convert_scientific_notation( + value: str | int | float, show_usage_flag: bool = False +) -> float | tuple[float, bool]: """ Convert a given value to a float if it's a string that matches scientific e notation. 
This is especially useful for numbers like "3.3e-5" which YAML parsers may not @@ -72,7 +87,7 @@ class SearchSpaceFromYamlFileError(Exception): raise SearchSpaceFromYamlFileError(e) """ - def __init__(self, exception): + def __init__(self, exception: Exception) -> None: self.exception_type = type(exception).__name__ self.message = ( f"Error occurred during initialization of search space from " @@ -84,33 +99,34 @@ def __init__(self, exception): def deduce_type( name: str, details: dict[str, str | int | float] | str | int | float ) -> str: - """ - Deduces the parameter type from details. + """Deduces the parameter type from details. Args: - name (str): The name of the parameter. - details (dict | str | int | float): A dictionary containing parameter - specifications or a direct value (string, integer, or float). + name: The name of the parameter. + details: A dictionary containing parameter specifications or + a direct value (string, integer, or float). Returns: - str: The deduced parameter type ('int', 'float', 'categorical', or 'constant'). + The deduced parameter type ('int', 'float', 'categorical', or 'constant'). Raises: TypeError: If the type cannot be deduced or the details don't align with expected constraints. - """ - if isinstance(details, (str, int, float)): - param_type = "const" - elif isinstance(details, dict): + """ + if isinstance(details, (str, int, float)): + return "const" + + if isinstance(details, dict): if "type" in details: - param_type = details.pop("type").lower() - else: - param_type = deduce_param_type(name, details) - else: - raise TypeError( - f"Unable to deduce parameter type for '{name}' with details '{details}'.") + param_type = details.pop("type") + assert isinstance(param_type, str) + return param_type.lower() - return param_type + return deduce_param_type(name, details) + + raise TypeError( + f"Unable to deduce parameter type for '{name}' with details '{details}'." + ) def deduce_param_type(name: str, details: dict[str, int | str | float]) -> str: @@ -284,7 +300,7 @@ def formatting_float(name: str, details: dict[str, str | int | float]) -> dict: return details -def formatting_cat(name: str, details: dict[str, str | int | float]) -> dict: +def formatting_cat(name: str, details: dict[str, list | str | int | float]) -> dict: """ This function ensures that the 'choices' key in the details is a list and attempts to convert any elements expressed in scientific notation to floats. It also handles @@ -303,34 +319,44 @@ def formatting_cat(name: str, details: dict[str, str | int | float]) -> dict: """ if not isinstance(details["choices"], list): raise TypeError(f"The 'choices' for '{name}' must be a list.") + for i, element in enumerate(details["choices"]): try: converted_value, e_flag = convert_scientific_notation( element, show_usage_flag=True ) + if e_flag: - details["choices"][ - i - ] = converted_value # Replace the element at the same position + # Replace the element at the same position + details["choices"][i] = converted_value except ValueError: pass # If a ValueError occurs, simply continue to the next element + if "default" in details: e_flag = False + extracted_default = details["default"] + if not isinstance(extracted_default, (str, int, float)): + raise TypeError( + f"The 'default' value for '{name}' must be a string, integer, or float." + f" Got {type(extracted_default).__name__}." 
+ ) + try: # check if e notation, if then convert to number default, e_flag = convert_scientific_notation( - details["default"], show_usage_flag=True + extracted_default, show_usage_flag=True ) except ValueError: pass # if default value is not in a numeric format, Value Error occurs + if e_flag is True: details["default"] = default + return details def formatting_const(details: str | int | float) -> str | int | float: - """ - Validates and converts a constant parameter. + """Validates and converts a constant parameter. This function checks if the 'details' parameter contains a value expressed in scientific notation and converts it to a float. It ensures that the input @@ -354,8 +380,8 @@ def formatting_const(details: str | int | float) -> str | int | float: # if the value is not able to convert to float a ValueError get raised by # convert_scientific_notation function pass + if e_flag: details = converted_value - return details - + return details diff --git a/neps/state/__init__.py b/neps/state/__init__.py new file mode 100644 index 00000000..6508dba2 --- /dev/null +++ b/neps/state/__init__.py @@ -0,0 +1,19 @@ +from neps.state.protocols import ( + Locker, + ReaderWriter, + Synced, + VersionedResource, + Versioner, +) +from neps.state.seed_snapshot import SeedSnapshot +from neps.state.trial import Trial + +__all__ = [ + "Locker", + "SeedSnapshot", + "Synced", + "Trial", + "ReaderWriter", + "Versioner", + "VersionedResource", +] diff --git a/neps/state/_eval.py b/neps/state/_eval.py new file mode 100644 index 00000000..0d08dfdd --- /dev/null +++ b/neps/state/_eval.py @@ -0,0 +1,195 @@ +from __future__ import annotations + +import inspect +import logging +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping, TypeVar + +from neps.exceptions import NePSError + +if TYPE_CHECKING: + from neps.state.settings import DefaultReportValues + from neps.state.trial import Trial + +logger = logging.getLogger(__name__) + +Loc = TypeVar("Loc") +_notset = object() + + +class GotNonePendingTrialForEvalautionError(NePSError): + """Raised when trying to evaluate a trial that is not in a pending state.""" + + def __init__( + self, + trial_id: Trial.ID, + state: Trial.State, + worker_id: str, + *args: Any, + ): + """Initialize the error. + + Args: + trial_id: The ID of the trial that was not in a pending state. + state: The state of the trial. + worker_id: The ID of the worker that picked up this trial. + *args: Additional arguments to pass to the parent class. + """ + super().__init__(trial_id, state, worker_id, *args) + self.trial_id = trial_id + self.state = state + self.worker_id = worker_id + + def __str__(self) -> str: + return ( + f"Trial '{self.trial_id}' is not in a pending state but in '{self.state}'." + f"This trial was picked up for evaluation by worker '{self.worker_id}'." 
+ ) + + +def _check_float(value: Any, name: str) -> float: + try: + return float(value) + except (TypeError, ValueError) as e: + raise ValueError( + f"The '{name}' should be a float but got a `{type(value)}`" + f" with value of {value}", + ) from e + + +def parse_user_result( + user_result: float | dict[str, Any], + *, + default_cost_value: float | None = None, + default_learning_curve: Literal["loss"] | list[float] | None = None, +) -> tuple[float, float | None, list[float] | None, dict[str, Any]]: + """Check if the trial has succeeded.""" + if isinstance(user_result, Mapping): + extracted_loss = user_result.pop("loss", _notset) + if extracted_loss is _notset: + raise KeyError( + "The 'loss' should be provided in the evaluation result if providing" + " a dictionary." + ) + extracted_cost = user_result.pop("cost", default_cost_value) + + extracted_learning_curve = user_result.pop("learning_curve", _notset) + + if extracted_learning_curve is _notset: + # HACK: Backwards compat, check if it's in the "info_dict" key + if "info_dict" in user_result: + extracted_learning_curve = user_result["info_dict"].pop( + "learning_curve", + default_learning_curve, + ) + else: + extracted_learning_curve = default_learning_curve + + if extracted_learning_curve == "loss": + extracted_learning_curve = [extracted_loss] + + extra = user_result + else: + extracted_loss = user_result + extracted_learning_curve = ( + None + if default_learning_curve is None + else [user_result] + if default_learning_curve == "loss" + else default_learning_curve + ) + extracted_cost = default_cost_value + extra = {} + + loss = _check_float(extracted_loss, "loss") + cost = _check_float(extracted_cost, "cost") if extracted_cost is not None else None + learning_curve = ( + [float(v) for v in extracted_learning_curve] + if extracted_learning_curve is not None + else None + ) + return loss, cost, learning_curve, extra + + +def _eval_trial( + *, + trial: Trial, + default_report_values: DefaultReportValues, + fn: Callable[..., Any], + **kwargs: Any, +) -> Trial.Report: + start = time.monotonic() + try: + user_result = fn(**kwargs, **trial.config) + # Something went wrong in evaluation + except Exception as e: + duration = time.monotonic() - start + time_end = time.time() + logger.error(f"Error during evaluation of '{trial.id}': {trial.config}.") + logger.exception(e) + report = trial.set_complete( + report_as="crashed", + loss=default_report_values.loss_value_on_error, + cost=default_report_values.cost_value_on_error, + learning_curve=default_report_values.learning_curve_on_error, + extra=None, + err=e, + tb=traceback.format_exc(), + time_end=time_end, + evaluation_duration=duration, + ) + else: + duration = time.monotonic() - start + time_end = time.time() + logger.info(f"Successful evaluation of '{trial.id}': {user_result}.") + + loss, cost, learning_curve, extra = parse_user_result( + dict(user_result) if isinstance(user_result, Mapping) else user_result, + default_cost_value=default_report_values.cost_if_not_provided, + default_learning_curve=default_report_values.learning_curve_if_not_provided, + ) + report = trial.set_complete( + report_as="success", + loss=loss, + cost=cost, + learning_curve=learning_curve, + err=None, + tb=None, + extra=extra, + time_end=time_end, + evaluation_duration=duration, + ) + + return report + + +def evaluate_trial( + trial: Trial, + *, + evaluation_fn: Callable[..., Any], + default_report_values: DefaultReportValues, +) -> tuple[Trial, Trial.Report]: + # NOTE: For now we are assuming everything is on a 
shared filesystem
+    # will have to revisit if the location can be elsewhere
+    trial_location = Path(trial.metadata.location)
+    prev_trial_location = (
+        Path(trial.metadata.previous_trial_location)
+        if trial.metadata.previous_trial_location is not None
+        else None
+    )
+
+    params = {
+        "pipeline_directory": trial_location,
+        "previous_pipeline_directory": prev_trial_location,
+    }
+    sigkeys = inspect.signature(evaluation_fn).parameters.keys()
+    injectable_params = {key: val for key, val in params.items() if key in sigkeys}
+    report = _eval_trial(
+        trial=trial,
+        fn=evaluation_fn,
+        default_report_values=default_report_values,
+        **injectable_params,
+    )
+    return trial, report
diff --git a/neps/state/err_dump.py b/neps/state/err_dump.py
new file mode 100644
index 00000000..167ab48f
--- /dev/null
+++ b/neps/state/err_dump.py
@@ -0,0 +1,77 @@
+"""Error dump for serializing errors.
+
+This resource is used to store errors that can be serialized and deserialized,
+such that they can be shared between workers.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import ClassVar
+
+from neps.exceptions import NePSError
+
+
+class SerializedError(NePSError):
+    """An error that is serialized."""
+
+
+@dataclass
+class SerializableTrialError:
+    """Error information for a trial."""
+
+    trial_id: str
+    """The ID of the trial."""
+
+    worker_id: str
+    """The ID of the worker that evaluated the trial which caused the error."""
+
+    err_type: str
+    """The type of the error."""
+
+    err: str
+    """The error message."""
+
+    tb: str | None
+    """The traceback of the error."""
+
+    def as_raisable(self) -> SerializedError:
+        """Convert the error to a raisable error."""
+        return SerializedError(
+            f"An error occurred during the evaluation of a trial '{self.trial_id}' which"
+            f" was evaluated by worker '{self.worker_id}'. The original error could not"
+            " be deserialized but had the following information:"
+            "\n"
+            f"{self.err_type}: {self.err}"
+            "\n\n"
+            f"{self.tb}"
+        )
+
+
+@dataclass
+class ErrDump:
+    """A collection of errors that can be serialized and deserialized."""
+
+    SerializableTrialError: ClassVar = SerializableTrialError
+
+    errs: list[SerializableTrialError] = field(default_factory=list)
+
+    def append(self, err: SerializableTrialError) -> None:
+        """Append an error to the reported errors."""
+        return self.errs.append(err)
+
+    def __len__(self) -> int:
+        return len(self.errs)
+
+    def __bool__(self) -> bool:
+        return bool(self.errs)
+
+    def empty(self) -> bool:
+        """Check if the error dump is empty."""
+        return not self.errs
+
+    def latest_err_as_raisable(self) -> SerializedError | None:
+        """Get the latest error."""
+        if self.errs:
+            return self.errs[-1].as_raisable()
+        return None
diff --git a/neps/state/filebased.py b/neps/state/filebased.py
new file mode 100644
index 00000000..6940016d
--- /dev/null
+++ b/neps/state/filebased.py
@@ -0,0 +1,672 @@
+"""This module houses the implementation of a NePSState that
+does everything on the filesystem, i.e. locking, versioning and
+storing/loading.
+
+The main components are:
+* [`FileVersioner`][neps.state.filebased.FileVersioner]: A versioner that
+  stores a version tag on disk, usually for a resource like a Trial.
+* [`FileLocker`][neps.state.filebased.FileLocker]: A locker that uses a file
+  to lock between processes.
+* [`TrialRepoInDirectory`][neps.state.filebased.TrialRepoInDirectory]: A
+  repository of Trials that are stored in a directory.
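# A minimal, illustrative sketch (values are made up) of the evaluation-result
# formats that parse_user_result() in neps/state/_eval.py above accepts: a bare
# float is taken as the loss, while a mapping must contain "loss" and may carry
# "cost", "learning_curve" and arbitrary extra keys.
from neps.state._eval import parse_user_result

# Bare float: only the loss is known; cost and learning curve fall back to defaults.
loss, cost, curve, extra = parse_user_result(0.42)
assert (loss, cost, curve, extra) == (0.42, None, None, {})

# Mapping: "cost" and "learning_curve" are extracted, remaining keys become "extra".
loss, cost, curve, extra = parse_user_result(
    {"loss": 0.42, "cost": 3.0, "learning_curve": [0.9, 0.5, 0.42], "n_params": 1000}
)
assert cost == 3.0 and extra == {"n_params": 1000}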
+* `ReaderWriterXXX`: Reader/writers for various resources NePSState needs +* [`load_filebased_neps_state`][neps.state.filebased.load_filebased_neps_state]: + A function to load a NePSState from a directory. +* [`create_filebased_neps_state`][neps.state.filebased.create_filebased_neps_state]: + A function to create a new NePSState in a directory. +""" + +from __future__ import annotations + +import json +import logging +from contextlib import contextmanager +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import ClassVar, Iterable, Iterator, TypeVar +from typing_extensions import override +from uuid import uuid4 + +import numpy as np +import portalocker as pl + +from neps.env import ( + GLOBAL_ERR_FILELOCK_POLL, + GLOBAL_ERR_FILELOCK_TIMEOUT, + SEED_SNAPSHOT_FILELOCK_POLL, + SEED_SNAPSHOT_FILELOCK_TIMEOUT, + TRIAL_FILELOCK_POLL, + TRIAL_FILELOCK_TIMEOUT, +) +from neps.exceptions import NePSError +from neps.state.err_dump import ErrDump +from neps.state.neps_state import NePSState +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo +from neps.state.protocols import Locker, ReaderWriter, Synced, TrialRepo, Versioner +from neps.state.seed_snapshot import SeedSnapshot +from neps.state.trial import Trial +from neps.utils.files import deserialize, serialize + +logger = logging.getLogger(__name__) +K = TypeVar("K") +T = TypeVar("T") + + +def make_sha() -> str: + """Generate a str hex sha.""" + return uuid4().hex + + +@dataclass +class FileVersioner(Versioner): + """A versioner that stores a version tag on disk.""" + + version_file: Path + + @override + def current(self) -> str | None: + if not self.version_file.exists(): + return None + return self.version_file.read_text() + + @override + def bump(self) -> str: + sha = make_sha() + self.version_file.write_text(sha) + return sha + + +@dataclass +class TrialRepoInDirectory(TrialRepo[Path]): + """A repository of Trials that are stored in a directory.""" + + directory: Path + _cache: dict[Trial.ID, Synced[Trial, Path]] = field(default_factory=dict) + + @override + def all_trial_ids(self) -> set[Trial.ID]: + """List all the trial ids in this trial Repo.""" + return { + config_path.name.replace("config_", "") + for config_path in self.directory.iterdir() + if config_path.name.startswith("config_") and config_path.is_dir() + } + + @override + def get_by_id( + self, + trial_id: Trial.ID, + *, + lock_poll: float = TRIAL_FILELOCK_POLL, + lock_timeout: float | None = TRIAL_FILELOCK_TIMEOUT, + ) -> Synced[Trial, Path]: + """Get a Trial by its ID. + + !!! note + + This will **not** explicitly sync the trial and it is up to the caller + to do so. Most of the time, the caller should be a NePSState + object which will do that for you. However if the trial is not in the + cache, then it will be loaded from disk which requires syncing. + + Args: + trial_id: The ID of the trial to get. + lock_poll: The poll time for the file lock. + lock_timeout: The timeout for the file lock. + + Returns: + The trial with the given ID. 
+ """ + trial = self._cache.get(trial_id) + if trial is not None: + return trial + + config_path = self.directory / f"config_{trial_id}" + if not config_path.exists(): + raise TrialRepo.TrialNotFoundError(trial_id, config_path) + + trial = Synced.load( + location=config_path, + locker=FileLocker( + lock_path=config_path / ".lock", + poll=lock_poll, + timeout=lock_timeout, + ), + versioner=FileVersioner(version_file=config_path / ".version"), + reader_writer=ReaderWriterTrial(), + ) + self._cache[trial_id] = trial + return trial + + @override + def get_by_ids(self, trial_ids: Iterable[Trial.ID]) -> dict[str, Synced[Trial, Path]]: + """Get multiple Trials by their IDs. + + !!! note + See [`get_by_id()`][neps.state.filebased.TrialRepoInDirectory.get_by_id] + for notes on the trials syncing. + + Args: + trial_ids: The IDs of the trials to get. + + Returns: + A dictionary of the trials with the given IDs. + + Raises: + TrialRepo.TrialNotFoundError: If a trial is not found. + """ + return {trial_id: self.get_by_id(trial_id) for trial_id in trial_ids} + + @override + def put_new( + self, + trial: Trial, + *, + lock_poll: float = TRIAL_FILELOCK_POLL, + lock_timeout: float | None = TRIAL_FILELOCK_TIMEOUT, + ) -> Synced[Trial, Path]: + """Put a new Trial into the repository. + + Args: + trial: The trial to put. + lock_poll: The poll time for the file lock. + lock_timeout: The timeout for the file lock. + + Returns: + The synced trial. + + Raises: + TrialRepo.TrialAlreadyExistsError: If the trial already exists in the + repository. + """ + config_path = self.directory / f"config_{trial.metadata.id}" + if config_path.exists(): + raise TrialRepo.TrialAlreadyExistsError( + f"Trial '{trial.metadata.id}' already exists as '{config_path}'." + ) + + # HACK: We do this here as there is no way to know where a Trial will + # be located when it's created... + trial.metadata.location = str(config_path) + shared_trial = Synced.new( + data=trial, + location=config_path, + locker=FileLocker( + lock_path=config_path / ".lock", + poll=lock_poll, + timeout=lock_timeout, + ), + versioner=FileVersioner(version_file=config_path / ".version"), + reader_writer=ReaderWriterTrial(), + ) + self._cache[trial.metadata.id] = shared_trial + return shared_trial + + @override + def all(self) -> dict[Trial.ID, Synced[Trial, Path]]: + """Get a dictionary of all the Trials in the repository. + + !!! note + See [`get_by_id()`][neps.state.filebased.TrialRepoInDirectory.get_by_id] + for notes on the trials syncing. 
+ """ + return {trial_id: self.get_by_id(trial_id) for trial_id in self.all_trial_ids()} + + @override + def pending(self) -> Iterable[tuple[Trial.ID, Synced[Trial, Path]]]: + pending = [ + (_id, t, trial.metadata.time_sampled) + for (_id, t) in self.all().items() + if (trial := t.synced()).state == Trial.State.PENDING + ] + return iter((_id, t) for _id, t, _ in sorted(pending, key=lambda x: x[2])) + + +@dataclass +class ReaderWriterTrial(ReaderWriter[Trial, Path]): + """ReaderWriter for Trial objects.""" + + CONFIG_FILENAME = "config.yaml" + METADATA_FILENAME = "metadata.yaml" + STATE_FILENAME = "state.txt" + REPORT_FILENAME = "report.yaml" + PREVIOUS_TRIAL_ID_FILENAME = "previous_trial_id.txt" + + @override + @classmethod + def read(cls, directory: Path) -> Trial: + config_path = directory / cls.CONFIG_FILENAME + metadata_path = directory / cls.METADATA_FILENAME + state_path = directory / cls.STATE_FILENAME + report_path = directory / cls.REPORT_FILENAME + + return Trial( + config=deserialize(config_path), + metadata=Trial.MetaData(**deserialize(metadata_path)), + state=Trial.State(state_path.read_text(encoding="utf-8").strip()), + report=( + Trial.Report(**deserialize(report_path)) if report_path.exists() else None + ), + ) + + @override + @classmethod + def write(cls, trial: Trial, directory: Path) -> None: + config_path = directory / cls.CONFIG_FILENAME + metadata_path = directory / cls.METADATA_FILENAME + state_path = directory / cls.STATE_FILENAME + + serialize(trial.config, config_path) + serialize(asdict(trial.metadata), metadata_path) + state_path.write_text(trial.state.value, encoding="utf-8") + + if trial.metadata.previous_trial_id is not None: + previous_trial_path = directory / cls.PREVIOUS_TRIAL_ID_FILENAME + previous_trial_path.write_text(trial.metadata.previous_trial_id) + + if trial.report is not None: + report_path = directory / cls.REPORT_FILENAME + serialize(asdict(trial.report), report_path) + + +@dataclass +class ReaderWriterSeedSnapshot(ReaderWriter[SeedSnapshot, Path]): + """ReaderWriter for SeedSnapshot objects.""" + + # It seems like they're all uint32 but I can't be sure. 
+ PY_RNG_STATE_DTYPE: ClassVar = np.int64 + + PY_RNG_TUPLE_FILENAME: ClassVar = "py_rng.npy" + NP_RNG_STATE_FILENAME: ClassVar = "np_rng_state.npy" + TORCH_RNG_STATE_FILENAME: ClassVar = "torch_rng_state.pt" + TORCH_CUDA_RNG_STATE_FILENAME: ClassVar = "torch_cuda_rng_state.pt" + SEED_INFO_FILENAME: ClassVar = "seed_info.json" + + @override + @classmethod + def read(cls, directory: Path) -> SeedSnapshot: + seedinfo_path = directory / cls.SEED_INFO_FILENAME + py_rng_path = directory / cls.PY_RNG_TUPLE_FILENAME + np_rng_path = directory / cls.NP_RNG_STATE_FILENAME + torch_rng_path = directory / cls.TORCH_RNG_STATE_FILENAME + torch_cuda_rng_path = directory / cls.TORCH_CUDA_RNG_STATE_FILENAME + + # Load and set pythons rng + py_rng_state = tuple( + int(x) for x in np.fromfile(py_rng_path, dtype=cls.PY_RNG_STATE_DTYPE) + ) + np_rng_state = np.fromfile(np_rng_path, dtype=np.uint32) + seed_info = deserialize(seedinfo_path) + + torch_exists = torch_rng_path.exists() or torch_cuda_rng_path.exists() + + # By specifying `weights_only=True`, it disables arbitrary object loading + torch_rng_state = None + torch_cuda_rng = None + if torch_exists: + import torch + + if torch_rng_path.exists(): + torch_rng_state = torch.load(torch_rng_path, weights_only=True) + + if torch_cuda_rng_path.exists(): + # By specifying `weights_only=True`, it disables arbitrary object loading + torch_cuda_rng = torch.load(torch_cuda_rng_path, weights_only=True) + + return SeedSnapshot( + np_rng=( + seed_info["np_rng_kind"], + np_rng_state, + seed_info["np_pos"], + seed_info["np_has_gauss"], + seed_info["np_cached_gauss"], + ), + py_rng=( + seed_info["py_rng_version"], + py_rng_state, + seed_info["py_guass_next"], + ), + torch_rng=torch_rng_state, + torch_cuda_rng=torch_cuda_rng, + ) + + @override + @classmethod + def write(cls, snapshot: SeedSnapshot, directory: Path) -> None: + seedinfo_path = directory / cls.SEED_INFO_FILENAME + py_rng_path = directory / cls.PY_RNG_TUPLE_FILENAME + np_rng_path = directory / cls.NP_RNG_STATE_FILENAME + torch_rng_path = directory / cls.TORCH_RNG_STATE_FILENAME + torch_cuda_rng_path = directory / cls.TORCH_CUDA_RNG_STATE_FILENAME + + py_rng_version, py_rng_state, py_guass_next = snapshot.py_rng + + np.array(py_rng_state, dtype=cls.PY_RNG_STATE_DTYPE).tofile(py_rng_path) + + seed_info = { + "np_rng_kind": snapshot.np_rng[0], + "np_pos": snapshot.np_rng[2], + "np_has_gauss": snapshot.np_rng[3], + "np_cached_gauss": snapshot.np_rng[4], + "py_rng_version": py_rng_version, + "py_guass_next": py_guass_next, + } + serialize(seed_info, seedinfo_path) + np_rng_state = snapshot.np_rng[1] + np_rng_state.tofile(np_rng_path) + + if snapshot.torch_rng is not None: + import torch + + torch.save(snapshot.torch_rng, torch_rng_path) + + if snapshot.torch_cuda_rng is not None: + import torch + + torch.save(snapshot.torch_cuda_rng, torch_cuda_rng_path) + + +@dataclass +class ReaderWriterOptimizerInfo(ReaderWriter[OptimizerInfo, Path]): + """ReaderWriter for OptimizerInfo objects.""" + + INFO_FILENAME: ClassVar = "info.yaml" + + @override + @classmethod + def read(cls, directory: Path) -> OptimizerInfo: + info_path = directory / cls.INFO_FILENAME + return OptimizerInfo(info=deserialize(info_path)) + + @override + @classmethod + def write(cls, optimizer_info: OptimizerInfo, directory: Path) -> None: + info_path = directory / cls.INFO_FILENAME + serialize(optimizer_info.info, info_path) + + +# TODO(eddiebergman): If an optimizer wants to store some hefty state, i.e. 
a numpy array +# or something, this is horribly inefficient and we would need to adapt OptimizerState to +# handle this. +# TODO(eddiebergman): May also want to consider serializing budget into a seperate entity +@dataclass +class ReaderWriterOptimizationState(ReaderWriter[OptimizationState, Path]): + """ReaderWriter for OptimizationState objects.""" + + STATE_FILE_NAME: ClassVar = "state.yaml" + + @override + @classmethod + def read(cls, directory: Path) -> OptimizationState: + state_path = directory / cls.STATE_FILE_NAME + state = deserialize(state_path) + budget_info = state.get("budget") + budget = BudgetInfo(**budget_info) if budget_info is not None else None + return OptimizationState( + shared_state=state.get("shared_state") or {}, + budget=budget, + ) + + @override + @classmethod + def write(cls, info: OptimizationState, directory: Path) -> None: + info_path = directory / cls.STATE_FILE_NAME + serialize(asdict(info), info_path) + + +@dataclass +class ReaderWriterErrDump(ReaderWriter[ErrDump, Path]): + """ReaderWriter for shared error lists.""" + + name: str + + @override + def read(self, directory: Path) -> ErrDump: + errors_path = directory / f"{self.name}-errors.jsonl" + with errors_path.open("r") as f: + data = [json.loads(line) for line in f] + + return ErrDump([ErrDump.SerializableTrialError(**d) for d in data]) + + @override + def write(self, err_dump: ErrDump, directory: Path) -> None: + errors_path = directory / f"{self.name}-errors.jsonl" + with errors_path.open("w") as f: + lines = [json.dumps(asdict(trial_err)) for trial_err in err_dump.errs] + f.write("\n".join(lines)) + + +FILELOCK_EXCLUSIVE_NONE_BLOCKING = pl.LOCK_EX | pl.LOCK_NB + + +@dataclass +class FileLocker(Locker): + """File-based locker using `portalocker`. + + [`FileLocker`][neps.state.locker.file.FileLocker] implements + the [`Locker`][neps.state.locker.locker.Locker] protocol using + `portalocker` to lock a file between processes with a shared + filesystem. + """ + + lock_path: Path + poll: float + timeout: float | None + + def __post_init__(self) -> None: + self.lock_path = self.lock_path.resolve().absolute() + + @override + def is_locked(self) -> bool: + if not self.lock_path.exists(): + return False + try: + with self.lock(fail_if_locked=True): + pass + return False + except pl.exceptions.LockException: + return True + + @override + @contextmanager + def lock( + self, + *, + fail_if_locked: bool = False, + ) -> Iterator[None]: + self.lock_path.parent.mkdir(parents=True, exist_ok=True) + self.lock_path.touch(exist_ok=True) + logger.debug("Acquiring lock on %s", self.lock_path) + with pl.Lock( + self.lock_path, + check_interval=self.poll, + timeout=self.timeout, + flags=FILELOCK_EXCLUSIVE_NONE_BLOCKING, + fail_when_locked=fail_if_locked, + ): + yield + logger.debug("Released lock on %s", self.lock_path) + + +def load_filebased_neps_state(directory: Path) -> NePSState[Path]: + """Load a NePSState from a directory. + + Args: + directory: The directory to load the state from. + + Returns: + The loaded NePSState. + + Raises: + FileNotFoundError: If no NePSState is found at the given directory. 
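# Minimal sketch of guarding a critical section with the FileLocker defined above;
# the lock path and timings are illustrative placeholders. Workers on a shared
# filesystem that point at the same lock file exclude each other.
from pathlib import Path

from neps.state.filebased import FileLocker

locker = FileLocker(lock_path=Path("run_dir/.lock"), poll=0.1, timeout=10.0)

with locker.lock():
    # Only one process at a time runs this block; others poll every 0.1s and
    # give up after roughly 10 seconds.
    ...

assert not locker.is_locked()  # the lock is released when the context exits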
+ """ + if not directory.exists(): + raise FileNotFoundError(f"No NePSState found at '{directory}'.") + directory.mkdir(parents=True, exist_ok=True) + config_dir = directory / "configs" + config_dir.mkdir(parents=True, exist_ok=True) + seed_dir = directory / ".seed_state" + seed_dir.mkdir(parents=True, exist_ok=True) + error_dir = directory / ".errors" + error_dir.mkdir(parents=True, exist_ok=True) + optimizer_state_dir = directory / ".optimizer_state" + optimizer_state_dir.mkdir(parents=True, exist_ok=True) + optimizer_info_dir = directory / ".optimizer_info" + optimizer_info_dir.mkdir(parents=True, exist_ok=True) + + return NePSState( + location=str(directory.absolute().resolve()), + _trials=TrialRepoInDirectory(config_dir), + _optimizer_info=Synced.load( + location=optimizer_info_dir, + versioner=FileVersioner(version_file=optimizer_info_dir / ".version"), + locker=FileLocker( + lock_path=optimizer_info_dir / ".lock", + poll=0.01, + timeout=None, + ), + reader_writer=ReaderWriterOptimizerInfo(), + ), + _seed_state=Synced.load( + location=seed_dir, + reader_writer=ReaderWriterSeedSnapshot(), + versioner=FileVersioner(version_file=seed_dir / ".version"), + locker=FileLocker( + lock_path=seed_dir / ".lock", + poll=SEED_SNAPSHOT_FILELOCK_POLL, + timeout=SEED_SNAPSHOT_FILELOCK_TIMEOUT, + ), + ), + _shared_errors=Synced.load( + location=error_dir, + reader_writer=ReaderWriterErrDump("all"), + versioner=FileVersioner(version_file=error_dir / ".all.version"), + locker=FileLocker( + lock_path=error_dir / ".all.lock", + poll=GLOBAL_ERR_FILELOCK_POLL, + timeout=GLOBAL_ERR_FILELOCK_TIMEOUT, + ), + ), + _optimizer_state=Synced.load( + location=optimizer_state_dir, + reader_writer=ReaderWriterOptimizationState(), + versioner=FileVersioner(version_file=optimizer_state_dir / ".version"), + locker=FileLocker( + lock_path=optimizer_state_dir / ".lock", + poll=GLOBAL_ERR_FILELOCK_POLL, + timeout=GLOBAL_ERR_FILELOCK_TIMEOUT, + ), + ), + ) + + +def create_or_load_filebased_neps_state( + directory: Path, + *, + optimizer_info: OptimizerInfo, + optimizer_state: OptimizationState, +) -> NePSState[Path]: + """Create a new NePSState in a directory or load the existing one + if it already exists. + + !!! warning + + We check that the optimizer info in the NePSState on disk matches + the one that is passed. However we do not lock this check so it + is possible that if two processes try to create a NePSState at the + same time, both with different optimizer infos, that one will fail + to create the NePSState. This is a limitation of the current design. + + In principal, we could allow multiple optimizers to be run and share + the same set of trials. + + Args: + directory: The directory to create the state in. + optimizer_info: The optimizer info to use. + optimizer_state: The optimizer state to use. + + Returns: + The NePSState. + + Raises: + NePSError: If the optimizer info on disk does not match the one provided. 
+ """ + is_new = not directory.exists() + directory.mkdir(parents=True, exist_ok=True) + config_dir = directory / "configs" + config_dir.mkdir(parents=True, exist_ok=True) + seed_dir = directory / ".seed_state" + seed_dir.mkdir(parents=True, exist_ok=True) + error_dir = directory / ".errors" + error_dir.mkdir(parents=True, exist_ok=True) + optimizer_state_dir = directory / ".optimizer_state" + optimizer_state_dir.mkdir(parents=True, exist_ok=True) + optimizer_info_dir = directory / ".optimizer_info" + optimizer_info_dir.mkdir(parents=True, exist_ok=True) + + # We have to do one bit of sanity checking to ensure that the optimzier + # info on disk manages the one we have recieved, otherwise we are unsure which + # optimizer is being used. + # NOTE: We assume that we do not have to worry about a race condition + # here where we have two different NePSState objects with two different optimizer + # infos trying to be created at the same time. This avoids the need to lock to + # check the optimizer info. If this assumption changes, then we would have + # to first lock before we do this check + optimizer_info_reader_writer = ReaderWriterOptimizerInfo() + if not is_new: + existing_info = optimizer_info_reader_writer.read(optimizer_info_dir) + if existing_info != optimizer_info: + raise NePSError( + "The optimizer info on disk does not match the one provided." + f"\nOn disk: {existing_info}\nProvided: {optimizer_info}" + f"\n\nLoaded the one on disk from {optimizer_info_dir}." + ) + + return NePSState( + location=str(directory.absolute().resolve()), + _trials=TrialRepoInDirectory(config_dir), + _optimizer_info=Synced.new_or_load( + data=optimizer_info, # type: ignore + location=optimizer_info_dir, + versioner=FileVersioner(version_file=optimizer_info_dir / ".version"), + locker=FileLocker( + lock_path=optimizer_info_dir / ".lock", + poll=0.01, + timeout=None, + ), + reader_writer=ReaderWriterOptimizerInfo(), + ), + _seed_state=Synced.new_or_load( + data=SeedSnapshot.new_capture(), + location=seed_dir, + reader_writer=ReaderWriterSeedSnapshot(), + versioner=FileVersioner(version_file=seed_dir / ".version"), + locker=FileLocker( + lock_path=seed_dir / ".lock", + poll=SEED_SNAPSHOT_FILELOCK_POLL, + timeout=SEED_SNAPSHOT_FILELOCK_TIMEOUT, + ), + ), + _shared_errors=Synced.new_or_load( + data=ErrDump(), + location=error_dir, + reader_writer=ReaderWriterErrDump("all"), + versioner=FileVersioner(version_file=error_dir / ".all.version"), + locker=FileLocker( + lock_path=error_dir / ".all.lock", + poll=GLOBAL_ERR_FILELOCK_POLL, + timeout=GLOBAL_ERR_FILELOCK_TIMEOUT, + ), + ), + _optimizer_state=Synced.new_or_load( + data=optimizer_state, + location=optimizer_state_dir, + reader_writer=ReaderWriterOptimizationState(), + versioner=FileVersioner(version_file=optimizer_state_dir / ".version"), + locker=FileLocker( + lock_path=optimizer_state_dir / ".lock", + poll=GLOBAL_ERR_FILELOCK_POLL, + timeout=GLOBAL_ERR_FILELOCK_TIMEOUT, + ), + ), + ) diff --git a/neps/state/neps_state.py b/neps/state/neps_state.py new file mode 100644 index 00000000..8afaee62 --- /dev/null +++ b/neps/state/neps_state.py @@ -0,0 +1,231 @@ +"""The main state object that holds all the shared state objects. + +This object is used to interact with the shared state objects in a safe atomic +manner, such that each worker can create an identical NePSState and interact with +it without having to worry about locking or out-dated information. 
+ +For an actual instantiation of this object, see +[`create_or_load_filebased_neps_state`][neps.state.filebased.create_or_load_filebased_neps_state]. +""" + +from __future__ import annotations + +import logging +import time +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Callable, Generic, TypeVar, overload + +from more_itertools import take + +from neps.state.err_dump import ErrDump +from neps.state.optimizer import OptimizationState, OptimizerInfo +from neps.state.trial import Trial + +if TYPE_CHECKING: + from neps.optimizers.base_optimizer import BaseOptimizer + from neps.state.protocols import Synced, TrialRepo + from neps.state.seed_snapshot import SeedSnapshot + +logger = logging.getLogger(__name__) + +# TODO: Technically we don't need the same Location type for all shared objects. +Loc = TypeVar("Loc") +T = TypeVar("T") + + +@dataclass +class NePSState(Generic[Loc]): + """The main state object that holds all the shared state objects.""" + + location: str + + _trials: TrialRepo[Loc] = field(repr=False) + _optimizer_info: Synced[OptimizerInfo, Loc] + _seed_state: Synced[SeedSnapshot, Loc] = field(repr=False) + _optimizer_state: Synced[OptimizationState, Loc] + _shared_errors: Synced[ErrDump, Loc] = field(repr=False) + + def put_updated_trial(self, trial: Trial, /) -> None: + """Update the trial with the new information. + + Args: + trial: The trial to update. + + Raises: + VersionMismatchError: If the trial has been updated since it was last + fetched by the worker using this state. This indicates that some other + worker has updated the trial in the meantime and the changes from + this worker are rejected. + """ + shared_trial = self._trials.get_by_id(trial.id) + shared_trial.put(trial) + + def get_trial_by_id(self, trial_id: str, /) -> Trial: + """Get a trial by its id.""" + return self._trials.get_by_id(trial_id).synced() + + def get_trials_by_ids(self, trial_ids: list[str], /) -> dict[str, Trial | None]: + """Get trials by their ids.""" + return { + _id: shared_trial.synced() + for _id, shared_trial in self._trials.get_by_ids(trial_ids).items() + } + + def sample_trial( + self, + optimizer: BaseOptimizer, + *, + worker_id: str, + _sample_hooks: list[Callable] | None = None, + ) -> Trial: + """Sample a new trial from the optimizer. + + Args: + optimizer: The optimizer to sample the trial from. + worker_id: The worker that is sampling the trial. + _sample_hooks: A list of hooks to apply to the optimizer before sampling. + + Returns: + The new trial. + """ + with self._optimizer_state.acquire() as ( + opt_state, + put_opt, + ), self._seed_state.acquire() as (seed_state, put_seed_state): + trials: dict[Trial.ID, Trial] = {} + for trial_id, shared_trial in self._trials.all().items(): + trial = shared_trial.synced() + trials[trial_id] = trial + + seed_state.set_as_global_seed_state() + + # TODO: Not sure if any existing pre_load hooks required + # it to be done after `load_results`... I hope not. 
+ if _sample_hooks is not None: + for hook in _sample_hooks: + optimizer = hook(optimizer) + + # NOTE: We don't want optimizers mutating this before serialization + budget = opt_state.budget.clone() if opt_state.budget is not None else None + sampled_config, new_opt_state = optimizer.ask( + trials=trials, + budget_info=budget, + optimizer_state=opt_state.shared_state, + ) + + if sampled_config.previous_config_id is not None: + previous_trial = trials.get(sampled_config.previous_config_id) + if previous_trial is None: + raise ValueError( + f"Previous trial '{sampled_config.previous_config_id}' not found." + ) + previous_trial_location = previous_trial.metadata.location + else: + previous_trial_location = None + + trial = Trial.new( + trial_id=sampled_config.id, + location="", # HACK: This will be set by the `TrialRepo` + config=sampled_config.config, + previous_trial=sampled_config.previous_config_id, + previous_trial_location=previous_trial_location, + time_sampled=time.time(), + worker_id=worker_id, + ) + shared_trial = self._trials.put_new(trial) + seed_state.recapture() + put_seed_state(seed_state) + put_opt( + OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) + ) + + return trial + + def report_trial_evaluation( + self, + trial: Trial, + report: Trial.Report, + optimizer: BaseOptimizer, + *, + worker_id: str, + ) -> None: + """Update the trial with the evaluation report and update the optimizer state + accordingly. + + Args: + trial: The trial that was evaluated. + report: The evaluation report. + optimizer: The optimizer to update and get the state from + worker_id: The worker that evaluated the trial. + """ + shared_trial = self._trials.get_by_id(trial.id) + # TODO: This would fail if some other worker has already updated the trial. + + # IMPORTANT: We need to attach the report to the trial before updating the things. + trial.report = report + shared_trial.put(trial) + logger.debug("Updated trial '%s' with status '%s'", trial.id, trial.state) + with self._optimizer_state.acquire() as (opt_state, put_opt_state): + optimizer.update_state_post_evaluation(opt_state.shared_state, report) + + # TODO: If an optimizer doesn't use the state, this is a waste of time. + # Update the budget if we have one. + if opt_state.budget is not None: + budget_info = opt_state.budget + + if report.cost is not None: + budget_info.used_cost_budget += report.cost + put_opt_state(opt_state) + + if report.err is not None: + with self._shared_errors.acquire() as (errs, put_errs): + trial_err = ErrDump.SerializableTrialError( + trial_id=trial.id, + worker_id=worker_id, + err_type=type(report.err).__name__, + err=str(report.err), + tb=report.tb, + ) + errs.append(trial_err) + put_errs(errs) + + def get_errors(self) -> ErrDump: + """Get all the errors that have occurred during the optimization.""" + return self._shared_errors.synced() + + @overload + def get_next_pending_trial(self) -> Trial | None: ... + @overload + def get_next_pending_trial(self, n: int | None = None) -> list[Trial]: ... + + def get_next_pending_trial(self, n: int | None = None) -> Trial | list[Trial] | None: + """Get the next pending trial to evaluate. + + Args: + n: The number of trials to get. If `None`, get the next trial. + + Returns: + The next trial or a list of trials if `n` is not `None`. 
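# Sketch of how the pieces combine in a single worker step: sample a trial,
# evaluate it with the user's pipeline function, then report the result back.
# `neps_state` is a NePSState (e.g. from create_or_load_filebased_neps_state),
# while `my_optimizer` and `my_pipeline` are placeholders for a BaseOptimizer
# and a user evaluation function respectively.
from neps.state._eval import evaluate_trial
from neps.state.settings import DefaultReportValues

defaults = DefaultReportValues()  # every field defaults to None

trial = neps_state.sample_trial(my_optimizer, worker_id="worker-0")
trial, report = evaluate_trial(
    trial,
    evaluation_fn=my_pipeline,
    default_report_values=defaults,
)
neps_state.report_trial_evaluation(trial, report, my_optimizer, worker_id="worker-0")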
+ """ + _pending_itr = ( + shared_trial.synced() for _, shared_trial in self._trials.pending() + ) + if n is not None: + return take(n, _pending_itr) + return next(_pending_itr, None) + + def all_trial_ids(self) -> set[Trial.ID]: + """Get all the trial ids that are known about.""" + return self._trials.all_trial_ids() + + def get_all_trials(self) -> dict[Trial.ID, Trial]: + """Get all the trials that are known about.""" + return {_id: trial.synced() for _id, trial in self._trials.all().items()} + + def optimizer_info(self) -> OptimizerInfo: + """Get the optimizer information.""" + return self._optimizer_info.synced() + + def optimizer_state(self) -> OptimizationState: + """Get the optimizer state.""" + return self._optimizer_state.synced() diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py new file mode 100644 index 00000000..f4000b07 --- /dev/null +++ b/neps/state/optimizer.py @@ -0,0 +1,57 @@ +"""Optimizer state and info dataclasses.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Mapping + + +@dataclass +class BudgetInfo: + """Information about the budget of an optimizer.""" + + max_cost_budget: float + used_cost_budget: float + + @property + def remaining_cost_budget(self) -> float: + """The remaining budget.""" + return self.max_cost_budget - self.used_cost_budget + + def clone(self) -> BudgetInfo: + """Clone the budget info.""" + return BudgetInfo( + max_cost_budget=self.max_cost_budget, + used_cost_budget=self.used_cost_budget, + ) + + +@dataclass +class OptimizationState: + """The current state of an optimizer.""" + + budget: BudgetInfo | None + """Information regarind the budget used by the optimization trajectory.""" + + shared_state: dict[str, Any] + """Any information the optimizer wants to store between calls + to sample and post evaluations. + + For example, an optimizer may wish to store running totals here or various other + bits of information that may be expensive to recompute. + + Right now there's no support for tensors/arrays and almost no optimizer uses this + feature. Only cost-cooling uses information out of `.budget`. + + Please reach out to @eddiebergman if you have a use case for this so we can make + it more robust. + """ + + +@dataclass +class OptimizerInfo: + """Meta-information about an optimizer.""" + + # TODO(eddiebergman): What are the common keywords + # we can use that don't have to be crammed into mapping + info: Mapping[str, Any] diff --git a/neps/state/protocols.py b/neps/state/protocols.py new file mode 100644 index 00000000..78fcee0d --- /dev/null +++ b/neps/state/protocols.py @@ -0,0 +1,560 @@ +"""This module defines the protocols used by +[`NePSState`][neps.state.neps_state.NePSState] and +[`Synced`][neps.state.synced.Synced] to ensure atomic operations to the state itself. 
+""" + +from __future__ import annotations + +import logging +from contextlib import contextmanager +from copy import deepcopy +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, ClassVar, Generic, Iterable, Iterator, TypeVar +from typing_extensions import Protocol, Self + +from neps.exceptions import ( + LockFailedError, + TrialAlreadyExistsError, + TrialNotFoundError, + VersionedResourceAlreadyExistsError, + VersionedResourceDoesNotExistsError, + VersionedResourceRemovedError, + VersionMismatchError, +) + +if TYPE_CHECKING: + from neps.state import Trial + +logger = logging.getLogger(__name__) + +T = TypeVar("T") +K = TypeVar("K") + +# https://github.com/MaT1g3R/option/issues/40 +K2 = TypeVar("K2") +T2 = TypeVar("T2") + +Loc_contra = TypeVar("Loc_contra", contravariant=True) + + +class Versioner(Protocol): + """A versioner that can bump the version of a resource. + + It should have some [`current()`][neps.state.protocols.Versioner.current] method + to give the current version tag of a resource and a + [`bump()`][neps.state.protocols.Versioner.bump] method to provide a new version tag. + + These [`current()`][neps.state.protocols.Versioner.current] and + [`bump()`][neps.state.protocols.Versioner.bump] methods do not need to be atomic + but they should read/write to external state, i.e. file-system, database, etc. + """ + + def current(self) -> str | None: + """Return the current version as defined by the external state, i.e. + the version of the tag on disk. + + Returns: + The current version if there is one written. + """ + ... + + def bump(self) -> str: + """Create a new external version tag. + + Returns: + The new version tag. + """ + ... + + +class Locker(Protocol): + """A locker that can be used to communicate between workers.""" + + LockFailedError: ClassVar = LockFailedError + + @contextmanager + def lock(self) -> Iterator[None]: + """Initiate the lock as a context manager, releasing it when done.""" + ... + + def is_locked(self) -> bool: + """Check if lock is...well, locked. + + Should return True if the resource is locked, even if the lock is held by the + current worker/process. + """ + ... + + +class ReaderWriter(Protocol[T, Loc_contra]): + """A reader-writer that can read and write some resource T with location Loc. + + For example, a `ReaderWriter[Trial, Path]` indicates a class that can read and write + trials, given some `Path`. + """ + + def read(self, loc: Loc_contra, /) -> T: + """Read the resource at the given location.""" + ... + + def write(self, value: T, loc: Loc_contra, /) -> None: + """Write the resource at the given location.""" + ... + + +class TrialRepo(Protocol[K]): + """A repository of trials. + + The primary purpose of this protocol is to ensure consistent access to trial, + the ability to put in a new trial and know about the trials that are stored there. + """ + + TrialAlreadyExistsError: ClassVar = TrialAlreadyExistsError + TrialNotFoundError: ClassVar = TrialNotFoundError + + def all_trial_ids(self) -> set[Trial.ID]: + """List all the trial ids in this trial Repo.""" + ... + + def get_by_id(self, trial_id: Trial.ID) -> Synced[Trial, K]: + """Get a trial by its id.""" + ... + + def get_by_ids(self, trial_ids: list[Trial.ID]) -> dict[str, Synced[Trial, K]]: + """Get trials by their ids.""" + ... + + def put_new(self, trial: Trial) -> Synced[Trial, K]: + """Put a new trial in the repo.""" + ... + + def all(self) -> dict[Trial.ID, Synced[Trial, K]]: + """Get all trials in the repo.""" + ... 
+ + def pending(self) -> Iterable[tuple[Trial.ID, Synced[Trial, K]]]: + """Get all pending trials in the repo. + + !!! note + This should return trials in the order in which they should be next evaluated, + usually the order in which they were put in the repo. + """ + ... + + +@dataclass +class VersionedResource(Generic[T, K]): + """A resource that will be read if it needs to update to the latest version. + + Relies on 3 main components: + * A [`Versioner`][neps.state.protocols.Versioner] to manage the versioning of the + resource. + * A [`ReaderWriter`][neps.state.protocols.ReaderWriter] to read and write the + resource. + * The location of the resource that can be used for the reader-writer. + """ + + VersionMismatchError: ClassVar = VersionMismatchError + VersionedResourceDoesNotExistsError: ClassVar = VersionedResourceDoesNotExistsError + VersionedResourceAlreadyExistsError: ClassVar = VersionedResourceAlreadyExistsError + VersionedResourceRemovedError: ClassVar = VersionedResourceRemovedError + + _current: T + _location: K + _version: str + _versioner: Versioner + _reader_writer: ReaderWriter[T, K] + + @staticmethod + def new( + *, + data: T2, + location: K2, + versioner: Versioner, + reader_writer: ReaderWriter[T2, K2], + ) -> VersionedResource[T2, K2]: + """Create a new VersionedResource. + + This will create a new resource if it doesn't exist, otherwise, + if it already exists, it will raise an error. + + Use [`load()`][neps.state.protocols.VersionedResource.load] if you want to + load an existing resource. + + Args: + data: The data to be stored. + location: The location where the data will be stored. + versioner: The versioner to be used. + reader_writer: The reader-writer to be used. + + Returns: + A new VersionedResource + + Raises: + VersionedResourceAlreadyExistsError: If a versioned resource already exists + at the given location. + """ + current_version = versioner.current() + if current_version is not None: + raise VersionedResourceAlreadyExistsError( + f"A versioend resource already already exists at '{location}'" + f" with version '{current_version}'" + ) + + version = versioner.bump() + reader_writer.write(data, location) + return VersionedResource( + _current=data, + _location=location, + _version=version, + _versioner=versioner, + _reader_writer=reader_writer, + ) + + @classmethod + def load( + cls, + *, + location: K2, + versioner: Versioner, + reader_writer: ReaderWriter[T2, K2], + ) -> VersionedResource[T2, K2]: + """Load an existing VersionedResource. + + This will load an existing resource if it exists, otherwise, it will raise an + error. + + Use [`new()`][neps.state.protocols.VersionedResource.new] if you want to + create a new resource. + + Args: + location: The location of the resource. + versioner: The versioner to be used. + reader_writer: The reader-writer to be used. + + Returns: + A VersionedResource + + Raises: + VersionedResourceDoesNotExistsError: If no versioned resource exists at + the given location. + """ + version = versioner.current() + if version is None: + raise cls.VersionedResourceDoesNotExistsError( + f"No versioned resource exists at '{location}'." 
+            )
+        data = reader_writer.read(location)
+        return VersionedResource(
+            _current=data,
+            _location=location,
+            _version=version,
+            _versioner=versioner,
+            _reader_writer=reader_writer,
+        )
+
+    def sync_and_get(self) -> T:
+        """Get the data and version of the resource."""
+        self.sync()
+        return self._current
+
+    def sync(self) -> None:
+        """Sync the resource with the latest version."""
+        current_version = self._versioner.current()
+        if current_version is None:
+            raise self.VersionedResourceRemovedError(
+                f"Versioned resource at '{self._location}' has been removed!"
+                f" Last known version was '{self._version}'."
+            )
+
+        if self._version != current_version:
+            self._current = self._reader_writer.read(self._location)
+            self._version = current_version
+
+    def put(self, data: T) -> None:
+        """Put the data and version of the resource.
+
+        Raises:
+            VersionMismatchError: If the version of the resource is not the same as the
+                current version. This implies that the resource has been updated by
+                another worker.
+        """
+        current_version = self._versioner.current()
+        if self._version != current_version:
+            raise self.VersionMismatchError(
+                f"Version mismatch - ours: '{self._version}', remote: '{current_version}'"
+                f" Tried to put data at '{self._location}'. Doing so would overwrite"
+                " changes made by another worker. The solution is to pull the latest"
+                " version of the resource and try again."
+                " The most likely reason for this error is that a lock was not"
+                " utilized when getting this resource before putting it back."
+            )
+
+        self._reader_writer.write(data, self._location)
+        self._current = data
+        self._version = self._versioner.bump()
+
+    def current(self) -> T:
+        """Get the current data of the resource."""
+        return self._current
+
+    def is_stale(self) -> bool:
+        """Check if the resource is stale."""
+        return self._version != self._versioner.current()
+
+    def location(self) -> K:
+        """Get the location of the resource."""
+        return self._location
+
+
+@dataclass
+class Synced(Generic[T, K]):
+    """Manages a versioned resource, but its methods also implement locking procedures
+    for accessing it.
+
+    Its types are parametrized by two type variables:
+
+    * `T` is the type of the data stored in the resource.
+    * `K` is the type of the location of the resource, for example `Path`.
+
+    This wraps a [`VersionedResource`][neps.state.protocols.VersionedResource] and
+    additionally provides utility to perform atomic operations on it using a
+    [`Locker`][neps.state.protocols.Locker].
+
+    This is used by [`NePSState`][neps.state.neps_state.NePSState] to manage the state
+    of trials and other shared resources.
+
+    It consists of 2 main components:
+
+    * A [`VersionedResource`][neps.state.protocols.VersionedResource] to manage the
+      versioning of the resource.
+    * A [`Locker`][neps.state.protocols.Locker] to manage the locking of the resource.
+
+    The primary methods to interact with a resource that is behind a `Synced` are:
+
+    * [`synced()`][neps.state.protocols.Synced.synced] to get the data of the resource
+      after syncing it to its latest version.
+    * [`acquire()`][neps.state.protocols.Synced.acquire] context manager to get the latest
+      version of the data while also maintaining a lock on it. This additionally provides
+      a `put()` operation to put the data back. This can primarily be used to get the
+      data, perform some mutation on it and then put it back, while not allowing other
+      workers access to the data.
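# Sketch of the acquire()/put pattern described above, mirroring how
# NePSState.sample_trial and report_trial_evaluation use it: lock the resource,
# sync to the latest version, mutate, then put the data back. `synced_state`
# stands in for a Synced[OptimizationState, Path] instance and the
# "evaluations_seen" key is an illustrative entry in the free-form shared_state.
with synced_state.acquire() as (opt_state, put):
    counter = opt_state.shared_state.get("evaluations_seen", 0)
    opt_state.shared_state["evaluations_seen"] = counter + 1
    put(opt_state)
# The lock is released here; other workers now observe the updated counter.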
+ """ + + LockFailedError: ClassVar = Locker.LockFailedError + VersionedResourceRemovedError: ClassVar = ( + VersionedResource.VersionedResourceRemovedError + ) + VersionMismatchError: ClassVar = VersionedResource.VersionMismatchError + VersionedResourceAlreadyExistsError: ClassVar = ( + VersionedResource.VersionedResourceAlreadyExistsError + ) + VersionedResourceDoesNotExistsError: ClassVar = ( + VersionedResource.VersionedResourceDoesNotExistsError + ) + + _resource: VersionedResource[T, K] + _locker: Locker + + @classmethod + def new( + cls, + *, + locker: Locker, + data: T2, + location: K2, + versioner: Versioner, + reader_writer: ReaderWriter[T2, K2], + ) -> Synced[T2, K2]: + """Create a new Synced resource. + + This will create a new resource if it doesn't exist, otherwise, + if it already exists, it will raise an error. + + Use [`load()`][neps.state.protocols.Synced.load] if you want to load an existing + resource. Use [`new_or_load()`][neps.state.protocols.Synced.new_or_load] if you + want to create a new resource if it doesn't exist, otherwise load an existing + resource. + + Args: + locker: The locker to be used. + data: The data to be stored. + location: The location where the data will be stored. + versioner: The versioner to be used. + reader_writer: The reader-writer to be used. + + Returns: + A new Synced resource. + + Raises: + VersionedResourceAlreadyExistsError: If a versioned resource already exists + at the given location. + """ + with locker.lock(): + vr = VersionedResource.new( + data=data, + location=location, + versioner=versioner, + reader_writer=reader_writer, + ) + return Synced(_resource=vr, _locker=locker) + + @classmethod + def load( + cls, + *, + locker: Locker, + location: K2, + versioner: Versioner, + reader_writer: ReaderWriter[T2, K2], + ) -> Synced[T2, K2]: + """Load an existing Synced resource. + + This will load an existing resource if it exists, otherwise, it will raise an + error. + + Use [`new()`][neps.state.protocols.Synced.new] if you want to create a new + resource. Use [`new_or_load()`][neps.state.protocols.Synced.new_or_load] if you + want to create a new resource if it doesn't exist, otherwise load an existing + resource. + + Args: + locker: The locker to be used. + location: The location of the resource. + versioner: The versioner to be used. + reader_writer: The reader-writer to be used. + + Returns: + A Synced resource. + + Raises: + VersionedResourceDoesNotExistsError: If no versioned resource exists at + the given location. + """ + with locker.lock(): + return Synced( + _resource=VersionedResource.load( + location=location, + versioner=versioner, + reader_writer=reader_writer, + ), + _locker=locker, + ) + + @classmethod + def new_or_load( + cls, + *, + locker: Locker, + data: T2, + location: K2, + versioner: Versioner, + reader_writer: ReaderWriter[T2, K2], + ) -> Synced[T2, K2]: + """Create a new Synced resource if it doesn't exist, otherwise load it. + + This will create a new resource if it doesn't exist, otherwise, it will load + an existing resource. + + Use [`new()`][neps.state.protocols.Synced.new] if you want to create a new + resource and fail otherwise. Use [`load()`][neps.state.protocols.Synced.load] + if you want to load an existing resource and fail if it doesn't exist. + + Args: + locker: The locker to be used. + data: The data to be stored. + + !!! warning + + This will be ignored if the data already exists. + + location: The location where the data will be stored. + versioner: The versioner to be used. 
+ reader_writer: The reader-writer to be used. + + Returns: + A Synced resource. + """ + try: + return Synced.new( + locker=locker, + data=data, + location=location, + versioner=versioner, + reader_writer=reader_writer, + ) + except VersionedResourceAlreadyExistsError: + return Synced.load( + locker=locker, + location=location, + versioner=versioner, + reader_writer=reader_writer, + ) + + def synced(self) -> T: + """Get the data of the resource atomically.""" + with self._locker.lock(): + return self._resource.sync_and_get() + + def location(self) -> K: + """Get the location of the resource.""" + return self._resource.location() + + def put(self, data: T) -> None: + """Update the data atomically.""" + with self._locker.lock(): + self._resource.put(data) + + @contextmanager + def acquire(self) -> Iterator[tuple[T, Callable[[T], None]]]: + """Acquire the lock and get the data of the resource. + + This is a context manager that returns the data of the resource and a function + to put the data back. + + !!! note + This is the primary way to get the resource, mutate it and put it back. + Otherwise you likely want [`synced()`][neps.state.protocols.Synced.synced] + or [`put()`][neps.state.protocols.Synced.put]. + + Yields: + A tuple containing the data of the resource and a function to put the data + back. + """ + with self._locker.lock(): + self._resource.sync() + yield self._resource.current(), self._put_unsafe + + def deepcopy(self) -> Self: + """Create a deep copy of the shared resource.""" + return deepcopy(self) + + def _components(self) -> tuple[T, K, Versioner, ReaderWriter[T, K], Locker]: + """Get the components of the shared resource.""" + return ( + self._resource.current(), + self._resource.location(), + self._resource._versioner, + self._resource._reader_writer, + self._locker, + ) + + def _unsynced(self) -> T: + """Get the current data of the resource **without** locking and syncing it.""" + return self._resource.current() + + def _is_stale(self) -> bool: + """Check if the data held currently is not the latest version.""" + return self._resource.is_stale() + + def _is_locked(self) -> bool: + """Check if the resource is locked.""" + return self._locker.is_locked() + + def _put_unsafe(self, data: T) -> None: + """Put the data without checking for staleness or acquiring the lock. + + !!! warning + This should only really be called if you know what you're doing. + """ + self._resource.put(data) diff --git a/neps/state/seed_snapshot.py b/neps/state/seed_snapshot.py new file mode 100644 index 00000000..0f9fad87 --- /dev/null +++ b/neps/state/seed_snapshot.py @@ -0,0 +1,115 @@ +"""Snapshot of the global rng state.""" + +from __future__ import annotations + +import contextlib +import random +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, List, Tuple, Union +from typing_extensions import TypeAlias + +import numpy as np + +if TYPE_CHECKING: + import torch + + NP_RNG_STATE: TypeAlias = Tuple[str, np.ndarray, int, int, float] + PY_RNG_STATE: TypeAlias = Tuple[int, Tuple[int, ...], Union[int, None]] + TORCH_RNG_STATE: TypeAlias = torch.Tensor + TORCH_CUDA_RNG_STATE: TypeAlias = List[torch.Tensor] + + +@dataclass +class SeedSnapshot: + """State of the global rng. + + Primarly enables storing of the rng state to disk using a binary format + native to each library, allowing for potential version mistmatches between + processes loading the state, as long as they can read the binary format. 
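# Small sketch of the capture/restore cycle SeedSnapshot provides: snapshot the
# global rng, draw some numbers, rewind, and draw the same numbers again.
import random

from neps.state.seed_snapshot import SeedSnapshot

snapshot = SeedSnapshot.new_capture()
first = [random.random() for _ in range(3)]

snapshot.set_as_global_seed_state()
second = [random.random() for _ in range(3)]

assert first == second  # the global state was rewound to the captured point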
+ """ + + np_rng: NP_RNG_STATE + py_rng: PY_RNG_STATE + torch_rng: TORCH_RNG_STATE | None + torch_cuda_rng: TORCH_CUDA_RNG_STATE | None + + @classmethod + def new_capture(cls) -> SeedSnapshot: + """Current state of the global rng. + + Takes a snapshot, including cloning or copying any arrays, tensors, etc. + """ + self = cls(None, None, None, None) # type: ignore + self.recapture() + return self + + def recapture(self) -> None: + """Reread the state of the global rng into this snapshot.""" + # https://numpy.org/doc/stable/reference/random/generated/numpy.random.get_state.html + + self.py_rng = random.getstate() + + np_keys = np.random.get_state(legacy=True) + assert np_keys[0] == "MT19937" # type: ignore + self.np_rng = (np_keys[0], np_keys[1].copy(), *np_keys[2:]) # type: ignore + + with contextlib.suppress(Exception): + import torch + + self.torch_rng = torch.random.get_rng_state().clone() + torch_cuda_keys: list[torch.Tensor] | None = None + if torch.cuda.is_available(): + torch_cuda_keys = [c.clone() for c in torch.cuda.get_rng_state_all()] + self.torch_cuda_rng = torch_cuda_keys + + def set_as_global_seed_state(self) -> None: + """Set the global rng to the given state.""" + np.random.set_state(self.np_rng) + random.setstate(self.py_rng) + + if self.torch_rng is not None or self.torch_cuda_rng is not None: + import torch + + if self.torch_rng is not None: + torch.random.set_rng_state(self.torch_rng) + + if self.torch_cuda_rng is not None and torch.cuda.is_available(): + torch.cuda.set_rng_state_all(self.torch_cuda_rng) + + def __eq__(self, other: Any, /) -> bool: # noqa: PLR0911 + if not isinstance(other, SeedSnapshot): + return False + + if not (self.py_rng == other.py_rng): + return False + + if not ( + self.np_rng[0] == other.np_rng[0] + and self.np_rng[2] == other.np_rng[2] + and self.np_rng[3] == other.np_rng[3] + and self.np_rng[4] == other.np_rng[4] + ): + return False + + if not np.array_equal(self.np_rng[1], other.np_rng[1]): + return False + + if self.torch_rng is not None and other.torch_rng is not None: + import torch + + if not torch.equal(self.torch_rng, other.torch_rng): + return False + + if self.torch_cuda_rng is not None and other.torch_cuda_rng is not None: + import torch + + if not all( + torch.equal(a, b) + for a, b in zip(self.torch_cuda_rng, other.torch_cuda_rng) + ): + return False + + if not isinstance(self.torch_rng, type(other.torch_rng)): + return False + + return isinstance(self.torch_cuda_rng, type(other.torch_cuda_rng)) diff --git a/neps/state/settings.py b/neps/state/settings.py new file mode 100644 index 00000000..f34a9435 --- /dev/null +++ b/neps/state/settings.py @@ -0,0 +1,171 @@ +"""Settings for the worker and the global state of NePS.""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Literal + + +@dataclass +class DefaultReportValues: + """Values to use when an error occurs.""" + + loss_value_on_error: float | None = None + """The value to use for the loss when an error occurs.""" + + cost_value_on_error: float | None = None + """The value to use for the cost when an error occurs.""" + + cost_if_not_provided: float | None = None + """The value to use for the cost when the evaluation function does not provide one.""" + + learning_curve_on_error: list[float] | None = None + """The value to use for the learning curve when an error occurs. + + If `'loss'`, the learning curve will be set to the loss value but as + a list with a single value. 
+ """ + + learning_curve_if_not_provided: Literal["loss"] | list[float] | None = None + """The value to use for the learning curve when the evaluation function does + not provide one.""" + + +class OnErrorPossibilities(Enum): + """Possible values for what to do when an error occurs.""" + + RAISE_WORKER_ERROR = "raise_worker_error" + """Raise an error only if the error occurs in the worker.""" + + STOP_WORKER_ERROR = "stop_worker_error" + """Stop the worker if an error occurs in the worker, without raising""" + + RAISE_ANY_ERROR = "raise_any_error" + """Raise an error if there was an error from any worker, i.e. there is a trial in the + NePSState that has an error.""" + + STOP_ANY_ERROR = "stop_any_error" + """Stop the workers if any error occured from any worker, i.e. there is a trial in the + NePSState that has an error.""" + + IGNORE = "ignore" + """Ignore all errors and continue running.""" + + +# TODO: We can extend this over time +# For now this is what was needed for the backend state and workers. +@dataclass +class WorkerSettings: + """Settings for a running instance of NePS.""" + + # --------- Evaluation --------- + on_error: OnErrorPossibilities + """What to do when an error occurs. + + - `'raise_worker_error'`: Raise an error only if the error occurs in the worker. + - `'raise_any_error'`: Raise an error if any error occurs from any worker, i.e. + there is a trial in the NePSState that has an error. + - `'ignore'`: Ignore all errors and continue running. + """ + + default_report_values: DefaultReportValues + """Values to use when an error occurs or was not specified.""" + + # --------- Global Stopping Criterion --------- + max_evaluations_total: int | None + """The maximum number of evaluations to run in total. + + Once this evaluation total is reached, **all** workers will stop evaluating + new configurations. + + To control whether currently evaluating configurations are included in this + total, see + [`include_in_progress_evaluations_towards_maximum`][neps.state.settings.WorkerSettings.include_in_progress_evaluations_towards_maximum]. + + If `None`, there is no limit and workers will continue to evaluate + indefinitely. + """ + + include_in_progress_evaluations_towards_maximum: bool + """Whether to include currently evaluating configurations towards the + stopping criterion + [`max_evaluations_total`][neps.state.settings.WorkerSettings.max_evaluations_total] + """ + + max_cost_total: float | None + """The maximum cost to run in total. + + Once this cost total is reached, **all** workers will stop evaluating new + configurations. + + This cost is the sum of `'cost'` values that are returned by evaluation + of the target function. + + If `None`, there is no limit and workers will continue to evaluate + indefinitely or until another stopping criterion is met. + """ + + max_evaluation_time_total_seconds: float | None + """The maximum wallclock time allowed for evaluation in total. + + !!! note + This does not include time for sampling new configurations. + + Once this wallclock time is reached, **all** workers will stop once their + current evaluation is finished. + + If `None`, there is no limit and workers will continue to evaluate + indefinitely or until another stopping criterion is met. + """ + + # --------- Local Worker Stopping Criterion --------- + max_evaluations_for_worker: int | None + """The maximum number of evaluations to run for the worker. + + This count is specific to each worker spawned by NePS. 
+ **only** the current worker will stop evaluating new configurations once + this limit is reached. + + If `None`, there is no limit and this worker will continue to evaluate + indefinitely or until another stopping criterion is met. + """ + + max_cost_for_worker: float | None + """The maximum cost incurred by a worker before finisihng. + + Once this cost total is reached, **only** this worker will stop evaluating new + configurations. + + This cost is the sum of `'cost'` values that are returned by evaluation + of the target function. + + If `None`, there is no limit and the worker will continue to evaluate + indefinitely or until another stopping criterion is met. + """ + + max_evaluation_time_for_worker_seconds: float | None + """The maximum time to allow this worker for evaluating configurations. + + !!! note + This does not include time for sampling new configurations. + + If `None`, there is no limit and this worker will continue to evaluate + indefinitely or until another stopping criterion is met. + """ + + max_wallclock_time_for_worker_seconds: float | None + """The maximum wallclock time to run for this worker. + + Once this wallclock time is reached, **only** this worker will stop evaluating + new configurations. + + !!! warning + This will not stop the worker if it is currently evaluating a configuration. + + This is useful when the worker is deployed on some managed resource where + there is a time limit. + + If `None`, there is no limit and this worker will continue to evaluate + indefinitely or until another stopping criterion is met. + """ diff --git a/neps/state/trial.py b/neps/state/trial.py new file mode 100644 index 00000000..862e2bbb --- /dev/null +++ b/neps/state/trial.py @@ -0,0 +1,289 @@ +"""A trial is a configuration and it's associated data.""" + +from __future__ import annotations + +import logging +from dataclasses import asdict, dataclass +from enum import Enum +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, Mapping +from typing_extensions import Self + +import numpy as np + +from neps.exceptions import NePSError +from neps.utils.types import ConfigResult + +if TYPE_CHECKING: + from neps.search_spaces import SearchSpace + from neps.utils.types import ERROR, RawConfig + + +logger = logging.getLogger(__name__) + + +class NotReportedYetError(NePSError): + """Raised when trying to access a report that has not been reported yet.""" + + +class State(Enum): + """The state of a trial.""" + + PENDING = "pending" + SUBMITTED = "submitted" + EVALUATING = "evaluating" + SUCCESS = "success" + FAILED = "failed" + CRASHED = "crashed" + CORRUPTED = "corrupted" + UNKNOWN = "unknown" + + +@dataclass +class MetaData: + """Metadata for a trial.""" + + id: str + location: str + previous_trial_id: Trial.ID | None + previous_trial_location: str | None + sampling_worker_id: str + time_sampled: float + + evaluating_worker_id: str | None = None + evaluation_duration: float | None = None + + time_submitted: float | None = None + time_started: float | None = None + time_end: float | None = None + + +@dataclass +class Report: + """A failed report of the evaluation of a configuration.""" + + trial_id: Trial.ID + loss: float | None + cost: float | None + learning_curve: list[float] | None # TODO: Serializing a large list into yaml sucks! 
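
For reference, a hedged sketch of constructing the settings objects defined in neps/state/settings.py above. The field names mirror the dataclass definitions (no defaults are declared, so every field is passed explicitly, as the new tests later in this patch also do); the concrete values are illustrative only.

from neps.state.settings import (
    DefaultReportValues,
    OnErrorPossibilities,
    WorkerSettings,
)

settings = WorkerSettings(
    on_error=OnErrorPossibilities.STOP_ANY_ERROR,
    default_report_values=DefaultReportValues(loss_value_on_error=100.0),
    max_evaluations_total=50,                        # global budget across all workers
    include_in_progress_evaluations_towards_maximum=False,
    max_cost_total=None,
    max_evaluation_time_total_seconds=None,
    max_evaluations_for_worker=10,                   # local budget for this worker only
    max_cost_for_worker=None,
    max_evaluation_time_for_worker_seconds=None,
    max_wallclock_time_for_worker_seconds=None,
)
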
+ extra: Mapping[str, Any] + err: Exception | None + tb: str | None + reported_as: Literal["success", "failed", "crashed"] + evaluation_duration: float | None + + def __post_init__(self) -> None: + if isinstance(self.err, str): + self.err = Exception(self.err) # type: ignore + + def to_deprecate_result_dict(self) -> dict[str, Any] | ERROR: + """Return the report as a dictionary.""" + if self.reported_as == "success": + d = {"loss": self.loss, "cost": self.cost, **self.extra} + + # HACK: Backwards compatibility. Not sure how much this is needed + # but it should be removed once optimizers stop calling the + # `get_loss`, `get_cost`, `get_learning_curve` methods of `BaseOptimizer` + # and just use the `Report` directly. + if "info_dict" not in d or "learning_curve" not in d["info_dict"]: + d.setdefault("info_dict", {})["learning_curve"] = self.learning_curve + return d + + return "error" + + def __eq__(self, value: Any, /) -> bool: + # HACK : Since it could be probably that one of loss or cost is nan, + # we need a custom comparator for this object + # HACK : We also have to skip over the `Err` object since when it's deserialized, + # we can not recover the original object/type. + if not isinstance(value, Report): + return False + + other_items = value.__dict__ + for k, v in self.__dict__.items(): + other_v = other_items[k] + + # HACK: Deserialization of `Err` means we can only compare + # the string representation of the error. + if k == "err": + if str(v) != str(other_v): + return False + elif k in ("loss", "cost"): + if v is not None and np.isnan(v): + if other_v is None or not np.isnan(other_v): + return False + elif v != other_v: + return False + elif v != other_v: + return False + + return True + + +@dataclass +class Trial: + """A trial is a configuration and it's associated data.""" + + ID: ClassVar = str + State: ClassVar = State + Report: ClassVar = Report + MetaData: ClassVar = MetaData + NotReportedYetError: ClassVar = NotReportedYetError + + config: Mapping[str, Any] + metadata: MetaData + state: State + report: Report | None + + @classmethod + def new( + cls, + *, + trial_id: Trial.ID, + config: Mapping[str, Any], + location: str, + previous_trial: Trial.ID | None, + previous_trial_location: str | None, + time_sampled: float, + worker_id: int | str, + ) -> Self: + """Create a new trial object that was just sampled.""" + worker_id = str(worker_id) + return cls( + state=State.PENDING, + config=config, + metadata=MetaData( + id=trial_id, + location=location, + time_sampled=time_sampled, + previous_trial_id=previous_trial, + previous_trial_location=previous_trial_location, + sampling_worker_id=worker_id, + ), + report=None, + ) + + @property + def id(self) -> Trial.ID: + """Return the id of the trial.""" + return self.metadata.id + + def into_config_result( + self, + config_to_search_space: Callable[[RawConfig], SearchSpace], + ) -> ConfigResult: + """Convert the trial and report to a `ConfigResult` object.""" + if self.report is None: + raise self.NotReportedYetError("The trial has not been reported yet.") + + result: dict[str, Any] | ERROR + if self.report.reported_as == "success": + result = { + **self.report.extra, + "loss": self.report.loss, + "cost": self.report.cost, + } + else: + result = "error" + + return ConfigResult( + self.id, + config=config_to_search_space(self.config), + result=result, + metadata=asdict(self.metadata), + ) + + def set_submitted(self, *, time_submitted: float) -> None: + """Set the trial as submitted.""" + self.metadata.time_submitted = time_submitted 
+ self.state = State.SUBMITTED + + def set_evaluating(self, *, time_started: float, worker_id: int | str) -> None: + """Set the trial as in progress.""" + self.metadata.time_started = time_started + self.metadata.evaluating_worker_id = str(worker_id) + self.state = State.EVALUATING + + def set_complete( + self, + *, + report_as: Literal["success", "failed", "crashed"], + time_end: float, + loss: float | None, + cost: float | None, + learning_curve: list[float] | None, + err: Exception | None, + tb: str | None, + extra: Mapping[str, Any] | None, + evaluation_duration: float | None, + ) -> Report: + """Set the report for the trial.""" + if report_as == "success": + self.state = State.SUCCESS + elif report_as == "failed": + self.state = State.FAILED + elif report_as == "crashed": + self.state = State.CRASHED + else: + raise ValueError(f"Invalid report_as: '{report_as}'") + + self.metadata.time_end = time_end + self.metadata.evaluation_duration = evaluation_duration + + extra = {} if extra is None else extra + + loss = float(loss) if loss is not None else None + cost = float(cost) if cost is not None else None + if learning_curve is not None: + learning_curve = [float(v) for v in learning_curve] + + return Report( + trial_id=self.metadata.id, + reported_as=report_as, + evaluation_duration=evaluation_duration, + loss=loss, + cost=cost, + learning_curve=learning_curve, + extra=extra, + err=err, + tb=tb, + ) + + def set_corrupted(self) -> None: + """Set the trial as corrupted.""" + self.state = State.CORRUPTED + + def reset(self) -> None: + """Reset the trial to a pending state.""" + self.state = State.PENDING + self.metadata = MetaData( + id=self.metadata.id, + location=self.metadata.location, + previous_trial_id=self.metadata.previous_trial_id, + previous_trial_location=self.metadata.previous_trial_location, + time_sampled=self.metadata.time_sampled, + sampling_worker_id=self.metadata.sampling_worker_id, + ) + + +def to_config_result( + trial: Trial, + report: Report, + config_to_search_space: Callable[[RawConfig], SearchSpace], +) -> ConfigResult: + """Convert the trial and report to a `ConfigResult` object.""" + result: dict[str, Any] | ERROR + if report.reported_as == "success": + result = { + **report.extra, + "loss": report.loss, + "cost": report.cost, + } + else: + result = "error" + + return ConfigResult( + trial.id, + config=config_to_search_space(trial.config), + result=result, + metadata=asdict(trial.metadata), + ) diff --git a/neps/status/status.py b/neps/status/status.py index 0199a911..e2f43eb6 100644 --- a/neps/status/status.py +++ b/neps/status/status.py @@ -3,13 +3,14 @@ # ruff: noqa: T201 from __future__ import annotations -from itertools import chain +from dataclasses import asdict from pathlib import Path from typing import TYPE_CHECKING, Any import pandas as pd -from neps.runtime import ErrorReport, SharedState, Trial +from neps.state.filebased import load_filebased_neps_state +from neps.state.trial import Trial from neps.utils._locker import Locker from neps.utils.types import ConfigID, _ConfigResultForStats @@ -36,30 +37,34 @@ def get_summary_dict( # NOTE: We don't lock the shared state since we are just reading and don't need to # make decisions based on the state - shared_state = SharedState(root_directory) - shared_state.update_from_disk() + shared_state = load_filebased_neps_state(root_directory) - trials_by_state = shared_state.trials_by_state() + trials = shared_state.get_all_trials() evaluated: dict[ConfigID, _ConfigResultForStats] = {} - for trial in chain( - 
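
A hedged sketch of the trial lifecycle defined in neps/state/trial.py above: sampled, submitted, evaluating, then completed with a report. In normal operation the worker in neps/runtime.py drives these transitions and stores the returned report; here they are called directly, and the id, config and location values are illustrative only.

import time

from neps.state.trial import Trial

trial = Trial.new(
    trial_id="1",
    config={"a": 0.5},
    location="results/config_1",      # illustrative location
    previous_trial=None,
    previous_trial_location=None,
    time_sampled=time.time(),
    worker_id="worker-0",
)
trial.set_submitted(time_submitted=time.time())
trial.set_evaluating(time_started=time.time(), worker_id="worker-0")

# set_complete returns a Report; assigning it back onto the trial mirrors
# what the runtime is assumed to do after an evaluation finishes.
trial.report = trial.set_complete(
    report_as="success",
    time_end=time.time(),
    loss=0.42,
    cost=None,
    learning_curve=None,
    err=None,
    tb=None,
    extra={},
    evaluation_duration=1.2,
)
assert trial.state == Trial.State.SUCCESS
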
trials_by_state[Trial.State.SUCCESS], - trials_by_state[Trial.State.ERROR], - ): - assert trial.report is not None + for trial in trials.values(): + if trial.report is None: + continue + _result_for_stats = _ConfigResultForStats( - trial.id, - trial.config, - "error" if isinstance(trial.report, ErrorReport) else trial.report.results, - trial.metadata, + id=trial.id, + config=trial.config, + result=trial.report.to_deprecate_result_dict(), + metadata=asdict(trial.metadata), ) evaluated[trial.id] = _result_for_stats in_progress = { - trial.id: trial.config for trial in trials_by_state[Trial.State.IN_PROGRESS] + trial.id: trial.config + for trial in trials.values() + if trial.State == Trial.State.EVALUATING + } + pending = { + trial.id: trial.config + for trial in trials.values() + if trial.State == Trial.State.PENDING } - pending = {trial.id: trial.config for trial in trials_by_state[Trial.State.PENDING]} summary: dict[str, Any] = {} diff --git a/neps/utils/_rng.py b/neps/utils/_rng.py deleted file mode 100644 index 0705837f..00000000 --- a/neps/utils/_rng.py +++ /dev/null @@ -1,176 +0,0 @@ -from __future__ import annotations - -import json -import random -from contextlib import contextmanager -from dataclasses import dataclass -from pathlib import Path -from typing import Iterator, List, Tuple, Union -from typing_extensions import TypeAlias - -import numpy as np -import torch - -NP_RNG_STATE: TypeAlias = Tuple[str, np.ndarray, int, int, float] -PY_RNG_STATE: TypeAlias = Tuple[int, Tuple[int, ...], Union[int, None]] -TORCH_RNG_STATE: TypeAlias = torch.Tensor -TORCH_CUDA_RNG_STATE: TypeAlias = List[torch.Tensor] - - -@dataclass -class SeedState: - """State of the global rng. - - Primarly enables storing of the rng state to disk using a binary format - native to each library, allowing for potential version mistmatches between - processes loading the state, as long as they can read the binary format. - """ - - # It seems like they're all uint32 but I can't be sure. - PY_RNG_STATE_DTYPE = np.int64 - - np_rng: NP_RNG_STATE - py_rng: PY_RNG_STATE - torch_rng: TORCH_RNG_STATE - torch_cuda_rng: TORCH_CUDA_RNG_STATE | None - - @classmethod - def get(cls) -> SeedState: - """Current state of the global rng. - - Takes a snapshot, including cloning or copying any arrays, tensors, etc. 
- """ - # https://numpy.org/doc/stable/reference/random/generated/numpy.random.get_state.html - np_keys = np.random.get_state(legacy=True) - assert np_keys[0] == "MT19937" # type: ignore - np_keys = (np_keys[0], np_keys[1].copy(), *np_keys[2:]) # type: ignore - - py_rng = random.getstate() - torch_rng = torch.random.get_rng_state().clone() - torch_cuda_keys: list[torch.Tensor] | None = None - if torch.cuda.is_available(): - torch_cuda_keys = [c.clone() for c in torch.cuda.get_rng_state_all()] - - return cls( - np_rng=np_keys, # type: ignore - py_rng=py_rng, - torch_rng=torch_rng, - torch_cuda_rng=torch_cuda_keys, - ) - - def set_as_global_state(self) -> None: - """Set the global rng to the given state.""" - np.random.set_state(self.np_rng) - random.setstate(self.py_rng) - torch.random.set_rng_state(self.torch_rng) - if self.torch_cuda_rng and torch.cuda.is_available(): - torch.cuda.set_rng_state_all(self.torch_cuda_rng) - - def dump(self, path: Path) -> None: - """Save the state to a directory.""" - if path.exists(): - assert path.is_dir() - else: - path.mkdir(parents=True) - - py_rng_version, py_rng_state, py_guass_next = self.py_rng - np_rng_kind, np_rng_state, np_pos, np_has_gauss, np_cached_gauss = self.np_rng - - seed_info = { - "np_rng_kind": np_rng_kind, - "np_pos": np_pos, - "np_has_gauss": np_has_gauss, - "np_cached_gauss": np_cached_gauss, - "py_rng_version": py_rng_version, - "py_guass_next": py_guass_next, - } - - # NOTE(eddiebergman): Chose JSON since it's fast and non-injectable - with (path / "seed_info.json").open("w") as f: - json.dump(seed_info, f) - - py_rng_state_arr = np.array(py_rng_state, dtype=self.PY_RNG_STATE_DTYPE) - with (path / "py_rng.npy").open("wb") as f: - py_rng_state_arr.tofile(f) - - with (path / "np_rng_state.npy").open("wb") as f: - np_rng_state.tofile(f) - - torch.save(self.torch_rng, path / "torch_rng_state.pt") - - if self.torch_cuda_rng: - torch.save(self.torch_cuda_rng, path / "torch_cuda_rng_state.pt") - - @classmethod - def load(cls, path: Path) -> SeedState: - assert path.is_dir() - - with (path / "seed_info.json").open("r") as f: - seed_info = json.load(f) - - # Load and set pythons rng - py_rng_state = tuple( - int(x) for x in np.fromfile(path / "py_rng.npy", dtype=cls.PY_RNG_STATE_DTYPE) - ) - np_rng_state = np.fromfile(path / "np_rng_state.npy", dtype=np.uint32) - - # By specifying `weights_only=True`, it disables arbitrary object loading - torch_rng_state = torch.load(path / "torch_rng_state.pt", weights_only=True) - - torch_cuda_rng = None - torch_cuda_rng_path = path / "torch_cuda_rng_state.pt" - if torch_cuda_rng_path.exists(): - # By specifying `weights_only=True`, it disables arbitrary object loading - torch_cuda_rng = torch.load( - path / "torch_cuda_rng_state.pt", - weights_only=True, - ) - - return cls( - np_rng=( - seed_info["np_rng_kind"], - np_rng_state, - seed_info["np_pos"], - seed_info["np_has_gauss"], - seed_info["np_cached_gauss"], - ), - py_rng=( - seed_info["py_rng_version"], - py_rng_state, - seed_info["py_guass_next"], - ), - torch_rng=torch_rng_state, - torch_cuda_rng=torch_cuda_rng, - ) - - @classmethod - @contextmanager - def use( - cls, - path: Path, - *, - update_on_exit: bool = True, - ) -> Iterator[SeedState]: - """Context manager to use a seed state. - - If the path exists, load the seed state from the path and set it as the - global state. Otherwise, use the current global state. - - Args: - path: Path to the seed state. 
- update_on_exit: If True, get the seed state after the context manager returns - and save it to the path. - - Yields: - SeedState: The seed state in use. - """ - if path.exists(): - seed_state = cls.load(path) - seed_state.set_as_global_state() - else: - seed_state = cls.get() - - yield seed_state - - if update_on_exit: - cls.get().dump(path) diff --git a/neps/utils/common.py b/neps/utils/common.py index f80a01ea..2a9ca586 100644 --- a/neps/utils/common.py +++ b/neps/utils/common.py @@ -10,7 +10,7 @@ import torch import yaml -from neps.runtime import get_in_progress_trial +from neps.runtime import get_in_progress_trial, get_workers_neps_state # TODO(eddiebergman): I feel like this function should throw an error if it can't @@ -36,13 +36,10 @@ def load_checkpoint( """ if directory is None: trial = get_in_progress_trial() - - if trial is None: - return None - - directory = trial.disk.previous_pipeline_dir + directory = trial.metadata.previous_trial_location if directory is None: return None + assert isinstance(directory, str) directory = Path(directory) checkpoint_path = (directory / checkpoint_name).with_suffix(".pth") @@ -79,14 +76,7 @@ def save_checkpoint( """ if directory is None: in_progress_trial = get_in_progress_trial() - - if in_progress_trial is None: - raise ValueError( - "No current trial was found to save the checkpoint! This should not" - " happen. Please report this issue and in the meantime you may provide a" - " directory manually." - ) - directory = in_progress_trial.pipeline_dir + directory = in_progress_trial.metadata.location directory = Path(directory) checkpoint_path = (directory / checkpoint_name).with_suffix(".pth") @@ -115,8 +105,8 @@ def load_lightning_checkpoint( checkpoint data. Args: - previous_pipeline_directory: The previous pipeline directory. checkpoint_dir: The directory where checkpoint files are stored. + previous_pipeline_directory: The previous pipeline directory. Returns: A tuple containing the checkpoint path (str) and the loaded checkpoint data (dict) @@ -124,9 +114,7 @@ def load_lightning_checkpoint( """ if previous_pipeline_directory is None: trial = get_in_progress_trial() - if trial is not None: - previous_pipeline_directory = trial.disk.previous_pipeline_dir - + previous_pipeline_directory = trial.metadata.previous_trial_location if previous_pipeline_directory is None: return None, None @@ -151,6 +139,9 @@ def load_lightning_checkpoint( return checkpoint_path, checkpoint +# TODO: We should have a better way to have a shared folder between trials. +# Right now, the fidelity lineage is linear, however this will be a difficulty +# when/if we have a tree structure. def get_initial_directory(pipeline_directory: Path | str | None = None) -> Path: """Find the initial directory based on its existence and the presence of the "previous_config.id" file. @@ -161,35 +152,24 @@ def get_initial_directory(pipeline_directory: Path | str | None = None) -> Path: Returns: The initial directory. """ + neps_state = get_workers_neps_state() if pipeline_directory is not None: pipeline_directory = Path(pipeline_directory) + # TODO: Hard coded assumption + config_id = pipeline_directory.name.split("_", maxsplit=1)[-1] + trial = neps_state.get_trial_by_id(config_id) else: trial = get_in_progress_trial() - if trial is None: - raise ValueError( - "No current trial was found to get the initial directory! This should not" - " happen. Please report this issue and in the meantime you may provide" - " a directory manually." 
- ) - pipeline_directory = trial.pipeline_dir - - # TODO(eddiebergman): Can we just make this a method of the Trial class somehow? - # This relies on the fact it's always called "previous_config.id" which could subtly - # break, if it were to be updated. # Recursively find the initial directory - current_pipeline_directory = pipeline_directory - while True: - previous_pipeline_directory_id = current_pipeline_directory / "previous_config.id" - if not previous_pipeline_directory_id.exists(): - # Initial directory found - return pipeline_directory + while (prev_trial_id := trial.metadata.previous_trial_id) is not None: + trial = neps_state.get_trial_by_id(prev_trial_id) - optim_result_dir = pipeline_directory.parent - with previous_pipeline_directory_id.open("r") as config_id_file: - config_id = config_id_file.read() + initial_dir = trial.metadata.location - current_pipeline_directory = optim_result_dir / f"config_{config_id}" + # TODO: Hard coded assumption that we are operating in a filebased neps + assert isinstance(initial_dir, str) + return Path(initial_dir) def get_searcher_data( @@ -363,14 +343,14 @@ def instance_from_map( # noqa: C901, PLR0912 # Give the arguments to the class if args_dict: - instance = partial(instance, **args_dict) + instance = partial(instance, **args_dict) # type: ignore if as_class: return instance if is_partial_class(instance): try: - instance = instance() + instance = instance() # type: ignore except TypeError as e: raise TypeError(f"{e} when calling {instance} with {args_dict}") from e diff --git a/neps/utils/data_loading.py b/neps/utils/data_loading.py index 0bdb15e3..a0f86210 100644 --- a/neps/utils/data_loading.py +++ b/neps/utils/data_loading.py @@ -5,6 +5,7 @@ import json import os import re +from dataclasses import asdict from itertools import chain from pathlib import Path from typing import Any, Mapping, TypedDict @@ -12,7 +13,7 @@ import numpy as np import yaml -from neps.runtime import ErrorReport, SharedState, Trial +from neps.state.filebased import load_filebased_neps_state from neps.utils.types import ERROR, ConfigID, ResultDict, _ConfigResultForStats @@ -135,24 +136,20 @@ def read_tasks_and_dev_stages_from_disk( if dev_id is None: continue - state = SharedState(Path(dev_dir_path)) - state.update_from_disk() - trials_by_state = state.trials_by_state() + state = load_filebased_neps_state(Path(dev_dir_path)) + trials = state.get_all_trials() evaluated: dict[ConfigID, _ConfigResultForStats] = {} - for trial in chain( - trials_by_state[Trial.State.SUCCESS], - trials_by_state[Trial.State.ERROR], - ): - assert trial.report is not None + for trial in trials.values(): + if trial.report is None: + continue + _result_for_stats = _ConfigResultForStats( trial.id, trial.config, - "error" - if isinstance(trial.report, ErrorReport) - else trial.report.results, - trial.metadata, + trial.report.to_deprecate_result_dict(), + asdict(trial.metadata), ) evaluated[trial.id] = _result_for_stats @@ -181,27 +178,24 @@ def read_user_prior_results_from_disk( if not prior_dir.is_dir(): continue - state = SharedState(prior_dir) - with state.sync(lock=False): - evaluated: dict[ConfigID, _ConfigResultForStats] = {} - trials_by_state = state.trials_by_state() + state = load_filebased_neps_state(Path(prior_dir)) + trials = state.get_all_trials() + evaluated: dict[ConfigID, _ConfigResultForStats] = {} - for trial in chain( - trials_by_state[Trial.State.SUCCESS], - trials_by_state[Trial.State.ERROR], - ): - assert trial.report is not None - _result_for_stats = 
_ConfigResultForStats( - trial.id, - trial.config, - "error" - if isinstance(trial.report, ErrorReport) - else trial.report.results, - trial.metadata, - ) - evaluated[trial.id] = _result_for_stats + for trial in trials.values(): + if trial.report is None: + continue - results[prior_dir.name] = evaluated + assert trial.report is not None + _result_for_stats = _ConfigResultForStats( + trial.id, + trial.config, + trial.report.to_deprecate_result_dict(), + asdict(trial.metadata), + ) + evaluated[trial.id] = _result_for_stats + + results[prior_dir.name] = evaluated return results @@ -328,25 +322,22 @@ def summarize_results( # noqa: C901 # TODO(unknown): only use IDs if provided final_results = results[final_task_id][final_dev_id] else: - state = SharedState(Path(seed_dir)) - with state.sync(lock=False): - trials_by_state = state.trials_by_state() - - final_results = {} - for trial in chain( - trials_by_state[Trial.State.SUCCESS], - trials_by_state[Trial.State.ERROR], - ): - assert trial.report is not None - _result_for_stats = _ConfigResultForStats( - trial.id, - trial.config, - "error" - if isinstance(trial.report, ErrorReport) - else trial.report.results, - trial.metadata, - ) - final_results[trial.id] = _result_for_stats + state = load_filebased_neps_state(Path(seed_dir)) + trials = state.get_all_trials() + + final_results = {} + for trial in trials.values(): + if trial.report is None: + continue + + assert trial.report is not None + _result_for_stats = _ConfigResultForStats( + trial.id, + trial.config, + trial.report.to_deprecate_result_dict(), + asdict(trial.metadata), + ) + final_results[trial.id] = _result_for_stats # This part is copied from neps.status() best_loss: float = float("inf") diff --git a/neps/utils/files.py b/neps/utils/files.py index 0111f2a7..ddb0627c 100644 --- a/neps/utils/files.py +++ b/neps/utils/files.py @@ -2,35 +2,47 @@ from __future__ import annotations +import dataclasses +from enum import Enum from pathlib import Path from typing import Any, Iterable, Mapping import yaml -def _serializable_format(data: Any) -> Any: +def serializable_format(data: Any) -> Any: # noqa: PLR0911 + """Format data to be serializable.""" if hasattr(data, "serialize"): - return _serializable_format(data.serialize()) + return serializable_format(data.serialize()) + + if dataclasses.is_dataclass(data) and not isinstance(data, type): + return serializable_format(dataclasses.asdict(data)) # type: ignore + + if isinstance(data, Exception): + return str(data) + + if isinstance(data, Enum): + return data.value if isinstance(data, Mapping): - return {key: _serializable_format(val) for key, val in data.items()} + return {key: serializable_format(val) for key, val in data.items()} if not isinstance(data, str) and isinstance(data, Iterable): - return [_serializable_format(val) for val in data] + return [serializable_format(val) for val in data] if type(data).__module__ in ["numpy", "torch"]: data = data.tolist() # type: ignore if type(data).__module__ == "numpy": data = data.item() - return _serializable_format(data) + return serializable_format(data) return data def serialize(data: Any, path: Path | str, *, sort_keys: bool = True) -> None: """Serialize data to a yaml file.""" - data = _serializable_format(data) + data = serializable_format(data) path = Path(path) with path.open("w") as file_stream: try: @@ -45,7 +57,15 @@ def serialize(data: Any, path: Path | str, *, sort_keys: bool = True) -> None: def deserialize(path: Path | str) -> dict[str, Any]: """Deserialize data from a yaml file.""" with 
Path(path).open("r") as file_stream: - return yaml.full_load(file_stream) # type: ignore + data = yaml.full_load(file_stream) # type: ignore + + if not isinstance(data, dict): + raise TypeError( + f"Deserialized data at {path} is not a dictionary!" + f" Got {type(data)} instead.\n{data}" + ) + + return data def empty_file(file_path: Path) -> bool: diff --git a/neps/utils/types.py b/neps/utils/types.py index 3316eb03..a6b6c540 100644 --- a/neps/utils/types.py +++ b/neps/utils/types.py @@ -7,18 +7,15 @@ from typing_extensions import TypeAlias import numpy as np -import torch if TYPE_CHECKING: from neps.search_spaces.search_space import SearchSpace + from neps.state.trial import Trial # TODO(eddiebergman): We can turn this to an enum at some # point to prevent having to isinstance and str match ERROR: TypeAlias = Literal["error"] - Number: TypeAlias = Union[int, float, np.number] -Array: TypeAlias = Union[np.ndarray, torch.Tensor] - ConfigID: TypeAlias = str RawConfig: TypeAlias = Mapping[str, Any] Metadata: TypeAlias = Dict[str, Any] @@ -54,9 +51,7 @@ class ConfigResult: config: SearchSpace """Configuration that was evaluated.""" - # TODO(eddiebergman): Check about using a `TypedDict` here since I'm pretty sure - # there's always a "loss" key - result: ResultDict | ERROR + result: Trial.Report | ResultDict | ERROR """Some dictionary of results.""" metadata: dict diff --git a/neps/utils/validation.py b/neps/utils/validation.py new file mode 100644 index 00000000..884df0c5 --- /dev/null +++ b/neps/utils/validation.py @@ -0,0 +1,31 @@ +"""Validation utilities for the NePS package.""" + +from __future__ import annotations + +import inspect +from typing import Any, Callable + +from neps.exceptions import NePSError + + +class DeprecatedArgumentError(NePSError): + """Raised when a deprecated argument is used.""" + + +def validate_run_pipeline_arguments(f: Callable[..., Any]) -> None: + """Validate the arguments of a run pipeline function to see if deprcated arguments + are used. + """ + evaluation_fn_params = inspect.signature(f).parameters + if "previous_working_directory" in evaluation_fn_params: + raise RuntimeError( + "the argument: 'previous_working_directory' was deprecated. " + f"In the function: '{f.__name__}', please, " + "use 'previous_pipeline_directory' instead. ", + ) + if "working_directory" in evaluation_fn_params: + raise RuntimeError( + "the argument: 'working_directory' was deprecated. " + f"In the function: '{f.__name__}', please, " + "use 'pipeline_directory' instead. 
", + ) diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index b254e16b..164b49cb 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -8,7 +8,7 @@ def run_pipeline(float1, float2, categorical, integer1, integer2): loss = -float(np.sum([float1, float2, int(categorical), integer1, integer2])) - time.sleep(0.7) # For demonstration purposes + # time.sleep(0.7) # For demonstration purposes return loss diff --git a/neps_examples/efficiency/multi_fidelity.py b/neps_examples/efficiency/multi_fidelity.py index 0731b1b5..bdbcc965 100644 --- a/neps_examples/efficiency/multi_fidelity.py +++ b/neps_examples/efficiency/multi_fidelity.py @@ -82,7 +82,7 @@ def run_pipeline(pipeline_directory, previous_pipeline_directory, learning_rate, run_pipeline=run_pipeline, pipeline_space=pipeline_space, root_directory="results/multi_fidelity_example", - # Optional: Do not start another evaluation after <=100 epochs, corresponds to cost + # Optional: Do not start another evaluation after <=50 epochs, corresponds to cost # field above. - max_cost_total=100, + max_cost_total=50, ) diff --git a/pyproject.toml b/pyproject.toml index 37cf020d..06b4baa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ ruff = "^0.4" pre-commit = "^3" mypy = "^1" pytest = "^7" +pytest-cases = "^3" types-PyYAML = "^6" mkdocs-material = "*" mkdocs-autorefs = "*" @@ -261,7 +262,7 @@ convention = "google" max-args = 10 # Changed from default of 5 [tool.pytest.ini_options] -addopts = "--basetemp ./tests_tmpdir -m 'neps_api or core_examples'" +addopts = "--basetemp ./tests_tmpdir -m 'not ci_examples'" markers = [ "ci_examples", "core_examples", @@ -302,7 +303,6 @@ module = [ "neps.api", "neps.optimizers.*", "neps.search_spaces.architecture.*", - "neps.search_spaces.yaml_search_space_utils", "neps.utils.run_args_from_yaml", ] ignore_errors = true diff --git a/tests/test_neps_api/test_api.py b/tests/test_neps_api/test_api.py index a50b91d1..32408007 100644 --- a/tests/test_neps_api/test_api.py +++ b/tests/test_neps_api/test_api.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import os import runpy @@ -54,11 +56,11 @@ def test_default_examples(tmp_path): assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - info_yaml_path = os.path.join(folder_path, ".optimizer_info.yaml") + info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") assert os.path.exists( str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}" + ), f"File does not exist: {info_yaml_path}\n{os.listdir(folder_path)}" # Load the YAML file with open(str(info_yaml_path)) as yaml_config: @@ -85,7 +87,7 @@ def test_baseoptimizer_examples(tmp_path): assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - info_yaml_path = os.path.join(folder_path, ".optimizer_info.yaml") + info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") assert os.path.exists( str(info_yaml_path) @@ -114,7 +116,7 @@ def test_user_created_yaml_examples(tmp_path): assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - info_yaml_path = os.path.join(folder_path, ".optimizer_info.yaml") + info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") assert os.path.exists( str(info_yaml_path) diff --git a/tests/test_runtime/__init__.py b/tests/test_runtime/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/test_runtime/test_default_report_values.py b/tests/test_runtime/test_default_report_values.py new file mode 100644 index 00000000..652db9de --- /dev/null +++ b/tests/test_runtime/test_default_report_values.py @@ -0,0 +1,170 @@ +from __future__ import annotations + +from pathlib import Path +from pytest_cases import fixture + +from neps.optimizers.random_search.optimizer import RandomSearch +from neps.runtime import DefaultWorker +from neps.search_spaces.search_space import SearchSpace +from neps.state.filebased import create_or_load_filebased_neps_state +from neps.state.neps_state import NePSState +from neps.state.optimizer import OptimizationState, OptimizerInfo +from neps.state.settings import DefaultReportValues, OnErrorPossibilities, WorkerSettings +from neps.search_spaces import FloatParameter +from neps.state.trial import Trial + + +@fixture +def neps_state(tmp_path: Path) -> NePSState[Path]: + return create_or_load_filebased_neps_state( + directory=tmp_path / "neps_state", + optimizer_info=OptimizerInfo(info={"nothing": "here"}), + optimizer_state=OptimizationState(budget=None, shared_state={}), + ) + + +def test_default_values_on_error( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues( + loss_value_on_error=2.4, # <- Highlight + cost_value_on_error=2.4, # <- Highlight + learning_curve_on_error=[2.4, 2.5], # <- Highlight + ), + max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=1, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + raise ValueError("This is an error") + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + trials = neps_state.get_all_trials() + n_crashed = sum( + trial.state == Trial.State.CRASHED is not None for trial in trials.values() + ) + assert len(trials) == 1 + assert n_crashed == 1 + + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 1 + + trial = trials.popitem()[1] + assert trial.state == Trial.State.CRASHED + assert trial.report is not None + assert trial.report.loss == 2.4 + assert trial.report.cost == 2.4 + assert trial.report.learning_curve == [2.4, 2.5] + + +def test_default_values_on_not_specified( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues( + cost_if_not_provided=2.4, + learning_curve_if_not_provided=[2.4, 2.5], + ), + max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=1, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + trials = 
neps_state.get_all_trials() + n_sucess = sum( + trial.state == Trial.State.SUCCESS is not None for trial in trials.values() + ) + assert len(trials) == 1 + assert n_sucess == 1 + + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 0 + + trial = trials.popitem()[1] + assert trial.state == Trial.State.SUCCESS + assert trial.report is not None + assert trial.report.cost == 2.4 + assert trial.report.learning_curve == [2.4, 2.5] + + +def test_default_value_loss_curve_take_loss_value( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(learning_curve_if_not_provided="loss"), + max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=1, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + LOSS = 1.0 + + def eval_function(*args, **kwargs) -> float: + return LOSS + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + trials = neps_state.get_all_trials() + n_sucess = sum( + trial.state == Trial.State.SUCCESS is not None for trial in trials.values() + ) + assert len(trials) == 1 + assert n_sucess == 1 + + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 0 + + trial = trials.popitem()[1] + assert trial.state == Trial.State.SUCCESS + assert trial.report is not None + assert trial.report.learning_curve == [LOSS] diff --git a/tests/test_runtime/test_error_handling_strategies.py b/tests/test_runtime/test_error_handling_strategies.py new file mode 100644 index 00000000..5e819448 --- /dev/null +++ b/tests/test_runtime/test_error_handling_strategies.py @@ -0,0 +1,200 @@ +from __future__ import annotations + +import pytest +from dataclasses import dataclass +from pandas.core.common import contextlib +from pathlib import Path +from pytest_cases import fixture, parametrize + +from neps.optimizers.random_search.optimizer import RandomSearch +from neps.runtime import DefaultWorker +from neps.search_spaces.search_space import SearchSpace +from neps.state.err_dump import SerializedError +from neps.state.filebased import create_or_load_filebased_neps_state +from neps.state.neps_state import NePSState +from neps.state.optimizer import OptimizationState, OptimizerInfo +from neps.state.settings import DefaultReportValues, OnErrorPossibilities, WorkerSettings +from neps.search_spaces import FloatParameter +from neps.state.trial import Trial + + +@fixture +def neps_state(tmp_path: Path) -> NePSState[Path]: + return create_or_load_filebased_neps_state( + directory=tmp_path / "neps_state", + optimizer_info=OptimizerInfo(info={"nothing": "here"}), + optimizer_state=OptimizationState(budget=None, shared_state={}), + ) + + +@parametrize( + "on_error", + [OnErrorPossibilities.RAISE_ANY_ERROR, OnErrorPossibilities.RAISE_WORKER_ERROR], +) +def test_worker_raises_when_error_in_self( + neps_state: NePSState, + on_error: OnErrorPossibilities, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=on_error, # <- Highlight + default_report_values=DefaultReportValues(), + 
max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=1, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + raise ValueError("This is an error") + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + with pytest.raises(ValueError, match="This is an error"): + worker.run() + + trials = neps_state.get_all_trials() + n_crashed = sum( + trial.state == Trial.State.CRASHED is not None for trial in trials.values() + ) + assert len(trials) == 1 + assert n_crashed == 1 + + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 1 + + +def test_worker_raises_when_error_in_other_worker(neps_state: NePSState) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.RAISE_ANY_ERROR, # <- Highlight + default_report_values=DefaultReportValues(), + max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=1, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def evaler(*args, **kwargs) -> float: + raise ValueError("This is an error") + + worker1 = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=evaler, + settings=settings, + _pre_sample_hooks=None, + ) + worker2 = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=evaler, + settings=settings, + _pre_sample_hooks=None, + ) + + # Worker1 should run 1 and error out + with contextlib.suppress(ValueError): + worker1.run() + + # Worker2 should not run and immeditaly error out, however + # it will have loaded in a serialized error + with pytest.raises(SerializedError): + worker2.run() + + trials = neps_state.get_all_trials() + n_crashed = sum( + trial.state == Trial.State.CRASHED is not None for trial in trials.values() + ) + assert len(trials) == 1 + assert n_crashed == 1 + + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 1 + + +@pytest.mark.parametrize( + "on_error", + [OnErrorPossibilities.IGNORE, OnErrorPossibilities.RAISE_WORKER_ERROR], +) +def test_worker_does_not_raise_when_error_in_other_worker( + neps_state: NePSState, + on_error: OnErrorPossibilities, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.RAISE_WORKER_ERROR, # <- Highlight + default_report_values=DefaultReportValues(), + max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=1, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + @dataclass + class _Eval: + do_raise: bool + + def __call__(self, *args, **kwargs) -> float: + if self.do_raise: + raise ValueError("This is an error") + return 10 + + evaler = _Eval(do_raise=True) + + worker1 = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=evaler, + 
settings=settings, + _pre_sample_hooks=None, + ) + worker2 = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=evaler, + settings=settings, + _pre_sample_hooks=None, + ) + + # Worker1 should run 1 and error out + evaler.do_raise = True + with contextlib.suppress(ValueError): + worker1.run() + assert worker1.worker_cumulative_eval_count == 1 + + # Worker2 should run successfully + evaler.do_raise = False + worker2.run() + assert worker2.worker_cumulative_eval_count == 1 + + trials = neps_state.get_all_trials() + n_success = sum( + trial.state == Trial.State.SUCCESS is not None for trial in trials.values() + ) + n_crashed = sum( + trial.state == Trial.State.CRASHED is not None for trial in trials.values() + ) + assert n_success == 1 + assert n_crashed == 1 + assert len(trials) == 2 + + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 1 diff --git a/tests/test_runtime/test_locking.py b/tests/test_runtime/test_locking.py deleted file mode 100644 index a79dfa23..00000000 --- a/tests/test_runtime/test_locking.py +++ /dev/null @@ -1,105 +0,0 @@ -import re -import shutil -import subprocess -from pathlib import Path - -import pandas as pd -import pytest -from more_itertools import first_true - - -def launch_example_processes(n_workers: int = 3) -> list: - processes = [] - for _ in range(n_workers): - processes.append( - subprocess.Popen( - "python -m neps_examples.basic_usage.hyperparameters && python -m neps_examples.basic_usage.analyse", - stdout=subprocess.PIPE, - shell=True, - text=True, - ) - ) - return processes - - -@pytest.mark.runtime -def test_filelock() -> None: - """Test that the filelocking method of parallelization works as intended.""" - # Note: Not using tmpdir - # - # Unfortunatly we can't control this from launching the subprocess so we handle - # clean up manualy. This is likely to break if doing multi-processing testing - # with something like pytest-forked - # - # Note: dependancy on basic_usage example - # - # Not a great idea incase the example changes, ideally each process here would - # perform some predefined operation which is known to this test. 
If the example - # changes in some unexpected way, it may break this test - results_dir = Path("results") / "hyperparameters_example" / "results" - assert not results_dir.exists(), "Please delete this directory before running the test" - try: - # Wait for them - p_list = launch_example_processes(n_workers=2) - for p in p_list: - p.wait() - out, _ = p.communicate() - lines = out.splitlines() - - pending_re = r"#Pending configs with worker:\s+(\d+)" - eval_re = r"#Evaluated configs:\s+(\d+)" - - evaluated = first_true((re.match(eval_re, l) for l in lines), default=0) # noqa - pending = first_true((re.match(pending_re, l) for l in lines), default=0) # noqa - - assert evaluated is not None - assert pending is not None - - if evaluated == 0: - evaluated_configs = 0 - else: - evaluated_configs = int(evaluated.groups()[0]) # type: ignore - - if pending == 0: - pending_configs = 0 - else: - pending_configs = int(pending.groups()[0]) # type: ignore - - # Make sure the evaluated configs and the ones pending add up to 15 - assert evaluated_configs + pending_configs == 15 - - # Make sure there are 15 completed configurations - expected = sorted(f"config_{i}" for i in range(1, 16)) - folders = sorted(f.name for f in results_dir.iterdir()) - assert folders == expected - - except Exception as e: - raise e - finally: - if results_dir.exists(): - shutil.rmtree(results_dir.parent) - - -@pytest.mark.summary_csv -def test_summary_csv(): - # Testing the csv files output. - summary_dir = Path("results") / "hyperparameters_example" / "summary_csv" - try: - if not summary_dir.exists(): - p_list = launch_example_processes(n_workers=2) - for p in p_list: - p.wait() - assert summary_dir.is_dir() - run_data_df = pd.read_csv(summary_dir / "run_status.csv") - run_data_df.set_index("description", inplace=True) - num_evaluated_configs_csv = run_data_df.loc["num_evaluated_configs", "value"] - assert num_evaluated_configs_csv == 15 - - config_data_df = pd.read_csv(summary_dir / "config_data.csv") - assert config_data_df.shape[0] == 15 - assert (config_data_df["status"] == "complete").all() - except Exception as e: - raise e - finally: - if summary_dir.exists(): - shutil.rmtree(summary_dir.parent) diff --git a/tests/test_runtime/test_stopping_criterion.py b/tests/test_runtime/test_stopping_criterion.py new file mode 100644 index 00000000..28426a1f --- /dev/null +++ b/tests/test_runtime/test_stopping_criterion.py @@ -0,0 +1,481 @@ +from __future__ import annotations + +import time +from pathlib import Path +from pytest_cases import fixture + +from neps.optimizers.random_search.optimizer import RandomSearch +from neps.runtime import DefaultWorker +from neps.search_spaces.search_space import SearchSpace +from neps.state.filebased import create_or_load_filebased_neps_state +from neps.state.neps_state import NePSState +from neps.state.optimizer import OptimizationState, OptimizerInfo +from neps.state.settings import DefaultReportValues, OnErrorPossibilities, WorkerSettings +from neps.search_spaces import FloatParameter +from neps.state.trial import Trial + + +@fixture +def neps_state(tmp_path: Path) -> NePSState[Path]: + return create_or_load_filebased_neps_state( + directory=tmp_path / "neps_state", + optimizer_info=OptimizerInfo(info={"nothing": "here"}), + optimizer_state=OptimizationState(budget=None, shared_state={}), + ) + + +def test_max_evaluations_total_stopping_criterion( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + 
on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=3, # <- Highlight + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + assert worker.worker_cumulative_eval_count == 3 + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + for _, trial in trials.items(): + assert trial.state == Trial.State.SUCCESS + assert trial.report is not None + assert trial.report.loss == 1.0 + + # New worker has the same total number of evaluations so it should not run anything. + new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + new_worker.run() + assert new_worker.worker_cumulative_eval_count == 0 + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 0 + + +def test_worker_evaluations_total_stopping_criterion( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=None, + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=2, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + assert worker.worker_cumulative_eval_count == 2 + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + assert len(trials) == 2 + for _, trial in trials.items(): + assert trial.state == Trial.State.SUCCESS + assert trial.report is not None + assert trial.report.loss == 1.0 + + # New worker should run 2 more evaluations + new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + new_worker.run() + + assert worker.worker_cumulative_eval_count == 2 + assert neps_state.get_next_pending_trial() is None + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + assert len(trials) == 4 # Now we should have 4 of them + for _, trial in trials.items(): + assert trial.state == Trial.State.SUCCESS + assert trial.report is not None + assert trial.report.loss == 1.0 + + +def test_include_in_progress_evaluations_towards_maximum_with_work_eval_count( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=2, # <- 
Highlight, only 2 maximum evaluations allowed + include_in_progress_evaluations_towards_maximum=True, # <- include the inprogress trial + max_cost_total=None, + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + # We put in one trial as being inprogress + pending_trial = neps_state.sample_trial(optimizer, worker_id="dummy") + pending_trial.set_evaluating(time_started=0.0, worker_id="dummy") + neps_state.put_updated_trial(pending_trial) + + def eval_function(*args, **kwargs) -> float: + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + assert worker.worker_cumulative_eval_count == 1 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + assert len(trials) == 2 + + the_pending_trial = trials[pending_trial.id] + assert the_pending_trial == pending_trial + assert the_pending_trial.state == Trial.State.EVALUATING + assert the_pending_trial.report is None + + the_completed_trial_id = next(iter(trials.keys() - {pending_trial.id})) + the_completed_trial = trials[the_completed_trial_id] + + assert the_completed_trial.state == Trial.State.SUCCESS + assert the_completed_trial.report is not None + assert the_completed_trial.report.loss == 1.0 + + +def test_max_cost_total( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=10, # Safety incase it doesn't work that we eventually stop + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=2, # <- Highlight, only 2 maximum evaluations allowed + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> dict: + return {"loss": 1.0, "cost": 1.0} + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + assert worker.worker_cumulative_eval_count == 2 + assert worker.worker_cumulative_eval_cost == 2.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + assert len(trials) == 2 + + # New worker should now not run anything as the total cost has been reached. 
+ new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + new_worker.run() + assert new_worker.worker_cumulative_eval_count == 0 + + +def test_worker_cost_total( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=10, # Safety net so we eventually stop in case it doesn't work + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=2, # <- Highlight, only a cost of 2 allowed per worker + ) + + def eval_function(*args, **kwargs) -> dict: + return {"loss": 1.0, "cost": 1.0} + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + worker.run() + + assert worker.worker_cumulative_eval_count == 2 + assert worker.worker_cumulative_eval_cost == 2.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + assert len(trials) == 2 + + # New worker should also run 2 more trials + new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + ) + new_worker.run() + assert new_worker.worker_cumulative_eval_count == 2 + assert new_worker.worker_cumulative_eval_cost == 2.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + + trials = neps_state.get_all_trials() + assert len(trials) == 4 # 2 more trials were run + + +def test_worker_wallclock_time( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=1000, # Safety net so we eventually stop in case it doesn't work + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=1, # <- highlight, 1 second + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + worker_id="dummy", + ) + worker.run() + + assert worker.worker_cumulative_eval_count > 0 + assert worker.worker_cumulative_evaluation_time_seconds <= 2.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + len_trials_on_first_worker = len(neps_state.get_all_trials()) + + # New worker should also run some more trials + new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + worker_id="dummy2", + ) +
new_worker.run() + assert new_worker.worker_cumulative_eval_count > 0 + assert new_worker.worker_cumulative_evaluation_time_seconds <= 2.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + len_trials_on_second_worker = len(neps_state.get_all_trials()) + assert len_trials_on_second_worker > len_trials_on_first_worker + + +def test_max_worker_evaluation_time( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=10, # Safety net so we eventually stop in case it doesn't work + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=None, + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=0.5, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + time.sleep(0.6) + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + worker_id="dummy", + ) + worker.run() + + assert worker.worker_cumulative_eval_count > 0 + assert worker.worker_cumulative_evaluation_time_seconds <= 1.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + len_trials_on_first_worker = len(neps_state.get_all_trials()) + + # New worker should also run some more trials + new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + worker_id="dummy2", + ) + new_worker.run() + assert new_worker.worker_cumulative_eval_count > 0 + assert new_worker.worker_cumulative_evaluation_time_seconds <= 1.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + len_trials_on_second_worker = len(neps_state.get_all_trials()) + assert len_trials_on_second_worker > len_trials_on_first_worker + + +def test_max_evaluation_time_global( + neps_state: NePSState, +) -> None: + optimizer = RandomSearch(pipeline_space=SearchSpace(a=FloatParameter(0, 1))) + settings = WorkerSettings( + on_error=OnErrorPossibilities.IGNORE, + default_report_values=DefaultReportValues(), + max_evaluations_total=10, # Safety net so we eventually stop in case it doesn't work + include_in_progress_evaluations_towards_maximum=False, + max_cost_total=None, + max_evaluations_for_worker=None, + max_evaluation_time_total_seconds=0.5, # <- Highlight + max_wallclock_time_for_worker_seconds=None, + max_evaluation_time_for_worker_seconds=None, + max_cost_for_worker=None, + ) + + def eval_function(*args, **kwargs) -> float: + time.sleep(0.6) + return 1.0 + + worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + worker_id="dummy", + ) + worker.run() + + assert worker.worker_cumulative_eval_count > 0 + assert worker.worker_cumulative_evaluation_time_seconds <= 1.0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + len_trials_on_first_worker =
len(neps_state.get_all_trials()) + + # New worker should not run anything as the global evaluation time budget has been reached + new_worker = DefaultWorker.new( + state=neps_state, + optimizer=optimizer, + evaluation_fn=eval_function, + settings=settings, + _pre_sample_hooks=None, + worker_id="dummy2", + ) + new_worker.run() + assert new_worker.worker_cumulative_eval_count == 0 + assert new_worker.worker_cumulative_evaluation_time_seconds == 0 + assert ( + neps_state.get_next_pending_trial() is None + ) # should have no pending trials to be picked up + assert len(neps_state.get_errors()) == 0 + len_trials_on_second_worker = len(neps_state.get_all_trials()) + assert len_trials_on_second_worker == len_trials_on_first_worker diff --git a/tests/test_state/__init__.py b/tests/test_state/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_state/test_filebased_neps_state.py b/tests/test_state/test_filebased_neps_state.py new file mode 100644 index 00000000..a3385141 --- /dev/null +++ b/tests/test_state/test_filebased_neps_state.py @@ -0,0 +1,123 @@ +"""NOTE: These tests are pretty specific to the filebased state implementation. +This could be generalized if we end up with a server based implementation but +for now we're just testing the filebased implementation.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any +from neps.exceptions import NePSError, TrialNotFoundError +from neps.state.err_dump import ErrDump +from neps.state.filebased import ( + create_or_load_filebased_neps_state, + load_filebased_neps_state, +) + +import pytest +from pytest_cases import fixture, parametrize +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo + + +@fixture +@parametrize("budget", [BudgetInfo(max_cost_budget=10, used_cost_budget=0), None]) +@parametrize("shared_state", [{"a": "b"}, {}]) +def optimizer_state( + budget: BudgetInfo | None, + shared_state: dict[str, Any], +) -> OptimizationState: + return OptimizationState(budget=budget, shared_state=shared_state) + + +@fixture +@parametrize("optimizer_info", [OptimizerInfo({"a": "b"}), OptimizerInfo({})]) +def optimizer_info(optimizer_info: OptimizerInfo) -> OptimizerInfo: + return optimizer_info + + +def test_create_with_new_filebased_neps_state( + tmp_path: Path, + optimizer_info: OptimizerInfo, + optimizer_state: OptimizationState, +) -> None: + new_path = tmp_path / "neps_state" + neps_state = create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=optimizer_info, + optimizer_state=optimizer_state, + ) + assert neps_state.optimizer_info() == optimizer_info + assert neps_state.optimizer_state() == optimizer_state + assert neps_state.all_trial_ids() == set() + assert neps_state.get_all_trials() == {} + assert neps_state.get_errors() == ErrDump(errs=[]) + assert neps_state.get_next_pending_trial() is None + assert neps_state.get_next_pending_trial(n=10) == [] + + with pytest.raises(TrialNotFoundError): + assert neps_state.get_trial_by_id("1") + + with pytest.raises(TrialNotFoundError): + assert neps_state.get_trials_by_ids(["1", "2"]) + + +def test_create_or_load_with_load_filebased_neps_state( + tmp_path: Path, + optimizer_info: OptimizerInfo, + optimizer_state: OptimizationState, +) -> None: + new_path = tmp_path / "neps_state" + neps_state = create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=optimizer_info, + optimizer_state=optimizer_state, + ) + + # NOTE: This isn't a defined way to do this but we should check + # that we prioritize what's in the
existing data over what + # was passed in. + different_state = OptimizationState( + budget=BudgetInfo(max_cost_budget=20, used_cost_budget=10), + shared_state={"c": "d"}, + ) + neps_state2 = create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=optimizer_info, + optimizer_state=different_state, + ) + assert neps_state == neps_state2 + + +def test_load_on_existing_neps_state( + tmp_path: Path, + optimizer_info: OptimizerInfo, + optimizer_state: OptimizationState, +) -> None: + new_path = tmp_path / "neps_state" + neps_state = create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=optimizer_info, + optimizer_state=optimizer_state, + ) + + neps_state2 = load_filebased_neps_state(directory=new_path) + assert neps_state == neps_state2 + + +def test_new_or_load_on_existing_neps_state_with_different_optimizer_info( + tmp_path: Path, + optimizer_info: OptimizerInfo, + optimizer_state: OptimizationState, +) -> None: + new_path = tmp_path / "neps_state" + create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=optimizer_info, + optimizer_state=optimizer_state, + ) + + with pytest.raises(NePSError): + create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=OptimizerInfo({"e": "f"}), + optimizer_state=optimizer_state, + ) diff --git a/tests/test_state/test_neps_state.py b/tests/test_state/test_neps_state.py new file mode 100644 index 00000000..0d0891ce --- /dev/null +++ b/tests/test_state/test_neps_state.py @@ -0,0 +1,205 @@ +"""NOTE: These tests are pretty specific to the filebased state implementation. +This could be generalized if we end up with a server based implementation but +for now we're just testing the filebased implementation.""" + +from __future__ import annotations + +import time +from pathlib import Path +from typing import Any + +import pytest +from neps.optimizers.base_optimizer import BaseOptimizer +from neps.search_spaces.hyperparameters import ( + FloatParameter, + IntegerParameter, + ConstantParameter, + CategoricalParameter, +) +from neps.search_spaces.search_space import SearchSpace +from neps.state.filebased import ( + create_or_load_filebased_neps_state, +) + +from pytest_cases import fixture, parametrize, parametrize_with_cases, case +from neps.state.neps_state import NePSState +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo +from neps.optimizers import SearcherMapping +from neps.utils.common import MissingDependencyError + + +@case +def case_search_space_no_fid() -> SearchSpace: + return SearchSpace( + a=FloatParameter(0, 1), + b=CategoricalParameter(["a", "b", "c"]), + c=ConstantParameter("a"), + d=IntegerParameter(0, 10), + ) + + +@case +def case_search_space_with_fid() -> SearchSpace: + return SearchSpace( + a=FloatParameter(0, 1), + b=CategoricalParameter(["a", "b", "c"]), + c=ConstantParameter("a"), + d=IntegerParameter(0, 10), + e=IntegerParameter(1, 10, is_fidelity=True), + ) + + +@case +def case_search_space_no_fid_with_prior() -> SearchSpace: + return SearchSpace( + a=FloatParameter(0, 1, default=0.5), + b=CategoricalParameter(["a", "b", "c"], default="a"), + c=ConstantParameter("a"), + d=IntegerParameter(0, 10, default=5), + ) + + +@case +def case_search_space_fid_with_prior() -> SearchSpace: + return SearchSpace( + a=FloatParameter(0, 1, default=0.5), + b=CategoricalParameter(["a", "b", "c"], default="a"), + c=ConstantParameter("a"), + d=IntegerParameter(0, 10, default=5), + e=IntegerParameter(1, 10, is_fidelity=True), + ) + + +# See issue #118 
+NON_INSTANTIABLE_SEARCH_SPACES_WITHOUT_SPECIFIC_KWARGS = "assisted_regularized_evolution" + +# See issue #121 +JUST_SKIP = [ + "multifidelity_tpe", +] + +# +OPTIMIZER_FAILS_WITH_FIDELITY = [ + "random_search", +] + +# There's no programmatic way to check if a class requires a fidelity. +# See issue #118, #119, #120 +OPTIMIZER_REQUIRES_FIDELITY = [ + "successive_halving", + "successive_halving_prior", + "asha", + "asha_prior", + "hyperband", + "hyperband_custom_default", + "priorband", + "mobster", + "mf_ei_bo", +] +OPTIMIZER_REQUIRES_BUDGET = [ + "successive_halving_prior", + "hyperband_custom_default", + "asha", + "priorband", + "hyperband", + "asha_prior", + "mobster", +] +REQUIRES_PRIOR = { + "priorband", +} +REQUIRES_COST = ["cost_cooling_bayesian_optimization", "cost_cooling"] + + +@fixture +@parametrize( + "key", + [ + k + for k in SearcherMapping.keys() + if k not in NON_INSTANTIABLE_SEARCH_SPACES_WITHOUT_SPECIFIC_KWARGS + ], +) +@parametrize_with_cases("search_space", cases=".", prefix="case_search_space") +def optimizer_and_key(key: str, search_space: SearchSpace) -> tuple[BaseOptimizer, str]: + if key in JUST_SKIP: + pytest.xfail(f"{key} is not instantiable") + + if key in REQUIRES_PRIOR and search_space.hyperparameters["a"].default is None: + pytest.xfail(f"{key} requires a prior") + + if search_space.has_fidelity and key in OPTIMIZER_FAILS_WITH_FIDELITY: + pytest.xfail(f"{key} crashed with a fidelity") + + if key in OPTIMIZER_REQUIRES_FIDELITY and not search_space.has_fidelity: + pytest.xfail(f"{key} requires a fidelity parameter") + kwargs: dict[str, Any] = { + "pipeline_space": search_space, + } + if key in OPTIMIZER_REQUIRES_BUDGET: + kwargs["budget"] = 10 + + optimizer_cls = SearcherMapping[key] + + try: + return optimizer_cls(**kwargs), key + except MissingDependencyError as e: + pytest.xfail(f"{key} requires {e.dep} to run.") + + +@parametrize("optimizer_info", [OptimizerInfo({"a": "b"}), OptimizerInfo({})]) +@parametrize("budget", [BudgetInfo(max_cost_budget=10, used_cost_budget=0), None]) +@parametrize("shared_state", [{"a": "b"}, {}]) +def case_neps_state_filebased( + tmp_path: Path, + budget: BudgetInfo | None, + optimizer_info: OptimizerInfo, + shared_state: dict[str, Any], +) -> NePSState: + new_path = tmp_path / "neps_state" + return create_or_load_filebased_neps_state( + directory=new_path, + optimizer_info=optimizer_info, + optimizer_state=OptimizationState(budget=budget, shared_state=shared_state), + ) + + +@parametrize_with_cases("neps_state", cases=".", prefix="case_neps_state") +def test_sample_trial( + neps_state: NePSState, + optimizer_and_key: tuple[BaseOptimizer, str], +) -> None: + optimizer, key = optimizer_and_key + if key in REQUIRES_COST and neps_state.optimizer_state().budget is None: + pytest.xfail(f"{key} requires a cost budget") + + assert neps_state.get_all_trials() == {} + assert neps_state.get_next_pending_trial() is None + assert neps_state.get_next_pending_trial(n=10) == [] + assert neps_state.all_trial_ids() == set() + + trial1 = neps_state.sample_trial(optimizer=optimizer, worker_id="1") + for k, v in trial1.config.items(): + assert k in optimizer.pipeline_space.hyperparameters + assert v is not None, f"'{k}' is None in {trial1.config}" + + # HACK: Unfortunately, due to Windows, whose time.time() is not very + # precise, we need to introduce a sleep -_- + time.sleep(0.1) + + assert neps_state.get_all_trials() == {trial1.id: trial1} + assert neps_state.get_next_pending_trial() == trial1 + assert neps_state.get_next_pending_trial(n=10) ==
[trial1] + assert neps_state.all_trial_ids() == {trial1.id} + + trial2 = neps_state.sample_trial(optimizer=optimizer, worker_id="1") + for k, v in trial2.config.items(): + assert k in optimizer.pipeline_space.hyperparameters + assert v is not None, f"'{k}' is None in {trial2.config}" + + assert trial1 != trial2 + + assert neps_state.get_all_trials() == {trial1.id: trial1, trial2.id: trial2} + assert neps_state.get_next_pending_trial() == trial1 + assert neps_state.get_next_pending_trial(n=10) == [trial1, trial2] + assert neps_state.all_trial_ids() == {trial1.id, trial2.id} diff --git a/tests/test_rng.py b/tests/test_state/test_rng.py similarity index 52% rename from tests/test_rng.py rename to tests/test_state/test_rng.py index 0af38b49..1f1318d3 100644 --- a/tests/test_rng.py +++ b/tests/test_state/test_rng.py @@ -7,36 +7,50 @@ import torch import pytest -from neps.utils._rng import SeedState +from neps.state.seed_snapshot import SeedSnapshot +from neps.state.filebased import ReaderWriterSeedSnapshot + @pytest.mark.parametrize( - "make_ints", ( + "make_ints", + ( lambda: [random.randint(0, 100) for _ in range(10)], lambda: list(np.random.randint(0, 100, (10,))), lambda: list(torch.randint(0, 100, (10,))), - ) + ), ) -def test_randomstate_consistent(tmp_path: Path, make_ints: Callable[[], list[int]]) -> None: +def test_randomstate_consistent( + tmp_path: Path, make_ints: Callable[[], list[int]] +) -> None: random.seed(42) np.random.seed(42) torch.manual_seed(42) seed_dir = tmp_path / "seed_dir" + seed_dir.mkdir(exist_ok=True, parents=True) - seed_state = SeedState.get() + seed_state = SeedSnapshot.new_capture() integers_1 = make_ints() - seed_state.set_as_global_state() - integers_2 = make_ints() + seed_state.set_as_global_seed_state() + integers_2 = make_ints() assert integers_1 == integers_2 - SeedState.get().dump(seed_dir) - integers_3 = make_ints() + ReaderWriterSeedSnapshot.write(SeedSnapshot.new_capture(), seed_dir) + integers_3 = make_ints() assert integers_3 != integers_2, "Ensure we have actually changed random state" - SeedState.load(seed_dir).set_as_global_state() + ReaderWriterSeedSnapshot.read(seed_dir).set_as_global_seed_state() integers_4 = make_ints() assert integers_3 == integers_4 + + before = SeedSnapshot.new_capture() + after = SeedSnapshot.new_capture() + + _ = make_ints() + + after.recapture() + assert before != after diff --git a/tests/test_state/test_synced.py b/tests/test_state/test_synced.py new file mode 100644 index 00000000..3a28b724 --- /dev/null +++ b/tests/test_state/test_synced.py @@ -0,0 +1,432 @@ +from __future__ import annotations + +from pytest_cases import parametrize, parametrize_with_cases, case +import copy +import numpy as np +import random +from neps.state.err_dump import ErrDump, SerializableTrialError +from neps.state.filebased import ( + ReaderWriterErrDump, + ReaderWriterOptimizationState, + ReaderWriterOptimizerInfo, + ReaderWriterSeedSnapshot, + ReaderWriterTrial, + FileVersioner, + FileLocker, +) +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo +from neps.state.protocols import Synced +from neps.state.trial import Trial +import pytest +from typing import Any, Callable +from pathlib import Path +from neps.state import SeedSnapshot, Synced, Trial + + +@case +def case_trial_1(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + location="", + config={"a": "b"}, + time_sampled=0, + previous_trial=None, + previous_trial_location=None, +
worker_id=0, + ) + + def _mutate(trial: Trial) -> None: + trial.set_submitted(time_submitted=1) + + x = Synced.new( + data=trial, + location=tmp_path / "1", + locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_trial_2(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + location="", + config={"a": "b"}, + time_sampled=0, + previous_trial=None, + previous_trial_location=None, + worker_id=0, + ) + trial.set_submitted(time_submitted=1) + + def _mutate(trial: Trial) -> None: + trial.set_evaluating(time_started=2, worker_id="1") + + x = Synced.new( + data=trial, + location=tmp_path / "1", + locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_trial_3(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="", + time_sampled=0, + previous_trial=None, + previous_trial_location=None, + worker_id=0, + ) + trial.set_submitted(time_submitted=1) + trial.set_evaluating(time_started=2, worker_id="1") + + def _mutate(trial: Trial) -> None: + trial.set_complete( + time_end=3, + loss=1, + cost=1, + extra={"hi": [1, 2, 3]}, + learning_curve=[1], + report_as="success", + evaluation_duration=1, + err=None, + tb=None, + ) + + x = Synced.new( + data=trial, + location=tmp_path / "1", + locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_trial_4(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="", + time_sampled=0, + previous_trial=None, + previous_trial_location=None, + worker_id=0, + ) + trial.set_submitted(time_submitted=1) + trial.set_evaluating(time_started=2, worker_id="1") + + def _mutate(trial: Trial) -> None: + trial.set_complete( + time_end=3, + loss=np.nan, + cost=np.inf, + extra={"hi": [1, 2, 3]}, + report_as="failed", + learning_curve=None, + evaluation_duration=2, + err=None, + tb=None, + ) + + x = Synced.new( + data=trial, + location=tmp_path / "1", + locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_trial_5(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="", + time_sampled=0, + previous_trial=None, + previous_trial_location=None, + worker_id=0, + ) + trial.set_submitted(time_submitted=1) + trial.set_evaluating(time_started=2, worker_id=1) + + def _mutate(trial: Trial) -> None: + trial.set_complete( + time_end=3, + loss=np.nan, + cost=np.inf, + extra={"hi": [1, 2, 3]}, + learning_curve=None, + evaluation_duration=2, + report_as="failed", + err=ValueError("hi"), + tb="something something traceback", + ) + + x = Synced.new( + data=trial, + location=tmp_path / "1", + 
locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_trial_6(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="", + time_sampled=0, + previous_trial=None, + previous_trial_location=None, + worker_id=0, + ) + trial.set_submitted(time_submitted=1) + trial.set_evaluating(time_started=2, worker_id=1) + + def _mutate(trial: Trial) -> None: + trial.set_corrupted() + + x = Synced.new( + data=trial, + location=tmp_path / "1", + locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_trial_7(tmp_path: Path) -> tuple[Synced[Trial, Path], Callable[[Trial], None]]: + trial_id = "1" + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="", + time_sampled=0, + previous_trial=None, + previous_trial_location=None, + worker_id=0, + ) + trial.set_submitted(time_submitted=1) + trial.set_evaluating(time_started=2, worker_id=1) + trial.set_complete( + time_end=3, + loss=np.nan, + cost=np.inf, + extra={"hi": [1, 2, 3]}, + learning_curve=[1, 2, 3], + report_as="failed", + evaluation_duration=2, + err=ValueError("hi"), + tb="something something traceback", + ) + + def _mutate(trial: Trial) -> None: + trial.reset() + + x = Synced.new( + data=trial, + location=tmp_path / "1", + locker=FileLocker(lock_path=tmp_path / "1" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "1" / ".version"), + reader_writer=ReaderWriterTrial(), + ) + return x, _mutate + + +@case +def case_seed_snapshot( + tmp_path: Path, +) -> tuple[Synced[SeedSnapshot, Path], Callable[[SeedSnapshot], None]]: + seed = SeedSnapshot.new_capture() + + def _mutate(seed: SeedSnapshot) -> None: + random.randint(0, 100) + seed.recapture() + + x = Synced.new( + data=seed, + location=tmp_path / "seeds", + locker=FileLocker(lock_path=tmp_path / "seeds" / ".lock", poll=0.1, timeout=None), + versioner=FileVersioner(version_file=tmp_path / "seeds" / ".version"), + reader_writer=ReaderWriterSeedSnapshot(), + ) + return x, _mutate + + +@case +@parametrize( + "err", + [ + None, + SerializableTrialError( + trial_id="1", + worker_id="2", + err_type="ValueError", + err="hi", + tb="traceback\nmore", + ), + ], +) +def case_err_dump( + tmp_path: Path, + err: None | SerializableTrialError, +) -> tuple[Synced[ErrDump, Path], Callable[[ErrDump], None]]: + err_dump = ErrDump() if err is None else ErrDump(errs=[err]) + + def _mutate(err_dump: ErrDump) -> None: + new_err = SerializableTrialError( + trial_id="2", + worker_id="2", + err_type="RuntimeError", + err="hi", + tb="traceback\nless", + ) + err_dump.append(new_err) + + x = Synced.new( + data=err_dump, + location=tmp_path / "err_dump", + locker=FileLocker( + lock_path=tmp_path / "err_dump" / ".lock", poll=0.1, timeout=None + ), + versioner=FileVersioner(version_file=tmp_path / "err_dump" / ".version"), + reader_writer=ReaderWriterErrDump("all"), + ) + return x, _mutate + + +@case +def case_optimizer_info( + tmp_path: Path, +) -> tuple[Synced[OptimizerInfo, Path], Callable[[OptimizerInfo], None]]: + optimizer_info = OptimizerInfo(info={"a": "b"}) + + def _mutate(optimizer_info: OptimizerInfo) -> None: + 
optimizer_info.info["b"] = "c" # type: ignore # NOTE: We shouldn't be mutating but anywho... + + x = Synced.new( + data=optimizer_info, + location=tmp_path / "optimizer_info", + locker=FileLocker( + lock_path=tmp_path / "optimizer_info" / ".lock", poll=0.1, timeout=None + ), + versioner=FileVersioner(version_file=tmp_path / "optimizer_info" / ".version"), + reader_writer=ReaderWriterOptimizerInfo(), + ) + return x, _mutate + + +@case +@pytest.mark.parametrize( + "budget", (None, BudgetInfo(max_cost_budget=10, used_cost_budget=0)) +) +@pytest.mark.parametrize("shared_state", ({}, {"a": "b"})) +def case_optimization_state( + tmp_path: Path, + budget: BudgetInfo | None, + shared_state: dict[str, Any], +) -> tuple[Synced[OptimizationState, Path], Callable[[OptimizationState], None]]: + optimization_state = OptimizationState(budget=budget, shared_state=shared_state) + + def _mutate(optimization_state: OptimizationState) -> None: + optimization_state.shared_state["a"] = "c" # type: ignore # NOTE: We shouldn't be mutating but anywho... + optimization_state.budget = BudgetInfo(max_cost_budget=10, used_cost_budget=5) + + x = Synced.new( + data=optimization_state, + location=tmp_path / "optimizer_info", + locker=FileLocker( + lock_path=tmp_path / "optimizer_info" / ".lock", poll=0.1, timeout=None + ), + versioner=FileVersioner(version_file=tmp_path / "optimizer_info" / ".version"), + reader_writer=ReaderWriterOptimizationState(), + ) + return x, _mutate + + +@parametrize_with_cases("shared, mutate", cases=".") +def test_initial_state(shared: Synced, mutate: Callable) -> None: + assert shared._is_locked() == False + assert shared._is_stale() == False + assert shared._unsynced() == shared.synced() + + +@parametrize_with_cases("shared, mutate", cases=".") +def test_put_updates_current_data_and_is_not_stale( + shared: Synced, mutate: Callable +) -> None: + current_data = shared._unsynced() + + new_data = copy.deepcopy(current_data) + mutate(new_data) + assert new_data != current_data + + shared.put(new_data) + assert shared._unsynced() == new_data + assert shared._is_stale() == False + assert shared._is_locked() == False + + +@parametrize_with_cases("shared1, mutate", cases=".") +def test_share_synced_mutate_and_put(shared1: Synced, mutate: Callable) -> None: + shared2 = shared1.deepcopy() + assert shared1 == shared2 + assert not shared1._is_locked() + assert not shared2._is_locked() + + with shared2.acquire() as (data2, put2): + assert shared1._is_locked() + assert shared2._is_locked() + mutate(data2) + put2(data2) + + assert not shared1._is_locked() + assert not shared2._is_locked() + + assert shared1 != shared2 + assert shared1._unsynced() != shared2._unsynced() + assert shared1._is_stale() + + shared1.synced() + assert not shared1._is_stale() + assert not shared2._is_stale() + assert shared1._unsynced() == shared2._unsynced() + + +@parametrize_with_cases("shared, mutate", cases=".") +def test_shared_new_fails_if_done_on_existing_resource( + shared: Synced, mutate: Callable +) -> None: + data, location, versioner, rw, lock = shared._components() + with pytest.raises(Synced.VersionedResourceAlreadyExistsError): + Synced.new( + data=data, + location=location, + versioner=versioner, + reader_writer=rw, + locker=lock, + ) diff --git a/tests/test_state/test_trial.py b/tests/test_state/test_trial.py new file mode 100644 index 00000000..0ddc9e34 --- /dev/null +++ b/tests/test_state/test_trial.py @@ -0,0 +1,301 @@ +from __future__ import annotations +from neps.state import Trial +import os +import numpy as 
np + + +def test_trial_creation() -> None: + trial_id = "1" + time_sampled = 0 + previous_trial = "0" + worker_id = str(os.getpid()) + + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="1", + previous_trial_location=None, + time_sampled=time_sampled, + previous_trial=previous_trial, + worker_id=worker_id, + ) + assert trial.state == Trial.State.PENDING + assert trial.id == trial_id + assert trial.config == {"a": "b"} + assert trial.metadata == Trial.MetaData( + id="1", + time_sampled=time_sampled, + location="1", + previous_trial_location=None, + previous_trial_id=previous_trial, + sampling_worker_id=worker_id, + time_started=None, + time_submitted=None, + time_end=None, + ) + + +def test_trial_as_submitted() -> None: + trial_id = "1" + time_sampled = 0 + time_submitted = 1 + previous_trial = "0" + worker_id = str(os.getpid()) + + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + previous_trial_location="0", + location="1", + time_sampled=time_sampled, + previous_trial=previous_trial, + worker_id=worker_id, + ) + trial.set_submitted(time_submitted=time_submitted) + + assert trial.state == Trial.State.SUBMITTED + assert trial.id == trial_id + assert trial.config == {"a": "b"} + assert trial.metadata == Trial.MetaData( + id=trial_id, + time_sampled=time_sampled, + previous_trial_location="0", + location="1", + previous_trial_id=previous_trial, + sampling_worker_id=worker_id, + time_submitted=time_submitted, + time_started=None, + time_end=None, + ) + + +def test_trial_as_in_progress_with_different_evaluating_worker() -> None: + trial_id = "1" + time_sampled = 0 + time_submitted = 1 + time_started = 2 + previous_trial = "0" + sampling_worker_id = "42" + evaluating_worker_id = "43" + + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="1", + previous_trial_location="0", + time_sampled=time_sampled, + previous_trial=previous_trial, + worker_id=sampling_worker_id, + ) + trial.set_submitted(time_submitted=time_submitted) + trial.set_evaluating(time_started=time_started, worker_id=evaluating_worker_id) + + assert trial.state == Trial.State.EVALUATING + assert trial.id == trial_id + assert trial.config == {"a": "b"} + assert trial.metadata == Trial.MetaData( + id=trial_id, + time_sampled=time_sampled, + previous_trial_id=previous_trial, + previous_trial_location="0", + location="1", + sampling_worker_id=sampling_worker_id, + evaluating_worker_id=evaluating_worker_id, + time_submitted=time_submitted, + time_started=time_started, + time_end=None, + ) + + +def test_trial_as_success_after_being_progress() -> None: + trial_id = "1" + time_sampled = 0 + time_submitted = 1 + time_started = 2 + time_end = 3 + previous_trial = "0" + sampling_worker_id = "42" + evaluating_worker_id = "43" + loss = 427 + cost = -123.6 + extra = {"picnic": "basket", "counts": [1, 2, 3]} + + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + time_sampled=time_sampled, + previous_trial=previous_trial, + location="1", + previous_trial_location="0", + worker_id=sampling_worker_id, + ) + trial.set_submitted(time_submitted=time_submitted) + trial.set_evaluating(time_started=time_started, worker_id=evaluating_worker_id) + report = trial.set_complete( + report_as="success", + loss=loss, + cost=cost, + err=None, + tb=None, + learning_curve=None, + evaluation_duration=time_end - time_started, + extra=extra, + time_end=time_end, + ) + + assert trial.state == Trial.State.SUCCESS + assert trial.id == trial_id + assert trial.config == {"a": "b"} + assert trial.metadata 
== Trial.MetaData( + id=trial_id, + time_sampled=time_sampled, + previous_trial_location="0", + location="1", + previous_trial_id=previous_trial, + sampling_worker_id=sampling_worker_id, + evaluating_worker_id=evaluating_worker_id, + evaluation_duration=time_end - time_started, + time_submitted=time_submitted, + time_started=time_started, + time_end=time_end, + ) + assert report == Trial.Report( + trial_id=trial_id, + loss=loss, + cost=cost, + learning_curve=None, + evaluation_duration=1, + extra=extra, + err=None, + tb=None, + reported_as="success", + ) + + +def test_trial_as_failed_with_nan_loss_and_in_cost() -> None: + trial_id = "1" + time_sampled = 0 + time_submitted = 1 + time_started = 2 + time_end = 3 + previous_trial = "0" + sampling_worker_id = "42" + evaluating_worker_id = "43" + loss = np.nan + cost = np.inf + extra = {"picnic": "basket", "counts": [1, 2, 3]} + + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + location="1", + previous_trial_location="0", + time_sampled=time_sampled, + previous_trial=previous_trial, + worker_id=sampling_worker_id, + ) + trial.set_submitted(time_submitted=time_submitted) + trial.set_evaluating(time_started=time_started, worker_id=evaluating_worker_id) + report = trial.set_complete( + report_as="failed", + loss=loss, + cost=cost, + learning_curve=None, + evaluation_duration=time_end - time_started, + err=None, + tb=None, + extra=extra, + time_end=time_end, + ) + assert trial.state == Trial.State.FAILED + assert trial.id == trial_id + assert trial.config == {"a": "b"} + assert trial.metadata == Trial.MetaData( + id=trial_id, + time_sampled=time_sampled, + previous_trial_id=previous_trial, + sampling_worker_id=sampling_worker_id, + evaluating_worker_id=evaluating_worker_id, + time_submitted=time_submitted, + previous_trial_location="0", + location="1", + time_started=time_started, + time_end=time_end, + evaluation_duration=time_end - time_started, + ) + assert report == Trial.Report( + trial_id=trial_id, + loss=loss, + cost=cost, + learning_curve=None, + evaluation_duration=time_end - time_started, + extra=extra, + err=None, + tb=None, + reported_as="failed", + ) + + +def test_trial_as_crashed_with_err_and_tb() -> None: + trial_id = "1" + time_sampled = 0 + time_submitted = 1 + time_started = 2 + time_end = 3 + err = ValueError("Something went wrong") + tb = "some traceback" + previous_trial = "0" + sampling_worker_id = "42" + evaluating_worker_id = "43" + extra = {"picnic": "basket", "counts": [1, 2, 3]} + + trial = Trial.new( + trial_id=trial_id, + config={"a": "b"}, + time_sampled=time_sampled, + location="1", + previous_trial_location="42", + previous_trial=previous_trial, + worker_id=sampling_worker_id, + ) + trial.set_submitted(time_submitted=time_submitted) + trial.set_evaluating(time_started=time_started, worker_id=evaluating_worker_id) + report = trial.set_complete( + report_as="crashed", + loss=None, + cost=None, + learning_curve=None, + evaluation_duration=time_end - time_started, + err=err, + tb=tb, + extra=extra, + time_end=time_end, + ) + + assert trial.state == Trial.State.CRASHED + assert trial.id == trial_id + assert trial.config == {"a": "b"} + assert trial.metadata == Trial.MetaData( + id=trial_id, + time_sampled=time_sampled, + previous_trial_id=previous_trial, + sampling_worker_id=sampling_worker_id, + evaluating_worker_id=evaluating_worker_id, + time_submitted=time_submitted, + previous_trial_location="42", + location="1", + time_started=time_started, + time_end=time_end, + evaluation_duration=time_end - 
time_started, + ) + assert report == Trial.Report( + trial_id=trial_id, + loss=None, + cost=None, + learning_curve=None, + evaluation_duration=time_end - time_started, + extra=extra, + err=err, + tb=tb, + reported_as="crashed", + ) diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/test_declarative_usage_docs.py b/tests/test_yaml_run_args/test_declarative_usage_docs/test_declarative_usage_docs.py index 5d6d8368..ea2ca6ec 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/test_declarative_usage_docs.py +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/test_declarative_usage_docs.py @@ -2,20 +2,24 @@ import os import subprocess import sys + BASE_PATH = "tests/test_yaml_run_args/test_declarative_usage_docs/" @pytest.mark.neps_api -@pytest.mark.parametrize("yaml_file", [ - "simple_example_including_run_pipeline.yaml", - "full_configuration_template.yaml", - "defining_hooks.yaml", - "customizing_neps_optimizer.yaml", - "loading_own_optimizer.yaml", - "loading_pipeline_space_dict.yaml", - "outsourcing_optimizer.yaml", - "outsourcing_pipeline_space.yaml" -]) +@pytest.mark.parametrize( + "yaml_file", + [ + "simple_example_including_run_pipeline.yaml", + "full_configuration_template.yaml", + "defining_hooks.yaml", + "customizing_neps_optimizer.yaml", + "loading_own_optimizer.yaml", + "loading_pipeline_space_dict.yaml", + "outsourcing_optimizer.yaml", + "outsourcing_pipeline_space.yaml", + ], +) def test_run_with_yaml(yaml_file: str) -> None: """ Test 'neps.run' with various run_args.yaml settings to simulate loading options @@ -25,11 +29,11 @@ def test_run_with_yaml(yaml_file: str) -> None: assert os.path.exists(yaml_path), f"{yaml_file} does not exist." try: - subprocess.check_call( - [sys.executable, BASE_PATH + 'neps_run.py', yaml_path]) + subprocess.check_call([sys.executable, BASE_PATH + "neps_run.py", yaml_path]) except subprocess.CalledProcessError as e: pytest.fail( - f"NePS run failed for configuration: {yaml_file} with error: {str(e)}") + f"NePS run failed for configuration: {yaml_file} with error: {str(e)}" + ) @pytest.mark.neps_api @@ -43,8 +47,9 @@ def test_run_with_yaml_and_run_pipeline() -> None: try: subprocess.check_call( - [sys.executable, BASE_PATH + 'neps_run.py', yaml_path, "--run_pipeline"] + [sys.executable, BASE_PATH + "neps_run.py", yaml_path, "--run_pipeline"] ) except subprocess.CalledProcessError as e: pytest.fail( - f"NePS run failed for configuration: simple_example.yaml with error: {str(e)}") + f"NePS run failed for configuration: simple_example.yaml with error: {str(e)}" + ) diff --git a/tests/test_yaml_run_args/test_run_args_by_neps_run/test_neps_run.py b/tests/test_yaml_run_args/test_run_args_by_neps_run/test_neps_run.py index 30d6e178..4995a14c 100644 --- a/tests/test_yaml_run_args/test_run_args_by_neps_run/test_neps_run.py +++ b/tests/test_yaml_run_args/test_run_args_by_neps_run/test_neps_run.py @@ -3,39 +3,54 @@ import os import sys import yaml + BASE_PATH = "tests/test_yaml_run_args/test_run_args_by_neps_run/" @pytest.mark.neps_api -@pytest.mark.parametrize("config", [ - {"file_name": "config.yaml"}, - {"file_name": "loading_pipeline_space.yaml"}, - {"file_name": "loading_optimizer.yaml"}, - {"file_name": "config_select_bo.yaml", "check_optimizer": True, "optimizer_path": - "select_bo_run_args.yaml", - "result_path": "tests_tmpdir/test_run_args_by_neps_run/optimizer_bo" - "/.optimizer_info.yaml"}, - {"file_name": "config_priorband_with_args.yaml", "check_optimizer": True, - "optimizer_path": "priorband_args_run_args.yaml", 
- "result_path": "tests_tmpdir/test_run_args_by_neps_run/optimizer_priorband" - "/.optimizer_info.yaml"}, - {"file_name": "config_hyperband_mixed_args.yaml", "check_optimizer": True, - "optimizer_path": "hyperband_searcher_kwargs_yaml_args.yaml", - "result_path": "tests_tmpdir/test_run_args_by_neps_run/optimizer_hyperband" - "/.optimizer_info.yaml", "args": True} -]) +@pytest.mark.parametrize( + "config", + [ + {"file_name": "config.yaml"}, + {"file_name": "loading_pipeline_space.yaml"}, + {"file_name": "loading_optimizer.yaml"}, + { + "file_name": "config_select_bo.yaml", + "check_optimizer": True, + "optimizer_path": "select_bo_run_args.yaml", + "result_path": "tests_tmpdir/test_run_args_by_neps_run/optimizer_bo/.optimizer_info/info.yaml", + }, + { + "file_name": "config_priorband_with_args.yaml", + "check_optimizer": True, + "optimizer_path": "priorband_args_run_args.yaml", + "result_path": "tests_tmpdir/test_run_args_by_neps_run/optimizer_priorband/.optimizer_info/info.yaml", + }, + { + "file_name": "config_hyperband_mixed_args.yaml", + "check_optimizer": True, + "optimizer_path": "hyperband_searcher_kwargs_yaml_args.yaml", + "result_path": "tests_tmpdir/test_run_args_by_neps_run/optimizer_hyperband/.optimizer_info/info.yaml", + "args": True, + }, + ], +) def test_run_with_yaml(config: dict) -> None: """Test "neps.run" with various run_args.yaml settings to simulate loading options for variables.""" file_name = config["file_name"] check_optimizer = config.pop("check_optimizer", False) - assert os.path.exists(os.path.join(BASE_PATH, file_name)), (f"{file_name} " - f"does not exist.") + assert os.path.exists(os.path.join(BASE_PATH, file_name)), ( + f"{file_name} " f"does not exist." + ) - cmd = [sys.executable, os.path.join(BASE_PATH, 'neps_run.py'), - os.path.join(BASE_PATH, file_name)] + cmd = [ + sys.executable, + os.path.join(BASE_PATH, "neps_run.py"), + os.path.join(BASE_PATH, file_name), + ] if "args" in config: - cmd.append('--kwargs_flag') + cmd.append("--kwargs_flag") try: subprocess.check_call(cmd) @@ -50,17 +65,18 @@ def test_run_with_yaml(config: dict) -> None: def compare_generated_yaml(result_path, optimizer_path): """compare generated optimizer settings and solution settings""" - assert os.path.exists(result_path), \ - "Generated YAML file does not exist." + assert os.path.exists(result_path), "Generated YAML file does not exist." - assert os.path.exists(BASE_PATH + "optimizer_yamls/" + optimizer_path), \ - "Solution YAML file does not exist." + assert os.path.exists( + BASE_PATH + "optimizer_yamls/" + optimizer_path + ), "Solution YAML file does not exist." - with open(result_path, 'r') as gen_file: + with open(result_path, "r") as gen_file: generated_content = yaml.safe_load(gen_file) - with open(BASE_PATH + "optimizer_yamls/" + optimizer_path, 'r') as ref_file: + with open(BASE_PATH + "optimizer_yamls/" + optimizer_path, "r") as ref_file: reference_content = yaml.safe_load(ref_file) - assert generated_content == reference_content, \ - "The generated YAML does not match the reference YAML" + assert ( + generated_content == reference_content + ), "The generated YAML does not match the reference YAML"