From b1167a111fe2ff4ccd8472fa6a93ca97d0e7196a Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 18 Nov 2024 12:21:46 +0100 Subject: [PATCH 1/2] Drop python 3.8, add python 3.12 support --- .github/workflows/ci.yml | 5 +- docs/conda_env.yml | 2 +- docs/conf.py | 3 +- docs/misc/changelog.rst | 24 +++++++ pyproject.toml | 4 +- sb3_contrib/ars/ars.py | 14 ++-- sb3_contrib/ars/policies.py | 8 +-- .../common/envs/invalid_actions_env.py | 20 +++--- sb3_contrib/common/maskable/buffers.py | 3 +- sb3_contrib/common/maskable/distributions.py | 10 +-- sb3_contrib/common/maskable/evaluation.py | 6 +- sb3_contrib/common/maskable/policies.py | 52 +++++++------- sb3_contrib/common/recurrent/buffers.py | 9 +-- sb3_contrib/common/recurrent/policies.py | 68 +++++++++---------- sb3_contrib/common/recurrent/type_aliases.py | 6 +- sb3_contrib/common/utils.py | 3 +- sb3_contrib/common/vec_env/async_eval.py | 12 ++-- sb3_contrib/common/wrappers/time_feature.py | 10 +-- sb3_contrib/crossq/crossq.py | 18 ++--- sb3_contrib/crossq/policies.py | 34 +++++----- sb3_contrib/ppo_mask/ppo_mask.py | 18 ++--- sb3_contrib/ppo_recurrent/ppo_recurrent.py | 10 +-- sb3_contrib/qrdqn/policies.py | 46 ++++++------- sb3_contrib/qrdqn/qrdqn.py | 24 +++---- sb3_contrib/tqc/policies.py | 54 +++++++-------- sb3_contrib/tqc/tqc.py | 18 ++--- sb3_contrib/trpo/trpo.py | 20 +++--- sb3_contrib/version.txt | 2 +- setup.py | 4 +- tests/test_dict_env.py | 4 +- tests/test_invalid_actions.py | 3 +- tests/test_lstm.py | 4 +- tests/wrappers/test_action_masker.py | 6 +- 33 files changed, 273 insertions(+), 251 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c3a2ec21..5973997a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] include: # Default version - gymnasium-version: "1.0.0" @@ -51,6 +51,7 @@ jobs: - name: Install specific version of gym run: | uv pip install --system gymnasium==${{ matrix.gymnasium-version }} + uv pip install --system "numpy<2" # Only run for python 3.10, downgrade gym to 0.29.1 - name: Lint with ruff @@ -65,8 +66,6 @@ jobs: - name: Type check run: | make type - # Do not run for python 3.8 (mypy internal error) - if: matrix.python-version != '3.8' - name: Test with pytest run: | make pytest diff --git a/docs/conda_env.yml b/docs/conda_env.yml index a080a9db..a2bfa265 100644 --- a/docs/conda_env.yml +++ b/docs/conda_env.yml @@ -13,7 +13,7 @@ dependencies: - cloudpickle - opencv-python-headless - pandas - - numpy>=1.20,<2.0 + - numpy>=1.20,<3.0 - matplotlib - sphinx>=5,<8 - sphinx_rtd_theme>=1.3.0 diff --git a/docs/conf.py b/docs/conf.py index a4b238e5..68d5e105 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,7 +14,6 @@ import datetime import os import sys -from typing import Dict # We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support # PyEnchant. @@ -151,7 +150,7 @@ def setup(app): # -- Options for LaTeX output ------------------------------------------------ -latex_elements: Dict[str, str] = { +latex_elements: dict[str, str] = { # The paper size ('letterpaper' or 'a4paper'). 
# # 'papersize': 'letterpaper', diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index fcfbc181..05811d0c 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -3,6 +3,30 @@ Changelog ========== +Release 2.5.0a0 (WIP) +-------------------------- + +Breaking Changes: +^^^^^^^^^^^^^^^^^ +- Upgraded to PyTorch 2.3.0 +- Dropped Python 3.8 support + +New Features: +^^^^^^^^^^^^^ +- Added Python 3.12 support + +Bug Fixes: +^^^^^^^^^^ + +Deprecations: +^^^^^^^^^^^^^ + +Others: +^^^^^^^ + +Documentation: +^^^^^^^^^^^^^^ + Release 2.4.0 (2024-11-18) -------------------------- diff --git a/pyproject.toml b/pyproject.toml index bbac8c41..2f19c2c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [tool.ruff] # Same as Black. line-length = 127 -# Assume Python 3.8 -target-version = "py38" +# Assume Python 3.9 +target-version = "py39" [tool.ruff.lint] select = ["E", "F", "B", "UP", "C90", "RUF"] diff --git a/sb3_contrib/ars/ars.py b/sb3_contrib/ars/ars.py index 26d2d8a3..975ca361 100644 --- a/sb3_contrib/ars/ars.py +++ b/sb3_contrib/ars/ars.py @@ -3,7 +3,7 @@ import time import warnings from functools import partial -from typing import Any, ClassVar, Dict, Optional, Type, TypeVar, Union +from typing import Any, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -50,14 +50,14 @@ class ARS(BaseAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpPolicy": MlpPolicy, "LinearPolicy": LinearPolicy, } def __init__( self, - policy: Union[str, Type[ARSPolicy]], + policy: Union[str, type[ARSPolicy]], env: Union[GymEnv, str], n_delta: int = 8, n_top: Optional[int] = None, @@ -66,7 +66,7 @@ def __init__( zero_policy: bool = True, alive_bonus_offset: float = 0, n_eval_episodes: int = 1, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, seed: Optional[int] = None, @@ -144,8 +144,8 @@ def _mimic_monitor_wrapper(self, episode_rewards: np.ndarray, episode_lengths: n def _trigger_callback( self, - _locals: Dict[str, Any], - _globals: Dict[str, Any], + _locals: dict[str, Any], + _globals: dict[str, Any], callback: BaseCallback, n_envs: int, ) -> None: @@ -353,7 +353,7 @@ def learn( def set_parameters( self, - load_path_or_dict: Union[str, Dict[str, Dict]], + load_path_or_dict: Union[str, dict[str, dict]], exact_match: bool = True, device: Union[th.device, str] = "auto", ) -> None: diff --git a/sb3_contrib/ars/policies.py b/sb3_contrib/ars/policies.py index 2c3aeec7..454265fa 100644 --- a/sb3_contrib/ars/policies.py +++ b/sb3_contrib/ars/policies.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import torch as th from gymnasium import spaces @@ -26,8 +26,8 @@ def __init__( self, observation_space: spaces.Space, action_space: spaces.Space, - net_arch: Optional[List[int]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, + net_arch: Optional[list[int]] = None, + activation_fn: type[nn.Module] = nn.ReLU, with_bias: bool = True, squash_output: bool = True, ): @@ -57,7 +57,7 @@ def __init__( self.action_net = nn.Sequential(*actor_net) - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: # data = super()._get_constructor_parameters() this adds 
normalize_images, which we don't support... data = dict( observation_space=self.observation_space, diff --git a/sb3_contrib/common/envs/invalid_actions_env.py b/sb3_contrib/common/envs/invalid_actions_env.py index 8306897d..82f182eb 100644 --- a/sb3_contrib/common/envs/invalid_actions_env.py +++ b/sb3_contrib/common/envs/invalid_actions_env.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Optional import numpy as np from gymnasium import spaces @@ -23,7 +23,7 @@ def __init__( space = spaces.Discrete(dim) self.n_invalid_actions = n_invalid_actions self.possible_actions = np.arange(space.n) - self.invalid_actions: List[int] = [] + self.invalid_actions: list[int] = [] super().__init__(space=space, ep_length=ep_length) def _choose_next_state(self) -> None: @@ -32,7 +32,7 @@ def _choose_next_state(self) -> None: potential_invalid_actions = [i for i in self.possible_actions if i != self.state] self.invalid_actions = np.random.choice(potential_invalid_actions, self.n_invalid_actions, replace=False).tolist() - def action_masks(self) -> List[bool]: + def action_masks(self) -> list[bool]: return [action not in self.invalid_actions for action in self.possible_actions] @@ -45,7 +45,7 @@ class InvalidActionEnvMultiDiscrete(IdentityEnv[np.ndarray]): def __init__( self, - dims: Optional[List[int]] = None, + dims: Optional[list[int]] = None, ep_length: int = 100, n_invalid_actions: int = 0, ): @@ -58,13 +58,13 @@ def __init__( space = spaces.MultiDiscrete(dims) self.n_invalid_actions = n_invalid_actions self.possible_actions = np.arange(sum(dims)) - self.invalid_actions: List[int] = [] + self.invalid_actions: list[int] = [] super().__init__(space=space, ep_length=ep_length) def _choose_next_state(self) -> None: self.state = self.action_space.sample() - converted_state: List[int] = [] + converted_state: list[int] = [] running_total = 0 for i in range(len(self.action_space.nvec)): converted_state.append(running_total + self.state[i]) @@ -74,7 +74,7 @@ def _choose_next_state(self) -> None: potential_invalid_actions = [i for i in self.possible_actions if i not in converted_state] self.invalid_actions = np.random.choice(potential_invalid_actions, self.n_invalid_actions, replace=False).tolist() - def action_masks(self) -> List[bool]: + def action_masks(self) -> list[bool]: return [action not in self.invalid_actions for action in self.possible_actions] @@ -99,13 +99,13 @@ def __init__( self.n_dims = dims self.n_invalid_actions = n_invalid_actions self.possible_actions = np.arange(2 * dims) - self.invalid_actions: List[int] = [] + self.invalid_actions: list[int] = [] super().__init__(space=space, ep_length=ep_length) def _choose_next_state(self) -> None: self.state = self.action_space.sample() - converted_state: List[int] = [] + converted_state: list[int] = [] running_total = 0 for i in range(self.n_dims): converted_state.append(running_total + self.state[i]) @@ -115,5 +115,5 @@ def _choose_next_state(self) -> None: potential_invalid_actions = [i for i in self.possible_actions if i not in converted_state] self.invalid_actions = np.random.choice(potential_invalid_actions, self.n_invalid_actions, replace=False).tolist() - def action_masks(self) -> List[bool]: + def action_masks(self) -> list[bool]: return [action not in self.invalid_actions for action in self.possible_actions] diff --git a/sb3_contrib/common/maskable/buffers.py b/sb3_contrib/common/maskable/buffers.py index f4430ca5..37923131 100644 --- a/sb3_contrib/common/maskable/buffers.py +++ b/sb3_contrib/common/maskable/buffers.py @@ 
-1,4 +1,5 @@ -from typing import Generator, NamedTuple, Optional, Union +from collections.abc import Generator +from typing import NamedTuple, Optional, Union import numpy as np import torch as th diff --git a/sb3_contrib/common/maskable/distributions.py b/sb3_contrib/common/maskable/distributions.py index bacf3682..d2a92ae0 100644 --- a/sb3_contrib/common/maskable/distributions.py +++ b/sb3_contrib/common/maskable/distributions.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import List, Optional, Tuple, TypeVar, Union +from typing import Optional, TypeVar, Union import numpy as np import torch as th @@ -157,7 +157,7 @@ def actions_from_params(self, action_logits: th.Tensor, deterministic: bool = Fa self.proba_distribution(action_logits) return self.get_actions(deterministic=deterministic) - def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: + def log_prob_from_params(self, action_logits: th.Tensor) -> tuple[th.Tensor, th.Tensor]: actions = self.actions_from_params(action_logits) log_prob = self.log_prob(actions) return actions, log_prob @@ -174,9 +174,9 @@ class MaskableMultiCategoricalDistribution(MaskableDistribution): :param action_dims: List of sizes of discrete action spaces """ - def __init__(self, action_dims: List[int]): + def __init__(self, action_dims: list[int]): super().__init__() - self.distributions: List[MaskableCategorical] = [] + self.distributions: list[MaskableCategorical] = [] self.action_dims = action_dims def proba_distribution_net(self, latent_dim: int) -> nn.Module: @@ -232,7 +232,7 @@ def actions_from_params(self, action_logits: th.Tensor, deterministic: bool = Fa self.proba_distribution(action_logits) return self.get_actions(deterministic=deterministic) - def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]: + def log_prob_from_params(self, action_logits: th.Tensor) -> tuple[th.Tensor, th.Tensor]: actions = self.actions_from_params(action_logits) log_prob = self.log_prob(actions) return actions, log_prob diff --git a/sb3_contrib/common/maskable/evaluation.py b/sb3_contrib/common/maskable/evaluation.py index 35e2e7ec..36cebd8f 100644 --- a/sb3_contrib/common/maskable/evaluation.py +++ b/sb3_contrib/common/maskable/evaluation.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import gymnasium as gym import numpy as np @@ -16,12 +16,12 @@ def evaluate_policy( n_eval_episodes: int = 10, deterministic: bool = True, render: bool = False, - callback: Optional[Callable[[Dict[str, Any], Dict[str, Any]], None]] = None, + callback: Optional[Callable[[dict[str, Any], dict[str, Any]], None]] = None, reward_threshold: Optional[float] = None, return_episode_rewards: bool = False, warn: bool = True, use_masking: bool = True, -) -> Union[Tuple[float, float], Tuple[List[float], List[int]]]: +) -> Union[tuple[float, float], tuple[list[float], list[int]]]: """ Runs policy for ``n_eval_episodes`` episodes and returns average reward. 
If a vector env is passed in, this divides the episodes to evaluate onto the diff --git a/sb3_contrib/common/maskable/policies.py b/sb3_contrib/common/maskable/policies.py index 1a0d53aa..a917543c 100644 --- a/sb3_contrib/common/maskable/policies.py +++ b/sb3_contrib/common/maskable/policies.py @@ -1,6 +1,6 @@ import warnings from functools import partial -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Optional, Union import numpy as np import torch as th @@ -47,15 +47,15 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.Tanh, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.Tanh, ortho_init: bool = True, - features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = FlattenExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, share_features_extractor: bool = True, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, ): if optimizer_kwargs is None: optimizer_kwargs = {} @@ -115,7 +115,7 @@ def forward( obs: th.Tensor, deterministic: bool = False, action_masks: Optional[np.ndarray] = None, - ) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: + ) -> tuple[th.Tensor, th.Tensor, th.Tensor]: """ Forward pass in all the networks (actor and critic) @@ -143,7 +143,7 @@ def forward( def extract_features( # type: ignore[override] self, obs: PyTorchObs, features_extractor: Optional[BaseFeaturesExtractor] = None - ) -> Union[th.Tensor, Tuple[th.Tensor, th.Tensor]]: + ) -> Union[th.Tensor, tuple[th.Tensor, th.Tensor]]: """ Preprocess the observation if needed and extract features. @@ -165,7 +165,7 @@ def extract_features( # type: ignore[override] vf_features = super().extract_features(obs, self.vf_features_extractor) return pi_features, vf_features - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( @@ -267,12 +267,12 @@ def _predict( # type: ignore[override] def predict( self, - observation: Union[np.ndarray, Dict[str, np.ndarray]], - state: Optional[Tuple[np.ndarray, ...]] = None, + observation: Union[np.ndarray, dict[str, np.ndarray]], + state: Optional[tuple[np.ndarray, ...]] = None, episode_start: Optional[np.ndarray] = None, deterministic: bool = False, action_masks: Optional[np.ndarray] = None, - ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]: + ) -> tuple[np.ndarray, Optional[tuple[np.ndarray, ...]]]: """ Get the policy action from an observation (and optional hidden state). Includes sugar-coating to handle different observations (e.g. normalizing images). @@ -326,7 +326,7 @@ def evaluate_actions( obs: th.Tensor, actions: th.Tensor, action_masks: Optional[th.Tensor] = None, - ) -> Tuple[th.Tensor, th.Tensor, Optional[th.Tensor]]: + ) -> tuple[th.Tensor, th.Tensor, Optional[th.Tensor]]: """ Evaluate actions according to the current policy, given the observations. 
@@ -406,15 +406,15 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.Tanh, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.Tanh, ortho_init: bool = True, - features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = NatureCNN, + features_extractor_kwargs: Optional[dict[str, Any]] = None, share_features_extractor: bool = True, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, @@ -460,15 +460,15 @@ def __init__( observation_space: spaces.Dict, action_space: spaces.Space, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.Tanh, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.Tanh, ortho_init: bool = True, - features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = CombinedExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, share_features_extractor: bool = True, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, diff --git a/sb3_contrib/common/recurrent/buffers.py b/sb3_contrib/common/recurrent/buffers.py index a9f1d4ec..5386db11 100644 --- a/sb3_contrib/common/recurrent/buffers.py +++ b/sb3_contrib/common/recurrent/buffers.py @@ -1,5 +1,6 @@ +from collections.abc import Generator from functools import partial -from typing import Callable, Generator, Optional, Tuple, Union +from typing import Callable, Optional, Union import numpy as np import torch as th @@ -64,7 +65,7 @@ def create_sequencers( episode_starts: np.ndarray, env_change: np.ndarray, device: th.device, -) -> Tuple[np.ndarray, Callable, Callable]: +) -> tuple[np.ndarray, Callable, Callable]: """ Create the utility function to chunk data into sequences and pad them to create fixed size tensors. 
@@ -115,7 +116,7 @@ def __init__( buffer_size: int, observation_space: spaces.Space, action_space: spaces.Space, - hidden_state_shape: Tuple[int, int, int, int], + hidden_state_shape: tuple[int, int, int, int], device: Union[th.device, str] = "auto", gae_lambda: float = 1, gamma: float = 0.99, @@ -262,7 +263,7 @@ def __init__( buffer_size: int, observation_space: spaces.Space, action_space: spaces.Space, - hidden_state_shape: Tuple[int, int, int, int], + hidden_state_shape: tuple[int, int, int, int], device: Union[th.device, str] = "auto", gae_lambda: float = 1, gamma: float = 0.99, diff --git a/sb3_contrib/common/recurrent/policies.py b/sb3_contrib/common/recurrent/policies.py index 3fa59407..ef7b987e 100644 --- a/sb3_contrib/common/recurrent/policies.py +++ b/sb3_contrib/common/recurrent/policies.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Optional, Union import numpy as np import torch as th @@ -66,25 +66,25 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.Tanh, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.Tanh, ortho_init: bool = True, use_sde: bool = False, log_std_init: float = 0.0, full_std: bool = True, use_expln: bool = False, squash_output: bool = False, - features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = FlattenExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, share_features_extractor: bool = True, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, lstm_hidden_size: int = 256, n_lstm_layers: int = 1, shared_lstm: bool = False, enable_critic_lstm: bool = True, - lstm_kwargs: Optional[Dict[str, Any]] = None, + lstm_kwargs: Optional[dict[str, Any]] = None, ): self.lstm_output_dim = lstm_hidden_size super().__init__( @@ -162,10 +162,10 @@ def _build_mlp_extractor(self) -> None: @staticmethod def _process_sequence( features: th.Tensor, - lstm_states: Tuple[th.Tensor, th.Tensor], + lstm_states: tuple[th.Tensor, th.Tensor], episode_starts: th.Tensor, lstm: nn.LSTM, - ) -> Tuple[th.Tensor, th.Tensor]: + ) -> tuple[th.Tensor, th.Tensor]: """ Do a forward pass in the LSTM network. @@ -216,7 +216,7 @@ def forward( lstm_states: RNNStates, episode_starts: th.Tensor, deterministic: bool = False, - ) -> Tuple[th.Tensor, th.Tensor, th.Tensor, RNNStates]: + ) -> tuple[th.Tensor, th.Tensor, th.Tensor, RNNStates]: """ Forward pass in all the networks (actor and critic) @@ -259,9 +259,9 @@ def forward( def get_distribution( self, obs: th.Tensor, - lstm_states: Tuple[th.Tensor, th.Tensor], + lstm_states: tuple[th.Tensor, th.Tensor], episode_starts: th.Tensor, - ) -> Tuple[Distribution, Tuple[th.Tensor, ...]]: + ) -> tuple[Distribution, tuple[th.Tensor, ...]]: """ Get the current policy distribution given the observations. 
@@ -280,7 +280,7 @@ def get_distribution( def predict_values( self, obs: th.Tensor, - lstm_states: Tuple[th.Tensor, th.Tensor], + lstm_states: tuple[th.Tensor, th.Tensor], episode_starts: th.Tensor, ) -> th.Tensor: """ @@ -309,7 +309,7 @@ def predict_values( def evaluate_actions( self, obs: th.Tensor, actions: th.Tensor, lstm_states: RNNStates, episode_starts: th.Tensor - ) -> Tuple[th.Tensor, th.Tensor, th.Tensor]: + ) -> tuple[th.Tensor, th.Tensor, th.Tensor]: """ Evaluate actions according to the current policy, given the observations. @@ -347,10 +347,10 @@ def evaluate_actions( def _predict( self, observation: th.Tensor, - lstm_states: Tuple[th.Tensor, th.Tensor], + lstm_states: tuple[th.Tensor, th.Tensor], episode_starts: th.Tensor, deterministic: bool = False, - ) -> Tuple[th.Tensor, Tuple[th.Tensor, ...]]: + ) -> tuple[th.Tensor, tuple[th.Tensor, ...]]: """ Get the action according to the policy for a given observation. @@ -366,11 +366,11 @@ def _predict( def predict( self, - observation: Union[np.ndarray, Dict[str, np.ndarray]], - state: Optional[Tuple[np.ndarray, ...]] = None, + observation: Union[np.ndarray, dict[str, np.ndarray]], + state: Optional[tuple[np.ndarray, ...]] = None, episode_start: Optional[np.ndarray] = None, deterministic: bool = False, - ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]: + ) -> tuple[np.ndarray, Optional[tuple[np.ndarray, ...]]]: """ Get the policy action from an observation (and optional hidden state). Includes sugar-coating to handle different observations (e.g. normalizing images). @@ -475,25 +475,25 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.Tanh, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.Tanh, ortho_init: bool = True, use_sde: bool = False, log_std_init: float = 0.0, full_std: bool = True, use_expln: bool = False, squash_output: bool = False, - features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = NatureCNN, + features_extractor_kwargs: Optional[dict[str, Any]] = None, share_features_extractor: bool = True, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, lstm_hidden_size: int = 256, n_lstm_layers: int = 1, shared_lstm: bool = False, enable_critic_lstm: bool = True, - lstm_kwargs: Optional[Dict[str, Any]] = None, + lstm_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, @@ -565,25 +565,25 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.Tanh, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.Tanh, ortho_init: bool = True, use_sde: bool = False, log_std_init: float = 0.0, full_std: bool = True, use_expln: bool = False, squash_output: bool = False, - features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: 
type[BaseFeaturesExtractor] = CombinedExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, share_features_extractor: bool = True, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, lstm_hidden_size: int = 256, n_lstm_layers: int = 1, shared_lstm: bool = False, enable_critic_lstm: bool = True, - lstm_kwargs: Optional[Dict[str, Any]] = None, + lstm_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, diff --git a/sb3_contrib/common/recurrent/type_aliases.py b/sb3_contrib/common/recurrent/type_aliases.py index 21ac0e0d..17b9bfca 100644 --- a/sb3_contrib/common/recurrent/type_aliases.py +++ b/sb3_contrib/common/recurrent/type_aliases.py @@ -1,12 +1,12 @@ -from typing import NamedTuple, Tuple +from typing import NamedTuple import torch as th from stable_baselines3.common.type_aliases import TensorDict class RNNStates(NamedTuple): - pi: Tuple[th.Tensor, ...] - vf: Tuple[th.Tensor, ...] + pi: tuple[th.Tensor, ...] + vf: tuple[th.Tensor, ...] class RecurrentRolloutBufferSamples(NamedTuple): diff --git a/sb3_contrib/common/utils.py b/sb3_contrib/common/utils.py index 73ab0b1e..010bbe14 100644 --- a/sb3_contrib/common/utils.py +++ b/sb3_contrib/common/utils.py @@ -1,4 +1,5 @@ -from typing import Callable, Optional, Sequence +from collections.abc import Sequence +from typing import Callable, Optional import torch as th from torch import nn diff --git a/sb3_contrib/common/vec_env/async_eval.py b/sb3_contrib/common/vec_env/async_eval.py index c18b7712..b4eef035 100644 --- a/sb3_contrib/common/vec_env/async_eval.py +++ b/sb3_contrib/common/vec_env/async_eval.py @@ -1,6 +1,6 @@ import multiprocessing as mp from collections import defaultdict -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Optional, Union import numpy as np import torch as th @@ -103,7 +103,7 @@ class AsyncEval: def __init__( self, - envs_fn: List[Callable[[], VecEnv]], + envs_fn: list[Callable[[], VecEnv]], train_policy: BasePolicy, start_method: Optional[str] = None, n_eval_episodes: int = 1, @@ -151,7 +151,7 @@ def send_jobs(self, candidate_weights: th.Tensor, pop_size: int) -> None: remote.send(("eval", jobs_per_worker[remote_idx])) self.waiting = True - def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: + def seed(self, seed: Optional[int] = None) -> list[Union[None, int]]: """ Seed the environments. @@ -166,7 +166,7 @@ def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]: remote.send(("seed", seed + idx)) return [remote.recv() for remote in self.remotes] - def set_options(self, options: Optional[Union[List[Dict], Dict]] = None) -> List[Union[None, int]]: + def set_options(self, options: Optional[Union[list[dict], dict]] = None) -> list[Union[None, int]]: """ Set environment options for all environments. If a dict is passed instead of a list, the same options will be used for all environments. 
@@ -179,7 +179,7 @@ def set_options(self, options: Optional[Union[List[Dict], Dict]] = None) -> List remote.send(("set_options", options)) return [remote.recv() for remote in self.remotes] - def get_results(self) -> List[Tuple[int, Tuple[np.ndarray, np.ndarray]]]: + def get_results(self) -> list[tuple[int, tuple[np.ndarray, np.ndarray]]]: """ Retreive episode rewards and lengths from each worker for all candidates (there might be multiple candidates per worker) @@ -192,7 +192,7 @@ def get_results(self) -> List[Tuple[int, Tuple[np.ndarray, np.ndarray]]]: self.waiting = False return flat_results - def get_obs_rms(self) -> List[RunningMeanStd]: + def get_obs_rms(self) -> list[RunningMeanStd]: """ Retrieve the observation filters (observation running mean std) of each process, they will be combined in the main process. diff --git a/sb3_contrib/common/wrappers/time_feature.py b/sb3_contrib/common/wrappers/time_feature.py index 7eeb9b11..435230a7 100644 --- a/sb3_contrib/common/wrappers/time_feature.py +++ b/sb3_contrib/common/wrappers/time_feature.py @@ -1,11 +1,11 @@ -from typing import Any, Dict, SupportsFloat, Tuple, Union +from typing import Any, SupportsFloat, Union import gymnasium as gym import numpy as np from gymnasium import spaces from gymnasium.core import ActType -TimeFeatureObs = Union[np.ndarray, Dict[str, np.ndarray]] +TimeFeatureObs = Union[np.ndarray, dict[str, np.ndarray]] class TimeFeatureWrapper(gym.Wrapper[TimeFeatureObs, ActType, TimeFeatureObs, ActType]): @@ -70,17 +70,17 @@ def __init__(self, env: gym.Env, max_steps: int = 1000, test_mode: bool = False) self._current_step = 0 self._test_mode = test_mode - def reset(self, **kwargs) -> Tuple[TimeFeatureObs, Dict[str, Any]]: + def reset(self, **kwargs) -> tuple[TimeFeatureObs, dict[str, Any]]: self._current_step = 0 obs, info = self.env.reset(**kwargs) return self._get_obs(obs), info - def step(self, action: ActType) -> Tuple[TimeFeatureObs, SupportsFloat, bool, bool, Dict[str, Any]]: + def step(self, action: ActType) -> tuple[TimeFeatureObs, SupportsFloat, bool, bool, dict[str, Any]]: self._current_step += 1 obs, reward, done, truncated, info = self.env.step(action) return self._get_obs(obs), reward, done, truncated, info - def _get_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]: + def _get_obs(self, obs: Union[np.ndarray, dict[str, np.ndarray]]) -> Union[np.ndarray, dict[str, np.ndarray]]: """ Concatenate the time feature to the current observation. 
diff --git a/sb3_contrib/crossq/crossq.py b/sb3_contrib/crossq/crossq.py index 73204eb4..6fa860f1 100644 --- a/sb3_contrib/crossq/crossq.py +++ b/sb3_contrib/crossq/crossq.py @@ -1,4 +1,4 @@ -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, TypeVar, Union +from typing import Any, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -65,7 +65,7 @@ class CrossQ(OffPolicyAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpPolicy": MlpPolicy, # TODO: Implement CnnPolicy and MultiInputPolicy } @@ -75,18 +75,18 @@ class CrossQ(OffPolicyAlgorithm): def __init__( self, - policy: Union[str, Type[CrossQPolicy]], + policy: Union[str, type[CrossQPolicy]], env: Union[GymEnv, str], learning_rate: Union[float, Schedule] = 1e-3, buffer_size: int = 1_000_000, # 1e6 learning_starts: int = 100, batch_size: int = 256, gamma: float = 0.99, - train_freq: Union[int, Tuple[int, str]] = 1, + train_freq: Union[int, tuple[int, str]] = 1, gradient_steps: int = 1, action_noise: Optional[ActionNoise] = None, - replay_buffer_class: Optional[Type[ReplayBuffer]] = None, - replay_buffer_kwargs: Optional[Dict[str, Any]] = None, + replay_buffer_class: Optional[type[ReplayBuffer]] = None, + replay_buffer_kwargs: Optional[dict[str, Any]] = None, optimize_memory_usage: bool = False, ent_coef: Union[str, float] = "auto", target_entropy: Union[str, float] = "auto", @@ -96,7 +96,7 @@ def __init__( use_sde_at_warmup: bool = False, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, verbose: int = 0, seed: Optional[int] = None, device: Union[th.device, str] = "auto", @@ -320,10 +320,10 @@ def learn( progress_bar=progress_bar, ) - def _excluded_save_params(self) -> List[str]: + def _excluded_save_params(self) -> list[str]: return [*super()._excluded_save_params(), "actor", "critic"] - def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: + def _get_torch_save_params(self) -> tuple[list[str], list[str]]: state_dicts = ["policy", "actor.optimizer", "critic.optimizer"] if self.ent_coef_optimizer is not None: saved_pytorch_variables = ["log_ent_coef"] diff --git a/sb3_contrib/crossq/policies.py b/sb3_contrib/crossq/policies.py index 8fedeacf..eedaa365 100644 --- a/sb3_contrib/crossq/policies.py +++ b/sb3_contrib/crossq/policies.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Optional, Union import torch as th from gymnasium import spaces @@ -57,10 +57,10 @@ def __init__( self, observation_space: spaces.Space, action_space: spaces.Box, - net_arch: List[int], + net_arch: list[int], features_extractor: nn.Module, features_dim: int, - activation_fn: Type[nn.Module] = nn.ReLU, + activation_fn: type[nn.Module] = nn.ReLU, use_sde: bool = False, log_std_init: float = -3, full_std: bool = True, @@ -134,7 +134,7 @@ def __init__( self.mu = nn.Linear(last_layer_dim, action_dim) self.log_std = nn.Linear(last_layer_dim, action_dim) # type: ignore[assignment] - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( @@ -176,7 +176,7 @@ def reset_noise(self, batch_size: int = 1) -> None: assert 
isinstance(self.action_dist, StateDependentNoiseDistribution), msg self.action_dist.sample_weights(self.log_std, batch_size=batch_size) - def get_action_dist_params(self, obs: PyTorchObs) -> Tuple[th.Tensor, th.Tensor, Dict[str, th.Tensor]]: + def get_action_dist_params(self, obs: PyTorchObs) -> tuple[th.Tensor, th.Tensor, dict[str, th.Tensor]]: """ Get the parameters for the action distribution. @@ -201,7 +201,7 @@ def forward(self, obs: PyTorchObs, deterministic: bool = False) -> th.Tensor: # Note: the action is squashed return self.action_dist.actions_from_params(mean_actions, log_std, deterministic=deterministic, **kwargs) - def action_log_prob(self, obs: PyTorchObs) -> Tuple[th.Tensor, th.Tensor]: + def action_log_prob(self, obs: PyTorchObs) -> tuple[th.Tensor, th.Tensor]: mean_actions, log_std, kwargs = self.get_action_dist_params(obs) # return action and associated log prob return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs) @@ -254,10 +254,10 @@ def __init__( self, observation_space: spaces.Space, action_space: spaces.Box, - net_arch: List[int], + net_arch: list[int], features_extractor: BaseFeaturesExtractor, features_dim: int, - activation_fn: Type[nn.Module] = nn.ReLU, + activation_fn: type[nn.Module] = nn.ReLU, normalize_images: bool = True, n_critics: int = 2, share_features_extractor: bool = True, @@ -287,7 +287,7 @@ def __init__( self.share_features_extractor = share_features_extractor self.n_critics = n_critics - self.q_networks: List[nn.Module] = [] + self.q_networks: list[nn.Module] = [] for idx in range(n_critics): q_net_list = create_mlp( features_dim + action_dim, @@ -300,7 +300,7 @@ def __init__( self.add_module(f"qf{idx}", q_net) self.q_networks.append(q_net) - def forward(self, obs: th.Tensor, actions: th.Tensor) -> Tuple[th.Tensor, ...]: + def forward(self, obs: th.Tensor, actions: th.Tensor) -> tuple[th.Tensor, ...]: # Learn the features extractor using the policy loss only # when the features_extractor is shared with the actor with th.set_grad_enabled(not self.share_features_extractor): @@ -362,8 +362,8 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Box, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.ReLU, batch_norm: bool = True, batch_norm_momentum: float = 0.01, # Note: Jax implementation is 1 - momentum = 0.99 batch_norm_eps: float = 0.001, @@ -372,11 +372,11 @@ def __init__( log_std_init: float = -3, use_expln: bool = False, clip_mean: float = 2.0, - features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = FlattenExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, n_critics: int = 2, share_features_extractor: bool = False, ): @@ -471,7 +471,7 @@ def _build(self, lr_schedule: Schedule) -> None: **self.optimizer_kwargs, ) - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( diff --git 
a/sb3_contrib/ppo_mask/ppo_mask.py b/sb3_contrib/ppo_mask/ppo_mask.py index a0d7da6e..f845ad6f 100644 --- a/sb3_contrib/ppo_mask/ppo_mask.py +++ b/sb3_contrib/ppo_mask/ppo_mask.py @@ -1,4 +1,4 @@ -from typing import Any, ClassVar, Dict, Optional, Tuple, Type, TypeVar, Union +from typing import Any, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -65,7 +65,7 @@ class MaskablePPO(OnPolicyAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpPolicy": MlpPolicy, "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, @@ -75,7 +75,7 @@ class MaskablePPO(OnPolicyAlgorithm): def __init__( self, - policy: Union[str, Type[MaskableActorCriticPolicy]], + policy: Union[str, type[MaskableActorCriticPolicy]], env: Union[GymEnv, str], learning_rate: Union[float, Schedule] = 3e-4, n_steps: int = 2048, @@ -89,12 +89,12 @@ def __init__( ent_coef: float = 0.0, vf_coef: float = 0.5, max_grad_norm: float = 0.5, - rollout_buffer_class: Optional[Type[RolloutBuffer]] = None, - rollout_buffer_kwargs: Optional[Dict[str, Any]] = None, + rollout_buffer_class: Optional[type[RolloutBuffer]] = None, + rollout_buffer_kwargs: Optional[dict[str, Any]] = None, target_kl: Optional[float] = None, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, verbose: int = 0, seed: Optional[int] = None, device: Union[th.device, str] = "auto", @@ -285,12 +285,12 @@ def collect_rollouts( def predict( # type: ignore[override] self, - observation: Union[np.ndarray, Dict[str, np.ndarray]], - state: Optional[Tuple[np.ndarray, ...]] = None, + observation: Union[np.ndarray, dict[str, np.ndarray]], + state: Optional[tuple[np.ndarray, ...]] = None, episode_start: Optional[np.ndarray] = None, deterministic: bool = False, action_masks: Optional[np.ndarray] = None, - ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]: + ) -> tuple[np.ndarray, Optional[tuple[np.ndarray, ...]]]: """ Get the policy action from an observation (and optional hidden state). Includes sugar-coating to handle different observations (e.g. normalizing images). 
diff --git a/sb3_contrib/ppo_recurrent/ppo_recurrent.py b/sb3_contrib/ppo_recurrent/ppo_recurrent.py index 5b976939..060892f8 100644 --- a/sb3_contrib/ppo_recurrent/ppo_recurrent.py +++ b/sb3_contrib/ppo_recurrent/ppo_recurrent.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Any, ClassVar, Dict, List, Optional, Type, TypeVar, Union +from typing import Any, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -65,7 +65,7 @@ class RecurrentPPO(OnPolicyAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpLstmPolicy": MlpLstmPolicy, "CnnLstmPolicy": CnnLstmPolicy, "MultiInputLstmPolicy": MultiInputLstmPolicy, @@ -73,7 +73,7 @@ class RecurrentPPO(OnPolicyAlgorithm): def __init__( self, - policy: Union[str, Type[RecurrentActorCriticPolicy]], + policy: Union[str, type[RecurrentActorCriticPolicy]], env: Union[GymEnv, str], learning_rate: Union[float, Schedule] = 3e-4, n_steps: int = 128, @@ -92,7 +92,7 @@ def __init__( target_kl: Optional[float] = None, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, verbose: int = 0, seed: Optional[int] = None, device: Union[th.device, str] = "auto", @@ -456,5 +456,5 @@ def learn( progress_bar=progress_bar, ) - def _excluded_save_params(self) -> List[str]: + def _excluded_save_params(self) -> list[str]: return super()._excluded_save_params() + ["_last_lstm_states"] # noqa: RUF005 diff --git a/sb3_contrib/qrdqn/policies.py b/sb3_contrib/qrdqn/policies.py index 317396ac..2f8e4e3b 100644 --- a/sb3_contrib/qrdqn/policies.py +++ b/sb3_contrib/qrdqn/policies.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import torch as th from gymnasium import spaces @@ -36,8 +36,8 @@ def __init__( features_extractor: BaseFeaturesExtractor, features_dim: int, n_quantiles: int = 200, - net_arch: Optional[List[int]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, + net_arch: Optional[list[int]] = None, + activation_fn: type[nn.Module] = nn.ReLU, normalize_images: bool = True, ): super().__init__( @@ -74,7 +74,7 @@ def _predict(self, observation: PyTorchObs, deterministic: bool = True) -> th.Te action = q_values.argmax(dim=1).reshape(-1) return action - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( @@ -119,13 +119,13 @@ def __init__( action_space: spaces.Discrete, lr_schedule: Schedule, n_quantiles: int = 200, - net_arch: Optional[List[int]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, - features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + net_arch: Optional[list[int]] = None, + activation_fn: type[nn.Module] = nn.ReLU, + features_extractor_class: type[BaseFeaturesExtractor] = FlattenExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, @@ -187,7 +187,7 @@ def 
forward(self, obs: PyTorchObs, deterministic: bool = True) -> th.Tensor: def _predict(self, obs: PyTorchObs, deterministic: bool = True) -> th.Tensor: return self.quantile_net._predict(obs, deterministic=deterministic) - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( @@ -242,13 +242,13 @@ def __init__( action_space: spaces.Discrete, lr_schedule: Schedule, n_quantiles: int = 200, - net_arch: Optional[List[int]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, - features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + net_arch: Optional[list[int]] = None, + activation_fn: type[nn.Module] = nn.ReLU, + features_extractor_class: type[BaseFeaturesExtractor] = NatureCNN, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, @@ -290,13 +290,13 @@ def __init__( action_space: spaces.Discrete, lr_schedule: Schedule, n_quantiles: int = 200, - net_arch: Optional[List[int]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, - features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + net_arch: Optional[list[int]] = None, + activation_fn: type[nn.Module] = nn.ReLU, + features_extractor_class: type[BaseFeaturesExtractor] = CombinedExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, ): super().__init__( observation_space, diff --git a/sb3_contrib/qrdqn/qrdqn.py b/sb3_contrib/qrdqn/qrdqn.py index 129cef79..e2c8ac34 100644 --- a/sb3_contrib/qrdqn/qrdqn.py +++ b/sb3_contrib/qrdqn/qrdqn.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, TypeVar, Union +from typing import Any, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -61,7 +61,7 @@ class QRDQN(OffPolicyAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpPolicy": MlpPolicy, "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, @@ -74,7 +74,7 @@ class QRDQN(OffPolicyAlgorithm): def __init__( self, - policy: Union[str, Type[QRDQNPolicy]], + policy: Union[str, type[QRDQNPolicy]], env: Union[GymEnv, str], learning_rate: Union[float, Schedule] = 5e-5, buffer_size: int = 1000000, # 1e6 @@ -82,10 +82,10 @@ def __init__( batch_size: int = 32, tau: float = 1.0, gamma: float = 0.99, - train_freq: Union[int, Tuple[int, str]] = 4, + train_freq: Union[int, tuple[int, str]] = 4, gradient_steps: int = 1, - replay_buffer_class: Optional[Type[ReplayBuffer]] = None, - replay_buffer_kwargs: Optional[Dict[str, Any]] = None, + replay_buffer_class: Optional[type[ReplayBuffer]] = None, + replay_buffer_kwargs: Optional[dict[str, Any]] = None, 
optimize_memory_usage: bool = False, target_update_interval: int = 10000, exploration_fraction: float = 0.005, @@ -94,7 +94,7 @@ def __init__( max_grad_norm: Optional[float] = None, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, verbose: int = 0, seed: Optional[int] = None, device: Union[th.device, str] = "auto", @@ -235,11 +235,11 @@ def train(self, gradient_steps: int, batch_size: int = 100) -> None: def predict( self, - observation: Union[np.ndarray, Dict[str, np.ndarray]], - state: Optional[Tuple[np.ndarray, ...]] = None, + observation: Union[np.ndarray, dict[str, np.ndarray]], + state: Optional[tuple[np.ndarray, ...]] = None, episode_start: Optional[np.ndarray] = None, deterministic: bool = False, - ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]: + ) -> tuple[np.ndarray, Optional[tuple[np.ndarray, ...]]]: """ Get the policy action from an observation (and optional hidden state). Includes sugar-coating to handle different observations (e.g. normalizing images). @@ -284,10 +284,10 @@ def learn( progress_bar=progress_bar, ) - def _excluded_save_params(self) -> List[str]: + def _excluded_save_params(self) -> list[str]: return super()._excluded_save_params() + ["quantile_net", "quantile_net_target"] # noqa: RUF005 - def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: + def _get_torch_save_params(self) -> tuple[list[str], list[str]]: state_dicts = ["policy", "policy.optimizer"] return state_dicts, [] diff --git a/sb3_contrib/tqc/policies.py b/sb3_contrib/tqc/policies.py index 5f1d55bc..3ea10638 100644 --- a/sb3_contrib/tqc/policies.py +++ b/sb3_contrib/tqc/policies.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Optional, Union import torch as th from gymnasium import spaces @@ -50,10 +50,10 @@ def __init__( self, observation_space: spaces.Space, action_space: spaces.Box, - net_arch: List[int], + net_arch: list[int], features_extractor: nn.Module, features_dim: int, - activation_fn: Type[nn.Module] = nn.ReLU, + activation_fn: type[nn.Module] = nn.ReLU, use_sde: bool = False, log_std_init: float = -3, full_std: bool = True, @@ -101,7 +101,7 @@ def __init__( self.mu = nn.Linear(last_layer_dim, action_dim) self.log_std = nn.Linear(last_layer_dim, action_dim) # type: ignore[assignment] - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( @@ -143,7 +143,7 @@ def reset_noise(self, batch_size: int = 1) -> None: assert isinstance(self.action_dist, StateDependentNoiseDistribution), msg self.action_dist.sample_weights(self.log_std, batch_size=batch_size) - def get_action_dist_params(self, obs: PyTorchObs) -> Tuple[th.Tensor, th.Tensor, Dict[str, th.Tensor]]: + def get_action_dist_params(self, obs: PyTorchObs) -> tuple[th.Tensor, th.Tensor, dict[str, th.Tensor]]: """ Get the parameters for the action distribution. 
@@ -168,7 +168,7 @@ def forward(self, obs: PyTorchObs, deterministic: bool = False) -> th.Tensor: # Note: the action is squashed return self.action_dist.actions_from_params(mean_actions, log_std, deterministic=deterministic, **kwargs) - def action_log_prob(self, obs: PyTorchObs) -> Tuple[th.Tensor, th.Tensor]: + def action_log_prob(self, obs: PyTorchObs) -> tuple[th.Tensor, th.Tensor]: mean_actions, log_std, kwargs = self.get_action_dist_params(obs) # return action and associated log prob return self.action_dist.log_prob_from_params(mean_actions, log_std, **kwargs) @@ -201,10 +201,10 @@ def __init__( self, observation_space: spaces.Space, action_space: spaces.Box, - net_arch: List[int], + net_arch: list[int], features_extractor: BaseFeaturesExtractor, features_dim: int, - activation_fn: Type[nn.Module] = nn.ReLU, + activation_fn: type[nn.Module] = nn.ReLU, normalize_images: bool = True, n_quantiles: int = 25, n_critics: int = 2, @@ -280,17 +280,17 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Box, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.ReLU, use_sde: bool = False, log_std_init: float = -3, use_expln: bool = False, clip_mean: float = 2.0, - features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = FlattenExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, n_quantiles: int = 25, n_critics: int = 2, share_features_extractor: bool = False, @@ -373,7 +373,7 @@ def _build(self, lr_schedule: Schedule) -> None: **self.optimizer_kwargs, ) - def _get_constructor_parameters(self) -> Dict[str, Any]: + def _get_constructor_parameters(self) -> dict[str, Any]: data = super()._get_constructor_parameters() data.update( @@ -464,17 +464,17 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Box, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, + net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.ReLU, use_sde: bool = False, log_std_init: float = -3, use_expln: bool = False, clip_mean: float = 2.0, - features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = NatureCNN, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, n_quantiles: int = 25, n_critics: int = 2, share_features_extractor: bool = False, @@ -533,17 +533,17 @@ def __init__( observation_space: spaces.Space, action_space: spaces.Box, lr_schedule: Schedule, - net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, - activation_fn: Type[nn.Module] = nn.ReLU, + 
net_arch: Optional[Union[list[int], dict[str, list[int]]]] = None, + activation_fn: type[nn.Module] = nn.ReLU, use_sde: bool = False, log_std_init: float = -3, use_expln: bool = False, clip_mean: float = 2.0, - features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor, - features_extractor_kwargs: Optional[Dict[str, Any]] = None, + features_extractor_class: type[BaseFeaturesExtractor] = CombinedExtractor, + features_extractor_kwargs: Optional[dict[str, Any]] = None, normalize_images: bool = True, - optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, - optimizer_kwargs: Optional[Dict[str, Any]] = None, + optimizer_class: type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[dict[str, Any]] = None, n_quantiles: int = 25, n_critics: int = 2, share_features_extractor: bool = False, diff --git a/sb3_contrib/tqc/tqc.py b/sb3_contrib/tqc/tqc.py index d4d85380..58679dc7 100644 --- a/sb3_contrib/tqc/tqc.py +++ b/sb3_contrib/tqc/tqc.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Type, TypeVar, Union +from typing import Any, Callable, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -68,7 +68,7 @@ class TQC(OffPolicyAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpPolicy": MlpPolicy, "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, @@ -80,7 +80,7 @@ class TQC(OffPolicyAlgorithm): def __init__( self, - policy: Union[str, Type[TQCPolicy]], + policy: Union[str, type[TQCPolicy]], env: Union[GymEnv, str], learning_rate: Union[float, Callable] = 3e-4, buffer_size: int = 1000000, # 1e6 @@ -88,11 +88,11 @@ def __init__( batch_size: int = 256, tau: float = 0.005, gamma: float = 0.99, - train_freq: Union[int, Tuple[int, str]] = 1, + train_freq: Union[int, tuple[int, str]] = 1, gradient_steps: int = 1, action_noise: Optional[ActionNoise] = None, - replay_buffer_class: Optional[Type[ReplayBuffer]] = None, - replay_buffer_kwargs: Optional[Dict[str, Any]] = None, + replay_buffer_class: Optional[type[ReplayBuffer]] = None, + replay_buffer_kwargs: Optional[dict[str, Any]] = None, optimize_memory_usage: bool = False, ent_coef: Union[str, float] = "auto", target_update_interval: int = 1, @@ -103,7 +103,7 @@ def __init__( use_sde_at_warmup: bool = False, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, verbose: int = 0, seed: Optional[int] = None, device: Union[th.device, str] = "auto", @@ -308,11 +308,11 @@ def learn( progress_bar=progress_bar, ) - def _excluded_save_params(self) -> List[str]: + def _excluded_save_params(self) -> list[str]: # Exclude aliases return super()._excluded_save_params() + ["actor", "critic", "critic_target"] # noqa: RUF005 - def _get_torch_save_params(self) -> Tuple[List[str], List[str]]: + def _get_torch_save_params(self) -> tuple[list[str], list[str]]: state_dicts = ["policy", "actor.optimizer", "critic.optimizer"] if self.ent_coef_optimizer is not None: saved_pytorch_variables = ["log_ent_coef"] diff --git a/sb3_contrib/trpo/trpo.py b/sb3_contrib/trpo/trpo.py index a8e65567..c3dc89b1 100644 --- a/sb3_contrib/trpo/trpo.py +++ b/sb3_contrib/trpo/trpo.py @@ -1,7 +1,7 @@ import copy import warnings from functools import partial -from typing import Any, ClassVar, Dict, 
List, Optional, Tuple, Type, TypeVar, Union +from typing import Any, ClassVar, Optional, TypeVar, Union import numpy as np import torch as th @@ -72,7 +72,7 @@ class TRPO(OnPolicyAlgorithm): :param _init_setup_model: Whether or not to build the network at the creation of the instance """ - policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = { + policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = { "MlpPolicy": MlpPolicy, "CnnPolicy": CnnPolicy, "MultiInputPolicy": MultiInputPolicy, @@ -80,7 +80,7 @@ class TRPO(OnPolicyAlgorithm): def __init__( self, - policy: Union[str, Type[ActorCriticPolicy]], + policy: Union[str, type[ActorCriticPolicy]], env: Union[GymEnv, str], learning_rate: Union[float, Schedule] = 1e-3, n_steps: int = 2048, @@ -94,14 +94,14 @@ def __init__( gae_lambda: float = 0.95, use_sde: bool = False, sde_sample_freq: int = -1, - rollout_buffer_class: Optional[Type[RolloutBuffer]] = None, - rollout_buffer_kwargs: Optional[Dict[str, Any]] = None, + rollout_buffer_class: Optional[type[RolloutBuffer]] = None, + rollout_buffer_kwargs: Optional[dict[str, Any]] = None, normalize_advantage: bool = True, target_kl: float = 0.01, sub_sampling_factor: int = 1, stats_window_size: int = 100, tensorboard_log: Optional[str] = None, - policy_kwargs: Optional[Dict[str, Any]] = None, + policy_kwargs: Optional[dict[str, Any]] = None, verbose: int = 0, seed: Optional[int] = None, device: Union[th.device, str] = "auto", @@ -175,7 +175,7 @@ def __init__( def _compute_actor_grad( self, kl_div: th.Tensor, policy_objective: th.Tensor - ) -> Tuple[List[nn.Parameter], th.Tensor, th.Tensor, List[Tuple[int, ...]]]: + ) -> tuple[list[nn.Parameter], th.Tensor, th.Tensor, list[tuple[int, ...]]]: """ Compute actor gradients for kl div and surrogate objectives. @@ -191,10 +191,10 @@ def _compute_actor_grad( # Contains the shape of the gradients of the KL divergence w.r.t each parameter # This way the flattened gradient can be reshaped back into the original shapes and applied to # the parameters - grad_shape: List[Tuple[int, ...]] = [] + grad_shape: list[tuple[int, ...]] = [] # Contains the parameters which have non-zeros KL divergence gradients # The list is used during the line-search to apply the step to each parameters - actor_params: List[nn.Parameter] = [] + actor_params: list[nn.Parameter] = [] for name, param in self.policy.named_parameters(): # Skip parameters related to value function based on name @@ -388,7 +388,7 @@ def train(self) -> None: self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard") def hessian_vector_product( - self, params: List[nn.Parameter], grad_kl: th.Tensor, vector: th.Tensor, retain_graph: bool = True + self, params: list[nn.Parameter], grad_kl: th.Tensor, vector: th.Tensor, retain_graph: bool = True ) -> th.Tensor: """ Computes the matrix-vector product with the Fisher information matrix. diff --git a/sb3_contrib/version.txt b/sb3_contrib/version.txt index 197c4d5c..b8feefb9 100644 --- a/sb3_contrib/version.txt +++ b/sb3_contrib/version.txt @@ -1 +1 @@ -2.4.0 +2.5.0a0 diff --git a/setup.py b/setup.py index a61e74bc..91d9ae7a 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ long_description=long_description, long_description_content_type="text/markdown", version=__version__, - python_requires=">=3.8", + python_requires=">=3.9", # PyPI package information. 
project_urls={ "Code": "https://github.com/Stable-Baselines-Team/stable-baselines3-contrib", @@ -91,9 +91,9 @@ }, classifiers=[ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ], ) diff --git a/tests/test_dict_env.py b/tests/test_dict_env.py index a276709e..429255a1 100644 --- a/tests/test_dict_env.py +++ b/tests/test_dict_env.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional +from typing import Optional import gymnasium as gym import numpy as np @@ -67,7 +67,7 @@ def step(self, action): done = truncated = False return self.observation_space.sample(), reward, done, truncated, {} - def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): if seed is not None: self.observation_space.seed(seed) return self.observation_space.sample(), {} diff --git a/tests/test_invalid_actions.py b/tests/test_invalid_actions.py index 3d119b06..6459a4cf 100644 --- a/tests/test_invalid_actions.py +++ b/tests/test_invalid_actions.py @@ -1,5 +1,4 @@ import random -from typing import Dict, Tuple import gymnasium as gym import numpy as np @@ -32,7 +31,7 @@ def __init__(self, env): super().__init__(env) self.observation_space = spaces.Dict({"obs": self.env.observation_space}) - def reset(self, **kwargs) -> Tuple[Dict[str, np.ndarray], Dict]: + def reset(self, **kwargs) -> tuple[dict[str, np.ndarray], dict]: return {"obs": self.env.reset(seed=kwargs.get("seed", 0))[0]}, {} # type: ignore[dict-item] def step(self, action): diff --git a/tests/test_lstm.py b/tests/test_lstm.py index d1cfa4e4..003dbcbb 100644 --- a/tests/test_lstm.py +++ b/tests/test_lstm.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional +from typing import Optional import gymnasium as gym import numpy as np @@ -51,7 +51,7 @@ def _pos_obs(full_obs): xpos, _xvel, thetapos, _thetavel = full_obs return np.array([xpos, thetapos]) - def reset(self, *, seed: Optional[int] = None, options: Optional[Dict] = None): + def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None): full_obs, info = super().reset(seed=seed, options=options) return CartPoleNoVelEnv._pos_obs(full_obs), info diff --git a/tests/wrappers/test_action_masker.py b/tests/wrappers/test_action_masker.py index afd6d341..ea129c5d 100644 --- a/tests/wrappers/test_action_masker.py +++ b/tests/wrappers/test_action_masker.py @@ -1,5 +1,3 @@ -from typing import List - import pytest from gymnasium import spaces from stable_baselines3.common.envs import IdentityEnv @@ -19,12 +17,12 @@ def __init__(self, dim: int = 1, ep_length: int = 100): self.useless_property = 1 super().__init__(ep_length=ep_length, space=space) - def _action_masks(self) -> List[int]: + def _action_masks(self) -> list[int]: assert isinstance(self.action_space, spaces.Discrete) return [i == self.state for i in range(self.action_space.n)] -def action_mask_fn(env: IdentityEnvDiscrete) -> List[int]: +def action_mask_fn(env: IdentityEnvDiscrete) -> list[int]: assert isinstance(env.action_space, spaces.Discrete) return [i == env.state for i in range(env.action_space.n)] From 4fe5c58b8d3827c302f6f22cc030aa901d28e0dc Mon Sep 17 00:00:00 2001 From: Antonin RAFFIN Date: Mon, 18 Nov 2024 15:41:56 +0100 Subject: [PATCH 2/2] Update changelog --- docs/misc/changelog.rst | 1 + 1 file changed, 1 insertion(+) diff --git 
a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 05811d0c..f7552c21 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -14,6 +14,7 @@ Breaking Changes: New Features: ^^^^^^^^^^^^^ - Added Python 3.12 support +- Added Numpy v2.0 support Bug Fixes: ^^^^^^^^^^
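Note on the typing changes above: the bulk of both patches is a mechanical migration from the typing aliases (Dict, List, Tuple, Type) to the built-in generics standardized by PEP 585, which are only usable in runtime annotations from Python 3.9 onwards, consistent with the python_requires=">=3.9" bump in setup.py. The sketch below shows the before/after pattern under illustrative names (build_net and its parameters are not part of the patch); Optional and Union are still imported from typing because the X | Y union syntax is only accepted in runtime annotations from Python 3.10.

from typing import Any, Optional

from torch import nn

# Python 3.8 style, removed by this patch:
#   def build_net(net_arch: Optional[List[int]] = None,
#                 activation_fn: Type[nn.Module] = nn.ReLU,
#                 layer_kwargs: Optional[Dict[str, Any]] = None) -> List[nn.Module]: ...

# Python 3.9+ style (PEP 585 built-in generics), matching the annotations in the diff:
def build_net(
    net_arch: Optional[list[int]] = None,
    activation_fn: type[nn.Module] = nn.ReLU,
    layer_kwargs: Optional[dict[str, Any]] = None,
) -> list[nn.Module]:
    """Illustrative helper: turn a net_arch spec into a list of layers."""
    net_arch = net_arch if net_arch is not None else [64, 64]
    layer_kwargs = layer_kwargs or {}
    layers: list[nn.Module] = []
    last_dim = net_arch[0]
    for dim in net_arch[1:]:
        layers.append(nn.Linear(last_dim, dim, **layer_kwargs))
        layers.append(activation_fn())
        last_dim = dim
    return layers

For example, build_net([256, 256], nn.Tanh, {"bias": False}) mirrors the net_arch / activation_fn conventions of the TQC and TRPO policies touched in these hunks, while keeping every annotation valid on Python 3.9 through 3.12.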