Drop python 3.8, add python 3.12 support #263

Merged: 2 commits, Nov 18, 2024
5 changes: 2 additions & 3 deletions .github/workflows/ci.yml
@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
include:
# Default version
- gymnasium-version: "1.0.0"
@@ -51,6 +51,7 @@ jobs:
- name: Install specific version of gym
run: |
uv pip install --system gymnasium==${{ matrix.gymnasium-version }}
uv pip install --system "numpy<2"
# Only run for python 3.10, downgrade gym to 0.29.1

- name: Lint with ruff
@@ -65,8 +66,6 @@ jobs:
- name: Type check
run: |
make type
- # Do not run for python 3.8 (mypy internal error)
- if: matrix.python-version != '3.8'
- name: Test with pytest
run: |
make pytest
2 changes: 1 addition & 1 deletion docs/conda_env.yml
@@ -13,7 +13,7 @@ dependencies:
- cloudpickle
- opencv-python-headless
- pandas
- - numpy>=1.20,<2.0
+ - numpy>=1.20,<3.0
- matplotlib
- sphinx>=5,<8
- sphinx_rtd_theme>=1.3.0
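The relaxed pin lets the docs environment resolve NumPy 2.x as well as 1.x. As a hedged compatibility note (illustrative only, not part of the diff), code that must run under both major versions should avoid aliases removed in NumPy 2.0, such as np.float_, and use the stable dtype names:

```python
import numpy as np

# np.float_ (and np.unicode_, np.infty, ...) were removed in NumPy 2.0;
# np.float64 spells the same dtype and works on both 1.x and 2.x.
rewards = np.asarray([1.0, 2.5, 3.0], dtype=np.float64)
print(f"NumPy {np.__version__}: mean reward = {rewards.mean():.2f}")
```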
3 changes: 1 addition & 2 deletions docs/conf.py
@@ -14,7 +14,6 @@
import datetime
import os
import sys
- from typing import Dict

# We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support
# PyEnchant.
@@ -151,7 +150,7 @@ def setup(app):

# -- Options for LaTeX output ------------------------------------------------

- latex_elements: Dict[str, str] = {
+ latex_elements: dict[str, str] = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
25 changes: 25 additions & 0 deletions docs/misc/changelog.rst
@@ -3,6 +3,31 @@
Changelog
==========

+ Release 2.5.0a0 (WIP)
+ --------------------------
+
+ Breaking Changes:
+ ^^^^^^^^^^^^^^^^^
+ - Upgraded to PyTorch 2.3.0
+ - Dropped Python 3.8 support
+
+ New Features:
+ ^^^^^^^^^^^^^
+ - Added Python 3.12 support
+ - Added Numpy v2.0 support
+
+ Bug Fixes:
+ ^^^^^^^^^^
+
+ Deprecations:
+ ^^^^^^^^^^^^^
+
+ Others:
+ ^^^^^^^
+
+ Documentation:
+ ^^^^^^^^^^^^^^
+
Release 2.4.0 (2024-11-18)
--------------------------

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,8 +1,8 @@
[tool.ruff]
# Same as Black.
line-length = 127
- # Assume Python 3.8
- target-version = "py38"
+ # Assume Python 3.9
+ target-version = "py39"

[tool.ruff.lint]
select = ["E", "F", "B", "UP", "C90", "RUF"]
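A note on the ruff bump: with target-version raised to py39 and the UP (pyupgrade) rules already in the select list, ruff should flag the legacy typing aliases (Dict, List, Tuple, Type) that the rest of this PR replaces with built-in generics per PEP 585. A minimal before/after sketch of that annotation style, using hypothetical helper names not taken from the repository:

```python
from typing import Optional

# Python 3.8 style (now dropped): containers imported from typing
# from typing import Dict, List
# def episode_stats(rewards: List[float]) -> Dict[str, float]: ...

# Python 3.9+ style (PEP 585): built-in generics, no typing import needed for containers
def episode_stats(rewards: list[float]) -> dict[str, float]:
    """Return mean and max of a list of episode rewards."""
    if not rewards:
        return {"mean": 0.0, "max": 0.0}
    return {"mean": sum(rewards) / len(rewards), "max": max(rewards)}


# Optional and Union still come from typing on Python 3.9 (the X | None syntax needs 3.10+),
# which is why those imports are kept throughout the diff.
def first_reward(rewards: list[float]) -> Optional[float]:
    return rewards[0] if rewards else None
```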
14 changes: 7 additions & 7 deletions sb3_contrib/ars/ars.py
@@ -3,7 +3,7 @@
import time
import warnings
from functools import partial
- from typing import Any, ClassVar, Dict, Optional, Type, TypeVar, Union
+ from typing import Any, ClassVar, Optional, TypeVar, Union

import numpy as np
import torch as th
@@ -50,14 +50,14 @@ class ARS(BaseAlgorithm):
:param _init_setup_model: Whether or not to build the network at the creation of the instance
"""

- policy_aliases: ClassVar[Dict[str, Type[BasePolicy]]] = {
+ policy_aliases: ClassVar[dict[str, type[BasePolicy]]] = {
"MlpPolicy": MlpPolicy,
"LinearPolicy": LinearPolicy,
}

def __init__(
self,
- policy: Union[str, Type[ARSPolicy]],
+ policy: Union[str, type[ARSPolicy]],
env: Union[GymEnv, str],
n_delta: int = 8,
n_top: Optional[int] = None,
@@ -66,7 +66,7 @@ def __init__(
zero_policy: bool = True,
alive_bonus_offset: float = 0,
n_eval_episodes: int = 1,
- policy_kwargs: Optional[Dict[str, Any]] = None,
+ policy_kwargs: Optional[dict[str, Any]] = None,
stats_window_size: int = 100,
tensorboard_log: Optional[str] = None,
seed: Optional[int] = None,
@@ -144,8 +144,8 @@ def _mimic_monitor_wrapper(self, episode_rewards: np.ndarray, episode_lengths: n

def _trigger_callback(
self,
- _locals: Dict[str, Any],
- _globals: Dict[str, Any],
+ _locals: dict[str, Any],
+ _globals: dict[str, Any],
callback: BaseCallback,
n_envs: int,
) -> None:
@@ -353,7 +353,7 @@ def learn(

def set_parameters(
self,
- load_path_or_dict: Union[str, Dict[str, Dict]],
+ load_path_or_dict: Union[str, dict[str, dict]],
exact_match: bool = True,
device: Union[th.device, str] = "auto",
) -> None:
8 changes: 4 additions & 4 deletions sb3_contrib/ars/policies.py
@@ -1,4 +1,4 @@
- from typing import Any, Dict, List, Optional, Type
+ from typing import Any, Optional

import torch as th
from gymnasium import spaces
@@ -26,8 +26,8 @@ def __init__(
self,
observation_space: spaces.Space,
action_space: spaces.Space,
- net_arch: Optional[List[int]] = None,
- activation_fn: Type[nn.Module] = nn.ReLU,
+ net_arch: Optional[list[int]] = None,
+ activation_fn: type[nn.Module] = nn.ReLU,
with_bias: bool = True,
squash_output: bool = True,
):
@@ -57,7 +57,7 @@ def __init__(

self.action_net = nn.Sequential(*actor_net)

- def _get_constructor_parameters(self) -> Dict[str, Any]:
+ def _get_constructor_parameters(self) -> dict[str, Any]:
# data = super()._get_constructor_parameters() this adds normalize_images, which we don't support...
data = dict(
observation_space=self.observation_space,
20 changes: 10 additions & 10 deletions sb3_contrib/common/envs/invalid_actions_env.py
@@ -1,4 +1,4 @@
- from typing import List, Optional
+ from typing import Optional

import numpy as np
from gymnasium import spaces
@@ -23,7 +23,7 @@ def __init__(
space = spaces.Discrete(dim)
self.n_invalid_actions = n_invalid_actions
self.possible_actions = np.arange(space.n)
- self.invalid_actions: List[int] = []
+ self.invalid_actions: list[int] = []
super().__init__(space=space, ep_length=ep_length)

def _choose_next_state(self) -> None:
@@ -32,7 +32,7 @@ def _choose_next_state(self) -> None:
potential_invalid_actions = [i for i in self.possible_actions if i != self.state]
self.invalid_actions = np.random.choice(potential_invalid_actions, self.n_invalid_actions, replace=False).tolist()

- def action_masks(self) -> List[bool]:
+ def action_masks(self) -> list[bool]:
return [action not in self.invalid_actions for action in self.possible_actions]


@@ -45,7 +45,7 @@ class InvalidActionEnvMultiDiscrete(IdentityEnv[np.ndarray]):

def __init__(
self,
- dims: Optional[List[int]] = None,
+ dims: Optional[list[int]] = None,
ep_length: int = 100,
n_invalid_actions: int = 0,
):
@@ -58,13 +58,13 @@ def __init__(
space = spaces.MultiDiscrete(dims)
self.n_invalid_actions = n_invalid_actions
self.possible_actions = np.arange(sum(dims))
- self.invalid_actions: List[int] = []
+ self.invalid_actions: list[int] = []
super().__init__(space=space, ep_length=ep_length)

def _choose_next_state(self) -> None:
self.state = self.action_space.sample()

- converted_state: List[int] = []
+ converted_state: list[int] = []
running_total = 0
for i in range(len(self.action_space.nvec)):
converted_state.append(running_total + self.state[i])
@@ -74,7 +74,7 @@ def _choose_next_state(self) -> None:
potential_invalid_actions = [i for i in self.possible_actions if i not in converted_state]
self.invalid_actions = np.random.choice(potential_invalid_actions, self.n_invalid_actions, replace=False).tolist()

- def action_masks(self) -> List[bool]:
+ def action_masks(self) -> list[bool]:
return [action not in self.invalid_actions for action in self.possible_actions]


@@ -99,13 +99,13 @@ def __init__(
self.n_dims = dims
self.n_invalid_actions = n_invalid_actions
self.possible_actions = np.arange(2 * dims)
- self.invalid_actions: List[int] = []
+ self.invalid_actions: list[int] = []
super().__init__(space=space, ep_length=ep_length)

def _choose_next_state(self) -> None:
self.state = self.action_space.sample()

- converted_state: List[int] = []
+ converted_state: list[int] = []
running_total = 0
for i in range(self.n_dims):
converted_state.append(running_total + self.state[i])
@@ -115,5 +115,5 @@ def _choose_next_state(self) -> None:
potential_invalid_actions = [i for i in self.possible_actions if i not in converted_state]
self.invalid_actions = np.random.choice(potential_invalid_actions, self.n_invalid_actions, replace=False).tolist()

- def action_masks(self) -> List[bool]:
+ def action_masks(self) -> list[bool]:
return [action not in self.invalid_actions for action in self.possible_actions]
3 changes: 2 additions & 1 deletion sb3_contrib/common/maskable/buffers.py
@@ -1,4 +1,5 @@
- from typing import Generator, NamedTuple, Optional, Union
+ from collections.abc import Generator
+ from typing import NamedTuple, Optional, Union

import numpy as np
import torch as th
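Background for the import change above: since Python 3.9 the generic ABCs such as Generator are subscriptable directly from collections.abc, and the typing counterparts are deprecated aliases. A small illustrative sketch, with a hypothetical helper that is not part of the PR:

```python
from collections.abc import Generator

import numpy as np


def minibatch_indices(n_samples: int, batch_size: int) -> Generator[np.ndarray, None, None]:
    """Yield shuffled index arrays covering n_samples in chunks of batch_size."""
    indices = np.random.permutation(n_samples)
    for start in range(0, n_samples, batch_size):
        yield indices[start : start + batch_size]


if __name__ == "__main__":
    for batch in minibatch_indices(10, 4):
        print(batch)
```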
10 changes: 5 additions & 5 deletions sb3_contrib/common/maskable/distributions.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
- from typing import List, Optional, Tuple, TypeVar, Union
+ from typing import Optional, TypeVar, Union

import numpy as np
import torch as th
@@ -157,7 +157,7 @@ def actions_from_params(self, action_logits: th.Tensor, deterministic: bool = Fa
self.proba_distribution(action_logits)
return self.get_actions(deterministic=deterministic)

- def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+ def log_prob_from_params(self, action_logits: th.Tensor) -> tuple[th.Tensor, th.Tensor]:
actions = self.actions_from_params(action_logits)
log_prob = self.log_prob(actions)
return actions, log_prob
@@ -174,9 +174,9 @@ class MaskableMultiCategoricalDistribution(MaskableDistribution):
:param action_dims: List of sizes of discrete action spaces
"""

- def __init__(self, action_dims: List[int]):
+ def __init__(self, action_dims: list[int]):
super().__init__()
- self.distributions: List[MaskableCategorical] = []
+ self.distributions: list[MaskableCategorical] = []
self.action_dims = action_dims

def proba_distribution_net(self, latent_dim: int) -> nn.Module:
@@ -232,7 +232,7 @@ def actions_from_params(self, action_logits: th.Tensor, deterministic: bool = Fa
self.proba_distribution(action_logits)
return self.get_actions(deterministic=deterministic)

- def log_prob_from_params(self, action_logits: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
+ def log_prob_from_params(self, action_logits: th.Tensor) -> tuple[th.Tensor, th.Tensor]:
actions = self.actions_from_params(action_logits)
log_prob = self.log_prob(actions)
return actions, log_prob
6 changes: 3 additions & 3 deletions sb3_contrib/common/maskable/evaluation.py
@@ -1,5 +1,5 @@
import warnings
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Callable, Optional, Union

import gymnasium as gym
import numpy as np
@@ -16,12 +16,12 @@ def evaluate_policy(
n_eval_episodes: int = 10,
deterministic: bool = True,
render: bool = False,
- callback: Optional[Callable[[Dict[str, Any], Dict[str, Any]], None]] = None,
+ callback: Optional[Callable[[dict[str, Any], dict[str, Any]], None]] = None,
reward_threshold: Optional[float] = None,
return_episode_rewards: bool = False,
warn: bool = True,
use_masking: bool = True,
- ) -> Union[Tuple[float, float], Tuple[List[float], List[int]]]:
+ ) -> Union[tuple[float, float], tuple[list[float], list[int]]]:
"""
Runs policy for ``n_eval_episodes`` episodes and returns average reward.
If a vector env is passed in, this divides the episodes to evaluate onto the