Commit d67de80

Add Python 3.13 support, drop Python 3.9 (#319)
* Drop Python 3.9 support
* Apply autofixes for Python 3.10
* Use zip strict
* Reformat
* Use built-in zip
* Cast to float TRPO max step size
* Fix mypy annotations
1 parent 55dede1 commit d67de80

31 files changed (+277 / -257 lines)

.github/workflows/ci.yml

Lines changed: 5 additions & 4 deletions
@@ -19,17 +19,17 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
       include:
         # Default version
         - gymnasium-version: "1.0.0"
         # Add a new config to test gym<1.0
         - python-version: "3.10"
           gymnasium-version: "0.29.1"
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v6
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
@@ -39,7 +39,8 @@ jobs:
           pip install uv
           # cpu version of pytorch
           # See https://github.com/astral-sh/uv/issues/1497
-          uv pip install --system torch==2.4.1+cpu --index https://download.pytorch.org/whl/cpu
+          # Need Pytorch 2.9+ for Python 3.13
+          uv pip install --system torch==2.9.1+cpu --index https://download.pytorch.org/whl/cpu
 
           # Install master version
           # and dependencies for docs and tests

docs/misc/changelog.rst

Lines changed: 26 additions & 0 deletions
@@ -3,6 +3,32 @@
 Changelog
 ==========
 
+Release 2.8.0a0 (WIP)
+--------------------------
+
+Breaking Changes:
+^^^^^^^^^^^^^^^^^
+- Removed support for Python 3.9, please upgrade to Python >= 3.10
+- Upgraded to Stable-Baselines3 >= 2.8.0
+- Set ``strict=True`` for every call to ``zip(...)``
+
+
+New Features:
+^^^^^^^^^^^^^
+- Added official support for Python 3.13
+
+Bug Fixes:
+^^^^^^^^^^
+
+Deprecations:
+^^^^^^^^^^^^^
+
+Others:
+^^^^^^^
+
+Documentation:
+^^^^^^^^^^^^^^
+
 Release 2.7.1 (2025-12-05)
 --------------------------
 
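Note on the ``strict=True`` entry above: since Python 3.10, ``zip(..., strict=True)`` raises ``ValueError`` when the iterables differ in length instead of silently truncating to the shortest one. A minimal standalone sketch of the difference (illustrative only, not repository code):

    # Standalone sketch of the zip(strict=True) behaviour adopted in this release.
    rewards = [1.0, 0.5, 2.0]
    lengths = [200, 180]  # one entry short

    # Plain zip silently drops the unmatched reward:
    print(list(zip(rewards, lengths)))  # [(1.0, 200), (0.5, 180)]

    # strict=True surfaces the mismatch immediately:
    try:
        list(zip(rewards, lengths, strict=True))
    except ValueError as exc:
        print(exc)  # "zip() argument 2 is shorter than argument 1"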

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
 [tool.ruff]
 # Same as Black.
 line-length = 127
-# Assume Python 3.9
-target-version = "py39"
+# Assume Python 3.10
+target-version = "py310"
 
 [tool.ruff.lint]
 select = ["E", "F", "B", "UP", "C90", "RUF"]

sb3_contrib/ars/ars.py

Lines changed: 16 additions & 16 deletions
@@ -3,7 +3,7 @@
 import time
 import warnings
 from functools import partial
-from typing import Any, ClassVar, Optional, TypeVar, Union
+from typing import Any, ClassVar, TypeVar
 
 import numpy as np
 import torch as th
@@ -57,21 +57,21 @@ class ARS(BaseAlgorithm):
 
     def __init__(
         self,
-        policy: Union[str, type[ARSPolicy]],
-        env: Union[GymEnv, str],
+        policy: str | type[ARSPolicy],
+        env: GymEnv | str,
         n_delta: int = 8,
-        n_top: Optional[int] = None,
-        learning_rate: Union[float, Schedule] = 0.02,
-        delta_std: Union[float, Schedule] = 0.05,
+        n_top: int | None = None,
+        learning_rate: float | Schedule = 0.02,
+        delta_std: float | Schedule = 0.05,
         zero_policy: bool = True,
         alive_bonus_offset: float = 0,
         n_eval_episodes: int = 1,
-        policy_kwargs: Optional[dict[str, Any]] = None,
+        policy_kwargs: dict[str, Any] | None = None,
         stats_window_size: int = 100,
-        tensorboard_log: Optional[str] = None,
-        seed: Optional[int] = None,
+        tensorboard_log: str | None = None,
+        seed: int | None = None,
         verbose: int = 0,
-        device: Union[th.device, str] = "cpu",
+        device: th.device | str = "cpu",
         _init_setup_model: bool = True,
     ):
         super().__init__(
@@ -137,7 +137,7 @@ def _mimic_monitor_wrapper(self, episode_rewards: np.ndarray, episode_lengths: n
         # Mimic Monitor Wrapper
         infos = [
             {"episode": {"r": episode_reward, "l": episode_length}}
-            for episode_reward, episode_length in zip(episode_rewards, episode_lengths)
+            for episode_reward, episode_length in zip(episode_rewards, episode_lengths, strict=True)
         ]
 
         self._update_info_buffer(infos)
@@ -163,7 +163,7 @@ def _trigger_callback(
         callback.on_step()
 
     def evaluate_candidates(
-        self, candidate_weights: th.Tensor, callback: BaseCallback, async_eval: Optional[AsyncEval]
+        self, candidate_weights: th.Tensor, callback: BaseCallback, async_eval: AsyncEval | None
     ) -> th.Tensor:
         """
         Evaluate each candidate.
@@ -257,7 +257,7 @@ def dump_logs(self) -> None:
         self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
         self.logger.dump(step=self.num_timesteps)
 
-    def _do_one_update(self, callback: BaseCallback, async_eval: Optional[AsyncEval]) -> None:
+    def _do_one_update(self, callback: BaseCallback, async_eval: AsyncEval | None) -> None:
         """
         Sample new candidates, evaluate them and then update current policy.
 
@@ -312,7 +312,7 @@ def learn(
         log_interval: int = 1,
         tb_log_name: str = "ARS",
         reset_num_timesteps: bool = True,
-        async_eval: Optional[AsyncEval] = None,
+        async_eval: AsyncEval | None = None,
         progress_bar: bool = False,
     ) -> SelfARS:
         """
@@ -353,9 +353,9 @@ def learn(
 
     def set_parameters(
         self,
-        load_path_or_dict: Union[str, dict[str, dict]],
+        load_path_or_dict: str | dict[str, dict],
         exact_match: bool = True,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
     ) -> None:
         # Patched set_parameters() to handle ARS linear policy saved with sb3-contrib < 1.7.0
         params = None
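The annotation changes in this file are mechanical: with the minimum version now at Python 3.10, PEP 604 union syntax (``X | Y``, ``X | None``) replaces ``typing.Union`` and ``typing.Optional``. An illustrative sketch of the equivalence (not repository code):

    # Illustrative only: both signatures mean the same thing to a type checker.
    from typing import Optional, Union

    def old_style(seed: Optional[int] = None, device: Union[str, int] = "auto") -> None: ...

    # Python 3.10+ spelling used throughout this commit:
    def new_style(seed: int | None = None, device: str | int = "auto") -> None: ...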

sb3_contrib/ars/policies.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any
 
 import torch as th
 from gymnasium import spaces
@@ -26,7 +26,7 @@ def __init__(
         self,
         observation_space: spaces.Space,
         action_space: spaces.Space,
-        net_arch: Optional[list[int]] = None,
+        net_arch: list[int] | None = None,
         activation_fn: type[nn.Module] = nn.ReLU,
         with_bias: bool = True,
         squash_output: bool = True,

sb3_contrib/common/envs/invalid_actions_env.py

Lines changed: 3 additions & 5 deletions
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import numpy as np
 from gymnasium import spaces
 from stable_baselines3.common.envs import IdentityEnv
@@ -12,7 +10,7 @@ class InvalidActionEnvDiscrete(IdentityEnv[int]):
 
     def __init__(
         self,
-        dim: Optional[int] = None,
+        dim: int | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):
@@ -47,7 +45,7 @@ class InvalidActionEnvMultiDiscrete(IdentityEnv[np.ndarray]):
 
     def __init__(
         self,
-        dims: Optional[list[int]] = None,
+        dims: list[int] | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):
@@ -89,7 +87,7 @@ class InvalidActionEnvMultiBinary(IdentityEnv[np.ndarray]):
 
     def __init__(
         self,
-        dims: Optional[int] = None,
+        dims: int | None = None,
         ep_length: int = 100,
         n_invalid_actions: int = 0,
     ):

sb3_contrib/common/maskable/buffers.py

Lines changed: 9 additions & 9 deletions
@@ -1,5 +1,5 @@
 from collections.abc import Generator
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple
 
 import numpy as np
 import torch as th
@@ -50,7 +50,7 @@ def __init__(
         buffer_size: int,
         observation_space: spaces.Space,
         action_space: spaces.Space,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
         gae_lambda: float = 1,
         gamma: float = 0.99,
         n_envs: int = 1,
@@ -76,7 +76,7 @@ def reset(self) -> None:
 
         super().reset()
 
-    def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> None:
+    def add(self, *args, action_masks: np.ndarray | None = None, **kwargs) -> None:
         """
         :param action_masks: Masks applied to constrain the choice of possible actions.
         """
@@ -85,7 +85,7 @@ def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> Non
 
         super().add(*args, **kwargs)
 
-    def get(self, batch_size: Optional[int] = None) -> Generator[MaskableRolloutBufferSamples, None, None]:  # type: ignore[override]
+    def get(self, batch_size: int | None = None) -> Generator[MaskableRolloutBufferSamples, None, None]:  # type: ignore[override]
         assert self.full, ""
         indices = np.random.permutation(self.buffer_size * self.n_envs)
         # Prepare the data
@@ -111,7 +111,7 @@ def get(self, batch_size: Optional[int] = None) -> Generator[MaskableRolloutBuff
             yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
 
-    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> MaskableRolloutBufferSamples:  # type: ignore[override]
+    def _get_samples(self, batch_inds: np.ndarray, env: VecNormalize | None = None) -> MaskableRolloutBufferSamples:  # type: ignore[override]
         data = (
             self.observations[batch_inds],
             self.actions[batch_inds],
@@ -156,7 +156,7 @@ def __init__(
         buffer_size: int,
         observation_space: spaces.Dict,
         action_space: spaces.Space,
-        device: Union[th.device, str] = "auto",
+        device: th.device | str = "auto",
         gae_lambda: float = 1,
         gamma: float = 0.99,
         n_envs: int = 1,
@@ -182,7 +182,7 @@ def reset(self) -> None:
 
         super().reset()
 
-    def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> None:
+    def add(self, *args, action_masks: np.ndarray | None = None, **kwargs) -> None:
         """
         :param action_masks: Masks applied to constrain the choice of possible actions.
         """
@@ -191,7 +191,7 @@ def add(self, *args, action_masks: Optional[np.ndarray] = None, **kwargs) -> Non
 
         super().add(*args, **kwargs)
 
-    def get(self, batch_size: Optional[int] = None) -> Generator[MaskableDictRolloutBufferSamples, None, None]:  # type: ignore[override]
+    def get(self, batch_size: int | None = None) -> Generator[MaskableDictRolloutBufferSamples, None, None]:  # type: ignore[override]
         assert self.full, ""
         indices = np.random.permutation(self.buffer_size * self.n_envs)
         # Prepare the data
@@ -214,7 +214,7 @@ def get(self, batch_size: Optional[int] = None) -> Generator[MaskableDictRollout
             yield self._get_samples(indices[start_idx : start_idx + batch_size])
             start_idx += batch_size
 
-    def _get_samples(self, batch_inds: np.ndarray, env: Optional[VecNormalize] = None) -> MaskableDictRolloutBufferSamples:  # type: ignore[override]
+    def _get_samples(self, batch_inds: np.ndarray, env: VecNormalize | None = None) -> MaskableDictRolloutBufferSamples:  # type: ignore[override]
         return MaskableDictRolloutBufferSamples(
             observations={key: self.to_torch(obs[batch_inds]) for (key, obs) in self.observations.items()},
             actions=self.to_torch(self.actions[batch_inds]),

sb3_contrib/common/maskable/distributions.py

Lines changed: 9 additions & 9 deletions
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Optional, TypeVar, Union
+from typing import TypeVar, Union
 
 import numpy as np
 import torch as th
@@ -13,7 +13,7 @@
 SelfMaskableMultiCategoricalDistribution = TypeVar(
     "SelfMaskableMultiCategoricalDistribution", bound="MaskableMultiCategoricalDistribution"
 )
-MaybeMasks = Union[th.Tensor, np.ndarray, None]
+MaybeMasks = Union[th.Tensor, np.ndarray, None]  # noqa: UP007
 
 
 class MaskableCategorical(Categorical):
@@ -34,12 +34,12 @@ class MaskableCategorical(Categorical):
 
     def __init__(
         self,
-        probs: Optional[th.Tensor] = None,
-        logits: Optional[th.Tensor] = None,
-        validate_args: Optional[bool] = None,
+        probs: th.Tensor | None = None,
+        logits: th.Tensor | None = None,
+        validate_args: bool | None = None,
         masks: MaybeMasks = None,
     ):
-        self.masks: Optional[th.Tensor] = None
+        self.masks: th.Tensor | None = None
         super().__init__(probs, logits, validate_args)
         self._original_logits = self.logits
         self.apply_masking(masks)
@@ -112,7 +112,7 @@ class MaskableCategoricalDistribution(MaskableDistribution):
 
     def __init__(self, action_dim: int):
         super().__init__()
-        self.distribution: Optional[MaskableCategorical] = None
+        self.distribution: MaskableCategorical | None = None
         self.action_dim = action_dim
 
     def proba_distribution_net(self, latent_dim: int) -> nn.Module:
@@ -212,7 +212,7 @@ def log_prob(self, actions: th.Tensor) -> th.Tensor:
 
         # Extract each discrete action and compute log prob for their respective distributions
         return th.stack(
-            [dist.log_prob(action) for dist, action in zip(self.distributions, th.unbind(actions, dim=1))], dim=1
+            [dist.log_prob(action) for dist, action in zip(self.distributions, th.unbind(actions, dim=1), strict=True)], dim=1
         ).sum(dim=1)
 
     def entropy(self) -> th.Tensor:
@@ -248,7 +248,7 @@ def apply_masking(self, masks: MaybeMasks) -> None:
         # Then split columnwise for each discrete action
         split_masks = th.split(masks_tensor, list(self.action_dims), dim=1)  # type: ignore[assignment]
 
-        for distribution, mask in zip(self.distributions, split_masks):
+        for distribution, mask in zip(self.distributions, split_masks, strict=True):
             distribution.apply_masking(mask)
 
 
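The ``# noqa: UP007`` added to the ``MaybeMasks`` alias suppresses ruff's non-PEP 604 annotation rule (``UP007``, enabled through the ``UP`` selector in ``pyproject.toml``), so this one alias keeps the explicit ``typing.Union`` spelling while the rest of the module switches to ``|``. For reference, a hypothetical sketch of what the autofix would otherwise produce here (not what the commit ships):

    # Hypothetical: the PEP 604 spelling that ruff's UP007 autofix would emit.
    # The commit keeps typing.Union for this alias and silences the rule instead.
    import numpy as np
    import torch as th

    MaybeMasks = th.Tensor | np.ndarray | None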

sb3_contrib/common/maskable/evaluation.py

Lines changed: 6 additions & 5 deletions
@@ -1,5 +1,6 @@
 import warnings
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import gymnasium as gym
 import numpy as np
@@ -12,16 +13,16 @@
 
 def evaluate_policy(
     model: MaskablePPO,
-    env: Union[gym.Env, VecEnv],
+    env: gym.Env | VecEnv,
     n_eval_episodes: int = 10,
     deterministic: bool = True,
     render: bool = False,
-    callback: Optional[Callable[[dict[str, Any], dict[str, Any]], None]] = None,
-    reward_threshold: Optional[float] = None,
+    callback: Callable[[dict[str, Any], dict[str, Any]], None] | None = None,
+    reward_threshold: float | None = None,
     return_episode_rewards: bool = False,
     warn: bool = True,
     use_masking: bool = True,
-) -> Union[tuple[float, float], tuple[list[float], list[int]]]:
+) -> tuple[float, float] | tuple[list[float], list[int]]:
     """
     Runs policy for ``n_eval_episodes`` episodes and returns average reward.
     If a vector env is passed in, this divides the episodes to evaluate onto the
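One change in this file goes beyond union syntax: ``Callable`` is now imported from ``collections.abc`` rather than ``typing``, the preferred location on modern Python (the ``typing`` alias is documented as deprecated since 3.9). A small illustrative sketch, not repository code:

    # Illustrative: both spellings describe the same callback signature;
    # collections.abc.Callable is the preferred import on Python 3.10+.
    from collections.abc import Callable
    from typing import Any

    LocalsCallback = Callable[[dict[str, Any], dict[str, Any]], None]

    def on_step(locals_: dict[str, Any], globals_: dict[str, Any]) -> None:
        pass

    cb: LocalsCallback = on_step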
