6 changes: 1 addition & 5 deletions .circleci/config.yml
@@ -65,11 +65,7 @@ commands:
           name: install dependencies
           # MUJOCO_KEY is defined in a CircleCI context
           # Do some sanity checks to make sure key works
-          command: |
-            curl -o /root/.mujoco/mjkey.txt ${MUJOCO_KEY}
-            md5sum /root/.mujoco/mjkey.txt
-            [[ -d /venv ]] || /evaluating-rewards/scripts/build_venv.sh /venv
-            python -c "import mujoco_py"
+          command: "[[ -d /venv ]] || /evaluating-rewards/scripts/build_venv.sh /venv"

       - save_cache:
           paths:
13 changes: 5 additions & 8 deletions Dockerfile
@@ -29,6 +29,7 @@ RUN apt-get update -q \
     ffmpeg \
     software-properties-common \
     net-tools \
+    patchelf \
     parallel \
    python3.7 \
    python3.7-dev \
@@ -42,15 +43,13 @@ RUN apt-get update -q \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

-RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \
-    && chmod +x /usr/local/bin/patchelf
-
 ENV LANG C.UTF-8

 RUN mkdir -p /root/.mujoco \
     && curl -o mjpro150.zip https://www.roboti.us/download/mjpro150_linux.zip \
     && unzip mjpro150.zip -d /root/.mujoco \
-    && rm mjpro150.zip
+    && rm mjpro150.zip \
+    && curl -o /root/.mujoco/mjkey.txt https://www.roboti.us/file/mjkey.txt

 # Set the PATH to the venv before we create the venv, so it's visible in base.
 # This is since we may create the venv outside of Docker, e.g. in CI
@@ -69,9 +68,7 @@ WORKDIR /evaluating-rewards
 COPY ./scripts /evaluating-rewards/scripts
 COPY ./requirements.txt /evaluating-rewards
 COPY ./requirements-dev.txt /evaluating-rewards
-
-# mjkey.txt needs to exist for build, but doesn't need to be a real key
-RUN touch /root/.mujoco/mjkey.txt && /evaluating-rewards/scripts/build_venv.sh /venv
+RUN /evaluating-rewards/scripts/build_venv.sh /venv

 # full stage contains everything.
 # Can be used for deployment and local testing.
@@ -81,7 +78,7 @@ FROM python-req as full
 COPY . /evaluating-rewards
 # Build a wheel then install to avoid copying whole directory (pip issue #2195)
 RUN python setup.py sdist bdist_wheel
-RUN pip install dist/evaluating_rewards-*.whl
+RUN pip install --upgrade dist/evaluating_rewards-*.whl

 # Default entrypoints
 CMD ["pytest", "-n", "auto", "-vv", "tests/", "examples/"]
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,8 +1,7 @@
 seals @ git+https://github.com/HumanCompatibleAI/seals.git@a425714
 imitation @ git+https://github.com/HumanCompatibleAI/imitation.git@tf-master
-stable-baselines @ git+https://github.com/hill-a/stable-baselines.git
 sacred @ git+https://github.com/IDSIA/sacred.git@e62bb6
-gym[mujoco]
+gym[mujoco]~=0.21.0
 tabulate
 # Avoid https://github.com/matplotlib/matplotlib/issues/18407
 matplotlib!=3.3.1,!=3.3.0
@@ -17,5 +16,6 @@ seaborn
 setuptools
 scikit-learn
 scipy
+stable-baselines>=2.10.1
 tensorflow>=1.15,<1.16
 xarray
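Note on the new pin: `gym[mujoco]~=0.21.0` is a compatible-release constraint (and stable-baselines moves from a git reference to a plain `>=2.10.1` PyPI pin). A minimal sketch, not part of the diff, showing what the `~=` specifier admits using the `packaging` library that pip itself builds on:

# Illustration only: the version strings below are examples, not releases the repo depends on.
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=0.21.0")   # equivalent to >=0.21.0, ==0.21.*
print(spec.contains("0.21.3"))    # True  - patch releases are allowed
print(spec.contains("0.22.0"))    # False - minor version bumps are excluded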
4 changes: 2 additions & 2 deletions setup.py
@@ -29,7 +29,7 @@ def get_version() -> str:
         0,
         os.path.join(os.path.dirname(__file__), "src", "evaluating_rewards"),
     )
-    from version import (  # type:ignore  # pylint:disable=no-name-in-module,import-outside-toplevel
+    from version import (  # type:ignore  # pylint:disable=import-outside-toplevel,import-error
         VERSION,
     )

@@ -38,7 +38,7 @@ def get_version() -> str:


 def load_requirements(fname):
-    with open(fname) as f:
+    with open(fname, "r", encoding="utf-8") as f:
         return f.read().strip().split("\n")

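The explicit `encoding="utf-8"` (here and in results.py below) makes the read independent of the platform's default locale and avoids pylint's unspecified-encoding warning. A hypothetical call site, just to show how a helper of this shape feeds setuptools (the metadata values are illustrative, not copied from the repo's setup.py):

from setuptools import setup

def load_requirements(fname):
    # Same shape as the helper above; explicit encoding keeps the read locale-independent.
    with open(fname, "r", encoding="utf-8") as f:
        return f.read().strip().split("\n")

setup(
    name="example-project",  # illustrative, not the repo's metadata
    install_requires=load_requirements("requirements.txt"),
)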
4 changes: 2 additions & 2 deletions src/evaluating_rewards/analysis/results.py
@@ -168,10 +168,10 @@ def _find_sacred_parent(
         seen[parent] = path

     config_path = os.path.join(parent, "sacred", "config.json")
-    with open(config_path, "r") as f:
+    with open(config_path, "r", encoding="utf-8") as f:
         config = json.load(f)
     run_path = os.path.join(parent, "sacred", "run.json")
-    with open(run_path, "r") as f:
+    with open(run_path, "r", encoding="utf-8") as f:
         run = json.load(f)

     return config, run, parent
@@ -171,7 +171,7 @@ def evaluate_reward_model(


 def plot_state_density(
-    dataset_generator: datasets.TransitionsCallable, nsamples: int = 2 ** 12, **kwargs
+    dataset_generator: datasets.TransitionsCallable, nsamples: int = 2**12, **kwargs
 ):
     """Plots the density of a state distribution.

2 changes: 1 addition & 1 deletion src/evaluating_rewards/distances/common_config.py
@@ -184,7 +184,7 @@ def _update_common_configs() -> None:
         COMMON_CONFIGS[chk_key] = dict(**base_cfg, y_reward_cfgs=chk_cfgs)

     nbits = 4
-    total_shards = 2 ** nbits
+    total_shards = 2**nbits
     if target_num > total_shards:
         shards = _bisect_nbits(nbits)
         for i, shard_num in zip(range(total_shards), shards):
2 changes: 1 addition & 1 deletion src/evaluating_rewards/distances/epic_sample.py
@@ -153,7 +153,7 @@ def sample_mean_rews(
     mean_from_obs: np.ndarray,
     act_samples: np.ndarray,
     next_obs_samples: np.ndarray,
-    batch_size: int = 2 ** 28,
+    batch_size: int = 2**28,
 ) -> Mapping[K, np.ndarray]:
     """
     Estimates the mean reward from observations `mean_from_obs` using given samples.
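Only the formatting of the default `batch_size` changes here, but the signature hints at the computation: each observation in `mean_from_obs` is paired with the sampled actions and next observations, and rewards are averaged in chunks of at most `batch_size` transitions. A simplified, hypothetical sketch of that batching for a single vectorized reward function (the repo's version returns a mapping over several reward models and differs in detail):

import numpy as np

def sample_mean_rews_sketch(reward_fn, mean_from_obs, act_samples, next_obs_samples, batch_size=2**28):
    """Schematic only: mean reward per observation, averaged over sampled (action, next-obs) pairs."""
    n_samples = len(act_samples)
    obs_per_batch = max(1, batch_size // n_samples)
    means = []
    for start in range(0, len(mean_from_obs), obs_per_batch):
        obs_chunk = mean_from_obs[start : start + obs_per_batch]
        # Pair every obs in the chunk with every sampled (action, next_obs).
        obs_rep = np.repeat(obs_chunk, n_samples, axis=0)
        acts_rep = np.tile(act_samples, (len(obs_chunk), 1))
        next_obs_rep = np.tile(next_obs_samples, (len(obs_chunk), 1))
        rews = reward_fn(obs_rep, acts_rep, next_obs_rep)
        means.append(rews.reshape(len(obs_chunk), n_samples).mean(axis=1))
    return np.concatenate(means)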
4 changes: 3 additions & 1 deletion src/evaluating_rewards/distances/npec.py
@@ -67,6 +67,7 @@ def fit_affine(self, batch: types.Transitions):

     def fit(
         self,
+        *,
         dataset: datasets.TransitionsCallable,
         affine_size: Optional[int] = 4096,
         **kwargs,
@@ -136,6 +137,7 @@ def fit_affine(self, batch: types.Transitions) -> base.AffineParameters:

     def fit(
         self,
+        *,
         dataset: datasets.TransitionsCallable,
         total_timesteps: int = int(1e6),
         epoch_timesteps: int = 16384,
@@ -168,7 +170,7 @@ def fit(
             logging.info(f"Epoch {epoch}: {affine_stats}")

             epoch_stats = super().fit(
-                dataset, total_timesteps=epoch_timesteps, affine_size=None, **kwargs
+                dataset=dataset, total_timesteps=epoch_timesteps, affine_size=None, **kwargs
             )

             for k, v in epoch_stats.items():
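The bare `*` added to both `fit` signatures makes `dataset` and every later parameter keyword-only, which is why the `super().fit(...)` call above (and the trainer call in train_regress.py below) now spells out `dataset=...`. A minimal, self-contained sketch of the same pattern; the names are illustrative, not taken from the repo:

def fit(*, dataset, total_timesteps=16384):
    """Everything after the bare * must be passed by keyword."""
    return f"fit on {dataset} for {total_timesteps} steps"

fit(dataset="transitions", total_timesteps=1024)  # OK
# fit("transitions", 1024)  # TypeError: positional arguments are rejected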
12 changes: 6 additions & 6 deletions src/evaluating_rewards/envs/point_mass.py
@@ -79,17 +79,17 @@ def initial_state(self):
                 break
         return {"pos": pos, "vel": vel, "goal": goal}

-    def transition(self, old_state, action):
+    def transition(self, state, action):
         action = np.array(action)
         action = action.clip(-1, 1)
         return {
-            "pos": old_state["pos"] + self.dt * old_state["vel"],
-            "vel": old_state["vel"] + self.dt * action,
-            "goal": old_state["goal"],
+            "pos": state["pos"] + self.dt * state["vel"],
+            "vel": state["vel"] + self.dt * action,
+            "goal": state["goal"],
         }

-    def reward(self, old_state, action, new_state):
-        del old_state
+    def reward(self, state, action, new_state):
+        del state
         dist = np.linalg.norm(new_state["pos"] - new_state["goal"])
         ctrl_penalty = np.dot(action, action)
         return -dist - self.ctrl_coef * ctrl_penalty
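The rename from `old_state` to `state` is cosmetic; the dynamics stay a clipped double integrator with a goal-distance reward minus a quadratic control penalty. A standalone sketch of one step of those equations (the `dt` and `ctrl_coef` values here are assumed for illustration, not the repo's defaults):

import numpy as np

dt, ctrl_coef = 0.01, 0.1  # assumed values, for illustration only
state = {"pos": np.array([1.0, 0.0]), "vel": np.zeros(2), "goal": np.zeros(2)}
action = np.clip(np.array([0.5, -0.2]), -1, 1)

# One Euler step of the transition shown in the diff above.
new_state = {
    "pos": state["pos"] + dt * state["vel"],
    "vel": state["vel"] + dt * action,
    "goal": state["goal"],
}
# Reward: negative distance to goal minus a control penalty.
reward = -np.linalg.norm(new_state["pos"] - new_state["goal"]) - ctrl_coef * np.dot(action, action)
print(reward)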
2 changes: 1 addition & 1 deletion src/evaluating_rewards/experiments/synthetic.py
@@ -225,7 +225,7 @@ def compare_synthetic(
     model_potential: bool = True,
     discount: float = 0.99,
     optimizer: Type[tf.train.Optimizer] = tf.train.AdamOptimizer,
-    total_timesteps: int = 2 ** 16,
+    total_timesteps: int = 2**16,
     batch_size: int = 128,
     test_size: int = 4096,
     pretrain: bool = True,
@@ -388,7 +388,7 @@ def _fixed_width_format(x: float, figs: int = 3) -> str:
             num_leading_zeros += 1
         else:
             break
-    if x >= 10 ** figs:
+    if x >= 10**figs:
         # No decimal point gives us an extra character to use
         figs += 1
     fstr = "{:." + str(max(0, figs - num_leading_zeros)) + "g}"
4 changes: 2 additions & 2 deletions src/evaluating_rewards/scripts/rewards/train_regress.py
@@ -32,7 +32,7 @@ def default_config():
     locals().update(**regress_utils.DEFAULT_CONFIG)
     checkpoint_interval = 50  # save every checkpoint_interval epochs
     dataset_factory = datasets.transitions_factory_from_serialized_policy
-    dataset_factory_kwargs = dict()
+    dataset_factory_kwargs = {}

     # Model to train and hyperparameters
     model_reward_type = base.MLPRewardModel
@@ -104,7 +104,7 @@ def make_trainer(model, model_scope, target):
 def do_training(target, trainer, callback: Optional[base.Callback]):
     del target
     return trainer.fit(
-        dataset_generator,
+        dataset=dataset_generator,
         total_timesteps=total_timesteps,
         batch_size=batch_size,
         callback=callback,
3 changes: 2 additions & 1 deletion src/evaluating_rewards/scripts/script_utils.py
@@ -87,7 +87,8 @@ def recursive_dict_merge(
             elif overwrite:
                 dest[key] = update_by[key]
             else:
-                raise Exception("Conflict at {}".format(".".join(path + [str(key)])))
+                msg = "Conflict at " + ".".join(path + [str(key)])
+                raise Exception(msg)
         else:
             dest[key] = update_by[key]
     return dest
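For readers unfamiliar with the helper, the behaviour being touched here is a recursive merge that raises on conflicting leaf values unless `overwrite` is set. A minimal re-implementation sketch of that behaviour (simplified; not the repo's exact code or signature):

def recursive_dict_merge(dest, update_by, overwrite=False, path=None):
    """Merge update_by into dest in place, recursing into nested dicts."""
    path = path or []
    for key in update_by:
        if key in dest:
            if isinstance(dest[key], dict) and isinstance(update_by[key], dict):
                recursive_dict_merge(dest[key], update_by[key], overwrite, path + [str(key)])
            elif overwrite:
                dest[key] = update_by[key]
            else:
                msg = "Conflict at " + ".".join(path + [str(key)])
                raise Exception(msg)
        else:
            dest[key] = update_by[key]
    return dest

print(recursive_dict_merge({"a": {"b": 1}}, {"a": {"c": 2}}))    # {'a': {'b': 1, 'c': 2}}
print(recursive_dict_merge({"a": 1}, {"a": 2}, overwrite=True))  # {'a': 2}
# recursive_dict_merge({"a": 1}, {"a": 2})  # Exception: Conflict at a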
2 changes: 1 addition & 1 deletion tests/test_rewards.py
@@ -34,7 +34,7 @@
 from evaluating_rewards.rewards import base
 from tests import common

-ENVS = ["FrozenLake-v0", "CartPole-v1", "Pendulum-v0"]
+ENVS = ["FrozenLake-v1", "CartPole-v1", "Pendulum-v1"]

 STANDALONE_REWARD_MODELS = {
     "halfcheetah_ground_truth": {
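The ID bump follows the gym pin above: the pinned gym release registers the `-v1` variants of FrozenLake and Pendulum, so the test constants move off the old `-v0` names. A quick local sanity check one could run (not part of the test suite):

import gym  # gym~=0.21.0, per the requirements.txt pin above

for env_id in ["FrozenLake-v1", "CartPole-v1", "Pendulum-v1"]:
    env = gym.make(env_id)  # raises gym.error.Error if the ID is not registered
    env.reset()
    env.close()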
8 changes: 4 additions & 4 deletions tests/test_synthetic.py
@@ -109,7 +109,7 @@ def f(total_timesteps: int):
         "dataset_potential_hids": [],
         "model_potential_hids": [],
         "learning_rate": 1e-2,
-        "total_timesteps": 2 ** 18,
+        "total_timesteps": 2**18,
         "batch_size": 256,
     },
     "rel_upperbound": 0.2,
@@ -120,7 +120,7 @@ def f(total_timesteps: int):
         "dataset_potential_hids": [4],
         "model_potential_hids": [32],
         "learning_rate": 1e-2,
-        "total_timesteps": 2 ** 18,
+        "total_timesteps": 2**18,
         "batch_size": 512,
     },
     "rel_upperbound": 0.2,
@@ -131,7 +131,7 @@ def f(total_timesteps: int):
         "dataset_potential_hids": [4, 4],
         "model_potential_hids": [32, 32],
         "learning_rate": 1e-2,
-        "total_timesteps": 2 ** 18,
+        "total_timesteps": 2**18,
         "batch_size": 512,
     },
     "rel_upperbound": 0.2,
@@ -283,7 +283,7 @@ def test_pretrain_affine(self, helper_affine, kwargs):
         dataset_potential_hids=[4, 4],
         model_potential=True,
         model_potential_hids=[32, 32],
-        total_timesteps=2 ** 18,
+        total_timesteps=2**18,
         learning_rate=1e-2,
         potential_noise=np.array([0.0, 1.0]),
         **kwargs,