diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9696bd5..bd0da67 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -65,11 +65,7 @@ commands:
           name: install dependencies
           # MUJOCO_KEY is defined in a CircleCI context
           # Do some sanity checks to make sure key works
-          command: |
-            curl -o /root/.mujoco/mjkey.txt ${MUJOCO_KEY}
-            md5sum /root/.mujoco/mjkey.txt
-            [[ -d /venv ]] || /evaluating-rewards/scripts/build_venv.sh /venv
-            python -c "import mujoco_py"
+          command: "[[ -d /venv ]] || /evaluating-rewards/scripts/build_venv.sh /venv"
 
       - save_cache:
           paths:
diff --git a/Dockerfile b/Dockerfile
index 9983f07..8756a83 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,6 +29,7 @@ RUN apt-get update -q \
     ffmpeg \
     software-properties-common \
     net-tools \
+    patchelf \
    parallel \
     python3.7 \
     python3.7-dev \
@@ -42,15 +43,13 @@ RUN apt-get update -q \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
-RUN curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf \
-    && chmod +x /usr/local/bin/patchelf
-
 ENV LANG C.UTF-8
 
 RUN mkdir -p /root/.mujoco \
     && curl -o mjpro150.zip https://www.roboti.us/download/mjpro150_linux.zip \
     && unzip mjpro150.zip -d /root/.mujoco \
-    && rm mjpro150.zip
+    && rm mjpro150.zip \
+    && curl -o /root/.mujoco/mjkey.txt https://www.roboti.us/file/mjkey.txt
 
 # Set the PATH to the venv before we create the venv, so it's visible in base.
 # This is since we may create the venv outside of Docker, e.g. in CI
@@ -69,9 +68,7 @@ WORKDIR /evaluating-rewards
 COPY ./scripts /evaluating-rewards/scripts
 COPY ./requirements.txt /evaluating-rewards
 COPY ./requirements-dev.txt /evaluating-rewards
-
-# mjkey.txt needs to exist for build, but doesn't need to be a real key
-RUN touch /root/.mujoco/mjkey.txt && /evaluating-rewards/scripts/build_venv.sh /venv
+RUN /evaluating-rewards/scripts/build_venv.sh /venv
 
 # full stage contains everything.
 # Can be used for deployment and local testing.
@@ -81,7 +78,7 @@ FROM python-req as full
 COPY . /evaluating-rewards
 # Build a wheel then install to avoid copying whole directory (pip issue #2195)
 RUN python setup.py sdist bdist_wheel
-RUN pip install dist/evaluating_rewards-*.whl
+RUN pip install --upgrade dist/evaluating_rewards-*.whl
 
 # Default entrypoints
 CMD ["pytest", "-n", "auto", "-vv", "tests/", "examples/"]
diff --git a/requirements.txt b/requirements.txt
index 3b86610..b0efb2f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,7 @@
 seals @ git+https://github.com/HumanCompatibleAI/seals.git@a425714
 imitation @ git+https://github.com/HumanCompatibleAI/imitation.git@tf-master
-stable-baselines @ git+https://github.com/hill-a/stable-baselines.git
 sacred @ git+https://github.com/IDSIA/sacred.git@e62bb6
-gym[mujoco]
+gym[mujoco]~=0.21.0
 tabulate
 # Avoid https://github.com/matplotlib/matplotlib/issues/18407
 matplotlib!=3.3.1,!=3.3.0
@@ -17,5 +16,6 @@ seaborn
 setuptools
 scikit-learn
 scipy
+stable-baselines>=2.10.1
 tensorflow>=1.15,<1.16
 xarray
diff --git a/setup.py b/setup.py
index 3519fbe..17ec8e4 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ def get_version() -> str:
         0,
         os.path.join(os.path.dirname(__file__), "src", "evaluating_rewards"),
     )
-    from version import (  # type:ignore  # pylint:disable=no-name-in-module,import-outside-toplevel
+    from version import (  # type:ignore  # pylint:disable=import-outside-toplevel,import-error
         VERSION,
     )
 
@@ -38,7 +38,7 @@ def get_version() -> str:
 
 
 def load_requirements(fname):
-    with open(fname) as f:
+    with open(fname, "r", encoding="utf-8") as f:
         return f.read().strip().split("\n")
 
 
diff --git a/src/evaluating_rewards/analysis/results.py b/src/evaluating_rewards/analysis/results.py
index 23d38d8..45ff6aa 100644
--- a/src/evaluating_rewards/analysis/results.py
+++ b/src/evaluating_rewards/analysis/results.py
@@ -168,10 +168,10 @@ def _find_sacred_parent(
         seen[parent] = path
 
     config_path = os.path.join(parent, "sacred", "config.json")
-    with open(config_path, "r") as f:
+    with open(config_path, "r", encoding="utf-8") as f:
         config = json.load(f)
 
     run_path = os.path.join(parent, "sacred", "run.json")
-    with open(run_path, "r") as f:
+    with open(run_path, "r", encoding="utf-8") as f:
         run = json.load(f)
     return config, run, parent
diff --git a/src/evaluating_rewards/analysis/reward_figures/point_mass.py b/src/evaluating_rewards/analysis/reward_figures/point_mass.py
index 3cd3eb6..4e83193 100644
--- a/src/evaluating_rewards/analysis/reward_figures/point_mass.py
+++ b/src/evaluating_rewards/analysis/reward_figures/point_mass.py
@@ -171,7 +171,7 @@ def evaluate_reward_model(
 
 
 def plot_state_density(
-    dataset_generator: datasets.TransitionsCallable, nsamples: int = 2 ** 12, **kwargs
+    dataset_generator: datasets.TransitionsCallable, nsamples: int = 2**12, **kwargs
 ):
     """Plots the density of a state distribution.
 
diff --git a/src/evaluating_rewards/distances/common_config.py b/src/evaluating_rewards/distances/common_config.py
index 2acd499..7f521fb 100644
--- a/src/evaluating_rewards/distances/common_config.py
+++ b/src/evaluating_rewards/distances/common_config.py
@@ -184,7 +184,7 @@ def _update_common_configs() -> None:
             COMMON_CONFIGS[chk_key] = dict(**base_cfg, y_reward_cfgs=chk_cfgs)
 
         nbits = 4
-        total_shards = 2 ** nbits
+        total_shards = 2**nbits
         if target_num > total_shards:
             shards = _bisect_nbits(nbits)
             for i, shard_num in zip(range(total_shards), shards):
diff --git a/src/evaluating_rewards/distances/epic_sample.py b/src/evaluating_rewards/distances/epic_sample.py
index 8ef927b..638244e 100644
--- a/src/evaluating_rewards/distances/epic_sample.py
+++ b/src/evaluating_rewards/distances/epic_sample.py
@@ -153,7 +153,7 @@ def sample_mean_rews(
     mean_from_obs: np.ndarray,
     act_samples: np.ndarray,
     next_obs_samples: np.ndarray,
-    batch_size: int = 2 ** 28,
+    batch_size: int = 2**28,
 ) -> Mapping[K, np.ndarray]:
     """
     Estimates the mean reward from observations `mean_from_obs` using given samples.
diff --git a/src/evaluating_rewards/distances/npec.py b/src/evaluating_rewards/distances/npec.py
index 28b4848..e1d5e09 100644
--- a/src/evaluating_rewards/distances/npec.py
+++ b/src/evaluating_rewards/distances/npec.py
@@ -67,6 +67,7 @@ def fit_affine(self, batch: types.Transitions):
 
     def fit(
         self,
+        *,
         dataset: datasets.TransitionsCallable,
         affine_size: Optional[int] = 4096,
         **kwargs,
@@ -136,6 +137,7 @@ def fit_affine(self, batch: types.Transitions) -> base.AffineParameters:
 
     def fit(
         self,
+        *,
         dataset: datasets.TransitionsCallable,
         total_timesteps: int = int(1e6),
         epoch_timesteps: int = 16384,
@@ -168,7 +170,7 @@ def fit(
             logging.info(f"Epoch {epoch}: {affine_stats}")
 
             epoch_stats = super().fit(
-                dataset, total_timesteps=epoch_timesteps, affine_size=None, **kwargs
+                dataset=dataset, total_timesteps=epoch_timesteps, affine_size=None, **kwargs
             )
 
             for k, v in epoch_stats.items():
diff --git a/src/evaluating_rewards/envs/point_mass.py b/src/evaluating_rewards/envs/point_mass.py
index cafd450..dff1a3b 100644
--- a/src/evaluating_rewards/envs/point_mass.py
+++ b/src/evaluating_rewards/envs/point_mass.py
@@ -79,17 +79,17 @@ def initial_state(self):
                 break
         return {"pos": pos, "vel": vel, "goal": goal}
 
-    def transition(self, old_state, action):
+    def transition(self, state, action):
         action = np.array(action)
         action = action.clip(-1, 1)
         return {
-            "pos": old_state["pos"] + self.dt * old_state["vel"],
-            "vel": old_state["vel"] + self.dt * action,
-            "goal": old_state["goal"],
+            "pos": state["pos"] + self.dt * state["vel"],
+            "vel": state["vel"] + self.dt * action,
+            "goal": state["goal"],
         }
 
-    def reward(self, old_state, action, new_state):
-        del old_state
+    def reward(self, state, action, new_state):
+        del state
         dist = np.linalg.norm(new_state["pos"] - new_state["goal"])
         ctrl_penalty = np.dot(action, action)
         return -dist - self.ctrl_coef * ctrl_penalty
diff --git a/src/evaluating_rewards/experiments/synthetic.py b/src/evaluating_rewards/experiments/synthetic.py
index 0fcbfc2..8d2ea3d 100644
--- a/src/evaluating_rewards/experiments/synthetic.py
+++ b/src/evaluating_rewards/experiments/synthetic.py
@@ -225,7 +225,7 @@ def compare_synthetic(
     model_potential: bool = True,
     discount: float = 0.99,
     optimizer: Type[tf.train.Optimizer] = tf.train.AdamOptimizer,
-    total_timesteps: int = 2 ** 16,
+    total_timesteps: int = 2**16,
     batch_size: int = 128,
     test_size: int = 4096,
     pretrain: bool = True,
diff --git a/src/evaluating_rewards/scripts/pipeline/combined_distances.py b/src/evaluating_rewards/scripts/pipeline/combined_distances.py
index 62294f5..858f73d 100644
--- a/src/evaluating_rewards/scripts/pipeline/combined_distances.py
+++ b/src/evaluating_rewards/scripts/pipeline/combined_distances.py
@@ -388,7 +388,7 @@ def _fixed_width_format(x: float, figs: int = 3) -> str:
             num_leading_zeros += 1
         else:
             break
-    if x >= 10 ** figs:
+    if x >= 10**figs:
         # No decimal point gives us an extra character to use
         figs += 1
     fstr = "{:." + str(max(0, figs - num_leading_zeros)) + "g}"
diff --git a/src/evaluating_rewards/scripts/rewards/train_regress.py b/src/evaluating_rewards/scripts/rewards/train_regress.py
index 33a93d3..04d38f9 100644
--- a/src/evaluating_rewards/scripts/rewards/train_regress.py
+++ b/src/evaluating_rewards/scripts/rewards/train_regress.py
@@ -32,7 +32,7 @@ def default_config():
     locals().update(**regress_utils.DEFAULT_CONFIG)
     checkpoint_interval = 50  # save every checkpoint_interval epochs
     dataset_factory = datasets.transitions_factory_from_serialized_policy
-    dataset_factory_kwargs = dict()
+    dataset_factory_kwargs = {}
 
     # Model to train and hyperparameters
     model_reward_type = base.MLPRewardModel
@@ -104,7 +104,7 @@ def make_trainer(model, model_scope, target):
     def do_training(target, trainer, callback: Optional[base.Callback]):
         del target
         return trainer.fit(
-            dataset_generator,
+            dataset=dataset_generator,
             total_timesteps=total_timesteps,
             batch_size=batch_size,
             callback=callback,
diff --git a/src/evaluating_rewards/scripts/script_utils.py b/src/evaluating_rewards/scripts/script_utils.py
index 2e38b72..41a73a8 100644
--- a/src/evaluating_rewards/scripts/script_utils.py
+++ b/src/evaluating_rewards/scripts/script_utils.py
@@ -87,7 +87,8 @@ def recursive_dict_merge(
             elif overwrite:
                 dest[key] = update_by[key]
             else:
-                raise Exception("Conflict at {}".format(".".join(path + [str(key)])))
+                msg = "Conflict at " + ".".join(path + [str(key)])
+                raise Exception(msg)
         else:
             dest[key] = update_by[key]
     return dest
diff --git a/tests/test_rewards.py b/tests/test_rewards.py
index ec5ada6..893d3d9 100644
--- a/tests/test_rewards.py
+++ b/tests/test_rewards.py
@@ -34,7 +34,7 @@ from evaluating_rewards.rewards import base
 from tests import common
 
-ENVS = ["FrozenLake-v0", "CartPole-v1", "Pendulum-v0"]
+ENVS = ["FrozenLake-v1", "CartPole-v1", "Pendulum-v1"]
 
 
 STANDALONE_REWARD_MODELS = {
    "halfcheetah_ground_truth": {
diff --git a/tests/test_synthetic.py b/tests/test_synthetic.py
index ca44e35..2d3e907 100644
--- a/tests/test_synthetic.py
+++ b/tests/test_synthetic.py
@@ -109,7 +109,7 @@ def f(total_timesteps: int):
         "dataset_potential_hids": [],
         "model_potential_hids": [],
         "learning_rate": 1e-2,
-        "total_timesteps": 2 ** 18,
+        "total_timesteps": 2**18,
         "batch_size": 256,
     },
     "rel_upperbound": 0.2,
@@ -120,7 +120,7 @@ def f(total_timesteps: int):
         "dataset_potential_hids": [4],
         "model_potential_hids": [32],
         "learning_rate": 1e-2,
-        "total_timesteps": 2 ** 18,
+        "total_timesteps": 2**18,
         "batch_size": 512,
     },
     "rel_upperbound": 0.2,
@@ -131,7 +131,7 @@ def f(total_timesteps: int):
         "dataset_potential_hids": [4, 4],
         "model_potential_hids": [32, 32],
         "learning_rate": 1e-2,
-        "total_timesteps": 2 ** 18,
+        "total_timesteps": 2**18,
         "batch_size": 512,
     },
     "rel_upperbound": 0.2,
@@ -283,7 +283,7 @@ def test_pretrain_affine(self, helper_affine, kwargs):
         dataset_potential_hids=[4, 4],
         model_potential=True,
         model_potential_hids=[32, 32],
-        total_timesteps=2 ** 18,
+        total_timesteps=2**18,
         learning_rate=1e-2,
         potential_noise=np.array([0.0, 1.0]),
         **kwargs,
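
Side note on the `fit(self, *, dataset, ...)` changes in npec.py: the bare `*` makes `dataset` keyword-only, which is why the call sites in npec.py and train_regress.py now pass `dataset=...` explicitly. Below is a minimal, self-contained sketch of the pattern using toy names (not the project's actual classes), just to illustrate the behavior change:

class Trainer:
    """Toy stand-in for a fit() API with a keyword-only dataset argument."""

    def fit(self, *, dataset, total_timesteps=16384, **kwargs):
        # The bare `*` forces callers to write `dataset=...`; a positional call
        # now fails immediately instead of silently binding to the wrong
        # parameter if the signature later changes.
        return {"total_timesteps": total_timesteps, "num_samples": len(dataset)}

trainer = Trainer()
trainer.fit(dataset=[1, 2, 3], total_timesteps=1024)   # OK
# trainer.fit([1, 2, 3], total_timesteps=1024)         # TypeError: takes 1 positional argument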