asm_env, asm_fns added: started refactoring (#10)
* asm_env, asm_fns added: refactoring

* added saved_agents to .gitignore

* played with hyperpars, added more sb3_zoo hyperpars, fixed bug in asm_fns.observe_2o

* changes from Chris email chain: harvest_vul, survey_vul

* no resetting p_big, sdr, rho in initialize_population

* harvest_vul and survey_vul now different

* added missing guard against division by zero

* isVecObs added to handle vectorized-env observations inside sb3's evaluate_policy

---------

Co-authored-by: Felipe Montealegre-Mora <[email protected]>
felimomo and Felipe Montealegre-Mora authored Mar 21, 2024
1 parent bc0a45d commit bb69f3b
Showing 15 changed files with 612 additions and 31 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

saved_agents/
63 changes: 38 additions & 25 deletions hyperpars/rppo-asm2o.yml
@@ -2,38 +2,38 @@

# algo overall
algo: "RPPO"
total_timesteps: 10000000
total_timesteps: 20000000

additional_imports: ["torch"]

# env overall
env_id: "Asm2o-v0"
config: {}
n_envs: 32
n_envs: 4

# io
repo: "cboettig/rl-ecology"
save_path: "../saved_agents"

# # MINIMAL CONFIG
id: "minimal"
algo_config:
policy: 'MlpLstmPolicy'
tensorboard_log: "~/logs"
# id: "minimal"
# algo_config:
# policy: 'MlpLstmPolicy'
# tensorboard_log: "../../logs"

# # SLOW LEARN
# id: "slow"
# algo_config:
# policy: 'MlpLstmPolicy'
# tensorboard_log: "~/logs"
# tensorboard_log: "../../logs"
# learning_rate: 0.0001
# # default learning rate = 0.0003

# # EXTRA SLOW LEARN
# id: "extra-slow"
# algo_config:
# policy: 'MlpLstmPolicy'
# tensorboard_log: "~/logs"
# tensorboard_log: "../../logs"
# learning_rate: 0.00003


@@ -46,7 +46,7 @@ algo_config:
# algo_config:
# # normalize: True # not clear what this one actually does -- from the source code it seems to 'activate' VecNormalize, but more care & examination needed
# policy: 'MlpLstmPolicy'
# tensorboard_log: "~/logs"
# tensorboard_log: "../../logs"
# n_steps: 256
# batch_size: 256
# gae_lambda: 0.95
@@ -68,7 +68,7 @@ algo_config:
# id: "cheetah"
# algo_config:
# policy: 'MlpLstmPolicy'
# tensorboard_log: "~/logs"
# tensorboard_log: "../../logs"
# batch_size: 64
# n_steps: 512
# gamma: 0.98
@@ -88,26 +88,26 @@ algo_config:



# # INVERTED PENDULUM
# id: "inv_pend"
# algo_config:
# tensorboard_log: "~/logs"
# policy: 'MlpLstmPolicy'
# n_steps: 2048
# batch_size: 64
# gae_lambda: 0.95
# gamma: 0.99
# n_epochs: 10
# ent_coef: 0.0
# learning_rate: 2.5e-4
# clip_range: 0.2
# INVERTED PENDULUM
id: "inv_pend"
algo_config:
tensorboard_log: "../../logs"
policy: 'MlpLstmPolicy'
n_steps: 2048
batch_size: 64
gae_lambda: 0.95
gamma: 0.99
n_epochs: 10
ent_coef: 0.0
learning_rate: 2.5e-4
clip_range: 0.2


# # MOUNTAIN CAR NO VEL

# id: "mount_car"
# algo_config:
# tensorboard_log: "~/logs"
# tensorboard_log: "../../logs"
# policy: 'MlpLstmPolicy'
# batch_size: 256
# n_steps: 1024
@@ -124,4 +124,17 @@ algo_config:
# policy_kwargs: "dict(log_std_init=0.0, ortho_init=False,
# lstm_hidden_size=32,
# enable_critic_lstm=True,
# net_arch=dict(pi=[64], vf=[64]))"
# net_arch=dict(pi=[64], vf=[64]))"

# SPACE INVADERS V4
# id: "space_invaders"
# algo_config:
# tensorboard_log: "../../logs"
# policy: 'MlpLstmPolicy'
# batch_size: 512
# # clip_range: 0.1
# ent_coef: 0.012
# frame_stack: 4
# learning_rate: 2.5e-4
# policy_kwargs: dict(enable_critic_lstm=False, lstm_hidden_size=128, )
# vf_coef: 0.5
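
For orientation, a hedged sketch of how a config file like the one above could be consumed. The repository's actual training entry point is not part of this diff, and "RPPO" is assumed here to map to sb3-contrib's RecurrentPPO:

import yaml
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.env_util import make_vec_env
import rl4fisheries  # side effect: registers "Asm2o-v0" (see __init__.py below)

# load the hyperparameter file shown above
with open("hyperpars/rppo-asm2o.yml") as f:
    cfg = yaml.safe_load(f)

# build a vectorized env and the recurrent-policy agent described by the config
vec_env = make_vec_env(cfg["env_id"], n_envs=cfg["n_envs"], env_kwargs=cfg["config"])
model = RecurrentPPO(env=vec_env, **cfg["algo_config"])
model.learn(total_timesteps=cfg["total_timesteps"])
model.save(f"{cfg['save_path']}/rppo-{cfg['id']}")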
2 changes: 1 addition & 1 deletion hyperpars/tqc-asm2o-v0-1.yml
@@ -3,7 +3,7 @@
algo: "TQC"
env_id: "Asm2o-v0"
n_envs: 6
tensorboard: "/~/logs"
tensorboard: "~/logs"
total_timesteps: 12000000
config: {"learning_rate": 0.0001,
"learning_starts": 1000,
106 changes: 106 additions & 0 deletions scripts/fixed_policy_opt.py
@@ -0,0 +1,106 @@
#!/opt/venv/bin/python
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--policy", choices = ["msy", "esc", "cr"], help="Policy to be tuned", type=str)
parser.add_argument("-v", "--verbose", help="Verbosity of tuning method", type=bool)
parser.add_argument("-o", "--opt-algo", choices=["gp", "gbrt"], help="Optimization algo used")
args = parser.parse_args()

from huggingface_hub import hf_hub_download, HfApi, login
import numpy as np
from skopt.space import Real
from skopt.utils import use_named_args
from skopt import dump  # used at the end of this script to save the optimization results
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

from rl4fisheries import AsmEnv

# hf login
# api = HfApi()
# login()

# optimization algo
if args.opt_algo == "gp":
from skopt import gp_minimize
opt_algo = gp_minimize
elif args.opt_algo == "gbrt":
from skopt import gbrt_minimize
opt_algo = gbrt_minimize

# policy
if args.policy == "msy":
from rl4fisheries import Msy
policy_cls = Msy
elif args.policy == "esc":
from rl4fisheries import ConstEsc
policy_cls = ConstEsc
elif args.policy == "cr":
from rl4fisheries import CautionaryRule
policy_cls = CautionaryRule


# optimizing space
msy_space = [Real(0.002, 0.25, name='mortality')]
esc_space = [Real(0.02, 0.15, name='escapement')]
cr_space = [
Real(0.00001, 1, name='radius'),
Real(0.00001, np.pi/4.00001, name='theta'),
Real(0, 0.2, name='y2')
]
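# Note on cr_space: the CautionaryRule policy is searched in polar coordinates.
# In cr_fn below, x1 = radius * sin(theta) and x2 = radius * cos(theta); keeping
# theta within [0, pi/4] guarantees sin(theta) <= cos(theta), so x1 <= x2 holds
# by construction (the assert in cr_fn is a sanity check of this).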
space = {'msy':msy_space, 'esc':esc_space, 'cr':cr_space}[args.policy]

# optimizing function
from stable_baselines3.common.monitor import Monitor

@use_named_args(space)
def msy_fn(**params):
agent = Msy(AsmEnv(), mortality=params['mortality'])
env = AsmEnv()
mean, sd = evaluate_policy(agent, Monitor(env), n_eval_episodes=100)
return -mean

@use_named_args(space)
def esc_fn(**params):
agent = ConstEsc(AsmEnv(), escapement=params['escapement'])
env = AsmEnv()
mean, sd = evaluate_policy(agent, Monitor(env), n_eval_episodes=100)
return -mean

@use_named_args(space)
def cr_fn(**params):
theta = params["theta"]
radius = params["radius"]
x1 = np.sin(theta) * radius
x2 = np.cos(theta) * radius

assert x1 <= x2, f"CautionaryRule error: expected x1 <= x2, got x1={x1:.4f}, x2={x2:.4f}"

agent = CautionaryRule(AsmEnv(), x1 = x1, x2 = x2, y2 = params["y2"])
env = AsmEnv()
mean, sd = evaluate_policy(agent, Monitor(env), n_eval_episodes=100)
return -mean

opt_fn = {'msy':msy_fn, 'esc':esc_fn, 'cr':cr_fn}[args.policy]


# optimize
results = opt_algo(opt_fn, space, n_calls=300, verbose=args.verbose, n_jobs=-1)
print(
f"{args.policy}-{args.opt_algo} results: "
f"opt args = {[eval(f'{r:.4f}') for r in results.x]}, "
f"rew={results.fun:.4f}"
)

# save
path = "../saved_agents/"
fname = f"{args.policy}_{args.opt_algo}.pkl"
dump(results, path+fname)

api = HfApi()  # assumes a cached Hugging Face token; otherwise run login() above
api.upload_file(
path_or_fileobj=path+fname,
path_in_repo="sb3/rl4fisheries/"+fname,
repo_id="boettiger-lab/rl4eco",
repo_type="model",
)


15 changes: 15 additions & 0 deletions scripts/tune_fixed_policies.sh
@@ -0,0 +1,15 @@
#!/bin/bash

# move to script directory for normalized relative paths.
scriptdir="$(dirname "$0")"
cd "$scriptdir"

# gp
python fixed_policy_opt.py -p msy -v True -o gp &
python fixed_policy_opt.py -p esc -v True -o gp &
python fixed_policy_opt.py -p cr -v True -o gp &

# gbrt
python fixed_policy_opt.py -p msy -v True -o gbrt &
python fixed_policy_opt.py -p esc -v True -o gbrt &
python fixed_policy_opt.py -p cr -v True -o gbrt &
4 changes: 4 additions & 0 deletions src/rl4fisheries/__init__.py
@@ -2,6 +2,7 @@
from rl4fisheries.envs.asm import Asm
from rl4fisheries.envs.asm_2o import Asm2o
from rl4fisheries.envs.asm_esc import AsmEsc
from rl4fisheries.envs.asm_env import AsmEnv

from rl4fisheries.agents.cautionary_rule import CautionaryRule
from rl4fisheries.agents.const_esc import ConstEsc
@@ -15,3 +16,6 @@
register(id="AsmEsc-v0", entry_point="rl4fisheries.envs.asm_esc:AsmEsc")
# action is harvest, but observes both total count and mean biomass
register(id="Asm2o-v0", entry_point="rl4fisheries.envs.asm_2o:Asm2o")
# action is harvest, but observes both total count and mean biomass
register(id="AsmEnv", entry_point="rl4fisheries.envs.asm_env:AsmEnv")

6 changes: 5 additions & 1 deletion src/rl4fisheries/agents/cautionary_rule.py
@@ -6,18 +6,22 @@
from tqdm import tqdm
from .unit_interface import unitInterface

from rl4fisheries.agents.common import isVecObs

class CautionaryRule:
def __init__(self, x1=0, x2=1, y2=1, obs_bounds=1, **kwargs):
def __init__(self, env, x1=0, x2=1, y2=1, obs_bounds=1, **kwargs):
self.ui = unitInterface(bounds=obs_bounds)
self.x1 = x1
self.x2 = x2
self.y2 = y2
self.policy_type = "CautionaryRule_piecewise_linear"
self.env = env

assert x1 <= x2, "CautionaryRule error: x1 <= x2"

def predict(self, observation, **kwargs):
if isVecObs(observation, self.env):
observation = observation[0]
pop = self.ui.to_natural_units(observation)
raw_prediction = np.clip( self.predict_raw(pop), 0, 1)
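# rescale the harvest fraction from [0, 1] to the action range, assumed here to be [-1, 1]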
return np.float32([2 * raw_prediction - 1]), {}
10 changes: 10 additions & 0 deletions src/rl4fisheries/agents/common.py
@@ -0,0 +1,10 @@
import numpy as np

def isVecObs(obs, env):
    """Heuristically decide whether obs is a batched observation from a vectorized env."""
    shp = env.observation_space.shape
    if (
        (shp != np.shape(obs)) and
        (np.shape(obs[0]) == shp)  # quick n dirty, possibly prone to bugs tho
    ):
        return True
    return False
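
A minimal usage sketch of the helper (illustrative; assumes AsmEnv follows the gymnasium reset API of returning (obs, info)):

import numpy as np
from rl4fisheries import AsmEnv
from rl4fisheries.agents.common import isVecObs

env = AsmEnv()
single_obs, _ = env.reset()           # shape == env.observation_space.shape
batched_obs = np.stack([single_obs])  # leading batch axis, as a VecEnv would return
print(isVecObs(single_obs, env))      # False
print(isVecObs(batched_obs, env))     # True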
9 changes: 7 additions & 2 deletions src/rl4fisheries/agents/const_esc.py
@@ -4,19 +4,24 @@
import polars as pl
from tqdm import tqdm

from rl4fisheries.agents.common import isVecObs

class ConstEsc:
def __init__(self, escapement=0, bounds = 1, **kwargs):
def __init__(self, env, escapement=0, bounds = 1, **kwargs):
from .unit_interface import unitInterface
self.ui = unitInterface(bounds=bounds)
self.escapement = escapement
self.bounds = bounds
self.policy_type = "constant_escapement"
self.env = env


def predict(self, observation, **kwargs):
if isVecObs(observation, self.env):
observation = observation[0]
pop = self.ui.to_natural_units(observation)
raw_prediction = self.predict_raw(pop)
return 2 * raw_prediction - 1, {}
return np.float32([2 * raw_prediction - 1]), {}

def predict_raw(self, pop):
population = pop[0]
6 changes: 5 additions & 1 deletion src/rl4fisheries/agents/msy.py
@@ -4,14 +4,18 @@
import polars as pl
from tqdm import tqdm

from rl4fisheries.agents.common import isVecObs

class Msy:
def __init__(self, mortality: float =0, threshold: float =0, env = None, **kwargs):
def __init__(self, env, mortality: float =0, threshold: float =0, **kwargs):
self.mortality = mortality
self.threshold = threshold
self.policy_type = "msy_and_threshold"
self.env = env

def predict(self, observation, **kwargs):
if isVecObs(observation, self.env):
observation = observation[0]
pop = self.state_to_pop(observation)
raw_prediction = np.clip(self.predict_raw(pop), 0, 1)
return np.float32([2 * raw_prediction - 1]), {}
