Merge remote-tracking branch 'devops/bugfix/episode-length-and-rewards' into feature/2085-dump_describe_state

2023-12-01 15:58:32 +00:00
parent 008efa1e9d 321d1f7219
commit 32c13e06f6
6 changed files with 989 additions and 169 deletions
--- a/src/primaite/session/environment.py
+++ b/src/primaite/session/environment.py
@@ -83,14 +83,15 @@ class PrimaiteGymEnv(gymnasium.Env):
 class PrimaiteRayEnv(gymnasium.Env):
    """Ray wrapper that accepts a single `env_config` parameter in init function for compatibility with Ray."""

-    def __init__(self, env_config: Dict[str, PrimaiteGame]) -> None:
+    def __init__(self, env_config: Dict) -> None:
        """Initialise the environment.

        :param env_config: A dictionary containing the environment configuration. It must contain the key `cfg`,
            which holds the configuration passed to `PrimaiteGame.from_config` to build the game.
        :type env_config: Dict
        """
-        self.env = PrimaiteGymEnv(game=env_config["game"])
+        self.env = PrimaiteGymEnv(game=PrimaiteGame.from_config(env_config["cfg"]))
+        self.env.game.episode_counter -= 1
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

@@ -106,14 +107,14 @@ class PrimaiteRayEnv(gymnasium.Env):
 class PrimaiteRayMARLEnv(MultiAgentEnv):
    """Ray Environment that inherits from MultiAgentEnv to allow training MARL systems."""

-    def __init__(self, env_config: Optional[Dict] = None) -> None:
+    def __init__(self, env_config: Dict) -> None:
        """Initialise the environment.

        :param env_config: A dictionary containing the environment configuration. It must contain the key `cfg`,
            which holds the configuration passed to `PrimaiteGame.from_config` to build the game.
        :type env_config: Dict
        """
-        self.game: PrimaiteGame = env_config["game"]
+        self.game: PrimaiteGame = PrimaiteGame.from_config(env_config["cfg"])
        """Reference to the primaite game"""
        self.agents: Final[Dict[str, ProxyAgent]] = {agent.agent_name: agent for agent in self.game.rl_agents}
        """Mapping of agent name to every possible agent in the environment. This mapping should not change!"""
@@ -122,7 +123,10 @@ class PrimaiteRayMARLEnv(MultiAgentEnv):
        self.terminateds = set()
        self.truncateds = set()
        self.observation_space = gymnasium.spaces.Dict(
-            {name: agent.observation_manager.space for name, agent in self.agents.items()}
+            {
+                name: gymnasium.spaces.flatten_space(agent.observation_manager.space)
+                for name, agent in self.agents.items()
+            }
        )
        self.action_space = gymnasium.spaces.Dict(
            {name: agent.action_manager.space for name, agent in self.agents.items()}
@@ -173,4 +177,9 @@ class PrimaiteRayMARLEnv(MultiAgentEnv):

    def _get_obs(self) -> Dict[str, ObsType]:
        """Return the current observation."""
-        return {name: agent.observation_manager.current_observation for name, agent in self.agents.items()}
+        obs = {}
+        for name, agent in self.agents.items():
+            unflat_space = agent.observation_manager.space
+            unflat_obs = agent.observation_manager.current_observation
+            obs[name] = gymnasium.spaces.flatten(unflat_space, unflat_obs)
+        return obs
--- a/src/primaite/session/policy/rllib.py
+++ b/src/primaite/session/policy/rllib.py
@@ -12,6 +12,10 @@ from ray import air, tune
 from ray.rllib.algorithms import ppo
 from ray.rllib.algorithms.ppo import PPOConfig

+from primaite import getLogger
+
+_LOGGER = getLogger(__name__)
+

 class RaySingleAgentPolicy(PolicyABC, identifier="RLLIB_single_agent"):
    """Single agent RL policy using Ray RLLib."""
@@ -19,7 +23,7 @@ class RaySingleAgentPolicy(PolicyABC, identifier="RLLIB_single_agent"):
    def __init__(self, session: "PrimaiteSession", algorithm: Literal["PPO", "A2C"], seed: Optional[int] = None):
        super().__init__(session=session)

-        config = {
+        self.config = {
            "env": PrimaiteRayEnv,
            "env_config": {"game": session.game},
            "disable_env_checking": True,
@@ -29,12 +33,13 @@ class RaySingleAgentPolicy(PolicyABC, identifier="RLLIB_single_agent"):
        ray.shutdown()
        ray.init()

-        self._algo = ppo.PPO(config=config)
-
    def learn(self, n_episodes: int, timesteps_per_episode: int) -> None:
        """Train the agent."""
-        for ep in range(n_episodes):
-            self._algo.train()
+        self.config["training_iterations"] = n_episodes * timesteps_per_episode
+        self.config["train_batch_size"] = 128
+        self._algo = ppo.PPO(config=self.config)
+        _LOGGER.info("Starting RLLIB training session")
+        self._algo.train()

    def eval(self, n_episodes: int, deterministic: bool) -> None:
        """Evaluate the agent."""