Commit 8e0ef4b0 authored by metataro

sparse reward and deadlock reward environment wrappers

parent 673f3e52
from collections import defaultdict
from typing import Dict, Any, Optional, Set, List
import gym
import numpy as np
from flatland.core.grid.grid4_utils import get_new_position
from flatland.envs.agent_utils import EnvAgent, RailAgentStatus
from flatland.envs.rail_env import RailEnv, RailEnvActions
from envs.flatland.utils.gym_env import StepOutput
@@ -87,11 +88,13 @@ def find_all_cells_where_agent_can_choose(rail_env: RailEnv):
class SkipNoChoiceCellsWrapper(gym.Wrapper):
    def __init__(self, env, accumulate_skipped_rewards) -> None:
        super().__init__(env)
        self._switches = None
        self._switches_neighbors = None
        self._decision_cells = None
        self._accumulate_skipped_rewards = accumulate_skipped_rewards
        self._skipped_rewards = defaultdict(float)

    def _on_decision_cell(self, agent: EnvAgent):
        return agent.position is None or agent.position in self._decision_cells
@@ -112,6 +115,11 @@ class SkipNoChoiceCellsWrapper(gym.Wrapper):
                    r[agent_id] = reward[agent_id]
                    d[agent_id] = done[agent_id]
                    i[agent_id] = info[agent_id]
                    if self._accumulate_skipped_rewards:
                        # agent reached a decision cell (or is done): pay out the
                        # rewards banked while its steps were being skipped
                        r[agent_id] += self._skipped_rewards[agent_id]
                        self._skipped_rewards[agent_id] = 0.
                elif self._accumulate_skipped_rewards:
                    # agent is skipped this step: bank its reward for later
                    # (the enclosing decision-cell check sits above this hunk)
                    self._skipped_rewards[agent_id] += reward[agent_id]
            d['__all__'] = done['__all__']
            action_dict = {}
        return StepOutput(o, r, d, i)
@@ -122,3 +130,107 @@ class SkipNoChoiceCellsWrapper(gym.Wrapper):
            find_all_cells_where_agent_can_choose(self.unwrapped.rail_env)
        return obs
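For intuition, the skipped-reward bookkeeping above is just a per-agent running sum that is flushed the next time the agent is handed back to the caller. A toy, hedged illustration in plain Python (independent of the env; the agent id and reward values are made up):

from collections import defaultdict

# Toy illustration only (not part of the wrapper): rewards earned on skipped,
# no-choice steps are banked per agent and paid out in one lump the next time
# that agent reaches a decision cell or finishes.
skipped_rewards = defaultdict(float)
skipped_rewards[0] += -1.0   # skipped step: reward banked instead of returned
skipped_rewards[0] += -1.0   # another skipped step
payout = skipped_rewards[0]  # -2.0 gets added to the next returned reward
skipped_rewards[0] = 0.      # bank cleared after the payout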
class SparseRewardWrapper(gym.Wrapper):

    def __init__(self, env, finished_reward=1, not_finished_reward=-1) -> None:
        super().__init__(env)
        self._finished_reward = finished_reward
        self._not_finished_reward = not_finished_reward

    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
        rail_env: RailEnv = self.unwrapped.rail_env

        obs, reward, done, info = self.env.step(action_dict)

        o, r, d, i = {}, {}, {}, {}
        for agent_id, agent_obs in obs.items():
            o[agent_id] = obs[agent_id]
            d[agent_id] = done[agent_id]
            i[agent_id] = info[agent_id]

            if done[agent_id]:
                if rail_env.agents[agent_id].status in [RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED]:
                    # agent is done and actually reached its target -> finished reward
                    r[agent_id] = self._finished_reward
                else:
                    # agent is done without having reached its target
                    # (e.g. the episode ended first) -> not_finished reward
                    r[agent_id] = self._not_finished_reward
            else:
                r[agent_id] = 0

        d['__all__'] = done['__all__'] or all(d.values())

        return StepOutput(o, r, d, i)

    def reset(self, random_seed: Optional[int] = None) -> Dict[int, Any]:
        return self.env.reset(random_seed)
class DeadlockWrapper(gym.Wrapper):

    def __init__(self, env, deadlock_reward=-1) -> None:
        super().__init__(env)
        self._deadlock_reward = deadlock_reward
        self._deadlocked_agents = []

    def check_deadlock(self) -> List[int]:
        rail_env: RailEnv = self.unwrapped.rail_env
        new_deadlocked_agents = []
        for agent in rail_env.agents:
            if agent.status == RailAgentStatus.ACTIVE and agent.handle not in self._deadlocked_agents:
                # follow the agent's forced path (cells with a single possible transition)
                # until it reaches a free cell, a cell offering a choice, or an agent
                # that cannot get out of the way
                position = agent.position
                direction = agent.direction
                while position is not None:
                    possible_transitions = rail_env.rail.get_transitions(*position, direction)
                    num_transitions = np.count_nonzero(possible_transitions)
                    if num_transitions == 1:
                        new_direction_me = np.argmax(possible_transitions)
                        new_cell_me = get_new_position(position, new_direction_me)
                        opp_agent = rail_env.agent_positions[new_cell_me]
                        if opp_agent != -1:
                            # next cell is occupied: inspect the blocking agent
                            opp_position = rail_env.agents[opp_agent].position
                            opp_direction = rail_env.agents[opp_agent].direction
                            opp_possible_transitions = rail_env.rail.get_transitions(*opp_position, opp_direction)
                            opp_num_transitions = np.count_nonzero(opp_possible_transitions)
                            if opp_num_transitions == 1:
                                if opp_direction != direction:
                                    # blocking agent is also on a forced path and is not
                                    # moving the same way -> the two block each other
                                    self._deadlocked_agents.append(agent.handle)
                                    new_deadlocked_agents.append(agent.handle)
                                    position = None
                                else:
                                    # blocking agent moves in the same direction:
                                    # keep following the chain
                                    position = new_cell_me
                                    direction = new_direction_me
                            else:
                                # blocking agent still has a choice: keep following
                                position = new_cell_me
                                direction = new_direction_me
                        else:
                            # next cell is free -> no deadlock along this path
                            position = None
                    else:
                        # current cell offers a choice -> stop checking this agent
                        position = None
        return new_deadlocked_agents
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
        obs, reward, done, info = self.env.step(action_dict)

        if self._deadlock_reward != 0:
            new_deadlocked_agents = self.check_deadlock()
        else:
            new_deadlocked_agents = []

        o, r, d, i = {}, {}, {}, {}
        for agent_id, agent_obs in obs.items():
            if agent_id not in self._deadlocked_agents or agent_id in new_deadlocked_agents:
                o[agent_id] = obs[agent_id]
                d[agent_id] = done[agent_id]
                i[agent_id] = info[agent_id]
                r[agent_id] = reward[agent_id]
                if agent_id in new_deadlocked_agents:
                    # agent became deadlocked this step (and was not before):
                    # add the deadlock reward and mark the agent as done
                    r[agent_id] += self._deadlock_reward
                    d[agent_id] = True

        d['__all__'] = done['__all__'] or all(d.values())

        return StepOutput(o, r, d, i)

    def reset(self, random_seed: Optional[int] = None) -> Dict[int, Any]:
        self._deadlocked_agents = []
        return self.env.reset(random_seed)
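The new wrappers compose like ordinary gym wrappers. A minimal, hedged usage sketch (the `base_env` name is an assumption standing in for a FlatlandGymEnv-compatible multi-agent env; the wrapper order mirrors the FlatlandSparse changes below):

# Hedged usage sketch: terminal-only rewards, a deadlock penalty and
# no-choice-cell skipping stacked on an assumed env `base_env`.
env = SparseRewardWrapper(base_env, finished_reward=1, not_finished_reward=-1)
env = DeadlockWrapper(env, deadlock_reward=-1)
env = SkipNoChoiceCellsWrapper(env, accumulate_skipped_rewards=True)

obs = env.reset()
# per-agent rewards are now 0 until an agent finishes, deadlocks, or
# (with accumulation enabled) is returned at its next decision cell
obs, rewards, dones, infos = env.step(
    {agent_id: RailEnvActions.MOVE_FORWARD for agent_id in obs})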
@@ -32,7 +32,7 @@ class FlatlandRandomSparseSmall(MultiAgentEnv):
            regenerate_schedule_on_reset=self._config['regenerate_schedule_on_reset']
        )

        if env_config.get('skip_no_choice_cells', False):
            self._env = SkipNoChoiceCellsWrapper(self._env, self._config.get('accumulate_skipped_rewards', False))
        if env_config.get('available_actions_obs', False):
            self._env = AvailableActionsWrapper(self._env)
@@ -28,7 +28,7 @@ class FlatlandSingle(gym.Env):
            regenerate_schedule_on_reset=self._config['regenerate_schedule_on_reset']
        )

        if env_config.get('skip_no_choice_cells', False):
            self._env = SkipNoChoiceCellsWrapper(self._env, self._config.get('accumulate_skipped_rewards', False))
        if env_config.get('available_actions_obs', False):
            self._env = AvailableActionsWrapper(self._env)
@@ -11,7 +11,8 @@ from ray.rllib import MultiAgentEnv
from envs.flatland import get_generator_config
from envs.flatland.observations import make_obs
from envs.flatland.utils.gym_env import FlatlandGymEnv
from envs.flatland.utils.gym_env_wrappers import AvailableActionsWrapper, SkipNoChoiceCellsWrapper, SparseRewardWrapper, \
    DeadlockWrapper

class FlatlandSparse(MultiAgentEnv):
@@ -36,14 +37,18 @@ class FlatlandSparse(MultiAgentEnv):
            regenerate_rail_on_reset=self._config['regenerate_rail_on_reset'],
            regenerate_schedule_on_reset=self._config['regenerate_schedule_on_reset']
        )

        if env_config.get('sparse_reward', False):
            self._env = SparseRewardWrapper(self._env, finished_reward=env_config.get('done_reward', 1),
                                            not_finished_reward=env_config.get('not_finished_reward', -1))
        if env_config.get('deadlock_reward', 0) != 0:
            self._env = DeadlockWrapper(self._env, deadlock_reward=env_config['deadlock_reward'])
        if env_config.get('skip_no_choice_cells', False):
            self._env = SkipNoChoiceCellsWrapper(self._env, env_config.get('accumulate_skipped_rewards', False))
        if env_config.get('available_actions_obs', False):
            self._env = AvailableActionsWrapper(self._env)

    @property
    def observation_space(self) -> gym.spaces.Space:
        print(self._env.observation_space)
        return self._env.observation_space

    @property
flatland-sparse-small-sparse-reward-tree-fc-apex:
    run: APEX
    env: flatland_sparse
    stop:
        timesteps_total: 100000000 # 1e8
    checkpoint_freq: 10
    checkpoint_at_end: True
    # keep_checkpoints_num: 5
    checkpoint_score_attr: episode_reward_mean
    config:
        num_workers: 15
        num_envs_per_worker: 5
        num_gpus: 0
        env_config:
            sparse_reward: True
            done_reward: 1
            not_finished_reward: -1
            deadlock_reward: -1
            skip_no_choice_cells: False
            available_actions_obs: False
            observation: tree
            observation_config:
                max_depth: 2
                shortest_path_max_depth: 30
            generator: sparse_rail_generator
            generator_config: small_v0
            wandb:
                project: flatland
                entity: masterscrat
                tags: ["small_v0", "tree_obs", "apex", "sparse_reward", "deadlock_reward"] # TODO should be set programmatically
        model:
            fcnet_activation: relu
            fcnet_hiddens: [256, 256]
            vf_share_layers: True # False
            # custom_model: fully_connected_model
            # custom_options:
            #     layers: [256, 256, 256]
            #     activation: relu
            #     layer_norm: False
            #     vf_share_layers: True # False
            #     mask_unavailable_actions: False
flatland-sparse-small-action-mask-tree-fc-ppo:
    run: PPO
    env: flatland_sparse
    stop:
        timesteps_total: 10000000 # 1e7
    checkpoint_freq: 10
    checkpoint_at_end: True
    checkpoint_score_attr: episode_reward_mean
    config:
        clip_rewards: False
        # clip_param: 0.1
        # vf_clip_param: 500.0
        vf_clip_param: 10.0
        entropy_coeff: 0.01
        # effective batch_size: train_batch_size * num_agents_in_each_environment
        # see https://github.com/ray-project/ray/issues/4628
        train_batch_size: 1000 # 5000
        rollout_fragment_length: 50 # 100
        sgd_minibatch_size: 100 # 500
        num_sgd_iter: 10
        num_workers: 15
        num_envs_per_worker: 5
        batch_mode: truncate_episodes
        observation_filter: NoFilter
        vf_share_layers: True
        vf_loss_coeff: 0.5
        num_gpus: 0
        env_config:
            sparse_reward: True
            done_reward: 1
            deadlock_reward: -1
            not_finished_reward: -1
            skip_no_choice_cells: False
            available_actions_obs: False
            observation: new_tree
            observation_config:
                max_depth: 2
                shortest_path_max_depth: 30
            generator: sparse_rail_generator
            generator_config: small_v0
            wandb:
                project: flatland
                entity: masterscrat
                tags: ["small_v0", "tree_obs", "ppo", "sparse_reward", "deadlock_reward"] # TODO should be set programmatically
        model:
            fcnet_activation: relu
            fcnet_hiddens: [256, 256]
            vf_share_layers: True # False
            # custom_model: fully_connected_model
            # custom_options:
            #     layers: [256, 256, 256]
            #     activation: relu
            #     layer_norm: False
            #     vf_share_layers: True # False
            #     mask_unavailable_actions: False
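These experiment specs are meant to be launched through the baseline's Ray Tune entry point. A hedged launch sketch (the YAML file name is hypothetical, and the flatland_sparse env is assumed to be registered with RLlib by the training script):

import yaml
from ray import tune

# Hedged sketch: load one of the experiment specs above and hand it to Ray Tune.
# "sparse_reward_apex.yaml" is a hypothetical file name.
with open("sparse_reward_apex.yaml") as f:
    experiments = yaml.safe_load(f)

tune.run_experiments(experiments)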