Commit 3c443618 authored by Egli Adrian (IT-SCI-API-PFI)

experiment with ppo

parent 44fc3248
@@ -22,7 +22,8 @@ from torch.utils.tensorboard import SummaryWriter
from reinforcement_learning.dddqn_policy import DDDQNPolicy
from reinforcement_learning.ppo_agent import PPOPolicy
from reinforcement_learning.ppo_deadlockavoidance_agent import MultiDecisionAgent
-from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action
+from utils.agent_action_config import get_flatland_full_action_size, get_action_size, map_actions, map_action, \
+    map_rail_env_action
from utils.dead_lock_avoidance_agent import DeadLockAvoidanceAgent
from utils.deadlock_check import get_agent_positions, check_for_deadlock
@@ -173,7 +174,7 @@ def train_agent(train_params, train_env_params, eval_env_params, obs_params):
    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, get_action_size(), train_params)
    if True:
-        policy = PPOPolicy(state_size, get_action_size())
+        policy = PPOPolicy(state_size, get_action_size(), use_replay_buffer=True, in_parameters=train_params)
    if False:
        policy = DeadLockAvoidanceAgent(train_env, get_action_size())
    if False:
@@ -517,9 +518,9 @@ def eval_policy(env, tree_observation, policy, train_params, obs_params):
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-n", "--n_episodes", help="number of episodes to run", default=12000, type=int)
-    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1,
                        type=int)
-    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=2,
+    parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1,
                        type=int)
    parser.add_argument("--n_evaluation_episodes", help="number of evaluation episodes", default=10, type=int)
    parser.add_argument("--checkpoint_interval", help="checkpoint interval", default=100, type=int)
......
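The only functional change in the last hunk above is the default environment config id, which drops from 2 to 1 (Test_1 instead of Test_2) for both training and evaluation. The snippet below is a minimal, self-contained sketch that rebuilds just those two options to show how the new defaults behave; it is illustrative and not part of the commit.

from argparse import ArgumentParser

# Rebuild only the two changed options to illustrate the new defaults.
parser = ArgumentParser()
parser.add_argument("-t", "--training_env_config", help="training config id (eg 0 for Test_0)", default=1, type=int)
parser.add_argument("-e", "--evaluation_env_config", help="evaluation config id (eg 0 for Test_0)", default=1, type=int)

args = parser.parse_args([])                      # no flags -> the new defaults
assert args.training_env_config == 1              # was 2 before this commit
assert args.evaluation_env_config == 1            # was 2 before this commit
args = parser.parse_args(["-t", "2", "-e", "2"])  # explicit override back to Test_2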
import copy
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
@@ -11,10 +10,6 @@ from torch.distributions import Categorical
from reinforcement_learning.policy import LearningPolicy
from reinforcement_learning.replay_buffer import ReplayBuffer
-device = torch.device("cpu")  # "cuda:0" if torch.cuda.is_available() else "cpu")
-print("device:", device)
# https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html
class EpisodeBuffers:
@@ -96,27 +91,46 @@ class ActorCriticModel(nn.Module):
class PPOPolicy(LearningPolicy):
-    def __init__(self, state_size, action_size, use_replay_buffer=False):
+    def __init__(self, state_size, action_size, use_replay_buffer=False, in_parameters=None):
        print(">> PPOPolicy")
        super(PPOPolicy, self).__init__()
        # parameters
-        self.learning_rate = 1.0e-3
-        self.gamma = 0.95
+        self.ppo_parameters = in_parameters
+        if self.ppo_parameters is not None:
+            self.hidsize = self.ppo_parameters.hidden_size
+            self.buffer_size = self.ppo_parameters.buffer_size
+            self.batch_size = self.ppo_parameters.batch_size
+            self.learning_rate = self.ppo_parameters.learning_rate
+            self.gamma = self.ppo_parameters.gamma
+            # Device
+            if self.ppo_parameters.use_gpu and torch.cuda.is_available():
+                self.device = torch.device("cuda:0")
+                # print("🐇 Using GPU")
+            else:
+                self.device = torch.device("cpu")
+                # print("🐢 Using CPU")
+        else:
+            self.hidsize = 128
+            self.learning_rate = 1.0e-3
+            self.gamma = 0.95
+            self.buffer_size = 32_000
+            self.batch_size = 1024
+            self.device = torch.device("cpu")
        self.surrogate_eps_clip = 0.1
        self.K_epoch = 10
        self.weight_loss = 0.5
        self.weight_entropy = 0.01
-        self.buffer_size = 32_000
-        self.batch_size = 1024
        self.buffer_min_size = 0
        self.use_replay_buffer = use_replay_buffer
-        self.device = device
        self.current_episode_memory = EpisodeBuffers()
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
        self.loss = 0
-        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device)
+        self.actor_critic_model = ActorCriticModel(state_size, action_size, self.device,
+                                                    hidsize1=self.hidsize,
+                                                    hidsize2=self.hidsize)
        self.optimizer = optim.Adam(self.actor_critic_model.parameters(), lr=self.learning_rate)
        self.loss_function = nn.MSELoss()  # nn.SmoothL1Loss()
......
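The constructor change above makes the PPO hyperparameters configurable: when in_parameters is given, the hidden size, buffer size, batch size, learning rate, gamma and device all come from it, otherwise the previous hard-coded CPU defaults are used. Below is a minimal sketch of exercising that path outside the training script; it assumes the repository is on PYTHONPATH, uses a SimpleNamespace as a stand-in for train_params, and the state/action sizes are illustrative placeholders rather than values from this commit.

from types import SimpleNamespace

from reinforcement_learning.ppo_agent import PPOPolicy

# Stand-in for train_params exposing only the attributes the new constructor reads.
hyperparams = SimpleNamespace(
    hidden_size=128,
    buffer_size=32_000,
    batch_size=1024,
    learning_rate=1.0e-3,
    gamma=0.95,
    use_gpu=False,
)

# Illustrative sizes only; the training script derives them from the observation builder.
policy = PPOPolicy(state_size=231, action_size=5,
                   use_replay_buffer=True, in_parameters=hyperparams)

# With in_parameters omitted the policy falls back to its hard-coded CPU defaults.
default_policy = PPOPolicy(state_size=231, action_size=5)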