Modify car_pos
This commit is contained in:
parent ee914ff930
commit 3086413171

.gitignore (vendored, 2 lines changed)
@@ -10,6 +10,8 @@ __pycache__/
 # Pytorch weights
 weights/
 solutions/
+PPO_preTrained/
+PPO_logs/

 # Distribution / packaging
 .Python
@@ -11,6 +11,9 @@ import argparse
 from ppo import PPO
 from network import FeedForwardNN
 from eval_policy import eval_policy
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env import PartitionMazeEnv

 def train(env, hyperparameters, actor_model, critic_model):
@@ -11,6 +11,9 @@ import argparse
 from ppo import PPO
 from network import FeedForwardNN
 from eval_policy import eval_policy
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env import PartitionMazeEnv

 def train(env, hyperparameters, actor_model, critic_model):
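The three added lines put the repository root on the import path so that `from env import PartitionMazeEnv` resolves even when the script is run from inside its own subfolder. A minimal sketch of what the added expression evaluates to; the path `<repo>/PPO/main.py` is only a hypothetical example, not a filename taken from this commit:

import os
import sys

# For a hypothetical script located at <repo>/PPO/main.py:
#   os.path.abspath(__file__)               -> <repo>/PPO/main.py
#   os.path.dirname(...)                    -> <repo>/PPO
#   os.path.dirname(os.path.dirname(...))   -> <repo>
# Appending <repo> to sys.path lets `import env` find <repo>/env.py
# regardless of the current working directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))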
@@ -183,7 +183,7 @@ class PPO:
             ep_rews = [] # rewards collected per episode

             # Reset the environment. Note that obs is short for observation.
-            obs, _ = self.env.reset()
+            obs = self.env.reset()
             done = False

             # Run an episode for a maximum of max_timesteps_per_episode timesteps
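For context, PartitionMazeEnv.reset() after this commit returns a bare observation (see the env.py hunk further down, where `return state, {}` becomes `return state`), whereas Gymnasium's reset() returns an `(obs, info)` tuple; that is why the unpacking here drops the underscore. A rollout loop that wanted to tolerate both conventions could unpack defensively; a minimal sketch, not taken from ppo.py:

def reset_obs(env):
    # Handle both reset() conventions: a bare `obs`, or Gymnasium's `(obs, info)`.
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2:
        return out[0]
    return out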
PPO2/PPO.py (new file, 273 lines)

@@ -0,0 +1,273 @@
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

################################## set device ##################################
print("============================================================================================")
# set device to cpu or cuda
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    print("Device set to : cpu")
print("============================================================================================")


################################## PPO Policy ##################################
class RolloutBuffer:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        self.is_terminals = []

    def clear(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.state_values[:]
        del self.is_terminals[:]


class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, has_continuous_action_space, action_std_init):
        super(ActorCritic, self).__init__()

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_dim = action_dim
            self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device)
        # actor
        if has_continuous_action_space:
            self.actor = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.Tanh(),
                # nn.Sigmoid(),
                # nn.ReLU(),
                nn.Linear(64, 64),
                nn.Tanh(),
                # nn.Sigmoid(),
                # nn.ReLU(),
                nn.Linear(64, action_dim),
                nn.Tanh()
                # nn.Sigmoid()
                # nn.ReLU()
            )
        else:
            self.actor = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.Tanh(),
                nn.Linear(64, 64),
                nn.Tanh(),
                nn.Linear(64, action_dim),
                nn.Softmax(dim=-1)
            )
        # critic
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            # nn.Sigmoid(),
            # nn.ReLU(),
            nn.Linear(64, 64),
            nn.Tanh(),
            # nn.Sigmoid(),
            # nn.ReLU(),
            nn.Linear(64, 1)
        )

    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        if self.has_continuous_action_space:
            action_mean = self.actor(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(action_mean, cov_mat)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)
        state_val = self.critic(state)

        return action.detach(), action_logprob.detach(), state_val.detach()

    def evaluate(self, state, action):
        if self.has_continuous_action_space:
            action_mean = self.actor(state)

            action_var = self.action_var.expand_as(action_mean)
            cov_mat = torch.diag_embed(action_var).to(device)
            dist = MultivariateNormal(action_mean, cov_mat)

            # For Single Action Environments.
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy


class PPO:
    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_std = new_action_std
            self.policy.set_action_std(new_action_std)
            self.policy_old.set_action_std(new_action_std)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling PPO::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            self.action_std = self.action_std - action_std_decay_rate
            self.action_std = round(self.action_std, 4)
            if (self.action_std <= min_action_std):
                self.action_std = min_action_std
                print("setting actor output action_std to min_action_std : ", self.action_std)
            else:
                print("setting actor output action_std to : ", self.action_std)
            self.set_action_std(self.action_std)
        else:
            print("WARNING : Calling PPO::decay_action_std() on discrete action space policy")
        print("--------------------------------------------------------------------------------------------")

    def select_action(self, state):
        if self.has_continuous_action_space:
            with torch.no_grad():
                state = torch.FloatTensor(state).to(device)
                action, action_logprob, state_val = self.policy_old.act(state)

            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)
            self.buffer.state_values.append(state_val)

            return action.detach().cpu().numpy().flatten()
        else:
            with torch.no_grad():
                state = torch.FloatTensor(state).to(device)
                action, action_logprob, state_val = self.policy_old.act(state)

            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)
            self.buffer.state_values.append(state_val)

            return action.item()

    def update(self):
        # Monte Carlo estimate of returns
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
        old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)

        # calculate advantages
        advantages = rewards.detach() - old_state_values.detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
        self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
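For orientation, the PPO class above is driven entirely from the outside: the caller alternates select_action/step, appends rewards and done flags to the shared buffer, and triggers update() periodically (PPO2/train.py below does exactly this). A minimal driver sketch under those assumptions; the CartPole-v1 environment and the step counts are only an illustration, not something used in this repository:

import gymnasium as gym
from PPO import PPO

env = gym.make("CartPole-v1")
agent = PPO(state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.n,
            lr_actor=3e-4, lr_critic=1e-3, gamma=0.99,
            K_epochs=80, eps_clip=0.2,
            has_continuous_action_space=False)

state, _ = env.reset()
for t in range(1, 2001):
    action = agent.select_action(state)              # stores state/action/logprob/value in agent.buffer
    state, reward, terminated, truncated, _ = env.step(action)
    agent.buffer.rewards.append(reward)              # rewards and done flags are appended by the caller
    agent.buffer.is_terminals.append(terminated or truncated)
    if t % 800 == 0:
        agent.update()                               # clipped-PPO update over the buffer, then clears it
    if terminated or truncated:
        state, _ = env.reset()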
PPO2/plot_graph.py (new file, 142 lines)

@@ -0,0 +1,142 @@
import os
import pandas as pd
import matplotlib.pyplot as plt


def save_graph():
    print("============================================================================================")
    # env_name = 'CartPole-v1'
    # env_name = 'LunarLander-v2'
    # env_name = 'BipedalWalker-v2'
    env_name = 'RoboschoolWalker2d-v1'

    fig_num = 0         #### change this to prevent overwriting figures in same env_name folder
    plot_avg = True     # plot average of all runs; else plot all runs separately
    fig_width = 10
    fig_height = 6

    # smooth out rewards to get a smooth and a less smooth (var) plot lines
    window_len_smooth = 20
    min_window_len_smooth = 1
    linewidth_smooth = 1.5
    alpha_smooth = 1

    window_len_var = 5
    min_window_len_var = 1
    linewidth_var = 2
    alpha_var = 0.1

    colors = ['red', 'blue', 'green', 'orange', 'purple', 'olive', 'brown', 'magenta', 'cyan', 'crimson', 'gray', 'black']

    # make directory for saving figures
    figures_dir = "PPO_figs"
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    # make environment directory for saving figures
    figures_dir = figures_dir + '/' + env_name + '/'
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    fig_save_path = figures_dir + '/PPO_' + env_name + '_fig_' + str(fig_num) + '.png'

    # get number of log files in directory
    log_dir = "PPO_logs" + '/' + env_name + '/'

    current_num_files = next(os.walk(log_dir))[2]
    num_runs = len(current_num_files)

    all_runs = []

    for run_num in range(num_runs):

        log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"
        print("loading data from : " + log_f_name)
        data = pd.read_csv(log_f_name)
        data = pd.DataFrame(data)

        print("data shape : ", data.shape)

        all_runs.append(data)
        print("--------------------------------------------------------------------------------------------")

    ax = plt.gca()

    if plot_avg:
        # average all runs
        df_concat = pd.concat(all_runs)
        df_concat_groupby = df_concat.groupby(df_concat.index)
        data_avg = df_concat_groupby.mean()

        # smooth out rewards to get a smooth and a less smooth (var) plot lines
        data_avg['reward_smooth'] = data_avg['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean()
        data_avg['reward_var'] = data_avg['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean()

        data_avg.plot(kind='line', x='timestep', y='reward_smooth', ax=ax, color=colors[0], linewidth=linewidth_smooth, alpha=alpha_smooth)
        data_avg.plot(kind='line', x='timestep', y='reward_var', ax=ax, color=colors[0], linewidth=linewidth_var, alpha=alpha_var)

        # keep only reward_smooth in the legend and rename it
        handles, labels = ax.get_legend_handles_labels()
        ax.legend([handles[0]], ["reward_avg_" + str(len(all_runs)) + "_runs"], loc=2)

    else:
        for i, run in enumerate(all_runs):
            # smooth out rewards to get a smooth and a less smooth (var) plot lines
            run['reward_smooth_' + str(i)] = run['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean()
            run['reward_var_' + str(i)] = run['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean()

            # plot the lines
            run.plot(kind='line', x='timestep', y='reward_smooth_' + str(i), ax=ax, color=colors[i % len(colors)], linewidth=linewidth_smooth, alpha=alpha_smooth)
            run.plot(kind='line', x='timestep', y='reward_var_' + str(i), ax=ax, color=colors[i % len(colors)], linewidth=linewidth_var, alpha=alpha_var)

        # keep alternate elements (reward_smooth_i) in the legend
        handles, labels = ax.get_legend_handles_labels()
        new_handles = []
        new_labels = []
        for i in range(len(handles)):
            if i % 2 == 0:
                new_handles.append(handles[i])
                new_labels.append(labels[i])
        ax.legend(new_handles, new_labels, loc=2)

    # ax.set_yticks(np.arange(0, 1800, 200))
    # ax.set_xticks(np.arange(0, int(4e6), int(5e5)))

    ax.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)

    ax.set_xlabel("Timesteps", fontsize=12)
    ax.set_ylabel("Rewards", fontsize=12)

    plt.title(env_name, fontsize=14)

    fig = plt.gcf()
    fig.set_size_inches(fig_width, fig_height)

    print("============================================================================================")
    plt.savefig(fig_save_path)
    print("figure saved at : ", fig_save_path)
    print("============================================================================================")

    plt.show()


if __name__ == '__main__':
    save_graph()
PPO2/test.py (new file, 123 lines)

@@ -0,0 +1,123 @@
import os
import glob
import time
from datetime import datetime

import torch
import numpy as np

# import gym
# import roboschool

from PPO import PPO
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


#################################### Testing ###################################
def test():
    print("============================================================================================")

    ################## hyperparameters ##################

    # env_name = "CartPole-v1"
    # has_continuous_action_space = False
    # max_ep_len = 400
    # action_std = None

    # env_name = "LunarLander-v2"
    # has_continuous_action_space = False
    # max_ep_len = 300
    # action_std = None

    # env_name = "BipedalWalker-v2"
    # has_continuous_action_space = True
    # max_ep_len = 1500           # max timesteps in one episode
    # action_std = 0.1            # set same std for action distribution which was used while saving

    env_name = "test"
    has_continuous_action_space = True
    max_ep_len = 1000           # max timesteps in one episode
    action_std = 0.1            # set same std for action distribution which was used while saving

    render = True               # render environment on screen
    frame_delay = 0             # if required; add delay b/w frames

    total_test_episodes = 10    # total num of testing episodes

    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor

    lr_actor = 0.0003           # learning rate for actor
    lr_critic = 0.001           # learning rate for critic

    #####################################################

    # env = gym.make(env_name)
    env = PartitionMazeEnv()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # preTrained weights directory

    random_seed = 0             #### set this to load a particular checkpoint trained on random seed
    run_num_pretrained = 0      #### set this to load a particular checkpoint num

    directory = "PPO_preTrained" + '/' + env_name + '/'
    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("loading network from : " + checkpoint_path)

    ppo_agent.load(checkpoint_path)

    print("--------------------------------------------------------------------------------------------")

    test_running_reward = 0

    for ep in range(1, total_test_episodes+1):
        ep_reward = 0
        state = env.reset()

        for t in range(1, max_ep_len+1):
            action = ppo_agent.select_action(state)
            state, reward, done, _, _ = env.step(action)
            ep_reward += reward

            if render:
                env.render()
                time.sleep(frame_delay)

            if done:
                break

        # clear buffer
        ppo_agent.buffer.clear()

        test_running_reward += ep_reward
        print('Episode: {} \t\t Reward: {}'.format(ep, round(ep_reward, 2)))
        ep_reward = 0

    env.close()

    print("============================================================================================")

    avg_test_reward = test_running_reward / total_test_episodes
    avg_test_reward = round(avg_test_reward, 2)
    print("average test reward : " + str(avg_test_reward))

    print("============================================================================================")


if __name__ == '__main__':
    test()
PPO2/train.py (new file, 266 lines)

@@ -0,0 +1,266 @@
import os
import glob
import time
from datetime import datetime

import torch
import numpy as np

# import gym
# import roboschool
import gymnasium as gym

from PPO import PPO
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv

################################### Training ###################################
def train():
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "test"

    has_continuous_action_space = True  # continuous action space; else discrete

    max_ep_len = 100                    # max timesteps in one episode
    max_training_timesteps = int(3e8)   # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10        # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2           # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)          # save model frequency (in num timesteps)

    action_std = 0.6                    # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05        # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1                # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be > than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4    # update policy every n timesteps
    K_epochs = 80                       # update policy for K epochs in one PPO update

    eps_clip = 0.2                      # clip parameter for PPO
    gamma = 0.99                        # discount factor

    lr_actor = 0.0003                   # learning rate for actor network
    lr_critic = 0.001                   # learning rate for critic network

    random_seed = 0                     # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)
    env = PartitionMazeEnv()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run
    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

    directory = "PPO_preTrained"
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = directory + '/' + env_name + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len+1):

            # select action with policy
            action = ppo_agent.select_action(state)
            state, reward, done, _, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:

                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                log_avg_reward = round(log_avg_reward, 4)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:

                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                print_avg_reward = round(print_avg_reward, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time : ", end_time - start_time)
    print("============================================================================================")


if __name__ == '__main__':
    train()
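Note that train.py and test.py above unpack `env.step(action)` into five values (Gymnasium style) but treat `env.reset()` as returning a bare observation, which is exactly the hybrid interface PartitionMazeEnv exposes after the env.py change below. A minimal stub showing the contract these scripts assume; the class name and observation size are illustrative only, not taken from the repository:

import numpy as np

class StubEnv:
    # Stand-in for the interface assumed by PPO2/train.py and PPO2/test.py.
    def reset(self):
        # bare observation, no info dict (matches `return state` in env.py below)
        return np.zeros(8, dtype=np.float32)

    def step(self, action):
        obs = np.zeros(8, dtype=np.float32)
        reward, terminated, truncated, info = 0.0, False, False, {}
        # five return values, unpacked as `state, reward, done, _, _ = env.step(action)`
        return obs, reward, terminated, truncated, info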
@@ -64,7 +64,7 @@ class PartitionMazeEnv(gym.Env):
         self.BASE_LINE = 3500.0  # baseline time, computed via greedy or Monte Carlo
         self.step_count = 0
         self.rectangles = {}
-        self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)]
+        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0

@@ -79,13 +79,13 @@ class PartitionMazeEnv(gym.Env):
         self.region_centers = []
         self.step_count = 0
         self.rectangles = {}
-        self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)]
+        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
         # State: the first 4 dims are partition_values, the rest are padded with 0
         state = np.concatenate(
             [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
-        return state, {}
+        return state

     def step(self, action):
         # In every phase the action is a 1-D continuous action; take action[0]
@@ -153,12 +153,14 @@ class PartitionMazeEnv(gym.Env):
                     [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
                 return state, reward, True, False, {}
             else:
-                reward = 10
-
                 # Enter phase 1: initialize the maze
                 self.phase = 1
                 state = np.concatenate(
                     [self.partition_values, np.array(self.car_pos).flatten()])
+                reward = 10
+
+                # Build a reverse index (center -> grid cell) for later lookups
+                self.reverse_rectangles = {v['center']: k for k, v in self.rectangles.items()}
                 return state, reward, False, False, {}

         elif self.phase == 1:
@@ -172,7 +174,7 @@ class PartitionMazeEnv(gym.Env):
             # Map the index to Cartesian coordinates
             coord = (target_region_index // (len(self.col_cuts) - 1),
                      target_region_index % (len(self.col_cuts) - 1))
-            self.car_pos[self.init_maze_step] = coord
+            self.car_pos[self.init_maze_step] = self.rectangles[coord]['center']
             self.car_traj[self.init_maze_step].append(coord)
             self.rectangles[coord]['is_visited'] = True

@@ -190,7 +192,8 @@ class PartitionMazeEnv(gym.Env):
         elif self.phase == 2:
             # Phase 2: path planning (walking the maze)
             current_car = self.current_car_index
-            current_row, current_col = self.car_pos[current_car]
+            # Look up the reverse index to find the grid cell the current car is in
+            current_row, current_col = self.reverse_rectangles[self.car_pos[current_car]]

             # The current action a is a 1-D continuous value mapped to four directions
             if a < 0.2:
@@ -219,7 +222,8 @@ class PartitionMazeEnv(gym.Env):
             # TODO: the move is illegal, add some penalty

             # Update the car position
-            self.car_pos[current_car] = (new_row, new_col)
+            self.car_pos[current_car] = self.rectangles[(
+                new_row, new_col)]['center']
             if new_row != current_row or new_col != current_col:
                 self.car_traj[current_car].append((new_row, new_col))
             self.step_count += 1
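Taken together, these env.py hunks change the car_pos convention: a car's position is now stored as the physical center of its rectangle (via self.rectangles[coord]['center']) rather than a grid index, and self.reverse_rectangles maps a center back to its (row, col) cell so the phase-2 maze logic can still reason on the grid. A small sketch of that two-way mapping; the rectangle data here is invented for illustration, and only the 'center' key mirrors what env.py actually stores:

# Hypothetical rectangles dict; only the 'center' key follows env.py.
rectangles = {
    (0, 0): {'center': (1.25, 2.0), 'is_visited': False},
    (0, 1): {'center': (1.25, 6.0), 'is_visited': False},
}
# Reverse index, exactly as built in the step() hunk above.
reverse_rectangles = {v['center']: k for k, v in rectangles.items()}

car_pos = rectangles[(0, 1)]['center']      # car_pos now holds the physical center
row, col = reverse_rectangles[car_pos]      # recover the grid cell for maze moves
assert (row, col) == (0, 1)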
@@ -6,7 +6,7 @@ import json
 # Fix the random seed for reproducibility
 random.seed(42)

-num_iterations = 10000
+num_iterations = 1000000

 # ---------------------------
 # Parameter settings
ray/atari_ppo.py (new file, 97 lines)

@@ -0,0 +1,97 @@
# These tags allow extracting portions of this script on Anyscale.
# ws-template-imports-start
import gymnasium as gym

from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
from ray.rllib.utils.test_utils import add_rllib_example_script_args

# ws-template-imports-end

parser = add_rllib_example_script_args(
    default_reward=float("inf"),
    default_timesteps=3000000,
    default_iters=100000000000,
)
parser.set_defaults(
    enable_new_api_stack=True,
    env="ale_py:ALE/Pong-v5",
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

NUM_LEARNERS = args.num_learners or 1
ENV = args.env


# These tags allow extracting portions of this script on Anyscale.
# ws-template-code-start
def _make_env_to_module_connector(env):
    return FrameStackingEnvToModule(num_frames=4)


def _make_learner_connector(input_observation_space, input_action_space):
    return FrameStackingLearner(num_frames=4)


# Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it).
# We would like our frame stacking connector to do this job.
def _env_creator(cfg):
    return wrap_atari_for_new_api_stack(
        gym.make(ENV, **cfg, render_mode="rgb_array"),
        # Perform frame-stacking through ConnectorV2 API.
        framestack=None,
    )


tune.register_env("env", _env_creator)

config = (
    PPOConfig()
    .environment(
        "env",
        env_config={
            # Make analogous to old v4 + NoFrameskip.
            "frameskip": 1,
            "full_action_space": False,
            "repeat_action_probability": 0.0,
        },
        clip_rewards=True,
    )
    .env_runners(
        env_to_module_connector=_make_env_to_module_connector,
    )
    .training(
        learner_connector=_make_learner_connector,
        train_batch_size_per_learner=4000,
        minibatch_size=128,
        lambda_=0.95,
        kl_coeff=0.5,
        clip_param=0.1,
        vf_clip_param=10.0,
        entropy_coeff=0.01,
        num_epochs=10,
        lr=0.00015 * NUM_LEARNERS,
        grad_clip=100.0,
        grad_clip_by="global_norm",
    )
    .rl_module(
        model_config=DefaultModelConfig(
            conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]],
            conv_activation="relu",
            head_fcnet_hiddens=[256],
            vf_share_layers=True,
        ),
    )
)
# ws-template-code-end

if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args=args)
ray/cartpole_ppo.py (new file, 32 lines)

@@ -0,0 +1,32 @@
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.utils.test_utils import add_rllib_example_script_args

parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
parser.set_defaults(enable_new_api_stack=True)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .training(
        lr=0.0003,
        num_epochs=6,
        vf_loss_coeff=0.01,
    )
    .rl_module(
        model_config=DefaultModelConfig(
            fcnet_hiddens=[32],
            fcnet_activation="linear",
            vf_share_layers=True,
        ),
    )
)


if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args)
ray/partition_maze_ppo.py (new file, 38 lines)

@@ -0,0 +1,38 @@
import gymnasium as gym
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from env import PartitionMazeEnv  # import the custom environment

# register the custom environment
gym.envs.register(
    id='PartitionMazeEnv-v0',
    entry_point='env:PartitionMazeEnv',
)

parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
parser.set_defaults(enable_new_api_stack=True)
args = parser.parse_args()

config = (
    PPOConfig()
    .environment("PartitionMazeEnv-v0")
    .training(
        lr=0.0003,
        num_epochs=6,
        vf_loss_coeff=0.01,
    )
    .rl_module(
        model_config=DefaultModelConfig(
            fcnet_hiddens=[32],
            fcnet_activation="linear",
            vf_share_layers=True,
        ),
    )
)

if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args=args)
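A note on the registration above: the entry_point string 'env:PartitionMazeEnv' follows Gymnasium's "module:attribute" convention, i.e. import module env and take the class PartitionMazeEnv from it, so env.py must be importable (for example by running the script from the repository root). A minimal sketch of how such a string resolves, for illustration only:

import importlib

entry_point = 'env:PartitionMazeEnv'
module_name, class_name = entry_point.split(':')
env_cls = getattr(importlib.import_module(module_name), class_name)
env = env_cls()   # equivalent to constructing PartitionMazeEnv() directly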