diff --git a/.gitignore b/.gitignore index 21a7ae0..4b3b308 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ __pycache__/ # Pytorch weights weights/ solutions/ +PPO_preTrained/ +PPO_logs/ # Distribution / packaging .Python diff --git a/PPO/eval_policy.py b/PPO1/eval_policy.py similarity index 100% rename from PPO/eval_policy.py rename to PPO1/eval_policy.py diff --git a/PPO/main.py b/PPO1/main.py similarity index 97% rename from PPO/main.py rename to PPO1/main.py index 3873639..117b0f4 100644 --- a/PPO/main.py +++ b/PPO1/main.py @@ -11,6 +11,9 @@ import argparse from ppo import PPO from network import FeedForwardNN from eval_policy import eval_policy +import os +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from env import PartitionMazeEnv def train(env, hyperparameters, actor_model, critic_model): diff --git a/PPO/main_test.py b/PPO1/main_test.py similarity index 97% rename from PPO/main_test.py rename to PPO1/main_test.py index bab2b28..676b365 100644 --- a/PPO/main_test.py +++ b/PPO1/main_test.py @@ -11,6 +11,9 @@ import argparse from ppo import PPO from network import FeedForwardNN from eval_policy import eval_policy +import os +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from env import PartitionMazeEnv def train(env, hyperparameters, actor_model, critic_model): diff --git a/PPO/network.py b/PPO1/network.py similarity index 100% rename from PPO/network.py rename to PPO1/network.py diff --git a/PPO/ppo.py b/PPO1/ppo.py similarity index 99% rename from PPO/ppo.py rename to PPO1/ppo.py index 3ac2047..d14ed68 100644 --- a/PPO/ppo.py +++ b/PPO1/ppo.py @@ -183,7 +183,7 @@ class PPO: ep_rews = [] # rewards collected per episode # Reset the environment. sNote that obs is short for observation. 
- obs, _ = self.env.reset() + obs = self.env.reset() done = False # Run an episode for a maximum of max_timesteps_per_episode timesteps diff --git a/PPO2/PPO.py b/PPO2/PPO.py new file mode 100644 index 0000000..c071b9b --- /dev/null +++ b/PPO2/PPO.py @@ -0,0 +1,273 @@ +import torch +import torch.nn as nn +from torch.distributions import MultivariateNormal +from torch.distributions import Categorical + +################################## set device ################################## +print("============================================================================================") +# set device to cpu or cuda +device = torch.device('cpu') +if(torch.cuda.is_available()): + device = torch.device('cuda:0') + torch.cuda.empty_cache() + print("Device set to : " + str(torch.cuda.get_device_name(device))) +else: + print("Device set to : cpu") +print("============================================================================================") + + +################################## PPO Policy ################################## +class RolloutBuffer: + def __init__(self): + self.actions = [] + self.states = [] + self.logprobs = [] + self.rewards = [] + self.state_values = [] + self.is_terminals = [] + + def clear(self): + del self.actions[:] + del self.states[:] + del self.logprobs[:] + del self.rewards[:] + del self.state_values[:] + del self.is_terminals[:] + + +class ActorCritic(nn.Module): + def __init__(self, state_dim, action_dim, has_continuous_action_space, action_std_init): + super(ActorCritic, self).__init__() + + self.has_continuous_action_space = has_continuous_action_space + + if has_continuous_action_space: + self.action_dim = action_dim + self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device) + # actor + if has_continuous_action_space : + self.actor = nn.Sequential( + nn.Linear(state_dim, 64), + nn.Tanh(), + # nn.Sigmoid(), + # nn.ReLU(), + nn.Linear(64, 64), + nn.Tanh(), + # nn.Sigmoid(), + # nn.ReLU(), + nn.Linear(64, action_dim), + nn.Tanh() + # nn.Sigmoid() + # nn.ReLU() + ) + else: + self.actor = nn.Sequential( + nn.Linear(state_dim, 64), + nn.Tanh(), + nn.Linear(64, 64), + nn.Tanh(), + nn.Linear(64, action_dim), + nn.Softmax(dim=-1) + ) + # critic + self.critic = nn.Sequential( + nn.Linear(state_dim, 64), + nn.Tanh(), + # nn.Sigmoid(), + # nn.ReLU(), + nn.Linear(64, 64), + nn.Tanh(), + # nn.Sigmoid(), + # nn.ReLU(), + nn.Linear(64, 1) + ) + + def set_action_std(self, new_action_std): + if self.has_continuous_action_space: + self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device) + else: + print("--------------------------------------------------------------------------------------------") + print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy") + print("--------------------------------------------------------------------------------------------") + + def forward(self): + raise NotImplementedError + + def act(self, state): + + if self.has_continuous_action_space: + action_mean = self.actor(state) + cov_mat = torch.diag(self.action_var).unsqueeze(dim=0) + dist = MultivariateNormal(action_mean, cov_mat) + else: + action_probs = self.actor(state) + dist = Categorical(action_probs) + + action = dist.sample() + action_logprob = dist.log_prob(action) + state_val = self.critic(state) + + return action.detach(), action_logprob.detach(), state_val.detach() + + def evaluate(self, state, action): + + if self.has_continuous_action_space: + action_mean = self.actor(state) + + 
action_var = self.action_var.expand_as(action_mean) + cov_mat = torch.diag_embed(action_var).to(device) + dist = MultivariateNormal(action_mean, cov_mat) + + # For Single Action Environments. + if self.action_dim == 1: + action = action.reshape(-1, self.action_dim) + else: + action_probs = self.actor(state) + dist = Categorical(action_probs) + action_logprobs = dist.log_prob(action) + dist_entropy = dist.entropy() + state_values = self.critic(state) + + return action_logprobs, state_values, dist_entropy + + +class PPO: + def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6): + + self.has_continuous_action_space = has_continuous_action_space + + if has_continuous_action_space: + self.action_std = action_std_init + + self.gamma = gamma + self.eps_clip = eps_clip + self.K_epochs = K_epochs + + self.buffer = RolloutBuffer() + + self.policy = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device) + self.optimizer = torch.optim.Adam([ + {'params': self.policy.actor.parameters(), 'lr': lr_actor}, + {'params': self.policy.critic.parameters(), 'lr': lr_critic} + ]) + + self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device) + self.policy_old.load_state_dict(self.policy.state_dict()) + + self.MseLoss = nn.MSELoss() + + def set_action_std(self, new_action_std): + if self.has_continuous_action_space: + self.action_std = new_action_std + self.policy.set_action_std(new_action_std) + self.policy_old.set_action_std(new_action_std) + else: + print("--------------------------------------------------------------------------------------------") + print("WARNING : Calling PPO::set_action_std() on discrete action space policy") + print("--------------------------------------------------------------------------------------------") + + def decay_action_std(self, action_std_decay_rate, min_action_std): + print("--------------------------------------------------------------------------------------------") + if self.has_continuous_action_space: + self.action_std = self.action_std - action_std_decay_rate + self.action_std = round(self.action_std, 4) + if (self.action_std <= min_action_std): + self.action_std = min_action_std + print("setting actor output action_std to min_action_std : ", self.action_std) + else: + print("setting actor output action_std to : ", self.action_std) + self.set_action_std(self.action_std) + + else: + print("WARNING : Calling PPO::decay_action_std() on discrete action space policy") + print("--------------------------------------------------------------------------------------------") + + def select_action(self, state): + + if self.has_continuous_action_space: + with torch.no_grad(): + state = torch.FloatTensor(state).to(device) + action, action_logprob, state_val = self.policy_old.act(state) + + self.buffer.states.append(state) + self.buffer.actions.append(action) + self.buffer.logprobs.append(action_logprob) + self.buffer.state_values.append(state_val) + + return action.detach().cpu().numpy().flatten() + else: + with torch.no_grad(): + state = torch.FloatTensor(state).to(device) + action, action_logprob, state_val = self.policy_old.act(state) + + self.buffer.states.append(state) + self.buffer.actions.append(action) + self.buffer.logprobs.append(action_logprob) + self.buffer.state_values.append(state_val) + + return action.item() + + def update(self): + # Monte Carlo estimate of returns + rewards = [] + 
discounted_reward = 0 + for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)): + if is_terminal: + discounted_reward = 0 + discounted_reward = reward + (self.gamma * discounted_reward) + rewards.insert(0, discounted_reward) + + # Normalizing the rewards + rewards = torch.tensor(rewards, dtype=torch.float32).to(device) + rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7) + + # convert list to tensor + old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device) + old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device) + old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device) + old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device) + + # calculate advantages + advantages = rewards.detach() - old_state_values.detach() + + # Optimize policy for K epochs + for _ in range(self.K_epochs): + + # Evaluating old actions and values + logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions) + + # match state_values tensor dimensions with rewards tensor + state_values = torch.squeeze(state_values) + + # Finding the ratio (pi_theta / pi_theta__old) + ratios = torch.exp(logprobs - old_logprobs.detach()) + + # Finding Surrogate Loss + surr1 = ratios * advantages + surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages + + # final loss of clipped objective PPO + loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy + + # take gradient step + self.optimizer.zero_grad() + loss.mean().backward() + self.optimizer.step() + + # Copy new weights into old policy + self.policy_old.load_state_dict(self.policy.state_dict()) + + # clear buffer + self.buffer.clear() + + def save(self, checkpoint_path): + torch.save(self.policy_old.state_dict(), checkpoint_path) + + def load(self, checkpoint_path): + self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage)) + self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage)) + + + + + diff --git a/PPO2/plot_graph.py b/PPO2/plot_graph.py new file mode 100644 index 0000000..5496a90 --- /dev/null +++ b/PPO2/plot_graph.py @@ -0,0 +1,142 @@ +import os +import pandas as pd +import matplotlib.pyplot as plt + + +def save_graph(): + print("============================================================================================") + # env_name = 'CartPole-v1' + # env_name = 'LunarLander-v2' + # env_name = 'BipedalWalker-v2' + env_name = 'RoboschoolWalker2d-v1' + + fig_num = 0 #### change this to prevent overwriting figures in same env_name folder + plot_avg = True # plot average of all runs; else plot all runs separately + fig_width = 10 + fig_height = 6 + + # smooth out rewards to get a smooth and a less smooth (var) plot lines + window_len_smooth = 20 + min_window_len_smooth = 1 + linewidth_smooth = 1.5 + alpha_smooth = 1 + + window_len_var = 5 + min_window_len_var = 1 + linewidth_var = 2 + alpha_var = 0.1 + + colors = ['red', 'blue', 'green', 'orange', 'purple', 'olive', 'brown', 'magenta', 'cyan', 'crimson','gray', 'black'] + + # make directory for saving figures + figures_dir = "PPO_figs" + if not os.path.exists(figures_dir): + os.makedirs(figures_dir) + + # make environment directory for saving figures + figures_dir = figures_dir + '/' + env_name + '/' + if not os.path.exists(figures_dir): + 
os.makedirs(figures_dir) + + fig_save_path = figures_dir + '/PPO_' + env_name + '_fig_' + str(fig_num) + '.png' + + # get number of log files in directory + log_dir = "PPO_logs" + '/' + env_name + '/' + + current_num_files = next(os.walk(log_dir))[2] + num_runs = len(current_num_files) + + all_runs = [] + + for run_num in range(num_runs): + + log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv" + print("loading data from : " + log_f_name) + data = pd.read_csv(log_f_name) + data = pd.DataFrame(data) + + print("data shape : ", data.shape) + + all_runs.append(data) + print("--------------------------------------------------------------------------------------------") + + ax = plt.gca() + + if plot_avg: + # average all runs + df_concat = pd.concat(all_runs) + df_concat_groupby = df_concat.groupby(df_concat.index) + data_avg = df_concat_groupby.mean() + + # smooth out rewards to get a smooth and a less smooth (var) plot lines + data_avg['reward_smooth'] = data_avg['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean() + data_avg['reward_var'] = data_avg['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean() + + data_avg.plot(kind='line', x='timestep' , y='reward_smooth',ax=ax,color=colors[0], linewidth=linewidth_smooth, alpha=alpha_smooth) + data_avg.plot(kind='line', x='timestep' , y='reward_var',ax=ax,color=colors[0], linewidth=linewidth_var, alpha=alpha_var) + + # keep only reward_smooth in the legend and rename it + handles, labels = ax.get_legend_handles_labels() + ax.legend([handles[0]], ["reward_avg_" + str(len(all_runs)) + "_runs"], loc=2) + + else: + for i, run in enumerate(all_runs): + # smooth out rewards to get a smooth and a less smooth (var) plot lines + run['reward_smooth_' + str(i)] = run['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean() + run['reward_var_' + str(i)] = run['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean() + + # plot the lines + run.plot(kind='line', x='timestep' , y='reward_smooth_' + str(i),ax=ax,color=colors[i % len(colors)], linewidth=linewidth_smooth, alpha=alpha_smooth) + run.plot(kind='line', x='timestep' , y='reward_var_' + str(i),ax=ax,color=colors[i % len(colors)], linewidth=linewidth_var, alpha=alpha_var) + + # keep alternate elements (reward_smooth_i) in the legend + handles, labels = ax.get_legend_handles_labels() + new_handles = [] + new_labels = [] + for i in range(len(handles)): + if(i%2 == 0): + new_handles.append(handles[i]) + new_labels.append(labels[i]) + ax.legend(new_handles, new_labels, loc=2) + + # ax.set_yticks(np.arange(0, 1800, 200)) + # ax.set_xticks(np.arange(0, int(4e6), int(5e5))) + + ax.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2) + + ax.set_xlabel("Timesteps", fontsize=12) + ax.set_ylabel("Rewards", fontsize=12) + + plt.title(env_name, fontsize=14) + + fig = plt.gcf() + fig.set_size_inches(fig_width, fig_height) + + print("============================================================================================") + plt.savefig(fig_save_path) + print("figure saved at : ", fig_save_path) + print("============================================================================================") + + plt.show() + + +if __name__ == '__main__': + + save_graph() + + + + + + + + + + + + + + + + + diff --git a/PPO2/test.py b/PPO2/test.py new file mode 100644 index 0000000..d6306f5 --- /dev/null +++ 
b/PPO2/test.py @@ -0,0 +1,123 @@ +import os +import glob +import time +from datetime import datetime + +import torch +import numpy as np + +# import gym +# import roboschool + +from PPO import PPO +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from env import PartitionMazeEnv + +#################################### Testing ################################### +def test(): + print("============================================================================================") + + ################## hyperparameters ################## + + # env_name = "CartPole-v1" + # has_continuous_action_space = False + # max_ep_len = 400 + # action_std = None + + # env_name = "LunarLander-v2" + # has_continuous_action_space = False + # max_ep_len = 300 + # action_std = None + + # env_name = "BipedalWalker-v2" + # has_continuous_action_space = True + # max_ep_len = 1500 # max timesteps in one episode + # action_std = 0.1 # set same std for action distribution which was used while saving + + env_name = "test" + has_continuous_action_space = True + max_ep_len = 1000 # max timesteps in one episode + action_std = 0.1 # set same std for action distribution which was used while saving + + render = True # render environment on screen + frame_delay = 0 # if required; add delay b/w frames + + total_test_episodes = 10 # total num of testing episodes + + K_epochs = 80 # update policy for K epochs + eps_clip = 0.2 # clip parameter for PPO + gamma = 0.99 # discount factor + + lr_actor = 0.0003 # learning rate for actor + lr_critic = 0.001 # learning rate for critic + + ##################################################### + + # env = gym.make(env_name) + env = PartitionMazeEnv() + + # state space dimension + state_dim = env.observation_space.shape[0] + + # action space dimension + if has_continuous_action_space: + action_dim = env.action_space.shape[0] + else: + action_dim = env.action_space.n + + # initialize a PPO agent + ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std) + + # preTrained weights directory + + random_seed = 0 #### set this to load a particular checkpoint trained on random seed + run_num_pretrained = 0 #### set this to load a particular checkpoint num + + directory = "PPO_preTrained" + '/' + env_name + '/' + checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained) + print("loading network from : " + checkpoint_path) + + ppo_agent.load(checkpoint_path) + + print("--------------------------------------------------------------------------------------------") + + test_running_reward = 0 + + for ep in range(1, total_test_episodes+1): + ep_reward = 0 + state = env.reset() + + for t in range(1, max_ep_len+1): + action = ppo_agent.select_action(state) + state, reward, done, _, _ = env.step(action) + ep_reward += reward + + if render: + env.render() + time.sleep(frame_delay) + + if done: + break + + # clear buffer + ppo_agent.buffer.clear() + + test_running_reward += ep_reward + print('Episode: {} \t\t Reward: {}'.format(ep, round(ep_reward, 2))) + ep_reward = 0 + + env.close() + + print("============================================================================================") + + avg_test_reward = test_running_reward / total_test_episodes + avg_test_reward = round(avg_test_reward, 2) + print("average test reward : " + str(avg_test_reward)) + + print("============================================================================================") + 
+ +if __name__ == '__main__': + + test() diff --git a/PPO2/train.py b/PPO2/train.py new file mode 100644 index 0000000..5601a4c --- /dev/null +++ b/PPO2/train.py @@ -0,0 +1,266 @@ +import os +import glob +import time +from datetime import datetime + +import torch +import numpy as np + +# import gym +# import roboschool +import gymnasium as gym + +from PPO import PPO +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from env import PartitionMazeEnv + +################################### Training ################################### +def train(): + print("============================================================================================") + + ####### initialize environment hyperparameters ###### + env_name = "test" + + has_continuous_action_space = True # continuous action space; else discrete + + max_ep_len = 100 # max timesteps in one episode + max_training_timesteps = int(3e8) # break training loop if timeteps > max_training_timesteps + + print_freq = max_ep_len * 10 # print avg reward in the interval (in num timesteps) + log_freq = max_ep_len * 2 # log avg reward in the interval (in num timesteps) + save_model_freq = int(1e5) # save model frequency (in num timesteps) + + action_std = 0.6 # starting std for action distribution (Multivariate Normal) + action_std_decay_rate = 0.05 # linearly decay action_std (action_std = action_std - action_std_decay_rate) + min_action_std = 0.1 # minimum action_std (stop decay after action_std <= min_action_std) + action_std_decay_freq = int(2.5e5) # action_std decay frequency (in num timesteps) + ##################################################### + + ## Note : print/log frequencies should be > than max_ep_len + + ################ PPO hyperparameters ################ + update_timestep = max_ep_len * 4 # update policy every n timesteps + K_epochs = 80 # update policy for K epochs in one PPO update + + eps_clip = 0.2 # clip parameter for PPO + gamma = 0.99 # discount factor + + lr_actor = 0.0003 # learning rate for actor network + lr_critic = 0.001 # learning rate for critic network + + random_seed = 0 # set random seed if required (0 = no random seed) + ##################################################### + + print("training environment name : " + env_name) + + # env = gym.make(env_name) + env = PartitionMazeEnv() + + # state space dimension + state_dim = env.observation_space.shape[0] + + # action space dimension + if has_continuous_action_space: + action_dim = env.action_space.shape[0] + else: + action_dim = env.action_space.n + + ###################### logging ###################### + + #### log files for multiple runs are NOT overwritten + log_dir = "PPO_logs" + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + log_dir = log_dir + '/' + env_name + '/' + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + #### get number of log files in log directory + run_num = 0 + current_num_files = next(os.walk(log_dir))[2] + run_num = len(current_num_files) + + #### create new log file for each run + log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv" + + print("current logging run number for " + env_name + " : ", run_num) + print("logging at : " + log_f_name) + ##################################################### + + ################### checkpointing ################### + run_num_pretrained = 0 #### change this to prevent overwriting weights in same env_name folder + + directory = "PPO_preTrained" + if not os.path.exists(directory): + os.makedirs(directory) + + directory 
= directory + '/' + env_name + '/' + if not os.path.exists(directory): + os.makedirs(directory) + + + checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained) + print("save checkpoint path : " + checkpoint_path) + ##################################################### + + + ############# print all hyperparameters ############# + print("--------------------------------------------------------------------------------------------") + print("max training timesteps : ", max_training_timesteps) + print("max timesteps per episode : ", max_ep_len) + print("model saving frequency : " + str(save_model_freq) + " timesteps") + print("log frequency : " + str(log_freq) + " timesteps") + print("printing average reward over episodes in last : " + str(print_freq) + " timesteps") + print("--------------------------------------------------------------------------------------------") + print("state space dimension : ", state_dim) + print("action space dimension : ", action_dim) + print("--------------------------------------------------------------------------------------------") + if has_continuous_action_space: + print("Initializing a continuous action space policy") + print("--------------------------------------------------------------------------------------------") + print("starting std of action distribution : ", action_std) + print("decay rate of std of action distribution : ", action_std_decay_rate) + print("minimum std of action distribution : ", min_action_std) + print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps") + else: + print("Initializing a discrete action space policy") + print("--------------------------------------------------------------------------------------------") + print("PPO update frequency : " + str(update_timestep) + " timesteps") + print("PPO K epochs : ", K_epochs) + print("PPO epsilon clip : ", eps_clip) + print("discount factor (gamma) : ", gamma) + print("--------------------------------------------------------------------------------------------") + print("optimizer learning rate actor : ", lr_actor) + print("optimizer learning rate critic : ", lr_critic) + if random_seed: + print("--------------------------------------------------------------------------------------------") + print("setting random seed to ", random_seed) + torch.manual_seed(random_seed) + env.seed(random_seed) + np.random.seed(random_seed) + ##################################################### + + print("============================================================================================") + + ################# training procedure ################ + + # initialize a PPO agent + ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std) + + # track total training time + start_time = datetime.now().replace(microsecond=0) + print("Started training at (GMT) : ", start_time) + + print("============================================================================================") + + # logging file + log_f = open(log_f_name,"w+") + log_f.write('episode,timestep,reward\n') + + # printing and logging variables + print_running_reward = 0 + print_running_episodes = 0 + + log_running_reward = 0 + log_running_episodes = 0 + + time_step = 0 + i_episode = 0 + + # training loop + while time_step <= max_training_timesteps: + + state = env.reset() + current_ep_reward = 0 + + for t in range(1, max_ep_len+1): + + # select action with policy + action = 
ppo_agent.select_action(state) + state, reward, done, _, _ = env.step(action) + + # saving reward and is_terminals + ppo_agent.buffer.rewards.append(reward) + ppo_agent.buffer.is_terminals.append(done) + + time_step +=1 + current_ep_reward += reward + + # update PPO agent + if time_step % update_timestep == 0: + ppo_agent.update() + + # if continuous action space; then decay action std of ouput action distribution + if has_continuous_action_space and time_step % action_std_decay_freq == 0: + ppo_agent.decay_action_std(action_std_decay_rate, min_action_std) + + # log in logging file + if time_step % log_freq == 0: + + # log average reward till last episode + log_avg_reward = log_running_reward / log_running_episodes + log_avg_reward = round(log_avg_reward, 4) + + log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward)) + log_f.flush() + + log_running_reward = 0 + log_running_episodes = 0 + + # printing average reward + if time_step % print_freq == 0: + + # print average reward till last episode + print_avg_reward = print_running_reward / print_running_episodes + print_avg_reward = round(print_avg_reward, 2) + + print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward)) + + print_running_reward = 0 + print_running_episodes = 0 + + # save model weights + if time_step % save_model_freq == 0: + print("--------------------------------------------------------------------------------------------") + print("saving model at : " + checkpoint_path) + ppo_agent.save(checkpoint_path) + print("model saved") + print("Elapsed Time : ", datetime.now().replace(microsecond=0) - start_time) + print("--------------------------------------------------------------------------------------------") + + # break; if the episode is over + if done: + break + + print_running_reward += current_ep_reward + print_running_episodes += 1 + + log_running_reward += current_ep_reward + log_running_episodes += 1 + + i_episode += 1 + + log_f.close() + env.close() + + # print total training time + print("============================================================================================") + end_time = datetime.now().replace(microsecond=0) + print("Started training at (GMT) : ", start_time) + print("Finished training at (GMT) : ", end_time) + print("Total training time : ", end_time - start_time) + print("============================================================================================") + + +if __name__ == '__main__': + + train() + + + + + + + diff --git a/PPO/env.py b/env.py similarity index 94% rename from PPO/env.py rename to env.py index 08f5861..b5a54c4 100644 --- a/PPO/env.py +++ b/env.py @@ -64,7 +64,7 @@ class PartitionMazeEnv(gym.Env): self.BASE_LINE = 3500.0 # 基准时间,通过greedy或者蒙特卡洛计算出来 self.step_count = 0 self.rectangles = {} - self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)] + self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)] self.car_traj = [[] for _ in range(self.num_cars)] self.current_car_index = 0 @@ -79,13 +79,13 @@ class PartitionMazeEnv(gym.Env): self.region_centers = [] self.step_count = 0 self.rectangles = {} - self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)] + self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)] self.car_traj = [[] for _ in range(self.num_cars)] self.current_car_index = 0 # 状态:前 4 维为 partition_values,其余补 0 state = np.concatenate( [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]) - return state, {} + return 
state def step(self, action): # 在所有阶段动作均为 1 维连续动作,取 action[0] @@ -153,12 +153,14 @@ class PartitionMazeEnv(gym.Env): [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]) return state, reward, True, False, {} else: - reward = 10 - # 进入阶段 1:初始化迷宫 self.phase = 1 state = np.concatenate( [self.partition_values, np.array(self.car_pos).flatten()]) + reward = 10 + + # 构建反向索引,方便后续计算 + self.reverse_rectangles = {v['center']: k for k, v in self.rectangles.items()} return state, reward, False, False, {} elif self.phase == 1: @@ -172,7 +174,7 @@ class PartitionMazeEnv(gym.Env): # 将index映射到笛卡尔坐标 coord = (target_region_index // (len(self.col_cuts) - 1), target_region_index % (len(self.col_cuts) - 1)) - self.car_pos[self.init_maze_step] = coord + self.car_pos[self.init_maze_step] = self.rectangles[coord]['center'] self.car_traj[self.init_maze_step].append(coord) self.rectangles[coord]['is_visited'] = True @@ -190,7 +192,8 @@ class PartitionMazeEnv(gym.Env): elif self.phase == 2: # 阶段 2:路径规划(走迷宫) current_car = self.current_car_index - current_row, current_col = self.car_pos[current_car] + # 查表,找出当前车辆所在的网格 + current_row, current_col = self.reverse_rectangles[self.car_pos[current_car]] # 当前动作 a 为 1 维连续动作,映射到四个方向 if a < 0.2: @@ -219,7 +222,8 @@ class PartitionMazeEnv(gym.Env): # TODO 移动不合法,加一些惩罚 # 更新车辆位置 - self.car_pos[current_car] = (new_row, new_col) + self.car_pos[current_car] = self.rectangles[( + new_row, new_col)]['center'] if new_row != current_row or new_col != current_col: self.car_traj[current_car].append((new_row, new_col)) self.step_count += 1 diff --git a/mtkl_sovler.py b/mtkl_sovler.py index 8526546..102b595 100644 --- a/mtkl_sovler.py +++ b/mtkl_sovler.py @@ -6,7 +6,7 @@ import json # 固定随机种子,便于复现 random.seed(42) -num_iterations = 10000 +num_iterations = 1000000 # --------------------------- # 参数设置 diff --git a/ray/atari_ppo.py b/ray/atari_ppo.py new file mode 100644 index 0000000..1f9b268 --- /dev/null +++ b/ray/atari_ppo.py @@ -0,0 +1,97 @@ +# These tags allow extracting portions of this script on Anyscale. +# ws-template-imports-start +import gymnasium as gym + +from ray import tune +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule +from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +# ws-template-imports-end + +parser = add_rllib_example_script_args( + default_reward=float("inf"), + default_timesteps=3000000, + default_iters=100000000000, +) +parser.set_defaults( + enable_new_api_stack=True, + env="ale_py:ALE/Pong-v5", +) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. +args = parser.parse_args() + +NUM_LEARNERS = args.num_learners or 1 +ENV = args.env + + +# These tags allow extracting portions of this script on Anyscale. +# ws-template-code-start +def _make_env_to_module_connector(env): + return FrameStackingEnvToModule(num_frames=4) + + +def _make_learner_connector(input_observation_space, input_action_space): + return FrameStackingLearner(num_frames=4) + + +# Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it). +# We would like our frame stacking connector to do this job. 
+def _env_creator(cfg): + return wrap_atari_for_new_api_stack( + gym.make(ENV, **cfg, render_mode="rgb_array"), + # Perform frame-stacking through ConnectorV2 API. + framestack=None, + ) + + +tune.register_env("env", _env_creator) + +config = ( + PPOConfig() + .environment( + "env", + env_config={ + # Make analogous to old v4 + NoFrameskip. + "frameskip": 1, + "full_action_space": False, + "repeat_action_probability": 0.0, + }, + clip_rewards=True, + ) + .env_runners( + env_to_module_connector=_make_env_to_module_connector, + ) + .training( + learner_connector=_make_learner_connector, + train_batch_size_per_learner=4000, + minibatch_size=128, + lambda_=0.95, + kl_coeff=0.5, + clip_param=0.1, + vf_clip_param=10.0, + entropy_coeff=0.01, + num_epochs=10, + lr=0.00015 * NUM_LEARNERS, + grad_clip=100.0, + grad_clip_by="global_norm", + ) + .rl_module( + model_config=DefaultModelConfig( + conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]], + conv_activation="relu", + head_fcnet_hiddens=[256], + vf_share_layers=True, + ), + ) +) +# ws-template-code-end + +if __name__ == "__main__": + from ray.rllib.utils.test_utils import run_rllib_example_script_experiment + + run_rllib_example_script_experiment(config, args=args) \ No newline at end of file diff --git a/ray/cartpole_ppo.py b/ray/cartpole_ppo.py new file mode 100644 index 0000000..65a91c8 --- /dev/null +++ b/ray/cartpole_ppo.py @@ -0,0 +1,32 @@ +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig +from ray.rllib.utils.test_utils import add_rllib_example_script_args + +parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000) +parser.set_defaults(enable_new_api_stack=True) +# Use `parser` to add your own custom command line options to this script +# and (if needed) use their values to set up `config` below. 
+args = parser.parse_args()
+
+config = (
+    PPOConfig()
+    .environment("CartPole-v1")
+    .training(
+        lr=0.0003,
+        num_epochs=6,
+        vf_loss_coeff=0.01,
+    )
+    .rl_module(
+        model_config=DefaultModelConfig(
+            fcnet_hiddens=[32],
+            fcnet_activation="linear",
+            vf_share_layers=True,
+        ),
+    )
+)
+
+
+if __name__ == "__main__":
+    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment
+
+    run_rllib_example_script_experiment(config, args)
\ No newline at end of file
diff --git a/ray/partition_maze_ppo.py b/ray/partition_maze_ppo.py
new file mode 100644
index 0000000..595d679
--- /dev/null
+++ b/ray/partition_maze_ppo.py
@@ -0,0 +1,38 @@
+import gymnasium as gym
+from ray import tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.utils.test_utils import add_rllib_example_script_args
+from env import PartitionMazeEnv  # import the custom environment
+
+# Register the custom environment
+gym.envs.register(
+    id='PartitionMazeEnv-v0',
+    entry_point='env:PartitionMazeEnv',
+)
+
+parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
+parser.set_defaults(enable_new_api_stack=True)
+args = parser.parse_args()
+
+config = (
+    PPOConfig()
+    .environment("PartitionMazeEnv-v0")
+    .training(
+        lr=0.0003,
+        num_epochs=6,
+        vf_loss_coeff=0.01,
+    )
+    .rl_module(
+        model_config=DefaultModelConfig(
+            fcnet_hiddens=[32],
+            fcnet_activation="linear",
+            vf_share_layers=True,
+        ),
+    )
+)
+
+if __name__ == "__main__":
+    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment
+
+    run_rllib_example_script_experiment(config, args=args)
\ No newline at end of file
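
Across the patch, the root-level env.py becomes the single source for PartitionMazeEnv: reset() now returns only the observation, step() returns the 5-tuple (state, reward, done, truncated, info), and the PPO1, PPO2 and Ray entry points all import it from the repository root. The sketch below is a hypothetical smoke_test_env.py (not part of the patch) that exercises exactly that interface with random actions; the file name and the rollout length are illustrative assumptions, and it relies only on the patched return signatures shown above.

# smoke_test_env.py -- hypothetical helper at the repository root; not part of this patch.
# Exercises the PartitionMazeEnv interface that PPO1/main.py, PPO2/train.py and
# PPO2/test.py rely on after this change:
#   * reset() returns only the observation (no info dict)
#   * step() returns (state, reward, done, truncated, info)
from env import PartitionMazeEnv

env = PartitionMazeEnv()

state = env.reset()                      # patched reset(): observation only
print("observation shape:", state.shape, "action shape:", env.action_space.shape)

total_reward = 0.0
for _ in range(20):
    action = env.action_space.sample()   # random 1-D continuous action; the env reads action[0]
    state, reward, done, truncated, info = env.step(action)
    total_reward += reward
    if done:
        state = env.reset()

env.close()
print("random-rollout return over 20 steps:", total_reward)

Run it from the repository root (python smoke_test_env.py) so the root-level env.py is importable without the sys.path insertion the PPO scripts add for their own subdirectories.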