diff --git a/.gitignore b/.gitignore
index 5d381cc..07940b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,9 @@ __pycache__/
 # C extensions
 *.so
 
+# PyTorch weights
+weights/
+
 # Distribution / packaging
 .Python
 build/
diff --git a/PPO/arguments.py b/PPO/arguments.py
new file mode 100644
index 0000000..9a8e17e
--- /dev/null
+++ b/PPO/arguments.py
@@ -0,0 +1,27 @@
+"""
+    This file contains the arguments to parse at command line.
+    File main.py will call get_args, which returns the parsed arguments.
+"""
+import argparse
+
+def get_args():
+    """
+        Description:
+            Parses arguments at command line.
+
+        Parameters:
+            None
+
+        Return:
+            args - the arguments parsed
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--mode', dest='mode', type=str, default='train')              # can be 'train' or 'test'
+    parser.add_argument('--actor_model', dest='actor_model', type=str, default='')     # your actor model filename
+    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')   # your critic model filename
+
+    args = parser.parse_args()
+
+    return args
diff --git a/PPO/env.py b/PPO/env.py
new file mode 100644
index 0000000..51e2e9e
--- /dev/null
+++ b/PPO/env.py
@@ -0,0 +1,295 @@
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+
+
+class PartitionMazeEnv(gym.Env):
+    """
+    Custom environment with staged decisions:
+    Phase 0: region partitioning (4 steps; each step outputs one scalar that fixes a vertical or horizontal cut).
+             The order is: step 1 outputs c₁, step 2 outputs c₂, step 3 outputs r₁, step 4 outputs r₂.
+             After discretization the values are restricted to {0, 0.1, 0.2, …, 0.9} (0 means no cut).
+    Phase 1: each vehicle is dispatched from the region center to the center of a chosen sub-region.
+    Phase 2: path planning (maze walking): vehicles move up/down/left/right over the grid of sub-regions
+             until every target cell is covered or the step limit is reached.
+    """
+
+    def __init__(self, config=None):
+        super(PartitionMazeEnv, self).__init__()
+        # Fleet parameters
+        self.H = 20  # region height; grid points are 25 m apart (one unit of distance)
+        self.W = 30  # region width
+        self.num_cars = 2  # number of car-nest-drone systems
+
+        # Time coefficients (seconds; one photo per grid cell)
+        self.flight_time_factor = 3  # flight time per photo; the UAV flies at 9.5 m/s and shoots every 3 s
+        self.comp_uav_factor = 5  # on-board computation time per photo, 5 s
+        self.trans_time_factor = 0.3  # transmission time per photo, 0.3 s
+        self.car_move_time_factor = 2 * 50  # TODO car travel time per unit distance, 2 s, with an added scaling factor
+        self.comp_bs_factor = 5  # computation time per photo at the nest (base station)
+
+        # Energy parameters
+        self.flight_energy_factor = 0.05  # unit: minutes per photo
+        self.comp_energy_factor = 0.05  # computation energy still needs to be re-estimated
+        self.trans_energy_factor = 0.0025
+        self.battery_capacity = 10  # flight-only endurance is 30 minutes
+
+        self.phase = 0  # phase control: 0 = partitioning, 1 = maze initialization, 2 = maze walking
+        self.partition_step = 0  # partitioning step counter, range 0~4
+        # TODO the number of cuts is currently fixed at 4 (2 + 2)
+        self.partition_values = np.zeros(
+            4, dtype=np.float32)  # stores c₁, c₂, r₁, r₂
+
+        # Action space: every action is a 1-D continuous value in [0, 1]
+        self.action_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+
+        # Observation space: an 8-D vector
+        # TODO the returned state currently only contains position coordinates
+        # Phase 0 state: the first 4 dims are the cut values decided so far (undecided entries are 0)
+        # Phase 1 state: vehicle positions (2-D each)
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(8,), dtype=np.float32)
+
+        # Partitioning-phase variables
+        self.vertical_cuts = []  # vertical cut positions (c₁, c₂); 0 means no cut
+        self.horizontal_cuts = []  # horizontal cut positions (r₁, r₂)
+        # TODO can region_centers be simplified to use fewer parameters?
+        self.region_centers = []  # centers of the sub-regions after partitioning (normalized coordinates)
+
+        # Path-planning-phase variables
+        self.MAX_STEPS = 50  # step limit for the maze phase
+        self.step_count = 0
+        self.rectangles = {}
+        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
+        self.car_traj = [[] for _ in range(self.num_cars)]
+        self.current_car_index = 0
+
+    def reset(self, seed=None, options=None):
+        # Reset all variables and return to the partitioning phase (phase 0)
+        self.phase = 0
+        self.partition_step = 0
+        self.partition_values = np.zeros(4, dtype=np.float32)
+        self.vertical_cuts = []
+        self.horizontal_cuts = []
+        self.region_centers = []
+        self.step_count = 0
+        self.rectangles = {}
+        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
+        self.car_traj = [[] for _ in range(self.num_cars)]
+        self.current_car_index = 0
+        # State: the first 4 dims are partition_values, the rest are padded with zeros
+        state = np.concatenate(
+            [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        return state, {}
+
+    def step(self, action):
+        # In every phase the action is a 1-D continuous value; take action[0]
+        a = float(action[0])
+
+        if self.phase == 0:
+            # Partitioning phase: each step outputs one scalar, discretized to {0, 0.1, ..., 0.9}
+            disc_val = np.floor(a * 10) / 10.0
+            disc_val = np.clip(disc_val, 0.0, 0.9)
+            self.partition_values[self.partition_step] = disc_val
+            self.partition_step += 1
+
+            # Build the current state: the first partition_step entries are the decided cut values,
+            # the rest are 0, padded with zeros in place of the car positions
+            state = np.concatenate(
+                [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+
+            # If fewer than 4 steps have been taken we stay in the partitioning phase: no reward, done is False
+            if self.partition_step < 4:
+                return state, 0.0, False, False, {}
+            else:
+                # After 4 steps, compute the cut boundaries:
+                # filter out zeros, deduplicate and sort
+                vert = sorted(set(v for v in self.partition_values[:len(
+                    self.partition_values) // 2] if v > 0))
+                horiz = sorted(set(v for v in self.partition_values[len(
+                    self.partition_values) // 2:] if v > 0))
+                self.vertical_cuts = vert if vert else []
+                self.horizontal_cuts = horiz if horiz else []
+
+                # Boundaries always include 0 and 1
+                v_boundaries = [0.0] + self.vertical_cuts + [1.0]
+                h_boundaries = [0.0] + self.horizontal_cuts + [1.0]
+
+                # Check whether the partition is feasible and compute the offloading ratio ρ of each sub-region
+                valid_partition = True
+                for i in range(len(h_boundaries) - 1):
+                    for j in range(len(v_boundaries) - 1):
+                        d = (v_boundaries[j+1] - v_boundaries[j]) * self.W * \
+                            (h_boundaries[i+1] - h_boundaries[i]) * self.H
+                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
+                            (self.comp_uav_factor - self.trans_time_factor)
+                        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
+                            (self.comp_energy_factor * d -
+                             self.trans_energy_factor * d)
+                        if rho_energy_limit < 0:
+                            valid_partition = False
+                            break
+                        rho = min(rho_time_limit, rho_energy_limit)
+
+                        flight_time = self.flight_time_factor * d
+                        comp_time = self.comp_uav_factor * rho * d
+                        trans_time = self.trans_time_factor * (1 - rho) * d
+                        comp_bs_time = self.comp_bs_factor * (1 - rho) * d
+
+                        self.rectangles[(i, j)] = {
+                            # 'r1': h_boundaries[i], 'r2': h_boundaries[i+1], 'c1': v_boundaries[j], 'c2': v_boundaries[j+1],
+                            'd': d,
+                            'rho': rho,
+                            'flight_time': flight_time,
+                            'comp_time': comp_time,
+                            'trans_time': trans_time,
+                            'comp_bs_time': comp_bs_time,
+                            'is_visited': False
+                            # 'center': (center_r, center_c)
+                        }
+                    if not valid_partition:
+                        break
+
+                if not valid_partition:
+                    reward = -100
+                    state = np.concatenate(
+                        [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    return state, reward, True, False, {}
+                else:
+                    reward = 10
+
+                    # Enter phase 1: maze initialization
+                    self.phase = 1
+                    # Compute the center of every sub-region from the cut boundaries
+                    self.region_centers = []
+                    for i in range(len(h_boundaries) - 1):
+                        for j in range(len(v_boundaries) - 1):
+                            center_x = (
+                                v_boundaries[j] + v_boundaries[j+1]) / 2.0
+                            center_y = (
+                                h_boundaries[i] + h_boundaries[i+1]) / 2.0
+                            self.region_centers.append((center_x, center_y))
+                    # Store the cut boundaries for later grid mapping
+                    self.v_boundaries = v_boundaries
+                    self.h_boundaries = h_boundaries
+                    # Initialize the maze phase: reset the step counter and build visited_grid with one entry per cell
+                    self.step_count = 0
+                    self.visited_grid = np.zeros(
+                        (len(v_boundaries) - 1) * (len(h_boundaries) - 1), dtype=np.int32)
+
+                    state = np.concatenate(
+                        [self.partition_values, np.array(self.car_pos).flatten()])
+                    return state, reward, False, False, {}
+
+        elif self.phase == 1:
+            # Phase 1: initialize the maze by sending every vehicle from the region center
+            # to the center of a chosen sub-region
+            # Make sure the action is in [0, 1], then map it to an index in 0~(num_regions-1)
+            num_regions = len(self.region_centers)
+            target_region_index = int(np.floor(a * num_regions))
+            target_region_index = np.clip(
+                target_region_index, 0, num_regions - 1)
+
+            # Move every vehicle to the target sub-region in turn
+            for car_idx in range(self.num_cars):
+                target_position = np.array(
+                    self.region_centers[target_region_index])  # center of the target region
+
+                # Update this vehicle's position
+                self.car_pos[car_idx] = target_position
+                # Accumulate steps
+                self.step_count += 1
+                self.car_traj[car_idx].append(target_position)  # record each vehicle's trajectory
+
+            # Enter phase 2: maze walking
+            self.phase = 2
+
+            # Observation
+            state = np.concatenate(
+                [self.partition_values, np.array(self.car_pos).flatten()])
+            return state, 0.0, False, False, {}
+
+        elif self.phase == 2:
+            # Phase 2: path planning (maze walking)
+            current_car = self.current_car_index
+            reward = 0.0
+
+            # The current action a is 1-D continuous; map it to one of four directions (or stay)
+            if a < 0.2:
+                move_dir = 'up'
+            elif a < 0.4:
+                move_dir = 'down'
+            elif a < 0.6:
+                move_dir = 'left'
+            elif a < 0.8:
+                move_dir = 'right'
+            else:
+                move_dir = 'stay'
+
+            current_row, current_col = self.car_pos[current_car]
+
+            # Initialize the new row/column with the current values
+            new_row, new_col = current_row, current_col
+
+            if move_dir == 'up' and current_row < len(self.h_boundaries) - 1:
+                new_row = current_row + 1
+            elif move_dir == 'down' and current_row > 0:
+                new_row = current_row - 1
+            elif move_dir == 'left' and current_col > 0:
+                new_col = current_col - 1
+            elif move_dir == 'right' and current_col < len(self.v_boundaries) - 1:
+                new_col = current_col + 1
+            # If the move is illegal, or the action is 'stay', keep the current position
+            # TODO add a penalty for illegal moves
+
+            # Update the vehicle position
+            self.car_pos[current_car] = [new_row, new_col]
+            if new_row != current_row or new_col != current_col:
+                self.car_traj[current_car].append(np.array([new_row, new_col]))
+            self.step_count += 1
+            self.current_car_index = (
+                self.current_car_index + 1) % self.num_cars
+
+            # Update the visit flag: mark the new cell as visited
+            self.rectangles[(new_row, new_col)]['is_visited'] = True
+
+            # Observation
+            state = np.concatenate(
+                [self.partition_values, np.array(self.car_pos).flatten()])
+
+            # Episode termination: every cell has been visited or the step limit is reached
+            all_visited = all(rec['is_visited'] for rec in self.rectangles.values())
+            done = all_visited or (self.step_count >= self.MAX_STEPS)
+            if done and all_visited:
+                # Full coverage: compute each motorcade's execution time from its trajectory
+                T = max([self._compute_motorcade_time(idx)
+                         for idx in range(self.num_cars)])
+                reward += 10.0  # TODO compare the reward against a greedy baseline
+            elif done and self.step_count >= self.MAX_STEPS:
+                reward -= 100
+
+            return state, reward, done, False, {}
+
+    def _compute_motorcade_time(self, idx):
+        # Trajectory points are treated as (row, col) keys into self.rectangles
+        flight_time = sum(self.rectangles[tuple(point)]['flight_time']
+                          for point in self.car_traj[idx])
+        bs_time = sum(self.rectangles[tuple(point)]['comp_bs_time']
+                      for point in self.car_traj[idx])
+
+        # Car travel time: first append/prepend the center of the whole region to the trajectory
+        self.car_traj[idx].append([0.5, 0.5])
+        self.car_traj[idx].insert(0, [0.5, 0.5])
+        car_time = 0.0
+        for i in range(len(self.car_traj[idx]) - 1):
+            first_point = np.asarray(self.car_traj[idx][i], dtype=np.float64)
+            second_point = np.asarray(self.car_traj[idx][i + 1], dtype=np.float64)
+            car_time += np.linalg.norm(first_point - second_point) * \
+                self.H * self.W * self.car_move_time_factor
+
+        return max(car_time + flight_time, bs_time)
+
+    def render(self):
+        if self.phase == 0:
+            print("Phase 0: Partitioning.")
+            print(f"Partition step: {self.partition_step}")
+            print(f"Partition values so far: {self.partition_values}")
+        elif self.phase in (1, 2):
+            print("Phase 1/2: Path planning (maze).")
+            print(f"Visited grid: {self.visited_grid}")
+            print(f"Step count: {self.step_count}")
diff --git a/PPO/eval_policy.py b/PPO/eval_policy.py
new file mode 100644
index 0000000..a3323de
--- /dev/null
+++ b/PPO/eval_policy.py
@@ -0,0 +1,103 @@
+"""
+    This file is
used only to evaluate our trained policy/actor after + training in main.py with ppo.py. I wrote this file to demonstrate + that our trained policy exists independently of our learning algorithm, + which resides in ppo.py. Thus, we can test our trained policy without + relying on ppo.py. +""" + +def _log_summary(ep_len, ep_ret, ep_num): + """ + Print to stdout what we've logged so far in the most recent episode. + + Parameters: + None + + Return: + None + """ + # Round decimal places for more aesthetic logging messages + ep_len = str(round(ep_len, 2)) + ep_ret = str(round(ep_ret, 2)) + + # Print logging statements + print(flush=True) + print(f"-------------------- Episode #{ep_num} --------------------", flush=True) + print(f"Episodic Length: {ep_len}", flush=True) + print(f"Episodic Return: {ep_ret}", flush=True) + print(f"------------------------------------------------------", flush=True) + print(flush=True) + +def rollout(policy, env, render): + """ + Returns a generator to roll out each episode given a trained policy and + environment to test on. + + Parameters: + policy - The trained policy to test + env - The environment to evaluate the policy on + render - Specifies whether to render or not + + Return: + A generator object rollout, or iterable, which will return the latest + episodic length and return on each iteration of the generator. + + Note: + If you're unfamiliar with Python generators, check this out: + https://wiki.python.org/moin/Generators + If you're unfamiliar with Python "yield", check this out: + https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do + """ + # Rollout until user kills process + while True: + obs, _ = env.reset() + done = False + + # number of timesteps so far + t = 0 + + # Logging data + ep_len = 0 # episodic length + ep_ret = 0 # episodic return + + while not done: + t += 1 + + # Render environment if specified, off by default + if render: + env.render() + + # Query deterministic action from policy and run it + action = policy(obs).detach().numpy() + obs, rew, terminated, truncated, _ = env.step(action) + done = terminated | truncated + + # Sum all episodic rewards as we go along + ep_ret += rew + + # Track episodic length + ep_len = t + + # returns episodic length and return in this iteration + yield ep_len, ep_ret + +def eval_policy(policy, env, render=False): + """ + The main function to evaluate our policy with. It will iterate a generator object + "rollout", which will simulate each episode and return the most recent episode's + length and return. We can then log it right after. And yes, eval_policy will run + forever until you kill the process. + + Parameters: + policy - The trained policy to test, basically another name for our actor model + env - The environment to test the policy on + render - Whether we should render our episodes. False by default. + + Return: + None + + NOTE: To learn more about generators, look at rollout's function description + """ + # Rollout with the policy and environment, and log each episode's data + for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render)): + _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num) \ No newline at end of file diff --git a/PPO/main.py b/PPO/main.py new file mode 100644 index 0000000..bf2d929 --- /dev/null +++ b/PPO/main.py @@ -0,0 +1,123 @@ +""" + This file is the executable for running PPO. 
It is based on this medium article: + https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8 +""" + +import gymnasium as gym +import sys +import torch + +from arguments import get_args +from ppo import PPO +from network import FeedForwardNN +from eval_policy import eval_policy +from env import PartitionMazeEnv + +def train(env, hyperparameters, actor_model, critic_model): + """ + Trains the model. + + Parameters: + env - the environment to train on + hyperparameters - a dict of hyperparameters to use, defined in main + actor_model - the actor model to load in if we want to continue training + critic_model - the critic model to load in if we want to continue training + + Return: + None + """ + print(f"Training", flush=True) + + # Create a model for PPO. + model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters) + + # Tries to load in an existing actor/critic model to continue training on + if actor_model != '' and critic_model != '': + print(f"Loading in {actor_model} and {critic_model}...", flush=True) + model.actor.load_state_dict(torch.load(actor_model)) + model.critic.load_state_dict(torch.load(critic_model)) + print(f"Successfully loaded.", flush=True) + elif actor_model != '' or critic_model != '': # Don't train from scratch if user accidentally forgets actor/critic model + print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!") + sys.exit(0) + else: + print(f"Training from scratch.", flush=True) + + # Train the PPO model with a specified total timesteps + # NOTE: You can change the total timesteps here, I put a big number just because + # you can kill the process whenever you feel like PPO is converging + model.learn(total_timesteps=200_000_000) + +def test(env, actor_model): + """ + Tests the model. + + Parameters: + env - the environment to test the policy on + actor_model - the actor model to load in + + Return: + None + """ + print(f"Testing {actor_model}", flush=True) + + # If the actor model is not specified, then exit + if actor_model == '': + print(f"Didn't specify model file. Exiting.", flush=True) + sys.exit(0) + + # Extract out dimensions of observation and action spaces + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] + + # Build our policy the same way we build our actor model in PPO + policy = FeedForwardNN(obs_dim, act_dim) + + # Load in the actor model saved by the PPO algorithm + policy.load_state_dict(torch.load(actor_model)) + + # Evaluate our policy with a separate module, eval_policy, to demonstrate + # that once we are done training the model/policy with ppo.py, we no longer need + # ppo.py since it only contains the training algorithm. The model/policy itself exists + # independently as a binary file that can be loaded in with torch. + eval_policy(policy=policy, env=env, render=True) + +def main(args): + """ + The main function to run. + + Parameters: + args - the arguments parsed from command line + + Return: + None + """ + # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of + # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here. 
+ # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters + hyperparameters = { + 'timesteps_per_batch': 2048, + 'max_timesteps_per_episode': 200, + 'gamma': 0.99, + 'n_updates_per_iteration': 10, + 'lr': 3e-4, + 'clip': 0.2, + 'render': True, + 'render_every_i': 10 + } + + # Creates the environment we'll be running. If you want to replace with your own + # custom environment, note that it must inherit Gym and have both continuous + # observation and action spaces. + # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array') + env = PartitionMazeEnv() + + # Train or test, depending on the mode specified + if args.mode == 'train': + train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model) + else: + test(env=env, actor_model=args.actor_model) + +if __name__ == '__main__': + args = get_args() # Parse arguments from command line + main(args) diff --git a/PPO/network.py b/PPO/network.py new file mode 100644 index 0000000..2a0cf87 --- /dev/null +++ b/PPO/network.py @@ -0,0 +1,50 @@ +""" + This file contains a neural network module for us to + define our actor and critic networks in PPO. +""" + +import torch +from torch import nn +import torch.nn.functional as F +import numpy as np + +class FeedForwardNN(nn.Module): + """ + A standard in_dim-64-64-out_dim Feed Forward Neural Network. + """ + def __init__(self, in_dim, out_dim): + """ + Initialize the network and set up the layers. + + Parameters: + in_dim - input dimensions as an int + out_dim - output dimensions as an int + + Return: + None + """ + super(FeedForwardNN, self).__init__() + + self.layer1 = nn.Linear(in_dim, 64) + self.layer2 = nn.Linear(64, 64) + self.layer3 = nn.Linear(64, out_dim) + + def forward(self, obs): + """ + Runs a forward pass on the neural network. + + Parameters: + obs - observation to pass as input + + Return: + output - the output of our forward pass + """ + # Convert observation to tensor if it's a numpy array + if isinstance(obs, np.ndarray): + obs = torch.tensor(obs, dtype=torch.float) + + activation1 = F.relu(self.layer1(obs)) + activation2 = F.relu(self.layer2(activation1)) + output = self.layer3(activation2) + + return output diff --git a/PPO/ppo.py b/PPO/ppo.py new file mode 100644 index 0000000..3ac2047 --- /dev/null +++ b/PPO/ppo.py @@ -0,0 +1,402 @@ +""" + The file contains the PPO class to train with. + NOTE: All "ALG STEP"s are following the numbers from the original PPO pseudocode. + It can be found here: https://spinningup.openai.com/en/latest/_images/math/e62a8971472597f4b014c2da064f636ffe365ba3.svg +""" + +import gymnasium as gym +import time + +import numpy as np +import time +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.distributions import MultivariateNormal + +class PPO: + """ + This is the PPO class we will use as our model in main.py + """ + def __init__(self, policy_class, env, **hyperparameters): + """ + Initializes the PPO model, including hyperparameters. + + Parameters: + policy_class - the policy class to use for our actor/critic networks. + env - the environment to train on. + hyperparameters - all extra arguments passed into PPO that should be hyperparameters. 
+ + Returns: + None + """ + # Make sure the environment is compatible with our code + assert(type(env.observation_space) == gym.spaces.Box) + assert(type(env.action_space) == gym.spaces.Box) + + # Initialize hyperparameters for training with PPO + self._init_hyperparameters(hyperparameters) + + # Extract environment information + self.env = env + self.obs_dim = env.observation_space.shape[0] + self.act_dim = env.action_space.shape[0] + + # Initialize actor and critic networks + self.actor = policy_class(self.obs_dim, self.act_dim) # ALG STEP 1 + self.critic = policy_class(self.obs_dim, 1) + + # Initialize optimizers for actor and critic + self.actor_optim = Adam(self.actor.parameters(), lr=self.lr) + self.critic_optim = Adam(self.critic.parameters(), lr=self.lr) + + # Initialize the covariance matrix used to query the actor for actions + self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5) + self.cov_mat = torch.diag(self.cov_var) + + # This logger will help us with printing out summaries of each iteration + self.logger = { + 'delta_t': time.time_ns(), + 't_so_far': 0, # timesteps so far + 'i_so_far': 0, # iterations so far + 'batch_lens': [], # episodic lengths in batch + 'batch_rews': [], # episodic returns in batch + 'actor_losses': [], # losses of actor network in current iteration + } + + def learn(self, total_timesteps): + """ + Train the actor and critic networks. Here is where the main PPO algorithm resides. + + Parameters: + total_timesteps - the total number of timesteps to train for + + Return: + None + """ + print(f"Learning... Running {self.max_timesteps_per_episode} timesteps per episode, ", end='') + print(f"{self.timesteps_per_batch} timesteps per batch for a total of {total_timesteps} timesteps") + t_so_far = 0 # Timesteps simulated so far + i_so_far = 0 # Iterations ran so far + while t_so_far < total_timesteps: # ALG STEP 2 + # Autobots, roll out (just kidding, we're collecting our batch simulations here) + batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout() # ALG STEP 3 + + # Calculate how many timesteps we collected this batch + t_so_far += np.sum(batch_lens) + + # Increment the number of iterations + i_so_far += 1 + + # Logging timesteps so far and iterations so far + self.logger['t_so_far'] = t_so_far + self.logger['i_so_far'] = i_so_far + + # Calculate advantage at k-th iteration + V, _ = self.evaluate(batch_obs, batch_acts) + A_k = batch_rtgs - V.detach() # ALG STEP 5 + + # One of the only tricks I use that isn't in the pseudocode. Normalizing advantages + # isn't theoretically necessary, but in practice it decreases the variance of + # our advantages and makes convergence much more stable and faster. I added this because + # solving some environments was too unstable without it. + A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10) + + # This is the loop where we update our network for some n epochs + for _ in range(self.n_updates_per_iteration): # ALG STEP 6 & 7 + # Calculate V_phi and pi_theta(a_t | s_t) + V, curr_log_probs = self.evaluate(batch_obs, batch_acts) + + # Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t) + # NOTE: we just subtract the logs, which is the same as + # dividing the values and then canceling the log with e^log. + # For why we use log probabilities instead of actual probabilities, + # here's a great explanation: + # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms + # TL;DR makes gradient ascent easier behind the scenes. 
+ ratios = torch.exp(curr_log_probs - batch_log_probs) + + # Calculate surrogate losses. + surr1 = ratios * A_k + surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k + + # Calculate actor and critic losses. + # NOTE: we take the negative min of the surrogate losses because we're trying to maximize + # the performance function, but Adam minimizes the loss. So minimizing the negative + # performance function maximizes it. + actor_loss = (-torch.min(surr1, surr2)).mean() + critic_loss = nn.MSELoss()(V, batch_rtgs) + + # Calculate gradients and perform backward propagation for actor network + self.actor_optim.zero_grad() + actor_loss.backward(retain_graph=True) + self.actor_optim.step() + + # Calculate gradients and perform backward propagation for critic network + self.critic_optim.zero_grad() + critic_loss.backward() + self.critic_optim.step() + + # Log actor loss + self.logger['actor_losses'].append(actor_loss.detach()) + + # Print a summary of our training so far + self._log_summary() + + # Save our model if it's time + if i_so_far % self.save_freq == 0: + torch.save(self.actor.state_dict(), './weights/ppo_actor.pth') + torch.save(self.critic.state_dict(), './weights/ppo_critic.pth') + + def rollout(self): + """ + Too many transformers references, I'm sorry. This is where we collect the batch of data + from simulation. Since this is an on-policy algorithm, we'll need to collect a fresh batch + of data each time we iterate the actor/critic networks. + + Parameters: + None + + Return: + batch_obs - the observations collected this batch. Shape: (number of timesteps, dimension of observation) + batch_acts - the actions collected this batch. Shape: (number of timesteps, dimension of action) + batch_log_probs - the log probabilities of each action taken this batch. Shape: (number of timesteps) + batch_rtgs - the Rewards-To-Go of each timestep in this batch. Shape: (number of timesteps) + batch_lens - the lengths of each episode this batch. Shape: (number of episodes) + """ + # Batch data. For more details, check function header. + batch_obs = [] + batch_acts = [] + batch_log_probs = [] + batch_rews = [] + batch_rtgs = [] + batch_lens = [] + + # Episodic data. Keeps track of rewards per episode, will get cleared + # upon each new episode + ep_rews = [] + + t = 0 # Keeps track of how many timesteps we've run so far this batch + + # Keep simulating until we've run more than or equal to specified timesteps per batch + while t < self.timesteps_per_batch: + ep_rews = [] # rewards collected per episode + + # Reset the environment. sNote that obs is short for observation. + obs, _ = self.env.reset() + done = False + + # Run an episode for a maximum of max_timesteps_per_episode timesteps + for ep_t in range(self.max_timesteps_per_episode): + # If render is specified, render the environment + if self.render and (self.logger['i_so_far'] % self.render_every_i == 0) and len(batch_lens) == 0: + self.env.render() + + t += 1 # Increment timesteps ran this batch so far + + # Track observations in this batch + batch_obs.append(obs) + + # Calculate action and make a step in the env. + # Note that rew is short for reward. 
+ action, log_prob = self.get_action(obs) + obs, rew, terminated, truncated, _ = self.env.step(action) + + # Don't really care about the difference between terminated or truncated in this, so just combine them + done = terminated | truncated + + # Track recent reward, action, and action log probability + ep_rews.append(rew) + batch_acts.append(action) + batch_log_probs.append(log_prob) + + # If the environment tells us the episode is terminated, break + if done: + break + + # Track episodic lengths and rewards + batch_lens.append(ep_t + 1) + batch_rews.append(ep_rews) + + # Reshape data as tensors in the shape specified in function description, before returning + batch_obs = torch.tensor(batch_obs, dtype=torch.float) + batch_acts = torch.tensor(batch_acts, dtype=torch.float) + batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float) + batch_rtgs = self.compute_rtgs(batch_rews) # ALG STEP 4 + + # Log the episodic returns and episodic lengths in this batch. + self.logger['batch_rews'] = batch_rews + self.logger['batch_lens'] = batch_lens + + return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens + + def compute_rtgs(self, batch_rews): + """ + Compute the Reward-To-Go of each timestep in a batch given the rewards. + + Parameters: + batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode) + + Return: + batch_rtgs - the rewards to go, Shape: (number of timesteps in batch) + """ + # The rewards-to-go (rtg) per episode per batch to return. + # The shape will be (num timesteps per episode) + batch_rtgs = [] + + # Iterate through each episode + for ep_rews in reversed(batch_rews): + + discounted_reward = 0 # The discounted reward so far + + # Iterate through all rewards in the episode. We go backwards for smoother calculation of each + # discounted return (think about why it would be harder starting from the beginning) + for rew in reversed(ep_rews): + discounted_reward = rew + discounted_reward * self.gamma + batch_rtgs.insert(0, discounted_reward) + + # Convert the rewards-to-go into a tensor + batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float) + + return batch_rtgs + + def get_action(self, obs): + """ + Queries an action from the actor network, should be called from rollout. + + Parameters: + obs - the observation at the current timestep + + Return: + action - the action to take, as a numpy array + log_prob - the log probability of the selected action in the distribution + """ + # Query the actor network for a mean action + mean = self.actor(obs) + + # Create a distribution with the mean action and std from the covariance matrix above. + # For more information on how this distribution works, check out Andrew Ng's lecture on it: + # https://www.youtube.com/watch?v=JjB58InuTqM + dist = MultivariateNormal(mean, self.cov_mat) + + # Sample an action from the distribution + action = dist.sample() + + # Calculate the log probability for that action + log_prob = dist.log_prob(action) + + # Return the sampled action and the log probability of that action in our distribution + return action.detach().numpy(), log_prob.detach() + + def evaluate(self, batch_obs, batch_acts): + """ + Estimate the values of each observation, and the log probs of + each action in the most recent batch with the most recent + iteration of the actor network. Should be called from learn. + + Parameters: + batch_obs - the observations from the most recently collected batch as a tensor. 
+ Shape: (number of timesteps in batch, dimension of observation) + batch_acts - the actions from the most recently collected batch as a tensor. + Shape: (number of timesteps in batch, dimension of action) + + Return: + V - the predicted values of batch_obs + log_probs - the log probabilities of the actions taken in batch_acts given batch_obs + """ + # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs + V = self.critic(batch_obs).squeeze() + + # Calculate the log probabilities of batch actions using most recent actor network. + # This segment of code is similar to that in get_action() + mean = self.actor(batch_obs) + dist = MultivariateNormal(mean, self.cov_mat) + log_probs = dist.log_prob(batch_acts) + + # Return the value vector V of each observation in the batch + # and log probabilities log_probs of each action in the batch + return V, log_probs + + def _init_hyperparameters(self, hyperparameters): + """ + Initialize default and custom values for hyperparameters + + Parameters: + hyperparameters - the extra arguments included when creating the PPO model, should only include + hyperparameters defined below with custom values. + + Return: + None + """ + # Initialize default values for hyperparameters + # Algorithm hyperparameters + self.timesteps_per_batch = 4800 # Number of timesteps to run per batch + self.max_timesteps_per_episode = 1600 # Max number of timesteps per episode + self.n_updates_per_iteration = 5 # Number of times to update actor/critic per iteration + self.lr = 0.005 # Learning rate of actor optimizer + self.gamma = 0.95 # Discount factor to be applied when calculating Rewards-To-Go + self.clip = 0.2 # Recommended 0.2, helps define the threshold to clip the ratio during SGA + + # Miscellaneous parameters + self.render = True # If we should render during rollout + self.render_every_i = 10 # Only render every n iterations + self.save_freq = 10 # How often we save in number of iterations + self.seed = None # Sets the seed of our program, used for reproducibility of results + + # Change any default values to custom values for specified hyperparameters + for param, val in hyperparameters.items(): + exec('self.' + param + ' = ' + str(val)) + + # Sets the seed if specified + if self.seed != None: + # Check if our seed is valid first + assert(type(self.seed) == int) + + # Set the seed + torch.manual_seed(self.seed) + print(f"Successfully set seed to {self.seed}") + + def _log_summary(self): + """ + Print to stdout what we've logged so far in the most recent batch. + + Parameters: + None + + Return: + None + """ + # Calculate logging values. 
I use a few python shortcuts to calculate each value + # without explaining since it's not too important to PPO; feel free to look it over, + # and if you have any questions you can email me (look at bottom of README) + delta_t = self.logger['delta_t'] + self.logger['delta_t'] = time.time_ns() + delta_t = (self.logger['delta_t'] - delta_t) / 1e9 + delta_t = str(round(delta_t, 2)) + + t_so_far = self.logger['t_so_far'] + i_so_far = self.logger['i_so_far'] + avg_ep_lens = np.mean(self.logger['batch_lens']) + avg_ep_rews = np.mean([np.sum(ep_rews) for ep_rews in self.logger['batch_rews']]) + avg_actor_loss = np.mean([losses.float().mean() for losses in self.logger['actor_losses']]) + + # Round decimal places for more aesthetic logging messages + avg_ep_lens = str(round(avg_ep_lens, 2)) + avg_ep_rews = str(round(avg_ep_rews, 2)) + avg_actor_loss = str(round(avg_actor_loss, 5)) + + # Print logging statements + print(flush=True) + print(f"-------------------- Iteration #{i_so_far} --------------------", flush=True) + print(f"Average Episodic Length: {avg_ep_lens}", flush=True) + print(f"Average Episodic Return: {avg_ep_rews}", flush=True) + print(f"Average Loss: {avg_actor_loss}", flush=True) + print(f"Timesteps So Far: {t_so_far}", flush=True) + print(f"Iteration took: {delta_t} secs", flush=True) + print(f"------------------------------------------------------", flush=True) + print(flush=True) + + # Reset batch-specific logging data + self.logger['batch_lens'] = [] + self.logger['batch_rews'] = [] + self.logger['actor_losses'] = []
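Reviewer note: training and testing are launched through main.py, e.g. "python main.py --mode train" or "python main.py --mode test --actor_model ./weights/ppo_actor.pth"; since ppo.py saves checkpoints to ./weights/, that directory should exist before the first save (torch.save does not create it, and the new .gitignore entry keeps it out of version control). For anyone who wants to poke at the new environment before wiring it into PPO, below is a minimal sketch, not part of this diff, that drives PartitionMazeEnv through the four phase-0 partitioning decisions; the cut values 0.5/0.0 are arbitrary and only chosen to produce a 2x2 partition.

# smoke_test.py (hypothetical helper, not included in this diff)
import numpy as np

from env import PartitionMazeEnv

env = PartitionMazeEnv()
obs, _ = env.reset()
print("initial observation:", obs)

# One cut value per step, in the order c1, c2, r1, r2 (0 means "no cut");
# each action is a 1-D Box value in [0, 1].
for cut in (0.5, 0.0, 0.5, 0.0):
    obs, reward, terminated, truncated, _ = env.step(np.array([cut], dtype=np.float32))
    print(f"phase={env.phase}  partition_values={env.partition_values}  reward={reward}")

After the fourth step the environment either rejects the partition (large negative reward, episode ends) or stores the sub-region rectangles and switches to phase 1, which is the behavior the PPO rollout in ppo.py relies on.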