Add PPO code
This commit is contained in:
parent
e7a4395340
commit
1058f37be6
.gitignore (vendored, 3 lines changed)
@@ -7,6 +7,9 @@ __pycache__/
 # C extensions
 *.so
 
+# Pytorch weights
+weights/
+
 # Distribution / packaging
 .Python
 build/
PPO/arguments.py (new file, 27 lines)
@@ -0,0 +1,27 @@
"""
This file contains the arguments to parse at command line.
File main.py will call get_args, which will return the parsed arguments.
"""

import argparse


def get_args():
    """
    Description:
        Parses arguments at command line.

    Parameters:
        None

    Return:
        args - the arguments parsed
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--mode', dest='mode', type=str, default='train')               # can be 'train' or 'test'
    parser.add_argument('--actor_model', dest='actor_model', type=str, default='')      # your actor model filename
    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')    # your critic model filename

    args = parser.parse_args()

    return args
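As a quick sanity check of the parser, the sketch below (not part of the diff; the argv values are made up and it assumes it is run from inside PPO/) shows the three flags get_args exposes and their defaults:

    # Hypothetical standalone check of arguments.get_args
    import sys
    from arguments import get_args

    sys.argv = ['main.py', '--mode', 'test', '--actor_model', 'ppo_actor.pth']
    args = get_args()
    print(args.mode, args.actor_model, args.critic_model)   # -> test ppo_actor.pth (critic_model defaults to '')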
PPO/env.py (new file, 295 lines)
@@ -0,0 +1,295 @@
import gymnasium as gym
from gymnasium import spaces
import numpy as np


class PartitionMazeEnv(gym.Env):
    """
    Custom environment with two stages:
    Stage 0: region partitioning (4 steps in total; each step outputs one scalar that fixes a
        vertical or horizontal cut position).
        The cut order is: step 1 outputs c1, step 2 outputs c2, step 3 outputs r1, step 4 outputs r2.
        After discretization the values lie in {0, 0.1, 0.2, ..., 0.9} (0 means "no cut").
    Stage 1: vehicle path planning (maze walking). The vehicles start from the region center and
        move up/down/left/right over the grid of sub-regions until every target cell is covered
        or the step limit is reached.
    """

    def __init__(self, config=None):
        super(PartitionMazeEnv, self).__init__()
        # Fleet parameters
        self.H = 20  # region height; the distance between grid points is 25 m (one unit distance)
        self.W = 30  # region width
        self.num_cars = 2  # number of systems (car-nest-drone systems)

        # Time coefficients (unit: seconds, one photo per grid cell)
        self.flight_time_factor = 3  # flight time per photo; drone speed 9.5 m/s, one photo every 3 s
        self.comp_uav_factor = 5  # on-drone computation time per photo, 5 s
        self.trans_time_factor = 0.3  # transmission time per photo, 0.3 s
        self.car_move_time_factor = 2 * 50  # TODO car travel time per unit distance (2 s), with an amplification factor added
        self.comp_bs_factor = 5  # nest (base-station) computation time per photo

        # Energy parameters
        self.flight_energy_factor = 0.05  # unit: minutes per photo
        self.comp_energy_factor = 0.05  # computation energy, needs to be re-estimated
        self.trans_energy_factor = 0.0025
        self.battery_capacity = 10  # the drone only flies; endurance is 30 minutes

        self.phase = 0  # phase control: 0 = partitioning, 1 = maze initialization, 2 = maze walking
        self.partition_step = 0  # partitioning step counter, range 0-4
        # TODO the number of cuts is currently fixed at 4 (2 + 2)
        self.partition_values = np.zeros(4, dtype=np.float32)  # stores c1, c2, r1, r2

        # Action space: every action is a 1-D continuous value in [0, 1]
        self.action_space = spaces.Box(
            low=0.0, high=1.0, shape=(1,), dtype=np.float32)

        # Observation space: an 8-dimensional vector
        # TODO the returned state currently only contains position coordinates
        # Stage 0 state: the first 4 dims hold the cut values decided so far (undecided entries are 0)
        # Stage 1 state: vehicle positions (2-D each)
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(8,), dtype=np.float32)

        # Partitioning-stage variables
        self.vertical_cuts = []  # vertical cut positions (c1, c2); 0 means no cut
        self.horizontal_cuts = []  # horizontal cut positions (r1, r2)
        # TODO region_centers could probably be optimized to cut down the number of parameters
        self.region_centers = []  # center (normalized coordinates) of every sub-region after cutting

        # Path-planning-stage variables
        self.MAX_STEPS = 50  # step limit for maze walking
        self.step_count = 0
        self.rectangles = {}
        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0

    def reset(self, seed=None, options=None):
        # Reset all variables and return to the partitioning stage (phase 0)
        self.phase = 0
        self.partition_step = 0
        self.partition_values = np.zeros(4, dtype=np.float32)
        self.vertical_cuts = []
        self.horizontal_cuts = []
        self.region_centers = []
        self.step_count = 0
        self.rectangles = {}
        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0
        # State: the first 4 dims are partition_values, the rest are zero-padded
        state = np.concatenate(
            [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
        return state, {}

    def step(self, action):
        # In every phase the action is a 1-D continuous value; take action[0]
        a = float(action[0])

        if self.phase == 0:
            # Partitioning stage: each step outputs one scalar, discretized to {0, 0.1, ..., 0.9}
            disc_val = np.floor(a * 10) / 10.0
            disc_val = np.clip(disc_val, 0.0, 0.9)
            self.partition_values[self.partition_step] = disc_val
            self.partition_step += 1

            # Build the current state: the first partition_step entries hold the decided values,
            # the rest are 0, padded with zeros for the vehicle positions
            state = np.concatenate(
                [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])

            # If fewer than 4 steps are done we stay in the partitioning stage: no reward, not done
            if self.partition_step < 4:
                return state, 0.0, False, False, {}
            else:
                # After 4 steps, compute the cut boundaries
                # Filter out zeros, deduplicate and sort
                vert = sorted(set(v for v in self.partition_values[:len(
                    self.partition_values) // 2] if v > 0))
                horiz = sorted(set(v for v in self.partition_values[len(
                    self.partition_values) // 2:] if v > 0))
                self.vertical_cuts = vert if vert else []
                self.horizontal_cuts = horiz if horiz else []

                # Boundaries: always include 0 and 1
                v_boundaries = [0.0] + self.vertical_cuts + [1.0]
                h_boundaries = [0.0] + self.horizontal_cuts + [1.0]

                # Check whether the partition is feasible and compute the offloading ratio rho per sub-region
                valid_partition = True
                for i in range(len(h_boundaries) - 1):
                    for j in range(len(v_boundaries) - 1):
                        d = (v_boundaries[j + 1] - v_boundaries[j]) * self.W * \
                            (h_boundaries[i + 1] - h_boundaries[i]) * self.H
                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
                            (self.comp_uav_factor - self.trans_time_factor)
                        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
                            (self.comp_energy_factor * d -
                             self.trans_energy_factor * d)
                        if rho_energy_limit < 0:
                            valid_partition = False
                            break
                        rho = min(rho_time_limit, rho_energy_limit)

                        flight_time = self.flight_time_factor * d
                        comp_time = self.comp_uav_factor * rho * d
                        trans_time = self.trans_time_factor * (1 - rho) * d
                        comp_bs_time = self.comp_bs_factor * (1 - rho) * d

                        self.rectangles[(i, j)] = {
                            # 'r1': h_boundaries[i], 'r2': h_boundaries[i+1], 'c1': v_boundaries[j], 'c2': v_boundaries[j+1],
                            'd': d,
                            'rho': rho,
                            'flight_time': flight_time,
                            'comp_time': comp_time,
                            'trans_time': trans_time,
                            'comp_bs_time': comp_bs_time,
                            'is_visited': False
                            # 'center': (center_r, center_c)
                        }
                    if not valid_partition:
                        break

                if not valid_partition:
                    reward = -100
                    state = np.concatenate(
                        [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
                    return state, reward, True, False, {}
                else:
                    reward = 10

                    # Enter phase 1: maze walking
                    self.phase = 1
                    # Compute the center of every sub-region from the cut boundaries
                    self.region_centers = []
                    for i in range(len(h_boundaries) - 1):
                        for j in range(len(v_boundaries) - 1):
                            center_x = (
                                v_boundaries[j] + v_boundaries[j + 1]) / 2.0
                            center_y = (
                                h_boundaries[i] + h_boundaries[i + 1]) / 2.0
                            self.region_centers.append((center_x, center_y))
                    # Store the cut boundaries for the later grid mapping
                    self.v_boundaries = v_boundaries
                    self.h_boundaries = h_boundaries
                    # Initialize the maze stage: reset the step counter and build visited_grid
                    # with one entry per grid cell
                    self.step_count = 0
                    self.visited_grid = np.zeros(
                        (len(v_boundaries) - 1) * (len(h_boundaries) - 1), dtype=np.int32)

                    state = np.concatenate(
                        [self.partition_values, np.array(self.car_pos).flatten()])
                    return state, reward, False, False, {}

        elif self.phase == 1:
            # Phase 1: initialize the maze; the vehicles leave the region center and drive
            # to the center of one of the partitioned sub-regions
            # Make sure the action is in [0, 1], then map it to an index in 0..(num_regions - 1)
            num_regions = len(self.region_centers)
            target_region_index = int(np.floor(a * num_regions))
            target_region_index = np.clip(
                target_region_index, 0, num_regions - 1)

            # Iterate over all vehicles and move each of them to the target sub-region
            for car_idx in range(self.num_cars):
                target_position = np.array(
                    self.region_centers[target_region_index])  # target region center

                # Update this vehicle's position
                self.car_pos[car_idx] = target_position
                # Count the step
                self.step_count += 1
                self.car_traj[car_idx].append(target_position)  # record every vehicle's trajectory

            # Enter phase 2: maze walking
            self.phase = 2

            # Observation
            state = np.concatenate(
                [self.partition_values, np.array(self.car_pos).flatten()])
            return state, 0.0, False, False, {}

        elif self.phase == 2:
            # Phase 2: path planning (maze walking)
            current_car = self.current_car_index
            reward = 0.0

            # The 1-D continuous action a is mapped to one of four directions (or stay)
            if a < 0.2:
                move_dir = 'up'
            elif a < 0.4:
                move_dir = 'down'
            elif a < 0.6:
                move_dir = 'left'
            elif a < 0.8:
                move_dir = 'right'
            else:
                move_dir = 'stay'

            current_row, current_col = self.car_pos[current_car]

            # Initialize the new row/column with the current values
            new_row, new_col = current_row, current_col

            if move_dir == 'up' and current_row < len(self.h_boundaries) - 1:
                new_row = current_row + 1
            elif move_dir == 'down' and current_row > 0:
                new_row = current_row - 1
            elif move_dir == 'left' and current_col > 0:
                new_col = current_col - 1
            elif move_dir == 'right' and current_col < len(self.v_boundaries) - 1:
                new_col = current_col + 1
            # If the move is illegal, or the action is 'stay', keep the current position
            # TODO penalize illegal moves

            # Update the vehicle position
            self.car_pos[current_car] = [new_row, new_col]
            if new_row != current_row or new_col != current_col:
                self.car_traj[current_car].append((new_row, new_col))
            self.step_count += 1
            self.current_car_index = (
                self.current_car_index + 1) % self.num_cars

            # Update the visit marker: mark the new grid cell as visited
            self.rectangles[(new_row, new_col)]['is_visited'] = True

            # Observation
            state = np.concatenate(
                [self.partition_values, np.array(self.car_pos).flatten()])

            # Episode termination: every grid cell has been visited, or the step limit is reached
            done = all(rec['is_visited'] for rec in self.rectangles.values()) or (
                self.step_count >= self.MAX_STEPS)
            if done and np.all(self.visited_grid == 1):
                # Full coverage: compute each fleet's execution time from its trajectory
                T = max([self._compute_motorcade_time(idx)
                         for idx in range(self.num_cars)])
                reward += 10.0  # TODO compare the reward against a greedy baseline
            elif done and self.step_count >= self.MAX_STEPS:
                reward -= 100

            return state, reward, done, False, {}

    def _compute_motorcade_time(self, idx):
        flight_time = sum(self.rectangles[point]['flight_time']
                          for point in self.car_traj[idx])
        bs_time = sum(self.rectangles[point]['comp_bs_time']
                      for point in self.car_traj[idx])

        # Compute the car's travel time; first add the center of the whole region to both ends of the trajectory
        car_time = 0.0
        self.car_traj[idx].append([0.5, 0.5])
        self.car_traj[idx].insert(0, [0.5, 0.5])
        for i in range(len(self.car_traj[idx]) - 1):
            first_point = self.car_traj[idx][i]
            second_point = self.car_traj[idx][i + 1]
            car_time += np.linalg.norm(np.array(first_point) - np.array(second_point)) * \
                self.H * self.W * self.car_move_time_factor

        return max(car_time + flight_time, bs_time)

    def render(self):
        if self.phase == 0:
            print("Phase 0: Partitioning.")
            print(f"Partition step: {self.partition_step}")
            print(f"Partition values so far: {self.partition_values}")
        elif self.phase == 1:
            print("Phase 1: Path planning (maze).")
            print(f"Visited grid: {self.visited_grid}")
            print(f"Step count: {self.step_count}")
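The sketch below (not part of the diff; assumed to be run from inside PPO/) exercises only the partitioning stage, which is the self-contained part of the environment. It shows how each scalar action is discretized to {0, 0.1, ..., 0.9} and how the episode either ends early with a -100 reward when the partition is infeasible or continues into the maze phases:

    # Hypothetical phase-0 walkthrough of PartitionMazeEnv
    import numpy as np
    from env import PartitionMazeEnv

    env = PartitionMazeEnv()
    obs, _ = env.reset()

    # Four partition steps: c1, c2, r1, r2. A value of 0.0 means "no cut".
    for a in [0.5, 0.0, 0.5, 0.0]:
        obs, reward, terminated, truncated, _ = env.step(np.array([a], dtype=np.float32))
        print(env.partition_values, reward, terminated)
    # After the 4th step the environment either rejects the partition (reward -100, terminated=True)
    # or accepts it (reward 10) and switches to the maze phases.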
PPO/eval_policy.py (new file, 103 lines)
@@ -0,0 +1,103 @@
"""
This file is used only to evaluate our trained policy/actor after
training in main.py with ppo.py. I wrote this file to demonstrate
that our trained policy exists independently of our learning algorithm,
which resides in ppo.py. Thus, we can test our trained policy without
relying on ppo.py.
"""

def _log_summary(ep_len, ep_ret, ep_num):
    """
    Print to stdout what we've logged so far in the most recent episode.

    Parameters:
        None

    Return:
        None
    """
    # Round decimal places for more aesthetic logging messages
    ep_len = str(round(ep_len, 2))
    ep_ret = str(round(ep_ret, 2))

    # Print logging statements
    print(flush=True)
    print(f"-------------------- Episode #{ep_num} --------------------", flush=True)
    print(f"Episodic Length: {ep_len}", flush=True)
    print(f"Episodic Return: {ep_ret}", flush=True)
    print(f"------------------------------------------------------", flush=True)
    print(flush=True)

def rollout(policy, env, render):
    """
    Returns a generator to roll out each episode given a trained policy and
    environment to test on.

    Parameters:
        policy - The trained policy to test
        env - The environment to evaluate the policy on
        render - Specifies whether to render or not

    Return:
        A generator object rollout, or iterable, which will return the latest
        episodic length and return on each iteration of the generator.

    Note:
        If you're unfamiliar with Python generators, check this out:
            https://wiki.python.org/moin/Generators
        If you're unfamiliar with Python "yield", check this out:
            https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    """
    # Rollout until user kills process
    while True:
        obs, _ = env.reset()
        done = False

        # number of timesteps so far
        t = 0

        # Logging data
        ep_len = 0            # episodic length
        ep_ret = 0            # episodic return

        while not done:
            t += 1

            # Render environment if specified, off by default
            if render:
                env.render()

            # Query deterministic action from policy and run it
            action = policy(obs).detach().numpy()
            obs, rew, terminated, truncated, _ = env.step(action)
            done = terminated | truncated

            # Sum all episodic rewards as we go along
            ep_ret += rew

        # Track episodic length
        ep_len = t

        # returns episodic length and return in this iteration
        yield ep_len, ep_ret

def eval_policy(policy, env, render=False):
    """
    The main function to evaluate our policy with. It will iterate a generator object
    "rollout", which will simulate each episode and return the most recent episode's
    length and return. We can then log it right after. And yes, eval_policy will run
    forever until you kill the process.

    Parameters:
        policy - The trained policy to test, basically another name for our actor model
        env - The environment to test the policy on
        render - Whether we should render our episodes. False by default.

    Return:
        None

    NOTE: To learn more about generators, look at rollout's function description
    """
    # Rollout with the policy and environment, and log each episode's data
    for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render)):
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
PPO/main.py (new file, 123 lines)
@@ -0,0 +1,123 @@
"""
This file is the executable for running PPO. It is based on this medium article:
    https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8
"""

import gymnasium as gym
import sys
import torch

from arguments import get_args
from ppo import PPO
from network import FeedForwardNN
from eval_policy import eval_policy
from env import PartitionMazeEnv

def train(env, hyperparameters, actor_model, critic_model):
    """
    Trains the model.

    Parameters:
        env - the environment to train on
        hyperparameters - a dict of hyperparameters to use, defined in main
        actor_model - the actor model to load in if we want to continue training
        critic_model - the critic model to load in if we want to continue training

    Return:
        None
    """
    print(f"Training", flush=True)

    # Create a model for PPO.
    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)

    # Tries to load in an existing actor/critic model to continue training on
    if actor_model != '' and critic_model != '':
        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
        model.actor.load_state_dict(torch.load(actor_model))
        model.critic.load_state_dict(torch.load(critic_model))
        print(f"Successfully loaded.", flush=True)
    elif actor_model != '' or critic_model != '':  # Don't train from scratch if user accidentally forgets actor/critic model
        print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!")
        sys.exit(0)
    else:
        print(f"Training from scratch.", flush=True)

    # Train the PPO model with a specified total timesteps
    # NOTE: You can change the total timesteps here, I put a big number just because
    # you can kill the process whenever you feel like PPO is converging
    model.learn(total_timesteps=200_000_000)

def test(env, actor_model):
    """
    Tests the model.

    Parameters:
        env - the environment to test the policy on
        actor_model - the actor model to load in

    Return:
        None
    """
    print(f"Testing {actor_model}", flush=True)

    # If the actor model is not specified, then exit
    if actor_model == '':
        print(f"Didn't specify model file. Exiting.", flush=True)
        sys.exit(0)

    # Extract out dimensions of observation and action spaces
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build our policy the same way we build our actor model in PPO
    policy = FeedForwardNN(obs_dim, act_dim)

    # Load in the actor model saved by the PPO algorithm
    policy.load_state_dict(torch.load(actor_model))

    # Evaluate our policy with a separate module, eval_policy, to demonstrate
    # that once we are done training the model/policy with ppo.py, we no longer need
    # ppo.py since it only contains the training algorithm. The model/policy itself exists
    # independently as a binary file that can be loaded in with torch.
    eval_policy(policy=policy, env=env, render=True)

def main(args):
    """
    The main function to run.

    Parameters:
        args - the arguments parsed from command line

    Return:
        None
    """
    # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of
    # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here.
    # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters
    hyperparameters = {
        'timesteps_per_batch': 2048,
        'max_timesteps_per_episode': 200,
        'gamma': 0.99,
        'n_updates_per_iteration': 10,
        'lr': 3e-4,
        'clip': 0.2,
        'render': True,
        'render_every_i': 10
    }

    # Creates the environment we'll be running. If you want to replace with your own
    # custom environment, note that it must inherit Gym and have both continuous
    # observation and action spaces.
    # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array')
    env = PartitionMazeEnv()

    # Train or test, depending on the mode specified
    if args.mode == 'train':
        train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model)
    else:
        test(env=env, actor_model=args.actor_model)

if __name__ == '__main__':
    args = get_args()  # Parse arguments from command line
    main(args)
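For reference, the invocations this script is wired up for (assumed to be run from inside PPO/; the checkpoint path is the one ppo.py writes to):

    # python main.py --mode train
    # python main.py --mode test --actor_model ./weights/ppo_actor.pth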
PPO/network.py (new file, 50 lines)
@@ -0,0 +1,50 @@
"""
This file contains a neural network module for us to
define our actor and critic networks in PPO.
"""

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

class FeedForwardNN(nn.Module):
    """
    A standard in_dim-64-64-out_dim Feed Forward Neural Network.
    """
    def __init__(self, in_dim, out_dim):
        """
        Initialize the network and set up the layers.

        Parameters:
            in_dim - input dimensions as an int
            out_dim - output dimensions as an int

        Return:
            None
        """
        super(FeedForwardNN, self).__init__()

        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)

    def forward(self, obs):
        """
        Runs a forward pass on the neural network.

        Parameters:
            obs - observation to pass as input

        Return:
            output - the output of our forward pass
        """
        # Convert observation to tensor if it's a numpy array
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        activation1 = F.relu(self.layer1(obs))
        activation2 = F.relu(self.layer2(activation1))
        output = self.layer3(activation2)

        return output
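A minimal forward-pass sketch (not part of the diff), using the 8-dimensional observation and 1-dimensional action of PartitionMazeEnv above:

    # Hypothetical standalone check of FeedForwardNN
    import numpy as np
    from network import FeedForwardNN

    actor = FeedForwardNN(in_dim=8, out_dim=1)
    obs = np.zeros(8, dtype=np.float32)
    mean_action = actor(obs)      # forward() converts the numpy observation to a tensor
    print(mean_action.shape)      # torch.Size([1])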
PPO/ppo.py (new file, 402 lines)
@@ -0,0 +1,402 @@
"""
The file contains the PPO class to train with.
NOTE: All "ALG STEP"s are following the numbers from the original PPO pseudocode.
It can be found here: https://spinningup.openai.com/en/latest/_images/math/e62a8971472597f4b014c2da064f636ffe365ba3.svg
"""

import gymnasium as gym
import time

import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.distributions import MultivariateNormal

class PPO:
    """
    This is the PPO class we will use as our model in main.py
    """
    def __init__(self, policy_class, env, **hyperparameters):
        """
        Initializes the PPO model, including hyperparameters.

        Parameters:
            policy_class - the policy class to use for our actor/critic networks.
            env - the environment to train on.
            hyperparameters - all extra arguments passed into PPO that should be hyperparameters.

        Returns:
            None
        """
        # Make sure the environment is compatible with our code
        assert(type(env.observation_space) == gym.spaces.Box)
        assert(type(env.action_space) == gym.spaces.Box)

        # Initialize hyperparameters for training with PPO
        self._init_hyperparameters(hyperparameters)

        # Extract environment information
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        # Initialize actor and critic networks
        self.actor = policy_class(self.obs_dim, self.act_dim)                   # ALG STEP 1
        self.critic = policy_class(self.obs_dim, 1)

        # Initialize optimizers for actor and critic
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        # Initialize the covariance matrix used to query the actor for actions
        self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)

        # This logger will help us with printing out summaries of each iteration
        self.logger = {
            'delta_t': time.time_ns(),
            't_so_far': 0,          # timesteps so far
            'i_so_far': 0,          # iterations so far
            'batch_lens': [],       # episodic lengths in batch
            'batch_rews': [],       # episodic returns in batch
            'actor_losses': [],     # losses of actor network in current iteration
        }

    def learn(self, total_timesteps):
        """
        Train the actor and critic networks. Here is where the main PPO algorithm resides.

        Parameters:
            total_timesteps - the total number of timesteps to train for

        Return:
            None
        """
        print(f"Learning... Running {self.max_timesteps_per_episode} timesteps per episode, ", end='')
        print(f"{self.timesteps_per_batch} timesteps per batch for a total of {total_timesteps} timesteps")
        t_so_far = 0 # Timesteps simulated so far
        i_so_far = 0 # Iterations ran so far
        while t_so_far < total_timesteps:                                       # ALG STEP 2
            # Autobots, roll out (just kidding, we're collecting our batch simulations here)
            batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout()     # ALG STEP 3

            # Calculate how many timesteps we collected this batch
            t_so_far += np.sum(batch_lens)

            # Increment the number of iterations
            i_so_far += 1

            # Logging timesteps so far and iterations so far
            self.logger['t_so_far'] = t_so_far
            self.logger['i_so_far'] = i_so_far

            # Calculate advantage at k-th iteration
            V, _ = self.evaluate(batch_obs, batch_acts)
            A_k = batch_rtgs - V.detach()                                       # ALG STEP 5

            # One of the only tricks I use that isn't in the pseudocode. Normalizing advantages
            # isn't theoretically necessary, but in practice it decreases the variance of
            # our advantages and makes convergence much more stable and faster. I added this because
            # solving some environments was too unstable without it.
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

            # This is the loop where we update our network for some n epochs
            for _ in range(self.n_updates_per_iteration):                       # ALG STEP 6 & 7
                # Calculate V_phi and pi_theta(a_t | s_t)
                V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

                # Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t)
                # NOTE: we just subtract the logs, which is the same as
                # dividing the values and then canceling the log with e^log.
                # For why we use log probabilities instead of actual probabilities,
                # here's a great explanation:
                # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms
                # TL;DR makes gradient ascent easier behind the scenes.
                ratios = torch.exp(curr_log_probs - batch_log_probs)

                # Calculate surrogate losses.
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

                # Calculate actor and critic losses.
                # NOTE: we take the negative min of the surrogate losses because we're trying to maximize
                # the performance function, but Adam minimizes the loss. So minimizing the negative
                # performance function maximizes it.
                actor_loss = (-torch.min(surr1, surr2)).mean()
                critic_loss = nn.MSELoss()(V, batch_rtgs)

                # Calculate gradients and perform backward propagation for actor network
                self.actor_optim.zero_grad()
                actor_loss.backward(retain_graph=True)
                self.actor_optim.step()

                # Calculate gradients and perform backward propagation for critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # Log actor loss
                self.logger['actor_losses'].append(actor_loss.detach())

            # Print a summary of our training so far
            self._log_summary()

            # Save our model if it's time
            if i_so_far % self.save_freq == 0:
                torch.save(self.actor.state_dict(), './weights/ppo_actor.pth')
                torch.save(self.critic.state_dict(), './weights/ppo_critic.pth')

    def rollout(self):
        """
        Too many transformers references, I'm sorry. This is where we collect the batch of data
        from simulation. Since this is an on-policy algorithm, we'll need to collect a fresh batch
        of data each time we iterate the actor/critic networks.

        Parameters:
            None

        Return:
            batch_obs - the observations collected this batch. Shape: (number of timesteps, dimension of observation)
            batch_acts - the actions collected this batch. Shape: (number of timesteps, dimension of action)
            batch_log_probs - the log probabilities of each action taken this batch. Shape: (number of timesteps)
            batch_rtgs - the Rewards-To-Go of each timestep in this batch. Shape: (number of timesteps)
            batch_lens - the lengths of each episode this batch. Shape: (number of episodes)
        """
        # Batch data. For more details, check function header.
        batch_obs = []
        batch_acts = []
        batch_log_probs = []
        batch_rews = []
        batch_rtgs = []
        batch_lens = []

        # Episodic data. Keeps track of rewards per episode, will get cleared
        # upon each new episode
        ep_rews = []

        t = 0 # Keeps track of how many timesteps we've run so far this batch

        # Keep simulating until we've run more than or equal to specified timesteps per batch
        while t < self.timesteps_per_batch:
            ep_rews = [] # rewards collected per episode

            # Reset the environment. Note that obs is short for observation.
            obs, _ = self.env.reset()
            done = False

            # Run an episode for a maximum of max_timesteps_per_episode timesteps
            for ep_t in range(self.max_timesteps_per_episode):
                # If render is specified, render the environment
                if self.render and (self.logger['i_so_far'] % self.render_every_i == 0) and len(batch_lens) == 0:
                    self.env.render()

                t += 1 # Increment timesteps ran this batch so far

                # Track observations in this batch
                batch_obs.append(obs)

                # Calculate action and make a step in the env.
                # Note that rew is short for reward.
                action, log_prob = self.get_action(obs)
                obs, rew, terminated, truncated, _ = self.env.step(action)

                # Don't really care about the difference between terminated or truncated in this, so just combine them
                done = terminated | truncated

                # Track recent reward, action, and action log probability
                ep_rews.append(rew)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)

                # If the environment tells us the episode is terminated, break
                if done:
                    break

            # Track episodic lengths and rewards
            batch_lens.append(ep_t + 1)
            batch_rews.append(ep_rews)

        # Reshape data as tensors in the shape specified in function description, before returning
        batch_obs = torch.tensor(batch_obs, dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.float)
        batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)
        batch_rtgs = self.compute_rtgs(batch_rews)                              # ALG STEP 4

        # Log the episodic returns and episodic lengths in this batch.
        self.logger['batch_rews'] = batch_rews
        self.logger['batch_lens'] = batch_lens

        return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens

    def compute_rtgs(self, batch_rews):
        """
        Compute the Reward-To-Go of each timestep in a batch given the rewards.

        Parameters:
            batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode)

        Return:
            batch_rtgs - the rewards to go, Shape: (number of timesteps in batch)
        """
        # The rewards-to-go (rtg) per episode per batch to return.
        # The shape will be (num timesteps per episode)
        batch_rtgs = []

        # Iterate through each episode
        for ep_rews in reversed(batch_rews):

            discounted_reward = 0 # The discounted reward so far

            # Iterate through all rewards in the episode. We go backwards for smoother calculation of each
            # discounted return (think about why it would be harder starting from the beginning)
            for rew in reversed(ep_rews):
                discounted_reward = rew + discounted_reward * self.gamma
                batch_rtgs.insert(0, discounted_reward)

        # Convert the rewards-to-go into a tensor
        batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float)

        return batch_rtgs

    def get_action(self, obs):
        """
        Queries an action from the actor network, should be called from rollout.

        Parameters:
            obs - the observation at the current timestep

        Return:
            action - the action to take, as a numpy array
            log_prob - the log probability of the selected action in the distribution
        """
        # Query the actor network for a mean action
        mean = self.actor(obs)

        # Create a distribution with the mean action and std from the covariance matrix above.
        # For more information on how this distribution works, check out Andrew Ng's lecture on it:
        # https://www.youtube.com/watch?v=JjB58InuTqM
        dist = MultivariateNormal(mean, self.cov_mat)

        # Sample an action from the distribution
        action = dist.sample()

        # Calculate the log probability for that action
        log_prob = dist.log_prob(action)

        # Return the sampled action and the log probability of that action in our distribution
        return action.detach().numpy(), log_prob.detach()

    def evaluate(self, batch_obs, batch_acts):
        """
        Estimate the values of each observation, and the log probs of
        each action in the most recent batch with the most recent
        iteration of the actor network. Should be called from learn.

        Parameters:
            batch_obs - the observations from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of observation)
            batch_acts - the actions from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of action)

        Return:
            V - the predicted values of batch_obs
            log_probs - the log probabilities of the actions taken in batch_acts given batch_obs
        """
        # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
        V = self.critic(batch_obs).squeeze()

        # Calculate the log probabilities of batch actions using most recent actor network.
        # This segment of code is similar to that in get_action()
        mean = self.actor(batch_obs)
        dist = MultivariateNormal(mean, self.cov_mat)
        log_probs = dist.log_prob(batch_acts)

        # Return the value vector V of each observation in the batch
        # and log probabilities log_probs of each action in the batch
        return V, log_probs

    def _init_hyperparameters(self, hyperparameters):
        """
        Initialize default and custom values for hyperparameters

        Parameters:
            hyperparameters - the extra arguments included when creating the PPO model, should only include
                              hyperparameters defined below with custom values.

        Return:
            None
        """
        # Initialize default values for hyperparameters
        # Algorithm hyperparameters
        self.timesteps_per_batch = 4800                 # Number of timesteps to run per batch
        self.max_timesteps_per_episode = 1600           # Max number of timesteps per episode
        self.n_updates_per_iteration = 5                # Number of times to update actor/critic per iteration
        self.lr = 0.005                                 # Learning rate of actor optimizer
        self.gamma = 0.95                               # Discount factor to be applied when calculating Rewards-To-Go
        self.clip = 0.2                                 # Recommended 0.2, helps define the threshold to clip the ratio during SGA

        # Miscellaneous parameters
        self.render = True                              # If we should render during rollout
        self.render_every_i = 10                        # Only render every n iterations
        self.save_freq = 10                             # How often we save in number of iterations
        self.seed = None                                # Sets the seed of our program, used for reproducibility of results

        # Change any default values to custom values for specified hyperparameters
        for param, val in hyperparameters.items():
            exec('self.' + param + ' = ' + str(val))

        # Sets the seed if specified
        if self.seed != None:
            # Check if our seed is valid first
            assert(type(self.seed) == int)

            # Set the seed
            torch.manual_seed(self.seed)
            print(f"Successfully set seed to {self.seed}")

    def _log_summary(self):
        """
        Print to stdout what we've logged so far in the most recent batch.

        Parameters:
            None

        Return:
            None
        """
        # Calculate logging values. I use a few python shortcuts to calculate each value
        # without explaining since it's not too important to PPO; feel free to look it over,
        # and if you have any questions you can email me (look at bottom of README)
        delta_t = self.logger['delta_t']
        self.logger['delta_t'] = time.time_ns()
        delta_t = (self.logger['delta_t'] - delta_t) / 1e9
        delta_t = str(round(delta_t, 2))

        t_so_far = self.logger['t_so_far']
        i_so_far = self.logger['i_so_far']
        avg_ep_lens = np.mean(self.logger['batch_lens'])
        avg_ep_rews = np.mean([np.sum(ep_rews) for ep_rews in self.logger['batch_rews']])
        avg_actor_loss = np.mean([losses.float().mean() for losses in self.logger['actor_losses']])

        # Round decimal places for more aesthetic logging messages
        avg_ep_lens = str(round(avg_ep_lens, 2))
        avg_ep_rews = str(round(avg_ep_rews, 2))
        avg_actor_loss = str(round(avg_actor_loss, 5))

        # Print logging statements
        print(flush=True)
        print(f"-------------------- Iteration #{i_so_far} --------------------", flush=True)
        print(f"Average Episodic Length: {avg_ep_lens}", flush=True)
        print(f"Average Episodic Return: {avg_ep_rews}", flush=True)
        print(f"Average Loss: {avg_actor_loss}", flush=True)
        print(f"Timesteps So Far: {t_so_far}", flush=True)
        print(f"Iteration took: {delta_t} secs", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)

        # Reset batch-specific logging data
        self.logger['batch_lens'] = []
        self.logger['batch_rews'] = []
        self.logger['actor_losses'] = []
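A minimal end-to-end sketch (not part of the diff): it briefly exercises PPO.learn on Pendulum-v1, the stock environment that main.py keeps commented out, just to see a few logged iterations; the small budget and hyperparameter overrides here are illustrative only.

    # Hypothetical quick run, assumed to be executed from inside PPO/
    import os
    import gymnasium as gym
    from network import FeedForwardNN
    from ppo import PPO

    os.makedirs('./weights', exist_ok=True)   # learn() saves checkpoints into ./weights/
    env = gym.make('Pendulum-v1')
    model = PPO(policy_class=FeedForwardNN, env=env,
                timesteps_per_batch=2048, max_timesteps_per_episode=200, render=False)
    model.learn(total_timesteps=10_000)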