diff --git a/.gitignore b/.gitignore
index 5d381cc..07940b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,9 @@ __pycache__/
 # C extensions
 *.so
 
+# PyTorch weights
+weights/
+
 # Distribution / packaging
 .Python
 build/
diff --git a/PPO/arguments.py b/PPO/arguments.py
new file mode 100644
index 0000000..9a8e17e
--- /dev/null
+++ b/PPO/arguments.py
@@ -0,0 +1,27 @@
+"""
+    This file contains the arguments to parse at command line.
+    File main.py will call get_args, which returns the parsed arguments.
+"""
+import argparse
+
+def get_args():
+    """
+        Description:
+            Parses arguments at command line.
+
+        Parameters:
+            None
+
+        Return:
+            args - the arguments parsed
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--mode', dest='mode', type=str, default='train')              # can be 'train' or 'test'
+    parser.add_argument('--actor_model', dest='actor_model', type=str, default='')     # your actor model filename
+    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')   # your critic model filename
+
+    args = parser.parse_args()
+
+    return args
diff --git a/PPO/env.py b/PPO/env.py
new file mode 100644
index 0000000..51e2e9e
--- /dev/null
+++ b/PPO/env.py
@@ -0,0 +1,295 @@
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+
+
+class PartitionMazeEnv(gym.Env):
+    """
+    Custom environment with staged decisions:
+    Phase 0: region partitioning (4 steps; each step outputs one scalar that fixes a vertical or horizontal cut).
+             The order is: step 1 outputs c₁, step 2 outputs c₂, step 3 outputs r₁, step 4 outputs r₂.
+             After discretization the values are restricted to {0, 0.1, 0.2, …, 0.9} (0 means no cut).
+    Phase 1: each vehicle is dispatched from the region center to the center of a chosen sub-region.
+    Phase 2: path planning (maze walking): vehicles move up/down/left/right over the grid of sub-regions
+             until every target cell is covered or the step limit is reached.
+    """
+
+    def __init__(self, config=None):
+        super(PartitionMazeEnv, self).__init__()
+        # Fleet parameters
+        self.H = 20  # region height; grid points are 25 m apart (one unit of distance)
+        self.W = 30  # region width
+        self.num_cars = 2  # number of car-nest-drone systems
+
+        # Time coefficients (seconds; one photo per grid cell)
+        self.flight_time_factor = 3  # flight time per photo; the UAV flies at 9.5 m/s and shoots every 3 s
+        self.comp_uav_factor = 5  # on-board computation time per photo, 5 s
+        self.trans_time_factor = 0.3  # transmission time per photo, 0.3 s
+        self.car_move_time_factor = 2 * 50  # TODO car travel time per unit distance, 2 s, with an added scaling factor
+        self.comp_bs_factor = 5  # computation time per photo at the nest (base station)
+
+        # Energy parameters
+        self.flight_energy_factor = 0.05  # unit: minutes per photo
+        self.comp_energy_factor = 0.05  # computation energy still needs to be re-estimated
+        self.trans_energy_factor = 0.0025
+        self.battery_capacity = 10  # flight-only endurance is 30 minutes
+
+        self.phase = 0  # phase control: 0 = partitioning, 1 = maze initialization, 2 = maze walking
+        self.partition_step = 0  # partitioning step counter, range 0~4
+        # TODO the number of cuts is currently fixed at 4 (2 + 2)
+        self.partition_values = np.zeros(
+            4, dtype=np.float32)  # stores c₁, c₂, r₁, r₂
+
+        # Action space: every action is a 1-D continuous value in [0, 1]
+        self.action_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+
+        # Observation space: an 8-D vector
+        # TODO the returned state currently only contains position coordinates
+        # Phase 0 state: the first 4 dims are the cut values decided so far (undecided entries are 0)
+        # Phase 1 state: vehicle positions (2-D each)
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(8,), dtype=np.float32)
+
+        # Partitioning-phase variables
+        self.vertical_cuts = []  # vertical cut positions (c₁, c₂); 0 means no cut
+        self.horizontal_cuts = []  # horizontal cut positions (r₁, r₂)
+        # TODO can region_centers be simplified to use fewer parameters?
+        self.region_centers = []  # centers of the sub-regions after partitioning (normalized coordinates)
+
+        # Path-planning-phase variables
+        self.MAX_STEPS = 50  # step limit for the maze phase
+        self.step_count = 0
+        self.rectangles = {}
+        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
+        self.car_traj = [[] for _ in range(self.num_cars)]
+        self.current_car_index = 0
+
+    def reset(self, seed=None, options=None):
+        # Reset all variables and return to the partitioning phase (phase 0)
+        self.phase = 0
+        self.partition_step = 0
+        self.partition_values = np.zeros(4, dtype=np.float32)
+        self.vertical_cuts = []
+        self.horizontal_cuts = []
+        self.region_centers = []
+        self.step_count = 0
+        self.rectangles = {}
+        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
+        self.car_traj = [[] for _ in range(self.num_cars)]
+        self.current_car_index = 0
+        # State: the first 4 dims are partition_values, the rest are padded with zeros
+        state = np.concatenate(
+            [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        return state, {}
+
+    def step(self, action):
+        # In every phase the action is a 1-D continuous value; take action[0]
+        a = float(action[0])
+
+        if self.phase == 0:
+            # Partitioning phase: each step outputs one scalar, discretized to {0, 0.1, ..., 0.9}
+            disc_val = np.floor(a * 10) / 10.0
+            disc_val = np.clip(disc_val, 0.0, 0.9)
+            self.partition_values[self.partition_step] = disc_val
+            self.partition_step += 1
+
+            # Build the current state: the first partition_step entries are the decided cut values,
+            # the rest are 0, padded with zeros in place of the car positions
+            state = np.concatenate(
+                [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+
+            # If fewer than 4 steps have been taken we stay in the partitioning phase: no reward, done is False
+            if self.partition_step < 4:
+                return state, 0.0, False, False, {}
+            else:
+                # After 4 steps, compute the cut boundaries:
+                # filter out zeros, deduplicate and sort
+                vert = sorted(set(v for v in self.partition_values[:len(
+                    self.partition_values) // 2] if v > 0))
+                horiz = sorted(set(v for v in self.partition_values[len(
+                    self.partition_values) // 2:] if v > 0))
+                self.vertical_cuts = vert if vert else []
+                self.horizontal_cuts = horiz if horiz else []
+
+                # Boundaries always include 0 and 1
+                v_boundaries = [0.0] + self.vertical_cuts + [1.0]
+                h_boundaries = [0.0] + self.horizontal_cuts + [1.0]
+
+                # Check whether the partition is feasible and compute the offloading ratio ρ of each sub-region
+                valid_partition = True
+                for i in range(len(h_boundaries) - 1):
+                    for j in range(len(v_boundaries) - 1):
+                        d = (v_boundaries[j+1] - v_boundaries[j]) * self.W * \
+                            (h_boundaries[i+1] - h_boundaries[i]) * self.H
+                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
+                            (self.comp_uav_factor - self.trans_time_factor)
+                        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
+                            (self.comp_energy_factor * d -
+                             self.trans_energy_factor * d)
+                        if rho_energy_limit < 0:
+                            valid_partition = False
+                            break
+                        rho = min(rho_time_limit, rho_energy_limit)
+
+                        flight_time = self.flight_time_factor * d
+                        comp_time = self.comp_uav_factor * rho * d
+                        trans_time = self.trans_time_factor * (1 - rho) * d
+                        comp_bs_time = self.comp_bs_factor * (1 - rho) * d
+
+                        self.rectangles[(i, j)] = {
+                            # 'r1': h_boundaries[i], 'r2': h_boundaries[i+1], 'c1': v_boundaries[j], 'c2': v_boundaries[j+1],
+                            'd': d,
+                            'rho': rho,
+                            'flight_time': flight_time,
+                            'comp_time': comp_time,
+                            'trans_time': trans_time,
+                            'comp_bs_time': comp_bs_time,
+                            'is_visited': False
+                            # 'center': (center_r, center_c)
+                        }
+                    if not valid_partition:
+                        break
+
+                if not valid_partition:
+                    reward = -100
+                    state = np.concatenate(
+                        [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    return state, reward, True, False, {}
+                else:
+                    reward = 10
+
+                    # Enter phase 1: maze initialization
+                    self.phase = 1
+                    # Compute the center of every sub-region from the cut boundaries
+                    self.region_centers = []
+                    for i in range(len(h_boundaries) - 1):
+                        for j in range(len(v_boundaries) - 1):
+                            center_x = (
+                                v_boundaries[j] + v_boundaries[j+1]) / 2.0
+                            center_y = (
+                                h_boundaries[i] + h_boundaries[i+1]) / 2.0
+                            self.region_centers.append((center_x, center_y))
+                    # Store the cut boundaries for later grid mapping
+                    self.v_boundaries = v_boundaries
+                    self.h_boundaries = h_boundaries
+                    # Initialize the maze phase: reset the step counter and build visited_grid with one entry per cell
+                    self.step_count = 0
+                    self.visited_grid = np.zeros(
+                        (len(v_boundaries) - 1) * (len(h_boundaries) - 1), dtype=np.int32)
+
+                    state = np.concatenate(
+                        [self.partition_values, np.array(self.car_pos).flatten()])
+                    return state, reward, False, False, {}
+
+        elif self.phase == 1:
+            # Phase 1: initialize the maze by sending every vehicle from the region center
+            # to the center of a chosen sub-region
+            # Make sure the action is in [0, 1], then map it to an index in 0~(num_regions-1)
+            num_regions = len(self.region_centers)
+            target_region_index = int(np.floor(a * num_regions))
+            target_region_index = np.clip(
+                target_region_index, 0, num_regions - 1)
+
+            # Move every vehicle to the target sub-region in turn
+            for car_idx in range(self.num_cars):
+                target_position = np.array(
+                    self.region_centers[target_region_index])  # center of the target region
+
+                # Update this vehicle's position
+                self.car_pos[car_idx] = target_position
+                # Accumulate steps
+                self.step_count += 1
+                self.car_traj[car_idx].append(target_position)  # record each vehicle's trajectory
+
+            # Enter phase 2: maze walking
+            self.phase = 2
+
+            # Observation
+            state = np.concatenate(
+                [self.partition_values, np.array(self.car_pos).flatten()])
+            return state, 0.0, False, False, {}
+
+        elif self.phase == 2:
+            # Phase 2: path planning (maze walking)
+            current_car = self.current_car_index
+            reward = 0.0
+
+            # The current action a is 1-D continuous; map it to one of four directions (or stay)
+            if a < 0.2:
+                move_dir = 'up'
+            elif a < 0.4:
+                move_dir = 'down'
+            elif a < 0.6:
+                move_dir = 'left'
+            elif a < 0.8:
+                move_dir = 'right'
+            else:
+                move_dir = 'stay'
+
+            current_row, current_col = self.car_pos[current_car]
+
+            # Initialize the new row/column with the current values
+            new_row, new_col = current_row, current_col
+
+            if move_dir == 'up' and current_row < len(self.h_boundaries) - 1:
+                new_row = current_row + 1
+            elif move_dir == 'down' and current_row > 0:
+                new_row = current_row - 1
+            elif move_dir == 'left' and current_col > 0:
+                new_col = current_col - 1
+            elif move_dir == 'right' and current_col < len(self.v_boundaries) - 1:
+                new_col = current_col + 1
+            # If the move is illegal, or the action is 'stay', keep the current position
+            # TODO add a penalty for illegal moves
+
+            # Update the vehicle position
+            self.car_pos[current_car] = [new_row, new_col]
+            if new_row != current_row or new_col != current_col:
+                self.car_traj[current_car].append(np.array([new_row, new_col]))
+            self.step_count += 1
+            self.current_car_index = (
+                self.current_car_index + 1) % self.num_cars
+
+            # Update the visit flag: mark the new cell as visited
+            self.rectangles[(new_row, new_col)]['is_visited'] = True
+
+            # Observation
+            state = np.concatenate(
+                [self.partition_values, np.array(self.car_pos).flatten()])
+
+            # Episode termination: every cell has been visited or the step limit is reached
+            all_visited = all(rec['is_visited'] for rec in self.rectangles.values())
+            done = all_visited or (self.step_count >= self.MAX_STEPS)
+            if done and all_visited:
+                # Full coverage: compute each motorcade's execution time from its trajectory
+                T = max([self._compute_motorcade_time(idx)
+                         for idx in range(self.num_cars)])
+                reward += 10.0  # TODO compare the reward against a greedy baseline
+            elif done and self.step_count >= self.MAX_STEPS:
+                reward -= 100
+
+            return state, reward, done, False, {}
+
+    def _compute_motorcade_time(self, idx):
+        # Trajectory points are treated as (row, col) keys into self.rectangles
+        flight_time = sum(self.rectangles[tuple(point)]['flight_time']
+                          for point in self.car_traj[idx])
+        bs_time = sum(self.rectangles[tuple(point)]['comp_bs_time']
+                      for point in self.car_traj[idx])
+
+        # Car travel time: first append/prepend the center of the whole region to the trajectory
+        self.car_traj[idx].append([0.5, 0.5])
+        self.car_traj[idx].insert(0, [0.5, 0.5])
+        car_time = 0.0
+        for i in range(len(self.car_traj[idx]) - 1):
+            first_point = np.asarray(self.car_traj[idx][i], dtype=np.float64)
+            second_point = np.asarray(self.car_traj[idx][i + 1], dtype=np.float64)
+            car_time += np.linalg.norm(first_point - second_point) * \
+                self.H * self.W * self.car_move_time_factor
+
+        return max(car_time + flight_time, bs_time)
+
+    def render(self):
+        if self.phase == 0:
+            print("Phase 0: Partitioning.")
+            print(f"Partition step: {self.partition_step}")
+            print(f"Partition values so far: {self.partition_values}")
+        elif self.phase in (1, 2):
+            print("Phase 1/2: Path planning (maze).")
+            print(f"Visited grid: {self.visited_grid}")
+            print(f"Step count: {self.step_count}")
diff --git a/PPO/eval_policy.py b/PPO/eval_policy.py
new file mode 100644
index 0000000..a3323de
--- /dev/null
+++ b/PPO/eval_policy.py
@@ -0,0 +1,103 @@
+"""
+    This file is
used only to evaluate our trained policy/actor after + training in main.py with ppo.py. I wrote this file to demonstrate + that our trained policy exists independently of our learning algorithm, + which resides in ppo.py. Thus, we can test our trained policy without + relying on ppo.py. +""" + +def _log_summary(ep_len, ep_ret, ep_num): + """ + Print to stdout what we've logged so far in the most recent episode. + + Parameters: + None + + Return: + None + """ + # Round decimal places for more aesthetic logging messages + ep_len = str(round(ep_len, 2)) + ep_ret = str(round(ep_ret, 2)) + + # Print logging statements + print(flush=True) + print(f"-------------------- Episode #{ep_num} --------------------", flush=True) + print(f"Episodic Length: {ep_len}", flush=True) + print(f"Episodic Return: {ep_ret}", flush=True) + print(f"------------------------------------------------------", flush=True) + print(flush=True) + +def rollout(policy, env, render): + """ + Returns a generator to roll out each episode given a trained policy and + environment to test on. + + Parameters: + policy - The trained policy to test + env - The environment to evaluate the policy on + render - Specifies whether to render or not + + Return: + A generator object rollout, or iterable, which will return the latest + episodic length and return on each iteration of the generator. + + Note: + If you're unfamiliar with Python generators, check this out: + https://wiki.python.org/moin/Generators + If you're unfamiliar with Python "yield", check this out: + https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do + """ + # Rollout until user kills process + while True: + obs, _ = env.reset() + done = False + + # number of timesteps so far + t = 0 + + # Logging data + ep_len = 0 # episodic length + ep_ret = 0 # episodic return + + while not done: + t += 1 + + # Render environment if specified, off by default + if render: + env.render() + + # Query deterministic action from policy and run it + action = policy(obs).detach().numpy() + obs, rew, terminated, truncated, _ = env.step(action) + done = terminated | truncated + + # Sum all episodic rewards as we go along + ep_ret += rew + + # Track episodic length + ep_len = t + + # returns episodic length and return in this iteration + yield ep_len, ep_ret + +def eval_policy(policy, env, render=False): + """ + The main function to evaluate our policy with. It will iterate a generator object + "rollout", which will simulate each episode and return the most recent episode's + length and return. We can then log it right after. And yes, eval_policy will run + forever until you kill the process. + + Parameters: + policy - The trained policy to test, basically another name for our actor model + env - The environment to test the policy on + render - Whether we should render our episodes. False by default. + + Return: + None + + NOTE: To learn more about generators, look at rollout's function description + """ + # Rollout with the policy and environment, and log each episode's data + for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render)): + _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num) \ No newline at end of file diff --git a/PPO/main.py b/PPO/main.py new file mode 100644 index 0000000..bf2d929 --- /dev/null +++ b/PPO/main.py @@ -0,0 +1,123 @@ +""" + This file is the executable for running PPO. 
It is based on this medium article: + https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8 +""" + +import gymnasium as gym +import sys +import torch + +from arguments import get_args +from ppo import PPO +from network import FeedForwardNN +from eval_policy import eval_policy +from env import PartitionMazeEnv + +def train(env, hyperparameters, actor_model, critic_model): + """ + Trains the model. + + Parameters: + env - the environment to train on + hyperparameters - a dict of hyperparameters to use, defined in main + actor_model - the actor model to load in if we want to continue training + critic_model - the critic model to load in if we want to continue training + + Return: + None + """ + print(f"Training", flush=True) + + # Create a model for PPO. + model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters) + + # Tries to load in an existing actor/critic model to continue training on + if actor_model != '' and critic_model != '': + print(f"Loading in {actor_model} and {critic_model}...", flush=True) + model.actor.load_state_dict(torch.load(actor_model)) + model.critic.load_state_dict(torch.load(critic_model)) + print(f"Successfully loaded.", flush=True) + elif actor_model != '' or critic_model != '': # Don't train from scratch if user accidentally forgets actor/critic model + print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!") + sys.exit(0) + else: + print(f"Training from scratch.", flush=True) + + # Train the PPO model with a specified total timesteps + # NOTE: You can change the total timesteps here, I put a big number just because + # you can kill the process whenever you feel like PPO is converging + model.learn(total_timesteps=200_000_000) + +def test(env, actor_model): + """ + Tests the model. + + Parameters: + env - the environment to test the policy on + actor_model - the actor model to load in + + Return: + None + """ + print(f"Testing {actor_model}", flush=True) + + # If the actor model is not specified, then exit + if actor_model == '': + print(f"Didn't specify model file. Exiting.", flush=True) + sys.exit(0) + + # Extract out dimensions of observation and action spaces + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] + + # Build our policy the same way we build our actor model in PPO + policy = FeedForwardNN(obs_dim, act_dim) + + # Load in the actor model saved by the PPO algorithm + policy.load_state_dict(torch.load(actor_model)) + + # Evaluate our policy with a separate module, eval_policy, to demonstrate + # that once we are done training the model/policy with ppo.py, we no longer need + # ppo.py since it only contains the training algorithm. The model/policy itself exists + # independently as a binary file that can be loaded in with torch. + eval_policy(policy=policy, env=env, render=True) + +def main(args): + """ + The main function to run. + + Parameters: + args - the arguments parsed from command line + + Return: + None + """ + # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of + # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here. 
+ # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters + hyperparameters = { + 'timesteps_per_batch': 2048, + 'max_timesteps_per_episode': 200, + 'gamma': 0.99, + 'n_updates_per_iteration': 10, + 'lr': 3e-4, + 'clip': 0.2, + 'render': True, + 'render_every_i': 10 + } + + # Creates the environment we'll be running. If you want to replace with your own + # custom environment, note that it must inherit Gym and have both continuous + # observation and action spaces. + # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array') + env = PartitionMazeEnv() + + # Train or test, depending on the mode specified + if args.mode == 'train': + train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model) + else: + test(env=env, actor_model=args.actor_model) + +if __name__ == '__main__': + args = get_args() # Parse arguments from command line + main(args) diff --git a/PPO/network.py b/PPO/network.py new file mode 100644 index 0000000..2a0cf87 --- /dev/null +++ b/PPO/network.py @@ -0,0 +1,50 @@ +""" + This file contains a neural network module for us to + define our actor and critic networks in PPO. +""" + +import torch +from torch import nn +import torch.nn.functional as F +import numpy as np + +class FeedForwardNN(nn.Module): + """ + A standard in_dim-64-64-out_dim Feed Forward Neural Network. + """ + def __init__(self, in_dim, out_dim): + """ + Initialize the network and set up the layers. + + Parameters: + in_dim - input dimensions as an int + out_dim - output dimensions as an int + + Return: + None + """ + super(FeedForwardNN, self).__init__() + + self.layer1 = nn.Linear(in_dim, 64) + self.layer2 = nn.Linear(64, 64) + self.layer3 = nn.Linear(64, out_dim) + + def forward(self, obs): + """ + Runs a forward pass on the neural network. + + Parameters: + obs - observation to pass as input + + Return: + output - the output of our forward pass + """ + # Convert observation to tensor if it's a numpy array + if isinstance(obs, np.ndarray): + obs = torch.tensor(obs, dtype=torch.float) + + activation1 = F.relu(self.layer1(obs)) + activation2 = F.relu(self.layer2(activation1)) + output = self.layer3(activation2) + + return output diff --git a/PPO/ppo.py b/PPO/ppo.py new file mode 100644 index 0000000..3ac2047 --- /dev/null +++ b/PPO/ppo.py @@ -0,0 +1,402 @@ +""" + The file contains the PPO class to train with. + NOTE: All "ALG STEP"s are following the numbers from the original PPO pseudocode. + It can be found here: https://spinningup.openai.com/en/latest/_images/math/e62a8971472597f4b014c2da064f636ffe365ba3.svg +""" + +import gymnasium as gym +import time + +import numpy as np +import time +import torch +import torch.nn as nn +from torch.optim import Adam +from torch.distributions import MultivariateNormal + +class PPO: + """ + This is the PPO class we will use as our model in main.py + """ + def __init__(self, policy_class, env, **hyperparameters): + """ + Initializes the PPO model, including hyperparameters. + + Parameters: + policy_class - the policy class to use for our actor/critic networks. + env - the environment to train on. + hyperparameters - all extra arguments passed into PPO that should be hyperparameters. 
+ + Returns: + None + """ + # Make sure the environment is compatible with our code + assert(type(env.observation_space) == gym.spaces.Box) + assert(type(env.action_space) == gym.spaces.Box) + + # Initialize hyperparameters for training with PPO + self._init_hyperparameters(hyperparameters) + + # Extract environment information + self.env = env + self.obs_dim = env.observation_space.shape[0] + self.act_dim = env.action_space.shape[0] + + # Initialize actor and critic networks + self.actor = policy_class(self.obs_dim, self.act_dim) # ALG STEP 1 + self.critic = policy_class(self.obs_dim, 1) + + # Initialize optimizers for actor and critic + self.actor_optim = Adam(self.actor.parameters(), lr=self.lr) + self.critic_optim = Adam(self.critic.parameters(), lr=self.lr) + + # Initialize the covariance matrix used to query the actor for actions + self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5) + self.cov_mat = torch.diag(self.cov_var) + + # This logger will help us with printing out summaries of each iteration + self.logger = { + 'delta_t': time.time_ns(), + 't_so_far': 0, # timesteps so far + 'i_so_far': 0, # iterations so far + 'batch_lens': [], # episodic lengths in batch + 'batch_rews': [], # episodic returns in batch + 'actor_losses': [], # losses of actor network in current iteration + } + + def learn(self, total_timesteps): + """ + Train the actor and critic networks. Here is where the main PPO algorithm resides. + + Parameters: + total_timesteps - the total number of timesteps to train for + + Return: + None + """ + print(f"Learning... Running {self.max_timesteps_per_episode} timesteps per episode, ", end='') + print(f"{self.timesteps_per_batch} timesteps per batch for a total of {total_timesteps} timesteps") + t_so_far = 0 # Timesteps simulated so far + i_so_far = 0 # Iterations ran so far + while t_so_far < total_timesteps: # ALG STEP 2 + # Autobots, roll out (just kidding, we're collecting our batch simulations here) + batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout() # ALG STEP 3 + + # Calculate how many timesteps we collected this batch + t_so_far += np.sum(batch_lens) + + # Increment the number of iterations + i_so_far += 1 + + # Logging timesteps so far and iterations so far + self.logger['t_so_far'] = t_so_far + self.logger['i_so_far'] = i_so_far + + # Calculate advantage at k-th iteration + V, _ = self.evaluate(batch_obs, batch_acts) + A_k = batch_rtgs - V.detach() # ALG STEP 5 + + # One of the only tricks I use that isn't in the pseudocode. Normalizing advantages + # isn't theoretically necessary, but in practice it decreases the variance of + # our advantages and makes convergence much more stable and faster. I added this because + # solving some environments was too unstable without it. + A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10) + + # This is the loop where we update our network for some n epochs + for _ in range(self.n_updates_per_iteration): # ALG STEP 6 & 7 + # Calculate V_phi and pi_theta(a_t | s_t) + V, curr_log_probs = self.evaluate(batch_obs, batch_acts) + + # Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t) + # NOTE: we just subtract the logs, which is the same as + # dividing the values and then canceling the log with e^log. + # For why we use log probabilities instead of actual probabilities, + # here's a great explanation: + # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms + # TL;DR makes gradient ascent easier behind the scenes. 
+ ratios = torch.exp(curr_log_probs - batch_log_probs) + + # Calculate surrogate losses. + surr1 = ratios * A_k + surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k + + # Calculate actor and critic losses. + # NOTE: we take the negative min of the surrogate losses because we're trying to maximize + # the performance function, but Adam minimizes the loss. So minimizing the negative + # performance function maximizes it. + actor_loss = (-torch.min(surr1, surr2)).mean() + critic_loss = nn.MSELoss()(V, batch_rtgs) + + # Calculate gradients and perform backward propagation for actor network + self.actor_optim.zero_grad() + actor_loss.backward(retain_graph=True) + self.actor_optim.step() + + # Calculate gradients and perform backward propagation for critic network + self.critic_optim.zero_grad() + critic_loss.backward() + self.critic_optim.step() + + # Log actor loss + self.logger['actor_losses'].append(actor_loss.detach()) + + # Print a summary of our training so far + self._log_summary() + + # Save our model if it's time + if i_so_far % self.save_freq == 0: + torch.save(self.actor.state_dict(), './weights/ppo_actor.pth') + torch.save(self.critic.state_dict(), './weights/ppo_critic.pth') + + def rollout(self): + """ + Too many transformers references, I'm sorry. This is where we collect the batch of data + from simulation. Since this is an on-policy algorithm, we'll need to collect a fresh batch + of data each time we iterate the actor/critic networks. + + Parameters: + None + + Return: + batch_obs - the observations collected this batch. Shape: (number of timesteps, dimension of observation) + batch_acts - the actions collected this batch. Shape: (number of timesteps, dimension of action) + batch_log_probs - the log probabilities of each action taken this batch. Shape: (number of timesteps) + batch_rtgs - the Rewards-To-Go of each timestep in this batch. Shape: (number of timesteps) + batch_lens - the lengths of each episode this batch. Shape: (number of episodes) + """ + # Batch data. For more details, check function header. + batch_obs = [] + batch_acts = [] + batch_log_probs = [] + batch_rews = [] + batch_rtgs = [] + batch_lens = [] + + # Episodic data. Keeps track of rewards per episode, will get cleared + # upon each new episode + ep_rews = [] + + t = 0 # Keeps track of how many timesteps we've run so far this batch + + # Keep simulating until we've run more than or equal to specified timesteps per batch + while t < self.timesteps_per_batch: + ep_rews = [] # rewards collected per episode + + # Reset the environment. sNote that obs is short for observation. + obs, _ = self.env.reset() + done = False + + # Run an episode for a maximum of max_timesteps_per_episode timesteps + for ep_t in range(self.max_timesteps_per_episode): + # If render is specified, render the environment + if self.render and (self.logger['i_so_far'] % self.render_every_i == 0) and len(batch_lens) == 0: + self.env.render() + + t += 1 # Increment timesteps ran this batch so far + + # Track observations in this batch + batch_obs.append(obs) + + # Calculate action and make a step in the env. + # Note that rew is short for reward. 
+ action, log_prob = self.get_action(obs) + obs, rew, terminated, truncated, _ = self.env.step(action) + + # Don't really care about the difference between terminated or truncated in this, so just combine them + done = terminated | truncated + + # Track recent reward, action, and action log probability + ep_rews.append(rew) + batch_acts.append(action) + batch_log_probs.append(log_prob) + + # If the environment tells us the episode is terminated, break + if done: + break + + # Track episodic lengths and rewards + batch_lens.append(ep_t + 1) + batch_rews.append(ep_rews) + + # Reshape data as tensors in the shape specified in function description, before returning + batch_obs = torch.tensor(batch_obs, dtype=torch.float) + batch_acts = torch.tensor(batch_acts, dtype=torch.float) + batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float) + batch_rtgs = self.compute_rtgs(batch_rews) # ALG STEP 4 + + # Log the episodic returns and episodic lengths in this batch. + self.logger['batch_rews'] = batch_rews + self.logger['batch_lens'] = batch_lens + + return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens + + def compute_rtgs(self, batch_rews): + """ + Compute the Reward-To-Go of each timestep in a batch given the rewards. + + Parameters: + batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode) + + Return: + batch_rtgs - the rewards to go, Shape: (number of timesteps in batch) + """ + # The rewards-to-go (rtg) per episode per batch to return. + # The shape will be (num timesteps per episode) + batch_rtgs = [] + + # Iterate through each episode + for ep_rews in reversed(batch_rews): + + discounted_reward = 0 # The discounted reward so far + + # Iterate through all rewards in the episode. We go backwards for smoother calculation of each + # discounted return (think about why it would be harder starting from the beginning) + for rew in reversed(ep_rews): + discounted_reward = rew + discounted_reward * self.gamma + batch_rtgs.insert(0, discounted_reward) + + # Convert the rewards-to-go into a tensor + batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float) + + return batch_rtgs + + def get_action(self, obs): + """ + Queries an action from the actor network, should be called from rollout. + + Parameters: + obs - the observation at the current timestep + + Return: + action - the action to take, as a numpy array + log_prob - the log probability of the selected action in the distribution + """ + # Query the actor network for a mean action + mean = self.actor(obs) + + # Create a distribution with the mean action and std from the covariance matrix above. + # For more information on how this distribution works, check out Andrew Ng's lecture on it: + # https://www.youtube.com/watch?v=JjB58InuTqM + dist = MultivariateNormal(mean, self.cov_mat) + + # Sample an action from the distribution + action = dist.sample() + + # Calculate the log probability for that action + log_prob = dist.log_prob(action) + + # Return the sampled action and the log probability of that action in our distribution + return action.detach().numpy(), log_prob.detach() + + def evaluate(self, batch_obs, batch_acts): + """ + Estimate the values of each observation, and the log probs of + each action in the most recent batch with the most recent + iteration of the actor network. Should be called from learn. + + Parameters: + batch_obs - the observations from the most recently collected batch as a tensor. 
+ Shape: (number of timesteps in batch, dimension of observation) + batch_acts - the actions from the most recently collected batch as a tensor. + Shape: (number of timesteps in batch, dimension of action) + + Return: + V - the predicted values of batch_obs + log_probs - the log probabilities of the actions taken in batch_acts given batch_obs + """ + # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs + V = self.critic(batch_obs).squeeze() + + # Calculate the log probabilities of batch actions using most recent actor network. + # This segment of code is similar to that in get_action() + mean = self.actor(batch_obs) + dist = MultivariateNormal(mean, self.cov_mat) + log_probs = dist.log_prob(batch_acts) + + # Return the value vector V of each observation in the batch + # and log probabilities log_probs of each action in the batch + return V, log_probs + + def _init_hyperparameters(self, hyperparameters): + """ + Initialize default and custom values for hyperparameters + + Parameters: + hyperparameters - the extra arguments included when creating the PPO model, should only include + hyperparameters defined below with custom values. + + Return: + None + """ + # Initialize default values for hyperparameters + # Algorithm hyperparameters + self.timesteps_per_batch = 4800 # Number of timesteps to run per batch + self.max_timesteps_per_episode = 1600 # Max number of timesteps per episode + self.n_updates_per_iteration = 5 # Number of times to update actor/critic per iteration + self.lr = 0.005 # Learning rate of actor optimizer + self.gamma = 0.95 # Discount factor to be applied when calculating Rewards-To-Go + self.clip = 0.2 # Recommended 0.2, helps define the threshold to clip the ratio during SGA + + # Miscellaneous parameters + self.render = True # If we should render during rollout + self.render_every_i = 10 # Only render every n iterations + self.save_freq = 10 # How often we save in number of iterations + self.seed = None # Sets the seed of our program, used for reproducibility of results + + # Change any default values to custom values for specified hyperparameters + for param, val in hyperparameters.items(): + exec('self.' + param + ' = ' + str(val)) + + # Sets the seed if specified + if self.seed != None: + # Check if our seed is valid first + assert(type(self.seed) == int) + + # Set the seed + torch.manual_seed(self.seed) + print(f"Successfully set seed to {self.seed}") + + def _log_summary(self): + """ + Print to stdout what we've logged so far in the most recent batch. + + Parameters: + None + + Return: + None + """ + # Calculate logging values. 
I use a few python shortcuts to calculate each value + # without explaining since it's not too important to PPO; feel free to look it over, + # and if you have any questions you can email me (look at bottom of README) + delta_t = self.logger['delta_t'] + self.logger['delta_t'] = time.time_ns() + delta_t = (self.logger['delta_t'] - delta_t) / 1e9 + delta_t = str(round(delta_t, 2)) + + t_so_far = self.logger['t_so_far'] + i_so_far = self.logger['i_so_far'] + avg_ep_lens = np.mean(self.logger['batch_lens']) + avg_ep_rews = np.mean([np.sum(ep_rews) for ep_rews in self.logger['batch_rews']]) + avg_actor_loss = np.mean([losses.float().mean() for losses in self.logger['actor_losses']]) + + # Round decimal places for more aesthetic logging messages + avg_ep_lens = str(round(avg_ep_lens, 2)) + avg_ep_rews = str(round(avg_ep_rews, 2)) + avg_actor_loss = str(round(avg_actor_loss, 5)) + + # Print logging statements + print(flush=True) + print(f"-------------------- Iteration #{i_so_far} --------------------", flush=True) + print(f"Average Episodic Length: {avg_ep_lens}", flush=True) + print(f"Average Episodic Return: {avg_ep_rews}", flush=True) + print(f"Average Loss: {avg_actor_loss}", flush=True) + print(f"Timesteps So Far: {t_so_far}", flush=True) + print(f"Iteration took: {delta_t} secs", flush=True) + print(f"------------------------------------------------------", flush=True) + print(flush=True) + + # Reset batch-specific logging data + self.logger['batch_lens'] = [] + self.logger['batch_rews'] = [] + self.logger['actor_losses'] = []
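Reviewer note: training and testing are launched through main.py, e.g. "python main.py --mode train" or "python main.py --mode test --actor_model ./weights/ppo_actor.pth"; since ppo.py saves checkpoints to ./weights/, that directory should exist before the first save (torch.save does not create it, and the new .gitignore entry keeps it out of version control). For anyone who wants to poke at the new environment before wiring it into PPO, below is a minimal sketch, not part of this diff, that drives PartitionMazeEnv through the four phase-0 partitioning decisions; the cut values 0.5/0.0 are arbitrary and only chosen to produce a 2x2 partition.

# smoke_test.py (hypothetical helper, not included in this diff)
import numpy as np

from env import PartitionMazeEnv

env = PartitionMazeEnv()
obs, _ = env.reset()
print("initial observation:", obs)

# One cut value per step, in the order c1, c2, r1, r2 (0 means "no cut");
# each action is a 1-D Box value in [0, 1].
for cut in (0.5, 0.0, 0.5, 0.0):
    obs, reward, terminated, truncated, _ = env.step(np.array([cut], dtype=np.float32))
    print(f"phase={env.phase}  partition_values={env.partition_values}  reward={reward}")

After the fourth step the environment either rejects the partition (large negative reward, episode ends) or stores the sub-region rectangles and switches to phase 1, which is the behavior the PPO rollout in ppo.py relies on.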