Add PPO code
This commit is contained in:
parent
e7a4395340
commit
1058f37be6
.gitignore (vendored, 3 lines changed)
@@ -7,6 +7,9 @@ __pycache__/
 # C extensions
 *.so
 
+# Pytorch weights
+weights/
+
 # Distribution / packaging
 .Python
 build/
PPO/arguments.py (new file, 27 lines)
@@ -0,0 +1,27 @@
"""
This file contains the arguments to parse at command line.
File main.py will call get_args, which will return the parsed arguments.
"""

import argparse


def get_args():
    """
    Description:
        Parses arguments at command line.

    Parameters:
        None

    Return:
        args - the arguments parsed
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--mode', dest='mode', type=str, default='train')               # can be 'train' or 'test'
    parser.add_argument('--actor_model', dest='actor_model', type=str, default='')      # your actor model filename
    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')    # your critic model filename

    args = parser.parse_args()

    return args
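As a quick sanity check of the parser, the sketch below (not part of the diff; the argv values are made up and it assumes it is run from inside PPO/) shows the three flags get_args exposes and their defaults:

    # Hypothetical standalone check of arguments.get_args
    import sys
    from arguments import get_args

    sys.argv = ['main.py', '--mode', 'test', '--actor_model', 'ppo_actor.pth']
    args = get_args()
    print(args.mode, args.actor_model, args.critic_model)   # -> test ppo_actor.pth (critic_model defaults to '')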
PPO/env.py (new file, 295 lines)
@@ -0,0 +1,295 @@
import gymnasium as gym
from gymnasium import spaces
import numpy as np


class PartitionMazeEnv(gym.Env):
    """
    Custom environment with two stages:
    Stage 0: region partitioning (4 steps in total; each step outputs one scalar that fixes a
        vertical or horizontal cut position).
        The cut order is: step 1 outputs c1, step 2 outputs c2, step 3 outputs r1, step 4 outputs r2.
        After discretization the values lie in {0, 0.1, 0.2, ..., 0.9} (0 means "no cut").
    Stage 1: vehicle path planning (maze walking). The vehicles start from the region center and
        move up/down/left/right over the grid of sub-regions until every target cell is covered
        or the step limit is reached.
    """

    def __init__(self, config=None):
        super(PartitionMazeEnv, self).__init__()
        # Fleet parameters
        self.H = 20  # region height; the distance between grid points is 25 m (one unit distance)
        self.W = 30  # region width
        self.num_cars = 2  # number of systems (car-nest-drone systems)

        # Time coefficients (unit: seconds, one photo per grid cell)
        self.flight_time_factor = 3  # flight time per photo; drone speed 9.5 m/s, one photo every 3 s
        self.comp_uav_factor = 5  # on-drone computation time per photo, 5 s
        self.trans_time_factor = 0.3  # transmission time per photo, 0.3 s
        self.car_move_time_factor = 2 * 50  # TODO car travel time per unit distance (2 s), with an amplification factor added
        self.comp_bs_factor = 5  # nest (base-station) computation time per photo

        # Energy parameters
        self.flight_energy_factor = 0.05  # unit: minutes per photo
        self.comp_energy_factor = 0.05  # computation energy, needs to be re-estimated
        self.trans_energy_factor = 0.0025
        self.battery_capacity = 10  # the drone only flies; endurance is 30 minutes

        self.phase = 0  # phase control: 0 = partitioning, 1 = maze initialization, 2 = maze walking
        self.partition_step = 0  # partitioning step counter, range 0-4
        # TODO the number of cuts is currently fixed at 4 (2 + 2)
        self.partition_values = np.zeros(4, dtype=np.float32)  # stores c1, c2, r1, r2

        # Action space: every action is a 1-D continuous value in [0, 1]
        self.action_space = spaces.Box(
            low=0.0, high=1.0, shape=(1,), dtype=np.float32)

        # Observation space: an 8-dimensional vector
        # TODO the returned state currently only contains position coordinates
        # Stage 0 state: the first 4 dims hold the cut values decided so far (undecided entries are 0)
        # Stage 1 state: vehicle positions (2-D each)
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(8,), dtype=np.float32)

        # Partitioning-stage variables
        self.vertical_cuts = []  # vertical cut positions (c1, c2); 0 means no cut
        self.horizontal_cuts = []  # horizontal cut positions (r1, r2)
        # TODO region_centers could probably be optimized to cut down the number of parameters
        self.region_centers = []  # center (normalized coordinates) of every sub-region after cutting

        # Path-planning-stage variables
        self.MAX_STEPS = 50  # step limit for maze walking
        self.step_count = 0
        self.rectangles = {}
        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0

    def reset(self, seed=None, options=None):
        # Reset all variables and return to the partitioning stage (phase 0)
        self.phase = 0
        self.partition_step = 0
        self.partition_values = np.zeros(4, dtype=np.float32)
        self.vertical_cuts = []
        self.horizontal_cuts = []
        self.region_centers = []
        self.step_count = 0
        self.rectangles = {}
        self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0
        # State: the first 4 dims are partition_values, the rest are zero-padded
        state = np.concatenate(
            [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
        return state, {}

    def step(self, action):
        # In every phase the action is a 1-D continuous value; take action[0]
        a = float(action[0])

        if self.phase == 0:
            # Partitioning stage: each step outputs one scalar, discretized to {0, 0.1, ..., 0.9}
            disc_val = np.floor(a * 10) / 10.0
            disc_val = np.clip(disc_val, 0.0, 0.9)
            self.partition_values[self.partition_step] = disc_val
            self.partition_step += 1

            # Build the current state: the first partition_step entries hold the decided values,
            # the rest are 0, padded with zeros for the vehicle positions
            state = np.concatenate(
                [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])

            # If fewer than 4 steps are done we stay in the partitioning stage: no reward, not done
            if self.partition_step < 4:
                return state, 0.0, False, False, {}
            else:
                # After 4 steps, compute the cut boundaries
                # Filter out zeros, deduplicate and sort
                vert = sorted(set(v for v in self.partition_values[:len(
                    self.partition_values) // 2] if v > 0))
                horiz = sorted(set(v for v in self.partition_values[len(
                    self.partition_values) // 2:] if v > 0))
                self.vertical_cuts = vert if vert else []
                self.horizontal_cuts = horiz if horiz else []

                # Boundaries: always include 0 and 1
                v_boundaries = [0.0] + self.vertical_cuts + [1.0]
                h_boundaries = [0.0] + self.horizontal_cuts + [1.0]

                # Check whether the partition is feasible and compute the offloading ratio rho per sub-region
                valid_partition = True
                for i in range(len(h_boundaries) - 1):
                    for j in range(len(v_boundaries) - 1):
                        d = (v_boundaries[j + 1] - v_boundaries[j]) * self.W * \
                            (h_boundaries[i + 1] - h_boundaries[i]) * self.H
                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
                            (self.comp_uav_factor - self.trans_time_factor)
                        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
                            (self.comp_energy_factor * d -
                             self.trans_energy_factor * d)
                        if rho_energy_limit < 0:
                            valid_partition = False
                            break
                        rho = min(rho_time_limit, rho_energy_limit)

                        flight_time = self.flight_time_factor * d
                        comp_time = self.comp_uav_factor * rho * d
                        trans_time = self.trans_time_factor * (1 - rho) * d
                        comp_bs_time = self.comp_bs_factor * (1 - rho) * d

                        self.rectangles[(i, j)] = {
                            # 'r1': h_boundaries[i], 'r2': h_boundaries[i+1], 'c1': v_boundaries[j], 'c2': v_boundaries[j+1],
                            'd': d,
                            'rho': rho,
                            'flight_time': flight_time,
                            'comp_time': comp_time,
                            'trans_time': trans_time,
                            'comp_bs_time': comp_bs_time,
                            'is_visited': False
                            # 'center': (center_r, center_c)
                        }
                    if not valid_partition:
                        break

                if not valid_partition:
                    reward = -100
                    state = np.concatenate(
                        [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
                    return state, reward, True, False, {}
                else:
                    reward = 10

                    # Enter phase 1: maze walking
                    self.phase = 1
                    # Compute the center of every sub-region from the cut boundaries
                    self.region_centers = []
                    for i in range(len(h_boundaries) - 1):
                        for j in range(len(v_boundaries) - 1):
                            center_x = (
                                v_boundaries[j] + v_boundaries[j + 1]) / 2.0
                            center_y = (
                                h_boundaries[i] + h_boundaries[i + 1]) / 2.0
                            self.region_centers.append((center_x, center_y))
                    # Store the cut boundaries for the later grid mapping
                    self.v_boundaries = v_boundaries
                    self.h_boundaries = h_boundaries
                    # Initialize the maze stage: reset the step counter and build visited_grid
                    # with one entry per grid cell
                    self.step_count = 0
                    self.visited_grid = np.zeros(
                        (len(v_boundaries) - 1) * (len(h_boundaries) - 1), dtype=np.int32)

                    state = np.concatenate(
                        [self.partition_values, np.array(self.car_pos).flatten()])
                    return state, reward, False, False, {}

        elif self.phase == 1:
            # Phase 1: initialize the maze; the vehicles leave the region center and drive
            # to the center of one of the partitioned sub-regions
            # Make sure the action is in [0, 1], then map it to an index in 0..(num_regions - 1)
            num_regions = len(self.region_centers)
            target_region_index = int(np.floor(a * num_regions))
            target_region_index = np.clip(
                target_region_index, 0, num_regions - 1)

            # Iterate over all vehicles and move each of them to the target sub-region
            for car_idx in range(self.num_cars):
                target_position = np.array(
                    self.region_centers[target_region_index])  # target region center

                # Update this vehicle's position
                self.car_pos[car_idx] = target_position
                # Count the step
                self.step_count += 1
                self.car_traj[car_idx].append(target_position)  # record every vehicle's trajectory

            # Enter phase 2: maze walking
            self.phase = 2

            # Observation
            state = np.concatenate(
                [self.partition_values, np.array(self.car_pos).flatten()])
            return state, 0.0, False, False, {}

        elif self.phase == 2:
            # Phase 2: path planning (maze walking)
            current_car = self.current_car_index
            reward = 0.0

            # The 1-D continuous action a is mapped to one of four directions (or stay)
            if a < 0.2:
                move_dir = 'up'
            elif a < 0.4:
                move_dir = 'down'
            elif a < 0.6:
                move_dir = 'left'
            elif a < 0.8:
                move_dir = 'right'
            else:
                move_dir = 'stay'

            current_row, current_col = self.car_pos[current_car]

            # Initialize the new row/column with the current values
            new_row, new_col = current_row, current_col

            if move_dir == 'up' and current_row < len(self.h_boundaries) - 1:
                new_row = current_row + 1
            elif move_dir == 'down' and current_row > 0:
                new_row = current_row - 1
            elif move_dir == 'left' and current_col > 0:
                new_col = current_col - 1
            elif move_dir == 'right' and current_col < len(self.v_boundaries) - 1:
                new_col = current_col + 1
            # If the move is illegal, or the action is 'stay', keep the current position
            # TODO penalize illegal moves

            # Update the vehicle position
            self.car_pos[current_car] = [new_row, new_col]
            if new_row != current_row or new_col != current_col:
                self.car_traj[current_car].append((new_row, new_col))
            self.step_count += 1
            self.current_car_index = (
                self.current_car_index + 1) % self.num_cars

            # Update the visit marker: mark the new grid cell as visited
            self.rectangles[(new_row, new_col)]['is_visited'] = True

            # Observation
            state = np.concatenate(
                [self.partition_values, np.array(self.car_pos).flatten()])

            # Episode termination: every grid cell has been visited, or the step limit is reached
            done = all(rec['is_visited'] for rec in self.rectangles.values()) or (
                self.step_count >= self.MAX_STEPS)
            if done and np.all(self.visited_grid == 1):
                # Full coverage: compute each fleet's execution time from its trajectory
                T = max([self._compute_motorcade_time(idx)
                         for idx in range(self.num_cars)])
                reward += 10.0  # TODO compare the reward against a greedy baseline
            elif done and self.step_count >= self.MAX_STEPS:
                reward -= 100

            return state, reward, done, False, {}

    def _compute_motorcade_time(self, idx):
        flight_time = sum(self.rectangles[point]['flight_time']
                          for point in self.car_traj[idx])
        bs_time = sum(self.rectangles[point]['comp_bs_time']
                      for point in self.car_traj[idx])

        # Compute the car's travel time; first add the center of the whole region to both ends of the trajectory
        car_time = 0.0
        self.car_traj[idx].append([0.5, 0.5])
        self.car_traj[idx].insert(0, [0.5, 0.5])
        for i in range(len(self.car_traj[idx]) - 1):
            first_point = self.car_traj[idx][i]
            second_point = self.car_traj[idx][i + 1]
            car_time += np.linalg.norm(np.array(first_point) - np.array(second_point)) * \
                self.H * self.W * self.car_move_time_factor

        return max(car_time + flight_time, bs_time)

    def render(self):
        if self.phase == 0:
            print("Phase 0: Partitioning.")
            print(f"Partition step: {self.partition_step}")
            print(f"Partition values so far: {self.partition_values}")
        elif self.phase == 1:
            print("Phase 1: Path planning (maze).")
            print(f"Visited grid: {self.visited_grid}")
            print(f"Step count: {self.step_count}")
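The sketch below (not part of the diff; assumed to be run from inside PPO/) exercises only the partitioning stage, which is the self-contained part of the environment. It shows how each scalar action is discretized to {0, 0.1, ..., 0.9} and how the episode either ends early with a -100 reward when the partition is infeasible or continues into the maze phases:

    # Hypothetical phase-0 walkthrough of PartitionMazeEnv
    import numpy as np
    from env import PartitionMazeEnv

    env = PartitionMazeEnv()
    obs, _ = env.reset()

    # Four partition steps: c1, c2, r1, r2. A value of 0.0 means "no cut".
    for a in [0.5, 0.0, 0.5, 0.0]:
        obs, reward, terminated, truncated, _ = env.step(np.array([a], dtype=np.float32))
        print(env.partition_values, reward, terminated)
    # After the 4th step the environment either rejects the partition (reward -100, terminated=True)
    # or accepts it (reward 10) and switches to the maze phases.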
PPO/eval_policy.py (new file, 103 lines)
@@ -0,0 +1,103 @@
"""
This file is used only to evaluate our trained policy/actor after
training in main.py with ppo.py. I wrote this file to demonstrate
that our trained policy exists independently of our learning algorithm,
which resides in ppo.py. Thus, we can test our trained policy without
relying on ppo.py.
"""

def _log_summary(ep_len, ep_ret, ep_num):
    """
    Print to stdout what we've logged so far in the most recent episode.

    Parameters:
        None

    Return:
        None
    """
    # Round decimal places for more aesthetic logging messages
    ep_len = str(round(ep_len, 2))
    ep_ret = str(round(ep_ret, 2))

    # Print logging statements
    print(flush=True)
    print(f"-------------------- Episode #{ep_num} --------------------", flush=True)
    print(f"Episodic Length: {ep_len}", flush=True)
    print(f"Episodic Return: {ep_ret}", flush=True)
    print(f"------------------------------------------------------", flush=True)
    print(flush=True)

def rollout(policy, env, render):
    """
    Returns a generator to roll out each episode given a trained policy and
    environment to test on.

    Parameters:
        policy - The trained policy to test
        env - The environment to evaluate the policy on
        render - Specifies whether to render or not

    Return:
        A generator object rollout, or iterable, which will return the latest
        episodic length and return on each iteration of the generator.

    Note:
        If you're unfamiliar with Python generators, check this out:
            https://wiki.python.org/moin/Generators
        If you're unfamiliar with Python "yield", check this out:
            https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    """
    # Rollout until user kills process
    while True:
        obs, _ = env.reset()
        done = False

        # number of timesteps so far
        t = 0

        # Logging data
        ep_len = 0            # episodic length
        ep_ret = 0            # episodic return

        while not done:
            t += 1

            # Render environment if specified, off by default
            if render:
                env.render()

            # Query deterministic action from policy and run it
            action = policy(obs).detach().numpy()
            obs, rew, terminated, truncated, _ = env.step(action)
            done = terminated | truncated

            # Sum all episodic rewards as we go along
            ep_ret += rew

        # Track episodic length
        ep_len = t

        # returns episodic length and return in this iteration
        yield ep_len, ep_ret

def eval_policy(policy, env, render=False):
    """
    The main function to evaluate our policy with. It will iterate a generator object
    "rollout", which will simulate each episode and return the most recent episode's
    length and return. We can then log it right after. And yes, eval_policy will run
    forever until you kill the process.

    Parameters:
        policy - The trained policy to test, basically another name for our actor model
        env - The environment to test the policy on
        render - Whether we should render our episodes. False by default.

    Return:
        None

    NOTE: To learn more about generators, look at rollout's function description
    """
    # Rollout with the policy and environment, and log each episode's data
    for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render)):
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
PPO/main.py (new file, 123 lines)
@@ -0,0 +1,123 @@
"""
This file is the executable for running PPO. It is based on this medium article:
    https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8
"""

import gymnasium as gym
import sys
import torch

from arguments import get_args
from ppo import PPO
from network import FeedForwardNN
from eval_policy import eval_policy
from env import PartitionMazeEnv

def train(env, hyperparameters, actor_model, critic_model):
    """
    Trains the model.

    Parameters:
        env - the environment to train on
        hyperparameters - a dict of hyperparameters to use, defined in main
        actor_model - the actor model to load in if we want to continue training
        critic_model - the critic model to load in if we want to continue training

    Return:
        None
    """
    print(f"Training", flush=True)

    # Create a model for PPO.
    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)

    # Tries to load in an existing actor/critic model to continue training on
    if actor_model != '' and critic_model != '':
        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
        model.actor.load_state_dict(torch.load(actor_model))
        model.critic.load_state_dict(torch.load(critic_model))
        print(f"Successfully loaded.", flush=True)
    elif actor_model != '' or critic_model != '':  # Don't train from scratch if user accidentally forgets actor/critic model
        print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!")
        sys.exit(0)
    else:
        print(f"Training from scratch.", flush=True)

    # Train the PPO model with a specified total timesteps
    # NOTE: You can change the total timesteps here, I put a big number just because
    # you can kill the process whenever you feel like PPO is converging
    model.learn(total_timesteps=200_000_000)

def test(env, actor_model):
    """
    Tests the model.

    Parameters:
        env - the environment to test the policy on
        actor_model - the actor model to load in

    Return:
        None
    """
    print(f"Testing {actor_model}", flush=True)

    # If the actor model is not specified, then exit
    if actor_model == '':
        print(f"Didn't specify model file. Exiting.", flush=True)
        sys.exit(0)

    # Extract out dimensions of observation and action spaces
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build our policy the same way we build our actor model in PPO
    policy = FeedForwardNN(obs_dim, act_dim)

    # Load in the actor model saved by the PPO algorithm
    policy.load_state_dict(torch.load(actor_model))

    # Evaluate our policy with a separate module, eval_policy, to demonstrate
    # that once we are done training the model/policy with ppo.py, we no longer need
    # ppo.py since it only contains the training algorithm. The model/policy itself exists
    # independently as a binary file that can be loaded in with torch.
    eval_policy(policy=policy, env=env, render=True)

def main(args):
    """
    The main function to run.

    Parameters:
        args - the arguments parsed from command line

    Return:
        None
    """
    # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of
    # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here.
    # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters
    hyperparameters = {
        'timesteps_per_batch': 2048,
        'max_timesteps_per_episode': 200,
        'gamma': 0.99,
        'n_updates_per_iteration': 10,
        'lr': 3e-4,
        'clip': 0.2,
        'render': True,
        'render_every_i': 10
    }

    # Creates the environment we'll be running. If you want to replace with your own
    # custom environment, note that it must inherit Gym and have both continuous
    # observation and action spaces.
    # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array')
    env = PartitionMazeEnv()

    # Train or test, depending on the mode specified
    if args.mode == 'train':
        train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model)
    else:
        test(env=env, actor_model=args.actor_model)

if __name__ == '__main__':
    args = get_args()  # Parse arguments from command line
    main(args)
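For reference, the invocations this script is wired up for (assumed to be run from inside PPO/; the checkpoint path is the one ppo.py writes to):

    # python main.py --mode train
    # python main.py --mode test --actor_model ./weights/ppo_actor.pth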
PPO/network.py (new file, 50 lines)
@@ -0,0 +1,50 @@
"""
This file contains a neural network module for us to
define our actor and critic networks in PPO.
"""

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

class FeedForwardNN(nn.Module):
    """
    A standard in_dim-64-64-out_dim Feed Forward Neural Network.
    """
    def __init__(self, in_dim, out_dim):
        """
        Initialize the network and set up the layers.

        Parameters:
            in_dim - input dimensions as an int
            out_dim - output dimensions as an int

        Return:
            None
        """
        super(FeedForwardNN, self).__init__()

        self.layer1 = nn.Linear(in_dim, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, out_dim)

    def forward(self, obs):
        """
        Runs a forward pass on the neural network.

        Parameters:
            obs - observation to pass as input

        Return:
            output - the output of our forward pass
        """
        # Convert observation to tensor if it's a numpy array
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float)

        activation1 = F.relu(self.layer1(obs))
        activation2 = F.relu(self.layer2(activation1))
        output = self.layer3(activation2)

        return output
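A minimal forward-pass sketch (not part of the diff), using the 8-dimensional observation and 1-dimensional action of PartitionMazeEnv above:

    # Hypothetical standalone check of FeedForwardNN
    import numpy as np
    from network import FeedForwardNN

    actor = FeedForwardNN(in_dim=8, out_dim=1)
    obs = np.zeros(8, dtype=np.float32)
    mean_action = actor(obs)      # forward() converts the numpy observation to a tensor
    print(mean_action.shape)      # torch.Size([1])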
PPO/ppo.py (new file, 402 lines)
@@ -0,0 +1,402 @@
"""
The file contains the PPO class to train with.
NOTE: All "ALG STEP"s are following the numbers from the original PPO pseudocode.
It can be found here: https://spinningup.openai.com/en/latest/_images/math/e62a8971472597f4b014c2da064f636ffe365ba3.svg
"""

import gymnasium as gym
import time

import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.distributions import MultivariateNormal

class PPO:
    """
    This is the PPO class we will use as our model in main.py
    """
    def __init__(self, policy_class, env, **hyperparameters):
        """
        Initializes the PPO model, including hyperparameters.

        Parameters:
            policy_class - the policy class to use for our actor/critic networks.
            env - the environment to train on.
            hyperparameters - all extra arguments passed into PPO that should be hyperparameters.

        Returns:
            None
        """
        # Make sure the environment is compatible with our code
        assert(type(env.observation_space) == gym.spaces.Box)
        assert(type(env.action_space) == gym.spaces.Box)

        # Initialize hyperparameters for training with PPO
        self._init_hyperparameters(hyperparameters)

        # Extract environment information
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        # Initialize actor and critic networks
        self.actor = policy_class(self.obs_dim, self.act_dim)                   # ALG STEP 1
        self.critic = policy_class(self.obs_dim, 1)

        # Initialize optimizers for actor and critic
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        # Initialize the covariance matrix used to query the actor for actions
        self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)

        # This logger will help us with printing out summaries of each iteration
        self.logger = {
            'delta_t': time.time_ns(),
            't_so_far': 0,          # timesteps so far
            'i_so_far': 0,          # iterations so far
            'batch_lens': [],       # episodic lengths in batch
            'batch_rews': [],       # episodic returns in batch
            'actor_losses': [],     # losses of actor network in current iteration
        }

    def learn(self, total_timesteps):
        """
        Train the actor and critic networks. Here is where the main PPO algorithm resides.

        Parameters:
            total_timesteps - the total number of timesteps to train for

        Return:
            None
        """
        print(f"Learning... Running {self.max_timesteps_per_episode} timesteps per episode, ", end='')
        print(f"{self.timesteps_per_batch} timesteps per batch for a total of {total_timesteps} timesteps")
        t_so_far = 0 # Timesteps simulated so far
        i_so_far = 0 # Iterations ran so far
        while t_so_far < total_timesteps:                                       # ALG STEP 2
            # Autobots, roll out (just kidding, we're collecting our batch simulations here)
            batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout()     # ALG STEP 3

            # Calculate how many timesteps we collected this batch
            t_so_far += np.sum(batch_lens)

            # Increment the number of iterations
            i_so_far += 1

            # Logging timesteps so far and iterations so far
            self.logger['t_so_far'] = t_so_far
            self.logger['i_so_far'] = i_so_far

            # Calculate advantage at k-th iteration
            V, _ = self.evaluate(batch_obs, batch_acts)
            A_k = batch_rtgs - V.detach()                                       # ALG STEP 5

            # One of the only tricks I use that isn't in the pseudocode. Normalizing advantages
            # isn't theoretically necessary, but in practice it decreases the variance of
            # our advantages and makes convergence much more stable and faster. I added this because
            # solving some environments was too unstable without it.
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

            # This is the loop where we update our network for some n epochs
            for _ in range(self.n_updates_per_iteration):                       # ALG STEP 6 & 7
                # Calculate V_phi and pi_theta(a_t | s_t)
                V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

                # Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t)
                # NOTE: we just subtract the logs, which is the same as
                # dividing the values and then canceling the log with e^log.
                # For why we use log probabilities instead of actual probabilities,
                # here's a great explanation:
                # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms
                # TL;DR makes gradient ascent easier behind the scenes.
                ratios = torch.exp(curr_log_probs - batch_log_probs)

                # Calculate surrogate losses.
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

                # Calculate actor and critic losses.
                # NOTE: we take the negative min of the surrogate losses because we're trying to maximize
                # the performance function, but Adam minimizes the loss. So minimizing the negative
                # performance function maximizes it.
                actor_loss = (-torch.min(surr1, surr2)).mean()
                critic_loss = nn.MSELoss()(V, batch_rtgs)

                # Calculate gradients and perform backward propagation for actor network
                self.actor_optim.zero_grad()
                actor_loss.backward(retain_graph=True)
                self.actor_optim.step()

                # Calculate gradients and perform backward propagation for critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # Log actor loss
                self.logger['actor_losses'].append(actor_loss.detach())

            # Print a summary of our training so far
            self._log_summary()

            # Save our model if it's time
            if i_so_far % self.save_freq == 0:
                torch.save(self.actor.state_dict(), './weights/ppo_actor.pth')
                torch.save(self.critic.state_dict(), './weights/ppo_critic.pth')

    def rollout(self):
        """
        Too many transformers references, I'm sorry. This is where we collect the batch of data
        from simulation. Since this is an on-policy algorithm, we'll need to collect a fresh batch
        of data each time we iterate the actor/critic networks.

        Parameters:
            None

        Return:
            batch_obs - the observations collected this batch. Shape: (number of timesteps, dimension of observation)
            batch_acts - the actions collected this batch. Shape: (number of timesteps, dimension of action)
            batch_log_probs - the log probabilities of each action taken this batch. Shape: (number of timesteps)
            batch_rtgs - the Rewards-To-Go of each timestep in this batch. Shape: (number of timesteps)
            batch_lens - the lengths of each episode this batch. Shape: (number of episodes)
        """
        # Batch data. For more details, check function header.
        batch_obs = []
        batch_acts = []
        batch_log_probs = []
        batch_rews = []
        batch_rtgs = []
        batch_lens = []

        # Episodic data. Keeps track of rewards per episode, will get cleared
        # upon each new episode
        ep_rews = []

        t = 0 # Keeps track of how many timesteps we've run so far this batch

        # Keep simulating until we've run more than or equal to specified timesteps per batch
        while t < self.timesteps_per_batch:
            ep_rews = [] # rewards collected per episode

            # Reset the environment. Note that obs is short for observation.
            obs, _ = self.env.reset()
            done = False

            # Run an episode for a maximum of max_timesteps_per_episode timesteps
            for ep_t in range(self.max_timesteps_per_episode):
                # If render is specified, render the environment
                if self.render and (self.logger['i_so_far'] % self.render_every_i == 0) and len(batch_lens) == 0:
                    self.env.render()

                t += 1 # Increment timesteps ran this batch so far

                # Track observations in this batch
                batch_obs.append(obs)

                # Calculate action and make a step in the env.
                # Note that rew is short for reward.
                action, log_prob = self.get_action(obs)
                obs, rew, terminated, truncated, _ = self.env.step(action)

                # Don't really care about the difference between terminated or truncated in this, so just combine them
                done = terminated | truncated

                # Track recent reward, action, and action log probability
                ep_rews.append(rew)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)

                # If the environment tells us the episode is terminated, break
                if done:
                    break

            # Track episodic lengths and rewards
            batch_lens.append(ep_t + 1)
            batch_rews.append(ep_rews)

        # Reshape data as tensors in the shape specified in function description, before returning
        batch_obs = torch.tensor(batch_obs, dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.float)
        batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)
        batch_rtgs = self.compute_rtgs(batch_rews)                              # ALG STEP 4

        # Log the episodic returns and episodic lengths in this batch.
        self.logger['batch_rews'] = batch_rews
        self.logger['batch_lens'] = batch_lens

        return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens

    def compute_rtgs(self, batch_rews):
        """
        Compute the Reward-To-Go of each timestep in a batch given the rewards.

        Parameters:
            batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode)

        Return:
            batch_rtgs - the rewards to go, Shape: (number of timesteps in batch)
        """
        # The rewards-to-go (rtg) per episode per batch to return.
        # The shape will be (num timesteps per episode)
        batch_rtgs = []

        # Iterate through each episode
        for ep_rews in reversed(batch_rews):

            discounted_reward = 0 # The discounted reward so far

            # Iterate through all rewards in the episode. We go backwards for smoother calculation of each
            # discounted return (think about why it would be harder starting from the beginning)
            for rew in reversed(ep_rews):
                discounted_reward = rew + discounted_reward * self.gamma
                batch_rtgs.insert(0, discounted_reward)

        # Convert the rewards-to-go into a tensor
        batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float)

        return batch_rtgs

    def get_action(self, obs):
        """
        Queries an action from the actor network, should be called from rollout.

        Parameters:
            obs - the observation at the current timestep

        Return:
            action - the action to take, as a numpy array
            log_prob - the log probability of the selected action in the distribution
        """
        # Query the actor network for a mean action
        mean = self.actor(obs)

        # Create a distribution with the mean action and std from the covariance matrix above.
        # For more information on how this distribution works, check out Andrew Ng's lecture on it:
        # https://www.youtube.com/watch?v=JjB58InuTqM
        dist = MultivariateNormal(mean, self.cov_mat)

        # Sample an action from the distribution
        action = dist.sample()

        # Calculate the log probability for that action
        log_prob = dist.log_prob(action)

        # Return the sampled action and the log probability of that action in our distribution
        return action.detach().numpy(), log_prob.detach()

    def evaluate(self, batch_obs, batch_acts):
        """
        Estimate the values of each observation, and the log probs of
        each action in the most recent batch with the most recent
        iteration of the actor network. Should be called from learn.

        Parameters:
            batch_obs - the observations from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of observation)
            batch_acts - the actions from the most recently collected batch as a tensor.
                        Shape: (number of timesteps in batch, dimension of action)

        Return:
            V - the predicted values of batch_obs
            log_probs - the log probabilities of the actions taken in batch_acts given batch_obs
        """
        # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
        V = self.critic(batch_obs).squeeze()

        # Calculate the log probabilities of batch actions using most recent actor network.
        # This segment of code is similar to that in get_action()
        mean = self.actor(batch_obs)
        dist = MultivariateNormal(mean, self.cov_mat)
        log_probs = dist.log_prob(batch_acts)

        # Return the value vector V of each observation in the batch
        # and log probabilities log_probs of each action in the batch
        return V, log_probs

    def _init_hyperparameters(self, hyperparameters):
        """
        Initialize default and custom values for hyperparameters

        Parameters:
            hyperparameters - the extra arguments included when creating the PPO model, should only include
                              hyperparameters defined below with custom values.

        Return:
            None
        """
        # Initialize default values for hyperparameters
        # Algorithm hyperparameters
        self.timesteps_per_batch = 4800                 # Number of timesteps to run per batch
        self.max_timesteps_per_episode = 1600           # Max number of timesteps per episode
        self.n_updates_per_iteration = 5                # Number of times to update actor/critic per iteration
        self.lr = 0.005                                 # Learning rate of actor optimizer
        self.gamma = 0.95                               # Discount factor to be applied when calculating Rewards-To-Go
        self.clip = 0.2                                 # Recommended 0.2, helps define the threshold to clip the ratio during SGA

        # Miscellaneous parameters
        self.render = True                              # If we should render during rollout
        self.render_every_i = 10                        # Only render every n iterations
        self.save_freq = 10                             # How often we save in number of iterations
        self.seed = None                                # Sets the seed of our program, used for reproducibility of results

        # Change any default values to custom values for specified hyperparameters
        for param, val in hyperparameters.items():
            exec('self.' + param + ' = ' + str(val))

        # Sets the seed if specified
        if self.seed != None:
            # Check if our seed is valid first
            assert(type(self.seed) == int)

            # Set the seed
            torch.manual_seed(self.seed)
            print(f"Successfully set seed to {self.seed}")

    def _log_summary(self):
        """
        Print to stdout what we've logged so far in the most recent batch.

        Parameters:
            None

        Return:
            None
        """
        # Calculate logging values. I use a few python shortcuts to calculate each value
        # without explaining since it's not too important to PPO; feel free to look it over,
        # and if you have any questions you can email me (look at bottom of README)
        delta_t = self.logger['delta_t']
        self.logger['delta_t'] = time.time_ns()
        delta_t = (self.logger['delta_t'] - delta_t) / 1e9
        delta_t = str(round(delta_t, 2))

        t_so_far = self.logger['t_so_far']
        i_so_far = self.logger['i_so_far']
        avg_ep_lens = np.mean(self.logger['batch_lens'])
        avg_ep_rews = np.mean([np.sum(ep_rews) for ep_rews in self.logger['batch_rews']])
        avg_actor_loss = np.mean([losses.float().mean() for losses in self.logger['actor_losses']])

        # Round decimal places for more aesthetic logging messages
        avg_ep_lens = str(round(avg_ep_lens, 2))
        avg_ep_rews = str(round(avg_ep_rews, 2))
        avg_actor_loss = str(round(avg_actor_loss, 5))

        # Print logging statements
        print(flush=True)
        print(f"-------------------- Iteration #{i_so_far} --------------------", flush=True)
        print(f"Average Episodic Length: {avg_ep_lens}", flush=True)
        print(f"Average Episodic Return: {avg_ep_rews}", flush=True)
        print(f"Average Loss: {avg_actor_loss}", flush=True)
        print(f"Timesteps So Far: {t_so_far}", flush=True)
        print(f"Iteration took: {delta_t} secs", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)

        # Reset batch-specific logging data
        self.logger['batch_lens'] = []
        self.logger['batch_rews'] = []
        self.logger['actor_losses'] = []
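A minimal end-to-end sketch (not part of the diff): it briefly exercises PPO.learn on Pendulum-v1, the stock environment that main.py keeps commented out, just to see a few logged iterations; the small budget and hyperparameter overrides here are illustrative only.

    # Hypothetical quick run, assumed to be executed from inside PPO/
    import os
    import gymnasium as gym
    from network import FeedForwardNN
    from ppo import PPO

    os.makedirs('./weights', exist_ok=True)   # learn() saves checkpoints into ./weights/
    env = gym.make('Pendulum-v1')
    model = PPO(policy_class=FeedForwardNN, env=env,
                timesteps_per_batch=2048, max_timesteps_per_episode=200, render=False)
    model.learn(total_timesteps=10_000)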