Fix bugs
commit b1851ac489
parent d53eda2570
PPO/env.py (12 changed lines)
@@ -61,7 +61,7 @@ class PartitionMazeEnv(gym.Env):

         # Variables for the path-planning phase
         self.MAX_STEPS = 50  # upper bound on the number of maze-walk steps
-        self.BASE_LINE = 3400.0  # baseline time, computed via greedy or Monte Carlo
+        self.BASE_LINE = 3500.0  # baseline time, computed via greedy or Monte Carlo
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
@@ -139,7 +139,7 @@ class PartitionMazeEnv(gym.Env):
                 bs_time = self.bs_time_factor * (1 - rho) * d

                 self.rectangles[(i, j)] = {
-                    'center': ((h_boundaries[i] + h_boundaries[i+1]) * self.H / 2, (v_boundaries[j+1] + v_boundaries[j]) * self.W / 2),
+                    'center': ((v_boundaries[j+1] + v_boundaries[j]) * self.W / 2, (h_boundaries[i] + h_boundaries[i+1]) * self.H / 2),
                     'flight_time': flight_time,
                     'bs_time': bs_time,
                     'is_visited': False
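Note on the 'center' fix above: the old tuple put the height-axis coordinate first, while the depot used later in the trip-time code is [self.W / 2, self.H / 2], i.e. (x, y). A minimal sketch of the corrected convention, with illustrative boundary values and field size that are not from the repo:

# Hypothetical normalized cut positions and field size, for illustration only.
W, H = 100.0, 60.0
v_boundaries = [0.0, 0.5, 1.0]  # vertical cuts span the width  (x axis)
h_boundaries = [0.0, 0.5, 1.0]  # horizontal cuts span the height (y axis)

i, j = 0, 0
center = ((v_boundaries[j + 1] + v_boundaries[j]) * W / 2,  # x: cell midpoint along the width
          (h_boundaries[i] + h_boundaries[i + 1]) * H / 2)  # y: cell midpoint along the height
print(center)  # (25.0, 15.0) -- same (x, y) order as the depot [W / 2, H / 2]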
@@ -247,10 +247,8 @@ class PartitionMazeEnv(gym.Env):
                 # print(T)
                 # print(self.car_traj)
                 reward += -(T - self.BASE_LINE)
-                print(T)
-                print(self.car_traj)
             elif done and self.step_count >= self.MAX_STEPS:
-                reward += -100
+                reward += -10000

         return state, reward, done, False, {}

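Read together with the first hunk, the terminal reward is a baseline-relative penalty: finishing faster than BASE_LINE earns a positive reward, and a timeout now costs -10000 instead of -100, so it dominates any achievable time bonus. A toy sketch of that logic (names mirror the diff; the surrounding step() plumbing is omitted and the function boundary is an assumption):

BASE_LINE = 3500.0  # baseline time from greedy / Monte Carlo
MAX_STEPS = 50

def terminal_reward(T, step_count, solved):
    # Sketch only: T is the episode completion time when the maze is solved.
    if solved:
        return -(T - BASE_LINE)  # faster than baseline => positive reward
    if step_count >= MAX_STEPS:
        return -10000            # timeout penalty far outweighs any time bonus
    return 0.0

print(terminal_reward(3400.0, 20, True))  # 100.0
print(terminal_reward(0.0, 50, False))    # -10000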
@@ -269,8 +267,8 @@ class PartitionMazeEnv(gym.Env):
                 second_point = self.car_traj[idx][i + 1]
                 car_time += math.dist(self.rectangles[tuple(first_point)]['center'], self.rectangles[tuple(second_point)]['center']) * \
                     self.car_time_factor
-            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][0])]['center'], [self.H / 2, self.W / 2])
-            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][-1])]['center'], [self.H / 2, self.W / 2])
+            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][0])]['center'], [self.W / 2, self.H / 2]) * self.car_time_factor
+            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][-1])]['center'], [self.W / 2, self.H / 2]) * self.car_time_factor

        return max(float(car_time) + flight_time, bs_time)
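Two things change in the depot legs above: the depot coordinate flips to [self.W / 2, self.H / 2] to match the new (x, y) center order, and both legs are now scaled by self.car_time_factor like every other leg (previously they were raw distances, i.e. implicitly factor 1). For reference, math.dist (Python >= 3.8) is exactly the hypot of the coordinate differences:

import math

center, depot = (25.0, 15.0), (50.0, 30.0)  # illustrative points, not repo values
car_time_factor = 2.0

# math.dist(p, q) equals math.hypot over the per-axis differences.
assert math.dist(center, depot) == math.hypot(center[0] - depot[0], center[1] - depot[1])

# The fixed depot leg: distance times the car speed factor.
leg_time = math.dist(center, depot) * car_time_factor
print(leg_time)  # ~58.31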
PPO/main_test.py (new file, 129 lines)
@@ -0,0 +1,129 @@
+"""
+    This file is the executable for running PPO. It is based on this medium article:
+    https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8
+"""
+
+import gymnasium as gym
+import sys
+import torch
+import argparse
+
+from ppo import PPO
+from network import FeedForwardNN
+from eval_policy import eval_policy
+from env import PartitionMazeEnv
+
+def train(env, hyperparameters, actor_model, critic_model):
+    """
+        Trains the model.
+
+        Parameters:
+            env - the environment to train on
+            hyperparameters - a dict of hyperparameters to use, defined in main
+            actor_model - the actor model to load in if we want to continue training
+            critic_model - the critic model to load in if we want to continue training
+
+        Return:
+            None
+    """
+    print(f"Training", flush=True)
+
+    # Create a model for PPO.
+    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)
+
+    # Tries to load in an existing actor/critic model to continue training on
+    if actor_model != '' and critic_model != '':
+        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
+        model.actor.load_state_dict(torch.load(actor_model))
+        model.critic.load_state_dict(torch.load(critic_model))
+        print(f"Successfully loaded.", flush=True)
+    elif actor_model != '' or critic_model != '':  # Don't train from scratch if user accidentally forgets actor/critic model
+        print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!")
+        sys.exit(0)
+    else:
+        print(f"Training from scratch.", flush=True)
+
+    # Train the PPO model with a specified total timesteps
+    # NOTE: You can change the total timesteps here, I put a big number just because
+    # you can kill the process whenever you feel like PPO is converging
+    model.learn(total_timesteps=200_000_000)
+
+def test(env, actor_model):
+    """
+        Tests the model.
+
+        Parameters:
+            env - the environment to test the policy on
+            actor_model - the actor model to load in
+
+        Return:
+            None
+    """
+    print(f"Testing {actor_model}", flush=True)
+
+    # If the actor model is not specified, then exit
+    if actor_model == '':
+        print(f"Didn't specify model file. Exiting.", flush=True)
+        sys.exit(0)
+
+    # Extract out dimensions of observation and action spaces
+    obs_dim = env.observation_space.shape[0]
+    act_dim = env.action_space.shape[0]
+
+    # Build our policy the same way we build our actor model in PPO
+    policy = FeedForwardNN(obs_dim, act_dim)
+
+    # Load in the actor model saved by the PPO algorithm
+    policy.load_state_dict(torch.load(actor_model))
+
+    # Evaluate our policy with a separate module, eval_policy, to demonstrate
+    # that once we are done training the model/policy with ppo.py, we no longer need
+    # ppo.py since it only contains the training algorithm. The model/policy itself exists
+    # independently as a binary file that can be loaded in with torch.
+    eval_policy(policy=policy, env=env, render=True)
+
+def main(args):
+    """
+        The main function to run.
+
+        Parameters:
+            args - the arguments parsed from command line
+
+        Return:
+            None
+    """
+    # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of
+    # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here.
+    # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters
+    hyperparameters = {
+        'timesteps_per_batch': 2048,
+        'max_timesteps_per_episode': 200,
+        'gamma': 0.99,
+        'n_updates_per_iteration': 10,
+        'lr': 3e-4,
+        'clip': 0.2,
+        'render': True,
+        'render_every_i': 10
+    }
+
+    # Creates the environment we'll be running. If you want to replace with your own
+    # custom environment, note that it must inherit Gym and have both continuous
+    # observation and action spaces.
+    # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array')
+    env = PartitionMazeEnv()
+
+    # Train or test, depending on the mode specified
+    if args.mode == 'train':
+        train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model)
+    else:
+        test(env=env, actor_model=args.actor_model)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--mode', dest='mode', type=str, default='test')  # can be 'train' or 'test'
+    parser.add_argument('--actor_model', dest='actor_model', type=str, default='./weights/ppo_actor.pth')  # your actor model filename
+    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')  # your critic model filename
+
+    args = parser.parse_args()
+    main(args)
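For context, main_test.py mirrors the entry point from the linked tutorial and assumes the sibling modules ppo.py, network.py, and eval_policy.py (not part of this commit) are importable. Plausible invocations given the defaults above, sketched rather than taken from any repo docs:

python PPO/main_test.py --mode test
    # evaluates the default actor checkpoint ./weights/ppo_actor.pth
python PPO/main_test.py --mode train --actor_model '' --critic_model ''
    # trains from scratch; clearing the actor default avoids the resume-check early exit in train()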
@@ -6,7 +6,7 @@ import yaml
 # Fix the random seed for reproducibility
 random.seed(42)

-num_iterations = 1000000
+num_iterations = 10000

 # ---------------------------
 # Parameter settings
@@ -117,17 +117,14 @@ for iteration in range(num_iterations):
    total_flight_time = sum(task['flight_time'] for task in tasks)
    if tasks:
        # Vehicle travels from the region center to the first task center
-        car_time = math.hypot(tasks[0]['center'][0] - region_center[0],
-                              tasks[0]['center'][1] - region_center[1]) * car_time_factor
+        car_time += math.dist(tasks[0]['center'], region_center) * car_time_factor
        # Then visits the remaining task centers in order
-        for j in range(1, len(tasks)):
-            prev_center = tasks[j - 1]['center']
-            curr_center = tasks[j]['center']
-            car_time += math.hypot(curr_center[0] - prev_center[0],
-                                   curr_center[1] - prev_center[1]) * car_time_factor
+        for j in range(len(tasks) - 1):
+            prev_center = tasks[j]['center']
+            curr_center = tasks[j + 1]['center']
+            car_time += math.dist(curr_center, prev_center) * car_time_factor
        # Finally returns to the region center
-        car_time += math.hypot(curr_center[0] - region_center[0],
-                               curr_center[1] - prev_center[1]) * car_time_factor
+        car_time += math.dist(region_center, curr_center) * car_time_factor
    else:
        car_time = 0

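Besides switching to math.dist, this rewrite fixes a real bug: the old return leg mixed coordinates (curr_center[0] with prev_center[1]), so the final distance was wrong whenever the two centers differed. Read as a standalone function, the fixed loop computes an out-and-back tour time like this (a sketch; in the script car_time accumulates with +=, so it is assumed initialized to 0 earlier):

import math

def tour_time(centers, region_center, car_time_factor):
    """Region center -> each task center in order -> region center."""
    if not centers:
        return 0.0
    t = math.dist(centers[0], region_center) * car_time_factor
    for prev, curr in zip(centers, centers[1:]):
        t += math.dist(curr, prev) * car_time_factor
    t += math.dist(region_center, centers[-1]) * car_time_factor
    return t

print(tour_time([(3.0, 4.0)], (0.0, 0.0), 1.0))  # 10.0: out and back to one task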
@@ -150,7 +147,10 @@ for iteration in range(num_iterations):
        'R': R,
        'C': C,
        'row_boundaries': row_boundaries,
-        'col_boundaries': col_boundaries
+        'col_boundaries': col_boundaries,
+        'car_time': car_time,
+        'flight_time': total_flight_time,
+        'bs_time': total_bs_time
    }

 # ---------------------------
@@ -158,6 +158,8 @@ for iteration in range(num_iterations):
 # ---------------------------
 if best_solution is not None:
    print("Best T (longest completion time among all systems):", best_solution['T_max'])
+    print("Best plan found after", best_solution['iteration'], "simulations:")
+    print(best_solution['car_time'], best_solution['flight_time'], best_solution['bs_time'])
    for i in range(k):
        num_tasks = len(best_solution['system_tasks'][i])
        print(
@@ -168,7 +170,7 @@ else:
 # Print detailed information after the best plan
 if best_solution is not None:
    print("\nDetailed information for each system:")
-    region_center = (H / 2.0, W / 2.0)
+    region_center = (W / 2.0, H / 2.0)

    for system_id, tasks in best_solution['system_tasks'].items():
        print(f"\nTask details for system {system_id}:")