From b1851ac489ec0a46c2821eaa75dde91814a6242a Mon Sep 17 00:00:00 2001
From: weixin_46229132
Date: Thu, 13 Mar 2025 10:46:28 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PPO/env.py       |  12 ++---
 PPO/main_test.py | 129 +++++++++++++++++++++++++++++++++++++++++++++++
 mtkl_sovler.py   |  26 +++++-----
 3 files changed, 148 insertions(+), 19 deletions(-)
 create mode 100644 PPO/main_test.py

diff --git a/PPO/env.py b/PPO/env.py
index 0cae274..5ad71c7 100644
--- a/PPO/env.py
+++ b/PPO/env.py
@@ -61,7 +61,7 @@ class PartitionMazeEnv(gym.Env):
 
         # 路径规划阶段相关变量
         self.MAX_STEPS = 50  # 迷宫走法步数上限
-        self.BASE_LINE = 3400.0  # 基准时间,通过greedy或者蒙特卡洛计算出来
+        self.BASE_LINE = 3500.0  # 基准时间,通过greedy或者蒙特卡洛计算出来
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
@@ -139,7 +139,7 @@
                     bs_time = self.bs_time_factor * (1 - rho) * d
 
                     self.rectangles[(i, j)] = {
-                        'center': ((h_boundaries[i] + h_boundaries[i+1]) * self.H / 2, (v_boundaries[j+1] + v_boundaries[j]) * self.W / 2),
+                        'center': ((v_boundaries[j+1] + v_boundaries[j]) * self.W / 2, (h_boundaries[i] + h_boundaries[i+1]) * self.H / 2),
                         'flight_time': flight_time,
                         'bs_time': bs_time,
                         'is_visited': False
@@ -247,10 +247,8 @@
                 # print(T)
                 # print(self.car_traj)
                 reward += -(T - self.BASE_LINE)
-                print(T)
-                print(self.car_traj)
             elif done and self.step_count >= self.MAX_STEPS:
-                reward += -100
+                reward += -10000
 
         return state, reward, done, False, {}
 
@@ -269,8 +267,8 @@
                     second_point = self.car_traj[idx][i + 1]
                     car_time += math.dist(self.rectangles[tuple(first_point)]['center'], self.rectangles[tuple(second_point)]['center']) * \
                         self.car_time_factor
-            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][0])]['center'], [self.H / 2, self.W / 2])
-            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][-1])]['center'], [self.H / 2, self.W / 2])
+            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][0])]['center'], [self.W / 2, self.H / 2]) * self.car_time_factor
+            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][-1])]['center'], [self.W / 2, self.H / 2]) * self.car_time_factor
 
         return max(float(car_time) + flight_time, bs_time)
 
diff --git a/PPO/main_test.py b/PPO/main_test.py
new file mode 100644
index 0000000..bab2b28
--- /dev/null
+++ b/PPO/main_test.py
@@ -0,0 +1,129 @@
+"""
+    This file is the executable for running PPO. It is based on this medium article:
+    https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8
+"""
+
+import gymnasium as gym
+import sys
+import torch
+import argparse
+
+from ppo import PPO
+from network import FeedForwardNN
+from eval_policy import eval_policy
+from env import PartitionMazeEnv
+
+
+def train(env, hyperparameters, actor_model, critic_model):
+    """
+        Trains the model.
+
+        Parameters:
+            env - the environment to train on
+            hyperparameters - a dict of hyperparameters to use, defined in main
+            actor_model - the actor model to load in if we want to continue training
+            critic_model - the critic model to load in if we want to continue training
+
+        Return:
+            None
+    """
+    print(f"Training", flush=True)
+
+    # Create a model for PPO.
+    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)
+
+    # Tries to load in an existing actor/critic model to continue training on
+    if actor_model != '' and critic_model != '':
+        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
+        model.actor.load_state_dict(torch.load(actor_model))
+        model.critic.load_state_dict(torch.load(critic_model))
+        print(f"Successfully loaded.", flush=True)
+    elif actor_model != '' or critic_model != '':  # Don't train from scratch if user accidentally forgets actor/critic model
+        print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!")
+        sys.exit(0)
+    else:
+        print(f"Training from scratch.", flush=True)
+
+    # Train the PPO model with a specified total timesteps
+    # NOTE: You can change the total timesteps here, I put a big number just because
+    # you can kill the process whenever you feel like PPO is converging
+    model.learn(total_timesteps=200_000_000)
+
+
+def test(env, actor_model):
+    """
+        Tests the model.
+
+        Parameters:
+            env - the environment to test the policy on
+            actor_model - the actor model to load in
+
+        Return:
+            None
+    """
+    print(f"Testing {actor_model}", flush=True)
+
+    # If the actor model is not specified, then exit
+    if actor_model == '':
+        print(f"Didn't specify model file. Exiting.", flush=True)
+        sys.exit(0)
+
+    # Extract out dimensions of observation and action spaces
+    obs_dim = env.observation_space.shape[0]
+    act_dim = env.action_space.shape[0]
+
+    # Build our policy the same way we build our actor model in PPO
+    policy = FeedForwardNN(obs_dim, act_dim)
+
+    # Load in the actor model saved by the PPO algorithm
+    policy.load_state_dict(torch.load(actor_model))
+
+    # Evaluate our policy with a separate module, eval_policy, to demonstrate
+    # that once we are done training the model/policy with ppo.py, we no longer need
+    # ppo.py since it only contains the training algorithm. The model/policy itself exists
+    # independently as a binary file that can be loaded in with torch.
+    eval_policy(policy=policy, env=env, render=True)
+
+
+def main(args):
+    """
+        The main function to run.
+
+        Parameters:
+            args - the arguments parsed from command line
+
+        Return:
+            None
+    """
+    # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of
+    # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here.
+    # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters
+    hyperparameters = {
+        'timesteps_per_batch': 2048,
+        'max_timesteps_per_episode': 200,
+        'gamma': 0.99,
+        'n_updates_per_iteration': 10,
+        'lr': 3e-4,
+        'clip': 0.2,
+        'render': True,
+        'render_every_i': 10
+    }
+
+    # Creates the environment we'll be running. If you want to replace with your own
+    # custom environment, note that it must inherit Gym and have both continuous
+    # observation and action spaces.
+    # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array')
+    env = PartitionMazeEnv()
+
+    # Train or test, depending on the mode specified
+    if args.mode == 'train':
+        train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model)
+    else:
+        test(env=env, actor_model=args.actor_model)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--mode', dest='mode', type=str, default='test')  # can be 'train' or 'test'
+    parser.add_argument('--actor_model', dest='actor_model', type=str, default='./weights/ppo_actor.pth')  # your actor model filename
+    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')  # your critic model filename
+
+    args = parser.parse_args()
+    main(args)
diff --git a/mtkl_sovler.py b/mtkl_sovler.py
index d746e91..9afcec0 100644
--- a/mtkl_sovler.py
+++ b/mtkl_sovler.py
@@ -6,7 +6,7 @@ import yaml
 
 # 固定随机种子,便于复现
 random.seed(42)
-num_iterations = 1000000
+num_iterations = 10000
 
 # ---------------------------
 # 参数设置
@@ -117,17 +117,14 @@
         total_flight_time = sum(task['flight_time'] for task in tasks)
         if tasks:
             # 车辆从区域中心到第一个任务中心
-            car_time = math.hypot(tasks[0]['center'][0] - region_center[0],
-                                  tasks[0]['center'][1] - region_center[1]) * car_time_factor
+            car_time += math.dist(tasks[0]['center'], region_center) * car_time_factor
             # 依次经过任务中心
-            for j in range(1, len(tasks)):
-                prev_center = tasks[j - 1]['center']
-                curr_center = tasks[j]['center']
-                car_time += math.hypot(curr_center[0] - prev_center[0],
-                                       curr_center[1] - prev_center[1]) * car_time_factor
+            for j in range(len(tasks) - 1):
+                prev_center = tasks[j]['center']
+                curr_center = tasks[j + 1]['center']
+                car_time += math.dist(curr_center, prev_center) * car_time_factor
             # 回到区域中心
-            car_time += math.hypot(curr_center[0] - region_center[0],
-                                   curr_center[1] - prev_center[1]) * car_time_factor
+            car_time += math.dist(region_center, curr_center) * car_time_factor
         else:
             car_time = 0
 
@@ -150,7 +147,10 @@
             'R': R,
             'C': C,
             'row_boundaries': row_boundaries,
-            'col_boundaries': col_boundaries
+            'col_boundaries': col_boundaries,
+            'car_time': car_time,
+            'flight_time': total_flight_time,
+            'bs_time': total_bs_time
         }
 
 # ---------------------------
@@ -158,6 +158,8 @@
 # ---------------------------
 if best_solution is not None:
     print("最佳 T (各系统中最长的完成时间):", best_solution['T_max'])
+    print(best_solution['iteration'], "次模拟后找到最佳方案:")
+    print(best_solution['car_time'], best_solution['flight_time'], best_solution['bs_time'])
     for i in range(k):
         num_tasks = len(best_solution['system_tasks'][i])
         print(
@@ -168,7 +170,7 @@ else:
 
 # 在输出最佳方案后添加详细信息
 if best_solution is not None:
    print("\n各系统详细信息:")
-    region_center = (H / 2.0, W / 2.0)
+    region_center = (W / 2.0, H / 2.0)
    for system_id, tasks in best_solution['system_tasks'].items():
        print(f"\n系统 {system_id} 的任务详情:")
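Reviewer notes (not part of the patch; the snippets below are illustrative sketches, not code from this repo).

In PPO/env.py, BASE_LINE rises from 3400.0 to 3500.0 (per the in-code comment, a baseline time taken from the greedy or Monte Carlo solver), the rectangle 'center' tuples and the depot point are reordered to (x, y) form with the depot legs now scaled by car_time_factor, the stray debug prints are dropped, and the timeout penalty hardens from -100 to -10000. A minimal sketch of the terminal-reward shape this produces, assuming T is the completion time computed for a finished episode:

    def terminal_reward(T, completed, base_line=3500.0):
        # Sketch only: mirrors the two terminal branches in PartitionMazeEnv.step
        # after this patch. Finishing faster than the baseline yields a positive
        # reward; hitting the 50-step cap without visiting every rectangle yields
        # a large fixed penalty.
        if completed:
            return -(T - base_line)
        return -10000.0

    # e.g. terminal_reward(3400.0, True) == 100.0, terminal_reward(3600.0, True) == -100.0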
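PPO/main_test.py is the PPO-for-beginners driver from the linked Medium series, pointed at PartitionMazeEnv instead of gym.make('Pendulum-v1'). Going by the argparse defaults alone (mode='test', actor_model='./weights/ppo_actor.pth', critic_model=''), plausible invocations from inside PPO/ would be:

    python main_test.py --mode test                    # load ./weights/ppo_actor.pth and run eval_policy
    python main_test.py --mode train --actor_model ''  # train from scratch; a non-empty actor path with an empty critic path trips the both-or-none guard and exits

These assume ppo.py, network.py, eval_policy.py and the weights/ directory sit next to main_test.py, none of which is shown in this patch.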
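In mtkl_sovler.py the hand-rolled math.hypot distances are replaced with math.dist, which also fixes the return-to-center leg that previously mixed prev_center into its second coordinate; num_iterations drops from 1000000 to 10000, region_center becomes (W / 2.0, H / 2.0) to match the (x, y) centers, and the best solution now also records car_time, flight_time and bs_time. A standard-library check of the hypot/dist equivalence the rewrite relies on:

    import math

    # math.dist(p, q) is the Euclidean distance between points, i.e. math.hypot
    # applied to the coordinate differences.
    a, b = (3.0, 4.0), (0.0, 0.0)
    assert math.dist(a, b) == math.hypot(a[0] - b[0], a[1] - b[1]) == 5.0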