From b1851ac489ec0a46c2821eaa75dde91814a6242a Mon Sep 17 00:00:00 2001
From: weixin_46229132
Date: Thu, 13 Mar 2025 10:46:28 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PPO/env.py       |  12 ++---
 PPO/main_test.py | 129 +++++++++++++++++++++++++++++++++++++++++++++++
 mtkl_sovler.py   |  26 +++++-----
 3 files changed, 148 insertions(+), 19 deletions(-)
 create mode 100644 PPO/main_test.py

diff --git a/PPO/env.py b/PPO/env.py
index 0cae274..5ad71c7 100644
--- a/PPO/env.py
+++ b/PPO/env.py
@@ -61,7 +61,7 @@ class PartitionMazeEnv(gym.Env):
 
         # 路径规划阶段相关变量
         self.MAX_STEPS = 50  # 迷宫走法步数上限
-        self.BASE_LINE = 3400.0  # 基准时间,通过greedy或者蒙特卡洛计算出来
+        self.BASE_LINE = 3500.0  # 基准时间,通过greedy或者蒙特卡洛计算出来
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [[0.5, 0.5] for _ in range(self.num_cars)]
@@ -139,7 +139,7 @@
                     bs_time = self.bs_time_factor * (1 - rho) * d
 
                     self.rectangles[(i, j)] = {
-                        'center': ((h_boundaries[i] + h_boundaries[i+1]) * self.H / 2, (v_boundaries[j+1] + v_boundaries[j]) * self.W / 2),
+                        'center': ((v_boundaries[j+1] + v_boundaries[j]) * self.W / 2, (h_boundaries[i] + h_boundaries[i+1]) * self.H / 2),
                         'flight_time': flight_time,
                         'bs_time': bs_time,
                         'is_visited': False
@@ -247,10 +247,8 @@
                 # print(T)
                 # print(self.car_traj)
                 reward += -(T - self.BASE_LINE)
-                print(T)
-                print(self.car_traj)
             elif done and self.step_count >= self.MAX_STEPS:
-                reward += -100
+                reward += -10000
 
         return state, reward, done, False, {}
 
@@ -269,8 +267,8 @@
                     second_point = self.car_traj[idx][i + 1]
                     car_time += math.dist(self.rectangles[tuple(first_point)]['center'], self.rectangles[tuple(second_point)]['center']) * \
                         self.car_time_factor
-            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][0])]['center'], [self.H / 2, self.W / 2])
-            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][-1])]['center'], [self.H / 2, self.W / 2])
+            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][0])]['center'], [self.W / 2, self.H / 2]) * self.car_time_factor
+            car_time += math.dist(self.rectangles[tuple(self.car_traj[idx][-1])]['center'], [self.W / 2, self.H / 2]) * self.car_time_factor
 
         return max(float(car_time) + flight_time, bs_time)
 
diff --git a/PPO/main_test.py b/PPO/main_test.py
new file mode 100644
index 0000000..bab2b28
--- /dev/null
+++ b/PPO/main_test.py
@@ -0,0 +1,129 @@
+"""
+    This file is the executable for running PPO. It is based on this medium article:
+    https://medium.com/@eyyu/coding-ppo-from-scratch-with-pytorch-part-1-4-613dfc1b14c8
+"""
+
+import gymnasium as gym
+import sys
+import torch
+import argparse
+
+from ppo import PPO
+from network import FeedForwardNN
+from eval_policy import eval_policy
+from env import PartitionMazeEnv
+
+
+def train(env, hyperparameters, actor_model, critic_model):
+    """
+        Trains the model.
+
+        Parameters:
+            env - the environment to train on
+            hyperparameters - a dict of hyperparameters to use, defined in main
+            actor_model - the actor model to load in if we want to continue training
+            critic_model - the critic model to load in if we want to continue training
+
+        Return:
+            None
+    """
+    print(f"Training", flush=True)
+
+    # Create a model for PPO.
+    model = PPO(policy_class=FeedForwardNN, env=env, **hyperparameters)
+
+    # Tries to load in an existing actor/critic model to continue training on
+    if actor_model != '' and critic_model != '':
+        print(f"Loading in {actor_model} and {critic_model}...", flush=True)
+        model.actor.load_state_dict(torch.load(actor_model))
+        model.critic.load_state_dict(torch.load(critic_model))
+        print(f"Successfully loaded.", flush=True)
+    elif actor_model != '' or critic_model != '':  # Don't train from scratch if user accidentally forgets actor/critic model
+        print(f"Error: Either specify both actor/critic models or none at all. We don't want to accidentally override anything!")
+        sys.exit(0)
+    else:
+        print(f"Training from scratch.", flush=True)
+
+    # Train the PPO model with a specified total timesteps
+    # NOTE: You can change the total timesteps here, I put a big number just because
+    # you can kill the process whenever you feel like PPO is converging
+    model.learn(total_timesteps=200_000_000)
+
+
+def test(env, actor_model):
+    """
+        Tests the model.
+
+        Parameters:
+            env - the environment to test the policy on
+            actor_model - the actor model to load in
+
+        Return:
+            None
+    """
+    print(f"Testing {actor_model}", flush=True)
+
+    # If the actor model is not specified, then exit
+    if actor_model == '':
+        print(f"Didn't specify model file. Exiting.", flush=True)
+        sys.exit(0)
+
+    # Extract out dimensions of observation and action spaces
+    obs_dim = env.observation_space.shape[0]
+    act_dim = env.action_space.shape[0]
+
+    # Build our policy the same way we build our actor model in PPO
+    policy = FeedForwardNN(obs_dim, act_dim)
+
+    # Load in the actor model saved by the PPO algorithm
+    policy.load_state_dict(torch.load(actor_model))
+
+    # Evaluate our policy with a separate module, eval_policy, to demonstrate
+    # that once we are done training the model/policy with ppo.py, we no longer need
+    # ppo.py since it only contains the training algorithm. The model/policy itself exists
+    # independently as a binary file that can be loaded in with torch.
+    eval_policy(policy=policy, env=env, render=True)
+
+
+def main(args):
+    """
+        The main function to run.
+
+        Parameters:
+            args - the arguments parsed from command line
+
+        Return:
+            None
+    """
+    # NOTE: Here's where you can set hyperparameters for PPO. I don't include them as part of
+    # ArgumentParser because it's too annoying to type them every time at command line. Instead, you can change them here.
+    # To see a list of hyperparameters, look in ppo.py at function _init_hyperparameters
+    hyperparameters = {
+        'timesteps_per_batch': 2048,
+        'max_timesteps_per_episode': 200,
+        'gamma': 0.99,
+        'n_updates_per_iteration': 10,
+        'lr': 3e-4,
+        'clip': 0.2,
+        'render': True,
+        'render_every_i': 10
+    }
+
+    # Creates the environment we'll be running. If you want to replace with your own
+    # custom environment, note that it must inherit Gym and have both continuous
+    # observation and action spaces.
+    # env = gym.make('Pendulum-v1', render_mode='human' if args.mode == 'test' else 'rgb_array')
+    env = PartitionMazeEnv()
+
+    # Train or test, depending on the mode specified
+    if args.mode == 'train':
+        train(env=env, hyperparameters=hyperparameters, actor_model=args.actor_model, critic_model=args.critic_model)
+    else:
+        test(env=env, actor_model=args.actor_model)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--mode', dest='mode', type=str, default='test')  # can be 'train' or 'test'
+    parser.add_argument('--actor_model', dest='actor_model', type=str, default='./weights/ppo_actor.pth')  # your actor model filename
+    parser.add_argument('--critic_model', dest='critic_model', type=str, default='')  # your critic model filename
+
+    args = parser.parse_args()
+    main(args)
diff --git a/mtkl_sovler.py b/mtkl_sovler.py
index d746e91..9afcec0 100644
--- a/mtkl_sovler.py
+++ b/mtkl_sovler.py
@@ -6,7 +6,7 @@ import yaml
 
 # 固定随机种子,便于复现
 random.seed(42)
-num_iterations = 1000000
+num_iterations = 10000
 
 # ---------------------------
 # 参数设置
@@ -117,17 +117,14 @@
         total_flight_time = sum(task['flight_time'] for task in tasks)
         if tasks:
             # 车辆从区域中心到第一个任务中心
-            car_time = math.hypot(tasks[0]['center'][0] - region_center[0],
-                                  tasks[0]['center'][1] - region_center[1]) * car_time_factor
+            car_time += math.dist(tasks[0]['center'], region_center) * car_time_factor
             # 依次经过任务中心
-            for j in range(1, len(tasks)):
-                prev_center = tasks[j - 1]['center']
-                curr_center = tasks[j]['center']
-                car_time += math.hypot(curr_center[0] - prev_center[0],
-                                       curr_center[1] - prev_center[1]) * car_time_factor
+            for j in range(len(tasks) - 1):
+                prev_center = tasks[j]['center']
+                curr_center = tasks[j + 1]['center']
+                car_time += math.dist(curr_center, prev_center) * car_time_factor
             # 回到区域中心
-            car_time += math.hypot(curr_center[0] - region_center[0],
-                                   curr_center[1] - prev_center[1]) * car_time_factor
+            car_time += math.dist(region_center, curr_center) * car_time_factor
         else:
             car_time = 0
 
@@ -150,7 +147,10 @@
             'R': R,
             'C': C,
             'row_boundaries': row_boundaries,
-            'col_boundaries': col_boundaries
+            'col_boundaries': col_boundaries,
+            'car_time': car_time,
+            'flight_time': total_flight_time,
+            'bs_time': total_bs_time
         }
 
 # ---------------------------
@@ -158,6 +158,8 @@
 # ---------------------------
 if best_solution is not None:
     print("最佳 T (各系统中最长的完成时间):", best_solution['T_max'])
+    print(best_solution['iteration'], "次模拟后找到最佳方案:")
+    print(best_solution['car_time'], best_solution['flight_time'], best_solution['bs_time'])
     for i in range(k):
         num_tasks = len(best_solution['system_tasks'][i])
         print(
@@ -168,7 +170,7 @@ else:
 
 # 在输出最佳方案后添加详细信息
 if best_solution is not None:
    print("\n各系统详细信息:")
-    region_center = (H / 2.0, W / 2.0)
+    region_center = (W / 2.0, H / 2.0)
    for system_id, tasks in best_solution['system_tasks'].items():
        print(f"\n系统 {system_id} 的任务详情:")
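Reviewer notes (not part of the patch; the snippets below are illustrative sketches, not code from this repo).

In PPO/env.py, BASE_LINE rises from 3400.0 to 3500.0 (per the in-code comment, a baseline time taken from the greedy or Monte Carlo solver), the rectangle 'center' tuples and the depot point are reordered to (x, y) form with the depot legs now scaled by car_time_factor, the stray debug prints are dropped, and the timeout penalty hardens from -100 to -10000. A minimal sketch of the terminal-reward shape this produces, assuming T is the completion time computed for a finished episode:

    def terminal_reward(T, completed, base_line=3500.0):
        # Sketch only: mirrors the two terminal branches in PartitionMazeEnv.step
        # after this patch. Finishing faster than the baseline yields a positive
        # reward; hitting the 50-step cap without visiting every rectangle yields
        # a large fixed penalty.
        if completed:
            return -(T - base_line)
        return -10000.0

    # e.g. terminal_reward(3400.0, True) == 100.0, terminal_reward(3600.0, True) == -100.0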
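PPO/main_test.py is the PPO-for-beginners driver from the linked Medium series, pointed at PartitionMazeEnv instead of gym.make('Pendulum-v1'). Going by the argparse defaults alone (mode='test', actor_model='./weights/ppo_actor.pth', critic_model=''), plausible invocations from inside PPO/ would be:

    python main_test.py --mode test                    # load ./weights/ppo_actor.pth and run eval_policy
    python main_test.py --mode train --actor_model ''  # train from scratch; a non-empty actor path with an empty critic path trips the both-or-none guard and exits

These assume ppo.py, network.py, eval_policy.py and the weights/ directory sit next to main_test.py, none of which is shown in this patch.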
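In mtkl_sovler.py the hand-rolled math.hypot distances are replaced with math.dist, which also fixes the return-to-center leg that previously mixed prev_center into its second coordinate; num_iterations drops from 1000000 to 10000, region_center becomes (W / 2.0, H / 2.0) to match the (x, y) centers, and the best solution now also records car_time, flight_time and bs_time. A standard-library check of the hypot/dist equivalence the rewrite relies on:

    import math

    # math.dist(p, q) is the Euclidean distance between points, i.e. math.hypot
    # applied to the coordinate differences.
    a, b = (3.0, 4.0), (0.0, 0.0)
    assert math.dist(a, b) == math.hypot(a[0] - b[0], a[1] - b[1]) == 5.0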