From 2c88915112d36a753b992ce829ec0f43f38c4723 Mon Sep 17 00:00:00 2001
From: weixin_46229132 <weixin_46229132@noreply.gitcode.com>
Date: Fri, 28 Mar 2025 21:37:31 +0800
Subject: [PATCH] =?UTF-8?q?=E8=B7=91=E9=80=9APPO=20partition?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PPO_Continuous/main.py               |  10 +-
 env_partion.py                       | 168 +++++++++++++++++++++++++++
 human_action.py                      |  15 ++-
 Q_learning/mTSP.py => mTSP_solver.py |  51 ++++++--
 mtkl_sovler.py                       |   8 +-
 5 files changed, 227 insertions(+), 25 deletions(-)
 create mode 100644 env_partion.py
 rename Q_learning/mTSP.py => mTSP_solver.py (80%)

diff --git a/PPO_Continuous/main.py b/PPO_Continuous/main.py
index 4ba824e..573c109 100644
--- a/PPO_Continuous/main.py
+++ b/PPO_Continuous/main.py
@@ -9,7 +9,7 @@ from PPO import PPO_agent
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from env import PartitionMazeEnv
+from env_partion import PartitionEnv
 # fmt: on
 
 '''Hyperparameter Setting'''
@@ -68,16 +68,16 @@ print(opt)
 
 
 def main():
-    EnvName = ['PartitionMaze_PPO_Continuous', 'Pendulum-v1', 'LunarLanderContinuous-v2',
+    EnvName = ['Partition_PPO_Continuous', 'Pendulum-v1', 'LunarLanderContinuous-v2',
                'Humanoid-v4', 'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
-    BrifEnvName = ['PM_PPO_Con', 'PV1', 'LLdV2',
+    BrifEnvName = ['Part_PPO_Con', 'PV1', 'LLdV2',
                    'Humanv4', 'HCv4', 'BWv3', 'BWHv3']
 
     # Build Env
     # env = gym.make(EnvName[opt.EnvIdex], render_mode = "human" if opt.render else None)
-    env = PartitionMazeEnv()
+    env = PartitionEnv()
     # eval_env = gym.make(EnvName[opt.EnvIdex])
-    eval_env = PartitionMazeEnv()
+    eval_env = PartitionEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.shape[0]
     opt.max_action = float(env.action_space.high[0])
diff --git a/env_partion.py b/env_partion.py
new file mode 100644
index 0000000..b7706cc
--- /dev/null
+++ b/env_partion.py
@@ -0,0 +1,168 @@
+import gymnasium as gym
+from gymnasium import spaces
+import numpy as np
+import yaml
+import math
+from mTSP_solver import mTSP
+
+
+class PartitionEnv(gym.Env):
+    """
+    Region-partition environment: every step is one continuous cut in [0, 1].
+
+    The first ROW_CUT_LIMIT actions are row cuts, the rest column cuts. After
+    CUT_NUM actions the rectangles are passed to the mTSP solver; the reward
+    is BASE_LINE minus its best time, or -100 for an infeasible partition.
+    """
+
+    def __init__(self, config=None):
+        """Load '<self.params>.yml' and build the spaces; `config` is unused."""
+        super().__init__()
+        ##############################
+        # Hyperparameters that may need manual adjustment
+        ##############################
+        self.params = 'params3'   # basename of the YAML parameter file
+        self.CUT_NUM = 2          # number of cut decisions per episode
+        self.ROW_CUT_LIMIT = 1    # leading decisions treated as row cuts
+        self.COL_CUT_LIMIT = 1    # not read anywhere in this class
+        self.BASE_LINE = 5000     # reward = BASE_LINE - best mTSP time
+
+        # Fleet parameters
+        with open(self.params + '.yml', 'r', encoding='utf-8') as file:
+            params = yaml.safe_load(file)
+
+        self.H = params['H']
+        self.W = params['W']
+        self.center = (self.H/2, self.W/2)
+        self.num_cars = params['num_cars']
+
+        self.flight_time_factor = params['flight_time_factor']
+        self.comp_time_factor = params['comp_time_factor']
+        self.trans_time_factor = params['trans_time_factor']
+        self.car_time_factor = params['car_time_factor']
+        self.bs_time_factor = params['bs_time_factor']
+
+        self.flight_energy_factor = params['flight_energy_factor']
+        self.comp_energy_factor = params['comp_energy_factor']
+        self.trans_energy_factor = params['trans_energy_factor']
+        self.battery_energy_capacity = params['battery_energy_capacity']
+
+        # Set here as well as in reset() so render() works before reset().
+        self.phase = 0
+        self.partition_step = 0      # index of the next cut decision
+        self.partition_values = np.zeros(
+            self.CUT_NUM, dtype=np.float32)  # cut values chosen so far
+
+        # Action space: a single continuous value in [0, 1]
+        self.action_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+
+        # Observation: the CUT_NUM cut values (0 for undecided entries)
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(self.CUT_NUM,), dtype=np.float32)
+
+        # Partition results, filled in by step()
+        self.col_cuts = []     # column boundaries, including 0 and 1
+        self.row_cuts = []     # row boundaries, including 0 and 1
+        self.rectangles = []   # one dict per rectangle
+
+    def reset(self, seed=None, options=None):
+        """Start a new episode and return the all-zero state vector."""
+        super().reset(seed=seed)
+        self.phase = 0
+        self.partition_step = 0
+        self.partition_values = np.zeros(self.CUT_NUM, dtype=np.float32)
+        self.col_cuts = []
+        self.row_cuts = []
+        self.rectangles = []
+
+        # Return a copy so the caller's array is not mutated by step().
+        return self.partition_values.copy()
+
+    def _build_rectangles(self):
+        """Fill self.rectangles from the current row/column boundaries.
+
+        Returns False as soon as a rectangle has a negative energy-limited
+        offloading rate (the partition is then infeasible), True otherwise.
+        """
+        # Does not depend on the rectangle, so compute it once.
+        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
+            (self.comp_time_factor - self.trans_time_factor)
+        for i in range(len(self.row_cuts) - 1):
+            for j in range(len(self.col_cuts) - 1):
+                # Rectangle area
+                d = (self.col_cuts[j+1] - self.col_cuts[j]) * self.W * \
+                    (self.row_cuts[i+1] - self.row_cuts[i]) * self.H
+                rho_energy_limit = (self.battery_energy_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
+                    (self.comp_energy_factor * d -
+                        self.trans_energy_factor * d)
+                if rho_energy_limit < 0:
+                    return False
+                rho = min(rho_time_limit, rho_energy_limit)
+
+                self.rectangles.append({
+                    'center': ((self.row_cuts[i] + self.row_cuts[i+1]) * self.H / 2, (self.col_cuts[j+1] + self.col_cuts[j]) * self.W / 2),
+                    'flight_time': self.flight_time_factor * d,
+                    'bs_time': self.bs_time_factor * (1 - rho) * d,
+                })
+        return True
+
+    def step(self, action):
+        """Record one cut; after the last cut, score the whole partition.
+
+        Returns (state, reward, terminated, truncated, info). `info` is an
+        empty dict, except after a valid final cut where it is the best path
+        returned by the mTSP solver.
+        """
+        # The action is 1-dimensional; take action[0]
+        self.partition_values[self.partition_step] = float(action[0])
+        self.partition_step += 1
+
+        # Copy so transitions stored by the caller are not aliased to the
+        # internal buffer, which is written in place on the next step.
+        state = self.partition_values.copy()
+
+        # Cuts still missing: no reward yet, episode continues
+        if self.partition_step < self.CUT_NUM:
+            return state, 0.0, False, False, {}
+
+        # All cuts made: drop zeros ("no cut"), de-duplicate and sort
+        rows = sorted(
+            set(v for v in self.partition_values[:self.ROW_CUT_LIMIT] if v > 0))
+        cols = sorted(
+            set(v for v in self.partition_values[self.ROW_CUT_LIMIT:] if v > 0))
+
+        # Boundaries always include 0 and 1
+        self.row_cuts = [0.0] + rows + [1.0]
+        self.col_cuts = [0.0] + cols + [1.0]
+
+        if not self._build_rectangles():
+            return state, -100, True, False, {}
+
+        # Route planning over the rectangle centres.
+        # cities: [[x1, x2, x3...], [y1, y2, y3...]], self.center in column 0
+        cities = np.column_stack(
+            [rec_info['center'] for rec_info in self.rectangles])
+        cities = np.column_stack((self.center, cities))
+
+        # One extra self.center column per additional car
+        center_idx = []
+        for _ in range(self.num_cars - 1):
+            cities = np.column_stack((cities, self.center))
+            center_idx.append(cities.shape[1] - 1)
+
+        tsp = mTSP(params=self.params, num_cities=cities.shape[1], cities=cities, num_cars=self.num_cars,
+                   center_idx=center_idx, rectangles=self.rectangles)
+        best_time, best_path = tsp.train(10000)
+
+        return state, self.BASE_LINE - best_time, True, False, best_path
+
+    def render(self):
+        """Print a short text summary for phases 1 and 2; no-op otherwise."""
+        if self.phase == 1:
+            print("Phase 1: Initialize maze environment.")
+            print(f"Partition values so far: {self.partition_values}")
+            print(f"Motorcade positon: {getattr(self, 'car_pos', None)}")
+        elif self.phase == 2:
+            print("Phase 2: Play maze.")
+            print(f'Motorcade trajectory: {getattr(self, "car_traj", None)}')
diff --git a/human_action.py b/human_action.py
index da4125d..6d0c98f 100644
--- a/human_action.py
+++ b/human_action.py
@@ -1,18 +1,23 @@
-from env import PartitionMazeEnv
+# from env import PartitionMazeEnv
 # from env_dis import PartitionMazeEnv
+from env_partion import PartitionEnv
 
-env = PartitionMazeEnv()
+# env = PartitionMazeEnv()
+env = PartitionEnv()
 
 state = env.reset()
-print(state)
+print('state:', state)
 
-action_series = [[0.67], [0], [0], [0], [0.7]]
+# action_series = [[0.67], [0], [0], [0], [0.7]]
 # action_series = [0, 0, 3, 0, 10]
+action_series = [[0.5], [0.5]]
 
-for i in range(100):
-    action = action_series[i]
-    state, reward, done, info, _ = env.step(action)
-    print(state)
-    print(reward)
+# Replay the scripted actions in order; iterating the list directly avoids
+# an IndexError if the episode is still running when the actions run out.
+for action in action_series:
+    # step() returns (state, reward, terminated, truncated, info)
+    state, reward, done, truncated, info = env.step(action)
+    print('state:', state)
+    print('reward:', reward)
     if done:
         break
diff --git a/Q_learning/mTSP.py b/mTSP_solver.py
similarity index 80%
rename from Q_learning/mTSP.py
rename to mTSP_solver.py
index eb56a98..c79b07d 100644
--- a/Q_learning/mTSP.py
+++ b/mTSP_solver.py
@@ -1,17 +1,20 @@
 import numpy as np
+import yaml
 
 
-class TSP(object):
+class mTSP(object):
     '''
     用 Q-Learning 求解 TSP 问题
     作者 Surfer Zen @ https://www.zhihu.com/people/surfer-zen
     '''
 
     def __init__(self,
+                 params='params',
                  num_cities=15,
                  cities=None,
                  num_cars=2,
                  center_idx=[0],
+                 rectangles=None,
                  alpha=2,
                  beta=1,
                  learning_rate=0.001,
@@ -29,6 +32,7 @@ class TSP(object):
         self.cities = cities
         self.num_cars = num_cars
         self.center_idx = center_idx
+        self.rectangles = rectangles
 
         self.alpha = alpha
         self.beta = beta
@@ -41,6 +45,24 @@ class TSP(object):
         self.best_path = None
         self.best_path_length = np.inf
 
+        with open(params+'.yml', 'r', encoding='utf-8') as file:
+            params = yaml.safe_load(file)
+
+        self.H = params['H']
+        self.W = params['W']
+        self.num_cars = params['num_cars']
+
+        self.flight_time_factor = params['flight_time_factor']
+        self.comp_time_factor = params['comp_time_factor']
+        self.trans_time_factor = params['trans_time_factor']
+        self.car_time_factor = params['car_time_factor']
+        self.bs_time_factor = params['bs_time_factor']
+
+        self.flight_energy_factor = params['flight_energy_factor']
+        self.comp_energy_factor = params['comp_energy_factor']
+        self.trans_energy_factor = params['trans_energy_factor']
+        self.battery_energy_capacity = params['battery_energy_capacity']
+
     def get_dist_matrix(self):
         '''
         根据城市坐标，计算距离矩阵
@@ -137,13 +159,21 @@ class TSP(object):
         '''
         split_result = self.split_path(path)
 
-        length_lt = []
+        time_lt = []
         for car_path in split_result:
             path_length = 0
+            flight_time = 0
+            bs_time = 0
             for fr, to in zip(car_path[:-1], car_path[1:]):
                 path_length += self.distances[fr, to]
-            length_lt.append(path_length)
-        return max(length_lt)
+            car_time = path_length * self.car_time_factor
+            for offset_rec_idx in car_path[1:-1]:
+                flight_time += self.rectangles[offset_rec_idx -
+                                               1]['flight_time']
+                bs_time += self.rectangles[offset_rec_idx - 1]['bs_time']
+            system_time = max(flight_time + car_time, bs_time)
+            time_lt.append(system_time)
+        return max(time_lt)
 
     def split_path(self, path):
         # 分割路径
@@ -197,23 +227,24 @@ class TSP(object):
         '''
         for epoch in range(num_epochs):
             self.train_for_one_rollout(start_city_id=0)
+        return self.best_path_length, self.best_path
 
 
 def main():
     np.random.seed(42)
     center = np.array([0, 0])
     # cities: [[x1, x2, x3...], [y1, y2, y3...]] 城市坐标
-    cites = np.random.random([2, 15]) * np.array([800, 600]).reshape(2, -1)
-    # cites = np.array([[10, -10], [0, 0]])
-    cites = np.column_stack((center, cites))
+    cities = np.random.random([2, 15]) * np.array([800, 600]).reshape(2, -1)
+    # cities = np.array([[10, -10], [0, 0]])
+    cities = np.column_stack((center, cities))
 
     num_cars = 2
     center_idx = []
     for i in range(num_cars - 1):
-        cites = np.column_stack((cites, center))
-        center_idx.append(cites.shape[1] - 1)
+        cities = np.column_stack((cities, center))
+        center_idx.append(cities.shape[1] - 1)
 
-    tsp = TSP(num_cities=cites.shape[1], cities=cites,
+    tsp = mTSP(num_cities=cities.shape[1], cities=cities,
               num_cars=num_cars, center_idx=center_idx)
 
     # 训练模型
diff --git a/mtkl_sovler.py b/mtkl_sovler.py
index 5af8701..5014705 100644
--- a/mtkl_sovler.py
+++ b/mtkl_sovler.py
@@ -11,13 +11,13 @@ random.seed(42)
 # ---------------------------
 # 需要修改的超参数
 # ---------------------------
-num_iterations = 1000000
+num_iterations = 10000
 # 随机生成分区的行分段数与列分段数
 # R = random.randint(0, 3)  # 行分段数
 # C = random.randint(0, 3)  # 列分段数
-R = 3
-C = 3
-params_file = 'params2'
+R = 1
+C = 1
+params_file = 'params3'
 
 
 with open(params_file + '.yml', 'r', encoding='utf-8') as file: