From f347ca8276a6becb9201fac37e98fbaa3613d952 Mon Sep 17 00:00:00 2001
From: weixin_46229132
Date: Sat, 29 Mar 2025 16:28:30 +0800
Subject: [PATCH] Fine-tune partitioning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PPO_Continuous/main.py  |  16 +-
 PPO_Continuous/utils.py |   6 +-
 env.py                  |   7 +-
 env_partion.py          | 217 ++++++++++++++++++++--------------------
 human_action.py         |   2 +-
 5 files changed, 128 insertions(+), 120 deletions(-)

diff --git a/PPO_Continuous/main.py b/PPO_Continuous/main.py
index f306702..b44ea85 100644
--- a/PPO_Continuous/main.py
+++ b/PPO_Continuous/main.py
@@ -10,6 +10,7 @@ import sys
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env_partion import PartitionEnv
+# from env import PartitionMazeEnv
 # fmt: on

 '''Hyperparameter Setting'''
@@ -18,7 +19,7 @@ parser.add_argument('--dvc', type=str, default='cpu',
                     help='running device: cuda or cpu')
 parser.add_argument('--EnvIdex', type=int, default=0,
                     help='PM_PPO_Con, PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
-parser.add_argument('--write', type=str2bool, default=True,
+parser.add_argument('--write', type=str2bool, default=False,
                     help='Use SummaryWriter to record the training')
 parser.add_argument('--render', type=str2bool, default=False,
                     help='Render or Not')
@@ -28,7 +29,7 @@ parser.add_argument('--ModelIdex', type=int, default=500,
                     help='which model to load')
 parser.add_argument('--seed', type=int, default=0,
                     help='random seed')
-parser.add_argument('--T_horizon', type=int, default=20,
+parser.add_argument('--T_horizon', type=int, default=15,
                     help='length of long trajectory')
 parser.add_argument('--Distribution', type=str, default='Beta',
                     help='Should be one of Beta ; GS_ms ; GS_m')
@@ -36,7 +37,7 @@ parser.add_argument('--Max_train_steps', type=int, default=int(5e8),
                     help='Max training steps')
 parser.add_argument('--save_interval', type=int, default=int(5e5),
                     help='Model saving interval, in steps.')
-parser.add_argument('--eval_interval', type=int, default=int(5e1),
+parser.add_argument('--eval_interval', type=int, default=int(5e3),
                     help='Model evaluating interval, in steps.')
 parser.add_argument('--gamma', type=float, default=0.99,
@@ -74,10 +75,10 @@ def main():
                'Humanv4', 'HCv4', 'BWv3', 'BWHv3']

     # Build Env
-    # env = gym.make(EnvName[opt.EnvIdex], render_mode = "human" if opt.render else None)
     env = PartitionEnv()
-    # eval_env = gym.make(EnvName[opt.EnvIdex])
+    # env = PartitionMazeEnv()
     eval_env = PartitionEnv()
+    # eval_env = PartitionMazeEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.shape[0]
     opt.max_action = float(env.action_space.high[0])
@@ -129,9 +130,9 @@ def main():
             '''Interact with Env'''
             a, logprob_a = agent.select_action(
                 s, deterministic=False)  # use stochastic when training
-            # act = Action_adapter(a,opt.max_action)  # [0,1] to [-max,max]
+            act = Action_adapter(a, opt.max_action)  # [0,1] to [-max,max]
             s_next, r, dw, tr, info = env.step(
-                a)  # dw: dead&win; tr: truncated
+                act)  # dw: dead&win; tr: truncated
             # r = Reward_adapter(r, opt.EnvIdex)
             done = (dw or tr)

@@ -152,6 +153,7 @@ def main():
             # evaluate the policy for 3 times, and get averaged result
             score = evaluate_policy(
                 eval_env, agent, opt.max_action, turns=1)
+            # TODO: save the new path
             if opt.write:
                 writer.add_scalar(
                     'ep_r', score, global_step=total_steps)
diff --git a/PPO_Continuous/utils.py b/PPO_Continuous/utils.py
index 58b339d..aeb26cd 100644
--- a/PPO_Continuous/utils.py
+++ b/PPO_Continuous/utils.py
@@ -143,10 +143,10 @@ def evaluate_policy(env, agent, max_action, turns):
     while not done:
         # Take deterministic actions when evaluating
         a, logprob_a = agent.select_action(s, deterministic=True)
-        # act = Action_adapter(a, max_action)  # [0,1] to [-max,max]
-        s_next, r, dw, tr, info = env.step(a)
+        act = Action_adapter(a, max_action)  # [0,1] to [-max,max]
+        s_next, r, dw, tr, info = env.step(act)
         done = (dw or tr)
-        action_series.append(a[0])
+        action_series.append(act[0])
         total_scores += r
         s = s_next
     print('action series: ', np.round(action_series, 3))
diff --git a/env.py b/env.py
index 3696dd8..ec81e68 100644
--- a/env.py
+++ b/env.py
@@ -39,9 +39,9 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         # Hyperparameters that may need manual tuning
         ##############################
-        self.CUT_NUM = 4  # half horizontal cuts, half vertical cuts
-        self.BASE_LINE = 3500  # baseline time, computed via greedy or Monte Carlo
-        self.MAX_STEPS = 10  # upper bound on maze-walk steps
+        self.CUT_NUM = 6  # half horizontal cuts, half vertical cuts
+        self.BASE_LINE = 10000  # baseline time, computed via greedy or Monte Carlo
+        self.MAX_STEPS = 20  # upper bound on maze-walk steps

         self.phase = 0  # phase control: 0 = partitioning, 1 = maze init, 2 = maze walking
         self.partition_step = 0  # partition-phase step count, range 0~4
@@ -172,6 +172,7 @@ class PartitionMazeEnv(gym.Env):
             ])
             return state, reward, True, False, {}
         else:
+            print(self.partition_values)
             # Enter phase 1: initialize the maze
             self.phase = 1
             reward = 0.2
diff --git a/env_partion.py b/env_partion.py
index 65b20d3..a9da83f 100644
--- a/env_partion.py
+++ b/env_partion.py
@@ -19,12 +19,28 @@ class PartitionEnv(gym.Env):
         # Hyperparameters that may need manual tuning
         ##############################
         self.params = 'params2'
+        self.ORI_ROW_CUTS = [0, 0.2, 0.4, 0.7, 1]
+        self.ORI_COL_CUTS = [0, 0.5, 1]
         self.CUT_NUM = 4
         self.ROW_CUT_LIMIT = 3
         self.COL_CUT_LIMIT = 1
         self.BASE_LINE = 10000
         self.mTSP_STEPS = 10000

+        # Define the action space: every action is 1-D continuous in [0,1]
+        self.action_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+
+        # Define the observation space as an 8-D vector:
+        # the row cut positions followed by the column cut positions
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(self.CUT_NUM + 4,), dtype=np.float32)
+
+        self.partition_step = 0
+        self.ori_row_cuts = self.ORI_ROW_CUTS[:]
+        self.ori_col_cuts = self.ORI_COL_CUTS[:]
+        self.rectangles = []
+
         # Fleet parameter settings
         with open(self.params + '.yml', 'r', encoding='utf-8') as file:
             params = yaml.safe_load(file)
@@ -45,140 +61,129 @@ class PartitionEnv(gym.Env):
         self.trans_energy_factor = params['trans_energy_factor']
         self.battery_energy_capacity = params['battery_energy_capacity']

-        self.partition_step = 0  # partition-phase step count, range 0~4
-        self.partition_values = np.zeros(
-            self.CUT_NUM, dtype=np.float32)  # stores c1, c2, r1, r2
-
-        # Define the action space: every action is 1-D continuous in [0,1]
-        self.action_space = spaces.Box(
-            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
-
-        # Define the observation space as a 4-D vector
-        # (the cut values decided so far; undecided entries are 0)
-        self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(self.CUT_NUM,), dtype=np.float32)
-
-        # Partition-phase variables
-        self.col_cuts = []  # vertical cut positions (c1, c2); 0 means no cut
-        self.row_cuts = []  # horizontal cut positions (r1, r2)
-        self.rectangles = []
-
     def reset(self, seed=None, options=None):
         # Reset all variables and return to the partition phase (phase 0)
         self.phase = 0
         self.partition_step = 0
-        self.partition_values = np.zeros(self.CUT_NUM, dtype=np.float32)
-        self.col_cuts = []
-        self.row_cuts = []
+        self.ori_row_cuts = self.ORI_ROW_CUTS[:]
+        self.ori_col_cuts = self.ORI_COL_CUTS[:]
         self.rectangles = []

-        # State: the first 4 dims are partition_values; the rest are region-visit flags (all zeros at start)
-        state = self.partition_values
+        # State: the current row and column cut positions
+        state = np.array(self.ori_row_cuts + self.ori_col_cuts)

         return state

     def step(self, action):
         # The action is 1-D continuous in every phase; take action[0]
-        a = float(action[0])
-        self.partition_values[self.partition_step] = a
+        adjust = float(action[0])
+        valid_adjust = True
+
+        if self.partition_step < self.ROW_CUT_LIMIT:
+            row_cut = self.ori_row_cuts[self.partition_step + 1]
+            new_row_cut = row_cut + adjust
+            self.ori_row_cuts[self.partition_step + 1] = new_row_cut
+
+            if self.ori_row_cuts[self.partition_step] < new_row_cut < self.ori_row_cuts[self.partition_step + 2]:
+                pass
+            else:
+                valid_adjust = False
+                reward = -100
+        else:
+            col_idx = self.partition_step - self.ROW_CUT_LIMIT
+            col_cut = self.ori_col_cuts[col_idx + 1]
+            new_col_cut = col_cut + adjust
+            self.ori_col_cuts[col_idx + 1] = new_col_cut
+
+            if self.ori_col_cuts[col_idx] < new_col_cut < self.ori_col_cuts[col_idx + 2]:
+                pass
+            else:
+                valid_adjust = False
+                reward = -100
+
         self.partition_step += 1
-        # Build the current state: the first partition_step entries hold the decided values, the rest are 0, padded with 7 more zeros
-        state = self.partition_values
+        state = np.array(self.ori_row_cuts + self.ori_col_cuts)

-        # With fewer than 4 steps done we are still in the partition phase: no reward, done is False
-        if self.partition_step < self.CUT_NUM:
-            return state, 0.0, False, False, {}
+        # An invalid adjustment ends the episode immediately
+        if not valid_adjust:
+            return state, reward, True, False, {}
         else:
-            # After 4 steps, compute the cut boundaries
-            # Filter out zeros, deduplicate, then sort
-            rows = sorted(
-                set(v for v in self.partition_values[:self.ROW_CUT_LIMIT] if v > 0))
-            cols = sorted(
-                set(v for v in self.partition_values[self.ROW_CUT_LIMIT:] if v > 0))
-            rows = rows if rows else []
-            cols = cols if cols else []
-
-            # Boundaries always include 0 and 1
-            self.row_cuts = [0.0] + rows + [1.0]
-            self.col_cuts = [0.0] + cols + [1.0]
-
-            # Check that the partition is feasible and compute each region's offloading ratio rho
-            valid_partition = True
-            for i in range(len(self.row_cuts) - 1):
-                for j in range(len(self.col_cuts) - 1):
-                    d = (self.col_cuts[j+1] - self.col_cuts[j]) * self.W * \
-                        (self.row_cuts[i+1] - self.row_cuts[i]) * self.H
-                    rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
-                        (self.comp_time_factor - self.trans_time_factor)
-                    rho_energy_limit = (self.battery_energy_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
-                        (self.comp_energy_factor * d -
-                         self.trans_energy_factor * d)
-                    if rho_energy_limit < 0:
-                        valid_partition = False
-                        break
-                    rho = min(rho_time_limit, rho_energy_limit)
-
-                    flight_time = self.flight_time_factor * d
-                    bs_time = self.bs_time_factor * (1 - rho) * d
-
-                    self.rectangles.append({
-                        'center': ((self.row_cuts[i] + self.row_cuts[i+1]) * self.H / 2, (self.col_cuts[j+1] + self.col_cuts[j]) * self.W / 2),
-                        'flight_time': flight_time,
-                        'bs_time': bs_time,
-                    })
-                if not valid_partition:
-                    break
-
-            if not valid_partition:
-                reward = -100
-                state = self.partition_values
-                return state, reward, True, False, {}
+            if self.partition_step < self.CUT_NUM:
+                return state, 0.0, False, False, {}
             else:
-                reward = 0
-                state = self.partition_values
+                # After all 4 steps, check that the partition is feasible and compute each region's offloading ratio rho
+                valid_partition = True
+                for i in range(len(self.ori_row_cuts) - 1):
+                    for j in range(len(self.ori_col_cuts) - 1):
+                        d = (self.ori_col_cuts[j+1] - self.ori_col_cuts[j]) * self.W * \
+                            (self.ori_row_cuts[i+1] -
+                             self.ori_row_cuts[i]) * self.H
+                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
+                            (self.comp_time_factor - self.trans_time_factor)
+                        rho_energy_limit = (self.battery_energy_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
+                            (self.comp_energy_factor * d -
+                             self.trans_energy_factor * d)
+                        if rho_energy_limit < 0:
+                            valid_partition = False
+                            break
+                        rho = min(rho_time_limit, rho_energy_limit)

-            # Continue with path planning
-            # Solve the multi-TSP with q_learning
-            # cities: [[x1, x2, x3...], [y1, y2, y3...]] city coordinates
-            # rec_center_lt = [rec_info['center']
-            #                  for rec_info in self.rectangles]
-            # cities = np.column_stack(rec_center_lt)
-            # cities = np.column_stack((self.center, cities))
+                        flight_time = self.flight_time_factor * d

-            # center_idx = []
-            # for i in range(self.num_cars - 1):
-            #     cities = np.column_stack((cities, self.center))
-            #     center_idx.append(cities.shape[1] - 1)
+                        bs_time = self.bs_time_factor * (1 - rho) * d

-            # tsp = mTSP(params=self.params, num_cities=cities.shape[1], cities=cities, num_cars=self.num_cars,
-            #            center_idx=center_idx, rectangles=self.rectangles)
+                        self.rectangles.append({
+                            'center': ((self.ori_row_cuts[i] + self.ori_row_cuts[i+1]) * self.H / 2, (self.ori_col_cuts[j+1] + self.ori_col_cuts[j]) * self.W / 2),
+                            'flight_time': flight_time,
+                            'bs_time': bs_time,
+                        })
+                    if not valid_partition:
+                        break

-            # best_time, best_path = tsp.train(self.mTSP_STEPS)
+                if not valid_partition:
+                    reward = -10
+                    return state, reward, True, False, {}
+                else:
+                    # Continue with path planning
+                    # Solve the multi-TSP with q_learning
+                    # cities: [[x1, x2, x3...], [y1, y2, y3...]] city coordinates
+                    # rec_center_lt = [rec_info['center']
+                    #                  for rec_info in self.rectangles]
+                    # cities = np.column_stack(rec_center_lt)
+                    # cities = np.column_stack((self.center, cities))

-            # Solve the multi-TSP with a genetic algorithm
-            cities = [self.center]
-            for rec in self.rectangles:
-                cities.append(rec['center'])
-            cities = np.array(cities)
+                    # center_idx = []
+                    # for i in range(self.num_cars - 1):
+                    #     cities = np.column_stack((cities, self.center))
+                    #     center_idx.append(cities.shape[1] - 1)

-            center_idx = [0]
-            for i in range(self.num_cars - 1):
-                cities = np.row_stack((cities, self.center))
-                center_idx.append(cities.shape[0] - 1)
+                    # tsp = mTSP(params=self.params, num_cities=cities.shape[1], cities=cities, num_cars=self.num_cars,
+                    #            center_idx=center_idx, rectangles=self.rectangles)

-            ga = GA(num_drones=self.num_cars, num_city=cities.shape[0], num_total=20,
-                    data=cities, to_process_idx=center_idx, rectangles=self.rectangles)
+                    # best_time, best_path = tsp.train(self.mTSP_STEPS)

-            best_path, best_time = ga.run()
+                    # Solve the multi-TSP with a genetic algorithm
+                    cities = [self.center]
+                    for rec in self.rectangles:
+                        cities.append(rec['center'])
+                    cities = np.array(cities)

-            # print(best_time)
-            # print(best_path)
+                    center_idx = [0]
+                    for i in range(self.num_cars - 1):
+                        cities = np.row_stack((cities, self.center))
+                        center_idx.append(cities.shape[0] - 1)

-            reward += self.BASE_LINE - best_time
-            print(reward)
+                    ga = GA(num_drones=self.num_cars, num_city=cities.shape[0], num_total=20,
+                            data=cities, to_process_idx=center_idx, rectangles=self.rectangles)

-            return state, reward, True, False, best_path
+                    best_path, best_time = ga.run()
+
+                    # print(best_time)
+                    # print(best_path)
+
+                    reward = self.BASE_LINE / best_time
+
+                    return state, reward, True, False, best_path

     def render(self):
         if self.phase == 1:
diff --git a/human_action.py b/human_action.py
index 9e3fe34..f8d6006 100644
--- a/human_action.py
+++ b/human_action.py
@@ -11,7 +11,7 @@ print('state:', state)
 # action_series = [[0.67], [0], [0], [0], [0.7]]
 # action_series = [0, 0, 3, 0, 10]
 action_series = [[0.2], [0.4], [0.7], [0.5]]
-# action_series = [[0.5], [0.5]]
+action_series = [[-0.1], [0], [0], [0]]
 for i in range(100):
     action = action_series[i]
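
Notes for review (appended after the patch; the sketches below are illustrative and not part of the changed files):

Note 1: main.py and utils.py now pass the adapted action into env.step(), so PartitionEnv receives values in [-max_action, max_action] instead of raw Beta samples in [0, 1]. A minimal sketch of the affine map that Action_adapter in this codebase's utils.py is assumed to perform (function name and body here are illustrative; verify against the actual utils.py):

    import numpy as np

    def action_adapter(a: np.ndarray, max_action: float) -> np.ndarray:
        # Map a Beta-distributed sample in [0, 1] onto the symmetric
        # interval [-max_action, max_action].
        return 2.0 * (a - 0.5) * max_action

For example, a = 0.45 with max_action = 1.0 yields -0.1, the kind of negative cut adjustment exercised by the new action_series in human_action.py.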
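Note 2: the rewritten PartitionEnv.step() nudges one interior cut per step and ends the episode with reward -100 when the moved cut no longer lies strictly between its neighbours. A self-contained sketch of that rule (try_adjust is a hypothetical helper mirroring the patch's logic):

    def try_adjust(cuts, idx, adjust):
        # Shift interior cut idx+1 by adjust; the move is valid only if
        # the new position stays strictly between its two neighbours.
        new_cuts = cuts[:]
        new_cuts[idx + 1] += adjust
        valid = new_cuts[idx] < new_cuts[idx + 1] < new_cuts[idx + 2]
        return new_cuts, valid

    # With ORI_ROW_CUTS = [0, 0.2, 0.4, 0.7, 1]:
    # try_adjust([0, 0.2, 0.4, 0.7, 1], 0, -0.1) -> ([0, 0.1, 0.4, 0.7, 1], True)
    # try_adjust([0, 0.2, 0.4, 0.7, 1], 0, 0.3)  -> ([0, 0.5, 0.4, 0.7, 1], False)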
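Note 3: per region, the offloading ratio rho is the tighter of a time limit and an energy limit, and the whole partition is rejected (reward -10) as soon as any region's energy limit goes negative. The terminal reward also changed from BASE_LINE - best_time to BASE_LINE / best_time, so a tour faster than the baseline now scores above 1.0. A standalone sketch of the per-region computation using the formulas from step() (the arguments stand in for values loaded from params2.yml):

    def region_times(d, flight_t, comp_t, trans_t,
                     flight_e, comp_e, trans_e, battery, bs_t):
        # d is the region's area; both limits follow step() verbatim
        rho_time = (flight_t - trans_t) / (comp_t - trans_t)
        rho_energy = (battery - flight_e * d - trans_e * d) / \
                     (comp_e * d - trans_e * d)
        if rho_energy < 0:
            return None  # infeasible region -> invalid partition
        rho = min(rho_time, rho_energy)
        return flight_t * d, bs_t * (1 - rho) * d  # (flight_time, bs_time)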