From 7ca5ce08b1ef28c3805da3db63faf869030196de Mon Sep 17 00:00:00 2001
From: weixin_46229132 <weixin_46229132@noreply.gitcode.com>
Date: Wed, 19 Mar 2025 14:22:24 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=8E=AF=E5=A2=83?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DDPG_solver/utils.py |  3 ++-
 env.py               | 60 +++++++++++++++++++++++++++++++++-----------
 human_action.py      |  2 +-
 3 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/DDPG_solver/utils.py b/DDPG_solver/utils.py
index 2039c65..e76fc48 100644
--- a/DDPG_solver/utils.py
+++ b/DDPG_solver/utils.py
@@ -50,7 +50,8 @@ def evaluate_policy(env, agent, turns = 3):
             action_series.append(a[0])
             total_scores += r
             s = s_next
-        print(np.round(action_series, 3))
+        print('action series: ', np.round(action_series, 3))
+        print(f'state: {s_next}')  # f-string so the final state is interpolated, not printed literally
     return int(total_scores/turns)
 
 
diff --git a/env.py b/env.py
index 99851f0..d7edea7 100644
--- a/env.py
+++ b/env.py
@@ -53,11 +53,11 @@ class PartitionMazeEnv(gym.Env):
             low=0.0, high=1.0, shape=(1,), dtype=np.float32)
 
         # 定义观察空间为8维向量
-        # TODO 返回的状态目前只有位置坐标
         # 阶段 0 状态：前 4 维表示已决策的切分值（未决策部分为 0）
-        # 阶段 1 状态：车辆位置 (2D)
+        # 阶段 1 状态：区域访问状态向量（长度为(CUT_NUM/2+1)^2）
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
         self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(self.CUT_NUM + 2 * self.num_cars,), dtype=np.float32)
+            low=0.0, high=100.0, shape=(self.CUT_NUM + max_regions,), dtype=np.float32)  # 100.0 admits the padding sentinel used for unused region slots
 
         # 切分阶段相关变量
         self.col_cuts = []     # 存储竖切位置（c₁, c₂），当值为0时表示不切
@@ -86,9 +86,13 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
-        # 状态：前 4 维为 partition_values，其余补 0
-        state = np.concatenate(
-            [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        
+        # State: partition_values (the first 4 dims), then the region visit status (all zeros initially)
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
+        state = np.concatenate([
+            self.partition_values, 
+            np.zeros(max_regions, dtype=np.float32)
+        ])
         return state
 
     def step(self, action):
@@ -103,8 +107,10 @@ class PartitionMazeEnv(gym.Env):
             self.partition_step += 1
 
             # 构造当前状态：前 partition_step 个为已决策值，其余为 0，再补 7 个 0
-            state = np.concatenate(
-                [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+            state = np.concatenate([
+                self.partition_values, 
+                np.zeros((self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)
+            ])
 
             # 如果未完成 4 步，则仍处于切分阶段，不发奖励，done 为 False
             if self.partition_step < self.CUT_NUM:
@@ -153,8 +159,12 @@ class PartitionMazeEnv(gym.Env):
 
                 if not valid_partition:
                     reward = -10000
-                    state = np.concatenate(
-                        [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    # State: partition_values (the first 4 dims), then the region visit status (all zeros here)
+                    max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                    state = np.concatenate([
+                        self.partition_values, 
+                        np.zeros(max_regions, dtype=np.float32)
+                    ])
                     return state, reward, True, False, {}
                 else:
                     # 进入阶段 1：初始化迷宫
@@ -183,9 +193,19 @@ class PartitionMazeEnv(gym.Env):
 
                     # 进入阶段 2：走迷宫
                     self.phase = 2
-                    state = np.concatenate(
-                        [self.partition_values, np.array(self.car_pos).flatten()]
-                    )
+                    
+                    # Build the region visit-status vector, padded to max_regions slots.
+                    max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                    visit_status = np.zeros(max_regions, dtype=np.float32)
+
+                    # Row-major fill: slot i * n_cols + j holds region (i, j)'s visited flag as a float.
+                    n_rows, n_cols = len(self.row_cuts) - 1, len(self.col_cuts) - 1
+                    for i in range(n_rows):
+                        for j in range(n_cols):
+                            visit_status[i * n_cols + j] = float(self.rectangles[(i, j)]['is_visited'])
+                    # Slots past the last real region get the sentinel 100; computed from the counts, not a leaked loop index.
+                    visit_status[n_rows * n_cols:] = 100
+                    state = np.concatenate([self.partition_values, visit_status])
                     return state, reward, False, False, {}
 
         elif self.phase == 2:
@@ -250,8 +270,18 @@ class PartitionMazeEnv(gym.Env):
             self.rectangles[(new_row, new_col)]['is_visited'] = True
 
             # 观察状态
-            state = np.concatenate(
-                [self.partition_values, np.array(self.car_pos).flatten()])            
+            # Build the region visit-status vector, padded to max_regions slots.
+            max_regions = (self.CUT_NUM // 2 + 1) ** 2
+            visit_status = np.zeros(max_regions, dtype=np.float32)
+
+            # Row-major fill: slot i * n_cols + j holds region (i, j)'s visited flag as a float.
+            n_rows, n_cols = len(self.row_cuts) - 1, len(self.col_cuts) - 1
+            for i in range(n_rows):
+                for j in range(n_cols):
+                    visit_status[i * n_cols + j] = float(self.rectangles[(i, j)]['is_visited'])
+            # Slots past the last real region get the sentinel 100; computed from the counts, not a leaked loop index.
+            visit_status[n_rows * n_cols:] = 100
+            state = np.concatenate([self.partition_values, visit_status])
 
             # Episode 终止条件：所有网格均被访问或步数达到上限
             done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
diff --git a/human_action.py b/human_action.py
index 2275342..d2565d2 100644
--- a/human_action.py
+++ b/human_action.py
@@ -6,7 +6,7 @@ env = PartitionMazeEnv()
 state = env.reset()
 print(state)
 
-action_series = [[0.1], [0.2], [0.4], [0], [0.1]]
+action_series = [[0], [0], [0.4], [0], [0.1]]
 # action_series = [0, 0, 3, 0, 0, 10]
 
 for i in range(100):