环境增加delay_time

2025-03-22 09:47:52 +08:00 · 2025-03-22 09:47:52 +08:00 · a9ee5ceec7
commit a9ee5ceec7
parent 5b468deb9d
2 changed files with 32 additions and 20 deletions
--- a/env.py
+++ b/env.py
@@ -41,7 +41,7 @@ class PartitionMazeEnv(gym.Env):
        ##############################
        self.CUT_NUM = 4    # 横切一半，竖切一半
        self.BASE_LINE = 3500     # 基准时间，通过greedy或者蒙特卡洛计算出来
-        self.MAX_STEPS = 50        # 迷宫走法步数上限
+        self.MAX_STEPS = 10        # 迷宫走法步数上限

        self.phase = 0    # 阶段控制，0：区域划分阶段，1：迷宫初始化阶段，2：走迷宫阶段
        self.partition_step = 0      # 区域划分阶段步数，范围 0~4
@@ -57,7 +57,7 @@ class PartitionMazeEnv(gym.Env):
        # 阶段 1 状态：区域访问状态向量（长度为(CUT_NUM/2+1)^2）
        max_regions = (self.CUT_NUM // 2 + 1) ** 2
        self.observation_space = spaces.Box(
-            low=0.0, high=100.0, shape=(self.CUT_NUM + max_regions,), dtype=np.float32)
+            low=0.0, high=100.0, shape=(self.CUT_NUM + max_regions + 1,), dtype=np.float32)

        # 切分阶段相关变量
        self.col_cuts = []     # 存储竖切位置（c₁, c₂），当值为0时表示不切
@@ -71,7 +71,7 @@ class PartitionMazeEnv(gym.Env):
        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0
-        self.previous_T = 0
+        self.delay_time = 0

    def reset(self, seed=None, options=None):
        # 重置所有变量，回到切分阶段（phase 0）
@@ -87,12 +87,14 @@ class PartitionMazeEnv(gym.Env):
        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0
+        self.delay_time = 0

        # 状态：前 4 维为 partition_values，其余为区域访问状态（初始全0）
        max_regions = (self.CUT_NUM // 2 + 1) ** 2
        state = np.concatenate([
            self.partition_values,
-            np.zeros(max_regions, dtype=np.float32)
+            np.zeros(max_regions, dtype=np.float32),
+            [0.0]
        ])
        return state

@@ -110,7 +112,8 @@ class PartitionMazeEnv(gym.Env):
            # 构造当前状态：前 partition_step 个为已决策值，其余为 0，再补 7 个 0
            state = np.concatenate([
                self.partition_values,
-                np.zeros((self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)
+                np.zeros((self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32),
+                [0.0]
            ])

            # 如果未完成 4 步，则仍处于切分阶段，不发奖励，done 为 False
@@ -164,7 +167,8 @@ class PartitionMazeEnv(gym.Env):
                    max_regions = (self.CUT_NUM // 2 + 1) ** 2
                    state = np.concatenate([
                        self.partition_values,
-                        np.zeros(max_regions, dtype=np.float32)
+                        np.zeros(max_regions, dtype=np.float32),
+                        [0.0]
                    ])
                    return state, reward, True, False, {}
                else:
@@ -209,7 +213,7 @@ class PartitionMazeEnv(gym.Env):
                    for i in range(idx + 1, max_regions):
                        visit_status[i] = 100
                    state = np.concatenate(
-                        [self.partition_values, visit_status])
+                        [self.partition_values, visit_status, [0.0]])
                    return state, reward, False, False, {}

        elif self.phase == 2:
@@ -276,9 +280,14 @@ class PartitionMazeEnv(gym.Env):
                # 新一轮的开始，初始化移动标记
                self.cars_moved = [False] * self.num_cars
            self.cars_moved[current_car] = car_moved
+            # 计算当前的 T 值
+            current_T = max([self._compute_motorcade_time(idx)
+                             for idx in range(self.num_cars)])
            # 如果一轮结束，检查是否所有车辆都没有移动
            if self.current_car_index == (self.num_cars - 1) and not any(self.cars_moved):
-                reward -= 0.01
+                # 增加时间 BASE_LINE / T * 10
+                self.delay_time += self.BASE_LINE * (1 / self.MAX_STEPS)
+            real_T = current_T + self.delay_time

            self.step_count += 1
            self.current_car_index = (
@@ -297,18 +306,21 @@ class PartitionMazeEnv(gym.Env):
                        self.rectangles[(i, j)]['is_visited'])
            for i in range(idx + 1, max_regions):
                visit_status[i] = 100
-            state = np.concatenate([self.partition_values, visit_status])
+            # 在状态向量最后增加一维，表示当前的 T 值
+            state = np.concatenate(
+                [self.partition_values, visit_status, [real_T]])

            # Episode 终止条件：所有网格均被访问或步数达到上限
            done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
                self.step_count >= self.MAX_STEPS)
            if done and all([value['is_visited'] for _, value in self.rectangles.items()]):
-                # 区域覆盖完毕，根据轨迹计算各车队的执行时间
-                T = max([self._compute_motorcade_time(idx)
-                        for idx in range(self.num_cars)])
-                # TODO 让奖励在baseline附近变化更剧烈
-                # reward = math.exp(-T / self.BASE_LINE) * 1000
-                reward += self.BASE_LINE / T
+                # # 区域覆盖完毕，根据轨迹计算各车队的执行时间
+                # T = max([self._compute_motorcade_time(idx)
+                #         for idx in range(self.num_cars)])
+                # # TODO 让奖励在baseline附近变化更剧烈
+                # # reward = math.exp(-T / self.BASE_LINE) * 1000
+                reward += self.BASE_LINE / real_T * 5
+                print(real_T, "="*20)

                # if reward > self.BASE_LINE:
                #     reward -= 200
@@ -316,7 +328,7 @@ class PartitionMazeEnv(gym.Env):
                # reward -= 10 * self.step_count
                # TODO 动态调整baseline
            elif done and self.step_count >= self.MAX_STEPS:
-                reward += -0.8
+                reward += -5

            return state, reward, done, False, {}

--- a/human_action.py
+++ b/human_action.py
@@ -1,13 +1,13 @@
-# from env import PartitionMazeEnv
-from env_dis import PartitionMazeEnv
+from env import PartitionMazeEnv
+# from env_dis import PartitionMazeEnv

 env = PartitionMazeEnv()

 state = env.reset()
 print(state)

-# action_series = [[0], [0], [0.4], [0], [0.1]]
-action_series = [0, 0, 3, 0, 10]
+action_series = [[0.67], [0], [0], [0], [0.7]]
+# action_series = [0, 0, 3, 0, 10]

 for i in range(100):
    action = action_series[i]