diff --git a/env.py b/env.py
index b78dcfb..6c9f102 100644
--- a/env.py
+++ b/env.py
@@ -41,7 +41,7 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         self.CUT_NUM = 4  # half of the cuts horizontal, half vertical
         self.BASE_LINE = 3500  # baseline time, computed via greedy or Monte Carlo
-        self.MAX_STEPS = 50  # upper limit on maze-walking steps
+        self.MAX_STEPS = 10  # upper limit on maze-walking steps
         self.phase = 0  # phase control, 0: region partition, 1: maze init, 2: maze walking
         self.partition_step = 0  # partition-phase step count, range 0~4
 
@@ -57,7 +57,7 @@ class PartitionMazeEnv(gym.Env):
         # phase 1 state: region visit status vector of length (CUT_NUM/2+1)^2
         max_regions = (self.CUT_NUM // 2 + 1) ** 2
         self.observation_space = spaces.Box(
-            low=0.0, high=100.0, shape=(self.CUT_NUM + max_regions,), dtype=np.float32)
+            low=0.0, high=100.0, shape=(self.CUT_NUM + max_regions + 1,), dtype=np.float32)
 
         # partition-phase variables
         self.col_cuts = []  # vertical cut positions (c₁, c₂); a value of 0 means no cut
@@ -71,7 +71,7 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
-        self.previous_T = 0
+        self.delay_time = 0
 
     def reset(self, seed=None, options=None):
         # reset all variables and return to the partition phase (phase 0)
@@ -87,12 +87,14 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
+        self.delay_time = 0
 
         # state: first 4 dims are partition_values, the rest are region visit status (all 0 initially)
         max_regions = (self.CUT_NUM // 2 + 1) ** 2
         state = np.concatenate([
             self.partition_values,
-            np.zeros(max_regions, dtype=np.float32)
+            np.zeros(max_regions, dtype=np.float32),
+            [0.0]
         ])
         return state
 
@@ -110,7 +112,8 @@ class PartitionMazeEnv(gym.Env):
             # build the current state: the first partition_step entries are decided values, the rest 0, padded with 7 zeros
             state = np.concatenate([
                 self.partition_values,
-                np.zeros((self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)
+                np.zeros((self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32),
+                [0.0]
             ])
 
             # if fewer than 4 steps are done, stay in the partition phase: no reward, done is False
@@ -164,7 +167,8 @@ class PartitionMazeEnv(gym.Env):
                 max_regions = (self.CUT_NUM // 2 + 1) ** 2
                 state = np.concatenate([
                     self.partition_values,
-                    np.zeros(max_regions, dtype=np.float32)
+                    np.zeros(max_regions, dtype=np.float32),
+                    [0.0]
                 ])
                 return state, reward, True, False, {}
             else:
@@ -209,7 +213,7 @@ class PartitionMazeEnv(gym.Env):
                 for i in range(idx + 1, max_regions):
                     visit_status[i] = 100
                 state = np.concatenate(
-                    [self.partition_values, visit_status])
+                    [self.partition_values, visit_status, [0.0]])
                 return state, reward, False, False, {}
 
         elif self.phase == 2:
@@ -276,9 +280,14 @@ class PartitionMazeEnv(gym.Env):
                     # start of a new round: reset the movement flags
                     self.cars_moved = [False] * self.num_cars
                     self.cars_moved[current_car] = car_moved
+            # compute the current T value
+            current_T = max([self._compute_motorcade_time(idx)
+                             for idx in range(self.num_cars)])
             # at the end of a round, check whether no car moved at all
             if self.current_car_index == (self.num_cars - 1) and not any(self.cars_moved):
-                reward -= 0.01
+                # add delay time: BASE_LINE / MAX_STEPS per idle round
+                self.delay_time += self.BASE_LINE * (1 / self.MAX_STEPS)
+            real_T = current_T + self.delay_time
 
             self.step_count += 1
             self.current_car_index = (
@@ -297,18 +306,21 @@ class PartitionMazeEnv(gym.Env):
                     self.rectangles[(i, j)]['is_visited'])
                 for i in range(idx + 1, max_regions):
                     visit_status[i] = 100
-            state = np.concatenate([self.partition_values, visit_status])
+            # append one extra dim at the end of the state vector for the current T value
+            state = np.concatenate(
+                [self.partition_values, visit_status, [real_T]])
 
             # episode termination: all grids visited, or the step limit reached
             done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
                 self.step_count >= self.MAX_STEPS)
             if done and all([value['is_visited'] for _, value in self.rectangles.items()]):
-                # region coverage complete: compute each motorcade's execution time from its trajectory
-                T = max([self._compute_motorcade_time(idx)
-                         for idx in range(self.num_cars)])
-                # TODO make the reward vary more sharply around the baseline
-                # reward = math.exp(-T / self.BASE_LINE) * 1000
-                reward += self.BASE_LINE / T
+                # # region coverage complete: compute each motorcade's execution time from its trajectory
+                # T = max([self._compute_motorcade_time(idx)
+                #          for idx in range(self.num_cars)])
+                # # TODO make the reward vary more sharply around the baseline
+                # # reward = math.exp(-T / self.BASE_LINE) * 1000
+                reward += self.BASE_LINE / real_T * 5
+                print(real_T, "="*20)
 
                 # if reward > self.BASE_LINE:
                 #     reward -= 200
@@ -316,7 +328,7 @@ class PartitionMazeEnv(gym.Env):
                 # reward -= 10 * self.step_count
                 # TODO dynamically adjust the baseline
             elif done and self.step_count >= self.MAX_STEPS:
-                reward += -0.8
+                reward += -5
 
             return state, reward, done, False, {}
 
diff --git a/human_action.py b/human_action.py
index 6adcbff..da4125d 100644
--- a/human_action.py
+++ b/human_action.py
@@ -1,13 +1,13 @@
-# from env import PartitionMazeEnv
-from env_dis import PartitionMazeEnv
+from env import PartitionMazeEnv
+# from env_dis import PartitionMazeEnv
 
 env = PartitionMazeEnv()
 
 state = env.reset()
 print(state)
 
-# action_series = [[0], [0], [0.4], [0], [0.1]]
-action_series = [0, 0, 3, 0, 10]
+action_series = [[0.67], [0], [0], [0], [0.7]]
+# action_series = [0, 0, 3, 0, 10]
 
 for i in range(100):
     action = action_series[i]
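Note on the observation change: the diff widens the observation space by one trailing slot that carries the current motorcade time T (0.0 everywhere until phase 2 computes real_T). Below is a minimal sanity-check sketch, not part of the diff, assuming env.py is importable as in human_action.py and that reset() returns a bare state array, as the code above does:

    import numpy as np

    from env import PartitionMazeEnv

    env = PartitionMazeEnv()
    state = env.reset()

    # expected layout: CUT_NUM partition values
    # + (CUT_NUM // 2 + 1)**2 region visit flags
    # + 1 trailing slot for the current T value
    expected_dim = env.CUT_NUM + (env.CUT_NUM // 2 + 1) ** 2 + 1
    assert state.shape == (expected_dim,), state.shape
    assert state[-1] == 0.0  # the T slot is 0.0 before any maze walking
    print("observation dim:", expected_dim)

On the reward side: with BASE_LINE = 3500 and MAX_STEPS = 10, every full round in which no car moves now adds BASE_LINE / MAX_STEPS = 350 to delay_time, inflating real_T and shrinking the completion reward BASE_LINE / real_T * 5; this replaces the old flat -0.01 idle penalty with a time-based one.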