diff --git a/Duel_Double_DQN/DQN.py b/Duel_Double_DQN/DQN.py
index d03adf8..92773a8 100644
--- a/Duel_Double_DQN/DQN.py
+++ b/Duel_Double_DQN/DQN.py
@@ -69,11 +69,11 @@ class DQN_agent(object):
             else:
                 if state[0][0] == 0:
                     q_value = self.q_net(state)
-                    q_value[10:] = - float('inf')
+                    q_value[0][10:] = - float('inf')
                     a = q_value.argmax().item()
                 else:
                     q_value = self.q_net(state)
-                    q_value[:10] = - float('inf')
+                    q_value[0][:10] = - float('inf')
                     a = q_value.argmax().item()
         return a
diff --git a/Duel_Double_DQN/main.py b/Duel_Double_DQN/main.py
index fff5c96..e470bdf 100644
--- a/Duel_Double_DQN/main.py
+++ b/Duel_Double_DQN/main.py
@@ -1,15 +1,17 @@
+from DQN import DQN_agent
+from datetime import datetime
+from utils import evaluate_policy, str2bool
 import gymnasium as gym
-import os
 import shutil
 import argparse
 import torch
 import numpy as np
+# fmt: off
 import sys
+import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env_dis import PartitionMazeEnv
-from utils import evaluate_policy, str2bool
-from datetime import datetime
-from DQN import DQN_agent
+# fmt: on
 
 '''Hyperparameter Setting'''
 parser = argparse.ArgumentParser()
@@ -66,7 +68,6 @@ def main():
     eval_env = PartitionMazeEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.n
-    opt.max_e_steps = 50
 
     # Algorithm Setting
     if opt.Duel:
@@ -87,7 +88,7 @@ def main():
     print("Random Seed: {}".format(opt.seed))
     print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
-          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')
+          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, '\n')
 
     if opt.write:
         from torch.utils.tensorboard import SummaryWriter
diff --git a/Duel_Double_DQN/utils.py b/Duel_Double_DQN/utils.py
index bd342ff..362efa6 100644
--- a/Duel_Double_DQN/utils.py
+++ b/Duel_Double_DQN/utils.py
@@ -1,16 +1,21 @@
+import numpy as np
+
 def evaluate_policy(env, agent, turns = 3):
     total_scores = 0
     for j in range(turns):
         s = env.reset()
         done = False
+        action_series = []
         while not done:
             # Take deterministic actions at test time
             a = agent.select_action(s, deterministic=True)
             s_next, r, dw, tr, info = env.step(a)
             done = (dw or tr)
-
+            action_series.append(a)
             total_scores += r
             s = s_next
+        print('action series: ', np.round(action_series, 3))
+        print('state: ', s)
     return int(total_scores/turns)
diff --git a/env.py b/env.py
index 0ad2e18..789c4bb 100644
--- a/env.py
+++ b/env.py
@@ -71,6 +71,7 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
+        self.previous_T = 0
 
     def reset(self, seed=None, options=None):
         # Reset all variables and go back to the partition phase (phase 0)
@@ -290,9 +291,19 @@
                 # Coverage is complete; compute each motorcade's execution time from its trajectory
                 T = max([self._compute_motorcade_time(idx)
                         for idx in range(self.num_cars)])
-                reward += self.BASE_LINE / T * 1000
-                # reward += self.BASE_LINE - T
-                # print(reward)
+                # TODO make the reward vary more sharply around the baseline
+                # reward = math.exp(-T / self.BASE_LINE) * 1000
+                reward = self.BASE_LINE / T * 1000
+                if T < self.BASE_LINE:
+                    reward *= 10
+                print(reward)
+
+
+                # if reward > self.BASE_LINE:
+                #     reward -= 200
+                # # TODO compute len(self.car_traj); the trajectory recording rule needs changing
+                # reward -= 10 * self.step_count
+                # TODO dynamically adjust the baseline
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000
diff --git a/env_dis.py b/env_dis.py
index e96715f..277057b 100644
--- a/env_dis.py
+++ b/env_dis.py
@@ -41,6 +41,7 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         self.CUT_NUM = 4  # half of the cuts are horizontal, half vertical
         self.BASE_LINE = 4000  # baseline time, computed via greedy or Monte Carlo
+        self.MAX_STEPS = 50  # upper limit on maze-walking steps
 
         self.phase = 0  # phase control: 0 = partitioning, 1 = maze initialization, 2 = maze walking
         self.partition_step = 0  # partition-phase step counter, range 0~4
@@ -52,11 +53,11 @@
         self.action_space = spaces.Discrete(15)
 
         # Define the observation space as an 8-dimensional vector
-        # TODO the returned state currently only contains position coordinates
        # Phase 0 state: the first 4 dims hold the decided cut values (undecided entries are 0)
         # Phase 1 state: vehicle positions (2D)
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
         self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + 2 * self.num_cars,), dtype=np.float32)
+            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + max_regions,), dtype=np.float32)
 
         # Partition-phase variables
         self.col_cuts = []  # vertical cut positions (c₁, c₂); a value of 0 means no cut
@@ -65,7 +66,6 @@
         self.init_maze_step = 0
 
         # Path-planning-phase variables
-        self.MAX_STEPS = 50  # upper limit on maze-walking steps
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
@@ -87,8 +87,12 @@
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
         # State: the first 4 dims are partition_values, the rest padded with 0
-        state = np.concatenate(
-            [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
+        state = np.concatenate([
+            [self.phase],
+            self.partition_values,
+            np.zeros(max_regions, dtype=np.float32)
+        ])
         return state
 
     def step(self, action):
@@ -102,7 +106,7 @@
             # Build the current state: the first partition_step entries are decided values, the rest 0, then pad 7 more zeros
             state = np.concatenate(
                 [[self.phase], self.partition_values, np.zeros(
-                    np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]
+                    (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
             )
 
             # If fewer than 4 steps are done we are still in the partition phase: no reward, done is False
@@ -153,7 +157,9 @@
             if not valid_partition:
                 reward = -10000
                 state = np.concatenate(
-                    [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    [[self.phase], self.partition_values, np.zeros(
+                        (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
+                )
                 return state, reward, True, False, {}
             else:
                 # Initialize the maze
@@ -184,10 +190,21 @@
                 # Enter phase 2: maze walking
                 self.phase = 2
+
+                # Build the visit-status vector
+                max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                visit_status = np.zeros(max_regions, dtype=np.float32)
+
+                # Fill in the visit status of the actual regions
+                for i in range(len(self.row_cuts) - 1):
+                    for j in range(len(self.col_cuts) - 1):
+                        idx = i * (len(self.col_cuts) - 1) + j
+                        visit_status[idx] = float(
+                            self.rectangles[(i, j)]['is_visited'])
+                for i in range(idx + 1, max_regions):
+                    visit_status[i] = 100
                 state = np.concatenate(
-                    [[self.phase], self.partition_values,
-                     np.array(self.car_pos).flatten()]
-                )
+                    [[self.phase], self.partition_values, visit_status])
                 return state, reward, False, False, {}
 
         elif self.phase == 2:
@@ -224,9 +241,20 @@
                     self.rectangles[(new_row, new_col)]['is_visited'] = True
 
             # Observation state
-            state = np.concatenate(
-                [[self.phase], self.partition_values, np.array(self.car_pos).flatten()])
             reward = 0
+            max_regions = (self.CUT_NUM // 2 + 1) ** 2
+            visit_status = np.zeros(max_regions, dtype=np.float32)
+
+            # Fill in the visit status of the actual regions
+            for i in range(len(self.row_cuts) - 1):
+                for j in range(len(self.col_cuts) - 1):
+                    idx = i * (len(self.col_cuts) - 1) + j
+                    visit_status[idx] = float(
+                        self.rectangles[(i, j)]['is_visited'])
+            for i in range(idx + 1, max_regions):
+                visit_status[i] = 100
+            state = np.concatenate(
+                [[self.phase], self.partition_values, visit_status])
 
             # Episode termination: all grid cells visited, or the step count reaches the limit
             done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
@@ -238,7 +266,7 @@
                 # print(T)
                 # print(self.partition_values)
                 # print(self.car_traj)
-                reward += self.BASE_LINE / T * 100
+                reward += self.BASE_LINE / T * 1000
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000
diff --git a/human_action.py b/human_action.py
index d2565d2..6adcbff 100644
--- a/human_action.py
+++ b/human_action.py
@@ -1,17 +1,18 @@
-from env import PartitionMazeEnv
-# from env_dis import PartitionMazeEnv
+# from env import PartitionMazeEnv
+from env_dis import PartitionMazeEnv
 
 env = PartitionMazeEnv()
 
 state = env.reset()
 print(state)
 
-action_series = [[0], [0], [0.4], [0], [0.1]]
-# action_series = [0, 0, 3, 0, 0, 10]
+# action_series = [[0], [0], [0.4], [0], [0.1]]
+action_series = [0, 0, 3, 0, 10]
 
 for i in range(100):
     action = action_series[i]
     state, reward, done, info, _ = env.step(action)
-    print(state, reward, done, info)
+    print(state)
+    print(reward)
     if done:
         break