Update env_dis

weixin_46229132 2025-03-19 20:40:35 +08:00
parent ff23b5e745
commit 4972306ca7
6 changed files with 76 additions and 30 deletions

View File

@@ -69,11 +69,11 @@ class DQN_agent(object):
         else:
             if state[0][0] == 0:
                 q_value = self.q_net(state)
-                q_value[10:] = - float('inf')
+                q_value[0][10:] = - float('inf')
                 a = q_value.argmax().item()
             else:
                 q_value = self.q_net(state)
-                q_value[:10] = - float('inf')
+                q_value[0][:10] = - float('inf')
                 a = q_value.argmax().item()
         return a
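Note: the fix matters because self.q_net(state) returns Q-values with a leading batch dimension, so q_value[10:] sliced (non-existent) batch rows and masked nothing. A minimal sketch of the difference, assuming the output has shape (1, 15) to match spaces.Discrete(15) below (illustrative only, not the repository's code):

import torch

q_value = torch.randn(1, 15)          # (batch=1, action_dim=15), like self.q_net(state)

masked_old = q_value.clone()
masked_old[10:] = -float('inf')       # slices batch rows >= 10: empty, nothing is masked

masked_new = q_value.clone()
masked_new[0][10:] = -float('inf')    # masks actions 10..14 of the single sample

print(masked_old.equal(q_value))      # True  -> the old expression was a no-op
print(masked_new[0, 10:])             # all -inf, so argmax can only pick actions 0..9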

View File

@@ -1,15 +1,17 @@
+from DQN import DQN_agent
+from datetime import datetime
+from utils import evaluate_policy, str2bool
 import gymnasium as gym
+import os
 import shutil
 import argparse
 import torch
 import numpy as np
+# fmt: off
 import sys
-import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env_dis import PartitionMazeEnv
-from utils import evaluate_policy, str2bool
-from datetime import datetime
-from DQN import DQN_agent
+# fmt: on

 '''Hyperparameter Setting'''
 parser = argparse.ArgumentParser()
@@ -66,7 +68,6 @@ def main():
     eval_env = PartitionMazeEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.n
-    opt.max_e_steps = 50

     # Algorithm Setting
     if opt.Duel:
@@ -87,7 +88,7 @@ def main():
     print("Random Seed: {}".format(opt.seed))
     print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
-          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')
+          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, '\n')

     if opt.write:
         from torch.utils.tensorboard import SummaryWriter

View File

@@ -1,16 +1,21 @@
+import numpy as np
 def evaluate_policy(env, agent, turns = 3):
     total_scores = 0
     for j in range(turns):
         s = env.reset()
         done = False
+        action_series = []
         while not done:
             # Take deterministic actions at test time
             a = agent.select_action(s, deterministic=True)
             s_next, r, dw, tr, info = env.step(a)
             done = (dw or tr)
+            action_series.append(a)
             total_scores += r
             s = s_next
+        print('action series: ', np.round(action_series, 3))
+        print('state: ', s)
     return int(total_scores/turns)
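Note: numpy is imported so the whole action trace can be rounded in one call; np.round accepts a plain Python list and returns an ndarray, which is what the new print emits. A quick illustration (not from the repository):

import numpy as np

action_series = [0, 3, 10]   # discrete actions collected during one evaluation episode
print('action series: ', np.round(action_series, 3))   # -> action series:  [ 0  3 10]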

env.py
View File

@@ -71,6 +71,7 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
+        self.previous_T = 0

     def reset(self, seed=None, options=None):
         # Reset all variables back to the partition phase (phase 0)
@@ -290,9 +291,19 @@
                 # Coverage finished: compute each fleet's execution time from its trajectory
                 T = max([self._compute_motorcade_time(idx)
                          for idx in range(self.num_cars)])
-                reward += self.BASE_LINE / T * 1000
-                # reward += self.BASE_LINE - T
-                # print(reward)
+                # TODO make the reward vary more sharply around the baseline
+                # reward = math.exp(-T / self.BASE_LINE) * 1000
+                reward = self.BASE_LINE / T * 1000
+                if T < self.BASE_LINE:
+                    reward *= 10
+                print(reward)
+                # if reward > self.BASE_LINE:
+                #     reward -= 200
+                #     # TODO compute len(self.car_traj); the trajectory-recording rule needs changing
+                #     reward -= 10 * self.step_count
+                # TODO adjust the baseline dynamically
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000
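Note: the new shaping gives roughly 1000 reward at the baseline time and a tenfold bonus once an episode beats it, so the signal changes much more sharply around BASE_LINE than the previous flat quotient. A standalone sketch of the rule (hypothetical helper name; assuming BASE_LINE = 4000 as in the env_dis hunk below):

def shaped_reward(T, base_line=4000.0):
    reward = base_line / T * 1000    # ~1000 when T equals the baseline
    if T < base_line:
        reward *= 10                 # strong bonus for beating the baseline
    return reward

print(shaped_reward(5000))   # 800.0   (slower than the baseline)
print(shaped_reward(4000))   # 1000.0  (exactly at the baseline)
print(shaped_reward(3000))   # ~13333  (10x bonus below the baseline)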

View File

@@ -41,6 +41,7 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         self.CUT_NUM = 4  # half of the cuts horizontal, half vertical
         self.BASE_LINE = 4000  # baseline time, computed via greedy or Monte Carlo
+        self.MAX_STEPS = 50  # upper limit on maze-walking steps
         self.phase = 0  # phase control: 0 = partition, 1 = maze initialisation, 2 = maze walking
         self.partition_step = 0  # partition-phase step count, range 0~4
@@ -52,11 +53,11 @@
         self.action_space = spaces.Discrete(15)

         # Define the observation space as an 8-dimensional vector
-        # TODO the returned state currently only contains position coordinates
         # Phase 0 state: the first 4 dims hold the decided cut values (undecided ones are 0)
         # Phase 1 state: vehicle positions (2D)
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
         self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + 2 * self.num_cars,), dtype=np.float32)
+            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + max_regions,), dtype=np.float32)

         # Partition-phase variables
         self.col_cuts = []  # vertical cut positions c₁, c₂ (a value of 0 means no cut)
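Note: a quick sanity check of the new observation size (my arithmetic, not from the repository): with CUT_NUM = 4 the maximum number of regions is (4 // 2 + 1) ** 2 = 9, so the Box now has 1 + 4 + 9 = 14 entries: the phase flag, the four cut values, and one visit flag per possible region.

CUT_NUM = 4
max_regions = (CUT_NUM // 2 + 1) ** 2    # (2 + 1) ** 2 = 9 possible regions
obs_dim = 1 + CUT_NUM + max_regions      # phase + cut values + per-region visit flags
print(max_regions, obs_dim)              # 9 14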
@@ -65,7 +66,6 @@
         self.init_maze_step = 0

         # Path-planning phase variables
-        self.MAX_STEPS = 50  # upper limit on maze-walking steps
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
@@ -87,8 +87,12 @@
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0

         # State: the first 4 dims are partition_values, the rest padded with 0
-        state = np.concatenate(
-            [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
+        state = np.concatenate([
+            [self.phase],
+            self.partition_values,
+            np.zeros(max_regions, dtype=np.float32)
+        ])
         return state

     def step(self, action):
@@ -102,7 +106,7 @@
             # Build the current state: the first partition_step entries hold decided values, the rest are 0, then pad with zeros
             state = np.concatenate(
                 [[self.phase], self.partition_values, np.zeros(
-                    np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]
+                    (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
             )

             # If the 4 steps are not finished we are still in the partition phase: no reward, done is False
@@ -153,7 +157,9 @@
             if not valid_partition:
                 reward = -10000
                 state = np.concatenate(
-                    [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    [[self.phase], self.partition_values, np.zeros(
+                        (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
+                )
                 return state, reward, True, False, {}
             else:
                 # Initialise the maze
@@ -184,10 +190,21 @@
                 # Enter phase 2: walk the maze
                 self.phase = 2

+                # Build the visit-status vector
+                max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                visit_status = np.zeros(max_regions, dtype=np.float32)
+                # Fill in the visit status of the actual regions
+                for i in range(len(self.row_cuts) - 1):
+                    for j in range(len(self.col_cuts) - 1):
+                        idx = i * (len(self.col_cuts) - 1) + j
+                        visit_status[idx] = float(
+                            self.rectangles[(i, j)]['is_visited'])
+                for i in range(idx + 1, max_regions):
+                    visit_status[i] = 100
+
                 state = np.concatenate(
-                    [[self.phase], self.partition_values,
-                     np.array(self.car_pos).flatten()]
-                )
+                    [[self.phase], self.partition_values, visit_status])
                 return state, reward, False, False, {}

         elif self.phase == 2:
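Note: a standalone sketch of the visit-status flattening this hunk introduces (hypothetical function name, not the repository's code): real regions contribute their 0/1 visited flag, and the unused tail of the max_regions-sized vector is filled with the sentinel 100.

import numpy as np

def build_visit_status(rectangles, n_rows, n_cols, max_regions=9):
    visit_status = np.zeros(max_regions, dtype=np.float32)
    idx = 0
    for i in range(n_rows):
        for j in range(n_cols):
            idx = i * n_cols + j
            visit_status[idx] = float(rectangles[(i, j)]['is_visited'])
    visit_status[idx + 1:] = 100     # sentinel for slots beyond the real regions
    return visit_status

# 2x2 partition inside the 9-slot vector; only region (0, 0) has been visited
rects = {(i, j): {'is_visited': (i == 0 and j == 0)} for i in range(2) for j in range(2)}
print(build_visit_status(rects, 2, 2))
# [  1.   0.   0.   0. 100. 100. 100. 100. 100.]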
@@ -224,9 +241,20 @@
                     self.rectangles[(new_row, new_col)]['is_visited'] = True

                 # Observation state
-                state = np.concatenate(
-                    [[self.phase], self.partition_values, np.array(self.car_pos).flatten()])
                 reward = 0
+                max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                visit_status = np.zeros(max_regions, dtype=np.float32)
+                # Fill in the visit status of the actual regions
+                for i in range(len(self.row_cuts) - 1):
+                    for j in range(len(self.col_cuts) - 1):
+                        idx = i * (len(self.col_cuts) - 1) + j
+                        visit_status[idx] = float(
+                            self.rectangles[(i, j)]['is_visited'])
+                for i in range(idx + 1, max_regions):
+                    visit_status[i] = 100
+                state = np.concatenate(
+                    [[self.phase], self.partition_values, visit_status])

                 # Episode termination: all grid cells visited or the step limit reached
                 done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
@@ -238,7 +266,7 @@
                     # print(T)
                     # print(self.partition_values)
                     # print(self.car_traj)
-                    reward += self.BASE_LINE / T * 100
+                    reward += self.BASE_LINE / T * 1000
                 elif done and self.step_count >= self.MAX_STEPS:
                     reward += -1000

View File

@@ -1,17 +1,18 @@
-from env import PartitionMazeEnv
-# from env_dis import PartitionMazeEnv
+# from env import PartitionMazeEnv
+from env_dis import PartitionMazeEnv

 env = PartitionMazeEnv()

 state = env.reset()
 print(state)

-action_series = [[0], [0], [0.4], [0], [0.1]]
-# action_series = [0, 0, 3, 0, 0, 10]
+# action_series = [[0], [0], [0.4], [0], [0.1]]
+action_series = [0, 0, 3, 0, 10]

 for i in range(100):
     action = action_series[i]
     state, reward, done, info, _ = env.step(action)
-    print(state, reward, done, info)
+    print(state)
+    print(reward)
     if done:
         break
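Note: read together with the masking in DQN.py above, actions 0-9 appear to be partition-phase choices and 10-14 maze-phase moves, so the five-element series seems to cover the CUT_NUM = 4 cut decisions plus one maze step. Since the loop runs for up to 100 iterations but action_series holds only five entries, here is a sketch of a driver that stops when the scripted actions run out (same names as the script, illustrative only):

for i, action in enumerate(action_series):
    state, reward, done, info, _ = env.step(action)
    print(i, state, reward)
    if done:
        break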