Update env_dis

parent ff23b5e745
commit 4972306ca7
@@ -69,11 +69,11 @@ class DQN_agent(object):
         else:
             if state[0][0] == 0:
                 q_value = self.q_net(state)
-                q_value[10:] = - float('inf')
+                q_value[0][10:] = - float('inf')
                 a = q_value.argmax().item()
             else:
                 q_value = self.q_net(state)
-                q_value[:10] = - float('inf')
+                q_value[0][:10] = - float('inf')
                 a = q_value.argmax().item()
     return a

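The change from q_value[10:] to q_value[0][10:] matters because self.q_net(state) returns a batch-shaped tensor of shape (1, action_dim): slicing along dim 0 would mask (nonexistent) batch rows instead of actions, while [0][10:] masks the intended action entries. A minimal sketch of the same phase-dependent masking as a standalone helper; the helper name, the 10/15 action split, and the phase flag living at state[0][0] are read off this diff, not a confirmed repository API:

import torch

def greedy_action_with_phase_mask(q_net, state, split=10):
    # Assumption from this diff: state[0][0] encodes the phase, actions
    # [0, split) are partition actions, actions [split, n) are maze moves.
    with torch.no_grad():
        q = q_net(state).clone()          # shape (1, n_actions)
        if state[0][0] == 0:
            q[0][split:] = -float('inf')  # phase 0: only partition actions
        else:
            q[0][:split] = -float('inf')  # later phases: only maze moves
        return q.argmax().item()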
@@ -1,15 +1,17 @@
+from DQN import DQN_agent
+from datetime import datetime
+from utils import evaluate_policy, str2bool
 import gymnasium as gym
-import os
 import shutil
 import argparse
 import torch
 import numpy as np
+# fmt: off
 import sys
+import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env_dis import PartitionMazeEnv
-from utils import evaluate_policy, str2bool
-from datetime import datetime
-from DQN import DQN_agent
+# fmt: on

 '''Hyperparameter Setting'''
 parser = argparse.ArgumentParser()
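The reshuffled imports wrap the sys.path manipulation in # fmt: off / # fmt: on so an auto-formatter or import sorter cannot hoist from env_dis import PartitionMazeEnv above the path tweak it depends on. A minimal sketch of the guarded pattern (the one-directory-up layout is taken from this diff; the noqa tag is an optional addition):

# fmt: off
# The path tweak must run before env_dis becomes importable, so this block
# is fenced off from formatters and import sorters.
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env_dis import PartitionMazeEnv  # noqa: E402
# fmt: on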
@@ -66,7 +68,6 @@ def main():
     eval_env = PartitionMazeEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.n
-    opt.max_e_steps = 50

     # Algorithm Setting
     if opt.Duel:
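Both sizes the agent needs are now read directly from the environment, and the separate opt.max_e_steps = 50 disappears because the step cap lives inside the environment as self.MAX_STEPS. A small hedged sketch of that wiring; the opt namespace here stands in for the parsed CLI options and only mirrors the field names used in this diff:

from types import SimpleNamespace
from env_dis import PartitionMazeEnv

opt = SimpleNamespace()                            # stand-in for the argparse options
env = PartitionMazeEnv()
opt.state_dim = env.observation_space.shape[0]     # 1 + CUT_NUM + max_regions = 14
opt.action_dim = env.action_space.n                # Discrete(15) -> 15
# No opt.max_e_steps: the env itself ends an episode once MAX_STEPS is reached.
print(opt.state_dim, opt.action_dim)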
@@ -87,7 +88,7 @@ def main():
     print("Random Seed: {}".format(opt.seed))

     print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
-          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')
+          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, '\n')

     if opt.write:
         from torch.utils.tensorboard import SummaryWriter
@@ -1,16 +1,21 @@
+import numpy as np

 def evaluate_policy(env, agent, turns = 3):
     total_scores = 0
     for j in range(turns):
         s = env.reset()
         done = False
+        action_series = []
         while not done:
             # Take deterministic actions at test time
             a = agent.select_action(s, deterministic=True)
             s_next, r, dw, tr, info = env.step(a)
             done = (dw or tr)
+            action_series.append(a)
             total_scores += r
             s = s_next
+        print('action series: ', np.round(action_series, 3))
+        print('state: ', s)
     return int(total_scores/turns)

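evaluate_policy now records every greedy action and prints the rounded series plus the final state, which makes it easy to see whether the agent has collapsed onto a single partition/maze pattern. A hedged usage sketch with a stand-in agent; the RandomAgent class and the 15-action assumption are illustrative, not part of the repository:

import numpy as np
from env_dis import PartitionMazeEnv
from utils import evaluate_policy

class RandomAgent:
    # Stand-in agent so evaluate_policy can be exercised without a trained model.
    def select_action(self, s, deterministic=True):
        return np.random.randint(15)   # assumed action count, see env_dis.py

score = evaluate_policy(PartitionMazeEnv(), RandomAgent(), turns=1)
print('mean return over 1 turn:', score)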
env.py
@@ -71,6 +71,7 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
+        self.previous_T = 0

     def reset(self, seed=None, options=None):
         # Reset all variables and go back to the partition phase (phase 0)
@@ -290,9 +291,19 @@
                 # Region coverage finished: compute each motorcade's execution time from its trajectory
                 T = max([self._compute_motorcade_time(idx)
                          for idx in range(self.num_cars)])
-                reward += self.BASE_LINE / T * 1000
-                # reward += self.BASE_LINE - T
-                # print(reward)
+                # TODO make the reward vary more sharply around the baseline
+                # reward = math.exp(-T / self.BASE_LINE) * 1000
+                reward = self.BASE_LINE / T * 1000
+                if T < self.BASE_LINE:
+                    reward *= 10
+                print(reward)
+
+
+                # if reward > self.BASE_LINE:
+                #     reward -= 200
+                #     # TODO compute len(self.car_traj); the trajectory recording rule needs to change
+                #     reward -= 10 * self.step_count
+                # TODO adjust the baseline dynamically
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000

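The terminal reward is now a ratio against the fixed baseline rather than an additive bonus: reward = BASE_LINE / T * 1000, multiplied by 10 whenever the computed completion time T actually beats the baseline, so solutions faster than the greedy/Monte-Carlo reference are strongly preferred. A standalone sketch of this shaping rule next to the exponential variant the commit leaves commented out; BASE_LINE = 4000 is the constant from env_dis.py and the function names are illustrative:

import math

BASE_LINE = 4000.0   # reference completion time (greedy / Monte-Carlo estimate)

def shaped_reward(T, base_line=BASE_LINE, bonus=10.0):
    # Ratio reward: equals 1000 at T == base_line and grows as T shrinks.
    reward = base_line / T * 1000.0
    if T < base_line:
        reward *= bonus          # extra incentive for beating the baseline
    return reward

def shaped_reward_exp(T, base_line=BASE_LINE):
    # Alternative kept commented out in the commit: smooth exponential decay in T.
    return math.exp(-T / base_line) * 1000.0

# shaped_reward(5000) == 800.0, shaped_reward(4000) == 1000.0 (no bonus, T is not < baseline),
# shaped_reward(3000) -> about 13333.3 after the x10 bonus.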
env_dis.py
@@ -41,6 +41,7 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         self.CUT_NUM = 4  # half of the cuts horizontal, half vertical
         self.BASE_LINE = 4000  # baseline time, computed via greedy or Monte Carlo
+        self.MAX_STEPS = 50  # upper limit on maze-walking steps

         self.phase = 0  # phase control: 0 = partition phase, 1 = maze initialization, 2 = maze walking
         self.partition_step = 0  # partition-phase step counter, range 0-4
@@ -52,11 +53,11 @@ class PartitionMazeEnv(gym.Env):
         self.action_space = spaces.Discrete(15)

         # Define the observation space as an 8-dimensional vector
-        # TODO the returned state currently only contains position coordinates
         # Phase 0 state: the first 4 dims are the decided cut values (undecided entries are 0)
         # Phase 1 state: vehicle positions (2D)
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
         self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + 2 * self.num_cars,), dtype=np.float32)
+            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + max_regions,), dtype=np.float32)

         # Partition-phase variables
         self.col_cuts = []  # vertical cut positions (c₁, c₂); a value of 0 means no cut
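The observation changes from [phase, 4 cut values, per-car positions] to [phase, 4 cut values, one visit flag per potential region], so its length becomes 1 + CUT_NUM + (CUT_NUM // 2 + 1)**2 and no longer depends on the number of cars. A quick check of the numbers used in this commit (CUT_NUM = 4, hence at most a 3x3 grid of regions):

CUT_NUM = 4                                  # 2 horizontal + 2 vertical cuts
max_regions = (CUT_NUM // 2 + 1) ** 2        # (2 + 1)**2 = 9 potential regions
obs_dim = 1 + CUT_NUM + max_regions          # phase flag + cut values + visit flags
print(max_regions, obs_dim)                  # -> 9 14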
@@ -65,7 +66,6 @@ class PartitionMazeEnv(gym.Env):
         self.init_maze_step = 0

         # Path-planning-phase variables
-        self.MAX_STEPS = 50  # upper limit on maze-walking steps
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
@@ -87,8 +87,12 @@ class PartitionMazeEnv(gym.Env):
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
         # State: the first 4 dims are partition_values, the rest padded with 0
-        state = np.concatenate(
-            [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
+        state = np.concatenate([
+            [self.phase],
+            self.partition_values,
+            np.zeros(max_regions, dtype=np.float32)
+        ])
         return state

     def step(self, action):
@@ -102,7 +106,7 @@ class PartitionMazeEnv(gym.Env):
             # Build the current state: the first partition_step entries hold decided values, the rest are 0, plus 7 more zeros of padding
             state = np.concatenate(
                 [[self.phase], self.partition_values, np.zeros(
-                    np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]
+                    (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
             )

             # If the 4 steps are not finished yet we are still in the partition phase: no reward, done is False
@@ -153,7 +157,9 @@ class PartitionMazeEnv(gym.Env):
             if not valid_partition:
                 reward = -10000
                 state = np.concatenate(
-                    [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    [[self.phase], self.partition_values, np.zeros(
+                        (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
+                )
                 return state, reward, True, False, {}
             else:
                 # Initialize the maze
@@ -184,10 +190,21 @@ class PartitionMazeEnv(gym.Env):

                 # Enter phase 2: walk the maze
                 self.phase = 2
+
+                # Build the visit-status vector
+                max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                visit_status = np.zeros(max_regions, dtype=np.float32)
+
+                # Fill the visit status of the actual regions into the vector
+                for i in range(len(self.row_cuts) - 1):
+                    for j in range(len(self.col_cuts) - 1):
+                        idx = i * (len(self.col_cuts) - 1) + j
+                        visit_status[idx] = float(
+                            self.rectangles[(i, j)]['is_visited'])
+                for i in range(idx + 1, max_regions):
+                    visit_status[i] = 100
                 state = np.concatenate(
-                    [[self.phase], self.partition_values,
-                     np.array(self.car_pos).flatten()]
-                )
+                    [[self.phase], self.partition_values, visit_status])
                 return state, reward, False, False, {}

         elif self.phase == 2:
@@ -224,9 +241,20 @@ class PartitionMazeEnv(gym.Env):
                 self.rectangles[(new_row, new_col)]['is_visited'] = True

             # Observation state
-            state = np.concatenate(
-                [[self.phase], self.partition_values, np.array(self.car_pos).flatten()])
             reward = 0
+            max_regions = (self.CUT_NUM // 2 + 1) ** 2
+            visit_status = np.zeros(max_regions, dtype=np.float32)
+
+            # Fill the visit status of the actual regions into the vector
+            for i in range(len(self.row_cuts) - 1):
+                for j in range(len(self.col_cuts) - 1):
+                    idx = i * (len(self.col_cuts) - 1) + j
+                    visit_status[idx] = float(
+                        self.rectangles[(i, j)]['is_visited'])
+            for i in range(idx + 1, max_regions):
+                visit_status[i] = 100
+            state = np.concatenate(
+                [[self.phase], self.partition_values, visit_status])

             # Episode termination condition: all grid cells visited or the step limit reached
             done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
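The visit-status construction now appears twice in step() (once when phase 2 starts and once after every move), so it reads most clearly as one helper: real regions contribute 0.0 or 1.0 from is_visited, and trailing slots that correspond to no actual region (because a cut was degenerate) are filled with the sentinel 100. A hedged sketch of that helper; the rectangles / row_cuts / col_cuts structures are taken from this diff, and note that the sentinel 100 lies outside the declared Box range [0, 1], which gymnasium only flags when an environment checker is in use:

import numpy as np

def build_visit_status(rectangles, row_cuts, col_cuts, max_regions):
    # Flattened visit flags for every potential region; unused trailing slots
    # are marked with the sentinel value 100, mirroring this commit.
    visit_status = np.zeros(max_regions, dtype=np.float32)
    n_rows, n_cols = len(row_cuts) - 1, len(col_cuts) - 1
    idx = -1
    for i in range(n_rows):
        for j in range(n_cols):
            idx = i * n_cols + j
            visit_status[idx] = float(rectangles[(i, j)]['is_visited'])
    for k in range(idx + 1, max_regions):
        visit_status[k] = 100.0
    return visit_status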
@@ -238,7 +266,7 @@
                 # print(T)
                 # print(self.partition_values)
                 # print(self.car_traj)
-                reward += self.BASE_LINE / T * 100
+                reward += self.BASE_LINE / T * 1000
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000

@@ -1,17 +1,18 @@
-from env import PartitionMazeEnv
-# from env_dis import PartitionMazeEnv
+# from env import PartitionMazeEnv
+from env_dis import PartitionMazeEnv

 env = PartitionMazeEnv()

 state = env.reset()
 print(state)

-action_series = [[0], [0], [0.4], [0], [0.1]]
-# action_series = [0, 0, 3, 0, 0, 10]
+# action_series = [[0], [0], [0.4], [0], [0.1]]
+action_series = [0, 0, 3, 0, 10]

 for i in range(100):
     action = action_series[i]
     state, reward, done, info, _ = env.step(action)
-    print(state, reward, done, info)
+    print(state)
+    print(reward)
     if done:
         break