Add DQN algorithm
parent 343008bc9f
commit f19e8fbdbf
DQN/RL_brain.py (deleted, 254 lines)
@@ -1,254 +0,0 @@
"""
Deep Q Network off-policy
"""
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(42)
torch.manual_seed(2)


class Network(nn.Module):
    """
    Network Structure
    """

    def __init__(self,
                 n_features,
                 n_actions,
                 n_neuron=10
                 ):
        super(Network, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_neuron, bias=True),
            nn.ReLU(),  # hidden-layer activation; the output layer stays linear so Q-values are not clipped
            nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True),
        )

    def forward(self, s):
        """
        :param s: state
        :return: Q-value of every action
        """
        q = self.net(s)
        return q


class DeepQNetwork(nn.Module):
    """
    Q Learning Algorithm
    """

    def __init__(self,
                 n_actions,
                 n_features,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 replace_target_iter=300,
                 memory_size=500,
                 batch_size=32,
                 e_greedy_increment=None):
        super(DeepQNetwork, self).__init__()

        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        # A pd.DataFrame is used as the replay memory:
        # the number of rows is the memory size, i.e. how many transitions are kept;
        # the number of columns is the length of one transition [s, a, r, s_],
        # where a and r are scalars and s and s_ each have length n_features.
        self.memory = pd.DataFrame(np.zeros((self.memory_size, self.n_features * 2 + 2)))

        # build two networks: eval_net and target_net
        self.eval_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.target_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.loss_function = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)

        # record the loss of every learning step
        self.cost_his = []

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            # hasattr checks whether the object already has this attribute
            self.memory_counter = 0

        transition = np.hstack((s, [a, r], s_))

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory.iloc[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get the Q-value of every action
            s = torch.FloatTensor(observation)
            actions_value = self.eval_net(s)
            action = int(np.argmax(actions_value.detach().numpy()))
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        # copy the evaluation network's parameters into the target network
        self.target_net.load_state_dict(self.eval_net.state_dict())

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget params replaced\n')

        # sample batch memory from all memory
        batch_memory = self.memory.sample(self.batch_size) \
            if self.memory_counter > self.memory_size \
            else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True)

        # run the networks
        s = torch.FloatTensor(batch_memory.iloc[:, :self.n_features].values)
        s_ = torch.FloatTensor(batch_memory.iloc[:, -self.n_features:].values)
        q_eval = self.eval_net(s)
        q_next = self.target_net(s_).detach()  # no gradient flows through the target network

        # change q_target w.r.t q_eval's action
        q_target = q_eval.clone().detach()

        # overwrite the target entries of the actions that were actually taken
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory.iloc[:, self.n_features].values.astype(int)
        reward = batch_memory.iloc[:, self.n_features + 1].values

        q_target[batch_index, eval_act_index] = torch.FloatTensor(reward) + self.gamma * q_next.max(dim=1).values

        # train eval network
        loss = self.loss_function(q_eval, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.cost_his.append(loss.detach().numpy())

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        plt.figure()
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.show()
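For reference, each row of the DataFrame replay memory above stores one transition laid out as [s, a, r, s_]. A minimal sketch of that layout (an editor's illustration, not part of the commit), assuming n_features = 2:

import numpy as np
import pandas as pd

n_features = 2
memory = pd.DataFrame(np.zeros((5, n_features * 2 + 2)))   # 5 transitions, columns = s | a | r | s_

s, a, r, s_ = np.array([0.1, 0.2]), 3, 1.0, np.array([0.3, 0.4])
memory.iloc[0, :] = np.hstack((s, [a, r], s_))
print(memory.iloc[0].values)   # -> [0.1 0.2 3.  1.  0.3 0.4]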
@@ -1,58 +0,0 @@
from RL_brain import DeepQNetwork
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


def run_maze():
    step = 0  # count the steps taken; learning starts only after the memory has accumulated some transitions
    for episode in range(200):
        # initial observation
        observation = env.reset()

        while True:
            # refresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # !! store transition
            RL.store_transition(observation, action, reward, observation_)

            # after more than 200 transitions have been stored, learn once every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print("game over")
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = PartitionMazeEnv()

    # TODO: this script is not finished yet and does not run!!!
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Duel_Double_DQN/DQN.py (new file, 144 lines)
@@ -0,0 +1,144 @@
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch
import copy


def build_net(layer_shape, activation, output_activation):
    '''Build a fully connected network with a for loop'''
    layers = []
    for j in range(len(layer_shape) - 1):
        act = activation if j < len(layer_shape) - 2 else output_activation
        layers += [nn.Linear(layer_shape[j], layer_shape[j + 1]), act()]
    return nn.Sequential(*layers)


class Q_Net(nn.Module):
    def __init__(self, state_dim, action_dim, hid_shape):
        super(Q_Net, self).__init__()
        layers = [state_dim] + list(hid_shape) + [action_dim]
        self.Q = build_net(layers, nn.ReLU, nn.Identity)

    def forward(self, s):
        q = self.Q(s)
        return q


class Duel_Q_Net(nn.Module):
    def __init__(self, state_dim, action_dim, hid_shape):
        super(Duel_Q_Net, self).__init__()
        layers = [state_dim] + list(hid_shape)
        self.hidden = build_net(layers, nn.ReLU, nn.ReLU)
        self.V = nn.Linear(hid_shape[-1], 1)           # state-value stream V(s)
        self.A = nn.Linear(hid_shape[-1], action_dim)  # advantage stream A(s, a)

    def forward(self, s):
        s = self.hidden(s)
        Adv = self.A(s)
        V = self.V(s)
        Q = V + (Adv - torch.mean(Adv, dim=-1, keepdim=True))  # Q(s,a) = V(s) + A(s,a) - mean_a A(s,a)
        return Q
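
# --- Editor's sketch, not part of the committed file -----------------------------------
# Why Duel_Q_Net subtracts the advantage mean: without a constraint, V(s) and A(s, a)
# cannot be uniquely recovered from Q(s, a), because adding a constant to every advantage
# and subtracting it from V yields the same Q. Subtracting the mean pins the decomposition
# down (the advantages average to zero), and the combined Q is invariant to any constant
# shift of the raw advantage outputs:
#
#   V = torch.tensor([[1.5]])
#   A = torch.tensor([[0.2, -0.1, 0.4]])
#   Q1 = V + (A - A.mean(dim=-1, keepdim=True))
#   Q2 = V + ((A + 7.0) - (A + 7.0).mean(dim=-1, keepdim=True))  # shift all advantages by +7
#   torch.allclose(Q1, Q2)  # True: the shift cancels out
# ----------------------------------------------------------------------------------------
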
class DQN_agent(object):
    def __init__(self, **kwargs):
        # Init hyperparameters for the agent, just like "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
        self.__dict__.update(kwargs)
        self.tau = 0.005
        self.replay_buffer = ReplayBuffer(self.state_dim, self.dvc, max_size=int(1e6))
        if self.Duel:
            self.q_net = Duel_Q_Net(self.state_dim, self.action_dim, (self.net_width, self.net_width)).to(self.dvc)
        else:
            self.q_net = Q_Net(self.state_dim, self.action_dim, (self.net_width, self.net_width)).to(self.dvc)
        self.q_net_optimizer = torch.optim.Adam(self.q_net.parameters(), lr=self.lr)
        self.q_target = copy.deepcopy(self.q_net)
        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.q_target.parameters():
            p.requires_grad = False
    def select_action(self, state, deterministic):  # only used when interacting with the env
        with torch.no_grad():
            state = torch.FloatTensor(state.reshape(1, -1)).to(self.dvc)
            # if deterministic:
            #     a = self.q_net(state).argmax().item()
            # else:
            # NOTE: the deterministic flag is currently unused because the block above is commented out.
            # state[0][0] is the environment phase (see env_dis.py): phase 0 only allows the
            # cut actions 0~9, the later phases only allow the move actions 10~13.
            if np.random.rand() < self.exp_noise:
                if state[0][0] == 0:
                    a = np.random.randint(0, 10)
                else:
                    a = np.random.randint(10, 14)  # the upper bound is exclusive, so this covers 10~13
            else:
                q_value = self.q_net(state)  # shape (1, action_dim)
                if state[0][0] == 0:
                    q_value[0, 10:] = -float('inf')  # mask the move actions during the partition phase
                else:
                    q_value[0, :10] = -float('inf')  # mask the cut actions during the maze phase
                a = q_value.argmax().item()
            return a
    def train(self):
        s, a, r, s_next, dw = self.replay_buffer.sample(self.batch_size)

        '''Compute the target Q value'''
        with torch.no_grad():
            if self.Double:
                # Double DQN: pick the greedy action with the online net, evaluate it with the target net
                argmax_a = self.q_net(s_next).argmax(dim=1).unsqueeze(-1)
                max_q_next = self.q_target(s_next).gather(1, argmax_a)
            else:
                max_q_next = self.q_target(s_next).max(1)[0].unsqueeze(1)
            target_Q = r + (~dw) * self.gamma * max_q_next  # dw: die or win (no bootstrapping on terminal states)

        # Get current Q estimates
        current_q = self.q_net(s)
        current_q_a = current_q.gather(1, a)

        q_loss = F.mse_loss(current_q_a, target_Q)
        self.q_net_optimizer.zero_grad()
        q_loss.backward()
        self.q_net_optimizer.step()

        # Update the frozen target models (Polyak averaging with coefficient tau)
        for param, target_param in zip(self.q_net.parameters(), self.q_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, algo, EnvName, steps):
        # keep the save path consistent with load() and the 'model' directory created in main.py
        torch.save(self.q_net.state_dict(), "./model/{}_{}_{}.pth".format(algo, EnvName, steps))

    def load(self, algo, EnvName, steps):
        self.q_net.load_state_dict(torch.load("./model/{}_{}_{}.pth".format(algo, EnvName, steps), map_location=self.dvc))
        self.q_target.load_state_dict(torch.load("./model/{}_{}_{}.pth".format(algo, EnvName, steps), map_location=self.dvc))


class ReplayBuffer(object):
    def __init__(self, state_dim, dvc, max_size=int(1e6)):
        self.max_size = max_size
        self.dvc = dvc
        self.ptr = 0
        self.size = 0

        self.s = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.a = torch.zeros((max_size, 1), dtype=torch.long, device=self.dvc)
        self.r = torch.zeros((max_size, 1), dtype=torch.float, device=self.dvc)
        self.s_next = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.dw = torch.zeros((max_size, 1), dtype=torch.bool, device=self.dvc)

    def add(self, s, a, r, s_next, dw):
        self.s[self.ptr] = torch.from_numpy(s).to(self.dvc)
        self.a[self.ptr] = a
        self.r[self.ptr] = r
        self.s_next[self.ptr] = torch.from_numpy(s_next).to(self.dvc)
        self.dw[self.ptr] = dw

        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = torch.randint(0, self.size, device=self.dvc, size=(batch_size,))
        return self.s[ind], self.a[ind], self.r[ind], self.s_next[ind], self.dw[ind]
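Because DQN_agent pulls every hyperparameter out of **kwargs, the attribute names it relies on are easy to miss. The following is a minimal construction sketch (an editor's illustration, not part of the commit); the names mirror the argparse options that main.py passes in, and the values are placeholders:

import torch
from DQN import DQN_agent  # assumes Duel_Double_DQN/ is on the import path, as in main.py

agent = DQN_agent(
    state_dim=7,              # e.g. 1 + CUT_NUM + 2 * num_cars; read env.observation_space in practice
    action_dim=14,            # 10 cut actions + 4 move actions (see env_dis.py)
    dvc=torch.device('cpu'),  # device used for the networks and the replay buffer
    lr=1e-4,
    gamma=0.99,
    batch_size=256,
    net_width=200,            # hidden-layer width for Q_Net / Duel_Q_Net
    exp_noise=0.2,            # epsilon for the epsilon-greedy exploration in select_action
    Double=True,              # use the Double DQN target in train()
    Duel=True,                # use the dueling head Duel_Q_Net
)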
Duel_Double_DQN/main.py (new file, 163 lines)
@@ -0,0 +1,163 @@
import gymnasium as gym
import os
import shutil
import argparse
import torch
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env_dis import PartitionMazeEnv
from utils import evaluate_policy, str2bool
from datetime import datetime
from DQN import DQN_agent

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu',
                    help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='CP-v1, LLd-v2')
parser.add_argument('--write', type=str2bool, default=False,
                    help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool,
                    default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool,
                    default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=100,
                    help='which model to load')

parser.add_argument('--seed', type=int, default=42, help='random seed')
parser.add_argument('--Max_train_steps', type=int,
                    default=int(1e8), help='Max training steps')
parser.add_argument('--save_interval', type=int,
                    default=int(50e3), help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=int(2e3),
                    help='Model evaluating interval, in steps.')
parser.add_argument('--random_steps', type=int, default=int(3e3),
                    help='steps for random policy to explore')
parser.add_argument('--update_every', type=int,
                    default=50, help='training frequency')

parser.add_argument('--gamma', type=float, default=0.99,
                    help='Discounted Factor')
parser.add_argument('--net_width', type=int,
                    default=200, help='Hidden net width')
parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
parser.add_argument('--batch_size', type=int, default=256,
                    help='length of sliced trajectory')
parser.add_argument('--exp_noise', type=float,
                    default=0.2, help='explore noise')
parser.add_argument('--noise_decay', type=float, default=0.99,
                    help='decay rate of explore noise')
parser.add_argument('--Double', type=str2bool, default=True,
                    help='Whether to use Double Q-learning')
parser.add_argument('--Duel', type=str2bool, default=True,
                    help='Whether to use Dueling networks')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)


def main():
    EnvName = ['CartPole-v1', 'LunarLander-v2']
    BriefEnvName = ['PM_DQN', 'CPV1', 'LLdV2']
    # env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    # eval_env = gym.make(EnvName[opt.EnvIdex])
    env = PartitionMazeEnv()
    eval_env = PartitionMazeEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.n
    opt.max_e_steps = 50

    # Algorithm Setting
    if opt.Duel:
        algo_name = 'Duel'
    else:
        algo_name = ''
    if opt.Double:
        algo_name += 'DDQN'
    else:
        algo_name += 'DQN'

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')

    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}-{}_S{}_'.format(algo_name,
                                             BriefEnvName[opt.EnvIdex], opt.seed) + timenow
        if os.path.exists(writepath):
            shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)

    # Build model and replay buffer
    if not os.path.exists('model'):
        os.mkdir('model')
    agent = DQN_agent(**vars(opt))
    if opt.Loadmodel:
        agent.load(algo_name, BriefEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        while True:
            score = evaluate_policy(env, agent, 1)
            print('EnvName:', BriefEnvName[opt.EnvIdex],
                  'seed:', opt.seed, 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            # Do not use opt.seed directly, or it can overfit to opt.seed
            s = env.reset(seed=env_seed)
            env_seed += 1
            done = False

            '''Interact & train'''
            while not done:
                # e-greedy exploration
                if total_steps < opt.random_steps:
                    a = env.action_space.sample()
                else:
                    a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a)
                done = (dw or tr)

                agent.replay_buffer.add(s, a, r, s_next, dw)
                s = s_next

                '''Update'''
                # train 50 times every 50 steps rather than once per step. Better!
                if total_steps >= opt.random_steps and total_steps % opt.update_every == 0:
                    for j in range(opt.update_every):
                        agent.train()

                '''Noise decay & Record & Log'''
                if total_steps % 1000 == 0:
                    agent.exp_noise *= opt.noise_decay
                if total_steps % opt.eval_interval == 0:
                    score = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write:
                        writer.add_scalar(
                            'ep_r', score, global_step=total_steps)
                        writer.add_scalar(
                            'noise', agent.exp_noise, global_step=total_steps)
                    print('EnvName:', BriefEnvName[opt.EnvIdex], 'seed:', opt.seed, 'steps: {}k'.format(
                        int(total_steps / 1000)), 'score:', int(score))
                total_steps += 1

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(algo_name, BriefEnvName[opt.EnvIdex], int(
                        total_steps / 1000))
        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()
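For reference, a typical way to launch the training script above (an editor's sketch, assuming it is run from the repository root where params.yml lives; the flags come from the argparse block, and str2bool lets the boolean switches be passed as plain strings):

python Duel_Double_DQN/main.py --dvc cpu --write True --Double True --Duel True --seed 42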
Duel_Double_DQN/utils.py (new file, 28 lines)
@@ -0,0 +1,28 @@
def evaluate_policy(env, agent, turns=3):
    total_scores = 0
    for j in range(turns):
        s = env.reset()
        done = False
        while not done:
            # Take deterministic actions at test time
            a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = (dw or tr)

            total_scores += r
            s = s_next
    return int(total_scores / turns)


# You can just ignore this function. It is not related to the RL part.
def str2bool(v):
    '''transfer str to bool for argparse'''
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        print('Wrong Input.')
        raise ValueError('Wrong Input.')  # a bare raise outside an except block would itself fail
env.py (changed, 4 lines)
@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         # Hyperparameters that may need manual tuning
         ##############################
-        self.CUT_NUM = 2   # half of the cuts are horizontal, half are vertical
-        self.BASE_LINE = 4000   # baseline time, computed with a greedy or Monte Carlo method
+        self.CUT_NUM = 6   # half of the cuts are horizontal, half are vertical
+        self.BASE_LINE = 12000   # baseline time, computed with a greedy or Monte Carlo method
 
         self.phase = 0   # phase control, 0: region partition, 1: maze initialization, 2: maze walking
         self.partition_step = 0   # partition-phase step counter, range 0~4
env_dis.py (new file, 278 lines)
@@ -0,0 +1,278 @@
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import yaml
import math


class PartitionMazeEnv(gym.Env):
    """
    Custom environment with two phases:
    Phase 0: region partitioning (4 steps in total; each step outputs one scalar that fixes a vertical or
             horizontal cut position). The order is: step 1 outputs c1, step 2 outputs c2, step 3 outputs r1,
             step 4 outputs r2. After discretization the values are restricted to {0, 0.1, 0.2, ..., 0.9}
             (0 means "do not cut").
    Phase 1: vehicle path planning (maze walking). The vehicles start from the region center and move
             up/down/left/right over the grid until every target cell is covered or the step limit is reached.
    """

    def __init__(self, config=None):
        super(PartitionMazeEnv, self).__init__()
        # fleet parameter settings
        with open('params.yml', 'r', encoding='utf-8') as file:
            params = yaml.safe_load(file)

        self.H = params['H']
        self.W = params['W']
        self.num_cars = params['num_cars']

        self.flight_time_factor = params['flight_time_factor']
        self.comp_time_factor = params['comp_time_factor']
        self.trans_time_factor = params['trans_time_factor']
        self.car_time_factor = params['car_time_factor']
        self.bs_time_factor = params['bs_time_factor']

        self.flight_energy_factor = params['flight_energy_factor']
        self.comp_energy_factor = params['comp_energy_factor']
        self.trans_energy_factor = params['trans_energy_factor']
        self.battery_energy_capacity = params['battery_energy_capacity']

        ##############################
        # Hyperparameters that may need manual tuning
        ##############################
        self.CUT_NUM = 4  # half of the cuts are horizontal, half are vertical
        self.BASE_LINE = 4000  # baseline time, computed with a greedy or Monte Carlo method

        self.phase = 0  # phase control, 0: region partition, 1: maze initialization, 2: maze walking
        self.partition_step = 0  # partition-phase step counter, range 0~4
        self.partition_values = np.zeros(
            self.CUT_NUM, dtype=np.float32)  # stores c1, c2, r1, r2

        # Action space: a discrete space of size 14.
        # The first 10 actions are cut actions {0, 0.1, ..., 0.9}; the last 4 are up/down/left/right moves.
        self.action_space = spaces.Discrete(14)

        # Observation space.
        # TODO: the returned state currently only contains position coordinates.
        # Phase 0 state: the first 4 dims hold the decided cut values (undecided entries are 0).
        # Phase 1 state: the vehicle positions (2D each).
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + 2 * self.num_cars,), dtype=np.float32)

        # partition-phase variables
        self.col_cuts = []  # vertical cut positions (c1, c2); 0 means "do not cut"
        self.row_cuts = []  # horizontal cut positions (r1, r2)

        self.init_maze_step = 0

        # path-planning-phase variables
        self.MAX_STEPS = 50  # step limit for maze walking
        self.step_count = 0
        self.rectangles = {}
        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0

    def reset(self, seed=None, options=None):
        # reset every variable and go back to the partition phase (phase 0)
        self.phase = 0
        self.partition_step = 0
        self.partition_values = np.zeros(self.CUT_NUM, dtype=np.float32)
        self.col_cuts = []
        self.row_cuts = []
        self.init_maze_step = 0
        self.region_centers = []
        self.step_count = 0
        self.rectangles = {}
        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0
        # state: the phase flag, then the partition values, then zero padding for the car positions
        state = np.concatenate(
            [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
        return state

    def step(self, action):
        # actions are discrete in every phase
        if self.phase == 0:
            # partition phase: the first 10 actions map to {0, 0.1, ..., 0.9}
            disc_val = action * 0.1  # the action index maps directly to the cut ratio
            self.partition_values[self.partition_step] = disc_val
            self.partition_step += 1

            # build the current state: the first partition_step entries hold the decided values,
            # the rest are 0, followed by zero padding for the car positions
            state = np.concatenate(
                [[self.phase], self.partition_values, np.zeros(
                    np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]
            )

            # if fewer than 4 cuts have been decided we stay in the partition phase: no reward, done is False
            if self.partition_step < self.CUT_NUM:
                return state, 0.0, False, False, {}
            else:
                # after all 4 steps, compute the cut boundaries
                # filter out zeros, deduplicate and sort
                vert = sorted(set(v for v in self.partition_values[:len(
                    self.partition_values) // 2] if v > 0))
                horiz = sorted(set(v for v in self.partition_values[len(
                    self.partition_values) // 2:] if v > 0))
                vertical_cuts = vert if vert else []
                horizontal_cuts = horiz if horiz else []

                # boundaries always include 0 and 1
                self.col_cuts = [0.0] + vertical_cuts + [1.0]
                self.row_cuts = [0.0] + horizontal_cuts + [1.0]

                # check whether the partition is feasible and compute the offloading ratio rho of every region
                valid_partition = True
                for i in range(len(self.row_cuts) - 1):
                    for j in range(len(self.col_cuts) - 1):
                        d = (self.col_cuts[j + 1] - self.col_cuts[j]) * self.W * \
                            (self.row_cuts[i + 1] - self.row_cuts[i]) * self.H
                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
                            (self.comp_time_factor - self.trans_time_factor)
                        rho_energy_limit = (self.battery_energy_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
                            (self.comp_energy_factor * d -
                             self.trans_energy_factor * d)
                        if rho_energy_limit < 0:
                            valid_partition = False
                            break
                        rho = min(rho_time_limit, rho_energy_limit)

                        flight_time = self.flight_time_factor * d
                        bs_time = self.bs_time_factor * (1 - rho) * d

                        self.rectangles[(i, j)] = {
                            'center': ((self.row_cuts[i] + self.row_cuts[i + 1]) * self.H / 2, (self.col_cuts[j + 1] + self.col_cuts[j]) * self.W / 2),
                            'flight_time': flight_time,
                            'bs_time': bs_time,
                            'is_visited': False
                        }
                    if not valid_partition:
                        break

                if not valid_partition:
                    reward = -10000
                    state = np.concatenate(
                        [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
                    return state, reward, True, False, {}
                else:
                    # enter phase 1: initialize the maze
                    self.phase = 1
                    state = np.concatenate(
                        [[self.phase], self.partition_values, np.array(self.car_pos).flatten()])
                    reward = 10

                    # build a reverse index (center -> grid cell) for later lookups
                    self.reverse_rectangles = {
                        v['center']: k for k, v in self.rectangles.items()}
                    return state, reward, False, False, {}

        elif self.phase == 1:
            # TODO phase 1 does not have to be a separate step!!!
            # Phase 1: initialize the maze; the vehicles leave the overall region center
            # and drive to the nearest region centers.
            region_centers = [
                (i, j, self.rectangles[(i, j)]['center'])
                for i in range(len(self.row_cuts) - 1)
                for j in range(len(self.col_cuts) - 1)
            ]
            # sort by distance to the overall region center, nearest first
            region_centers.sort(
                key=lambda x: math.dist(x[2], (self.H / 2, self.W / 2))
            )

            # assign the nearest region to each vehicle
            for idx in range(self.num_cars):
                i, j, center = region_centers[idx]
                self.car_pos[idx] = center
                self.car_traj[idx].append((i, j))
                self.rectangles[(i, j)]['is_visited'] = True

            # enter phase 2: maze walking
            self.phase = 2
            state = np.concatenate(
                [[self.phase], self.partition_values,
                 np.array(self.car_pos).flatten()]
            )
            return state, 0.0, False, False, {}
        elif self.phase == 2:
            # Phase 2: path planning (maze walking).
            # The last 4 actions correspond to up/down/left/right moves.
            current_car = self.current_car_index
            current_row, current_col = self.reverse_rectangles[self.car_pos[current_car]]

            # initialize the new row/column with the current values
            new_row, new_col = current_row, current_col

            if action == 10 and current_row > 0:  # up
                new_row = current_row - 1
            elif action == 11 and current_row < len(self.row_cuts) - 2:  # down
                new_row = current_row + 1
            elif action == 12 and current_col > 0:  # left
                new_col = current_col - 1
            elif action == 13 and current_col < len(self.col_cuts) - 2:  # right
                new_col = current_col + 1

            # update the vehicle position
            self.car_pos[current_car] = self.rectangles[(
                new_row, new_col)]['center']
            if new_row != current_row or new_col != current_col:
                self.car_traj[current_car].append((new_row, new_col))
            self.step_count += 1
            self.current_car_index = (
                self.current_car_index + 1) % self.num_cars

            # update the visited flag: mark the new grid cell as visited
            self.rectangles[(new_row, new_col)]['is_visited'] = True

            # observation
            state = np.concatenate(
                [[self.phase], self.partition_values, np.array(self.car_pos).flatten()])
            reward = 0

            # episode termination: every cell has been visited or the step limit is reached
            done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
                self.step_count >= self.MAX_STEPS)
            if done and all([value['is_visited'] for _, value in self.rectangles.items()]):
                # full coverage: compute each motorcade's completion time from its trajectory
                T = max([self._compute_motorcade_time(idx)
                         for idx in range(self.num_cars)])
                # print(T)
                # print(self.partition_values)
                # print(self.car_traj)
                reward += self.BASE_LINE / T * 100
            elif done and self.step_count >= self.MAX_STEPS:
                reward += -1000

            return state, reward, done, False, {}

    def _compute_motorcade_time(self, idx):
        flight_time = sum(self.rectangles[tuple(point)]['flight_time']
                          for point in self.car_traj[idx])
        bs_time = sum(self.rectangles[tuple(point)]['bs_time']
                      for point in self.car_traj[idx])

        # driving time of the car; the overall region center is added at the head and tail of the trajectory
        car_time = 0
        for i in range(len(self.car_traj[idx]) - 1):
            first_point = self.car_traj[idx][i]
            second_point = self.car_traj[idx][i + 1]
            car_time += math.dist(self.rectangles[first_point]['center'], self.rectangles[second_point]['center']) * \
                self.car_time_factor
        car_time += math.dist(self.rectangles[self.car_traj[idx][0]]['center'], [
                              self.H / 2, self.W / 2]) * self.car_time_factor
        car_time += math.dist(self.rectangles[self.car_traj[idx][-1]]['center'], [
                              self.H / 2, self.W / 2]) * self.car_time_factor

        return max(float(car_time) + flight_time, bs_time)

    def render(self):
        if self.phase == 1:
            print("Phase 1: Initialize maze environment.")
            print(f"Partition values so far: {self.partition_values}")
            print(f"Motorcade position: {self.car_pos}")
            # input('1111')
        elif self.phase == 2:
            print("Phase 2: Play maze.")
            print(f'Motorcade trajectory: {self.car_traj}')
            # input('2222')
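A minimal interaction sketch against PartitionMazeEnv as defined above (an editor's illustration, not part of the commit; it assumes params.yml is present since __init__ loads it). Actions 0-9 choose cut ratios during phase 0, actions 10-13 move a vehicle in the maze phase, and step() returns the gymnasium-style 5-tuple that main.py consumes:

import numpy as np
from env_dis import PartitionMazeEnv

env = PartitionMazeEnv()
s = env.reset()                      # note: reset() returns the state vector directly
done = False
while not done:
    if s[0] == 0:
        a = np.random.randint(0, 10)   # cut actions: 0 -> no cut, k -> cut at 0.1 * k
    else:
        a = np.random.randint(10, 14)  # move actions: up / down / left / right
    s, r, dw, tr, info = env.step(a)
    done = dw or tr
print('reward of the final step:', r)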