Add DDPG code
This commit is contained in:
parent
4fdb8aa152
commit
ab51727253
105 DDPG/DDPG.py Normal file
@@ -0,0 +1,105 @@
from utils import Actor, Q_Critic
import torch.nn.functional as F
import numpy as np
import torch
import copy


class DDPG_agent():
    def __init__(self, **kwargs):
        # Init hyperparameters for agent, just like "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
        self.__dict__.update(kwargs)
        self.tau = 0.005

        self.actor = Actor(self.state_dim, self.action_dim, self.net_width, self.max_action).to(self.dvc)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.a_lr)
        self.actor_target = copy.deepcopy(self.actor)

        self.q_critic = Q_Critic(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
        self.q_critic_optimizer = torch.optim.Adam(self.q_critic.parameters(), lr=self.c_lr)
        self.q_critic_target = copy.deepcopy(self.q_critic)

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, max_size=int(5e5), dvc=self.dvc)

    def select_action(self, state, deterministic):
        with torch.no_grad():
            state = torch.FloatTensor(state[np.newaxis, :]).to(self.dvc)  # from [x,x,...,x] to [[x,x,...,x]]
            a = self.actor(state).cpu().numpy()[0]  # from [[x,x,...,x]] to [x,x,...,x]
            if deterministic:
                return a
            else:
                noise = np.random.normal(0, self.max_action * self.noise, size=self.action_dim)
                return (a + noise).clip(-self.max_action, self.max_action)

    def train(self):
        # Compute the target Q
        with torch.no_grad():
            s, a, r, s_next, dw = self.replay_buffer.sample(self.batch_size)
            target_a_next = self.actor_target(s_next)
            target_Q = self.q_critic_target(s_next, target_a_next)
            target_Q = r + (~dw) * self.gamma * target_Q  # dw: die or win
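            # Note on the line above (the standard DDPG TD target, not new logic):
            #   target_Q = r + gamma * (1 - dw) * Q_target(s', actor_target(s'))
            # dw marks true terminal states ("die or win"), so (~dw) removes the bootstrap
            # term only on real termination, not on time-limit truncation.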

        # Get current Q estimates
        current_Q = self.q_critic(s, a)

        # Compute critic loss
        q_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the q_critic
        self.q_critic_optimizer.zero_grad()
        q_loss.backward()
        self.q_critic_optimizer.step()

        # Update the Actor
        a_loss = -self.q_critic(s, self.actor(s)).mean()
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
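        # (Soft / Polyak update: theta_target <- tau * theta + (1 - tau) * theta_target,
        #  with tau = 0.005 from __init__, so the target nets track the online nets slowly.)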
        with torch.no_grad():
            for param, target_param in zip(self.q_critic.parameters(), self.q_critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, EnvName, timestep):
        torch.save(self.actor.state_dict(), "./model/{}_actor{}.pth".format(EnvName, timestep))
        torch.save(self.q_critic.state_dict(), "./model/{}_q_critic{}.pth".format(EnvName, timestep))

    def load(self, EnvName, timestep):
        self.actor.load_state_dict(torch.load("./model/{}_actor{}.pth".format(EnvName, timestep), map_location=self.dvc))
        self.q_critic.load_state_dict(torch.load("./model/{}_q_critic{}.pth".format(EnvName, timestep), map_location=self.dvc))


class ReplayBuffer():
    def __init__(self, state_dim, action_dim, max_size, dvc):
        self.max_size = max_size
        self.dvc = dvc
        self.ptr = 0
        self.size = 0

        self.s = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.a = torch.zeros((max_size, action_dim), dtype=torch.float, device=self.dvc)
        self.r = torch.zeros((max_size, 1), dtype=torch.float, device=self.dvc)
        self.s_next = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.dw = torch.zeros((max_size, 1), dtype=torch.bool, device=self.dvc)

    def add(self, s, a, r, s_next, dw):
        # Only one transition is stored per call
        self.s[self.ptr] = torch.from_numpy(s).to(self.dvc)
        self.a[self.ptr] = torch.from_numpy(a).to(self.dvc)  # Note that a is numpy.array
        self.r[self.ptr] = r
        self.s_next[self.ptr] = torch.from_numpy(s_next).to(self.dvc)
        self.dw[self.ptr] = dw

        self.ptr = (self.ptr + 1) % self.max_size  # once the buffer is full, wrap around and overwrite from the start
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = torch.randint(0, self.size, device=self.dvc, size=(batch_size,))
        return self.s[ind], self.a[ind], self.r[ind], self.s_next[ind], self.dw[ind]
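
For orientation, a minimal sketch of how DDPG_agent is meant to be constructed and queried; the hyperparameter values below are illustrative (they mirror the defaults in main.py), and state_dim/action_dim/max_action would normally come from the environment:

    import numpy as np
    import torch
    from DDPG import DDPG_agent

    agent = DDPG_agent(
        dvc=torch.device('cpu'),
        state_dim=3, action_dim=1, max_action=1.0,  # assumed env dimensions
        net_width=400, a_lr=1e-3, c_lr=1e-3,
        gamma=0.99, noise=0.1, batch_size=128,
    )
    s = np.zeros(3, dtype=np.float32)
    a = agent.select_action(s, deterministic=False)  # noisy action for exploration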
121 DDPG/main.py Normal file
@@ -0,0 +1,121 @@
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent
import gymnasium as gym
import os, shutil
import argparse
import torch
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=100, help='which model to load')

parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--Max_train_steps', type=int, default=5e6, help='Max training steps')
parser.add_argument('--save_interval', type=int, default=1e5, help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=2e3, help='Model evaluating interval, in steps.')

parser.add_argument('--gamma', type=float, default=0.99, help='Discounted Factor')
parser.add_argument('--net_width', type=int, default=400, help='Hidden net width, s_dim-400-300-a_dim')
parser.add_argument('--a_lr', type=float, default=1e-3, help='Learning rate of actor')
parser.add_argument('--c_lr', type=float, default=1e-3, help='Learning rate of critic')
parser.add_argument('--batch_size', type=int, default=128, help='batch_size of training')
parser.add_argument('--random_steps', type=int, default=5e4, help='random steps before training')
parser.add_argument('--noise', type=float, default=0.1, help='exploring noise')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)


def main():
    EnvName = ['Pendulum-v1', 'LunarLanderContinuous-v2', 'Humanoid-v4', 'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
    BrifEnvName = ['PV1', 'LLdV2', 'Humanv4', 'HCv4', 'BWv3', 'BWHv3']

    # Build Env
    # env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    env = PartitionMazeEnv()
    # eval_env = gym.make(EnvName[opt.EnvIdex])
    eval_env = PartitionMazeEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.shape[0]
    opt.max_action = float(env.action_space.high[0])  # remark: action space [-max, max]
    print(f'Env:{EnvName[opt.EnvIdex]} state_dim:{opt.state_dim} action_dim:{opt.action_dim}')
    print(f'max_a:{opt.max_action} min_a:{env.action_space.low[0]}')

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    # Build SummaryWriter to record training curves
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath): shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)

    # Build DRL model
    if not os.path.exists('model'): os.mkdir('model')
    agent = DDPG_agent(**vars(opt))  # vars: transfer argparse Namespace to a dictionary
    if opt.Loadmodel: agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        while True:
            score = evaluate_policy(env, agent, turns=1)
            print('EnvName:', BrifEnvName[opt.EnvIdex], 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            s = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
            env_seed += 1
            done = False

            '''Interact & train'''
            while not done:
                if total_steps < opt.random_steps: a = env.action_space.sample()
                else: a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a)  # dw: dead & win; tr: truncated
                done = (dw or tr)

                agent.replay_buffer.add(s, a, r, s_next, dw)
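                # Note: only dw (true termination) is stored; tr (time-limit truncation)
                # ends the episode but should not zero the bootstrap term in the critic target.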
                s = s_next
                total_steps += 1

                '''train'''
                if total_steps >= opt.random_steps:
                    agent.train()

                '''record & log'''
                if total_steps % opt.eval_interval == 0:
                    ep_r = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write: writer.add_scalar('ep_r', ep_r, global_step=total_steps)
                    print(f'EnvName:{BrifEnvName[opt.EnvIdex]}, Steps: {int(total_steps/1000)}k, Episode Reward:{ep_r}')

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))
        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()
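
Assuming the layout above (the DDPG/ folder next to env.py in the repository root), training would typically be launched with something like `python DDPG/main.py --dvc cpu --write True`, and a saved checkpoint replayed with `--render True --Loadmodel True --ModelIdex <k>`, where <k> is the saved step count in thousands.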
64 DDPG/utils.py Normal file
@@ -0,0 +1,64 @@
import torch.nn.functional as F
import torch.nn as nn
import argparse
import torch


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, net_width, maxaction):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.maxaction = maxaction

    def forward(self, state):
        a = torch.relu(self.l1(state))
        a = torch.relu(self.l2(a))
        a = torch.tanh(self.l3(a)) * self.maxaction
        return a


class Q_Critic(nn.Module):
    def __init__(self, state_dim, action_dim, net_width):
        super(Q_Critic, self).__init__()

        self.l1 = nn.Linear(state_dim + action_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        q = F.relu(self.l1(sa))
        q = F.relu(self.l2(q))
        q = self.l3(q)
        return q


def evaluate_policy(env, agent, turns=3):
    total_scores = 0
    for j in range(turns):
        s = env.reset()
        done = False
        while not done:
            # Take deterministic actions at test time
            a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = (dw or tr)

            total_scores += r
            s = s_next
    return int(total_scores / turns)


# Just ignore this function~
def str2bool(v):
    '''transfer str to bool for argparse'''
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
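
As a quick sanity check of the two networks above (the dimensions here are assumed for illustration, not taken from the config):

    import torch
    from utils import Actor, Q_Critic

    actor = Actor(state_dim=3, action_dim=1, net_width=400, maxaction=1.0)
    critic = Q_Critic(state_dim=3, action_dim=1, net_width=400)
    s = torch.zeros(128, 3)   # a batch of 128 states
    a = actor(s)              # shape (128, 1), values in [-maxaction, maxaction]
    q = critic(s, a)          # shape (128, 1)
    print(a.shape, q.shape)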
20 PPO2/PPO.py
@@ -48,17 +48,17 @@ class ActorCritic(nn.Module):
        if has_continuous_action_space :
            self.actor = nn.Sequential(
                            nn.Linear(state_dim, 64),
                            # nn.Tanh(),
                            nn.Tanh(),
                            # nn.Sigmoid(),
                            nn.ReLU(),
                            # nn.ReLU(),
                            nn.Linear(64, 64),
                            # nn.Tanh(),
                            nn.Tanh(),
                            # nn.Sigmoid(),
                            nn.ReLU(),
                            # nn.ReLU(),
                            nn.Linear(64, action_dim),
                            # nn.Tanh()
                            # nn.Sigmoid()
                            nn.ReLU()
                            nn.Sigmoid()
                            # nn.ReLU()
                        )
        else:
            self.actor = nn.Sequential(
@@ -72,13 +72,13 @@ class ActorCritic(nn.Module):
        # critic
        self.critic = nn.Sequential(
                        nn.Linear(state_dim, 64),
                        # nn.Tanh(),
                        nn.Tanh(),
                        # nn.Sigmoid(),
                        nn.ReLU(),
                        # nn.ReLU(),
                        nn.Linear(64, 64),
                        # nn.Tanh(),
                        nn.Tanh(),
                        # nn.Sigmoid(),
                        nn.ReLU(),
                        # nn.ReLU(),
                        nn.Linear(64, 1)
                    )
6 env.py
@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
        ##############################
        # Hyperparameters that may need manual tuning
        ##############################
        self.CUT_NUM = 4  # half horizontal cuts, half vertical cuts
        self.BASE_LINE = 3500.0  # baseline time, computed via greedy or Monte Carlo
        self.CUT_NUM = 6  # half horizontal cuts, half vertical cuts
        self.BASE_LINE = 12133.250161412347  # baseline time, computed via greedy or Monte Carlo

        self.phase = 0  # phase control, 0: region partition phase, 1: maze initialization phase, 2: maze walking phase
        self.partition_step = 0  # step count of the partition phase, range 0~4
@@ -254,7 +254,7 @@ class PartitionMazeEnv(gym.Env):
                # print(self.car_traj)
                reward += self.BASE_LINE / T * 100
            elif done and self.step_count >= self.MAX_STEPS:
                reward += -10000
                reward += -1000

        return state, reward, done, False, {}
@@ -5,9 +5,9 @@ env = PartitionMazeEnv()
state = env.reset()
print(state)

action_series = [[0], [0.3], [0], [0], [0.1], [0.7]]
action_series = [[0], [0.5], [0], [0.2], [0.4], [0.7], [0.3], [0.8], [0.5], [0.1], [0.7], [0.7], [0.9], [0.9], [0.1], [0.9], [0.9], [0.1]]

for i in range(10):
for i in range(100):
    action = action_series[i]
    state, reward, done, info, _ = env.step(action)
    print(state, reward, done, info)
@@ -1,6 +1,6 @@
H : 20 # region height; grid points are 25 m apart (unit distance)
W : 25 # region width
num_cars : 1 # number of systems (car-nest-drone systems)
H : 50 # region height; grid points are 25 m apart (unit distance)
W : 50 # region width
num_cars : 3 # number of systems (car-nest-drone systems)

# time factors (unit: seconds, one photo per grid cell)
flight_time_factor : 3 # flight time per photo; the drone flies at 9.5 m/s and takes a photo every 3 s