Add DDPG code

weixin_46229132 2025-03-14 15:27:05 +08:00
parent 4fdb8aa152
commit ab51727253
7 changed files with 308 additions and 18 deletions

DDPG/DDPG.py Normal file

@@ -0,0 +1,105 @@
from utils import Actor, Q_Critic
import torch.nn.functional as F
import numpy as np
import torch
import copy


class DDPG_agent():
    def __init__(self, **kwargs):
        # Init hyperparameters for agent, just like "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
        self.__dict__.update(kwargs)
        self.tau = 0.005

        self.actor = Actor(self.state_dim, self.action_dim, self.net_width, self.max_action).to(self.dvc)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.a_lr)
        self.actor_target = copy.deepcopy(self.actor)

        self.q_critic = Q_Critic(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
        self.q_critic_optimizer = torch.optim.Adam(self.q_critic.parameters(), lr=self.c_lr)
        self.q_critic_target = copy.deepcopy(self.q_critic)

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, max_size=int(5e5), dvc=self.dvc)

    def select_action(self, state, deterministic):
        with torch.no_grad():
            state = torch.FloatTensor(state[np.newaxis, :]).to(self.dvc)  # from [x,x,...,x] to [[x,x,...,x]]
            a = self.actor(state).cpu().numpy()[0]  # from [[x,x,...,x]] to [x,x,...,x]
            if deterministic:
                return a
            else:
                noise = np.random.normal(0, self.max_action * self.noise, size=self.action_dim)
                return (a + noise).clip(-self.max_action, self.max_action)

    def train(self):
        # Compute the target Q
        with torch.no_grad():
            s, a, r, s_next, dw = self.replay_buffer.sample(self.batch_size)
            target_a_next = self.actor_target(s_next)
            target_Q = self.q_critic_target(s_next, target_a_next)
            target_Q = r + (~dw) * self.gamma * target_Q  # dw: die or win

        # Get current Q estimates
        current_Q = self.q_critic(s, a)

        # Compute critic loss
        q_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the q_critic
        self.q_critic_optimizer.zero_grad()
        q_loss.backward()
        self.q_critic_optimizer.step()

        # Update the Actor
        a_loss = -self.q_critic(s, self.actor(s)).mean()
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models (soft update with rate tau)
        with torch.no_grad():
            for param, target_param in zip(self.q_critic.parameters(), self.q_critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, EnvName, timestep):
        torch.save(self.actor.state_dict(), "./model/{}_actor{}.pth".format(EnvName, timestep))
        torch.save(self.q_critic.state_dict(), "./model/{}_q_critic{}.pth".format(EnvName, timestep))

    def load(self, EnvName, timestep):
        self.actor.load_state_dict(torch.load("./model/{}_actor{}.pth".format(EnvName, timestep), map_location=self.dvc))
        self.q_critic.load_state_dict(torch.load("./model/{}_q_critic{}.pth".format(EnvName, timestep), map_location=self.dvc))


class ReplayBuffer():
    def __init__(self, state_dim, action_dim, max_size, dvc):
        self.max_size = max_size
        self.dvc = dvc
        self.ptr = 0
        self.size = 0

        self.s = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.a = torch.zeros((max_size, action_dim), dtype=torch.float, device=self.dvc)
        self.r = torch.zeros((max_size, 1), dtype=torch.float, device=self.dvc)
        self.s_next = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.dw = torch.zeros((max_size, 1), dtype=torch.bool, device=self.dvc)

    def add(self, s, a, r, s_next, dw):
        # Store a single transition per call
        self.s[self.ptr] = torch.from_numpy(s).to(self.dvc)
        self.a[self.ptr] = torch.from_numpy(a).to(self.dvc)  # Note that a is numpy.array
        self.r[self.ptr] = r
        self.s_next[self.ptr] = torch.from_numpy(s_next).to(self.dvc)
        self.dw[self.ptr] = dw

        self.ptr = (self.ptr + 1) % self.max_size  # once the buffer is full, wrap around and overwrite from the start
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = torch.randint(0, self.size, device=self.dvc, size=(batch_size,))
        return self.s[ind], self.a[ind], self.r[ind], self.s_next[ind], self.dw[ind]
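For reference only (not part of this commit), a minimal sketch of exercising the ReplayBuffer above in isolation; the dimensions and the dummy transition are made up for illustration, and it assumes the snippet is run from the DDPG/ directory so that DDPG.py is importable:

import numpy as np
import torch
from DDPG import ReplayBuffer

buf = ReplayBuffer(state_dim=4, action_dim=2, max_size=int(1e4), dvc=torch.device('cpu'))
s = np.zeros(4, dtype=np.float32)
a = np.zeros(2, dtype=np.float32)
buf.add(s, a, r=1.0, s_next=s, dw=False)           # one transition; advances the ring pointer
s_b, a_b, r_b, s_next_b, dw_b = buf.sample(1)      # batched tensors of shape (1, dim)
print(s_b.shape, a_b.shape, r_b.shape, dw_b.dtype) # torch.Size([1, 4]) torch.Size([1, 2]) torch.Size([1, 1]) torch.bool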

DDPG/main.py Normal file

@@ -0,0 +1,121 @@
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent
import gymnasium as gym
import os, shutil
import argparse
import torch
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=100, help='which model to load')

parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--Max_train_steps', type=int, default=5e6, help='Max training steps')
parser.add_argument('--save_interval', type=int, default=1e5, help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=2e3, help='Model evaluating interval, in steps.')

parser.add_argument('--gamma', type=float, default=0.99, help='Discount factor')
parser.add_argument('--net_width', type=int, default=400, help='Hidden net width, s_dim-400-300-a_dim')
parser.add_argument('--a_lr', type=float, default=1e-3, help='Learning rate of actor')
parser.add_argument('--c_lr', type=float, default=1e-3, help='Learning rate of critic')
parser.add_argument('--batch_size', type=int, default=128, help='batch size of training')
parser.add_argument('--random_steps', type=int, default=5e4, help='random steps before training')
parser.add_argument('--noise', type=float, default=0.1, help='exploration noise')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)


def main():
    EnvName = ['Pendulum-v1', 'LunarLanderContinuous-v2', 'Humanoid-v4', 'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
    BrifEnvName = ['PV1', 'LLdV2', 'Humanv4', 'HCv4', 'BWv3', 'BWHv3']

    # Build Env
    # env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    env = PartitionMazeEnv()
    # eval_env = gym.make(EnvName[opt.EnvIdex])
    eval_env = PartitionMazeEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.shape[0]
    opt.max_action = float(env.action_space.high[0])  # remark: action space is [-max_action, max_action]
    print(f'Env:{EnvName[opt.EnvIdex]}  state_dim:{opt.state_dim}  action_dim:{opt.action_dim}')
    print(f'max_a:{opt.max_action}  min_a:{env.action_space.low[0]}')

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    # Build SummaryWriter to record training curves
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath): shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)

    # Build DRL model
    if not os.path.exists('model'): os.mkdir('model')
    agent = DDPG_agent(**vars(opt))  # vars(): convert the argparse Namespace into a dict of hyperparameters
    if opt.Loadmodel: agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        while True:
            score = evaluate_policy(env, agent, turns=1)
            print('EnvName:', BrifEnvName[opt.EnvIdex], 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            s = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
            env_seed += 1
            done = False

            '''Interact & train'''
            while not done:
                if total_steps < opt.random_steps: a = env.action_space.sample()
                else: a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a)  # dw: dead & win; tr: truncated
                done = (dw or tr)

                agent.replay_buffer.add(s, a, r, s_next, dw)
                s = s_next
                total_steps += 1

                '''train'''
                if total_steps >= opt.random_steps:
                    agent.train()

                '''record & log'''
                if total_steps % opt.eval_interval == 0:
                    ep_r = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write: writer.add_scalar('ep_r', ep_r, global_step=total_steps)
                    print(f'EnvName:{BrifEnvName[opt.EnvIdex]}, Steps: {int(total_steps/1000)}k, Episode Reward:{ep_r}')

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))

        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()
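Usage note (an illustration, not part of the commit): with the flags defined above, a CPU training run would typically be started as `python DDPG/main.py --dvc cpu --write True`, and a previously saved checkpoint can be reloaded with `--Loadmodel True --ModelIdex 100`; checkpoints land in the `model/` directory created by the script.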

DDPG/utils.py Normal file

@@ -0,0 +1,64 @@
import torch.nn.functional as F
import torch.nn as nn
import argparse
import torch


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, net_width, maxaction):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.maxaction = maxaction

    def forward(self, state):
        a = torch.relu(self.l1(state))
        a = torch.relu(self.l2(a))
        a = torch.tanh(self.l3(a)) * self.maxaction
        return a


class Q_Critic(nn.Module):
    def __init__(self, state_dim, action_dim, net_width):
        super(Q_Critic, self).__init__()

        self.l1 = nn.Linear(state_dim + action_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        q = F.relu(self.l1(sa))
        q = F.relu(self.l2(q))
        q = self.l3(q)
        return q


def evaluate_policy(env, agent, turns=3):
    total_scores = 0
    for j in range(turns):
        s = env.reset()
        done = False
        while not done:
            # Take deterministic actions at test time
            a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = (dw or tr)

            total_scores += r
            s = s_next
    return int(total_scores / turns)


# Just ignore this function~
def str2bool(v):
    '''transfer str to bool for argparse'''
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
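As a quick, illustrative sanity check of the two networks above (not part of the commit; the dimensions below are arbitrary):

import torch
from utils import Actor, Q_Critic

actor = Actor(state_dim=3, action_dim=1, net_width=400, maxaction=2.0)
critic = Q_Critic(state_dim=3, action_dim=1, net_width=400)
s = torch.zeros(5, 3)        # batch of 5 states
a = actor(s)                 # (5, 1), bounded to [-2.0, 2.0] by tanh * maxaction
q = critic(s, a)             # (5, 1) state-action values
print(a.shape, q.shape, bool(a.abs().max() <= 2.0))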


@@ -48,17 +48,17 @@ class ActorCritic(nn.Module):
         if has_continuous_action_space :
             self.actor = nn.Sequential(
                 nn.Linear(state_dim, 64),
-                # nn.Tanh(),
+                nn.Tanh(),
                 # nn.Sigmoid(),
-                nn.ReLU(),
+                # nn.ReLU(),
                 nn.Linear(64, 64),
-                # nn.Tanh(),
+                nn.Tanh(),
                 # nn.Sigmoid(),
-                nn.ReLU(),
+                # nn.ReLU(),
                 nn.Linear(64, action_dim),
                 # nn.Tanh()
-                # nn.Sigmoid()
-                nn.ReLU()
+                nn.Sigmoid()
+                # nn.ReLU()
             )
         else:
             self.actor = nn.Sequential(
@@ -72,13 +72,13 @@ class ActorCritic(nn.Module):
         # critic
         self.critic = nn.Sequential(
             nn.Linear(state_dim, 64),
-            # nn.Tanh(),
+            nn.Tanh(),
             # nn.Sigmoid(),
-            nn.ReLU(),
+            # nn.ReLU(),
             nn.Linear(64, 64),
-            # nn.Tanh(),
+            nn.Tanh(),
             # nn.Sigmoid(),
-            nn.ReLU(),
+            # nn.ReLU(),
             nn.Linear(64, 1)
         )

env.py

@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         # Hyperparameters that may need to be tuned manually
         ##############################
-        self.CUT_NUM = 4   # half of the cuts horizontal, half vertical
-        self.BASE_LINE = 3500.0  # baseline time, computed via greedy or Monte Carlo
+        self.CUT_NUM = 6   # half of the cuts horizontal, half vertical
+        self.BASE_LINE = 12133.250161412347  # baseline time, computed via greedy or Monte Carlo

         self.phase = 0  # phase control: 0 = region partition, 1 = maze initialization, 2 = maze traversal
         self.partition_step = 0  # step counter within the partition phase, range 0-4
@@ -254,7 +254,7 @@
             # print(self.car_traj)
             reward += self.BASE_LINE / T * 100
         elif done and self.step_count >= self.MAX_STEPS:
-            reward += -10000
+            reward += -1000

         return state, reward, done, False, {}


@@ -5,9 +5,9 @@ env = PartitionMazeEnv()
 state = env.reset()
 print(state)

-action_series = [[0], [0.3], [0], [0], [0.1], [0.7]]
+action_series = [[0], [0.5], [0], [0.2], [0.4], [0.7], [0.3], [0.8], [0.5], [0.1], [0.7], [0.7], [0.9], [0.9], [0.1], [0.9], [0.9], [0.1]]

-for i in range(10):
+for i in range(100):
     action = action_series[i]
     state, reward, done, info, _ = env.step(action)
     print(state, reward, done, info)


@@ -1,6 +1,6 @@
-H : 20 # region height; grid points are 25 m apart (one unit of distance)
-W : 25 # region width
-num_cars : 1 # number of systems (car-nest-drone systems)
+H : 50 # region height; grid points are 25 m apart (one unit of distance)
+W : 50 # region width
+num_cars : 3 # number of systems (car-nest-drone systems)

 # time coefficients (unit: seconds, one photo per grid cell)
 flight_time_factor : 3 # flight time per photo; drone speed is 9.5 m/s and the photo interval is 3 s
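The flight_time_factor comment can be cross-checked with a small calculation; treating the factor as the slower of the two constraints is my own reading of the comment, not something stated in the config:

# values taken from the config comments above
hop_time = 25 / 9.5                                   # ~2.63 s to fly between adjacent grid points at 9.5 m/s
photo_interval = 3.0                                  # 3 s between photos
flight_time_factor = max(hop_time, photo_interval)    # 3 s per photo, matching the configured value (assumed rationale)
print(round(hop_time, 2), flight_time_factor)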