From ab51727253b878c8033e9af2816ddd8ac95edbe0 Mon Sep 17 00:00:00 2001
From: weixin_46229132
Date: Fri, 14 Mar 2025 15:27:05 +0800
Subject: [PATCH] Add DDPG code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 DDPG/DDPG.py    | 105 +++++++++++++++++++++++++++++++++++++++++
 DDPG/main.py    | 121 ++++++++++++++++++++++++++++++++++++++++++++++++
 DDPG/utils.py   |  64 +++++++++++++++++++++++++
 PPO2/PPO.py     |  20 ++++----
 env.py          |   6 +--
 human_action.py |   4 +-
 params.yml      |   6 +--
 7 files changed, 308 insertions(+), 18 deletions(-)
 create mode 100644 DDPG/DDPG.py
 create mode 100644 DDPG/main.py
 create mode 100644 DDPG/utils.py

diff --git a/DDPG/DDPG.py b/DDPG/DDPG.py
new file mode 100644
index 0000000..0a24aa1
--- /dev/null
+++ b/DDPG/DDPG.py
@@ -0,0 +1,105 @@
+from utils import Actor, Q_Critic
+import torch.nn.functional as F
+import numpy as np
+import torch
+import copy
+
+
+class DDPG_agent():
+    def __init__(self, **kwargs):
+        # Init hyperparameters for the agent, e.g. "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
+        self.__dict__.update(kwargs)
+        self.tau = 0.005
+
+        self.actor = Actor(self.state_dim, self.action_dim, self.net_width, self.max_action).to(self.dvc)
+        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.a_lr)
+        self.actor_target = copy.deepcopy(self.actor)
+
+        self.q_critic = Q_Critic(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
+        self.q_critic_optimizer = torch.optim.Adam(self.q_critic.parameters(), lr=self.c_lr)
+        self.q_critic_target = copy.deepcopy(self.q_critic)
+
+        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, max_size=int(5e5), dvc=self.dvc)
+
+    def select_action(self, state, deterministic):
+        with torch.no_grad():
+            state = torch.FloatTensor(state[np.newaxis, :]).to(self.dvc)  # from [x,x,...,x] to [[x,x,...,x]]
+            a = self.actor(state).cpu().numpy()[0]  # from [[x,x,...,x]] to [x,x,...,x]
+            if deterministic:
+                return a
+            else:
+                noise = np.random.normal(0, self.max_action * self.noise, size=self.action_dim)
+                return (a + noise).clip(-self.max_action, self.max_action)
+
+    def train(self):
+        # Compute the target Q
+        with torch.no_grad():
+            s, a, r, s_next, dw = self.replay_buffer.sample(self.batch_size)
+            target_a_next = self.actor_target(s_next)
+            target_Q = self.q_critic_target(s_next, target_a_next)
+            target_Q = r + (~dw) * self.gamma * target_Q  # dw: dead or win
+
+        # Get current Q estimates
+        current_Q = self.q_critic(s, a)
+
+        # Compute critic loss
+        q_loss = F.mse_loss(current_Q, target_Q)
+
+        # Optimize the q_critic
+        self.q_critic_optimizer.zero_grad()
+        q_loss.backward()
+        self.q_critic_optimizer.step()
+
+        # Update the Actor
+        a_loss = -self.q_critic(s, self.actor(s)).mean()
+        self.actor_optimizer.zero_grad()
+        a_loss.backward()
+        self.actor_optimizer.step()
+
+        # Update the frozen target models
+        with torch.no_grad():
+            for param, target_param in zip(self.q_critic.parameters(), self.q_critic_target.parameters()):
+                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
+
+            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
+                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
+
+    def save(self, EnvName, timestep):
+        torch.save(self.actor.state_dict(), "./model/{}_actor{}.pth".format(EnvName, timestep))
+        torch.save(self.q_critic.state_dict(), "./model/{}_q_critic{}.pth".format(EnvName, timestep))
+
+    def load(self, EnvName, timestep):
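+        # Restore previously saved weights; map_location keeps the tensors on the configured device (self.dvc)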
+        self.actor.load_state_dict(torch.load("./model/{}_actor{}.pth".format(EnvName, timestep), map_location=self.dvc))
+        self.q_critic.load_state_dict(torch.load("./model/{}_q_critic{}.pth".format(EnvName, timestep), map_location=self.dvc))
+
+
+class ReplayBuffer():
+    def __init__(self, state_dim, action_dim, max_size, dvc):
+        self.max_size = max_size
+        self.dvc = dvc
+        self.ptr = 0
+        self.size = 0
+
+        self.s = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
+        self.a = torch.zeros((max_size, action_dim), dtype=torch.float, device=self.dvc)
+        self.r = torch.zeros((max_size, 1), dtype=torch.float, device=self.dvc)
+        self.s_next = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
+        self.dw = torch.zeros((max_size, 1), dtype=torch.bool, device=self.dvc)
+
+    def add(self, s, a, r, s_next, dw):
+        # Only one transition (a single timestep) is stored per call
+        self.s[self.ptr] = torch.from_numpy(s).to(self.dvc)
+        self.a[self.ptr] = torch.from_numpy(a).to(self.dvc)  # Note that a is numpy.array
+        self.r[self.ptr] = r
+        self.s_next[self.ptr] = torch.from_numpy(s_next).to(self.dvc)
+        self.dw[self.ptr] = dw
+
+        self.ptr = (self.ptr + 1) % self.max_size  # once the buffer is full, overwrite from the beginning
+        self.size = min(self.size + 1, self.max_size)
+
+    def sample(self, batch_size):
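+        # Draw a uniform random minibatch from the filled part of the buffer; indices are created on self.dvc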
+        ind = torch.randint(0, self.size, device=self.dvc, size=(batch_size,))
+        return self.s[ind], self.a[ind], self.r[ind], self.s_next[ind], self.dw[ind]
+
+
diff --git a/DDPG/main.py b/DDPG/main.py
new file mode 100644
index 0000000..985b361
--- /dev/null
+++ b/DDPG/main.py
@@ -0,0 +1,121 @@
+from utils import str2bool, evaluate_policy
+from datetime import datetime
+from DDPG import DDPG_agent
+import gymnasium as gym
+import os, shutil
+import argparse
+import torch
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from env import PartitionMazeEnv
+
+
+'''Hyperparameter Setting'''
+parser = argparse.ArgumentParser()
+parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
+parser.add_argument('--EnvIdex', type=int, default=0, help='PV1, LLdV2, Humanv4, HCv4, BWv3, BWHv3')
+parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
+parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
+parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
+parser.add_argument('--ModelIdex', type=int, default=100, help='which model to load')
+
+parser.add_argument('--seed', type=int, default=0, help='random seed')
+parser.add_argument('--Max_train_steps', type=int, default=5e6, help='Max training steps')
+parser.add_argument('--save_interval', type=int, default=1e5, help='Model saving interval, in steps.')
+parser.add_argument('--eval_interval', type=int, default=2e3, help='Model evaluating interval, in steps.')
+
+parser.add_argument('--gamma', type=float, default=0.99, help='Discounted Factor')
+parser.add_argument('--net_width', type=int, default=400, help='Hidden net width, s_dim-400-300-a_dim')
+parser.add_argument('--a_lr', type=float, default=1e-3, help='Learning rate of actor')
+parser.add_argument('--c_lr', type=float, default=1e-3, help='Learning rate of critic')
+parser.add_argument('--batch_size', type=int, default=128, help='batch_size of training')
+parser.add_argument('--random_steps', type=int, default=5e4, help='random steps before training')
+parser.add_argument('--noise', type=float, default=0.1, help='exploring noise')
+opt = parser.parse_args()
+opt.dvc = torch.device(opt.dvc)  # from str to torch.device
+print(opt)
+
+
+def main():
+    EnvName = ['Pendulum-v1', 'LunarLanderContinuous-v2', 'Humanoid-v4', 'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
+    BrifEnvName = ['PV1', 'LLdV2', 'Humanv4', 'HCv4', 'BWv3', 'BWHv3']
+
+    # Build Env
+    # env = gym.make(EnvName[opt.EnvIdex], render_mode = "human" if opt.render else None)
+    env = PartitionMazeEnv()
+    # eval_env = gym.make(EnvName[opt.EnvIdex])
+    eval_env = PartitionMazeEnv()
+    opt.state_dim = env.observation_space.shape[0]
+    opt.action_dim = env.action_space.shape[0]
+    opt.max_action = float(env.action_space.high[0])  # remark: action space [-max, max]
+    print(f'Env:{EnvName[opt.EnvIdex]} state_dim:{opt.state_dim} action_dim:{opt.action_dim}')
+    print(f'max_a:{opt.max_action} min_a:{env.action_space.low[0]}')
+
+    # Seed Everything
+    env_seed = opt.seed
+    torch.manual_seed(opt.seed)
+    torch.cuda.manual_seed(opt.seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    print("Random Seed: {}".format(opt.seed))
+
+    # Build SummaryWriter to record training curves
+    if opt.write:
+        from torch.utils.tensorboard import SummaryWriter
+        timenow = str(datetime.now())[0:-10]
+        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
+        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
+        if os.path.exists(writepath): shutil.rmtree(writepath)
+        writer = SummaryWriter(log_dir=writepath)
+
+
+    # Build DRL model
+    if not os.path.exists('model'): os.mkdir('model')
+    agent = DDPG_agent(**vars(opt))  # vars: transfer the argparse Namespace to a dictionary
+    if opt.Loadmodel: agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)
+
+    if opt.render:
+        while True:
+            score = evaluate_policy(env, agent, turns=1)
+            print('EnvName:', BrifEnvName[opt.EnvIdex], 'score:', score)
+    else:
+        total_steps = 0
+        while total_steps < opt.Max_train_steps:
+            s = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
+            env_seed += 1
+            done = False
+
+            '''Interact & train'''
+            while not done:
+                if total_steps < opt.random_steps: a = env.action_space.sample()
+                else: a = agent.select_action(s, deterministic=False)
+                s_next, r, dw, tr, info = env.step(a)  # dw: dead&win; tr: truncated
+                done = (dw or tr)
+
+                agent.replay_buffer.add(s, a, r, s_next, dw)
+                s = s_next
+                total_steps += 1
+
+                '''train'''
+                if total_steps >= opt.random_steps:
+                    agent.train()
+
+                '''record & log'''
+                if total_steps % opt.eval_interval == 0:
+                    ep_r = evaluate_policy(eval_env, agent, turns=3)
+                    if opt.write: writer.add_scalar('ep_r', ep_r, global_step=total_steps)
+                    print(f'EnvName:{BrifEnvName[opt.EnvIdex]}, Steps: {int(total_steps/1000)}k, Episode Reward:{ep_r}')
+
+                '''save model'''
+                if total_steps % opt.save_interval == 0:
+                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))
+        env.close()
+        eval_env.close()
+
+
+if __name__ == '__main__':
+    main()
+
+
+
diff --git a/DDPG/utils.py b/DDPG/utils.py
new file mode 100644
index 0000000..7816b9e
--- /dev/null
+++ b/DDPG/utils.py
@@ -0,0 +1,64 @@
+import torch.nn.functional as F
+import torch.nn as nn
+import argparse
+import torch
+
+class Actor(nn.Module):
+    def __init__(self, state_dim, action_dim, net_width, maxaction):
+        super(Actor, self).__init__()
+
+        self.l1 = nn.Linear(state_dim, net_width)
+        self.l2 = nn.Linear(net_width, 300)
+        self.l3 = nn.Linear(300, action_dim)
+
+        self.maxaction = maxaction
+
+    def forward(self, state):
+        a = torch.relu(self.l1(state))
+        a = torch.relu(self.l2(a))
+        a = torch.tanh(self.l3(a)) * self.maxaction
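+        # tanh bounds the output to [-1, 1]; multiplying by maxaction rescales it to the environment's action range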
+        return a
+
+
+class Q_Critic(nn.Module):
+    def __init__(self, state_dim, action_dim, net_width):
+        super(Q_Critic, self).__init__()
+
+        self.l1 = nn.Linear(state_dim + action_dim, net_width)
+        self.l2 = nn.Linear(net_width, 300)
+        self.l3 = nn.Linear(300, 1)
+
+    def forward(self, state, action):
+        sa = torch.cat([state, action], 1)
+        q = F.relu(self.l1(sa))
+        q = F.relu(self.l2(q))
+        q = self.l3(q)
+        return q
+
+def evaluate_policy(env, agent, turns=3):
+    total_scores = 0
+    for j in range(turns):
+        s = env.reset()
+        done = False
+        while not done:
+            # Take deterministic actions at test time
+            a = agent.select_action(s, deterministic=True)
+            s_next, r, dw, tr, info = env.step(a)
+            done = (dw or tr)
+
+            total_scores += r
+            s = s_next
+    return int(total_scores / turns)
+
+
+#Just ignore this function~
+def str2bool(v):
+    '''transfer str to bool for argparse'''
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
\ No newline at end of file
diff --git a/PPO2/PPO.py b/PPO2/PPO.py
index e095f67..9db83c6 100644
--- a/PPO2/PPO.py
+++ b/PPO2/PPO.py
@@ -48,17 +48,17 @@ class ActorCritic(nn.Module):
         if has_continuous_action_space :
             self.actor = nn.Sequential(
                             nn.Linear(state_dim, 64),
-                            # nn.Tanh(),
+                            nn.Tanh(),
                             # nn.Sigmoid(),
-                            nn.ReLU(),
+                            # nn.ReLU(),
                             nn.Linear(64, 64),
-                            # nn.Tanh(),
+                            nn.Tanh(),
                             # nn.Sigmoid(),
-                            nn.ReLU(),
+                            # nn.ReLU(),
                             nn.Linear(64, action_dim),
                             # nn.Tanh()
-                            # nn.Sigmoid()
-                            nn.ReLU()
+                            nn.Sigmoid()
+                            # nn.ReLU()
                         )
         else:
             self.actor = nn.Sequential(
@@ -72,13 +72,13 @@ class ActorCritic(nn.Module):
         # critic
         self.critic = nn.Sequential(
                         nn.Linear(state_dim, 64),
-                        # nn.Tanh(),
+                        nn.Tanh(),
                         # nn.Sigmoid(),
-                        nn.ReLU(),
+                        # nn.ReLU(),
                         nn.Linear(64, 64),
-                        # nn.Tanh(),
+                        nn.Tanh(),
                         # nn.Sigmoid(),
-                        nn.ReLU(),
+                        # nn.ReLU(),
                         nn.Linear(64, 1)
                     )
 
diff --git a/env.py b/env.py
index cc28716..7691c86 100644
--- a/env.py
+++ b/env.py
@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         # Hyperparameters that may need manual tuning
         ##############################
-        self.CUT_NUM = 4   # half of the cuts horizontal, half vertical
-        self.BASE_LINE = 3500.0  # baseline time, obtained via greedy or Monte Carlo
+        self.CUT_NUM = 6   # half of the cuts horizontal, half vertical
+        self.BASE_LINE = 12133.250161412347  # baseline time, obtained via greedy or Monte Carlo
 
         self.phase = 0  # phase control: 0 = region partitioning, 1 = maze initialization, 2 = maze walking
         self.partition_step = 0  # step counter of the partitioning phase, range 0~4
@@ -254,7 +254,7 @@ class PartitionMazeEnv(gym.Env):
             # print(self.car_traj)
             reward += self.BASE_LINE / T * 100
         elif done and self.step_count >= self.MAX_STEPS:
-            reward += -10000
+            reward += -1000
 
         return state, reward, done, False, {}
 
diff --git a/human_action.py b/human_action.py
index 1de7b5d..dcd70c2 100644
--- a/human_action.py
+++ b/human_action.py
@@ -5,9 +5,9 @@ env = PartitionMazeEnv()
 state = env.reset()
 print(state)
 
-action_series = [[0], [0.3], [0], [0], [0.1], [0.7]]
+action_series = [[0], [0.5], [0], [0.2], [0.4], [0.7], [0.3], [0.8], [0.5], [0.1], [0.7], [0.7], [0.9], [0.9], [0.1], [0.9], [0.9], [0.1]]
 
-for i in range(10):
+for i in range(100):
     action = action_series[i]
     state, reward, done, info, _ = env.step(action)
     print(state, reward, done, info)
diff --git a/params.yml b/params.yml
index 8233493..382ea74 100644
--- a/params.yml
+++ b/params.yml
@@ -1,6 +1,6 @@
-H : 20 # region height; adjacent grid points are 25 m apart (one unit distance)
-W : 25 # region width
-num_cars : 1 # number of systems (car-nest-drone systems)
+H : 50 # region height; adjacent grid points are 25 m apart (one unit distance)
+W : 50 # region width
+num_cars : 3 # number of systems (car-nest-drone systems)
 
 # Time coefficients (unit: seconds; one photo per grid cell)
 flight_time_factor : 3 # flight time per photo; the drone flies at 9.5 m/s and takes a photo every 3 s