添加ddpg代码

2025-03-14 15:27:05 +08:00 · 2025-03-14 15:27:05 +08:00 · ab51727253
commit ab51727253
parent 4fdb8aa152
7 changed files with 308 additions and 18 deletions
--- a/DDPG/DDPG.py
+++ b/DDPG/DDPG.py
@ -0,0 +1,105 @@
 from utils import Actor, Q_Critic
 import torch.nn.functional as F
 import numpy as np
 import torch
 import copy
 class DDPG_agent():
 	"""DDPG agent: a deterministic actor, a Q critic, their target copies and a replay buffer.

 	Every keyword argument becomes an instance attribute. The methods below read
 	state_dim, action_dim, net_width, max_action, dvc, a_lr, c_lr, gamma,
 	batch_size and noise; tau is optional and defaults to 0.005.
 	"""
 	def __init__(self, **kwargs):
 		# Init hyperparameters for agent, just like "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
 		self.__dict__.update(kwargs)
 		# Soft-update rate of the target networks; keep a caller-supplied value if there is one.
 		self.__dict__.setdefault('tau', 0.005)

 		self.actor = Actor(self.state_dim, self.action_dim, self.net_width, self.max_action).to(self.dvc)
 		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.a_lr)
 		self.actor_target = copy.deepcopy(self.actor)

 		self.q_critic = Q_Critic(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
 		self.q_critic_optimizer = torch.optim.Adam(self.q_critic.parameters(), lr=self.c_lr)
 		self.q_critic_target = copy.deepcopy(self.q_critic)

 		self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, max_size=int(5e5), dvc=self.dvc)

 	def select_action(self, state, deterministic):
 		"""Return the actor's action for a single state as a NumPy array.

 		state must support ``state[np.newaxis, :]`` (e.g. a 1-D ndarray). When
 		deterministic is false, Gaussian noise with standard deviation
 		``max_action * noise`` is added and the result is clipped to
 		[-max_action, max_action].
 		"""
 		with torch.no_grad():
 			state = torch.FloatTensor(state[np.newaxis, :]).to(self.dvc)  # from [x,x,...,x] to [[x,x,...,x]]
 			a = self.actor(state).cpu().numpy()[0]  # from [[x,x,...,x]] to [x,x,...,x]
 			if deterministic:
 				return a
 			noise = np.random.normal(0, self.max_action * self.noise, size=self.action_dim)
 			return (a + noise).clip(-self.max_action, self.max_action)

 	def train(self):
 		"""Run one critic step, one actor step and a soft update of both target networks."""
 		# Compute the target Q (no gradients flow through the target networks)
 		with torch.no_grad():
 			s, a, r, s_next, dw = self.replay_buffer.sample(self.batch_size)
 			target_a_next = self.actor_target(s_next)
 			target_Q = self.q_critic_target(s_next, target_a_next)
 			target_Q = r + (~dw) * self.gamma * target_Q  # dw: die or win -> no bootstrapping

 		# Get current Q estimates
 		current_Q = self.q_critic(s, a)

 		# Compute critic loss
 		q_loss = F.mse_loss(current_Q, target_Q)

 		# Optimize the q_critic
 		self.q_critic_optimizer.zero_grad()
 		q_loss.backward()
 		self.q_critic_optimizer.step()

 		# Update the Actor: maximise Q(s, actor(s))
 		a_loss = -self.q_critic(s, self.actor(s)).mean()
 		self.actor_optimizer.zero_grad()
 		a_loss.backward()
 		self.actor_optimizer.step()

 		# Update the frozen target models
 		with torch.no_grad():
 			for param, target_param in zip(self.q_critic.parameters(), self.q_critic_target.parameters()):
 				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

 			for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
 				target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

 	def save(self,EnvName, timestep):
 		"""Write the actor and critic weights under ./model/ (the directory must already exist)."""
 		torch.save(self.actor.state_dict(), "./model/{}_actor{}.pth".format(EnvName,timestep))
 		torch.save(self.q_critic.state_dict(), "./model/{}_q_critic{}.pth".format(EnvName,timestep))

 	def load(self,EnvName, timestep):
 		"""Load the actor and critic weights from ./model/ and resync the target networks."""
 		self.actor.load_state_dict(torch.load("./model/{}_actor{}.pth".format(EnvName, timestep), map_location=self.dvc))
 		self.q_critic.load_state_dict(torch.load("./model/{}_q_critic{}.pth".format(EnvName, timestep), map_location=self.dvc))
 		# The targets are not saved; without this they would keep their random
 		# initial weights and resumed training would bootstrap from them.
 		self.actor_target.load_state_dict(self.actor.state_dict())
 		self.q_critic_target.load_state_dict(self.q_critic.state_dict())
 class ReplayBuffer():
 	"""Fixed-capacity ring buffer of (s, a, r, s_next, dw) transitions stored as tensors on dvc."""
 	def __init__(self, state_dim, action_dim, max_size, dvc):
 		"""Pre-allocate storage for max_size transitions on device dvc."""
 		self.max_size = max_size
 		self.dvc = dvc
 		self.ptr = 0   # index of the next slot to write
 		self.size = 0  # number of valid transitions currently stored

 		self.s = torch.zeros((max_size, state_dim) ,dtype=torch.float,device=self.dvc)
 		self.a = torch.zeros((max_size, action_dim) ,dtype=torch.float,device=self.dvc)
 		self.r = torch.zeros((max_size, 1) ,dtype=torch.float,device=self.dvc)
 		self.s_next = torch.zeros((max_size, state_dim) ,dtype=torch.float,device=self.dvc)
 		self.dw = torch.zeros((max_size, 1) ,dtype=torch.bool,device=self.dvc)

 	def add(self, s, a, r, s_next, dw):
 		"""Store one transition, overwriting the oldest entry once the buffer is full.

 		s, a and s_next may be NumPy arrays, tensors or plain sequences; r and dw
 		are scalars.
 		"""
 		# Only a single time step is stored per call.
 		# torch.as_tensor accepts ndarrays, tensors and lists alike (from_numpy only takes ndarrays).
 		self.s[self.ptr] = torch.as_tensor(s, dtype=torch.float, device=self.dvc)
 		self.a[self.ptr] = torch.as_tensor(a, dtype=torch.float, device=self.dvc)
 		self.r[self.ptr] = r
 		self.s_next[self.ptr] = torch.as_tensor(s_next, dtype=torch.float, device=self.dvc)
 		self.dw[self.ptr] = dw

 		self.ptr = (self.ptr + 1) % self.max_size  # wrap around and start overwriting when full
 		self.size = min(self.size + 1, self.max_size)

 	def sample(self, batch_size):
 		"""Return batch_size transitions drawn uniformly with replacement.

 		Raises:
 			RuntimeError: if no transition has been stored yet.
 		"""
 		if self.size == 0:
 			raise RuntimeError("Cannot sample from an empty ReplayBuffer.")
 		ind = torch.randint(0, self.size, device=self.dvc, size=(batch_size,))
 		return self.s[ind], self.a[ind], self.r[ind], self.s_next[ind], self.dw[ind]
--- a/DDPG/main.py
+++ b/DDPG/main.py
@ -0,0 +1,121 @@
 from utils import str2bool,evaluate_policy
 from datetime import datetime
 from DDPG import DDPG_agent
 import gymnasium as gym
 import os, shutil
 import argparse
 import torch
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env import PartitionMazeEnv
 # Hyperparameter settings.
 # argparse applies `type` only to command-line strings and string defaults, so
 # the integer options below use real int defaults instead of float literals.
 parser = argparse.ArgumentParser()
 parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
 parser.add_argument('--EnvIdex', type=int, default=0, help='PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
 parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
 parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
 parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
 parser.add_argument('--ModelIdex', type=int, default=100, help='which model to load')
 parser.add_argument('--seed', type=int, default=0, help='random seed')
 parser.add_argument('--Max_train_steps', type=int, default=int(5e6), help='Max training steps')
 parser.add_argument('--save_interval', type=int, default=int(1e5), help='Model saving interval, in steps.')
 parser.add_argument('--eval_interval', type=int, default=int(2e3), help='Model evaluating interval, in steps.')
 parser.add_argument('--gamma', type=float, default=0.99, help='Discounted Factor')
 parser.add_argument('--net_width', type=int, default=400, help='Hidden net width, s_dim-400-300-a_dim')
 parser.add_argument('--a_lr', type=float, default=1e-3, help='Learning rate of actor')
 parser.add_argument('--c_lr', type=float, default=1e-3, help='Learning rate of critic')
 parser.add_argument('--batch_size', type=int, default=128, help='batch_size of training')
 parser.add_argument('--random_steps', type=int, default=int(5e4), help='random steps before trianing')
 parser.add_argument('--noise', type=float, default=0.1, help='exploring noise')
 opt = parser.parse_args()
 opt.dvc = torch.device(opt.dvc) # from str to torch.device
 print(opt)
 def main():
    """Train a DDPG agent on PartitionMazeEnv, or loop over evaluation episodes when --render is set.

    Reads every setting from the module-level ``opt`` namespace and also stores
    state_dim, action_dim and max_action on it before building the agent.
    """
    # Local import: NumPy is only needed here, to seed the RNG behind the agent's exploration noise.
    import numpy as np

    EnvName = ['Pendulum-v1','LunarLanderContinuous-v2','Humanoid-v4','HalfCheetah-v4','BipedalWalker-v3','BipedalWalkerHardcore-v3']
    BrifEnvName = ['PV1', 'LLdV2', 'Humanv4', 'HCv4','BWv3', 'BWHv3']

    # Build Env
    # NOTE(review): the environment is always PartitionMazeEnv; EnvIdex only selects the names used for logs and files.
    env = PartitionMazeEnv()
    eval_env = PartitionMazeEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.shape[0]
    opt.max_action = float(env.action_space.high[0])   # remark: action space is [-max, max]
    print(f'Env:{EnvName[opt.EnvIdex]}  state_dim:{opt.state_dim}  action_dim:{opt.action_dim}')
    print(f'max_a:{opt.max_action}  min_a:{env.action_space.low[0]}')

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    np.random.seed(opt.seed)          # exploration noise in select_action is drawn from np.random
    env.action_space.seed(opt.seed)   # warm-up actions come from action_space.sample()
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    # Build SummaryWriter to record training curves
    writer = None
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath): shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)

    # Build DRL model
    os.makedirs('model', exist_ok=True)  # no check-then-create race
    agent = DDPG_agent(**vars(opt)) # var: transfer argparse to dictionary
    if opt.Loadmodel: agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        # Evaluation-only mode: runs until interrupted.
        while True:
            score = evaluate_policy(env, agent, turns=1)
            print('EnvName:', BrifEnvName[opt.EnvIdex], 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            # NOTE(review): assumes env.reset returns only the observation (not an (obs, info) pair) -- confirm against env.py.
            s = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
            env_seed += 1
            done = False

            # Interact & train
            while not done:
                if total_steps < opt.random_steps: a = env.action_space.sample()
                else: a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a) # dw: dead&win; tr: truncated
                done = (dw or tr)

                # Only dw (not truncation) is stored, so truncated episodes still bootstrap.
                agent.replay_buffer.add(s, a, r, s_next, dw)
                s = s_next
                total_steps += 1

                # train
                if total_steps >= opt.random_steps:
                    agent.train()

                # record & log
                if total_steps % opt.eval_interval == 0:
                    ep_r = evaluate_policy(eval_env, agent, turns=3)
                    if writer is not None: writer.add_scalar('ep_r', ep_r, global_step=total_steps)
                    print(f'EnvName:{BrifEnvName[opt.EnvIdex]}, Steps: {int(total_steps/1000)}k, Episode Reward:{ep_r}')

                # save model
                if total_steps % opt.save_interval == 0:
                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))
        env.close()
        eval_env.close()
        if writer is not None:
            writer.close()  # flush pending TensorBoard events
 # Call main() only when this file is executed as a script.
 if __name__ == '__main__':
    main()
--- a/DDPG/utils.py
+++ b/DDPG/utils.py
@ -0,0 +1,64 @@
 import torch.nn.functional as F
 import torch.nn as nn
 import argparse
 import torch
 class Actor(nn.Module):
    """Deterministic policy network: state -> action bounded to [-maxaction, maxaction].

    Layer sizes are state_dim -> net_width -> 300 -> action_dim.
    """
    def __init__(self, state_dim, action_dim, net_width, maxaction):
        super().__init__()
        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, action_dim)
        # Scale applied to the tanh output.
        self.maxaction = maxaction

    def forward(self, state):
        """Map a batch of states to a batch of scaled actions."""
        hidden = self.l1(state).relu()
        hidden = self.l2(hidden).relu()
        squashed = self.l3(hidden).tanh()
        return squashed * self.maxaction
 class Q_Critic(nn.Module):
    """State-action value network: (state, action) -> one Q value per sample.

    Layer sizes are (state_dim + action_dim) -> net_width -> 300 -> 1.
    """
    def __init__(self, state_dim, action_dim, net_width):
        super().__init__()
        self.l1 = nn.Linear(state_dim + action_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, state, action):
        """Concatenate state and action along dim 1 and return the Q estimate."""
        joint = torch.cat((state, action), dim=1)
        hidden = self.l1(joint).relu()
        hidden = self.l2(hidden).relu()
        return self.l3(hidden)
 def evaluate_policy(env, agent, turns = 3):
    """Run `turns` episodes with deterministic actions and return the mean episode reward.

    The mean is truncated to an int. An episode ends when env.step reports
    either its third or its fourth return value as true.
    """
    reward_sum = 0
    for _ in range(turns):
        obs = env.reset()
        while True:
            # Take deterministic actions at test time
            action = agent.select_action(obs, deterministic=True)
            obs, reward, terminated, truncated, _info = env.step(action)
            reward_sum += reward
            if terminated or truncated:
                break
    return int(reward_sum / turns)
 #Just ignore this function~
 def str2bool(v):
    """Convert an argparse string to bool; bool inputs are returned unchanged.

    Matching is case-insensitive. Raises argparse.ArgumentTypeError for any
    value that is not a recognised true/false token.
    """
    if isinstance(v, bool):
        return v
    token = v.lower()
    if token in {'yes', 'true', 't', 'y', '1'}:
        return True
    if token in {'no', 'false', 'f', 'n', '0'}:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
--- a/PPO2/PPO.py
+++ b/PPO2/PPO.py
@ -48,17 +48,17 @@ class ActorCritic(nn.Module):
        if has_continuous_action_space :
            self.actor = nn.Sequential(
                            nn.Linear(state_dim, 64),
-                            # nn.Tanh(),
+                            nn.Tanh(),
                            # nn.Sigmoid(),
-                            nn.ReLU(),
+                            # nn.ReLU(),
                            nn.Linear(64, 64),
-                            # nn.Tanh(),
+                            nn.Tanh(),
                            # nn.Sigmoid(),
-                            nn.ReLU(),
+                            # nn.ReLU(),
                            nn.Linear(64, action_dim),
                            # nn.Tanh()
-                            # nn.Sigmoid()
+                            nn.Sigmoid()
-                            nn.ReLU()
+                            # nn.ReLU()
                        )
        else:
            self.actor = nn.Sequential(
@ -72,13 +72,13 @@ class ActorCritic(nn.Module):
        # critic
        self.critic = nn.Sequential(
                        nn.Linear(state_dim, 64),
-                        # nn.Tanh(),
+                        nn.Tanh(),
                        # nn.Sigmoid(),
-                        nn.ReLU(),
+                        # nn.ReLU(),
                        nn.Linear(64, 64),
-                        # nn.Tanh(),
+                        nn.Tanh(),
                        # nn.Sigmoid(),
-                        nn.ReLU(),
+                        # nn.ReLU(),
                        nn.Linear(64, 1)
                    )
--- a/env.py
+++ b/env.py
@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
        ##############################
        # 可能需要手动修改的超参数
        ##############################
-        self.CUT_NUM = 4    # 横切一半，竖切一半
+        self.CUT_NUM = 6    # 横切一半，竖切一半
-        self.BASE_LINE = 3500.0     # 基准时间，通过greedy或者蒙特卡洛计算出来
+        self.BASE_LINE = 12133.250161412347     # 基准时间，通过greedy或者蒙特卡洛计算出来
        self.phase = 0    # 阶段控制，0：区域划分阶段，1：迷宫初始化阶段，2：走迷宫阶段
        self.partition_step = 0      # 区域划分阶段步数，范围 0~4
@ -254,7 +254,7 @@ class PartitionMazeEnv(gym.Env):
                # print(self.car_traj)
                reward += self.BASE_LINE / T * 100
            elif done and self.step_count >= self.MAX_STEPS:
-                reward += -10000
+                reward += -1000
            return state, reward, done, False, {}
--- a/human_action.py
+++ b/human_action.py
@ -5,9 +5,9 @@ env = PartitionMazeEnv()
 state = env.reset()
 print(state)
-action_series = [[0], [0.3], [0], [0], [0.1], [0.7]]
+action_series = [[0], [0.5], [0], [0.2], [0.4], [0.7], [0.3], [0.8], [0.5], [0.1], [0.7], [0.7], [0.9], [0.9], [0.1], [0.9], [0.9], [0.1]]
-for i in range(10):
+for i in range(100):
    action = action_series[i]
    state, reward, done, info, _ = env.step(action)
    print(state, reward, done, info)
--- a/params.yml
+++ b/params.yml
@ -1,6 +1,6 @@
-H : 20         # 区域高度，网格点之间的距离为25m（单位距离）
+H : 50         # 区域高度，网格点之间的距离为25m（单位距离）
-W : 25         # 区域宽度
+W : 50         # 区域宽度
-num_cars : 1           # 系统数量（车-巢-机系统个数）
+num_cars : 3           # 系统数量（车-巢-机系统个数）
 # 时间系数（单位：秒，每个网格一张照片）
 flight_time_factor : 3     # 每张照片对应的飞行时间，无人机飞行速度为9.5m/s，拍摄照片的时间间隔为3s