Add DDPG code
This commit is contained in:
parent
4fdb8aa152
commit
ab51727253
105 DDPG/DDPG.py Normal file
@@ -0,0 +1,105 @@
from utils import Actor, Q_Critic
import torch.nn.functional as F
import numpy as np
import torch
import copy


class DDPG_agent():
    def __init__(self, **kwargs):
        # Init hyperparameters for agent, just like "self.gamma = opt.gamma, self.lambd = opt.lambd, ..."
        self.__dict__.update(kwargs)
        self.tau = 0.005

        self.actor = Actor(self.state_dim, self.action_dim, self.net_width, self.max_action).to(self.dvc)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.a_lr)
        self.actor_target = copy.deepcopy(self.actor)

        self.q_critic = Q_Critic(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
        self.q_critic_optimizer = torch.optim.Adam(self.q_critic.parameters(), lr=self.c_lr)
        self.q_critic_target = copy.deepcopy(self.q_critic)

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, max_size=int(5e5), dvc=self.dvc)

    def select_action(self, state, deterministic):
        with torch.no_grad():
            state = torch.FloatTensor(state[np.newaxis, :]).to(self.dvc)  # from [x,x,...,x] to [[x,x,...,x]]
            a = self.actor(state).cpu().numpy()[0]  # from [[x,x,...,x]] to [x,x,...,x]
            if deterministic:
                return a
            else:
                noise = np.random.normal(0, self.max_action * self.noise, size=self.action_dim)
                return (a + noise).clip(-self.max_action, self.max_action)

    def train(self):
        # Compute the target Q
        with torch.no_grad():
            s, a, r, s_next, dw = self.replay_buffer.sample(self.batch_size)
            target_a_next = self.actor_target(s_next)
            target_Q = self.q_critic_target(s_next, target_a_next)
            target_Q = r + (~dw) * self.gamma * target_Q  # dw: die or win
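            # Note on the line above (the standard DDPG TD target, not new logic):
            #   target_Q = r + gamma * (1 - dw) * Q_target(s', actor_target(s'))
            # dw marks true terminal states ("die or win"), so (~dw) removes the bootstrap
            # term only on real termination, not on time-limit truncation.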

        # Get current Q estimates
        current_Q = self.q_critic(s, a)

        # Compute critic loss
        q_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the q_critic
        self.q_critic_optimizer.zero_grad()
        q_loss.backward()
        self.q_critic_optimizer.step()

        # Update the Actor
        a_loss = -self.q_critic(s, self.actor(s)).mean()
        self.actor_optimizer.zero_grad()
        a_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
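        # (Soft / Polyak update: theta_target <- tau * theta + (1 - tau) * theta_target,
        #  with tau = 0.005 from __init__, so the target nets track the online nets slowly.)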
        with torch.no_grad():
            for param, target_param in zip(self.q_critic.parameters(), self.q_critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, EnvName, timestep):
        torch.save(self.actor.state_dict(), "./model/{}_actor{}.pth".format(EnvName, timestep))
        torch.save(self.q_critic.state_dict(), "./model/{}_q_critic{}.pth".format(EnvName, timestep))

    def load(self, EnvName, timestep):
        self.actor.load_state_dict(torch.load("./model/{}_actor{}.pth".format(EnvName, timestep), map_location=self.dvc))
        self.q_critic.load_state_dict(torch.load("./model/{}_q_critic{}.pth".format(EnvName, timestep), map_location=self.dvc))


class ReplayBuffer():
    def __init__(self, state_dim, action_dim, max_size, dvc):
        self.max_size = max_size
        self.dvc = dvc
        self.ptr = 0
        self.size = 0

        self.s = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.a = torch.zeros((max_size, action_dim), dtype=torch.float, device=self.dvc)
        self.r = torch.zeros((max_size, 1), dtype=torch.float, device=self.dvc)
        self.s_next = torch.zeros((max_size, state_dim), dtype=torch.float, device=self.dvc)
        self.dw = torch.zeros((max_size, 1), dtype=torch.bool, device=self.dvc)

    def add(self, s, a, r, s_next, dw):
        # Only one transition is stored per call
        self.s[self.ptr] = torch.from_numpy(s).to(self.dvc)
        self.a[self.ptr] = torch.from_numpy(a).to(self.dvc)  # Note that a is numpy.array
        self.r[self.ptr] = r
        self.s_next[self.ptr] = torch.from_numpy(s_next).to(self.dvc)
        self.dw[self.ptr] = dw

        self.ptr = (self.ptr + 1) % self.max_size  # once the buffer is full, wrap around and overwrite from the start
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = torch.randint(0, self.size, device=self.dvc, size=(batch_size,))
        return self.s[ind], self.a[ind], self.r[ind], self.s_next[ind], self.dw[ind]
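
For orientation, a minimal sketch of how DDPG_agent is meant to be constructed and queried; the hyperparameter values below are illustrative (they mirror the defaults in main.py), and state_dim/action_dim/max_action would normally come from the environment:

    import numpy as np
    import torch
    from DDPG import DDPG_agent

    agent = DDPG_agent(
        dvc=torch.device('cpu'),
        state_dim=3, action_dim=1, max_action=1.0,  # assumed env dimensions
        net_width=400, a_lr=1e-3, c_lr=1e-3,
        gamma=0.99, noise=0.1, batch_size=128,
    )
    s = np.zeros(3, dtype=np.float32)
    a = agent.select_action(s, deterministic=False)  # noisy action for exploration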
121 DDPG/main.py Normal file
@@ -0,0 +1,121 @@
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent
import gymnasium as gym
import os, shutil
import argparse
import torch
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=100, help='which model to load')

parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--Max_train_steps', type=int, default=5e6, help='Max training steps')
parser.add_argument('--save_interval', type=int, default=1e5, help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=2e3, help='Model evaluating interval, in steps.')

parser.add_argument('--gamma', type=float, default=0.99, help='Discounted Factor')
parser.add_argument('--net_width', type=int, default=400, help='Hidden net width, s_dim-400-300-a_dim')
parser.add_argument('--a_lr', type=float, default=1e-3, help='Learning rate of actor')
parser.add_argument('--c_lr', type=float, default=1e-3, help='Learning rate of critic')
parser.add_argument('--batch_size', type=int, default=128, help='batch_size of training')
parser.add_argument('--random_steps', type=int, default=5e4, help='random steps before training')
parser.add_argument('--noise', type=float, default=0.1, help='exploring noise')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)


def main():
    EnvName = ['Pendulum-v1', 'LunarLanderContinuous-v2', 'Humanoid-v4', 'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
    BrifEnvName = ['PV1', 'LLdV2', 'Humanv4', 'HCv4', 'BWv3', 'BWHv3']

    # Build Env
    # env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    env = PartitionMazeEnv()
    # eval_env = gym.make(EnvName[opt.EnvIdex])
    eval_env = PartitionMazeEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.shape[0]
    opt.max_action = float(env.action_space.high[0])  # remark: action space [-max, max]
    print(f'Env:{EnvName[opt.EnvIdex]} state_dim:{opt.state_dim} action_dim:{opt.action_dim}')
    print(f'max_a:{opt.max_action} min_a:{env.action_space.low[0]}')

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    # Build SummaryWriter to record training curves
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath): shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)

    # Build DRL model
    if not os.path.exists('model'): os.mkdir('model')
    agent = DDPG_agent(**vars(opt))  # vars: transfer argparse Namespace to a dictionary
    if opt.Loadmodel: agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        while True:
            score = evaluate_policy(env, agent, turns=1)
            print('EnvName:', BrifEnvName[opt.EnvIdex], 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            s = env.reset(seed=env_seed)  # Do not use opt.seed directly, or it can overfit to opt.seed
            env_seed += 1
            done = False

            '''Interact & train'''
            while not done:
                if total_steps < opt.random_steps: a = env.action_space.sample()
                else: a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a)  # dw: dead & win; tr: truncated
                done = (dw or tr)

                agent.replay_buffer.add(s, a, r, s_next, dw)
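                # Note: only dw (true termination) is stored; tr (time-limit truncation)
                # ends the episode but should not zero the bootstrap term in the critic target.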
                s = s_next
                total_steps += 1

                '''train'''
                if total_steps >= opt.random_steps:
                    agent.train()

                '''record & log'''
                if total_steps % opt.eval_interval == 0:
                    ep_r = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write: writer.add_scalar('ep_r', ep_r, global_step=total_steps)
                    print(f'EnvName:{BrifEnvName[opt.EnvIdex]}, Steps: {int(total_steps/1000)}k, Episode Reward:{ep_r}')

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))
        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()
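
Assuming the layout above (the DDPG/ folder next to env.py in the repository root), training would typically be launched with something like `python DDPG/main.py --dvc cpu --write True`, and a saved checkpoint replayed with `--render True --Loadmodel True --ModelIdex <k>`, where <k> is the saved step count in thousands.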
64 DDPG/utils.py Normal file
@@ -0,0 +1,64 @@
import torch.nn.functional as F
import torch.nn as nn
import argparse
import torch


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, net_width, maxaction):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.maxaction = maxaction

    def forward(self, state):
        a = torch.relu(self.l1(state))
        a = torch.relu(self.l2(a))
        a = torch.tanh(self.l3(a)) * self.maxaction
        return a


class Q_Critic(nn.Module):
    def __init__(self, state_dim, action_dim, net_width):
        super(Q_Critic, self).__init__()

        self.l1 = nn.Linear(state_dim + action_dim, net_width)
        self.l2 = nn.Linear(net_width, 300)
        self.l3 = nn.Linear(300, 1)

    def forward(self, state, action):
        sa = torch.cat([state, action], 1)
        q = F.relu(self.l1(sa))
        q = F.relu(self.l2(q))
        q = self.l3(q)
        return q


def evaluate_policy(env, agent, turns=3):
    total_scores = 0
    for j in range(turns):
        s = env.reset()
        done = False
        while not done:
            # Take deterministic actions at test time
            a = agent.select_action(s, deterministic=True)
            s_next, r, dw, tr, info = env.step(a)
            done = (dw or tr)

            total_scores += r
            s = s_next
    return int(total_scores / turns)


# Just ignore this function~
def str2bool(v):
    '''transfer str to bool for argparse'''
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
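
As a quick sanity check of the two networks above (the dimensions here are assumed for illustration, not taken from the config):

    import torch
    from utils import Actor, Q_Critic

    actor = Actor(state_dim=3, action_dim=1, net_width=400, maxaction=1.0)
    critic = Q_Critic(state_dim=3, action_dim=1, net_width=400)
    s = torch.zeros(128, 3)   # a batch of 128 states
    a = actor(s)              # shape (128, 1), values in [-maxaction, maxaction]
    q = critic(s, a)          # shape (128, 1)
    print(a.shape, q.shape)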
20 PPO2/PPO.py
@@ -48,17 +48,17 @@ class ActorCritic(nn.Module):
        if has_continuous_action_space :
            self.actor = nn.Sequential(
                            nn.Linear(state_dim, 64),
                            # nn.Tanh(),
                            nn.Tanh(),
                            # nn.Sigmoid(),
                            nn.ReLU(),
                            # nn.ReLU(),
                            nn.Linear(64, 64),
                            # nn.Tanh(),
                            nn.Tanh(),
                            # nn.Sigmoid(),
                            nn.ReLU(),
                            # nn.ReLU(),
                            nn.Linear(64, action_dim),
                            # nn.Tanh()
                            # nn.Sigmoid()
                            nn.ReLU()
                            nn.Sigmoid()
                            # nn.ReLU()
                        )
        else:
            self.actor = nn.Sequential(
@@ -72,13 +72,13 @@ class ActorCritic(nn.Module):
        # critic
        self.critic = nn.Sequential(
                        nn.Linear(state_dim, 64),
                        # nn.Tanh(),
                        nn.Tanh(),
                        # nn.Sigmoid(),
                        nn.ReLU(),
                        # nn.ReLU(),
                        nn.Linear(64, 64),
                        # nn.Tanh(),
                        nn.Tanh(),
                        # nn.Sigmoid(),
                        nn.ReLU(),
                        # nn.ReLU(),
                        nn.Linear(64, 1)
                    )
6 env.py
@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
        ##############################
        # Hyperparameters that may need manual tuning
        ##############################
        self.CUT_NUM = 4  # half horizontal cuts, half vertical cuts
        self.BASE_LINE = 3500.0  # baseline time, computed via greedy or Monte Carlo
        self.CUT_NUM = 6  # half horizontal cuts, half vertical cuts
        self.BASE_LINE = 12133.250161412347  # baseline time, computed via greedy or Monte Carlo

        self.phase = 0  # phase control, 0: region partition phase, 1: maze initialization phase, 2: maze walking phase
        self.partition_step = 0  # step count of the partition phase, range 0~4
@@ -254,7 +254,7 @@ class PartitionMazeEnv(gym.Env):
                # print(self.car_traj)
                reward += self.BASE_LINE / T * 100
            elif done and self.step_count >= self.MAX_STEPS:
                reward += -10000
                reward += -1000

        return state, reward, done, False, {}
@@ -5,9 +5,9 @@ env = PartitionMazeEnv()
state = env.reset()
print(state)

action_series = [[0], [0.3], [0], [0], [0.1], [0.7]]
action_series = [[0], [0.5], [0], [0.2], [0.4], [0.7], [0.3], [0.8], [0.5], [0.1], [0.7], [0.7], [0.9], [0.9], [0.1], [0.9], [0.9], [0.1]]

for i in range(10):
for i in range(100):
    action = action_series[i]
    state, reward, done, info, _ = env.step(action)
    print(state, reward, done, info)
@@ -1,6 +1,6 @@
H : 20 # region height; grid points are 25 m apart (unit distance)
W : 25 # region width
num_cars : 1 # number of systems (car-nest-drone systems)
H : 50 # region height; grid points are 25 m apart (unit distance)
W : 50 # region width
num_cars : 3 # number of systems (car-nest-drone systems)

# time factors (unit: seconds, one photo per grid cell)
flight_time_factor : 3 # flight time per photo; the drone flies at 9.5 m/s and takes a photo every 3 s