diff --git a/DDPG_solver/utils.py b/DDPG_solver/utils.py
index e76fc48..bff61ec 100644
--- a/DDPG_solver/utils.py
+++ b/DDPG_solver/utils.py
@@ -51,7 +51,7 @@ def evaluate_policy(env, agent, turns = 3):
             total_scores += r
             s = s_next
         print('action series: ', np.round(action_series, 3))
-        print('state: {s_next}')
+        print('state: ', s)
     return int(total_scores/turns)
diff --git a/PPO_Continuous/PPO.py b/PPO_Continuous/PPO.py
new file mode 100644
index 0000000..5715854
--- /dev/null
+++ b/PPO_Continuous/PPO.py
@@ -0,0 +1,144 @@
+from utils import BetaActor, GaussianActor_musigma, GaussianActor_mu, Critic
+import numpy as np
+import copy
+import torch
+import math
+
+
+class PPO_agent(object):
+    def __init__(self, **kwargs):
+        # Init hyperparameters for the PPO agent, e.g. self.gamma = opt.gamma, self.lambd = opt.lambd, ...
+        self.__dict__.update(kwargs)
+
+        # Choose distribution for the actor
+        if self.Distribution == 'Beta':
+            self.actor = BetaActor(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
+        elif self.Distribution == 'GS_ms':
+            self.actor = GaussianActor_musigma(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
+        elif self.Distribution == 'GS_m':
+            self.actor = GaussianActor_mu(self.state_dim, self.action_dim, self.net_width).to(self.dvc)
+        else: raise ValueError("Distribution should be one of 'Beta', 'GS_ms', 'GS_m'")
+        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.a_lr)
+
+        # Build Critic
+        self.critic = Critic(self.state_dim, self.net_width).to(self.dvc)
+        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.c_lr)
+
+        # Build trajectory holders
+        self.s_hoder = np.zeros((self.T_horizon, self.state_dim), dtype=np.float32)
+        self.a_hoder = np.zeros((self.T_horizon, self.action_dim), dtype=np.float32)
+        self.r_hoder = np.zeros((self.T_horizon, 1), dtype=np.float32)
+        self.s_next_hoder = np.zeros((self.T_horizon, self.state_dim), dtype=np.float32)
+        self.logprob_a_hoder = np.zeros((self.T_horizon, self.action_dim), dtype=np.float32)
+        self.done_hoder = np.zeros((self.T_horizon, 1), dtype=np.bool_)
+        self.dw_hoder = np.zeros((self.T_horizon, 1), dtype=np.bool_)
+
+    def select_action(self, state, deterministic):
+        with torch.no_grad():
+            state = torch.FloatTensor(state.reshape(1, -1)).to(self.dvc)
+            if deterministic:
+                # only used when evaluating the policy; makes the performance more stable
+                a = self.actor.deterministic_act(state)
+                return a.cpu().numpy()[0], None  # action is in shape (action_dim,)
+            else:
+                # only used when interacting with the env
+                dist = self.actor.get_dist(state)
+                a = dist.sample()
+                a = torch.clamp(a, 0, 1)
+                logprob_a = dist.log_prob(a).cpu().numpy().flatten()
+                return a.cpu().numpy()[0], logprob_a  # both are in shape (action_dim,)
+
+
+    def train(self):
+        self.entropy_coef *= self.entropy_coef_decay
+
+        '''Prepare PyTorch data from Numpy data'''
+        s = torch.from_numpy(self.s_hoder).to(self.dvc)
+        a = torch.from_numpy(self.a_hoder).to(self.dvc)
+        r = torch.from_numpy(self.r_hoder).to(self.dvc)
+        s_next = torch.from_numpy(self.s_next_hoder).to(self.dvc)
+        logprob_a = torch.from_numpy(self.logprob_a_hoder).to(self.dvc)
+        done = torch.from_numpy(self.done_hoder).to(self.dvc)
+        dw = torch.from_numpy(self.dw_hoder).to(self.dvc)
+
+        '''Use TD+GAE+LongTrajectory to compute Advantage and TD target'''
+        with torch.no_grad():
+            vs = self.critic(s)
+            vs_ = self.critic(s_next)
+
+            '''dw for TD_target and Adv'''
+            deltas = r + self.gamma * vs_ * (~dw) - vs
+            deltas = deltas.cpu().flatten().numpy()
+            adv = [0]
+
+            '''done for GAE'''
+            for dlt, mask in zip(deltas[::-1], done.cpu().flatten().numpy()[::-1]):
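+                # GAE recursion: A_t = delta_t + gamma * lambda * A_{t+1}; (~mask) resets the accumulator where done is True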
+                advantage = dlt + self.gamma * self.lambd * adv[-1] * (~mask)
+                adv.append(advantage)
+            adv.reverse()
+            adv = copy.deepcopy(adv[0:-1])
+            adv = torch.tensor(adv).unsqueeze(1).float().to(self.dvc)
+            td_target = adv + vs
+            adv = (adv - adv.mean()) / ((adv.std() + 1e-4))  # sometimes helps
+
+
+        """Slice the long trajectory into short trajectories and perform mini-batch PPO updates"""
+        a_optim_iter_num = int(math.ceil(s.shape[0] / self.a_optim_batch_size))
+        c_optim_iter_num = int(math.ceil(s.shape[0] / self.c_optim_batch_size))
+        for i in range(self.K_epochs):
+
+            # Shuffle the trajectory; good for training
+            perm = np.arange(s.shape[0])
+            np.random.shuffle(perm)
+            perm = torch.LongTensor(perm).to(self.dvc)
+            s, a, td_target, adv, logprob_a = \
+                s[perm].clone(), a[perm].clone(), td_target[perm].clone(), adv[perm].clone(), logprob_a[perm].clone()
+
+            '''update the actor'''
+            for i in range(a_optim_iter_num):
+                index = slice(i * self.a_optim_batch_size, min((i + 1) * self.a_optim_batch_size, s.shape[0]))
+                distribution = self.actor.get_dist(s[index])
+                dist_entropy = distribution.entropy().sum(1, keepdim=True)
+                logprob_a_now = distribution.log_prob(a[index])
+                ratio = torch.exp(logprob_a_now.sum(1, keepdim=True) - logprob_a[index].sum(1, keepdim=True))  # a/b == exp(log(a)-log(b))
+
+                surr1 = ratio * adv[index]
+                surr2 = torch.clamp(ratio, 1 - self.clip_rate, 1 + self.clip_rate) * adv[index]
+                a_loss = -torch.min(surr1, surr2) - self.entropy_coef * dist_entropy  # clipped surrogate + entropy bonus
+
+                self.actor_optimizer.zero_grad()
+                a_loss.mean().backward()
+                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
+                self.actor_optimizer.step()
+
+            '''update the critic'''
+            for i in range(c_optim_iter_num):
+                index = slice(i * self.c_optim_batch_size, min((i + 1) * self.c_optim_batch_size, s.shape[0]))
+                c_loss = (self.critic(s[index]) - td_target[index]).pow(2).mean()
+                for name, param in self.critic.named_parameters():
+                    if 'weight' in name:
+                        c_loss += param.pow(2).sum() * self.l2_reg  # L2 regularization on critic weights
+
+                self.critic_optimizer.zero_grad()
+                c_loss.backward()
+                self.critic_optimizer.step()
+
+    def put_data(self, s, a, r, s_next, logprob_a, done, dw, idx):
+        self.s_hoder[idx] = s
+        self.a_hoder[idx] = a
+        self.r_hoder[idx] = r
+        self.s_next_hoder[idx] = s_next
+        self.logprob_a_hoder[idx] = logprob_a
+        self.done_hoder[idx] = done
+        self.dw_hoder[idx] = dw
+
+    def save(self, EnvName, timestep):
+        torch.save(self.actor.state_dict(), "./model/{}_actor{}.pth".format(EnvName, timestep))
+        torch.save(self.critic.state_dict(), "./model/{}_q_critic{}.pth".format(EnvName, timestep))
+
+    def load(self, EnvName, timestep):
+        self.actor.load_state_dict(torch.load("./model/{}_actor{}.pth".format(EnvName, timestep), map_location=self.dvc))
+        self.critic.load_state_dict(torch.load("./model/{}_q_critic{}.pth".format(EnvName, timestep), map_location=self.dvc))
+
+
diff --git a/PPO_Continuous/main.py b/PPO_Continuous/main.py
new file mode 100644
index 0000000..d6c2f4e
--- /dev/null
+++ b/PPO_Continuous/main.py
@@ -0,0 +1,172 @@
+from datetime import datetime
+import os
+import shutil
+import argparse
+import torch
+import gymnasium as gym
+from utils import str2bool, Action_adapter, Reward_adapter, evaluate_policy
+from PPO import PPO_agent
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from env import PartitionMazeEnv
+
+'''Hyperparameter Setting'''
+parser = argparse.ArgumentParser()
+parser.add_argument('--dvc', type=str, default='cpu',
+                    help='running device: cuda or cpu')
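+# EnvIdex indexes into EnvName/BrifEnvName in main(); note that main() currently builds PartitionMazeEnv directly (the gym.make calls are commented out)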
+parser.add_argument('--EnvIdex', type=int, default=0,
+                    help='PM_PPO_Con, PV1, LLdV2, Humanv4, HCv4, BWv3, BWHv3')
+parser.add_argument('--write', type=str2bool, default=False,
+                    help='Use SummaryWriter to record the training')
+parser.add_argument('--render', type=str2bool,
+                    default=False, help='Render or Not')
+parser.add_argument('--Loadmodel', type=str2bool,
+                    default=False, help='Load pretrained model or Not')
+parser.add_argument('--ModelIdex', type=int, default=100,
+                    help='which model to load')
+
+parser.add_argument('--seed', type=int, default=0, help='random seed')
+parser.add_argument('--T_horizon', type=int, default=200,
+                    help='length of long trajectory')
+parser.add_argument('--Distribution', type=str, default='Beta',
+                    help='Should be one of Beta ; GS_ms ; GS_m')
+parser.add_argument('--Max_train_steps', type=int,
+                    default=int(5e8), help='Max training steps')
+parser.add_argument('--save_interval', type=int,
+                    default=int(5e5), help='Model saving interval, in steps.')
+parser.add_argument('--eval_interval', type=int, default=int(5e3),
+                    help='Model evaluating interval, in steps.')
+
+parser.add_argument('--gamma', type=float, default=0.99,
+                    help='Discount factor')
+parser.add_argument('--lambd', type=float, default=0.95, help='GAE factor')
+parser.add_argument('--clip_rate', type=float,
+                    default=0.2, help='PPO clip rate')
+parser.add_argument('--K_epochs', type=int, default=10,
+                    help='PPO update times')
+parser.add_argument('--net_width', type=int,
+                    default=150, help='Hidden net width')
+parser.add_argument('--a_lr', type=float, default=2e-4,
+                    help='Learning rate of actor')
+parser.add_argument('--c_lr', type=float, default=2e-4,
+                    help='Learning rate of critic')
+parser.add_argument('--l2_reg', type=float, default=1e-3,
+                    help='L2 regularization coefficient for Critic')
+parser.add_argument('--a_optim_batch_size', type=int,
+                    default=64, help='mini-batch size of the sliced trajectory for the actor')
+parser.add_argument('--c_optim_batch_size', type=int,
+                    default=64, help='mini-batch size of the sliced trajectory for the critic')
+parser.add_argument('--entropy_coef', type=float,
+                    default=1e-3, help='Entropy coefficient of Actor')
+parser.add_argument('--entropy_coef_decay', type=float,
+                    default=0.99, help='Decay rate of entropy_coef')
+opt = parser.parse_args()
+opt.dvc = torch.device(opt.dvc)  # from str to torch.device
+print(opt)
+
+
+def main():
+    EnvName = ['PartitionMaze_PPO_Continuous', 'Pendulum-v1', 'LunarLanderContinuous-v2',
+               'Humanoid-v4', 'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
+    BrifEnvName = ['PM_PPO_Con', 'PV1', 'LLdV2',
+                   'Humanv4', 'HCv4', 'BWv3', 'BWHv3']
+
+    # Build Env
+    # env = gym.make(EnvName[opt.EnvIdex], render_mode = "human" if opt.render else None)
+    env = PartitionMazeEnv()
+    # eval_env = gym.make(EnvName[opt.EnvIdex])
+    eval_env = PartitionMazeEnv()
+    opt.state_dim = env.observation_space.shape[0]
+    opt.action_dim = env.action_space.shape[0]
+    opt.max_action = float(env.action_space.high[0])
+    opt.max_steps = env._max_episode_steps
+    print('Env:', EnvName[opt.EnvIdex], ' state_dim:', opt.state_dim, ' action_dim:', opt.action_dim,
+          ' max_a:', opt.max_action, ' min_a:', env.action_space.low[0], 'max_steps', opt.max_steps)
+
+    # Seed Everything
+    env_seed = opt.seed
+    torch.manual_seed(opt.seed)
+    torch.cuda.manual_seed(opt.seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    print("Random Seed: {}".format(opt.seed))
+
+    # Use tensorboard to record training curves
+    if opt.write:
+        from torch.utils.tensorboard import SummaryWriter
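+        # Run directory: brief env name + timestamp; an existing directory with the same name is removed first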
+        timenow = str(datetime.now())[0:-10]
+        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
+        writepath = 'runs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
+        if os.path.exists(writepath):
+            shutil.rmtree(writepath)
+        writer = SummaryWriter(log_dir=writepath)
+
+    # Beta dist may need a larger learning rate; sometimes helps
+    # if Dist[distnum] == 'Beta' :
+    #     kwargs["a_lr"] *= 2
+    #     kwargs["c_lr"] *= 4
+
+    if not os.path.exists('model'):
+        os.mkdir('model')
+    # transfer opt to a dictionary, and use it to init PPO_agent
+    agent = PPO_agent(**vars(opt))
+    if opt.Loadmodel:
+        agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)
+
+    if opt.render:
+        while True:
+            ep_r = evaluate_policy(env, agent, opt.max_action, 1)
+            print(f'Env:{EnvName[opt.EnvIdex]}, Episode Reward:{ep_r}')
+    else:
+        traj_lenth, total_steps = 0, 0
+        while total_steps < opt.Max_train_steps:
+            # Do not use opt.seed directly, or training can overfit to that seed
+            s, info = env.reset(seed=env_seed)
+            env_seed += 1
+            done = False
+
+            '''Interact & train'''
+            while not done:
+                '''Interact with Env'''
+                a, logprob_a = agent.select_action(
+                    s, deterministic=False)  # use stochastic actions when training
+                # act = Action_adapter(a, opt.max_action)  # [0,1] to [-max,max]
+                s_next, r, dw, tr, info = env.step(
+                    a)  # dw: dead&win; tr: truncated
+                # r = Reward_adapter(r, opt.EnvIdex)
+                done = (dw or tr)
+
+                '''Store the current transition'''
+                agent.put_data(s, a, r, s_next, logprob_a,
+                               done, dw, idx=traj_lenth)
+                s = s_next
+
+                traj_lenth += 1
+                total_steps += 1
+
+                '''Update if it's time'''
+                if traj_lenth % opt.T_horizon == 0:
+                    agent.train()
+                    traj_lenth = 0
+
+                '''Record & log'''
+                if total_steps % opt.eval_interval == 0:
+                    # evaluate the policy 3 times and average the result
+                    score = evaluate_policy(
+                        eval_env, agent, opt.max_action, turns=3)
+                    if opt.write:
+                        writer.add_scalar(
+                            'ep_r', score, global_step=total_steps)
+                    print('EnvName:', EnvName[opt.EnvIdex], 'seed:', opt.seed, 'steps: {}k'.format(
+                        int(total_steps/1000)), 'score:', score)
+
+                '''Save model'''
+                if total_steps % opt.save_interval == 0:
+                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps/1000))
+
+        env.close()
+        eval_env.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/PPO_Continuous/utils.py b/PPO_Continuous/utils.py
new file mode 100644
index 0000000..7d20a0f
--- /dev/null
+++ b/PPO_Continuous/utils.py
@@ -0,0 +1,155 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Beta, Normal
+import numpy as np
+
+
+class BetaActor(nn.Module):
+    def __init__(self, state_dim, action_dim, net_width):
+        super(BetaActor, self).__init__()
+
+        self.l1 = nn.Linear(state_dim, net_width)
+        self.l2 = nn.Linear(net_width, net_width)
+        self.alpha_head = nn.Linear(net_width, action_dim)
+        self.beta_head = nn.Linear(net_width, action_dim)
+
+    def forward(self, state):
+        a = torch.tanh(self.l1(state))
+        a = torch.tanh(self.l2(a))
+
+        alpha = F.softplus(self.alpha_head(a)) + 1.0
+        beta = F.softplus(self.beta_head(a)) + 1.0
+
+        return alpha, beta
+
+    def get_dist(self, state):
+        alpha, beta = self.forward(state)
+        dist = Beta(alpha, beta)
+        return dist
+
+    def deterministic_act(self, state):
+        alpha, beta = self.forward(state)
+        mode = (alpha) / (alpha + beta)  # the Beta mean, used as the deterministic action
+        return mode
+
+
+class GaussianActor_musigma(nn.Module):
+    def __init__(self, state_dim, action_dim, net_width):
+        super(GaussianActor_musigma, self).__init__()
+
+        self.l1 = nn.Linear(state_dim, net_width)
+        self.l2 = nn.Linear(net_width, net_width)
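+        # Two output heads: mu (squashed to (0,1) by sigmoid in forward) and sigma (kept positive via softplus)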
+        self.mu_head = nn.Linear(net_width, action_dim)
+        self.sigma_head = nn.Linear(net_width, action_dim)
+
+    def forward(self, state):
+        a = torch.tanh(self.l1(state))
+        a = torch.tanh(self.l2(a))
+        mu = torch.sigmoid(self.mu_head(a))
+        sigma = F.softplus(self.sigma_head(a))
+        return mu, sigma
+
+    def get_dist(self, state):
+        mu, sigma = self.forward(state)
+        dist = Normal(mu, sigma)
+        return dist
+
+    def deterministic_act(self, state):
+        mu, sigma = self.forward(state)
+        return mu
+
+
+class GaussianActor_mu(nn.Module):
+    def __init__(self, state_dim, action_dim, net_width, log_std=0):
+        super(GaussianActor_mu, self).__init__()
+
+        self.l1 = nn.Linear(state_dim, net_width)
+        self.l2 = nn.Linear(net_width, net_width)
+        self.mu_head = nn.Linear(net_width, action_dim)
+        self.mu_head.weight.data.mul_(0.1)
+        self.mu_head.bias.data.mul_(0.0)
+
+        self.action_log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)
+
+    def forward(self, state):
+        a = torch.relu(self.l1(state))
+        a = torch.relu(self.l2(a))
+        mu = torch.sigmoid(self.mu_head(a))
+        return mu
+
+    def get_dist(self, state):
+        mu = self.forward(state)
+        action_log_std = self.action_log_std.expand_as(mu)
+        action_std = torch.exp(action_log_std)
+
+        dist = Normal(mu, action_std)
+        return dist
+
+    def deterministic_act(self, state):
+        return self.forward(state)
+
+
+class Critic(nn.Module):
+    def __init__(self, state_dim, net_width):
+        super(Critic, self).__init__()
+
+        self.C1 = nn.Linear(state_dim, net_width)
+        self.C2 = nn.Linear(net_width, net_width)
+        self.C3 = nn.Linear(net_width, 1)
+
+    def forward(self, state):
+        v = torch.tanh(self.C1(state))
+        v = torch.tanh(self.C2(v))
+        v = self.C3(v)
+        return v
+
+
+def str2bool(v):
+    '''transfer str to bool for argparse'''
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        # raise a clear error instead of printing and using a bare `raise`
+        raise ValueError('Wrong input: boolean value expected.')
+
+
+def Action_adapter(a, max_action):
+    # from [0,1] to [-max,max]
+    return 2*(a-0.5)*max_action
+
+
+def Reward_adapter(r, EnvIdex):
+    # NOTE: these EnvIdex values do not match the EnvName list in main.py; the Reward_adapter call there is currently commented out
+    # For BipedalWalker
+    if EnvIdex == 0 or EnvIdex == 1:
+        if r <= -100:
+            r = -1
+    # For Pendulum-v0
+    elif EnvIdex == 3:
+        r = (r + 8) / 8
+    return r
+
+
+def evaluate_policy(env, agent, max_action, turns):
+    total_scores = 0
+    for j in range(turns):
+        s, info = env.reset()
+        done = False
+        action_series = []
+        while not done:
+            # Take deterministic actions during evaluation
+            a, logprob_a = agent.select_action(s, deterministic=True)
+            # act = Action_adapter(a, max_action)  # [0,1] to [-max,max]
+            s_next, r, dw, tr, info = env.step(a)
+            done = (dw or tr)
+            action_series.append(a[0])
+            total_scores += r
+            s = s_next
+        print('action series: ', np.round(action_series, 3))
+        print('state: ', s)
+
+    return total_scores/turns