from datetime import datetime
import shutil
import argparse
import torch
import gymnasium as gym
from utils import str2bool, Action_adapter, Reward_adapter, evaluate_policy
from PPO import PPO_agent

# fmt: off
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env_partion import PartitionEnv
# fmt: on

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='Part_PPO_Con, PV1, LLdV2, Humanv4, HCv4, BWv3, BWHv3')
parser.add_argument('--write', type=str2bool, default=True, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=500, help='which model to load')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--T_horizon', type=int, default=20, help='length of long trajectory')
parser.add_argument('--Distribution', type=str, default='Beta', help='Should be one of Beta ; GS_ms ; GS_m')
parser.add_argument('--Max_train_steps', type=int, default=int(5e8), help='Max training steps')
parser.add_argument('--save_interval', type=int, default=int(5e5), help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=int(5e1), help='Model evaluating interval, in steps.')
parser.add_argument('--gamma', type=float, default=0.99, help='Discount factor')
parser.add_argument('--lambd', type=float, default=0.95, help='GAE factor')
parser.add_argument('--clip_rate', type=float, default=0.2, help='PPO clip rate')
parser.add_argument('--K_epochs', type=int, default=10, help='PPO update times')
parser.add_argument('--net_width', type=int, default=150, help='Hidden net width')
parser.add_argument('--a_lr', type=float, default=2e-4, help='Learning rate of actor')
parser.add_argument('--c_lr', type=float, default=2e-4, help='Learning rate of critic')
parser.add_argument('--l2_reg', type=float, default=1e-3, help='L2 regularization coefficient for Critic')
parser.add_argument('--a_optim_batch_size', type=int, default=64, help='length of sliced trajectory of actor')
parser.add_argument('--c_optim_batch_size', type=int, default=64, help='length of sliced trajectory of critic')
parser.add_argument('--entropy_coef', type=float, default=1e-3, help='Entropy coefficient of Actor')
parser.add_argument('--entropy_coef_decay', type=float, default=0.99, help='Decay rate of entropy_coef')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)


def main():
    EnvName = ['Partition_PPO_Continuous', 'Pendulum-v1', 'LunarLanderContinuous-v2', 'Humanoid-v4',
               'HalfCheetah-v4', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3']
    BrifEnvName = ['Part_PPO_Con', 'PV1', 'LLdV2', 'Humanv4', 'HCv4', 'BWv3', 'BWHv3']

    # Build Env (the gym.make calls are kept for reference; this script trains on the custom PartitionEnv)
    # env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    env = PartitionEnv()
    # eval_env = gym.make(EnvName[opt.EnvIdex])
    eval_env = PartitionEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.shape[0]
    opt.max_action = float(env.action_space.high[0])
    print('Env:', EnvName[opt.EnvIdex], ' state_dim:', opt.state_dim, ' action_dim:', opt.action_dim,
          ' max_a:', opt.max_action, ' min_a:', env.action_space.low[0])

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    # Use tensorboard to record training curves
    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'logs/{}'.format(BrifEnvName[opt.EnvIdex]) + timenow
        if os.path.exists(writepath):
            shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)

    # The Beta distribution may need a larger learning rate; sometimes helps
    # if Dist[distnum] == 'Beta':
    #     kwargs["a_lr"] *= 2
    #     kwargs["c_lr"] *= 4

    # Transfer opt to a dictionary and use it to init PPO_agent
    agent = PPO_agent(**vars(opt))
    if opt.Loadmodel:
        agent.load(BrifEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        # Evaluation-only loop: run episodes with the (possibly loaded) policy forever
        while True:
            ep_r = evaluate_policy(env, agent, opt.max_action, 1)
            print(f'Env:{EnvName[opt.EnvIdex]}, Episode Reward:{ep_r}')
    else:
        traj_lenth, total_steps = 0, 0
        while total_steps < opt.Max_train_steps:
            # Do not use opt.seed directly, or training can overfit to that single seed
            s = env.reset(seed=env_seed)  # PartitionEnv.reset is assumed to return only the observation
            env_seed += 1
            done = False

            '''Interact & train'''
            while not done:
                '''Interact with Env'''
                a, logprob_a = agent.select_action(s, deterministic=False)  # use stochastic actions when training
                # act = Action_adapter(a, opt.max_action)  # [0,1] to [-max,max]
                s_next, r, dw, tr, info = env.step(a)  # dw: dead&win; tr: truncated
                # r = Reward_adapter(r, opt.EnvIdex)
                done = (dw or tr)

                '''Store the current transition'''
                agent.put_data(s, a, r, s_next, logprob_a, done, dw, idx=traj_lenth)
                s = s_next  # move to the next state before the next interaction
                traj_lenth += 1
                total_steps += 1

                '''Update if its time'''
                if traj_lenth % opt.T_horizon == 0:
                    agent.train()
                    traj_lenth = 0

                '''Record & log'''
                if total_steps % opt.eval_interval == 0:
                    # evaluate the policy (a single evaluation turn) and log the score
                    score = evaluate_policy(eval_env, agent, opt.max_action, turns=1)
                    if opt.write:
                        writer.add_scalar('ep_r', score, global_step=total_steps)
                    print('EnvName:', EnvName[opt.EnvIdex], 'seed:', opt.seed,
                          'steps: {}k'.format(int(total_steps / 1000)), 'score:', score)

                '''Save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(BrifEnvName[opt.EnvIdex], int(total_steps / 1000))

        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()
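
# Example invocations (a sketch; the script name `main.py` is assumed, and utils.py, PPO.py and
# env_partion.py must be importable from the paths appended above):
#   python main.py                                                   # train PartitionEnv on CPU with the defaults
#   python main.py --dvc cuda --seed 7 --write True                  # train on GPU and log curves under ./logs
#   python main.py --render True --Loadmodel True --ModelIdex 500    # roll out a saved checkpoint indefinitely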