from DQN import DQN_agent
from datetime import datetime
from utils import evaluate_policy, str2bool
import gymnasium as gym
import shutil
import argparse
import torch
import numpy as np
# fmt: off
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env_dis import PartitionMazeEnv
# fmt: on

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu', help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0, help='CP-v1, LLd-v2')
parser.add_argument('--write', type=str2bool, default=False, help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool, default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool, default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=100, help='which model to load')
parser.add_argument('--seed', type=int, default=42, help='random seed')
parser.add_argument('--Max_train_steps', type=int, default=int(1e8), help='Max training steps')
parser.add_argument('--save_interval', type=int, default=int(50e3), help='Model saving interval, in steps')
parser.add_argument('--eval_interval', type=int, default=int(2e3), help='Model evaluating interval, in steps')
parser.add_argument('--random_steps', type=int, default=int(3e3), help='steps for random policy to explore')
parser.add_argument('--update_every', type=int, default=50, help='training frequency')
parser.add_argument('--gamma', type=float, default=0.99, help='Discount factor')
parser.add_argument('--net_width', type=int, default=200, help='Hidden net width')
parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
parser.add_argument('--batch_size', type=int, default=256, help='mini-batch size sampled from the replay buffer')
parser.add_argument('--exp_noise', type=float, default=0.2, help='explore noise')
parser.add_argument('--noise_decay', type=float, default=0.99, help='decay rate of explore noise')
parser.add_argument('--Double', type=str2bool, default=False, help='Whether to use Double Q-learning')
parser.add_argument('--Duel', type=str2bool, default=False, help='Whether to use Dueling networks')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)


def main():
    EnvName = ['CartPole-v1', 'LunarLander-v2']
    BriefEnvName = ['PM_DQN', 'CPV1', 'LLdV2']
    # env = gym.make(EnvName[opt.EnvIdex], render_mode="human" if opt.render else None)
    # eval_env = gym.make(EnvName[opt.EnvIdex])
    env = PartitionMazeEnv()
    eval_env = PartitionMazeEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.n

    # Algorithm Setting
    if opt.Duel:
        algo_name = 'Duel'
    else:
        algo_name = ''
    if opt.Double:
        algo_name += 'DDQN'
    else:
        algo_name += 'DQN'

    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, '\n')

    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}-{}_S{}_'.format(algo_name, BriefEnvName[opt.EnvIdex], opt.seed) + timenow
        if os.path.exists(writepath):
            shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)
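        # The scalars logged below ('ep_r', 'noise') can be inspected during training
        # with `tensorboard --logdir runs`.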
    # Build model and replay buffer
    if not os.path.exists('model'):
        os.mkdir('model')
    agent = DQN_agent(**vars(opt))
    if opt.Loadmodel:
        agent.load(algo_name, BriefEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        while True:
            score = evaluate_policy(env, agent, 1)
            print('EnvName:', BriefEnvName[opt.EnvIdex], 'seed:', opt.seed, 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            # Do not use opt.seed directly, or it can overfit to opt.seed
            s = env.reset(seed=env_seed)
            done = False

            '''Interact & train'''
            while not done:
                # e-greedy exploration
                if total_steps < opt.random_steps:
                    # if s[0] == 0:
                    #     a = np.random.randint(0, 10)
                    # else:
                    #     a = np.random.randint(10, 14)
                    # Warm-up phase: replay a fixed action sequence instead of uniform random actions
                    action_series = [0, 0, 3, 0, 10]
                    a = action_series[total_steps % 5]
                else:
                    a = agent.select_action(s, deterministic=False)

                # dw: terminated, tr: truncated (gymnasium step convention)
                s_next, r, dw, tr, info = env.step(a)
                done = (dw or tr)

                agent.replay_buffer.add(s, a, r, s_next, dw)
                s = s_next

                '''Update'''
                # train 50 times every 50 steps, rather than once per step. Better!
                if total_steps >= opt.random_steps and total_steps % opt.update_every == 0:
                    for j in range(opt.update_every):
                        agent.train()

                '''Noise decay & Record & Log'''
                if total_steps % 1000 == 0:
                    agent.exp_noise *= opt.noise_decay
                if total_steps % opt.eval_interval == 0:
                    score = evaluate_policy(eval_env, agent, turns=3)
                    if opt.write:
                        writer.add_scalar('ep_r', score, global_step=total_steps)
                        writer.add_scalar('noise', agent.exp_noise, global_step=total_steps)
                    print('EnvName:', BriefEnvName[opt.EnvIdex], 'seed:', opt.seed,
                          'steps: {}k'.format(int(total_steps/1000)), 'score:', int(score))
                total_steps += 1

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(algo_name, BriefEnvName[opt.EnvIdex], int(total_steps/1000))

        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()
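# Example invocations (a usage sketch; the filename 'main.py' is assumed here, and the
# DQN, utils and env_dis modules must be importable as set up by the imports above):
#   python main.py                             # vanilla DQN on PartitionMazeEnv, CPU
#   python main.py --dvc cuda --write True     # train on GPU and log to TensorBoard
#   python main.py --Double True --Duel True   # Dueling Double DQN variant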