from utils import evaluate_policy, str2bool
from datetime import datetime
from DQN import DQN_agent
import gymnasium as gym
import os
import shutil
import argparse
import torch

# fmt: off
# Make the parent directory importable so env_partion_dist can be found.
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env_partion_dist import PartitionEnv
# fmt: on

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()
parser.add_argument('--dvc', type=str, default='cpu',
                    help='running device: cuda or cpu')
parser.add_argument('--EnvIdex', type=int, default=0,
                    help='environment index (only PartitionEnv is registered here)')
parser.add_argument('--write', type=str2bool, default=True,
                    help='Use SummaryWriter to record the training')
parser.add_argument('--render', type=str2bool,
                    default=False, help='Render or Not')
parser.add_argument('--Loadmodel', type=str2bool,
                    default=False, help='Load pretrained model or Not')
parser.add_argument('--ModelIdex', type=int, default=100,
                    help='which model to load')

parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument('--Max_train_steps', type=int,
                    default=int(1e8), help='Max training steps')
parser.add_argument('--save_interval', type=int,
                    default=int(5e4), help='Model saving interval, in steps.')
parser.add_argument('--eval_interval', type=int, default=int(2e3),
                    help='Model evaluating interval, in steps.')
parser.add_argument('--random_steps', type=int, default=int(6e3),
                    help='steps for random policy to explore')
parser.add_argument('--update_every', type=int,
                    default=10, help='training frequency')

parser.add_argument('--gamma', type=float, default=0.99,
                    help='Discount factor')
parser.add_argument('--net_width', type=int,
                    default=200, help='Hidden net width')
parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
parser.add_argument('--batch_size', type=int, default=256,
                    help='batch size sampled from the replay buffer')
parser.add_argument('--exp_noise', type=float,
                    default=0.3, help='explore noise')
parser.add_argument('--noise_decay', type=float, default=0.99,
                    help='decay rate of explore noise')
parser.add_argument('--Double', type=str2bool, default=False,
                    help='Whether to use Double Q-learning')
parser.add_argument('--Duel', type=str2bool, default=False,
                    help='Whether to use Dueling networks')
opt = parser.parse_args()
opt.dvc = torch.device(opt.dvc)  # from str to torch.device
print(opt)

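# Example invocation (hypothetical flags; adjust to your setup):
#   python <this_script>.py --dvc cuda --Double True --Duel True
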
def main():
    EnvName = ['PartitionEnv']
    BriefEnvName = ['PartEnv']
    env = PartitionEnv()
    eval_env = PartitionEnv()
    opt.state_dim = env.observation_space.shape[0]
    opt.action_dim = env.action_space.n
    opt.max_e_steps = env.MAX_ADJUST_STEP

    # Algorithm Setting
    if opt.Duel:
        algo_name = 'Duel'
    else:
        algo_name = ''
    if opt.Double:
        algo_name += 'DDQN'
    else:
        algo_name += 'DQN'
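    # Resulting algo_name is 'DQN', 'DDQN', 'DuelDQN', or 'DuelDDQN'; it is reused
    # below for the TensorBoard run folder and the checkpoint names.
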
    # Seed Everything
    env_seed = opt.seed
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Random Seed: {}".format(opt.seed))

    print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')

    if opt.write:
        from torch.utils.tensorboard import SummaryWriter
        timenow = str(datetime.now())[0:-10]
        timenow = ' ' + timenow[0:13] + '_' + timenow[-2::]
        writepath = 'runs/{}-{}_S{}_'.format(algo_name,
                                             BriefEnvName[opt.EnvIdex], opt.seed) + timenow
        if os.path.exists(writepath):
            shutil.rmtree(writepath)
        writer = SummaryWriter(log_dir=writepath)
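        # Logs land under runs/; view them with: tensorboard --logdir runs
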
    # Build model and replay buffer
    if not os.path.exists('model'):
        os.mkdir('model')
    agent = DQN_agent(**vars(opt))
    if opt.Loadmodel:
        agent.load(algo_name, BriefEnvName[opt.EnvIdex], opt.ModelIdex)

    if opt.render:
        # Evaluate the (optionally loaded) policy forever; no training in this branch.
        while True:
            score = evaluate_policy(env, agent, 1)
            print('EnvName:', BriefEnvName[opt.EnvIdex],
                  'seed:', opt.seed, 'score:', score)
    else:
        total_steps = 0
        while total_steps < opt.Max_train_steps:
            # Re-seed every episode; reusing opt.seed alone could overfit to that seed.
            s, info = env.reset(seed=env_seed)  # gymnasium-style reset -> (obs, info)
            env_seed += 1
            done = False

            '''Interact & train'''
            while not done:
                # epsilon-greedy exploration
                if total_steps < opt.random_steps:
                    a = env.action_space.sample()
                else:
                    a = agent.select_action(s, deterministic=False)
                s_next, r, dw, tr, info = env.step(a)
                done = (dw or tr)

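                # Store dw (true termination) rather than done: when the episode is
                # merely truncated (tr), the Q-target should still bootstrap from s_next.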
                agent.replay_buffer.add(s, a, r, s_next, dw)
                s = s_next

                '''Update'''
                # Train update_every times every update_every steps rather than once per step. Better!
                if total_steps >= opt.random_steps and total_steps % opt.update_every == 0:
                    for j in range(opt.update_every):
                        agent.train()

                '''Noise decay & Record & Log'''
                if total_steps % 1000 == 0:
                    agent.exp_noise *= opt.noise_decay
                if total_steps % opt.eval_interval == 0:
                    score = evaluate_policy(eval_env, agent, turns=1)
                    if opt.write:
                        writer.add_scalar(
                            'ep_r', score, global_step=total_steps)
                        writer.add_scalar(
                            'noise', agent.exp_noise, global_step=total_steps)
                    print('EnvName:', BriefEnvName[opt.EnvIdex], 'seed:', opt.seed, 'steps: {}k'.format(
                        int(total_steps/1000)), 'score:', int(score))
                total_steps += 1

                '''save model'''
                if total_steps % opt.save_interval == 0:
                    agent.save(algo_name, BriefEnvName[opt.EnvIdex], int(
                        total_steps/1000))

        env.close()
        eval_env.close()


if __name__ == '__main__':
    main()