Modify car_pos
This commit is contained in:
parent ee914ff930
commit 3086413171

.gitignore (vendored, 2 lines changed)
@@ -10,6 +10,8 @@ __pycache__/
 # Pytorch weights
 weights/
 solutions/
+PPO_preTrained/
+PPO_logs/

 # Distribution / packaging
 .Python
@@ -11,6 +11,9 @@ import argparse
 from ppo import PPO
 from network import FeedForwardNN
 from eval_policy import eval_policy
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env import PartitionMazeEnv

 def train(env, hyperparameters, actor_model, critic_model):
@@ -11,6 +11,9 @@ import argparse
 from ppo import PPO
 from network import FeedForwardNN
 from eval_policy import eval_policy
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env import PartitionMazeEnv

 def train(env, hyperparameters, actor_model, critic_model):
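The three added lines put the repository root on the import path so that `from env import PartitionMazeEnv` resolves even when the script is run from inside its own subfolder. A minimal sketch of what the added expression evaluates to; the path `<repo>/PPO/main.py` is only a hypothetical example, not a filename taken from this commit:

import os
import sys

# For a hypothetical script located at <repo>/PPO/main.py:
#   os.path.abspath(__file__)               -> <repo>/PPO/main.py
#   os.path.dirname(...)                    -> <repo>/PPO
#   os.path.dirname(os.path.dirname(...))   -> <repo>
# Appending <repo> to sys.path lets `import env` find <repo>/env.py
# regardless of the current working directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))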
@@ -183,7 +183,7 @@ class PPO:
             ep_rews = [] # rewards collected per episode

             # Reset the environment. Note that obs is short for observation.
-            obs, _ = self.env.reset()
+            obs = self.env.reset()
             done = False

             # Run an episode for a maximum of max_timesteps_per_episode timesteps
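For context, PartitionMazeEnv.reset() after this commit returns a bare observation (see the env.py hunk further down, where `return state, {}` becomes `return state`), whereas Gymnasium's reset() returns an `(obs, info)` tuple; that is why the unpacking here drops the underscore. A rollout loop that wanted to tolerate both conventions could unpack defensively; a minimal sketch, not taken from ppo.py:

def reset_obs(env):
    # Handle both reset() conventions: a bare `obs`, or Gymnasium's `(obs, info)`.
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2:
        return out[0]
    return out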
PPO2/PPO.py (new file, 273 lines)

@@ -0,0 +1,273 @@
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

################################## set device ##################################
print("============================================================================================")
# set device to cpu or cuda
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    print("Device set to : cpu")
print("============================================================================================")


################################## PPO Policy ##################################
class RolloutBuffer:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        self.is_terminals = []

    def clear(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.state_values[:]
        del self.is_terminals[:]


class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, has_continuous_action_space, action_std_init):
        super(ActorCritic, self).__init__()

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_dim = action_dim
            self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device)
        # actor
        if has_continuous_action_space:
            self.actor = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.Tanh(),
                # nn.Sigmoid(),
                # nn.ReLU(),
                nn.Linear(64, 64),
                nn.Tanh(),
                # nn.Sigmoid(),
                # nn.ReLU(),
                nn.Linear(64, action_dim),
                nn.Tanh()
                # nn.Sigmoid()
                # nn.ReLU()
            )
        else:
            self.actor = nn.Sequential(
                nn.Linear(state_dim, 64),
                nn.Tanh(),
                nn.Linear(64, 64),
                nn.Tanh(),
                nn.Linear(64, action_dim),
                nn.Softmax(dim=-1)
            )
        # critic
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            # nn.Sigmoid(),
            # nn.ReLU(),
            nn.Linear(64, 64),
            nn.Tanh(),
            # nn.Sigmoid(),
            # nn.ReLU(),
            nn.Linear(64, 1)
        )

    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def forward(self):
        raise NotImplementedError

    def act(self, state):
        if self.has_continuous_action_space:
            action_mean = self.actor(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(action_mean, cov_mat)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)
        state_val = self.critic(state)

        return action.detach(), action_logprob.detach(), state_val.detach()

    def evaluate(self, state, action):
        if self.has_continuous_action_space:
            action_mean = self.actor(state)

            action_var = self.action_var.expand_as(action_mean)
            cov_mat = torch.diag_embed(action_var).to(device)
            dist = MultivariateNormal(action_mean, cov_mat)

            # For Single Action Environments.
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            action_probs = self.actor(state)
            dist = Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)

        return action_logprobs, state_values, dist_entropy


class PPO:
    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_std = action_std_init

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.policy.actor.parameters(), 'lr': lr_actor},
            {'params': self.policy.critic.parameters(), 'lr': lr_critic}
        ])

        self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        if self.has_continuous_action_space:
            self.action_std = new_action_std
            self.policy.set_action_std(new_action_std)
            self.policy_old.set_action_std(new_action_std)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling PPO::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            self.action_std = self.action_std - action_std_decay_rate
            self.action_std = round(self.action_std, 4)
            if (self.action_std <= min_action_std):
                self.action_std = min_action_std
                print("setting actor output action_std to min_action_std : ", self.action_std)
            else:
                print("setting actor output action_std to : ", self.action_std)
            self.set_action_std(self.action_std)
        else:
            print("WARNING : Calling PPO::decay_action_std() on discrete action space policy")
        print("--------------------------------------------------------------------------------------------")

    def select_action(self, state):
        if self.has_continuous_action_space:
            with torch.no_grad():
                state = torch.FloatTensor(state).to(device)
                action, action_logprob, state_val = self.policy_old.act(state)

            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)
            self.buffer.state_values.append(state_val)

            return action.detach().cpu().numpy().flatten()
        else:
            with torch.no_grad():
                state = torch.FloatTensor(state).to(device)
                action, action_logprob, state_val = self.policy_old.act(state)

            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)
            self.buffer.state_values.append(state_val)

            return action.item()

    def update(self):
        # Monte Carlo estimate of returns
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
        old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)

        # calculate advantages
        advantages = rewards.detach() - old_state_values.detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        self.policy_old.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
        self.policy.load_state_dict(torch.load(checkpoint_path, map_location=lambda storage, loc: storage))
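For orientation, the PPO class above is driven entirely from the outside: the caller alternates select_action/step, appends rewards and done flags to the shared buffer, and triggers update() periodically (PPO2/train.py below does exactly this). A minimal driver sketch under those assumptions; the CartPole-v1 environment and the step counts are only an illustration, not something used in this repository:

import gymnasium as gym
from PPO import PPO

env = gym.make("CartPole-v1")
agent = PPO(state_dim=env.observation_space.shape[0],
            action_dim=env.action_space.n,
            lr_actor=3e-4, lr_critic=1e-3, gamma=0.99,
            K_epochs=80, eps_clip=0.2,
            has_continuous_action_space=False)

state, _ = env.reset()
for t in range(1, 2001):
    action = agent.select_action(state)              # stores state/action/logprob/value in agent.buffer
    state, reward, terminated, truncated, _ = env.step(action)
    agent.buffer.rewards.append(reward)              # rewards and done flags are appended by the caller
    agent.buffer.is_terminals.append(terminated or truncated)
    if t % 800 == 0:
        agent.update()                               # clipped-PPO update over the buffer, then clears it
    if terminated or truncated:
        state, _ = env.reset()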
PPO2/plot_graph.py (new file, 142 lines)

@@ -0,0 +1,142 @@
import os
import pandas as pd
import matplotlib.pyplot as plt


def save_graph():
    print("============================================================================================")
    # env_name = 'CartPole-v1'
    # env_name = 'LunarLander-v2'
    # env_name = 'BipedalWalker-v2'
    env_name = 'RoboschoolWalker2d-v1'

    fig_num = 0         #### change this to prevent overwriting figures in same env_name folder
    plot_avg = True     # plot average of all runs; else plot all runs separately
    fig_width = 10
    fig_height = 6

    # smooth out rewards to get a smooth and a less smooth (var) plot lines
    window_len_smooth = 20
    min_window_len_smooth = 1
    linewidth_smooth = 1.5
    alpha_smooth = 1

    window_len_var = 5
    min_window_len_var = 1
    linewidth_var = 2
    alpha_var = 0.1

    colors = ['red', 'blue', 'green', 'orange', 'purple', 'olive', 'brown', 'magenta', 'cyan', 'crimson', 'gray', 'black']

    # make directory for saving figures
    figures_dir = "PPO_figs"
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    # make environment directory for saving figures
    figures_dir = figures_dir + '/' + env_name + '/'
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    fig_save_path = figures_dir + '/PPO_' + env_name + '_fig_' + str(fig_num) + '.png'

    # get number of log files in directory
    log_dir = "PPO_logs" + '/' + env_name + '/'

    current_num_files = next(os.walk(log_dir))[2]
    num_runs = len(current_num_files)

    all_runs = []

    for run_num in range(num_runs):

        log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"
        print("loading data from : " + log_f_name)
        data = pd.read_csv(log_f_name)
        data = pd.DataFrame(data)

        print("data shape : ", data.shape)

        all_runs.append(data)
        print("--------------------------------------------------------------------------------------------")

    ax = plt.gca()

    if plot_avg:
        # average all runs
        df_concat = pd.concat(all_runs)
        df_concat_groupby = df_concat.groupby(df_concat.index)
        data_avg = df_concat_groupby.mean()

        # smooth out rewards to get a smooth and a less smooth (var) plot lines
        data_avg['reward_smooth'] = data_avg['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean()
        data_avg['reward_var'] = data_avg['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean()

        data_avg.plot(kind='line', x='timestep', y='reward_smooth', ax=ax, color=colors[0], linewidth=linewidth_smooth, alpha=alpha_smooth)
        data_avg.plot(kind='line', x='timestep', y='reward_var', ax=ax, color=colors[0], linewidth=linewidth_var, alpha=alpha_var)

        # keep only reward_smooth in the legend and rename it
        handles, labels = ax.get_legend_handles_labels()
        ax.legend([handles[0]], ["reward_avg_" + str(len(all_runs)) + "_runs"], loc=2)

    else:
        for i, run in enumerate(all_runs):
            # smooth out rewards to get a smooth and a less smooth (var) plot lines
            run['reward_smooth_' + str(i)] = run['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean()
            run['reward_var_' + str(i)] = run['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean()

            # plot the lines
            run.plot(kind='line', x='timestep', y='reward_smooth_' + str(i), ax=ax, color=colors[i % len(colors)], linewidth=linewidth_smooth, alpha=alpha_smooth)
            run.plot(kind='line', x='timestep', y='reward_var_' + str(i), ax=ax, color=colors[i % len(colors)], linewidth=linewidth_var, alpha=alpha_var)

        # keep alternate elements (reward_smooth_i) in the legend
        handles, labels = ax.get_legend_handles_labels()
        new_handles = []
        new_labels = []
        for i in range(len(handles)):
            if i % 2 == 0:
                new_handles.append(handles[i])
                new_labels.append(labels[i])
        ax.legend(new_handles, new_labels, loc=2)

    # ax.set_yticks(np.arange(0, 1800, 200))
    # ax.set_xticks(np.arange(0, int(4e6), int(5e5)))

    ax.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)

    ax.set_xlabel("Timesteps", fontsize=12)
    ax.set_ylabel("Rewards", fontsize=12)

    plt.title(env_name, fontsize=14)

    fig = plt.gcf()
    fig.set_size_inches(fig_width, fig_height)

    print("============================================================================================")
    plt.savefig(fig_save_path)
    print("figure saved at : ", fig_save_path)
    print("============================================================================================")

    plt.show()


if __name__ == '__main__':
    save_graph()
PPO2/test.py (new file, 123 lines)

@@ -0,0 +1,123 @@
import os
import glob
import time
from datetime import datetime

import torch
import numpy as np

# import gym
# import roboschool

from PPO import PPO
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


#################################### Testing ###################################
def test():
    print("============================================================================================")

    ################## hyperparameters ##################

    # env_name = "CartPole-v1"
    # has_continuous_action_space = False
    # max_ep_len = 400
    # action_std = None

    # env_name = "LunarLander-v2"
    # has_continuous_action_space = False
    # max_ep_len = 300
    # action_std = None

    # env_name = "BipedalWalker-v2"
    # has_continuous_action_space = True
    # max_ep_len = 1500           # max timesteps in one episode
    # action_std = 0.1            # set same std for action distribution which was used while saving

    env_name = "test"
    has_continuous_action_space = True
    max_ep_len = 1000           # max timesteps in one episode
    action_std = 0.1            # set same std for action distribution which was used while saving

    render = True               # render environment on screen
    frame_delay = 0             # if required; add delay b/w frames

    total_test_episodes = 10    # total num of testing episodes

    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor

    lr_actor = 0.0003           # learning rate for actor
    lr_critic = 0.001           # learning rate for critic

    #####################################################

    # env = gym.make(env_name)
    env = PartitionMazeEnv()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # preTrained weights directory

    random_seed = 0             #### set this to load a particular checkpoint trained on random seed
    run_num_pretrained = 0      #### set this to load a particular checkpoint num

    directory = "PPO_preTrained" + '/' + env_name + '/'
    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("loading network from : " + checkpoint_path)

    ppo_agent.load(checkpoint_path)

    print("--------------------------------------------------------------------------------------------")

    test_running_reward = 0

    for ep in range(1, total_test_episodes+1):
        ep_reward = 0
        state = env.reset()

        for t in range(1, max_ep_len+1):
            action = ppo_agent.select_action(state)
            state, reward, done, _, _ = env.step(action)
            ep_reward += reward

            if render:
                env.render()
                time.sleep(frame_delay)

            if done:
                break

        # clear buffer
        ppo_agent.buffer.clear()

        test_running_reward += ep_reward
        print('Episode: {} \t\t Reward: {}'.format(ep, round(ep_reward, 2)))
        ep_reward = 0

    env.close()

    print("============================================================================================")

    avg_test_reward = test_running_reward / total_test_episodes
    avg_test_reward = round(avg_test_reward, 2)
    print("average test reward : " + str(avg_test_reward))

    print("============================================================================================")


if __name__ == '__main__':
    test()
PPO2/train.py (new file, 266 lines)

@@ -0,0 +1,266 @@
import os
import glob
import time
from datetime import datetime

import torch
import numpy as np

# import gym
# import roboschool
import gymnasium as gym

from PPO import PPO
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv

################################### Training ###################################
def train():
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "test"

    has_continuous_action_space = True  # continuous action space; else discrete

    max_ep_len = 100                    # max timesteps in one episode
    max_training_timesteps = int(3e8)   # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10        # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2           # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)          # save model frequency (in num timesteps)

    action_std = 0.6                    # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05        # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1                # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be > than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4    # update policy every n timesteps
    K_epochs = 80                       # update policy for K epochs in one PPO update

    eps_clip = 0.2                      # clip parameter for PPO
    gamma = 0.99                        # discount factor

    lr_actor = 0.0003                   # learning rate for actor network
    lr_critic = 0.001                   # learning rate for critic network

    random_seed = 0                     # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)
    env = PartitionMazeEnv()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run
    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

    directory = "PPO_preTrained"
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = directory + '/' + env_name + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len+1):

            # select action with policy
            action = ppo_agent.select_action(state)
            state, reward, done, _, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:

                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                log_avg_reward = round(log_avg_reward, 4)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:

                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                print_avg_reward = round(print_avg_reward, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time : ", end_time - start_time)
    print("============================================================================================")


if __name__ == '__main__':
    train()
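Note that train.py and test.py above unpack `env.step(action)` into five values (Gymnasium style) but treat `env.reset()` as returning a bare observation, which is exactly the hybrid interface PartitionMazeEnv exposes after the env.py change below. A minimal stub showing the contract these scripts assume; the class name and observation size are illustrative only, not taken from the repository:

import numpy as np

class StubEnv:
    # Stand-in for the interface assumed by PPO2/train.py and PPO2/test.py.
    def reset(self):
        # bare observation, no info dict (matches `return state` in env.py below)
        return np.zeros(8, dtype=np.float32)

    def step(self, action):
        obs = np.zeros(8, dtype=np.float32)
        reward, terminated, truncated, info = 0.0, False, False, {}
        # five return values, unpacked as `state, reward, done, _, _ = env.step(action)`
        return obs, reward, terminated, truncated, info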
@@ -64,7 +64,7 @@ class PartitionMazeEnv(gym.Env):
         self.BASE_LINE = 3500.0  # baseline time, computed via greedy or Monte Carlo
         self.step_count = 0
         self.rectangles = {}
-        self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)]
+        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0

@@ -79,13 +79,13 @@ class PartitionMazeEnv(gym.Env):
         self.region_centers = []
         self.step_count = 0
         self.rectangles = {}
-        self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)]
+        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
         # State: the first 4 dims are partition_values, the rest are padded with 0
         state = np.concatenate(
             [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
-        return state, {}
+        return state

     def step(self, action):
         # In every phase the action is a 1-D continuous action; take action[0]
@@ -153,12 +153,14 @@ class PartitionMazeEnv(gym.Env):
                     [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
                 return state, reward, True, False, {}
             else:
-                reward = 10
-
                 # Enter phase 1: initialize the maze
                 self.phase = 1
                 state = np.concatenate(
                     [self.partition_values, np.array(self.car_pos).flatten()])
+                reward = 10
+
+                # Build a reverse index (center -> grid cell) for later lookups
+                self.reverse_rectangles = {v['center']: k for k, v in self.rectangles.items()}
                 return state, reward, False, False, {}

         elif self.phase == 1:
@@ -172,7 +174,7 @@ class PartitionMazeEnv(gym.Env):
             # Map the index to Cartesian coordinates
             coord = (target_region_index // (len(self.col_cuts) - 1),
                      target_region_index % (len(self.col_cuts) - 1))
-            self.car_pos[self.init_maze_step] = coord
+            self.car_pos[self.init_maze_step] = self.rectangles[coord]['center']
             self.car_traj[self.init_maze_step].append(coord)
             self.rectangles[coord]['is_visited'] = True

@@ -190,7 +192,8 @@ class PartitionMazeEnv(gym.Env):
         elif self.phase == 2:
             # Phase 2: path planning (walking the maze)
             current_car = self.current_car_index
-            current_row, current_col = self.car_pos[current_car]
+            # Look up the reverse index to find the grid cell the current car is in
+            current_row, current_col = self.reverse_rectangles[self.car_pos[current_car]]

             # The current action a is a 1-D continuous value mapped to four directions
             if a < 0.2:
@@ -219,7 +222,8 @@ class PartitionMazeEnv(gym.Env):
             # TODO: the move is illegal, add some penalty

             # Update the car position
-            self.car_pos[current_car] = (new_row, new_col)
+            self.car_pos[current_car] = self.rectangles[(
+                new_row, new_col)]['center']
             if new_row != current_row or new_col != current_col:
                 self.car_traj[current_car].append((new_row, new_col))
             self.step_count += 1
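Taken together, these env.py hunks change the car_pos convention: a car's position is now stored as the physical center of its rectangle (via self.rectangles[coord]['center']) rather than a grid index, and self.reverse_rectangles maps a center back to its (row, col) cell so the phase-2 maze logic can still reason on the grid. A small sketch of that two-way mapping; the rectangle data here is invented for illustration, and only the 'center' key mirrors what env.py actually stores:

# Hypothetical rectangles dict; only the 'center' key follows env.py.
rectangles = {
    (0, 0): {'center': (1.25, 2.0), 'is_visited': False},
    (0, 1): {'center': (1.25, 6.0), 'is_visited': False},
}
# Reverse index, exactly as built in the step() hunk above.
reverse_rectangles = {v['center']: k for k, v in rectangles.items()}

car_pos = rectangles[(0, 1)]['center']      # car_pos now holds the physical center
row, col = reverse_rectangles[car_pos]      # recover the grid cell for maze moves
assert (row, col) == (0, 1)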
@@ -6,7 +6,7 @@ import json
 # Fix the random seed for reproducibility
 random.seed(42)

-num_iterations = 10000
+num_iterations = 1000000

 # ---------------------------
 # Parameter settings
ray/atari_ppo.py (new file, 97 lines)

@@ -0,0 +1,97 @@
# These tags allow extracting portions of this script on Anyscale.
# ws-template-imports-start
import gymnasium as gym

from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
from ray.rllib.utils.test_utils import add_rllib_example_script_args

# ws-template-imports-end

parser = add_rllib_example_script_args(
    default_reward=float("inf"),
    default_timesteps=3000000,
    default_iters=100000000000,
)
parser.set_defaults(
    enable_new_api_stack=True,
    env="ale_py:ALE/Pong-v5",
)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

NUM_LEARNERS = args.num_learners or 1
ENV = args.env


# These tags allow extracting portions of this script on Anyscale.
# ws-template-code-start
def _make_env_to_module_connector(env):
    return FrameStackingEnvToModule(num_frames=4)


def _make_learner_connector(input_observation_space, input_action_space):
    return FrameStackingLearner(num_frames=4)


# Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it).
# We would like our frame stacking connector to do this job.
def _env_creator(cfg):
    return wrap_atari_for_new_api_stack(
        gym.make(ENV, **cfg, render_mode="rgb_array"),
        # Perform frame-stacking through ConnectorV2 API.
        framestack=None,
    )


tune.register_env("env", _env_creator)

config = (
    PPOConfig()
    .environment(
        "env",
        env_config={
            # Make analogous to old v4 + NoFrameskip.
            "frameskip": 1,
            "full_action_space": False,
            "repeat_action_probability": 0.0,
        },
        clip_rewards=True,
    )
    .env_runners(
        env_to_module_connector=_make_env_to_module_connector,
    )
    .training(
        learner_connector=_make_learner_connector,
        train_batch_size_per_learner=4000,
        minibatch_size=128,
        lambda_=0.95,
        kl_coeff=0.5,
        clip_param=0.1,
        vf_clip_param=10.0,
        entropy_coeff=0.01,
        num_epochs=10,
        lr=0.00015 * NUM_LEARNERS,
        grad_clip=100.0,
        grad_clip_by="global_norm",
    )
    .rl_module(
        model_config=DefaultModelConfig(
            conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]],
            conv_activation="relu",
            head_fcnet_hiddens=[256],
            vf_share_layers=True,
        ),
    )
)
# ws-template-code-end

if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args=args)
ray/cartpole_ppo.py (new file, 32 lines)

@@ -0,0 +1,32 @@
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.utils.test_utils import add_rllib_example_script_args

parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
parser.set_defaults(enable_new_api_stack=True)
# Use `parser` to add your own custom command line options to this script
# and (if needed) use their values to set up `config` below.
args = parser.parse_args()

config = (
    PPOConfig()
    .environment("CartPole-v1")
    .training(
        lr=0.0003,
        num_epochs=6,
        vf_loss_coeff=0.01,
    )
    .rl_module(
        model_config=DefaultModelConfig(
            fcnet_hiddens=[32],
            fcnet_activation="linear",
            vf_share_layers=True,
        ),
    )
)


if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args)
ray/partition_maze_ppo.py (new file, 38 lines)

@@ -0,0 +1,38 @@
import gymnasium as gym
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
from ray.rllib.utils.test_utils import add_rllib_example_script_args
from env import PartitionMazeEnv  # import the custom environment

# register the custom environment
gym.envs.register(
    id='PartitionMazeEnv-v0',
    entry_point='env:PartitionMazeEnv',
)

parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
parser.set_defaults(enable_new_api_stack=True)
args = parser.parse_args()

config = (
    PPOConfig()
    .environment("PartitionMazeEnv-v0")
    .training(
        lr=0.0003,
        num_epochs=6,
        vf_loss_coeff=0.01,
    )
    .rl_module(
        model_config=DefaultModelConfig(
            fcnet_hiddens=[32],
            fcnet_activation="linear",
            vf_share_layers=True,
        ),
    )
)

if __name__ == "__main__":
    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment

    run_rllib_example_script_experiment(config, args=args)
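A note on the registration above: the entry_point string 'env:PartitionMazeEnv' follows Gymnasium's "module:attribute" convention, i.e. import module env and take the class PartitionMazeEnv from it, so env.py must be importable (for example by running the script from the repository root). A minimal sketch of how such a string resolves, for illustration only:

import importlib

entry_point = 'env:PartitionMazeEnv'
module_name, class_name = entry_point.split(':')
env_cls = getattr(importlib.import_module(module_name), class_name)
env = env_cls()   # equivalent to constructing PartitionMazeEnv() directly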