修改car_pos

2025-03-13 21:28:30 +08:00 · 2025-03-13 21:28:30 +08:00 · 3086413171
commit 3086413171
parent ee914ff930
15 changed files with 993 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,8 @@ __pycache__/
 # Pytorch weights
 weights/
 solutions/
+PPO_preTrained/
+PPO_logs/

 # Distribution / packaging
 .Python
--- a/PPO1/eval_policy.py
+++ b/PPO1/eval_policy.py
--- a/PPO1/main.py
+++ b/PPO1/main.py
@ -11,6 +11,9 @@ import argparse
 from ppo import PPO
 from network import FeedForwardNN
 from eval_policy import eval_policy
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 
 from env import PartitionMazeEnv

 def train(env, hyperparameters, actor_model, critic_model):
--- a/PPO1/main_test.py
+++ b/PPO1/main_test.py
@ -11,6 +11,9 @@ import argparse
 from ppo import PPO
 from network import FeedForwardNN
 from eval_policy import eval_policy
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 
 from env import PartitionMazeEnv

 def train(env, hyperparameters, actor_model, critic_model):
--- a/PPO1/network.py
+++ b/PPO1/network.py
--- a/PPO1/ppo.py
+++ b/PPO1/ppo.py
@ -183,7 +183,7 @@ class PPO:
 			ep_rews = [] # rewards collected per episode

 			# Reset the environment. Note that obs is short for observation.
-			obs, _ = self.env.reset()
+			obs = self.env.reset()
 			done = False

 			# Run an episode for a maximum of max_timesteps_per_episode timesteps
--- a/PPO2/PPO.py
+++ b/PPO2/PPO.py
@ -0,0 +1,273 @@
+import torch
+import torch.nn as nn
+from torch.distributions import MultivariateNormal
+from torch.distributions import Categorical
+
+################################## set device ##################################
+print("============================================================================================")
+# Use the first CUDA device when one is available, otherwise stay on the CPU.
+if torch.cuda.is_available():
+    device = torch.device('cuda:0')
+    torch.cuda.empty_cache()
+    print("Device set to : " + str(torch.cuda.get_device_name(device)))
+else:
+    device = torch.device('cpu')
+    print("Device set to : cpu")
+print("============================================================================================")
+
+
+################################## PPO Policy ##################################
+class RolloutBuffer:
+    """Plain-list storage for the transitions collected between two policy updates."""
+
+    def __init__(self):
+        self.actions, self.states, self.logprobs = [], [], []
+        self.rewards, self.state_values, self.is_terminals = [], [], []
+
+    def clear(self):
+        """Empty every list in place, so existing references to the lists stay valid."""
+        for stored in (self.actions, self.states, self.logprobs,
+                       self.rewards, self.state_values, self.is_terminals):
+            stored.clear()
+
+
+class ActorCritic(nn.Module):
+    """Actor and critic MLPs bundled in a single module.
+
+    The actor maps a state to an action mean (continuous case, final Tanh) or to
+    a probability vector (discrete case, final Softmax). The critic maps a state
+    to one scalar value. Both networks use two hidden layers of 64 Tanh units.
+    """
+    def __init__(self, state_dim, action_dim, has_continuous_action_space, action_std_init):
+        super(ActorCritic, self).__init__()
+
+        self.has_continuous_action_space = has_continuous_action_space
+        
+        if has_continuous_action_space:
+            self.action_dim = action_dim
+            # Fixed per-dimension variance (action_std_init squared); it is a plain
+            # tensor attribute, not a parameter, and is replaced by set_action_std().
+            self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device)
+        # actor
+        if has_continuous_action_space :
+            self.actor = nn.Sequential(
+                            nn.Linear(state_dim, 64),
+                            nn.Tanh(),
+                            # nn.Sigmoid(),
+                            # nn.ReLU(),
+                            nn.Linear(64, 64),
+                            nn.Tanh(),
+                            # nn.Sigmoid(),
+                            # nn.ReLU(),
+                            nn.Linear(64, action_dim),
+                            nn.Tanh()
+                            # nn.Sigmoid()
+                            # nn.ReLU()
+                        )
+        else:
+            self.actor = nn.Sequential(
+                            nn.Linear(state_dim, 64),
+                            nn.Tanh(),
+                            nn.Linear(64, 64),
+                            nn.Tanh(),
+                            nn.Linear(64, action_dim),
+                            nn.Softmax(dim=-1)
+                        )
+        # critic
+        self.critic = nn.Sequential(
+                        nn.Linear(state_dim, 64),
+                        nn.Tanh(),
+                        # nn.Sigmoid(),
+                        # nn.ReLU(),
+                        nn.Linear(64, 64),
+                        nn.Tanh(),
+                        # nn.Sigmoid(),
+                        # nn.ReLU(),
+                        nn.Linear(64, 1)
+                    )
+        
+    def set_action_std(self, new_action_std):
+        """Replace the action variance with new_action_std squared (continuous case only).
+
+        For a discrete policy nothing is changed and a warning is printed.
+        """
+        if self.has_continuous_action_space:
+            self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device)
+        else:
+            print("--------------------------------------------------------------------------------------------")
+            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
+            print("--------------------------------------------------------------------------------------------")
+
+    def forward(self):
+        """Not supported; use act() or evaluate() instead."""
+        raise NotImplementedError
+    
+    def act(self, state):
+        """Sample an action for `state`.
+
+        Returns a tuple (action, action_logprob, state_val); all three tensors are
+        detached from the autograd graph.
+        """
+
+        if self.has_continuous_action_space:
+            action_mean = self.actor(state)
+            # Diagonal covariance built from the fixed variance, with a leading batch axis.
+            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
+            dist = MultivariateNormal(action_mean, cov_mat)
+        else:
+            action_probs = self.actor(state)
+            dist = Categorical(action_probs)
+
+        action = dist.sample()
+        action_logprob = dist.log_prob(action)
+        state_val = self.critic(state)
+
+        return action.detach(), action_logprob.detach(), state_val.detach()
+    
+    def evaluate(self, state, action):
+        """Score previously taken actions under the current network weights.
+
+        Returns a tuple (action_logprobs, state_values, dist_entropy). Nothing is
+        detached here, so gradients flow through all three.
+        """
+
+        if self.has_continuous_action_space:
+            action_mean = self.actor(state)
+            
+            # Broadcast the fixed variance to the shape of the means, then build
+            # one diagonal covariance matrix per row.
+            action_var = self.action_var.expand_as(action_mean)
+            cov_mat = torch.diag_embed(action_var).to(device)
+            dist = MultivariateNormal(action_mean, cov_mat)
+            
+            # For Single Action Environments.
+            if self.action_dim == 1:
+                action = action.reshape(-1, self.action_dim)
+        else:
+            action_probs = self.actor(state)
+            dist = Categorical(action_probs)
+        action_logprobs = dist.log_prob(action)
+        dist_entropy = dist.entropy()
+        state_values = self.critic(state)
+        
+        return action_logprobs, state_values, dist_entropy
+
+
+class PPO:
+    """PPO agent with a clipped surrogate objective and Monte Carlo returns.
+
+    `policy` is the network being optimized. `policy_old` is the snapshot used to
+    collect rollouts; it is re-synchronized with `policy` at the end of every
+    `update()`. Transitions are stored in `buffer` (a RolloutBuffer).
+    """
+
+    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):
+        """Build both networks, the optimizer and an empty rollout buffer.
+
+        lr_actor / lr_critic are separate Adam learning rates for the two
+        sub-networks. gamma is the discount factor, K_epochs the number of
+        gradient steps per update() and eps_clip the ratio clipping range.
+        action_std_init is only used for continuous action spaces.
+        """
+        self.has_continuous_action_space = has_continuous_action_space
+
+        if has_continuous_action_space:
+            self.action_std = action_std_init
+
+        self.gamma = gamma
+        self.eps_clip = eps_clip
+        self.K_epochs = K_epochs
+
+        self.buffer = RolloutBuffer()
+
+        self.policy = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
+        self.optimizer = torch.optim.Adam([
+                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
+                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
+                    ])
+
+        self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
+        self.policy_old.load_state_dict(self.policy.state_dict())
+
+        self.MseLoss = nn.MSELoss()
+
+    def set_action_std(self, new_action_std):
+        """Set the action std on both networks (continuous case); warn otherwise."""
+        if self.has_continuous_action_space:
+            self.action_std = new_action_std
+            self.policy.set_action_std(new_action_std)
+            self.policy_old.set_action_std(new_action_std)
+        else:
+            print("--------------------------------------------------------------------------------------------")
+            print("WARNING : Calling PPO::set_action_std() on discrete action space policy")
+            print("--------------------------------------------------------------------------------------------")
+
+    def decay_action_std(self, action_std_decay_rate, min_action_std):
+        """Lower the action std by a fixed amount, never going below min_action_std."""
+        print("--------------------------------------------------------------------------------------------")
+        if self.has_continuous_action_space:
+            self.action_std = self.action_std - action_std_decay_rate
+            # Round so repeated subtraction does not accumulate float noise.
+            self.action_std = round(self.action_std, 4)
+            if (self.action_std <= min_action_std):
+                self.action_std = min_action_std
+                print("setting actor output action_std to min_action_std : ", self.action_std)
+            else:
+                print("setting actor output action_std to : ", self.action_std)
+            self.set_action_std(self.action_std)
+
+        else:
+            print("WARNING : Calling PPO::decay_action_std() on discrete action space policy")
+        print("--------------------------------------------------------------------------------------------")
+
+    def select_action(self, state):
+        """Sample an action from the old policy and record the step in the buffer.
+
+        Returns a flattened numpy array for continuous action spaces and a plain
+        Python number for discrete ones. The caller is expected to append the
+        matching reward and terminal flag to the buffer afterwards.
+        """
+        with torch.no_grad():
+            state = torch.FloatTensor(state).to(device)
+            action, action_logprob, state_val = self.policy_old.act(state)
+
+        self.buffer.states.append(state)
+        self.buffer.actions.append(action)
+        self.buffer.logprobs.append(action_logprob)
+        self.buffer.state_values.append(state_val)
+
+        if self.has_continuous_action_space:
+            return action.detach().cpu().numpy().flatten()
+        return action.item()
+
+    def update(self):
+        """Run K_epochs of clipped-PPO gradient steps on the buffered rollout.
+
+        Afterwards the new weights are copied into `policy_old` and the buffer is
+        cleared.
+        """
+        # Monte Carlo estimate of returns. Walk the rollout backwards and append,
+        # then reverse once; inserting at the front of the list is quadratic.
+        returns = []
+        discounted_reward = 0
+        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
+            if is_terminal:
+                discounted_reward = 0
+            discounted_reward = reward + (self.gamma * discounted_reward)
+            returns.append(discounted_reward)
+        returns.reverse()
+
+        # Normalizing the rewards
+        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
+        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)
+
+        # convert list to tensor
+        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
+        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
+        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
+        old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)
+
+        # calculate advantages
+        advantages = rewards.detach() - old_state_values.detach()
+
+        # Optimize policy for K epochs
+        for _ in range(self.K_epochs):
+
+            # Evaluating old actions and values
+            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
+
+            # match state_values tensor dimensions with rewards tensor
+            state_values = torch.squeeze(state_values)
+
+            # Finding the ratio (pi_theta / pi_theta__old)
+            ratios = torch.exp(logprobs - old_logprobs.detach())
+
+            # Finding Surrogate Loss
+            surr1 = ratios * advantages
+            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
+
+            # final loss of clipped objective PPO
+            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy
+
+            # take gradient step
+            self.optimizer.zero_grad()
+            loss.mean().backward()
+            self.optimizer.step()
+
+        # Copy new weights into old policy
+        self.policy_old.load_state_dict(self.policy.state_dict())
+
+        # clear buffer
+        self.buffer.clear()
+
+    def save(self, checkpoint_path):
+        """Write the old policy's state_dict to checkpoint_path."""
+        torch.save(self.policy_old.state_dict(), checkpoint_path)
+
+    def load(self, checkpoint_path):
+        """Load a saved state_dict into both the old and the current policy."""
+        # Read the file once; load_state_dict copies the values into each network,
+        # so the same dict can safely be used for both.
+        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
+        self.policy_old.load_state_dict(state_dict)
+        self.policy.load_state_dict(state_dict)
+        
+        
+       
+
+
--- a/PPO2/plot_graph.py
+++ b/PPO2/plot_graph.py
@ -0,0 +1,142 @@
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def save_graph():
+    """Plot the reward curves logged under PPO_logs/<env_name>/ and save them as a PNG.
+
+    With plot_avg set, all runs are averaged by row index into one curve; otherwise
+    each run gets its own colour. Every curve is drawn twice: a strongly smoothed
+    line and a faint, lightly smoothed one. The figure is written to
+    PPO_figs/<env_name>/ and then shown on screen.
+
+    Raises FileNotFoundError when the log directory is missing or holds no files.
+    """
+    print("============================================================================================")
+    # Other environment names used with this script:
+    # 'CartPole-v1', 'LunarLander-v2', 'BipedalWalker-v2'
+    env_name = 'RoboschoolWalker2d-v1'
+
+    fig_num = 0     #### change this to prevent overwriting figures in same env_name folder
+    plot_avg = True    # plot average of all runs; else plot all runs separately
+    fig_width = 10
+    fig_height = 6
+
+    # smooth out rewards to get a smooth and a less smooth (var) plot lines
+    window_len_smooth = 20
+    min_window_len_smooth = 1
+    linewidth_smooth = 1.5
+    alpha_smooth = 1
+
+    window_len_var = 5
+    min_window_len_var = 1
+    linewidth_var = 2
+    alpha_var = 0.1
+
+    colors = ['red', 'blue', 'green', 'orange', 'purple', 'olive', 'brown', 'magenta', 'cyan', 'crimson','gray', 'black']
+
+    # make the per-environment directory for saving figures (parents included)
+    figures_dir = "PPO_figs" + '/' + env_name + '/'
+    os.makedirs(figures_dir, exist_ok=True)
+
+    fig_save_path = figures_dir + '/PPO_' + env_name + '_fig_' + str(fig_num) + '.png'
+
+    # get number of log files in directory
+    log_dir = "PPO_logs" + '/' + env_name + '/'
+
+    # os.walk() yields nothing for a missing directory, which would surface as a
+    # bare StopIteration from next(); fail with a readable error instead.
+    if not os.path.isdir(log_dir):
+        raise FileNotFoundError("log directory not found : " + log_dir)
+
+    current_num_files = next(os.walk(log_dir))[2]
+    num_runs = len(current_num_files)
+    if num_runs == 0:
+        raise FileNotFoundError("no log files found in : " + log_dir)
+
+    all_runs = []
+
+    # Logs are expected to be numbered 0..num_runs-1, one file per run.
+    for run_num in range(num_runs):
+
+        log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"
+        print("loading data from : " + log_f_name)
+        data = pd.read_csv(log_f_name)
+
+        print("data shape : ", data.shape)
+
+        all_runs.append(data)
+        print("--------------------------------------------------------------------------------------------")
+
+    ax = plt.gca()
+
+    if plot_avg:
+        # average all runs
+        df_concat = pd.concat(all_runs)
+        df_concat_groupby = df_concat.groupby(df_concat.index)
+        data_avg = df_concat_groupby.mean()
+
+        # smooth out rewards to get a smooth and a less smooth (var) plot lines
+        data_avg['reward_smooth'] = data_avg['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean()
+        data_avg['reward_var'] = data_avg['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean()
+
+        # The smooth line must be plotted first: the legend below keeps handles[0].
+        data_avg.plot(kind='line', x='timestep' , y='reward_smooth',ax=ax,color=colors[0],  linewidth=linewidth_smooth, alpha=alpha_smooth)
+        data_avg.plot(kind='line', x='timestep' , y='reward_var',ax=ax,color=colors[0],  linewidth=linewidth_var, alpha=alpha_var)
+
+        # keep only reward_smooth in the legend and rename it
+        handles, _ = ax.get_legend_handles_labels()
+        ax.legend([handles[0]], ["reward_avg_" + str(len(all_runs)) + "_runs"], loc=2)
+
+    else:
+        for i, run in enumerate(all_runs):
+            # smooth out rewards to get a smooth and a less smooth (var) plot lines
+            run['reward_smooth_' + str(i)] = run['reward'].rolling(window=window_len_smooth, win_type='triang', min_periods=min_window_len_smooth).mean()
+            run['reward_var_' + str(i)] = run['reward'].rolling(window=window_len_var, win_type='triang', min_periods=min_window_len_var).mean()
+
+            # plot the lines (smooth first, so it sits at an even legend index)
+            run.plot(kind='line', x='timestep' , y='reward_smooth_' + str(i),ax=ax,color=colors[i % len(colors)],  linewidth=linewidth_smooth, alpha=alpha_smooth)
+            run.plot(kind='line', x='timestep' , y='reward_var_' + str(i),ax=ax,color=colors[i % len(colors)],  linewidth=linewidth_var, alpha=alpha_var)
+
+        # keep alternate elements (reward_smooth_i) in the legend
+        handles, labels = ax.get_legend_handles_labels()
+        ax.legend(handles[::2], labels[::2], loc=2)
+
+    # ax.set_yticks(np.arange(0, 1800, 200))
+    # ax.set_xticks(np.arange(0, int(4e6), int(5e5)))
+
+    ax.grid(color='gray', linestyle='-', linewidth=1, alpha=0.2)
+
+    ax.set_xlabel("Timesteps", fontsize=12)
+    ax.set_ylabel("Rewards", fontsize=12)
+
+    plt.title(env_name, fontsize=14)
+
+    fig = plt.gcf()
+    fig.set_size_inches(fig_width, fig_height)
+
+    print("============================================================================================")
+    plt.savefig(fig_save_path)
+    print("figure saved at : ", fig_save_path)
+    print("============================================================================================")
+
+    plt.show()
+
+
+if __name__ == '__main__':
+
+    save_graph()
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
+    
--- a/PPO2/test.py
+++ b/PPO2/test.py
@ -0,0 +1,123 @@
+import os
+import glob
+import time
+from datetime import datetime
+
+import torch
+import numpy as np
+
+# import gym
+# import roboschool
+
+from PPO import PPO
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from env import PartitionMazeEnv
+
+#################################### Testing ###################################
+def test():
+    """Roll out a saved PPO checkpoint on PartitionMazeEnv and print per-episode rewards.
+
+    The checkpoint is read from PPO_preTrained/<env_name>/, selected by random_seed
+    and run_num_pretrained. The average reward over all test episodes is printed
+    at the end.
+    """
+    heavy_rule = "============================================================================================"
+    light_rule = "--------------------------------------------------------------------------------------------"
+    print(heavy_rule)
+
+    ################## hyperparameters ##################
+    # Settings used upstream for gym environments (env_name, continuous?, max_ep_len, action_std):
+    #   "CartPole-v1",      False, 400,  None
+    #   "LunarLander-v2",   False, 300,  None
+    #   "BipedalWalker-v2", True,  1500, 0.1
+    env_name = "test"
+    has_continuous_action_space = True
+    max_ep_len = 1000           # max timesteps in one episode
+    action_std = 0.1            # set same std for action distribution which was used while saving
+
+    render = True               # render environment on screen
+    frame_delay = 0             # if required; add delay b/w frames
+
+    total_test_episodes = 10    # total num of testing episodes
+
+    # The PPO constructor needs these even though no update happens while testing.
+    K_epochs = 80               # update policy for K epochs
+    eps_clip = 0.2              # clip parameter for PPO
+    gamma = 0.99                # discount factor
+    lr_actor = 0.0003           # learning rate for actor
+    lr_critic = 0.001           # learning rate for critic
+    #####################################################
+
+    env = PartitionMazeEnv()
+
+    # state and action space dimensions
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0] if has_continuous_action_space else env.action_space.n
+
+    # initialize a PPO agent
+    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)
+
+    # preTrained weights directory
+    random_seed = 0             #### set this to load a particular checkpoint trained on random seed
+    run_num_pretrained = 0      #### set this to load a particular checkpoint num
+
+    directory = "PPO_preTrained" + '/' + env_name + '/'
+    checkpoint_path = directory + f"PPO_{env_name}_{random_seed}_{run_num_pretrained}.pth"
+    print("loading network from : " + checkpoint_path)
+
+    ppo_agent.load(checkpoint_path)
+
+    print(light_rule)
+
+    test_running_reward = 0
+
+    for episode in range(1, total_test_episodes + 1):
+        episode_return = 0
+        state = env.reset()
+
+        for _ in range(max_ep_len):
+            action = ppo_agent.select_action(state)
+            state, reward, done, _, _ = env.step(action)
+            episode_return += reward
+
+            if render:
+                env.render()
+                time.sleep(frame_delay)
+
+            if done:
+                break
+
+        # select_action() records every step; drop them, nothing is trained here
+        ppo_agent.buffer.clear()
+
+        test_running_reward += episode_return
+        print(f'Episode: {episode} \t\t Reward: {round(episode_return, 2)}')
+
+    env.close()
+
+    print(heavy_rule)
+
+    avg_test_reward = round(test_running_reward / total_test_episodes, 2)
+    print("average test reward : " + str(avg_test_reward))
+
+    print(heavy_rule)
+
+
+if __name__ == '__main__':
+
+    test()
--- a/PPO2/train.py
+++ b/PPO2/train.py
@ -0,0 +1,266 @@
+import os
+import glob
+import time
+from datetime import datetime
+
+import torch
+import numpy as np
+
+# import gym
+# import roboschool
+import gymnasium as gym
+
+from PPO import PPO
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from env import PartitionMazeEnv
+
+################################### Training ###################################
+def train():
+    """Train a PPO agent on PartitionMazeEnv with periodic logging and checkpointing.
+
+    Average episode rewards are appended to a CSV under PPO_logs/<env_name>/ (a new
+    file per run) and the policy weights are saved under PPO_preTrained/<env_name>/.
+    All hyperparameters are set in the body below.
+    """
+    print("============================================================================================")
+
+    ####### initialize environment hyperparameters ######
+    env_name = "test"
+
+    has_continuous_action_space = True  # continuous action space; else discrete
+
+    max_ep_len = 100                   # max timesteps in one episode
+    max_training_timesteps = int(3e8)   # break training loop if timesteps > max_training_timesteps
+
+    print_freq = max_ep_len * 10        # print avg reward in the interval (in num timesteps)
+    log_freq = max_ep_len * 2           # log avg reward in the interval (in num timesteps)
+    save_model_freq = int(1e5)          # save model frequency (in num timesteps)
+
+    action_std = 0.6                    # starting std for action distribution (Multivariate Normal)
+    action_std_decay_rate = 0.05        # linearly decay action_std (action_std = action_std - action_std_decay_rate)
+    min_action_std = 0.1                # minimum action_std (stop decay after action_std <= min_action_std)
+    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
+    #####################################################
+
+    ## Note : print/log frequencies should be greater than max_ep_len
+
+    ################ PPO hyperparameters ################
+    update_timestep = max_ep_len * 4      # update policy every n timesteps
+    K_epochs = 80               # update policy for K epochs in one PPO update
+
+    eps_clip = 0.2          # clip parameter for PPO
+    gamma = 0.99            # discount factor
+
+    lr_actor = 0.0003       # learning rate for actor network
+    lr_critic = 0.001       # learning rate for critic network
+
+    random_seed = 0         # set random seed if required (0 = no random seed)
+    #####################################################
+
+    print("training environment name : " + env_name)
+
+    # env = gym.make(env_name)
+    env = PartitionMazeEnv()
+
+    # state space dimension
+    state_dim = env.observation_space.shape[0]
+
+    # action space dimension
+    if has_continuous_action_space:
+        action_dim = env.action_space.shape[0]
+    else:
+        action_dim = env.action_space.n
+
+    ###################### logging ######################
+
+    #### log files for multiple runs are NOT overwritten
+    log_dir = "PPO_logs" + '/' + env_name + '/'
+    os.makedirs(log_dir, exist_ok=True)
+
+    #### the run number is the count of files already in the log directory
+    current_num_files = next(os.walk(log_dir))[2]
+    run_num = len(current_num_files)
+
+    #### create new log file for each run
+    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"
+
+    print("current logging run number for " + env_name + " : ", run_num)
+    print("logging at : " + log_f_name)
+    #####################################################
+
+    ################### checkpointing ###################
+    run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder
+
+    directory = "PPO_preTrained" + '/' + env_name + '/'
+    os.makedirs(directory, exist_ok=True)
+
+    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
+    print("save checkpoint path : " + checkpoint_path)
+    #####################################################
+
+
+    ############# print all hyperparameters #############
+    print("--------------------------------------------------------------------------------------------")
+    print("max training timesteps : ", max_training_timesteps)
+    print("max timesteps per episode : ", max_ep_len)
+    print("model saving frequency : " + str(save_model_freq) + " timesteps")
+    print("log frequency : " + str(log_freq) + " timesteps")
+    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
+    print("--------------------------------------------------------------------------------------------")
+    print("state space dimension : ", state_dim)
+    print("action space dimension : ", action_dim)
+    print("--------------------------------------------------------------------------------------------")
+    if has_continuous_action_space:
+        print("Initializing a continuous action space policy")
+        print("--------------------------------------------------------------------------------------------")
+        print("starting std of action distribution : ", action_std)
+        print("decay rate of std of action distribution : ", action_std_decay_rate)
+        print("minimum std of action distribution : ", min_action_std)
+        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
+    else:
+        print("Initializing a discrete action space policy")
+    print("--------------------------------------------------------------------------------------------")
+    print("PPO update frequency : " + str(update_timestep) + " timesteps")
+    print("PPO K epochs : ", K_epochs)
+    print("PPO epsilon clip : ", eps_clip)
+    print("discount factor (gamma) : ", gamma)
+    print("--------------------------------------------------------------------------------------------")
+    print("optimizer learning rate actor : ", lr_actor)
+    print("optimizer learning rate critic : ", lr_critic)
+    if random_seed:
+        print("--------------------------------------------------------------------------------------------")
+        print("setting random seed to ", random_seed)
+        torch.manual_seed(random_seed)
+        # NOTE(review): assumes the environment exposes seed(); this branch is
+        # not reached while random_seed is 0 - confirm before enabling seeding.
+        env.seed(random_seed)
+        np.random.seed(random_seed)
+    #####################################################
+
+    print("============================================================================================")
+
+    ################# training procedure ################
+
+    # initialize a PPO agent
+    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)
+
+    # track total training time
+    # NOTE(review): datetime.now() is local time although the messages say "(GMT)".
+    start_time = datetime.now().replace(microsecond=0)
+    print("Started training at (GMT) : ", start_time)
+
+    print("============================================================================================")
+
+    # printing and logging variables
+    print_running_reward = 0
+    print_running_episodes = 0
+
+    log_running_reward = 0
+    log_running_episodes = 0
+
+    time_step = 0
+    i_episode = 0
+
+    # logging file; the context manager closes it even if training raises
+    with open(log_f_name, "w+") as log_f:
+        log_f.write('episode,timestep,reward\n')
+
+        # training loop
+        while time_step <= max_training_timesteps:
+
+            state = env.reset()
+            current_ep_reward = 0
+
+            for t in range(1, max_ep_len+1):
+
+                # select action with policy
+                action = ppo_agent.select_action(state)
+                state, reward, done, _, _ = env.step(action)
+
+                # saving reward and is_terminals
+                # An episode cut off by max_ep_len must also be flagged as ended:
+                # PPO.update() resets its discounted return only on this flag, so
+                # otherwise rewards of the next episode leak into this one.
+                episode_over = bool(done) or t == max_ep_len
+                ppo_agent.buffer.rewards.append(reward)
+                ppo_agent.buffer.is_terminals.append(episode_over)
+
+                time_step +=1
+                current_ep_reward += reward
+
+                # update PPO agent
+                if time_step % update_timestep == 0:
+                    ppo_agent.update()
+
+                # if continuous action space; then decay action std of output action distribution
+                if has_continuous_action_space and time_step % action_std_decay_freq == 0:
+                    ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)
+
+                # log in logging file (skipped when no episode finished in this
+                # interval, which would otherwise divide by zero)
+                if time_step % log_freq == 0 and log_running_episodes > 0:
+
+                    # log average reward till last episode
+                    log_avg_reward = log_running_reward / log_running_episodes
+                    log_avg_reward = round(log_avg_reward, 4)
+
+                    log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
+                    log_f.flush()
+
+                    log_running_reward = 0
+                    log_running_episodes = 0
+
+                # printing average reward (same zero-episode guard as above)
+                if time_step % print_freq == 0 and print_running_episodes > 0:
+
+                    # print average reward till last episode
+                    print_avg_reward = print_running_reward / print_running_episodes
+                    print_avg_reward = round(print_avg_reward, 2)
+
+                    print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))
+
+                    print_running_reward = 0
+                    print_running_episodes = 0
+
+                # save model weights
+                if time_step % save_model_freq == 0:
+                    print("--------------------------------------------------------------------------------------------")
+                    print("saving model at : " + checkpoint_path)
+                    ppo_agent.save(checkpoint_path)
+                    print("model saved")
+                    print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
+                    print("--------------------------------------------------------------------------------------------")
+
+                # break; if the episode is over
+                if done:
+                    break
+
+            print_running_reward += current_ep_reward
+            print_running_episodes += 1
+
+            log_running_reward += current_ep_reward
+            log_running_episodes += 1
+
+            i_episode += 1
+
+    env.close()
+
+    # print total training time
+    print("============================================================================================")
+    end_time = datetime.now().replace(microsecond=0)
+    print("Started training at (GMT) : ", start_time)
+    print("Finished training at (GMT) : ", end_time)
+    print("Total training time  : ", end_time - start_time)
+    print("============================================================================================")
+
+
+if __name__ == '__main__':
+
+    train()
+    
+    
+    
+    
+    
+    
+    
--- a/PPO/env.py
+++ b/PPO/env.py
@ -64,7 +64,7 @@ class PartitionMazeEnv(gym.Env):
        self.BASE_LINE = 3500.0     # 基准时间，通过greedy或者蒙特卡洛计算出来
        self.step_count = 0
        self.rectangles = {}
-        self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)]
+        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0

@ -79,13 +79,13 @@ class PartitionMazeEnv(gym.Env):
        self.region_centers = []
        self.step_count = 0
        self.rectangles = {}
-        self.car_pos = [(0.5, 0.5) for _ in range(self.num_cars)]
+        self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
        self.car_traj = [[] for _ in range(self.num_cars)]
        self.current_car_index = 0
        # 状态：前 4 维为 partition_values，其余补 0
        state = np.concatenate(
            [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
-        return state, {}
+        return state

    def step(self, action):
        # 在所有阶段动作均为 1 维连续动作，取 action[0]
@ -153,12 +153,14 @@ class PartitionMazeEnv(gym.Env):
                        [self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
                    return state, reward, True, False, {}
                else:
-                    reward = 10
-
                    # 进入阶段 1：初始化迷宫
                    self.phase = 1
                    state = np.concatenate(
                        [self.partition_values, np.array(self.car_pos).flatten()])
+                    reward = 10
+                    
+                    # 构建反向索引，方便后续计算
+                    self.reverse_rectangles = {v['center']: k for k, v in self.rectangles.items()}
                    return state, reward, False, False, {}

        elif self.phase == 1:
@ -172,7 +174,7 @@ class PartitionMazeEnv(gym.Env):
            # 将index映射到笛卡尔坐标
            coord = (target_region_index // (len(self.col_cuts) - 1),
                     target_region_index % (len(self.col_cuts) - 1))
-            self.car_pos[self.init_maze_step] = coord
+            self.car_pos[self.init_maze_step] = self.rectangles[coord]['center']
            self.car_traj[self.init_maze_step].append(coord)
            self.rectangles[coord]['is_visited'] = True

@ -190,7 +192,8 @@ class PartitionMazeEnv(gym.Env):
        elif self.phase == 2:
            # 阶段 2：路径规划（走迷宫）
            current_car = self.current_car_index
-            current_row, current_col = self.car_pos[current_car]
+            # 查表，找出当前车辆所在的网格
+            current_row, current_col = self.reverse_rectangles[self.car_pos[current_car]]

            # 当前动作 a 为 1 维连续动作，映射到四个方向
            if a < 0.2:
@ -219,7 +222,8 @@ class PartitionMazeEnv(gym.Env):
            # TODO 移动不合法，加一些惩罚

            # 更新车辆位置
-            self.car_pos[current_car] = (new_row, new_col)
+            self.car_pos[current_car] = self.rectangles[(
+                new_row, new_col)]['center']
            if new_row != current_row or new_col != current_col:
                self.car_traj[current_car].append((new_row, new_col))
            self.step_count += 1
--- a/mtkl_sovler.py
+++ b/mtkl_sovler.py
@ -6,7 +6,7 @@ import json
 # 固定随机种子，便于复现
 random.seed(42)

-num_iterations = 10000
+num_iterations = 1000000

 # ---------------------------
 # 参数设置
--- a/ray/atari_ppo.py
+++ b/ray/atari_ppo.py
@ -0,0 +1,97 @@
+# These tags allow extracting portions of this script on Anyscale.
+# ws-template-imports-start
+import gymnasium as gym
+
+from ray import tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.connectors.env_to_module.frame_stacking import FrameStackingEnvToModule
+from ray.rllib.connectors.learner.frame_stacking import FrameStackingLearner
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.env.wrappers.atari_wrappers import wrap_atari_for_new_api_stack
+from ray.rllib.utils.test_utils import add_rllib_example_script_args
+
+# ws-template-imports-end
+# CLI defaults; the inf reward target presumably disables reward-based stopping (confirm).
+parser = add_rllib_example_script_args(
+    default_reward=float("inf"),
+    default_timesteps=3000000,
+    default_iters=100000000000,
+)
+parser.set_defaults(
+    enable_new_api_stack=True,
+    env="ale_py:ALE/Pong-v5",
+)
+# Use `parser` to add your own custom command line options to this script
+# and (if needed) use their values to set up `config` below.
+args = parser.parse_args()
+# Fall back to one learner when the flag is unset or 0; the learning rate below scales with it.
+NUM_LEARNERS = args.num_learners or 1
+ENV = args.env
+
+
+# These tags allow extracting portions of this script on Anyscale.
+# ws-template-code-start
+def _make_env_to_module_connector(env):
+    return FrameStackingEnvToModule(num_frames=4)
+
+
+def _make_learner_connector(input_observation_space, input_action_space):
+    return FrameStackingLearner(num_frames=4)
+
+
+# Create a custom Atari setup (w/o the usual RLlib-hard-coded framestacking in it).
+# We would like our frame stacking connector to do this job.
+def _env_creator(cfg):
+    return wrap_atari_for_new_api_stack(
+        gym.make(ENV, **cfg, render_mode="rgb_array"),
+        # Perform frame-stacking through ConnectorV2 API.
+        framestack=None,
+    )
+
+
+tune.register_env("env", _env_creator)
+
+config = (
+    PPOConfig()
+    .environment(
+        "env",
+        env_config={
+            # Make analogous to old v4 + NoFrameskip.
+            "frameskip": 1,
+            "full_action_space": False,
+            "repeat_action_probability": 0.0,
+        },
+        clip_rewards=True,
+    )
+    .env_runners(
+        env_to_module_connector=_make_env_to_module_connector,
+    )
+    .training(
+        learner_connector=_make_learner_connector,
+        train_batch_size_per_learner=4000,
+        minibatch_size=128,
+        lambda_=0.95,
+        kl_coeff=0.5,
+        clip_param=0.1,
+        vf_clip_param=10.0,
+        entropy_coeff=0.01,
+        num_epochs=10,
+        lr=0.00015 * NUM_LEARNERS,
+        grad_clip=100.0,
+        grad_clip_by="global_norm",
+    )
+    .rl_module(
+        model_config=DefaultModelConfig(
+            conv_filters=[[16, 4, 2], [32, 4, 2], [64, 4, 2], [128, 4, 2]],
+            conv_activation="relu",
+            head_fcnet_hiddens=[256],
+            vf_share_layers=True,
+        ),
+    )
+)
+# ws-template-code-end
+
+if __name__ == "__main__":
+    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment
+
+    run_rllib_example_script_experiment(config, args=args)
--- a/ray/cartpole_ppo.py
+++ b/ray/cartpole_ppo.py
@ -0,0 +1,32 @@
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.utils.test_utils import add_rllib_example_script_args
+# CLI defaults: reward 450.0 and 300000 timesteps (presumably the stop criteria - confirm).
+parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
+parser.set_defaults(enable_new_api_stack=True)
+# Use `parser` to add your own custom command line options to this script
+# and (if needed) use their values to set up `config` below.
+args = parser.parse_args()
+# PPO on CartPole-v1 with a single 32-unit linear hidden layer (vf_share_layers=True).
+config = (
+    PPOConfig()
+    .environment("CartPole-v1")
+    .training(
+        lr=0.0003,
+        num_epochs=6,
+        vf_loss_coeff=0.01,
+    )
+    .rl_module(
+        model_config=DefaultModelConfig(
+            fcnet_hiddens=[32],
+            fcnet_activation="linear",
+            vf_share_layers=True,
+        ),
+    )
+)
+
+
+if __name__ == "__main__":
+    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment
+
+    run_rllib_example_script_experiment(config, args)
--- a/ray/partition_maze_ppo.py
+++ b/ray/partition_maze_ppo.py
@ -0,0 +1,38 @@
+import gymnasium as gym
+from ray import tune
+from ray.rllib.algorithms.ppo import PPOConfig
+from ray.rllib.core.rl_module.default_model_config import DefaultModelConfig
+from ray.rllib.utils.test_utils import add_rllib_example_script_args
+from env import PartitionMazeEnv  # custom environment
+
+# Register via Tune (as ray/atari_ppo.py does): Tune shares the creator with
+# every Ray process; a gym registry entry would exist only in this driver.
+# NOTE(review): RLlib expects reset() to return gymnasium's (obs, info) pair;
+# confirm PartitionMazeEnv.reset() does, rather than the bare state.
+tune.register_env("PartitionMazeEnv-v0", lambda cfg: PartitionMazeEnv())
+
+parser = add_rllib_example_script_args(default_reward=450.0, default_timesteps=300000)
+parser.set_defaults(enable_new_api_stack=True)
+args = parser.parse_args()
+# PPO with a single 32-unit linear hidden layer (vf_share_layers=True).
+config = (
+    PPOConfig()
+    .environment("PartitionMazeEnv-v0")
+    .training(
+        lr=0.0003,
+        num_epochs=6,
+        vf_loss_coeff=0.01,
+    )
+    .rl_module(
+        model_config=DefaultModelConfig(
+            fcnet_hiddens=[32],
+            fcnet_activation="linear",
+            vf_share_layers=True,
+        ),
+    )
+)
+
+if __name__ == "__main__":
+    from ray.rllib.utils.test_utils import run_rllib_example_script_experiment
+
+    run_rllib_example_script_experiment(config, args=args)