diff --git a/Duel_Double_DQN/DQN.py b/Duel_Double_DQN/DQN.py
index d03adf8..92773a8 100644
--- a/Duel_Double_DQN/DQN.py
+++ b/Duel_Double_DQN/DQN.py
@@ -69,11 +69,11 @@ class DQN_agent(object):
             else:
                 if state[0][0] == 0:
                     q_value = self.q_net(state)
-                    q_value[10:] = - float('inf')
+                    q_value[0][10:] = - float('inf')
                     a = q_value.argmax().item()
                 else:
                     q_value = self.q_net(state)
-                    q_value[:10] = - float('inf')
+                    q_value[0][:10] = - float('inf')
                     a = q_value.argmax().item()
         return a
diff --git a/Duel_Double_DQN/main.py b/Duel_Double_DQN/main.py
index fff5c96..e470bdf 100644
--- a/Duel_Double_DQN/main.py
+++ b/Duel_Double_DQN/main.py
@@ -1,15 +1,17 @@
+from DQN import DQN_agent
+from datetime import datetime
+from utils import evaluate_policy, str2bool
 import gymnasium as gym
-import os
 import shutil
 import argparse
 import torch
 import numpy as np
+# fmt: off
 import sys
+import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env_dis import PartitionMazeEnv
-from utils import evaluate_policy, str2bool
-from datetime import datetime
-from DQN import DQN_agent
+# fmt: on
 
 '''Hyperparameter Setting'''
 parser = argparse.ArgumentParser()
@@ -66,7 +68,6 @@ def main():
     eval_env = PartitionMazeEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.n
-    opt.max_e_steps = 50
 
     # Algorithm Setting
     if opt.Duel:
@@ -87,7 +88,7 @@ def main():
     print("Random Seed: {}".format(opt.seed))
     print('Algorithm:', algo_name, ' Env:', BriefEnvName[opt.EnvIdex], ' state_dim:', opt.state_dim,
-          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, ' max_e_steps:', opt.max_e_steps, '\n')
+          ' action_dim:', opt.action_dim, ' Random Seed:', opt.seed, '\n')
 
     if opt.write:
         from torch.utils.tensorboard import SummaryWriter
diff --git a/Duel_Double_DQN/utils.py b/Duel_Double_DQN/utils.py
index bd342ff..362efa6 100644
--- a/Duel_Double_DQN/utils.py
+++ b/Duel_Double_DQN/utils.py
@@ -1,16 +1,21 @@
+import numpy as np
+
 def evaluate_policy(env, agent, turns = 3):
     total_scores = 0
     for j in range(turns):
         s = env.reset()
         done = False
+        action_series = []
         while not done:
             # Take deterministic actions at test time
             a = agent.select_action(s, deterministic=True)
             s_next, r, dw, tr, info = env.step(a)
             done = (dw or tr)
-
+            action_series.append(a)
             total_scores += r
             s = s_next
+        print('action series: ', np.round(action_series, 3))
+        print('state: ', s)
     return int(total_scores/turns)
diff --git a/env.py b/env.py
index 0ad2e18..789c4bb 100644
--- a/env.py
+++ b/env.py
@@ -71,6 +71,7 @@ class PartitionMazeEnv(gym.Env):
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
+        self.previous_T = 0
 
     def reset(self, seed=None, options=None):
         # Reset all variables and go back to the partition phase (phase 0)
@@ -290,9 +291,19 @@
                 # Coverage is complete; compute each motorcade's execution time from its trajectory
                 T = max([self._compute_motorcade_time(idx)
                         for idx in range(self.num_cars)])
-                reward += self.BASE_LINE / T * 1000
-                # reward += self.BASE_LINE - T
-                # print(reward)
+                # TODO make the reward vary more sharply around the baseline
+                # reward = math.exp(-T / self.BASE_LINE) * 1000
+                reward = self.BASE_LINE / T * 1000
+                if T < self.BASE_LINE:
+                    reward *= 10
+                print(reward)
+
+
+                # if reward > self.BASE_LINE:
+                #     reward -= 200
+                # # TODO compute len(self.car_traj); the trajectory recording rule needs changing
+                # reward -= 10 * self.step_count
+                # TODO dynamically adjust the baseline
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000
diff --git a/env_dis.py b/env_dis.py
index e96715f..277057b 100644
--- a/env_dis.py
+++ b/env_dis.py
@@ -41,6 +41,7 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         self.CUT_NUM = 4  # half of the cuts are horizontal, half vertical
         self.BASE_LINE = 4000  # baseline time, computed via greedy or Monte Carlo
+        self.MAX_STEPS = 50  # upper limit on maze-walking steps
 
         self.phase = 0  # phase control: 0 = partitioning, 1 = maze initialization, 2 = maze walking
         self.partition_step = 0  # partition-phase step counter, range 0~4
@@ -52,11 +53,11 @@
         self.action_space = spaces.Discrete(15)
 
         # Define the observation space as an 8-dimensional vector
-        # TODO the returned state currently only contains position coordinates
        # Phase 0 state: the first 4 dims hold the decided cut values (undecided entries are 0)
         # Phase 1 state: vehicle positions (2D)
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
         self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + 2 * self.num_cars,), dtype=np.float32)
+            low=0.0, high=1.0, shape=(1 + self.CUT_NUM + max_regions,), dtype=np.float32)
 
         # Partition-phase variables
         self.col_cuts = []  # vertical cut positions (c₁, c₂); a value of 0 means no cut
@@ -65,7 +66,6 @@
         self.init_maze_step = 0
 
         # Path-planning-phase variables
-        self.MAX_STEPS = 50  # upper limit on maze-walking steps
         self.step_count = 0
         self.rectangles = {}
         self.car_pos = [(self.H / 2, self.W / 2) for _ in range(self.num_cars)]
@@ -87,8 +87,12 @@
         self.car_traj = [[] for _ in range(self.num_cars)]
         self.current_car_index = 0
         # State: the first 4 dims are partition_values, the rest padded with 0
-        state = np.concatenate(
-            [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+        max_regions = (self.CUT_NUM // 2 + 1) ** 2
+        state = np.concatenate([
+            [self.phase],
+            self.partition_values,
+            np.zeros(max_regions, dtype=np.float32)
+        ])
         return state
 
     def step(self, action):
@@ -102,7 +106,7 @@
             # Build the current state: the first partition_step entries are decided values, the rest 0, then pad 7 more zeros
             state = np.concatenate(
                 [[self.phase], self.partition_values, np.zeros(
-                    np.array(self.car_pos).flatten().shape[0], dtype=np.float32)]
+                    (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
             )
 
             # If fewer than 4 steps are done we are still in the partition phase: no reward, done is False
@@ -153,7 +157,9 @@
             if not valid_partition:
                 reward = -10000
                 state = np.concatenate(
-                    [[self.phase], self.partition_values, np.zeros(np.array(self.car_pos).flatten().shape[0], dtype=np.float32)])
+                    [[self.phase], self.partition_values, np.zeros(
+                        (self.CUT_NUM // 2 + 1) ** 2, dtype=np.float32)]
+                )
                 return state, reward, True, False, {}
             else:
                 # Initialize the maze
@@ -184,10 +190,21 @@
                 # Enter phase 2: maze walking
                 self.phase = 2
+
+                # Build the visit-status vector
+                max_regions = (self.CUT_NUM // 2 + 1) ** 2
+                visit_status = np.zeros(max_regions, dtype=np.float32)
+
+                # Fill in the visit status of the actual regions
+                for i in range(len(self.row_cuts) - 1):
+                    for j in range(len(self.col_cuts) - 1):
+                        idx = i * (len(self.col_cuts) - 1) + j
+                        visit_status[idx] = float(
+                            self.rectangles[(i, j)]['is_visited'])
+                for i in range(idx + 1, max_regions):
+                    visit_status[i] = 100
                 state = np.concatenate(
-                    [[self.phase], self.partition_values,
-                     np.array(self.car_pos).flatten()]
-                )
+                    [[self.phase], self.partition_values, visit_status])
                 return state, reward, False, False, {}
 
         elif self.phase == 2:
@@ -224,9 +241,20 @@
                     self.rectangles[(new_row, new_col)]['is_visited'] = True
 
             # Observation state
-            state = np.concatenate(
-                [[self.phase], self.partition_values, np.array(self.car_pos).flatten()])
             reward = 0
+            max_regions = (self.CUT_NUM // 2 + 1) ** 2
+            visit_status = np.zeros(max_regions, dtype=np.float32)
+
+            # Fill in the visit status of the actual regions
+            for i in range(len(self.row_cuts) - 1):
+                for j in range(len(self.col_cuts) - 1):
+                    idx = i * (len(self.col_cuts) - 1) + j
+                    visit_status[idx] = float(
+                        self.rectangles[(i, j)]['is_visited'])
+            for i in range(idx + 1, max_regions):
+                visit_status[i] = 100
+            state = np.concatenate(
+                [[self.phase], self.partition_values, visit_status])
 
             # Episode termination: all grid cells visited, or the step count reaches the limit
             done = all([value['is_visited'] for _, value in self.rectangles.items()]) or (
@@ -238,7 +266,7 @@
                 # print(T)
                 # print(self.partition_values)
                 # print(self.car_traj)
-                reward += self.BASE_LINE / T * 100
+                reward += self.BASE_LINE / T * 1000
             elif done and self.step_count >= self.MAX_STEPS:
                 reward += -1000
diff --git a/human_action.py b/human_action.py
index d2565d2..6adcbff 100644
--- a/human_action.py
+++ b/human_action.py
@@ -1,17 +1,18 @@
-from env import PartitionMazeEnv
-# from env_dis import PartitionMazeEnv
+# from env import PartitionMazeEnv
+from env_dis import PartitionMazeEnv
 
 env = PartitionMazeEnv()
 
 state = env.reset()
 print(state)
 
-action_series = [[0], [0], [0.4], [0], [0.1]]
-# action_series = [0, 0, 3, 0, 0, 10]
+# action_series = [[0], [0], [0.4], [0], [0.1]]
+action_series = [0, 0, 3, 0, 10]
 
 for i in range(100):
     action = action_series[i]
     state, reward, done, info, _ = env.step(action)
-    print(state, reward, done, info)
+    print(state)
+    print(reward)
     if done:
         break