调整奖励函数

2025-03-31 11:12:01 +08:00 · 2025-03-31 11:12:01 +08:00 · dab8f4fd8f
commit dab8f4fd8f
parent 84f69f4293
1 changed files with 38 additions and 18 deletions
--- a/env_partion_dist.py
+++ b/env_partion_dist.py
@ -37,6 +37,7 @@ class PartitionEnv(gym.Env):
        self.col_cuts = self.ORI_COL_CUTS[:]
        self.rectangles = []
        self.adjust_step = 0
        self.best_path = None
        # 车队参数设置
        with open(self.params + '.yml', 'r', encoding='utf-8') as file:
@ -64,6 +65,7 @@ class PartitionEnv(gym.Env):
        self.col_cuts = self.ORI_COL_CUTS[:]
        self.rectangles = []
        self.adjust_step = 0
        self.best_path = None
        # 状态：前 4 维为 partition_values，其余为区域访问状态（初始全0）
        state = np.array(self.row_cuts + self.col_cuts)
@ -90,37 +92,34 @@ class PartitionEnv(gym.Env):
        elif action == 9:
            pass
        self.adjust_step += 1
        state = np.array(self.row_cuts + self.col_cuts)
        if self.row_cuts[0] < self.row_cuts[1] < self.row_cuts[2] < self.row_cuts[3] < self.row_cuts[4] and self.col_cuts[0] < self.col_cuts[1] < self.col_cuts[2]:
-            # 调整合法，验证分区情况是否满足条件
+            # 调整是合法的，验证分区情况是否满足条件
            rectangles = self.if_valid_partition()
            if not rectangles:
-                # 不满足条件，结束
+                # 不满足条件，时间给一个很大的值
-                reward = -10000
+                best_time = self.BASE_LINE * 2
                return state, reward, True, False, {}
            else:
                # 满足条件，继续进行路径规划
                # 每隔10步计算一次路径，第一次也需要计算路径，记录最佳路径
-                if self.adjust_step % 10 == 0 or self.adjust_step == 1:
+                if self.adjust_step % 10 == 0 or self.adjust_step == 1 or self.best_path is None:
                    best_time, self.best_path = self.ga_solver(rectangles)
                else:
                    # 根据最佳路径计算当前时间
                    best_time = self.get_best_time(self.best_path, rectangles)
-                reward = self.BASE_LINE - best_time
+        else:
            # 调整不合法，时间给一个很大的值
            best_time = self.BASE_LINE * 2
        reward = self.calc_reward(best_time)
        self.adjust_step += 1
        state = np.array(self.row_cuts + self.col_cuts)
        if self.adjust_step < self.MAX_ADJUST_STEP:
-                    done = False
+            return state, reward, False, False, {}
        else:
-                    done = True
+            return state, reward, True, False, {}
                return state, reward, done, False, self.best_path
        else:
            # 调整不合法，结束
            return state, -10, True, False, {}
    def if_valid_partition(self):
        rectangles = []
@ -220,6 +219,27 @@ class PartitionEnv(gym.Env):
        best_time = ga.compute_pathlen(best_path)
        return best_time
    def calc_reward(self, best_time):
        """
        Compute the reward for the current step.

        The reward is the product of two logistic (sigmoid) terms multiplied
        by a fixed scale of 10:

        * a time term, ``sigmoid((self.BASE_LINE - best_time) / 20)``, which
          grows as ``best_time`` drops below ``self.BASE_LINE``;
        * a step term, ``sigmoid(self.adjust_step / 10)``, which grows with
          the number of adjustment steps taken so far.

        Both sigmoids are evaluated through the identity
        ``sigmoid(x) == 0.5 * (1 + tanh(x / 2))``. The direct form
        ``1 / (1 + exp(-x))`` overflows in ``exp`` for large negative ``x``
        (for example when ``best_time`` is much larger than
        ``self.BASE_LINE``) and emits a NumPy ``RuntimeWarning``; the tanh
        form saturates cleanly instead.

        Args:
            best_time (float): Time of the current path.

        Returns:
            float: The reward, a value between 0 and 10.
        """
        time_diff = self.BASE_LINE - best_time
        # Normalised time difference: sigmoid(time_diff / 20).
        normalized_diff = 0.5 * (1.0 + np.tanh(time_diff / 40))
        # Step weight: sigmoid(adjust_step / 10).
        step_weight = 0.5 * (1.0 + np.tanh(self.adjust_step / 20))
        # Final reward; 10 is the scaling factor.
        reward = normalized_diff * step_weight * 10
        return reward
    def render(self):
        if self.phase == 1:
            print("Phase 1: Initialize maze environment.")