From f347ca8276a6becb9201fac37e98fbaa3613d952 Mon Sep 17 00:00:00 2001
From: weixin_46229132
Date: Sat, 29 Mar 2025 16:28:30 +0800
Subject: [PATCH] Fine-tune partitioning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 PPO_Continuous/main.py  |  16 +-
 PPO_Continuous/utils.py |   6 +-
 env.py                  |   7 +-
 env_partion.py          | 217 ++++++++++++++++++++--------------------
 human_action.py         |   2 +-
 5 files changed, 128 insertions(+), 120 deletions(-)

diff --git a/PPO_Continuous/main.py b/PPO_Continuous/main.py
index f306702..b44ea85 100644
--- a/PPO_Continuous/main.py
+++ b/PPO_Continuous/main.py
@@ -10,6 +10,7 @@ import sys
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from env_partion import PartitionEnv
+# from env import PartitionMazeEnv
 # fmt: on

 '''Hyperparameter Setting'''
@@ -18,7 +19,7 @@ parser.add_argument('--dvc', type=str, default='cpu',
                     help='running device: cuda or cpu')
 parser.add_argument('--EnvIdex', type=int, default=0,
                     help='PM_PPO_Con, PV1, Lch_Cv2, Humanv4, HCv4, BWv3, BWHv3')
-parser.add_argument('--write', type=str2bool, default=True,
+parser.add_argument('--write', type=str2bool, default=False,
                     help='Use SummaryWriter to record the training')
 parser.add_argument('--render', type=str2bool, default=False,
                     help='Render or Not')
@@ -28,7 +29,7 @@ parser.add_argument('--ModelIdex', type=int, default=500,
                     help='which model to load')
 parser.add_argument('--seed', type=int, default=0,
                     help='random seed')
-parser.add_argument('--T_horizon', type=int, default=20,
+parser.add_argument('--T_horizon', type=int, default=15,
                     help='length of long trajectory')
 parser.add_argument('--Distribution', type=str, default='Beta',
                     help='Should be one of Beta ; GS_ms ; GS_m')
@@ -36,7 +37,7 @@ parser.add_argument('--Max_train_steps', type=int, default=int(5e8),
                     help='Max training steps')
 parser.add_argument('--save_interval', type=int, default=int(5e5),
                     help='Model saving interval, in steps.')
-parser.add_argument('--eval_interval', type=int, default=int(5e1),
+parser.add_argument('--eval_interval', type=int, default=int(5e3),
                     help='Model evaluating interval, in steps.')
 parser.add_argument('--gamma', type=float, default=0.99,
@@ -74,10 +75,10 @@ def main():
                'Humanv4', 'HCv4', 'BWv3', 'BWHv3']

     # Build Env
-    # env = gym.make(EnvName[opt.EnvIdex], render_mode = "human" if opt.render else None)
     env = PartitionEnv()
-    # eval_env = gym.make(EnvName[opt.EnvIdex])
+    # env = PartitionMazeEnv()
     eval_env = PartitionEnv()
+    # eval_env = PartitionMazeEnv()
     opt.state_dim = env.observation_space.shape[0]
     opt.action_dim = env.action_space.shape[0]
     opt.max_action = float(env.action_space.high[0])
@@ -129,9 +130,9 @@ def main():
             '''Interact with Env'''
             a, logprob_a = agent.select_action(
                 s, deterministic=False)  # use stochastic when training
-            # act = Action_adapter(a,opt.max_action)  # [0,1] to [-max,max]
+            act = Action_adapter(a, opt.max_action)  # [0,1] to [-max,max]
             s_next, r, dw, tr, info = env.step(
-                a)  # dw: dead&win; tr: truncated
+                act)  # dw: dead&win; tr: truncated
             # r = Reward_adapter(r, opt.EnvIdex)
             done = (dw or tr)

@@ -152,6 +153,7 @@ def main():
             # evaluate the policy for 3 times, and get averaged result
             score = evaluate_policy(
                 eval_env, agent, opt.max_action, turns=1)
+            # TODO: save the new path
             if opt.write:
                 writer.add_scalar(
                     'ep_r', score, global_step=total_steps)
diff --git a/PPO_Continuous/utils.py b/PPO_Continuous/utils.py
index 58b339d..aeb26cd 100644
--- a/PPO_Continuous/utils.py
+++ b/PPO_Continuous/utils.py
@@ -143,10 +143,10 @@ def evaluate_policy(env, agent, max_action, turns):
     while not done:
         # Take deterministic actions when evaluating
         a, logprob_a = agent.select_action(s, deterministic=True)
-        # act = Action_adapter(a, max_action)  # [0,1] to [-max,max]
-        s_next, r, dw, tr, info = env.step(a)
+        act = Action_adapter(a, max_action)  # [0,1] to [-max,max]
+        s_next, r, dw, tr, info = env.step(act)
         done = (dw or tr)
-        action_series.append(a[0])
+        action_series.append(act[0])
         total_scores += r
         s = s_next
     print('action series: ', np.round(action_series, 3))
diff --git a/env.py b/env.py
index 3696dd8..ec81e68 100644
--- a/env.py
+++ b/env.py
@@ -39,9 +39,9 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         # Hyperparameters that may need manual tuning
         ##############################
-        self.CUT_NUM = 4  # half horizontal cuts, half vertical cuts
-        self.BASE_LINE = 3500  # baseline time, computed via greedy or Monte Carlo
-        self.MAX_STEPS = 10  # upper bound on maze-walk steps
+        self.CUT_NUM = 6  # half horizontal cuts, half vertical cuts
+        self.BASE_LINE = 10000  # baseline time, computed via greedy or Monte Carlo
+        self.MAX_STEPS = 20  # upper bound on maze-walk steps

         self.phase = 0  # phase control: 0 = partitioning, 1 = maze init, 2 = maze walking
         self.partition_step = 0  # partition-phase step count, range 0~4
@@ -172,6 +172,7 @@ class PartitionMazeEnv(gym.Env):
             ])
             return state, reward, True, False, {}
         else:
+            print(self.partition_values)
             # Enter phase 1: initialize the maze
             self.phase = 1
             reward = 0.2
diff --git a/env_partion.py b/env_partion.py
index 65b20d3..a9da83f 100644
--- a/env_partion.py
+++ b/env_partion.py
@@ -19,12 +19,28 @@ class PartitionEnv(gym.Env):
         # Hyperparameters that may need manual tuning
         ##############################
         self.params = 'params2'
+        self.ORI_ROW_CUTS = [0, 0.2, 0.4, 0.7, 1]
+        self.ORI_COL_CUTS = [0, 0.5, 1]
         self.CUT_NUM = 4
         self.ROW_CUT_LIMIT = 3
         self.COL_CUT_LIMIT = 1
         self.BASE_LINE = 10000
         self.mTSP_STEPS = 10000

+        # Define the action space: every action is 1-D continuous in [0,1]
+        self.action_space = spaces.Box(
+            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
+
+        # Define the observation space as an 8-D vector:
+        # the row cut positions followed by the column cut positions
+        self.observation_space = spaces.Box(
+            low=0.0, high=1.0, shape=(self.CUT_NUM + 4,), dtype=np.float32)
+
+        self.partition_step = 0
+        self.ori_row_cuts = self.ORI_ROW_CUTS[:]
+        self.ori_col_cuts = self.ORI_COL_CUTS[:]
+        self.rectangles = []
+
         # Fleet parameter settings
         with open(self.params + '.yml', 'r', encoding='utf-8') as file:
             params = yaml.safe_load(file)
@@ -45,140 +61,129 @@ class PartitionEnv(gym.Env):
         self.trans_energy_factor = params['trans_energy_factor']
         self.battery_energy_capacity = params['battery_energy_capacity']

-        self.partition_step = 0  # partition-phase step count, range 0~4
-        self.partition_values = np.zeros(
-            self.CUT_NUM, dtype=np.float32)  # stores c1, c2, r1, r2
-
-        # Define the action space: every action is 1-D continuous in [0,1]
-        self.action_space = spaces.Box(
-            low=0.0, high=1.0, shape=(1,), dtype=np.float32)
-
-        # Define the observation space as a 4-D vector
-        # (the cut values decided so far; undecided entries are 0)
-        self.observation_space = spaces.Box(
-            low=0.0, high=1.0, shape=(self.CUT_NUM,), dtype=np.float32)
-
-        # Partition-phase variables
-        self.col_cuts = []  # vertical cut positions (c1, c2); 0 means no cut
-        self.row_cuts = []  # horizontal cut positions (r1, r2)
-        self.rectangles = []
-
     def reset(self, seed=None, options=None):
         # Reset all variables and return to the partition phase (phase 0)
         self.phase = 0
         self.partition_step = 0
-        self.partition_values = np.zeros(self.CUT_NUM, dtype=np.float32)
-        self.col_cuts = []
-        self.row_cuts = []
+        self.ori_row_cuts = self.ORI_ROW_CUTS[:]
+        self.ori_col_cuts = self.ORI_COL_CUTS[:]
         self.rectangles = []

-        # State: the first 4 dims are partition_values; the rest are region-visit flags (all zeros at start)
-        state = self.partition_values
+        # State: the current row and column cut positions
+        state = np.array(self.ori_row_cuts + self.ori_col_cuts)

         return state

     def step(self, action):
         # The action is 1-D continuous in every phase; take action[0]
-        a = float(action[0])
-        self.partition_values[self.partition_step] = a
+        adjust = float(action[0])
+        valid_adjust = True
+
+        if self.partition_step < self.ROW_CUT_LIMIT:
+            row_cut = self.ori_row_cuts[self.partition_step + 1]
+            new_row_cut = row_cut + adjust
+            self.ori_row_cuts[self.partition_step + 1] = new_row_cut
+
+            if self.ori_row_cuts[self.partition_step] < new_row_cut < self.ori_row_cuts[self.partition_step + 2]:
+                pass
+            else:
+                valid_adjust = False
+                reward = -100
+        else:
+            col_idx = self.partition_step - self.ROW_CUT_LIMIT
+            col_cut = self.ori_col_cuts[col_idx + 1]
+            new_col_cut = col_cut + adjust
+            self.ori_col_cuts[col_idx + 1] = new_col_cut
+
+            if self.ori_col_cuts[col_idx] < new_col_cut < self.ori_col_cuts[col_idx + 2]:
+                pass
+            else:
+                valid_adjust = False
+                reward = -100
+
         self.partition_step += 1
-        # Build the current state: the first partition_step entries hold the decided values, the rest are 0, padded with 7 more zeros
-        state = self.partition_values
+        state = np.array(self.ori_row_cuts + self.ori_col_cuts)

-        # With fewer than 4 steps done we are still in the partition phase: no reward, done is False
-        if self.partition_step < self.CUT_NUM:
-            return state, 0.0, False, False, {}
+        # An invalid adjustment ends the episode immediately
+        if not valid_adjust:
+            return state, reward, True, False, {}
         else:
-            # After 4 steps, compute the cut boundaries
-            # Filter out zeros, deduplicate, then sort
-            rows = sorted(
-                set(v for v in self.partition_values[:self.ROW_CUT_LIMIT] if v > 0))
-            cols = sorted(
-                set(v for v in self.partition_values[self.ROW_CUT_LIMIT:] if v > 0))
-            rows = rows if rows else []
-            cols = cols if cols else []
-
-            # Boundaries always include 0 and 1
-            self.row_cuts = [0.0] + rows + [1.0]
-            self.col_cuts = [0.0] + cols + [1.0]
-
-            # Check that the partition is feasible and compute each region's offloading ratio rho
-            valid_partition = True
-            for i in range(len(self.row_cuts) - 1):
-                for j in range(len(self.col_cuts) - 1):
-                    d = (self.col_cuts[j+1] - self.col_cuts[j]) * self.W * \
-                        (self.row_cuts[i+1] - self.row_cuts[i]) * self.H
-                    rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
-                        (self.comp_time_factor - self.trans_time_factor)
-                    rho_energy_limit = (self.battery_energy_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
-                        (self.comp_energy_factor * d -
-                         self.trans_energy_factor * d)
-                    if rho_energy_limit < 0:
-                        valid_partition = False
-                        break
-                    rho = min(rho_time_limit, rho_energy_limit)
-
-                    flight_time = self.flight_time_factor * d
-                    bs_time = self.bs_time_factor * (1 - rho) * d
-
-                    self.rectangles.append({
-                        'center': ((self.row_cuts[i] + self.row_cuts[i+1]) * self.H / 2, (self.col_cuts[j+1] + self.col_cuts[j]) * self.W / 2),
-                        'flight_time': flight_time,
-                        'bs_time': bs_time,
-                    })
-                if not valid_partition:
-                    break
-
-            if not valid_partition:
-                reward = -100
-                state = self.partition_values
-                return state, reward, True, False, {}
+            if self.partition_step < self.CUT_NUM:
+                return state, 0.0, False, False, {}
             else:
-                reward = 0
-                state = self.partition_values
+                # After all 4 steps, check that the partition is feasible and compute each region's offloading ratio rho
+                valid_partition = True
+                for i in range(len(self.ori_row_cuts) - 1):
+                    for j in range(len(self.ori_col_cuts) - 1):
+                        d = (self.ori_col_cuts[j+1] - self.ori_col_cuts[j]) * self.W * \
+                            (self.ori_row_cuts[i+1] -
+                             self.ori_row_cuts[i]) * self.H
+                        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
+                            (self.comp_time_factor - self.trans_time_factor)
+                        rho_energy_limit = (self.battery_energy_capacity - self.flight_energy_factor * d - self.trans_energy_factor * d) / \
+                            (self.comp_energy_factor * d -
+                             self.trans_energy_factor * d)
+                        if rho_energy_limit < 0:
+                            valid_partition = False
+                            break
+                        rho = min(rho_time_limit, rho_energy_limit)

-            # Continue with path planning
-            # Solve the multi-TSP with q_learning
-            # cities: [[x1, x2, x3...], [y1, y2, y3...]] city coordinates
-            # rec_center_lt = [rec_info['center']
-            #                  for rec_info in self.rectangles]
-            # cities = np.column_stack(rec_center_lt)
-            # cities = np.column_stack((self.center, cities))
+                        flight_time = self.flight_time_factor * d

-            # center_idx = []
-            # for i in range(self.num_cars - 1):
-            #     cities = np.column_stack((cities, self.center))
-            #     center_idx.append(cities.shape[1] - 1)
+                        bs_time = self.bs_time_factor * (1 - rho) * d

-            # tsp = mTSP(params=self.params, num_cities=cities.shape[1], cities=cities, num_cars=self.num_cars,
-            #            center_idx=center_idx, rectangles=self.rectangles)
+                        self.rectangles.append({
+                            'center': ((self.ori_row_cuts[i] + self.ori_row_cuts[i+1]) * self.H / 2, (self.ori_col_cuts[j+1] + self.ori_col_cuts[j]) * self.W / 2),
+                            'flight_time': flight_time,
+                            'bs_time': bs_time,
+                        })
+                    if not valid_partition:
+                        break

-            # best_time, best_path = tsp.train(self.mTSP_STEPS)
+                if not valid_partition:
+                    reward = -10
+                    return state, reward, True, False, {}
+                else:
+                    # Continue with path planning
+                    # Solve the multi-TSP with q_learning
+                    # cities: [[x1, x2, x3...], [y1, y2, y3...]] city coordinates
+                    # rec_center_lt = [rec_info['center']
+                    #                  for rec_info in self.rectangles]
+                    # cities = np.column_stack(rec_center_lt)
+                    # cities = np.column_stack((self.center, cities))

-            # Solve the multi-TSP with a genetic algorithm
-            cities = [self.center]
-            for rec in self.rectangles:
-                cities.append(rec['center'])
-            cities = np.array(cities)
+                    # center_idx = []
+                    # for i in range(self.num_cars - 1):
+                    #     cities = np.column_stack((cities, self.center))
+                    #     center_idx.append(cities.shape[1] - 1)

-            center_idx = [0]
-            for i in range(self.num_cars - 1):
-                cities = np.row_stack((cities, self.center))
-                center_idx.append(cities.shape[0] - 1)
+                    # tsp = mTSP(params=self.params, num_cities=cities.shape[1], cities=cities, num_cars=self.num_cars,
+                    #            center_idx=center_idx, rectangles=self.rectangles)

-            ga = GA(num_drones=self.num_cars, num_city=cities.shape[0], num_total=20,
-                    data=cities, to_process_idx=center_idx, rectangles=self.rectangles)
+                    # best_time, best_path = tsp.train(self.mTSP_STEPS)

-            best_path, best_time = ga.run()
+                    # Solve the multi-TSP with a genetic algorithm
+                    cities = [self.center]
+                    for rec in self.rectangles:
+                        cities.append(rec['center'])
+                    cities = np.array(cities)

-            # print(best_time)
-            # print(best_path)
+                    center_idx = [0]
+                    for i in range(self.num_cars - 1):
+                        cities = np.row_stack((cities, self.center))
+                        center_idx.append(cities.shape[0] - 1)

-            reward += self.BASE_LINE - best_time
-            print(reward)
+                    ga = GA(num_drones=self.num_cars, num_city=cities.shape[0], num_total=20,
+                            data=cities, to_process_idx=center_idx, rectangles=self.rectangles)

-            return state, reward, True, False, best_path
+                    best_path, best_time = ga.run()
+
+                    # print(best_time)
+                    # print(best_path)
+
+                    reward = self.BASE_LINE / best_time
+
+                    return state, reward, True, False, best_path

     def render(self):
         if self.phase == 1:
diff --git a/human_action.py b/human_action.py
index 9e3fe34..f8d6006 100644
--- a/human_action.py
+++ b/human_action.py
@@ -11,7 +11,7 @@ print('state:', state)
 # action_series = [[0.67], [0], [0], [0], [0.7]]
 # action_series = [0, 0, 3, 0, 10]
 action_series = [[0.2], [0.4], [0.7], [0.5]]
-# action_series = [[0.5], [0.5]]
+action_series = [[-0.1], [0], [0], [0]]
 for i in range(100):
     action = action_series[i]
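
Notes for review (appended after the patch; the sketches below are illustrative and not part of the changed files):

Note 1: main.py and utils.py now pass the adapted action into env.step(), so PartitionEnv receives values in [-max_action, max_action] instead of raw Beta samples in [0, 1]. A minimal sketch of the affine map that Action_adapter in this codebase's utils.py is assumed to perform (function name and body here are illustrative; verify against the actual utils.py):

    import numpy as np

    def action_adapter(a: np.ndarray, max_action: float) -> np.ndarray:
        # Map a Beta-distributed sample in [0, 1] onto the symmetric
        # interval [-max_action, max_action].
        return 2.0 * (a - 0.5) * max_action

For example, a = 0.45 with max_action = 1.0 yields -0.1, the kind of negative cut adjustment exercised by the new action_series in human_action.py.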
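Note 2: the rewritten PartitionEnv.step() nudges one interior cut per step and ends the episode with reward -100 when the moved cut no longer lies strictly between its neighbours. A self-contained sketch of that rule (try_adjust is a hypothetical helper mirroring the patch's logic):

    def try_adjust(cuts, idx, adjust):
        # Shift interior cut idx+1 by adjust; the move is valid only if
        # the new position stays strictly between its two neighbours.
        new_cuts = cuts[:]
        new_cuts[idx + 1] += adjust
        valid = new_cuts[idx] < new_cuts[idx + 1] < new_cuts[idx + 2]
        return new_cuts, valid

    # With ORI_ROW_CUTS = [0, 0.2, 0.4, 0.7, 1]:
    # try_adjust([0, 0.2, 0.4, 0.7, 1], 0, -0.1) -> ([0, 0.1, 0.4, 0.7, 1], True)
    # try_adjust([0, 0.2, 0.4, 0.7, 1], 0, 0.3)  -> ([0, 0.5, 0.4, 0.7, 1], False)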
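Note 3: per region, the offloading ratio rho is the tighter of a time limit and an energy limit, and the whole partition is rejected (reward -10) as soon as any region's energy limit goes negative. The terminal reward also changed from BASE_LINE - best_time to BASE_LINE / best_time, so a tour faster than the baseline now scores above 1.0. A standalone sketch of the per-region computation using the formulas from step() (the arguments stand in for values loaded from params2.yml):

    def region_times(d, flight_t, comp_t, trans_t,
                     flight_e, comp_e, trans_e, battery, bs_t):
        # d is the region's area; both limits follow step() verbatim
        rho_time = (flight_t - trans_t) / (comp_t - trans_t)
        rho_energy = (battery - flight_e * d - trans_e * d) / \
                     (comp_e * d - trans_e * d)
        if rho_energy < 0:
            return None  # infeasible region -> invalid partition
        rho = min(rho_time, rho_energy)
        return flight_t * d, bs_t * (1 - rho) * d  # (flight_time, bs_time)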