From 343008bc9f1db139109644eed450073d534cf978 Mon Sep 17 00:00:00 2001 From: weixin_46229132 Date: Tue, 18 Mar 2025 17:27:49 +0800 Subject: [PATCH] =?UTF-8?q?=E7=AE=80=E5=8C=96=E5=88=9D=E5=A7=8B=E5=8C=96?= =?UTF-8?q?=E8=BF=B7=E5=AE=AB=E7=9A=84=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 - DDPG_solver/main.py | 9 +- DQN/RL_brain.py | 254 ++++++++++++++++++++++++++++++ DQN/dqn.py | 94 ----------- DQN/env.py | 134 ---------------- DQN/env_allocation.py | 140 ---------------- DQN/env_partition.py | 88 ----------- DQN/env_routing.py | 152 ------------------ DQN/run_dqn.py | 95 ----------- DQN/run_hierarchical.py | 118 -------------- DQN/run_this.py | 58 +++++++ env.py | 48 +++--- params.yml | 6 +- solutions/best_solution_mtkl.json | 54 +++++++ 14 files changed, 397 insertions(+), 854 deletions(-) create mode 100644 DQN/RL_brain.py delete mode 100644 DQN/dqn.py delete mode 100644 DQN/env.py delete mode 100644 DQN/env_allocation.py delete mode 100644 DQN/env_partition.py delete mode 100644 DQN/env_routing.py delete mode 100644 DQN/run_dqn.py delete mode 100644 DQN/run_hierarchical.py create mode 100644 DQN/run_this.py create mode 100644 solutions/best_solution_mtkl.json diff --git a/.gitignore b/.gitignore index a3041d4..8f96f8a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ __pycache__/ # Pytorch weights weights/ -solutions/ PPO_preTrained/ PPO_logs/ logs/ diff --git a/DDPG_solver/main.py b/DDPG_solver/main.py index 8f19591..ab28f69 100644 --- a/DDPG_solver/main.py +++ b/DDPG_solver/main.py @@ -1,15 +1,14 @@ -from env import PartitionMazeEnv -from utils import str2bool, evaluate_policy -from datetime import datetime -from DDPG import DDPG_agent import gymnasium as gym import os import shutil import argparse import torch - import sys sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from env import PartitionMazeEnv +from utils import str2bool, evaluate_policy +from datetime import datetime +from DDPG import DDPG_agent '''Hyperparameter Setting''' parser = argparse.ArgumentParser() diff --git a/DQN/RL_brain.py b/DQN/RL_brain.py new file mode 100644 index 0000000..b1149b0 --- /dev/null +++ b/DQN/RL_brain.py @@ -0,0 +1,254 @@ +""" +Deep Q Network off-policy +""" +import torch +import torch.nn as nn +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +np.random.seed(42) +torch.manual_seed(2) + + +class Network(nn.Module): + """ + Network Structure + """ + def __init__(self, + n_features, + n_actions, + n_neuron=10 + ): + super(Network, self).__init__() + self.net = nn.Sequential( + nn.Linear(in_features=n_features, out_features=n_neuron, bias=True), + nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True), + nn.ReLU() + ) + + def forward(self, s): + """ + + :param s: s + :return: q + """ + q = self.net(s) + return q + + +class DeepQNetwork(nn.Module): + """ + Q Learning Algorithm + """ + def __init__(self, + n_actions, + n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=300, + memory_size=500, + batch_size=32, + e_greedy_increment=None): + super(DeepQNetwork, self).__init__() + + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + self.epsilon_max = e_greedy + self.replace_target_iter = replace_target_iter + self.memory_size = memory_size + self.batch_size = batch_size + self.epsilon_increment = e_greedy_increment + self.epsilon = 0 if 
e_greedy_increment is not None else self.epsilon_max + + # total learning step + self.learn_step_counter = 0 + + # initialize zero memory [s, a, r, s_] + # 这里用pd.DataFrame创建的表格作为memory + # 表格的行数是memory的大小,也就是transition的个数 + # 表格的列数是transition的长度,一个transition包含[s, a, r, s_],其中a和r分别是一个数字,s和s_的长度分别是n_features + self.memory = pd.DataFrame(np.zeros((self.memory_size, self.n_features*2+2))) + + # build two network: eval_net and target_net + self.eval_net = Network(n_features=self.n_features, n_actions=self.n_actions) + self.target_net = Network(n_features=self.n_features, n_actions=self.n_actions) + self.loss_function = nn.MSELoss() + self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr) + + # 记录每一步的误差 + self.cost_his = [] + + + def store_transition(self, s, a, r, s_): + if not hasattr(self, 'memory_counter'): + # hasattr用于判断对象是否包含对应的属性。 + self.memory_counter = 0 + + transition = np.hstack((s, [a,r], s_)) + + # replace the old memory with new memory + index = self.memory_counter % self.memory_size + self.memory.iloc[index, :] = transition + + self.memory_counter += 1 + + def choose_action(self, observation): + observation = observation[np.newaxis, :] + + if np.random.uniform() < self.epsilon: + # forward feed the observation and get q value for every actions + s = torch.FloatTensor(observation) + actions_value = self.eval_net(s) + action = [np.argmax(actions_value.detach().numpy())][0] + else: + action = np.random.randint(0, self.n_actions) + return action + + def _replace_target_params(self): + # 复制网络参数 + self.target_net.load_state_dict(self.eval_net.state_dict()) + + def learn(self): + # check to replace target parameters + if self.learn_step_counter % self.replace_target_iter == 0: + self._replace_target_params() + print('\ntarget params replaced\n') + + # sample batch memory from all memory + batch_memory = self.memory.sample(self.batch_size) \ + if self.memory_counter > self.memory_size \ + else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True) + + # run the nextwork + s = torch.FloatTensor(batch_memory.iloc[:, :self.n_features].values) + s_ = torch.FloatTensor(batch_memory.iloc[:, -self.n_features:].values) + q_eval = self.eval_net(s) + q_next = self.target_net(s_) + + # change q_target w.r.t q_eval's action + q_target = q_eval.clone() + + # 更新值 + batch_index = np.arange(self.batch_size, dtype=np.int32) + eval_act_index = batch_memory.iloc[:, self.n_features].values.astype(int) + reward = batch_memory.iloc[:, self.n_features + 1].values + + q_target[batch_index, eval_act_index] = torch.FloatTensor(reward) + self.gamma * q_next.max(dim=1).values + + # train eval network + loss = self.loss_function(q_target, q_eval) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + self.cost_his.append(loss.detach().numpy()) + + # increasing epsilon + self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max + self.learn_step_counter += 1 + + def plot_cost(self): + plt.figure() + plt.plot(np.arange(len(self.cost_his)), self.cost_his) + plt.show() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/DQN/dqn.py b/DQN/dqn.py deleted file mode 100644 index 4d90e89..0000000 --- a/DQN/dqn.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -import numpy as np -from collections import deque 
-import random - -class DQN(nn.Module): - def __init__(self, state_dim, action_dim): - super(DQN, self).__init__() - - self.network = nn.Sequential( - nn.Linear(state_dim, 128), - nn.ReLU(), - nn.Linear(128, 128), - nn.ReLU(), - nn.Linear(128, action_dim) - ) - - def forward(self, x): - return self.network(x) - -class Agent: - def __init__(self, state_dim, action_dim): - self.state_dim = state_dim - self.action_dim = action_dim - - # DQN网络 - self.eval_net = DQN(state_dim, action_dim) - self.target_net = DQN(state_dim, action_dim) - self.target_net.load_state_dict(self.eval_net.state_dict()) - - # 训练参数 - self.learning_rate = 0.001 - self.gamma = 0.99 - self.epsilon = 1.0 - self.epsilon_min = 0.01 - self.epsilon_decay = 0.995 - self.memory = deque(maxlen=10000) - self.batch_size = 64 - self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.learning_rate) - - def choose_action(self, state): - if random.random() < self.epsilon: - # 随机选择动作 - return random.randint(0, self.action_dim - 1) - else: - # 根据Q值选择动作 - state = torch.FloatTensor(state).unsqueeze(0) - q_values = self.eval_net(state) - return torch.argmax(q_values).item() - - def store_transition(self, state, action, reward, next_state, done): - self.memory.append((state, action, reward, next_state, done)) - - def learn(self): - if len(self.memory) < self.batch_size: - return - - # 随机采样batch - batch = random.sample(self.memory, self.batch_size) - states = torch.FloatTensor([x[0] for x in batch]) - actions = torch.LongTensor([x[1] for x in batch]) - rewards = torch.FloatTensor([x[2] for x in batch]) - next_states = torch.FloatTensor([x[3] for x in batch]) - dones = torch.FloatTensor([x[4] for x in batch]) - - # 计算当前Q值 - current_q_values = self.eval_net(states).gather(1, actions.unsqueeze(1)) - - # 计算目标Q值 - next_q_values = self.target_net(next_states).detach() - max_next_q = torch.max(next_q_values, dim=1)[0] - target_q_values = rewards + (1 - dones) * self.gamma * max_next_q - - # 计算损失 - loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values) - - # 更新网络 - self.optimizer.zero_grad() - loss.backward() - self.optimizer.step() - - # 更新epsilon - self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay) - - # 定期更新目标网络 - if self.learn.counter % 100 == 0: - self.target_net.load_state_dict(self.eval_net.state_dict()) - - self.learn.counter += 1 - - # 添加计数器属性 - learn.counter = 0 diff --git a/DQN/env.py b/DQN/env.py deleted file mode 100644 index 8c8ed6f..0000000 --- a/DQN/env.py +++ /dev/null @@ -1,134 +0,0 @@ -import numpy as np -import gym -from gym import spaces - - -class Env(gym.Env): - """多车-巢-机系统的区域覆盖环境""" - - def __init__(self): - super(Env, self).__init__() - - # 环境参数 - self.H = 20 # 区域高度 - self.W = 25 # 区域宽度 - self.k = 1 # 系统数量 - - # 时间系数 - self.flight_time_factor = 3 # 每张照片飞行时间 - self.comp_uav_factor = 5 # 无人机计算时间 - self.trans_time_factor = 0.3 # 传输时间 - self.car_move_time_factor = 100 # 汽车移动时间 - self.comp_bs_factor = 5 # 机巢计算时间 - - # 能量参数 - self.flight_energy_factor = 0.05 # 飞行能耗 - self.comp_energy_factor = 0.05 # 计算能耗 - self.trans_energy_factor = 0.0025 # 传输能耗 - self.battery_capacity = 30 # 电池容量 - - # 动作空间 - # [垂直切割数, 水平切割数, 卸载率] - self.action_space = spaces.Box( - low=np.array([1, 1, 0]), - high=np.array([5, 5, 1]), - dtype=np.float32 - ) - - # 状态空间 - # [当前垂直切割数, 当前水平切割数, 当前最大完成时间] - self.observation_space = spaces.Box( - low=np.array([1, 1, 0]), - high=np.array([5, 5, float('inf')]), - dtype=np.float32 - ) - - self.state = None - self.current_step = 0 - self.max_steps = 1000 - - def step(self, action): - 
self.current_step += 1 - - # 解析动作 - v_cuts = int(action[0]) # 垂直切割数 - h_cuts = int(action[1]) # 水平切割数 - # rho = action[2] # 卸载率 - - # TODO 生成切割位置,目前是均匀切割 - v_boundaries = np.linspace(0, self.H, v_cuts + 1) - h_boundaries = np.linspace(0, self.W, h_cuts + 1) - - # 计算每个子区域的指标 - total_time = 0 - valid_partition = True - - for i in range(len(v_boundaries) - 1): - for j in range(len(h_boundaries) - 1): - # 计算子区域大小 - height = v_boundaries[i+1] - v_boundaries[i] - width = h_boundaries[j+1] - h_boundaries[j] - area = height * width - - # 求解rho - rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \ - (self.comp_uav_factor - self.trans_time_factor) - rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \ - (self.comp_energy_factor * area - self.trans_energy_factor * area) - if rho_energy_limit < 0: - valid_partition = False - break - rho = min(rho_time_limit, rho_energy_limit) - - # 计算各阶段时间 - flight_time = self.flight_time_factor * area - comp_time = self.comp_uav_factor * rho * area - trans_time = self.trans_time_factor * (1 - rho) * area - comp_bs_time = self.comp_bs_factor * (1 - rho) * area - - # # 计算能耗 - # flight_energy = self.flight_energy_factor * area - # comp_energy = self.comp_energy_factor * rho * area - # trans_energy = self.trans_energy_factor * (1 - rho) * area - # total_energy = flight_energy + comp_energy + trans_energy - - # # 检查约束 - # if total_energy > self.battery_capacity or (comp_time + trans_time > flight_time): - # valid_partition = False - # break - - # 计算子区域中心到区域中心的距离 - center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2 - center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2 - dist_to_center = np.sqrt( - (center_y - self.H/2)**2 + (center_x - self.W/2)**2) - car_time = dist_to_center * self.car_move_time_factor - - # 更新总时间 - task_time = max(flight_time + car_time, comp_bs_time) - total_time = max(total_time, task_time) - - if not valid_partition: - break - - # 计算奖励 - if not valid_partition: - reward = -10000 # 惩罚无效方案 - done = True - else: - reward = -total_time # 负的完成时间作为奖励 - done = self.current_step >= self.max_steps - - # 更新状态 - self.state = np.array([v_cuts, h_cuts, total_time]) - - return self.state, reward, done, {} - - def reset(self): - # 初始化状态 - self.state = np.array([1, 1, 0]) - self.current_step = 0 - return self.state - - def render(self, mode='human'): - pass diff --git a/DQN/env_allocation.py b/DQN/env_allocation.py deleted file mode 100644 index d18def4..0000000 --- a/DQN/env_allocation.py +++ /dev/null @@ -1,140 +0,0 @@ -import numpy as np -import gym -from gym import spaces - -class AllocationEnv(gym.Env): - """任务分配环境(第二层)""" - def __init__(self, subareas, num_systems): - super(AllocationEnv, self).__init__() - - self.subareas = subareas # 子区域列表 - self.num_systems = num_systems # 系统数量 - - # 时间系数 - self.flight_time_factor = 3 # 每张照片飞行时间 - self.comp_uav_factor = 5 # 无人机计算时间 - self.trans_time_factor = 0.3 # 传输时间 - self.car_move_time_factor = 100 # 汽车移动时间 - self.comp_bs_factor = 5 # 机巢计算时间 - - # 能量参数 - self.flight_energy_factor = 0.05 # 飞行能耗 - self.comp_energy_factor = 0.05 # 计算能耗 - self.trans_energy_factor = 0.0025 # 传输能耗 - self.battery_capacity = 30 # 电池容量 - - # 动作空间:每个子区域分配给哪个系统 - self.action_space = spaces.MultiDiscrete([num_systems] * len(subareas)) - - # 状态空间:[各系统当前负载] - self.observation_space = spaces.Box( - low=np.zeros(num_systems), - high=np.ones(num_systems) * float('inf'), - dtype=np.float32 - ) - - self.state = None - self.current_step = 0 - self.max_steps = 1000 - - def 
calculate_rho(self, area): - """计算最优卸载率""" - rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \ - (self.comp_uav_factor - self.trans_time_factor) - rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \ - (self.comp_energy_factor * area - self.trans_energy_factor * area) - if rho_energy_limit < 0: - return None - return min(rho_time_limit, rho_energy_limit) - - def step(self, action): - self.current_step += 1 - - # 初始化每个系统的任务列表 - system_tasks = {i: [] for i in range(self.num_systems)} - - # 根据动作分配任务 - for i, system_id in enumerate(action): - system_tasks[system_id].append(self.subareas[i]) - - # 计算每个系统的完成时间 - system_times = [] - valid_allocation = True - - for system_id, tasks in system_tasks.items(): - if not tasks: # 如果系统没有分配任务 - system_times.append(0) - continue - - # 调用第三层(路径规划)获取结果 - from env_routing import RoutingEnv - route_env = RoutingEnv(tasks) - completion_time, valid = route_env.optimize() - - if not valid: - valid_allocation = False - break - - system_times.append(completion_time) - - total_time = max(system_times) if system_times else 0 - - # 计算奖励 - if not valid_allocation: - reward = -10000 - done = True - else: - reward = -total_time - done = self.current_step >= self.max_steps - - # 更新状态(各系统的负载) - self.state = np.array([len(tasks) for tasks in system_tasks.values()]) - - return self.state, reward, done, {} - - def reset(self): - self.state = np.zeros(self.num_systems) - self.current_step = 0 - return self.state - - def render(self, mode='human'): - pass - - def optimize(self): - """使用DQN优化任务分配""" - from dqn import Agent - - state_dim = self.observation_space.shape[0] - action_dim = self.num_systems * len(self.subareas) - - agent = Agent(state_dim, action_dim) - - # 训练参数 - episodes = 100 # 减少训练轮数,因为这是子问题 - max_steps = 100 - - best_reward = float('-inf') - best_time = float('inf') - valid_solution = False - - for episode in range(episodes): - state = self.reset() - episode_reward = 0 - - for step in range(max_steps): - action = agent.choose_action(state) - next_state, reward, done, _ = self.step(action) - - agent.store_transition(state, action, reward, next_state, done) - agent.learn() - - episode_reward += reward - state = next_state - - if done: - if reward != -10000: # 如果是有效解 - valid_solution = True - best_time = min(best_time, -reward) - break - - return best_time, valid_solution diff --git a/DQN/env_partition.py b/DQN/env_partition.py deleted file mode 100644 index faf7580..0000000 --- a/DQN/env_partition.py +++ /dev/null @@ -1,88 +0,0 @@ -import numpy as np -import gym -from gym import spaces - -class PartitionEnv(gym.Env): - """区域划分环境(第一层)""" - def __init__(self): - super(PartitionEnv, self).__init__() - - # 环境参数 - self.H = 20 # 区域高度 - self.W = 25 # 区域宽度 - self.k = 1 # 系统数量 - - # 动作空间:[垂直切割数, 水平切割数] - self.action_space = spaces.Box( - low=np.array([1, 1]), - high=np.array([5, 5]), - dtype=np.float32 - ) - - # 状态空间:[当前垂直切割数, 当前水平切割数, 当前最大完成时间] - self.observation_space = spaces.Box( - low=np.array([1, 1, 0]), - high=np.array([5, 5, float('inf')]), - dtype=np.float32 - ) - - self.state = None - self.current_step = 0 - self.max_steps = 1000 - - def generate_subareas(self, v_cuts, h_cuts): - """生成子区域信息""" - v_boundaries = np.linspace(0, self.H, v_cuts + 1) - h_boundaries = np.linspace(0, self.W, h_cuts + 1) - - subareas = [] - for i in range(len(v_boundaries) - 1): - for j in range(len(h_boundaries) - 1): - height = v_boundaries[i+1] - v_boundaries[i] - width = h_boundaries[j+1] - 
h_boundaries[j] - center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2 - center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2 - - subareas.append({ - 'height': height, - 'width': width, - 'area': height * width, - 'center': (center_y, center_x) - }) - return subareas - - def step(self, action): - self.current_step += 1 - - # 解析动作 - v_cuts = int(action[0]) # 垂直切割数 - h_cuts = int(action[1]) # 水平切割数 - - # 生成子区域 - subareas = self.generate_subareas(v_cuts, h_cuts) - - # 调用第二层(任务分配)获取结果 - from env_allocation import AllocationEnv - alloc_env = AllocationEnv(subareas, self.k) - total_time, valid = alloc_env.optimize() - - # 计算奖励 - if not valid: - reward = -10000 # 惩罚无效方案 - done = True - else: - reward = -total_time # 负的完成时间作为奖励 - done = self.current_step >= self.max_steps - - # 更新状态 - self.state = np.array([v_cuts, h_cuts, total_time]) - - return self.state, reward, done, {} - - def reset(self): - self.state = np.array([1, 1, 0]) - self.current_step = 0 - return self.state - - def render(self, mode='human'): - pass diff --git a/DQN/env_routing.py b/DQN/env_routing.py deleted file mode 100644 index 5cf881b..0000000 --- a/DQN/env_routing.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np -import gym -from gym import spaces - -class RoutingEnv(gym.Env): - """路径规划环境(第三层)""" - def __init__(self, tasks): - super(RoutingEnv, self).__init__() - - self.tasks = tasks # 任务列表 - self.H = 20 # 区域高度 - self.W = 25 # 区域宽度 - self.region_center = (self.H/2, self.W/2) - - # 时间系数 - self.flight_time_factor = 3 # 每张照片飞行时间 - self.comp_uav_factor = 5 # 无人机计算时间 - self.trans_time_factor = 0.3 # 传输时间 - self.car_move_time_factor = 100 # 汽车移动时间 - self.comp_bs_factor = 5 # 机巢计算时间 - - # 动作空间:选择下一个要访问的任务索引 - self.action_space = spaces.Discrete(len(tasks)) - - # 状态空间:[当前位置x, 当前位置y, 未访问任务的mask] - self.observation_space = spaces.Box( - low=np.array([0, 0] + [0] * len(tasks)), - high=np.array([self.H, self.W] + [1] * len(tasks)), - dtype=np.float32 - ) - - self.state = None - self.current_position = self.region_center - self.unvisited_mask = np.ones(len(tasks)) - self.total_flight_time = 0 - - def calculate_task_time(self, task): - """计算单个任务的执行时间""" - area = task['area'] - - # 计算最优卸载率 - rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \ - (self.comp_uav_factor - self.trans_time_factor) - rho_energy_limit = (30 - self.flight_time_factor * area - self.trans_time_factor * area) / \ - (self.comp_uav_factor * area - self.trans_time_factor * area) - if rho_energy_limit < 0: - return None, None - rho = min(rho_time_limit, rho_energy_limit) - - # 计算各阶段时间 - flight_time = self.flight_time_factor * area - comp_time = self.comp_uav_factor * rho * area - trans_time = self.trans_time_factor * (1 - rho) * area - comp_bs_time = self.comp_bs_factor * (1 - rho) * area - - task_time = max(flight_time, comp_bs_time) - return task_time, rho - - def calculate_move_time(self, from_pos, to_pos): - """计算移动时间""" - dist = np.sqrt((from_pos[0] - to_pos[0])**2 + (from_pos[1] - to_pos[1])**2) - return dist * self.car_move_time_factor - - def step(self, action): - # 检查动作是否有效 - if self.unvisited_mask[action] == 0: - return self.state, -10000, True, {} # 惩罚选择已访问的任务 - - # 获取选中的任务 - task = self.tasks[action] - task_center = task['center'] - - # 计算移动时间 - move_time = self.calculate_move_time(self.current_position, task_center) - - # 计算任务执行时间 - task_time, rho = self.calculate_task_time(task) - if task_time is None: # 任务不可行 - return self.state, -10000, True, {} - - # 更新状态 - self.current_position = task_center - self.unvisited_mask[action] = 0 - 
self.total_flight_time += task_time - - # 构建新状态 - self.state = np.concatenate([ - np.array(self.current_position), - self.unvisited_mask - ]) - - # 检查是否所有任务都已完成 - done = np.sum(self.unvisited_mask) == 0 - - # 计算奖励(负的总时间) - total_time = max(self.total_flight_time, move_time) - reward = -total_time if done else -move_time - - return self.state, reward, done, {} - - def reset(self): - self.current_position = self.region_center - self.unvisited_mask = np.ones(len(self.tasks)) - self.total_flight_time = 0 - - self.state = np.concatenate([ - np.array(self.current_position), - self.unvisited_mask - ]) - return self.state - - def render(self, mode='human'): - pass - - def optimize(self): - """使用DQN优化路径规划""" - from dqn import Agent - - state_dim = self.observation_space.shape[0] - action_dim = len(self.tasks) - - agent = Agent(state_dim, action_dim) - - # 训练参数 - episodes = 50 # 进一步减少训练轮数,因为这是最底层子问题 - max_steps = len(self.tasks) + 1 # 最多访问所有任务+返回 - - best_reward = float('-inf') - best_time = float('inf') - valid_solution = False - - for episode in range(episodes): - state = self.reset() - episode_reward = 0 - - for step in range(max_steps): - action = agent.choose_action(state) - next_state, reward, done, _ = self.step(action) - - agent.store_transition(state, action, reward, next_state, done) - agent.learn() - - episode_reward += reward - state = next_state - - if done: - if reward != -10000: # 如果是有效解 - valid_solution = True - best_time = min(best_time, -reward) - break - - return best_time, valid_solution diff --git a/DQN/run_dqn.py b/DQN/run_dqn.py deleted file mode 100644 index a123e41..0000000 --- a/DQN/run_dqn.py +++ /dev/null @@ -1,95 +0,0 @@ -from env import Env -from dqn import Agent -import numpy as np -import matplotlib.pyplot as plt - - -def train(): - # 创建环境和智能体 - env = Env() - state_dim = env.observation_space.shape[0] - action_dim = 10 # len(垂直切割数)+len(水平切割数) - - agent = Agent(state_dim, action_dim) - - # 训练参数 - episodes = 1000 - max_steps = 1000 - - # 记录训练过程 - rewards_history = [] - best_reward = float('-inf') - best_solution = None - - # 开始训练 - for episode in range(episodes): - state = env.reset() - episode_reward = 0 - - for step in range(max_steps): - # 选择动作 - action = agent.choose_action(state) - - # 执行动作 - next_state, reward, done, _ = env.step(action) - - # 存储经验 - agent.store_transition(state, action, reward, next_state, done) - - # 学习 - agent.learn() - - episode_reward += reward - state = next_state - - if done: - break - - # 记录每个episode的总奖励 - rewards_history.append(episode_reward) - - # 更新最佳解 - if episode_reward > best_reward: - best_reward = episode_reward - best_solution = { - 'vertical_cuts': int(action[0]), - 'horizontal_cuts': int(action[1]), - # 'offload_ratio': action[2], - 'total_time': -reward if reward != -1000 else float('inf'), - 'episode': episode - } - - # 打印训练进度 - if (episode + 1) % 10 == 0: - avg_reward = np.mean(rewards_history[-10:]) - print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}") - - return best_solution, rewards_history - - -def plot_training_results(rewards_history): - plt.figure(figsize=(10, 5)) - plt.plot(rewards_history) - plt.title('Training Progress') - plt.xlabel('Episode') - plt.ylabel('Total Reward') - plt.grid(True) - plt.show() - - -def print_solution(solution): - print("\n最佳解决方案:") - print(f"在第 {solution['episode']} 轮找到") - print(f"垂直切割数: {solution['vertical_cuts']}") - print(f"水平切割数: {solution['horizontal_cuts']}") - print(f"任务卸载率: {solution['offload_ratio']:.2f}") - print(f"总完成时间: {solution['total_time']:.2f} 秒") - - -if 
__name__ == "__main__": - # 训练模型 - best_solution, rewards_history = train() - - # 显示结果 - plot_training_results(rewards_history) - print_solution(best_solution) diff --git a/DQN/run_hierarchical.py b/DQN/run_hierarchical.py deleted file mode 100644 index 30663c6..0000000 --- a/DQN/run_hierarchical.py +++ /dev/null @@ -1,118 +0,0 @@ -from env_partition import PartitionEnv -from env_allocation import AllocationEnv -from env_routing import RoutingEnv -from dqn import Agent -import numpy as np -import matplotlib.pyplot as plt - -def train_hierarchical(): - """训练分层强化学习系统""" - # 创建第一层环境(区域划分) - partition_env = PartitionEnv() - partition_state_dim = partition_env.observation_space.shape[0] - partition_action_dim = 10 # 5个垂直切割选项 + 5个水平切割选项 - - partition_agent = Agent(partition_state_dim, partition_action_dim) - - # 训练参数 - episodes = 1000 - max_steps = 1000 - - # 记录训练过程 - rewards_history = [] - best_reward = float('-inf') - best_solution = None - - # 开始训练 - print("开始训练分层强化学习系统...") - - for episode in range(episodes): - state = partition_env.reset() - episode_reward = 0 - - for step in range(max_steps): - # 选择动作 - action = partition_agent.choose_action(state) - - # 执行动作(这会触发第二层和第三层的优化) - next_state, reward, done, _ = partition_env.step(action) - - # 存储经验 - partition_agent.store_transition(state, action, reward, next_state, done) - - # 学习 - partition_agent.learn() - - episode_reward += reward - state = next_state - - if done: - break - - # 记录每个episode的总奖励 - rewards_history.append(episode_reward) - - # 更新最佳解 - if episode_reward > best_reward: - best_reward = episode_reward - best_solution = { - 'vertical_cuts': int(action[0]), - 'horizontal_cuts': int(action[1]), - 'total_time': -reward if reward != -10000 else float('inf'), - 'episode': episode - } - - # 打印训练进度 - if (episode + 1) % 10 == 0: - avg_reward = np.mean(rewards_history[-10:]) - print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}") - - return best_solution, rewards_history - -def plot_training_results(rewards_history): - plt.figure(figsize=(10, 5)) - plt.plot(rewards_history) - plt.title('Hierarchical DQN Training Progress') - plt.xlabel('Episode') - plt.ylabel('Total Reward') - plt.grid(True) - plt.show() - -def print_solution(solution): - print("\n最佳解决方案:") - print(f"在第 {solution['episode']} 轮找到") - print(f"垂直切割数: {solution['vertical_cuts']}") - print(f"水平切割数: {solution['horizontal_cuts']}") - print(f"总完成时间: {solution['total_time']:.2f} 秒") - -def visualize_partition(solution): - """可视化区域划分结果""" - H, W = 20, 25 - v_cuts = solution['vertical_cuts'] - h_cuts = solution['horizontal_cuts'] - - plt.figure(figsize=(10, 8)) - - # 绘制网格 - for i in range(v_cuts + 1): - y = i * (H / v_cuts) - plt.axhline(y=y, color='b', linestyle='-', alpha=0.5) - - for i in range(h_cuts + 1): - x = i * (W / h_cuts) - plt.axvline(x=x, color='b', linestyle='-', alpha=0.5) - - plt.title('Area Partition Visualization') - plt.xlabel('Width') - plt.ylabel('Height') - plt.grid(True, alpha=0.3) - plt.show() - -if __name__ == "__main__": - # 训练模型 - best_solution, rewards_history = train_hierarchical() - - # 显示结果 - plot_training_results(rewards_history) - print_solution(best_solution) - visualize_partition(best_solution) diff --git a/DQN/run_this.py b/DQN/run_this.py new file mode 100644 index 0000000..8398068 --- /dev/null +++ b/DQN/run_this.py @@ -0,0 +1,58 @@ +from RL_brain import DeepQNetwork +import os +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from env import PartitionMazeEnv + +def run_maze(): + step = 0 # 
为了记录走到第几步,记忆录中积累经验(也就是积累一些transition)之后再开始学习 + for episode in range(200): + # initial observation + observation = env.reset() + + while True: + # refresh env + env.render() + + # RL choose action based on observation + action = RL.choose_action(observation) + + # RL take action and get next observation and reward + observation_, reward, done = env.step(action) + + # !! restore transition + RL.store_transition(observation, action, reward, observation_) + + # 超过200条transition之后每隔5步学习一次 + if (step > 200) and (step % 5 == 0): + RL.learn() + + # swap observation + observation = observation_ + + # break while loop when end of this episode + if done: + break + step += 1 + + # end of game + print("game over") + env.destroy() + + +if __name__ == "__main__": + # maze game + env = PartitionMazeEnv() + + # TODO 代码还没有写完,跑不了!!! + RL = DeepQNetwork(env.n_actions, env.n_features, + learning_rate=0.01, + reward_decay=0.9, + e_greedy=0.9, + replace_target_iter=200, + memory_size=2000) + env.after(100, run_maze) + env.mainloop() + RL.plot_cost() + + diff --git a/env.py b/env.py index 7691c86..6c94765 100644 --- a/env.py +++ b/env.py @@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env): ############################## # 可能需要手动修改的超参数 ############################## - self.CUT_NUM = 6 # 横切一半,竖切一半 - self.BASE_LINE = 12133.250161412347 # 基准时间,通过greedy或者蒙特卡洛计算出来 + self.CUT_NUM = 2 # 横切一半,竖切一半 + self.BASE_LINE = 4000 # 基准时间,通过greedy或者蒙特卡洛计算出来 self.phase = 0 # 阶段控制,0:区域划分阶段,1:迷宫初始化阶段,2:走迷宫阶段 self.partition_step = 0 # 区域划分阶段步数,范围 0~4 @@ -168,30 +168,30 @@ class PartitionMazeEnv(gym.Env): return state, reward, False, False, {} elif self.phase == 1: - # 阶段 1:初始化迷宫,让多个车辆从区域中心出发,前往划分区域的中心点 - # 确保 action 的值在 [0, 1],然后映射到 0~(num_regions-1) 的索引 - num_regions = (len(self.col_cuts) - 1) * \ - (len(self.row_cuts) - 1) - target_region_index = int(np.floor(a * num_regions)) - target_region_index = np.clip( - target_region_index, 0, num_regions - 1) - # 将index映射到笛卡尔坐标 - coord = (target_region_index // (len(self.col_cuts) - 1), - target_region_index % (len(self.col_cuts) - 1)) - self.car_pos[self.init_maze_step] = self.rectangles[coord]['center'] - self.car_traj[self.init_maze_step].append(coord) - self.rectangles[coord]['is_visited'] = True + # 阶段 1:初始化迷宫,让多个车辆从区域中心出发,前往最近的几个区域中心点 + region_centers = [ + (i, j, self.rectangles[(i, j)]['center']) + for i in range(len(self.row_cuts) - 1) + for j in range(len(self.col_cuts) - 1) + ] + # 按照与区域中心的距离从近到远排序 + region_centers.sort( + key=lambda x: math.dist(x[2], (self.H / 2, self.W / 2)) + ) - # 计数 - self.init_maze_step += 1 + # 分配最近的区域给每辆车 + for idx in range(self.num_cars): + i, j, center = region_centers[idx] + self.car_pos[idx] = center + self.car_traj[idx].append((i, j)) + self.rectangles[(i, j)]['is_visited'] = True + + # 进入阶段 2:走迷宫 + self.phase = 2 state = np.concatenate( - [self.partition_values, np.array(self.car_pos).flatten()]) - if self.init_maze_step < self.num_cars: - return state, 0.0, False, False, {} - else: - # 进入阶段 2:走迷宫 - self.phase = 2 - return state, 0.0, False, False, {} + [self.partition_values, np.array(self.car_pos).flatten()] + ) + return state, 0.0, False, False, {} elif self.phase == 2: # 阶段 2:路径规划(走迷宫) diff --git a/params.yml b/params.yml index 382ea74..8233493 100644 --- a/params.yml +++ b/params.yml @@ -1,6 +1,6 @@ -H : 50 # 区域高度,网格点之间的距离为25m(单位距离) -W : 50 # 区域宽度 -num_cars : 3 # 系统数量(车-巢-机系统个数) +H : 20 # 区域高度,网格点之间的距离为25m(单位距离) +W : 25 # 区域宽度 +num_cars : 1 # 系统数量(车-巢-机系统个数) # 时间系数(单位:秒,每个网格一张照片) flight_time_factor : 3 # 每张照片对应的飞行时间,无人机飞行速度为9.5m/s,拍摄照片的时间间隔为3s 
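Note on the env.py and params.yml hunks above: phase 1 no longer consumes one action per car. Once the partition is fixed, each car is parked directly at one of the region centers closest to the middle of the H x W area and the environment jumps straight to phase 2, which is the simplification this patch is about. The following is a minimal standalone sketch of that assignment rule, assuming normalized row/column boundaries scaled by H and W and a (y, x) center convention; the helper name and the example boundary values are illustrative and not taken from the repository.

import math

def assign_cars_to_nearest_regions(row_bounds, col_bounds, H, W, num_cars):
    # Enumerate every region with its center point (y, x).
    centers = []
    for i in range(len(row_bounds) - 1):
        for j in range(len(col_bounds) - 1):
            cy = (row_bounds[i] + row_bounds[i + 1]) / 2 * H
            cx = (col_bounds[j] + col_bounds[j + 1]) / 2 * W
            centers.append((i, j, (cy, cx)))
    # Same ordering rule as the patched phase 1: distance to the area center.
    centers.sort(key=lambda t: math.dist(t[2], (H / 2, W / 2)))
    # The first num_cars entries are the regions the cars start in.
    return centers[:num_cars]

if __name__ == "__main__":
    # One row cut and one column cut, roughly matching CUT_NUM = 2 and the 20 x 25 area.
    print(assign_cars_to_nearest_regions(
        row_bounds=[0.0, 0.5, 1.0], col_bounds=[0.0, 0.5, 1.0],
        H=20, W=25, num_cars=1))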
diff --git a/solutions/best_solution_mtkl.json b/solutions/best_solution_mtkl.json new file mode 100644 index 0000000..9e7403f --- /dev/null +++ b/solutions/best_solution_mtkl.json @@ -0,0 +1,54 @@ +{ + "row_boundaries": [ + 0.0, + 0.2, + 0.4, + 0.7, + 1.0 + ], + "col_boundaries": [ + 0.0, + 0.5, + 1.0 + ], + "car_paths": { + "0": [ + [ + 15.0, + 12.5 + ], + [ + 5.0, + 12.5 + ] + ], + "1": [ + [ + 42.5, + 12.5 + ], + [ + 42.5, + 37.5 + ] + ], + "2": [ + [ + 27.5, + 12.5 + ], + [ + 27.5, + 37.5 + ], + [ + 15.0, + 37.5 + ], + [ + 5.0, + 37.5 + ] + ] + } +} \ No newline at end of file
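Note on DQN/RL_brain.py: the new Network places nn.ReLU() after the output Linear layer, so every predicted Q-value is clamped to be non-negative and there is no nonlinearity between the two Linear layers. If the reward signal can be negative (the deleted single-layer environments used negative completion times and -10000 penalties as rewards), targets of the form r + gamma * max Q can never be matched by a non-negative output. A conventional arrangement, shown below as a suggestion rather than as the author's code, puts the activation between the layers and leaves the Q-value head unbounded.

import torch.nn as nn

class Network(nn.Module):
    def __init__(self, n_features, n_actions, n_neuron=10):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_features, n_neuron),
            nn.ReLU(),                       # hidden nonlinearity
            nn.Linear(n_neuron, n_actions),  # unbounded Q-value estimates
        )

    def forward(self, s):
        return self.net(s)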
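Note on DQN/run_this.py: the script is flagged by the author as unfinished ("代码还没有写完,跑不了" / "the code is not finished yet and cannot run"). It still uses Tkinter-style calls (env.render() as a GUI refresh, env.after(), env.mainloop(), env.destroy()), unpacks a 3-tuple from step(), and reads env.n_actions / env.n_features, none of which the gymnasium-style PartitionMazeEnv in env.py provides. The sketch below shows one way the loop could be wired to that environment. It assumes the gymnasium 5-tuple step API visible in env.py, a Box observation space, and a single action value in [0, 1] that is discretized into n_bins levels so the discrete-action DeepQNetwork can be applied; the n_bins value and the action formatting are assumptions, not part of the patch.

import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # repo root, as in the original script
import numpy as np
from RL_brain import DeepQNetwork
from env import PartitionMazeEnv

def run_maze(env, agent, n_bins, episodes=200):
    step = 0
    for _ in range(episodes):
        observation, _ = env.reset()          # gymnasium reset returns (obs, info)
        while True:
            action_idx = agent.choose_action(observation)
            # Map the discrete index back to a value in [0, 1]; the exact shape
            # expected by the env's action space may need adjusting.
            a = np.array([action_idx / (n_bins - 1)], dtype=np.float32)
            observation_, reward, terminated, truncated, _ = env.step(a)
            agent.store_transition(observation, action_idx, reward, observation_)
            # Start learning once enough transitions are stored, then every 5 steps.
            if step > 200 and step % 5 == 0:
                agent.learn()
            observation = observation_
            if terminated or truncated:
                break
            step += 1

if __name__ == "__main__":
    env = PartitionMazeEnv()
    n_bins = 10  # granularity of the [0, 1] action discretization (assumption)
    agent = DeepQNetwork(n_actions=n_bins,
                         n_features=env.observation_space.shape[0],
                         learning_rate=0.01, reward_decay=0.9, e_greedy=0.9,
                         replace_target_iter=200, memory_size=2000)
    run_maze(env, agent, n_bins)
    agent.plot_cost()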