Simplify the maze initialization logic

commit 343008bc9f
parent 55e45fe14e
Author: weixin_46229132
Date: 2025-03-18 17:27:49 +08:00
14 changed files with 397 additions and 854 deletions

1
.gitignore vendored
View File

@@ -9,7 +9,6 @@ __pycache__/
 # Pytorch weights
 weights/
-solutions/
 PPO_preTrained/
 PPO_logs/
 logs/

View File

@@ -1,15 +1,14 @@
+from env import PartitionMazeEnv
+from utils import str2bool, evaluate_policy
+from datetime import datetime
+from DDPG import DDPG_agent
 import gymnasium as gym
 import os
 import shutil
 import argparse
 import torch
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from env import PartitionMazeEnv
-from utils import str2bool, evaluate_policy
-from datetime import datetime
-from DDPG import DDPG_agent
 '''Hyperparameter Setting'''
 parser = argparse.ArgumentParser()

254
DQN/RL_brain.py Normal file
View File

@@ -0,0 +1,254 @@
"""
Deep Q Network, off-policy
"""
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(42)
torch.manual_seed(2)


class Network(nn.Module):
    """
    Network Structure
    """

    def __init__(self,
                 n_features,
                 n_actions,
                 n_neuron=10
                 ):
        super(Network, self).__init__()
        # single hidden layer with a ReLU activation between the two linear layers,
        # so the output Q values are unbounded (rewards in this project are negative)
        self.net = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_neuron, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True)
        )

    def forward(self, s):
        """
        :param s: state
        :return: q values for every action
        """
        q = self.net(s)
        return q


class DeepQNetwork(nn.Module):
    """
    Q Learning Algorithm
    """

    def __init__(self,
                 n_actions,
                 n_features,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 replace_target_iter=300,
                 memory_size=500,
                 batch_size=32,
                 e_greedy_increment=None):
        super(DeepQNetwork, self).__init__()
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        # The replay memory is a pd.DataFrame used as a table:
        # the number of rows equals memory_size, i.e. the number of stored transitions;
        # each transition [s, a, r, s_] has length n_features*2 + 2, since a and r are
        # scalars while s and s_ each contain n_features entries.
        self.memory = pd.DataFrame(np.zeros((self.memory_size, self.n_features*2+2)))

        # build two networks: eval_net and target_net
        self.eval_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.target_net = Network(n_features=self.n_features, n_actions=self.n_actions)

        self.loss_function = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)

        # record the loss of every learning step
        self.cost_his = []

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            # hasattr checks whether the object already has the given attribute
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory.iloc[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every action
            s = torch.FloatTensor(observation)
            actions_value = self.eval_net(s)
            action = int(np.argmax(actions_value.detach().numpy()))
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        # copy the eval network parameters into the target network
        self.target_net.load_state_dict(self.eval_net.state_dict())

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget params replaced\n')

        # sample batch memory from all memory
        batch_memory = self.memory.sample(self.batch_size) \
            if self.memory_counter > self.memory_size \
            else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True)

        # run the network
        s = torch.FloatTensor(batch_memory.iloc[:, :self.n_features].values)
        s_ = torch.FloatTensor(batch_memory.iloc[:, -self.n_features:].values)
        q_eval = self.eval_net(s)
        q_next = self.target_net(s_)

        # change q_target w.r.t q_eval's action
        q_target = q_eval.clone()

        # update the target values for the actions actually taken
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory.iloc[:, self.n_features].values.astype(int)
        reward = batch_memory.iloc[:, self.n_features + 1].values

        q_target[batch_index, eval_act_index] = torch.FloatTensor(reward) + self.gamma * q_next.max(dim=1).values

        # train eval network
        loss = self.loss_function(q_target, q_eval)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.cost_his.append(loss.detach().numpy())

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        plt.figure()
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.show()
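For reference, a minimal smoke test of how DeepQNetwork is meant to be driven (illustration only, not part of the commit: the 4-feature random "environment", the reward rule, and all numbers below are made up):

# Smoke test for RL_brain.DeepQNetwork with a fake random environment.
import numpy as np
from RL_brain import DeepQNetwork

n_features, n_actions = 4, 3
agent = DeepQNetwork(n_actions=n_actions, n_features=n_features,
                     learning_rate=0.01, reward_decay=0.9, e_greedy=0.9,
                     replace_target_iter=50, memory_size=200, batch_size=32)

s = np.random.rand(n_features).astype(np.float32)
for step in range(500):
    a = agent.choose_action(s)
    s_ = np.random.rand(n_features).astype(np.float32)  # fake next state
    r = 1.0 if a == 0 else 0.0                           # fake reward favouring action 0
    agent.store_transition(s, a, r, s_)
    if step > 100 and step % 5 == 0:  # warm up the replay memory before learning
        agent.learn()
    s = s_
agent.plot_cost()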

View File

@@ -1,94 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random


class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.network(x)


class Agent:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim

        # DQN networks
        self.eval_net = DQN(state_dim, action_dim)
        self.target_net = DQN(state_dim, action_dim)
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # training hyperparameters
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = deque(maxlen=10000)
        self.batch_size = 64
        self.learn_counter = 0

        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.learning_rate)

    def choose_action(self, state):
        if random.random() < self.epsilon:
            # explore: pick a random action
            return random.randint(0, self.action_dim - 1)
        else:
            # exploit: pick the action with the highest Q value
            state = torch.FloatTensor(state).unsqueeze(0)
            q_values = self.eval_net(state)
            return torch.argmax(q_values).item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        # sample a random mini-batch
        batch = random.sample(self.memory, self.batch_size)
        states = torch.FloatTensor([x[0] for x in batch])
        actions = torch.LongTensor([x[1] for x in batch])
        rewards = torch.FloatTensor([x[2] for x in batch])
        next_states = torch.FloatTensor([x[3] for x in batch])
        dones = torch.FloatTensor([x[4] for x in batch])

        # current Q values of the taken actions
        current_q_values = self.eval_net(states).gather(1, actions.unsqueeze(1))

        # target Q values from the target network
        next_q_values = self.target_net(next_states).detach()
        max_next_q = torch.max(next_q_values, dim=1)[0]
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q

        # loss
        loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values)

        # update the eval network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # periodically sync the target network; the counter is kept on the instance,
        # since attributes cannot be assigned to a bound method at runtime
        if self.learn_counter % 100 == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_counter += 1
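For reference, a tiny hand-check of the Bellman target computed in Agent.learn() (the numbers are made up; the (1 - dones) mask simply zeroes out the bootstrap term for terminal transitions):

# Hand-check of the target Q computation, with made-up values.
import torch

gamma = 0.99
rewards    = torch.tensor([ -5.0, -3.0])   # per-transition rewards
max_next_q = torch.tensor([-40.0, -25.0])  # max_a' Q_target(s', a')
dones      = torch.tensor([  0.0,   1.0])  # 1.0 marks a terminal transition

targets = rewards + (1 - dones) * gamma * max_next_q
print(targets)  # tensor([-44.6000, -3.0000]) -- the terminal transition keeps only its reward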

View File

@@ -1,134 +0,0 @@
import numpy as np
import gym
from gym import spaces


class Env(gym.Env):
    """Area-coverage environment for the multi car-nest-drone system"""

    def __init__(self):
        super(Env, self).__init__()

        # environment parameters
        self.H = 20  # region height
        self.W = 25  # region width
        self.k = 1   # number of systems

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # on-board (UAV) computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car movement time
        self.comp_bs_factor = 5          # nest (base-station) computation time

        # energy parameters
        self.flight_energy_factor = 0.05   # flight energy
        self.comp_energy_factor = 0.05     # computation energy
        self.trans_energy_factor = 0.0025  # transmission energy
        self.battery_capacity = 30         # battery capacity

        # action space
        # [number of vertical cuts, number of horizontal cuts, offloading ratio]
        self.action_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, 1]),
            dtype=np.float32
        )

        # observation space
        # [current vertical cuts, current horizontal cuts, current max completion time]
        self.observation_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, float('inf')]),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def step(self, action):
        self.current_step += 1

        # parse the action
        v_cuts = int(action[0])  # number of vertical cuts
        h_cuts = int(action[1])  # number of horizontal cuts
        # rho = action[2]        # offloading ratio

        # TODO generate cut positions; for now the region is cut uniformly
        v_boundaries = np.linspace(0, self.H, v_cuts + 1)
        h_boundaries = np.linspace(0, self.W, h_cuts + 1)

        # evaluate each sub-region
        total_time = 0
        valid_partition = True

        for i in range(len(v_boundaries) - 1):
            for j in range(len(h_boundaries) - 1):
                # sub-region size
                height = v_boundaries[i+1] - v_boundaries[i]
                width = h_boundaries[j+1] - h_boundaries[j]
                area = height * width

                # solve for rho
                rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
                    (self.comp_uav_factor - self.trans_time_factor)
                rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \
                    (self.comp_energy_factor * area - self.trans_energy_factor * area)
                if rho_energy_limit < 0:
                    valid_partition = False
                    break
                rho = min(rho_time_limit, rho_energy_limit)

                # time of each stage
                flight_time = self.flight_time_factor * area
                comp_time = self.comp_uav_factor * rho * area
                trans_time = self.trans_time_factor * (1 - rho) * area
                comp_bs_time = self.comp_bs_factor * (1 - rho) * area

                # # energy consumption
                # flight_energy = self.flight_energy_factor * area
                # comp_energy = self.comp_energy_factor * rho * area
                # trans_energy = self.trans_energy_factor * (1 - rho) * area
                # total_energy = flight_energy + comp_energy + trans_energy

                # # check the constraints
                # if total_energy > self.battery_capacity or (comp_time + trans_time > flight_time):
                #     valid_partition = False
                #     break

                # distance from the sub-region centre to the region centre
                center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2
                center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2
                dist_to_center = np.sqrt(
                    (center_y - self.H/2)**2 + (center_x - self.W/2)**2)
                car_time = dist_to_center * self.car_move_time_factor

                # update the overall completion time
                task_time = max(flight_time + car_time, comp_bs_time)
                total_time = max(total_time, task_time)

            if not valid_partition:
                break

        # reward
        if not valid_partition:
            reward = -10000  # penalize invalid partitions
            done = True
        else:
            reward = -total_time  # negative completion time as reward
            done = self.current_step >= self.max_steps

        # update the state
        self.state = np.array([v_cuts, h_cuts, total_time])

        return self.state, reward, done, {}

    def reset(self):
        # initialize the state
        self.state = np.array([1, 1, 0])
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass
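For context, the offloading-ratio calculation used above, restated as a standalone helper (same coefficients as this deleted Env; the sample area of 100 is arbitrary and only used to show which constraint binds):

# Standalone restatement of the rho calculation from Env.step (illustration only).
def solve_rho(area,
              flight_time_factor=3, comp_uav_factor=5, trans_time_factor=0.3,
              flight_energy_factor=0.05, comp_energy_factor=0.05,
              trans_energy_factor=0.0025, battery_capacity=30):
    # time constraint: on-board computation plus transmission must fit in the flight time
    rho_time_limit = (flight_time_factor - trans_time_factor) / \
                     (comp_uav_factor - trans_time_factor)
    # energy constraint: flight + computation + transmission energy within the battery capacity
    rho_energy_limit = (battery_capacity - flight_energy_factor * area - trans_energy_factor * area) / \
                       (comp_energy_factor * area - trans_energy_factor * area)
    if rho_energy_limit < 0:
        return None  # no feasible offloading ratio for this sub-region
    return min(rho_time_limit, rho_energy_limit)

print(solve_rho(100))  # ~0.574: the time constraint is the binding one for a 100-unit sub-region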

View File

@@ -1,140 +0,0 @@
import numpy as np
import gym
from gym import spaces


class AllocationEnv(gym.Env):
    """Task-allocation environment (layer 2)"""

    def __init__(self, subareas, num_systems):
        super(AllocationEnv, self).__init__()

        self.subareas = subareas        # list of sub-regions
        self.num_systems = num_systems  # number of systems

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # on-board (UAV) computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car movement time
        self.comp_bs_factor = 5          # nest (base-station) computation time

        # energy parameters
        self.flight_energy_factor = 0.05   # flight energy
        self.comp_energy_factor = 0.05     # computation energy
        self.trans_energy_factor = 0.0025  # transmission energy
        self.battery_capacity = 30         # battery capacity

        # action space: which system each sub-region is assigned to
        self.action_space = spaces.MultiDiscrete([num_systems] * len(subareas))

        # observation space: [current load of each system]
        self.observation_space = spaces.Box(
            low=np.zeros(num_systems),
            high=np.ones(num_systems) * float('inf'),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def calculate_rho(self, area):
        """Compute the optimal offloading ratio"""
        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
            (self.comp_uav_factor - self.trans_time_factor)
        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \
            (self.comp_energy_factor * area - self.trans_energy_factor * area)
        if rho_energy_limit < 0:
            return None
        return min(rho_time_limit, rho_energy_limit)

    def step(self, action):
        self.current_step += 1

        # task list of each system
        system_tasks = {i: [] for i in range(self.num_systems)}

        # assign tasks according to the action
        for i, system_id in enumerate(action):
            system_tasks[system_id].append(self.subareas[i])

        # completion time of each system
        system_times = []
        valid_allocation = True

        for system_id, tasks in system_tasks.items():
            if not tasks:  # system without any assigned task
                system_times.append(0)
                continue

            # call layer 3 (route planning) to get the result
            from env_routing import RoutingEnv
            route_env = RoutingEnv(tasks)
            completion_time, valid = route_env.optimize()

            if not valid:
                valid_allocation = False
                break

            system_times.append(completion_time)

        total_time = max(system_times) if system_times else 0

        # reward
        if not valid_allocation:
            reward = -10000
            done = True
        else:
            reward = -total_time
            done = self.current_step >= self.max_steps

        # update the state (load of each system)
        self.state = np.array([len(tasks) for tasks in system_tasks.values()])

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.zeros(self.num_systems)
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

    def optimize(self):
        """Optimize the task allocation with DQN"""
        from dqn import Agent

        state_dim = self.observation_space.shape[0]
        action_dim = self.num_systems * len(self.subareas)
        agent = Agent(state_dim, action_dim)

        # training parameters
        episodes = 100  # fewer episodes, since this is only a sub-problem
        max_steps = 100

        best_reward = float('-inf')
        best_time = float('inf')
        valid_solution = False

        for episode in range(episodes):
            state = self.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_state, reward, done, _ = self.step(action)
                agent.store_transition(state, action, reward, next_state, done)
                agent.learn()

                episode_reward += reward
                state = next_state

                if done:
                    if reward != -10000:  # valid solution
                        valid_solution = True
                        best_time = min(best_time, -reward)
                    break

        return best_time, valid_solution

View File

@@ -1,88 +0,0 @@
import numpy as np
import gym
from gym import spaces


class PartitionEnv(gym.Env):
    """Region-partition environment (layer 1)"""

    def __init__(self):
        super(PartitionEnv, self).__init__()

        # environment parameters
        self.H = 20  # region height
        self.W = 25  # region width
        self.k = 1   # number of systems

        # action space: [number of vertical cuts, number of horizontal cuts]
        self.action_space = spaces.Box(
            low=np.array([1, 1]),
            high=np.array([5, 5]),
            dtype=np.float32
        )

        # observation space: [current vertical cuts, current horizontal cuts, current max completion time]
        self.observation_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, float('inf')]),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def generate_subareas(self, v_cuts, h_cuts):
        """Generate the sub-region descriptions"""
        v_boundaries = np.linspace(0, self.H, v_cuts + 1)
        h_boundaries = np.linspace(0, self.W, h_cuts + 1)

        subareas = []
        for i in range(len(v_boundaries) - 1):
            for j in range(len(h_boundaries) - 1):
                height = v_boundaries[i+1] - v_boundaries[i]
                width = h_boundaries[j+1] - h_boundaries[j]
                center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2
                center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2

                subareas.append({
                    'height': height,
                    'width': width,
                    'area': height * width,
                    'center': (center_y, center_x)
                })
        return subareas

    def step(self, action):
        self.current_step += 1

        # parse the action
        v_cuts = int(action[0])  # number of vertical cuts
        h_cuts = int(action[1])  # number of horizontal cuts

        # generate the sub-regions
        subareas = self.generate_subareas(v_cuts, h_cuts)

        # call layer 2 (task allocation) to get the result
        from env_allocation import AllocationEnv
        alloc_env = AllocationEnv(subareas, self.k)
        total_time, valid = alloc_env.optimize()

        # reward
        if not valid:
            reward = -10000  # penalize invalid partitions
            done = True
        else:
            reward = -total_time  # negative completion time as reward
            done = self.current_step >= self.max_steps

        # update the state
        self.state = np.array([v_cuts, h_cuts, total_time])

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.array([1, 1, 0])
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

View File

@@ -1,152 +0,0 @@
import numpy as np
import gym
from gym import spaces


class RoutingEnv(gym.Env):
    """Route-planning environment (layer 3)"""

    def __init__(self, tasks):
        super(RoutingEnv, self).__init__()

        self.tasks = tasks  # list of tasks
        self.H = 20  # region height
        self.W = 25  # region width
        self.region_center = (self.H/2, self.W/2)

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # on-board (UAV) computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car movement time
        self.comp_bs_factor = 5          # nest (base-station) computation time

        # action space: index of the next task to visit
        self.action_space = spaces.Discrete(len(tasks))

        # observation space: [current x, current y, mask of unvisited tasks]
        self.observation_space = spaces.Box(
            low=np.array([0, 0] + [0] * len(tasks)),
            high=np.array([self.H, self.W] + [1] * len(tasks)),
            dtype=np.float32
        )

        self.state = None
        self.current_position = self.region_center
        self.unvisited_mask = np.ones(len(tasks))
        self.total_flight_time = 0

    def calculate_task_time(self, task):
        """Compute the execution time of a single task"""
        area = task['area']

        # optimal offloading ratio
        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
            (self.comp_uav_factor - self.trans_time_factor)
        rho_energy_limit = (30 - self.flight_time_factor * area - self.trans_time_factor * area) / \
            (self.comp_uav_factor * area - self.trans_time_factor * area)
        if rho_energy_limit < 0:
            return None, None
        rho = min(rho_time_limit, rho_energy_limit)

        # time of each stage
        flight_time = self.flight_time_factor * area
        comp_time = self.comp_uav_factor * rho * area
        trans_time = self.trans_time_factor * (1 - rho) * area
        comp_bs_time = self.comp_bs_factor * (1 - rho) * area

        task_time = max(flight_time, comp_bs_time)
        return task_time, rho

    def calculate_move_time(self, from_pos, to_pos):
        """Compute the car movement time"""
        dist = np.sqrt((from_pos[0] - to_pos[0])**2 + (from_pos[1] - to_pos[1])**2)
        return dist * self.car_move_time_factor

    def step(self, action):
        # check whether the action is valid
        if self.unvisited_mask[action] == 0:
            return self.state, -10000, True, {}  # penalize revisiting a task

        # selected task
        task = self.tasks[action]
        task_center = task['center']

        # movement time
        move_time = self.calculate_move_time(self.current_position, task_center)

        # task execution time
        task_time, rho = self.calculate_task_time(task)
        if task_time is None:  # infeasible task
            return self.state, -10000, True, {}

        # update the internal state
        self.current_position = task_center
        self.unvisited_mask[action] = 0
        self.total_flight_time += task_time

        # build the new observation
        self.state = np.concatenate([
            np.array(self.current_position),
            self.unvisited_mask
        ])

        # check whether all tasks have been completed
        done = np.sum(self.unvisited_mask) == 0

        # reward (negative total time)
        total_time = max(self.total_flight_time, move_time)
        reward = -total_time if done else -move_time

        return self.state, reward, done, {}

    def reset(self):
        self.current_position = self.region_center
        self.unvisited_mask = np.ones(len(self.tasks))
        self.total_flight_time = 0
        self.state = np.concatenate([
            np.array(self.current_position),
            self.unvisited_mask
        ])
        return self.state

    def render(self, mode='human'):
        pass

    def optimize(self):
        """Optimize the route with DQN"""
        from dqn import Agent

        state_dim = self.observation_space.shape[0]
        action_dim = len(self.tasks)
        agent = Agent(state_dim, action_dim)

        # training parameters
        episodes = 50  # even fewer episodes, since this is the lowest-level sub-problem
        max_steps = len(self.tasks) + 1  # visit each task at most once, plus the return trip

        best_reward = float('-inf')
        best_time = float('inf')
        valid_solution = False

        for episode in range(episodes):
            state = self.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_state, reward, done, _ = self.step(action)
                agent.store_transition(state, action, reward, next_state, done)
                agent.learn()

                episode_reward += reward
                state = next_state

                if done:
                    if reward != -10000:  # valid solution
                        valid_solution = True
                        best_time = min(best_time, -reward)
                    break

        return best_time, valid_solution
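As a point of comparison only (not part of the repo), the same routing problem can be approximated with a plain nearest-neighbour heuristic; the sketch below uses the same move-time model (Euclidean distance times car_move_time_factor) and hypothetical sub-region centres for a 20 x 25 region:

# Nearest-neighbour routing baseline (illustration only).
import math

def greedy_route(centers, start=(10.0, 12.5), car_move_time_factor=100):
    pos, remaining, move_time, order = start, list(centers), 0.0, []
    while remaining:
        nxt = min(remaining, key=lambda c: math.dist(pos, c))  # closest unvisited centre
        move_time += math.dist(pos, nxt) * car_move_time_factor
        order.append(nxt)
        remaining.remove(nxt)
        pos = nxt
    return order, move_time

centers = [(5.0, 6.25), (5.0, 18.75), (15.0, 6.25), (15.0, 18.75)]  # 2x2 split of a 20x25 region
print(greedy_route(centers))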

View File

@@ -1,95 +0,0 @@
from env import Env
from dqn import Agent
import numpy as np
import matplotlib.pyplot as plt


def train():
    # create the environment and the agent
    env = Env()
    state_dim = env.observation_space.shape[0]
    action_dim = 10  # len(vertical cut options) + len(horizontal cut options)
    agent = Agent(state_dim, action_dim)

    # training parameters
    episodes = 1000
    max_steps = 1000

    # training history
    rewards_history = []
    best_reward = float('-inf')
    best_solution = None

    # training loop
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            # choose an action
            action = agent.choose_action(state)

            # execute the action
            next_state, reward, done, _ = env.step(action)

            # store the transition
            agent.store_transition(state, action, reward, next_state, done)

            # learn
            agent.learn()

            episode_reward += reward
            state = next_state

            if done:
                break

        # record the total reward of this episode
        rewards_history.append(episode_reward)

        # update the best solution
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_solution = {
                'vertical_cuts': int(action[0]),
                'horizontal_cuts': int(action[1]),
                # 'offload_ratio': action[2],
                'total_time': -reward if reward != -1000 else float('inf'),
                'episode': episode
            }

        # print the training progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return best_solution, rewards_history


def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()


def print_solution(solution):
    print("\nBest solution:")
    print(f"Found in episode {solution['episode']}")
    print(f"Vertical cuts: {solution['vertical_cuts']}")
    print(f"Horizontal cuts: {solution['horizontal_cuts']}")
    print(f"Offloading ratio: {solution['offload_ratio']:.2f}")
    print(f"Total completion time: {solution['total_time']:.2f}")


if __name__ == "__main__":
    # train the model
    best_solution, rewards_history = train()

    # show the results
    plot_training_results(rewards_history)
    print_solution(best_solution)

View File

@@ -1,118 +0,0 @@
from env_partition import PartitionEnv
from env_allocation import AllocationEnv
from env_routing import RoutingEnv
from dqn import Agent
import numpy as np
import matplotlib.pyplot as plt


def train_hierarchical():
    """Train the hierarchical reinforcement-learning system"""
    # create the layer-1 environment (region partition)
    partition_env = PartitionEnv()
    partition_state_dim = partition_env.observation_space.shape[0]
    partition_action_dim = 10  # 5 vertical-cut options + 5 horizontal-cut options
    partition_agent = Agent(partition_state_dim, partition_action_dim)

    # training parameters
    episodes = 1000
    max_steps = 1000

    # training history
    rewards_history = []
    best_reward = float('-inf')
    best_solution = None

    # training loop
    print("Start training the hierarchical RL system...")
    for episode in range(episodes):
        state = partition_env.reset()
        episode_reward = 0

        for step in range(max_steps):
            # choose an action
            action = partition_agent.choose_action(state)

            # execute the action (this triggers the layer-2 and layer-3 optimizations)
            next_state, reward, done, _ = partition_env.step(action)

            # store the transition
            partition_agent.store_transition(state, action, reward, next_state, done)

            # learn
            partition_agent.learn()

            episode_reward += reward
            state = next_state

            if done:
                break

        # record the total reward of this episode
        rewards_history.append(episode_reward)

        # update the best solution
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_solution = {
                'vertical_cuts': int(action[0]),
                'horizontal_cuts': int(action[1]),
                'total_time': -reward if reward != -10000 else float('inf'),
                'episode': episode
            }

        # print the training progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return best_solution, rewards_history


def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Hierarchical DQN Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()


def print_solution(solution):
    print("\nBest solution:")
    print(f"Found in episode {solution['episode']}")
    print(f"Vertical cuts: {solution['vertical_cuts']}")
    print(f"Horizontal cuts: {solution['horizontal_cuts']}")
    print(f"Total completion time: {solution['total_time']:.2f}")


def visualize_partition(solution):
    """Visualize the resulting region partition"""
    H, W = 20, 25
    v_cuts = solution['vertical_cuts']
    h_cuts = solution['horizontal_cuts']

    plt.figure(figsize=(10, 8))

    # draw the grid
    for i in range(v_cuts + 1):
        y = i * (H / v_cuts)
        plt.axhline(y=y, color='b', linestyle='-', alpha=0.5)
    for i in range(h_cuts + 1):
        x = i * (W / h_cuts)
        plt.axvline(x=x, color='b', linestyle='-', alpha=0.5)

    plt.title('Area Partition Visualization')
    plt.xlabel('Width')
    plt.ylabel('Height')
    plt.grid(True, alpha=0.3)
    plt.show()


if __name__ == "__main__":
    # train the model
    best_solution, rewards_history = train_hierarchical()

    # show the results
    plot_training_results(rewards_history)
    print_solution(best_solution)
    visualize_partition(best_solution)

58
DQN/run_this.py Normal file
View File

@@ -0,0 +1,58 @@
from RL_brain import DeepQNetwork
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


def run_maze():
    step = 0  # step counter: accumulate some transitions in the replay memory before learning starts
    for episode in range(200):
        # initial observation
        observation = env.reset()

        while True:
            # refresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # !! store the transition
            RL.store_transition(observation, action, reward, observation_)

            # after more than 200 transitions have been collected, learn once every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print("game over")
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = PartitionMazeEnv()
    # TODO: this script is not finished yet and does not run!!!
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
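Since the script is explicitly marked as unfinished, here is one possible shape of the finished loop, adapted to the gymnasium-style API that PartitionMazeEnv appears to expose in the DDPG script above (a sketch only: the 5-tuple step return follows gymnasium, while n_actions, the scalar action in [0, 1], and the bin discretization are assumptions):

# Possible gymnasium-style adaptation of run_maze (sketch, not part of the commit).
import numpy as np

def run_maze_gymnasium(env, RL, n_actions=10, episodes=200):
    step = 0
    for episode in range(episodes):
        observation, _ = env.reset()
        while True:
            # map the discrete DQN action index to a scalar action in [0, 1]
            action_idx = RL.choose_action(observation)
            action = np.array([(action_idx + 0.5) / n_actions], dtype=np.float32)

            observation_, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            RL.store_transition(observation, action_idx, reward, observation_)
            if step > 200 and step % 5 == 0:
                RL.learn()

            observation = observation_
            step += 1
            if done:
                break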

44
env.py
View File

@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
         ##############################
         # Hyperparameters that may need manual tuning
         ##############################
-        self.CUT_NUM = 6   # half of the cuts horizontal, half vertical
-        self.BASE_LINE = 12133.250161412347  # baseline time, computed via greedy or Monte Carlo
+        self.CUT_NUM = 2   # half of the cuts horizontal, half vertical
+        self.BASE_LINE = 4000  # baseline time, computed via greedy or Monte Carlo
         self.phase = 0  # phase control: 0 = region partition, 1 = maze initialization, 2 = maze walking
         self.partition_step = 0  # step counter of the partition phase, range 0~4
@@ -168,29 +168,29 @@ class PartitionMazeEnv(gym.Env):
             return state, reward, False, False, {}
         elif self.phase == 1:
-            # Phase 1: maze initialization -- the vehicles leave the region centre for the centres of the partitioned sub-regions
-            # make sure the action value lies in [0, 1], then map it to an index in 0~(num_regions-1)
-            num_regions = (len(self.col_cuts) - 1) * \
-                (len(self.row_cuts) - 1)
-            target_region_index = int(np.floor(a * num_regions))
-            target_region_index = np.clip(
-                target_region_index, 0, num_regions - 1)
-            # map the index to Cartesian coordinates
-            coord = (target_region_index // (len(self.col_cuts) - 1),
-                     target_region_index % (len(self.col_cuts) - 1))
-            self.car_pos[self.init_maze_step] = self.rectangles[coord]['center']
-            self.car_traj[self.init_maze_step].append(coord)
-            self.rectangles[coord]['is_visited'] = True
-            # count the initialization steps
-            self.init_maze_step += 1
-            state = np.concatenate(
-                [self.partition_values, np.array(self.car_pos).flatten()])
-            if self.init_maze_step < self.num_cars:
-                return state, 0.0, False, False, {}
-            else:
-                # enter phase 2: walk the maze
-                self.phase = 2
-                return state, 0.0, False, False, {}
+            # Phase 1: maze initialization -- the vehicles leave the region centre for the nearest sub-region centres
+            region_centers = [
+                (i, j, self.rectangles[(i, j)]['center'])
+                for i in range(len(self.row_cuts) - 1)
+                for j in range(len(self.col_cuts) - 1)
+            ]
+            # sort by distance to the region centre, nearest first
+            region_centers.sort(
+                key=lambda x: math.dist(x[2], (self.H / 2, self.W / 2))
+            )
+            # assign the nearest sub-region to each car
+            for idx in range(self.num_cars):
+                i, j, center = region_centers[idx]
+                self.car_pos[idx] = center
+                self.car_traj[idx].append((i, j))
+                self.rectangles[(i, j)]['is_visited'] = True
+            # enter phase 2: walk the maze
+            self.phase = 2
+            state = np.concatenate(
+                [self.partition_values, np.array(self.car_pos).flatten()]
+            )
+            return state, 0.0, False, False, {}
         elif self.phase == 2:
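The core of this commit, restated as a standalone snippet (a sketch with hypothetical stand-ins for the cut lists and rectangle dict; the real logic lives in PartitionMazeEnv.step): every car is simply placed at the sub-region centre closest to the overall region centre, so phase 1 no longer consumes any actions.

# Standalone restatement of the new phase-1 assignment (illustrative inputs only).
import math

H, W, num_cars = 20.0, 25.0, 1
row_cuts = [0.0, 8.0, 20.0]    # hypothetical horizontal cut positions
col_cuts = [0.0, 12.5, 25.0]   # hypothetical vertical cut positions
rectangles = {
    (i, j): {'center': ((row_cuts[i] + row_cuts[i + 1]) / 2,
                        (col_cuts[j] + col_cuts[j + 1]) / 2),
             'is_visited': False}
    for i in range(len(row_cuts) - 1)
    for j in range(len(col_cuts) - 1)
}

# sort all sub-region centres by distance to the region centre, nearest first
region_centers = [(i, j, rectangles[(i, j)]['center']) for (i, j) in rectangles]
region_centers.sort(key=lambda x: math.dist(x[2], (H / 2, W / 2)))

# each car starts at the nearest still-unassigned centre
car_pos = {}
for idx in range(num_cars):
    i, j, center = region_centers[idx]
    car_pos[idx] = center
    rectangles[(i, j)]['is_visited'] = True

print(car_pos)  # {0: (14.0, 6.25)} for these example cuts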

View File

@@ -1,6 +1,6 @@
-H : 50 # region height; grid points are 25 m apart (one unit of distance)
-W : 50 # region width
-num_cars : 3 # number of systems (car-nest-drone systems)
+H : 20 # region height; grid points are 25 m apart (one unit of distance)
+W : 25 # region width
+num_cars : 1 # number of systems (car-nest-drone systems)
 # Time coefficients (unit: seconds, one photo per grid cell)
 flight_time_factor : 3 # flight time per photo; the drone flies at 9.5 m/s and takes a photo every 3 s
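With the smaller configuration the workload is easy to sanity-check by hand (a back-of-the-envelope figure only, assuming one photo per grid cell as the comments state):

# Rough workload check for the new config (assumption: one photo per grid cell).
H, W = 20, 25              # grid cells
flight_time_factor = 3     # seconds of flight per photo

cells = H * W
flight_time = cells * flight_time_factor
print(cells, flight_time)  # 500 cells -> 1500 s of pure flight time for a single drone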

View File

@@ -0,0 +1,54 @@
{
"row_boundaries": [
0.0,
0.2,
0.4,
0.7,
1.0
],
"col_boundaries": [
0.0,
0.5,
1.0
],
"car_paths": {
"0": [
[
15.0,
12.5
],
[
5.0,
12.5
]
],
"1": [
[
42.5,
12.5
],
[
42.5,
37.5
]
],
"2": [
[
27.5,
12.5
],
[
27.5,
37.5
],
[
15.0,
37.5
],
[
5.0,
37.5
]
]
}
}
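A small helper for interpreting this solution file (a sketch; it assumes the boundaries are fractions of the region height and width, which matches the earlier 50 x 50, 3-car configuration this file appears to have been generated under, and "solution.json" is a hypothetical file name):

# Convert the normalized boundaries back into absolute sub-region centres.
import json

H, W = 50, 50  # the configuration this solution corresponds to

with open("solution.json") as f:   # hypothetical file name
    sol = json.load(f)

rows = [b * H for b in sol["row_boundaries"]]   # [0, 10, 20, 35, 50]
cols = [b * W for b in sol["col_boundaries"]]   # [0, 25, 50]

centers = [((rows[i] + rows[i + 1]) / 2, (cols[j] + cols[j + 1]) / 2)
           for i in range(len(rows) - 1)
           for j in range(len(cols) - 1)]
print(centers)  # (5.0, 12.5), (5.0, 37.5), (15.0, 12.5), ... -- the waypoints listed in car_paths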