Simplify the maze initialization
commit 343008bc9f
parent 55e45fe14e

.gitignore (vendored, 1 line changed)
@@ -9,7 +9,6 @@ __pycache__/
# Pytorch weights
weights/
solutions/
PPO_preTrained/
PPO_logs/
logs/

@@ -1,15 +1,14 @@
from env import PartitionMazeEnv
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent
import gymnasium as gym
import os
import shutil
import argparse
import torch

import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()

DQN/RL_brain.py (254 lines, new file)
@@ -0,0 +1,254 @@
"""
Deep Q Network off-policy
"""
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(42)
torch.manual_seed(2)


class Network(nn.Module):
    """
    Network Structure
    """
    def __init__(self,
                 n_features,
                 n_actions,
                 n_neuron=10
                 ):
        super(Network, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_neuron, bias=True),
            nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True),
            nn.ReLU()
        )

    def forward(self, s):
        """
        :param s: state
        :return: q values for every action
        """
        q = self.net(s)
        return q


class DeepQNetwork(nn.Module):
    """
    Q Learning Algorithm
    """
    def __init__(self,
                 n_actions,
                 n_features,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 replace_target_iter=300,
                 memory_size=500,
                 batch_size=32,
                 e_greedy_increment=None):
        super(DeepQNetwork, self).__init__()

        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        # the memory is a table built with pd.DataFrame
        # the number of rows is the memory size, i.e. the number of stored transitions
        # the number of columns is the length of one transition [s, a, r, s_]:
        # a and r are single numbers, while s and s_ each have length n_features
        self.memory = pd.DataFrame(np.zeros((self.memory_size, self.n_features*2+2)))

        # build two networks: eval_net and target_net
        self.eval_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.target_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.loss_function = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)

        # record the loss of every training step
        self.cost_his = []

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            # hasattr checks whether the object already has this attribute
            self.memory_counter = 0

        transition = np.hstack((s, [a, r], s_))

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory.iloc[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every action
            s = torch.FloatTensor(observation)
            actions_value = self.eval_net(s)
            action = [np.argmax(actions_value.detach().numpy())][0]
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        # copy the parameters of eval_net into target_net
        self.target_net.load_state_dict(self.eval_net.state_dict())

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget params replaced\n')

        # sample batch memory from all memory
        batch_memory = self.memory.sample(self.batch_size) \
            if self.memory_counter > self.memory_size \
            else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True)

        # run the network
        s = torch.FloatTensor(batch_memory.iloc[:, :self.n_features].values)
        s_ = torch.FloatTensor(batch_memory.iloc[:, -self.n_features:].values)
        q_eval = self.eval_net(s)
        q_next = self.target_net(s_)

        # change q_target w.r.t q_eval's action
        q_target = q_eval.clone()

        # update the target values of the chosen actions
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory.iloc[:, self.n_features].values.astype(int)
        reward = batch_memory.iloc[:, self.n_features + 1].values

        q_target[batch_index, eval_act_index] = torch.FloatTensor(reward) + self.gamma * q_next.max(dim=1).values

        # train eval network
        loss = self.loss_function(q_target, q_eval)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.cost_his.append(loss.detach().numpy())

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        plt.figure()
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.show()
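
For reference, a minimal standalone sketch (not part of this commit) of how the DataFrame replay memory described above is laid out: one row per transition, the first n_features columns hold s, then a, then r, then the last n_features columns hold s_. The numbers are made up for illustration, with n_features = 2 and memory_size = 5:

import numpy as np
import pandas as pd

n_features, memory_size = 2, 5
memory = pd.DataFrame(np.zeros((memory_size, n_features * 2 + 2)))

# pack one transition [s, a, r, s_] into a single row
s, a, r, s_ = np.array([0.1, 0.2]), 3, 1.0, np.array([0.3, 0.4])
memory.iloc[0, :] = np.hstack((s, [a, r], s_))

# unpack the columns again, mirroring the slicing used in learn()
row = memory.iloc[0]
print(row.iloc[:n_features].values)   # state       -> [0.1 0.2]
print(int(row.iloc[n_features]))      # action      -> 3
print(row.iloc[n_features + 1])       # reward      -> 1.0
print(row.iloc[-n_features:].values)  # next state  -> [0.3 0.4]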

DQN/dqn.py (94 lines removed)
@@ -1,94 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()

        self.network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.network(x)

class Agent:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim

        # DQN networks
        self.eval_net = DQN(state_dim, action_dim)
        self.target_net = DQN(state_dim, action_dim)
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # training hyperparameters
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = deque(maxlen=10000)
        self.batch_size = 64
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.learning_rate)

    def choose_action(self, state):
        if random.random() < self.epsilon:
            # explore: pick a random action
            return random.randint(0, self.action_dim - 1)
        else:
            # exploit: pick the action with the largest Q value
            state = torch.FloatTensor(state).unsqueeze(0)
            q_values = self.eval_net(state)
            return torch.argmax(q_values).item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        # sample a random batch
        batch = random.sample(self.memory, self.batch_size)
        states = torch.FloatTensor([x[0] for x in batch])
        actions = torch.LongTensor([x[1] for x in batch])
        rewards = torch.FloatTensor([x[2] for x in batch])
        next_states = torch.FloatTensor([x[3] for x in batch])
        dones = torch.FloatTensor([x[4] for x in batch])

        # current Q values
        current_q_values = self.eval_net(states).gather(1, actions.unsqueeze(1))

        # target Q values
        next_q_values = self.target_net(next_states).detach()
        max_next_q = torch.max(next_q_values, dim=1)[0]
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q

        # loss
        loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values)

        # update the network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # periodically update the target network
        if self.learn.counter % 100 == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn.counter += 1

    # counter attribute attached to the learn function
    learn.counter = 0

DQN/env.py (134 lines removed)
@@ -1,134 +0,0 @@
import numpy as np
import gym
from gym import spaces


class Env(gym.Env):
    """Area-coverage environment for the multi car-nest-drone system"""

    def __init__(self):
        super(Env, self).__init__()

        # environment parameters
        self.H = 20  # height of the area
        self.W = 25  # width of the area
        self.k = 1   # number of systems

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # UAV computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car moving time
        self.comp_bs_factor = 5          # nest (base station) computation time

        # energy parameters
        self.flight_energy_factor = 0.05   # flight energy consumption
        self.comp_energy_factor = 0.05     # computation energy consumption
        self.trans_energy_factor = 0.0025  # transmission energy consumption
        self.battery_capacity = 30         # battery capacity

        # action space
        # [number of vertical cuts, number of horizontal cuts, offloading ratio]
        self.action_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, 1]),
            dtype=np.float32
        )

        # observation space
        # [current vertical cuts, current horizontal cuts, current max completion time]
        self.observation_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, float('inf')]),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def step(self, action):
        self.current_step += 1

        # parse the action
        v_cuts = int(action[0])  # number of vertical cuts
        h_cuts = int(action[1])  # number of horizontal cuts
        # rho = action[2]  # offloading ratio

        # TODO generate the cut positions; currently the cuts are uniform
        v_boundaries = np.linspace(0, self.H, v_cuts + 1)
        h_boundaries = np.linspace(0, self.W, h_cuts + 1)

        # compute the metrics of every sub-area
        total_time = 0
        valid_partition = True

        for i in range(len(v_boundaries) - 1):
            for j in range(len(h_boundaries) - 1):
                # size of the sub-area
                height = v_boundaries[i+1] - v_boundaries[i]
                width = h_boundaries[j+1] - h_boundaries[j]
                area = height * width

                # solve for rho
                rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
                    (self.comp_uav_factor - self.trans_time_factor)
                rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \
                    (self.comp_energy_factor * area - self.trans_energy_factor * area)
                if rho_energy_limit < 0:
                    valid_partition = False
                    break
                rho = min(rho_time_limit, rho_energy_limit)

                # time of each stage
                flight_time = self.flight_time_factor * area
                comp_time = self.comp_uav_factor * rho * area
                trans_time = self.trans_time_factor * (1 - rho) * area
                comp_bs_time = self.comp_bs_factor * (1 - rho) * area

                # # energy consumption
                # flight_energy = self.flight_energy_factor * area
                # comp_energy = self.comp_energy_factor * rho * area
                # trans_energy = self.trans_energy_factor * (1 - rho) * area
                # total_energy = flight_energy + comp_energy + trans_energy

                # # check the constraints
                # if total_energy > self.battery_capacity or (comp_time + trans_time > flight_time):
                #     valid_partition = False
                #     break

                # distance from the sub-area center to the area center
                center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2
                center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2
                dist_to_center = np.sqrt(
                    (center_y - self.H/2)**2 + (center_x - self.W/2)**2)
                car_time = dist_to_center * self.car_move_time_factor

                # update the total time
                task_time = max(flight_time + car_time, comp_bs_time)
                total_time = max(total_time, task_time)

            if not valid_partition:
                break

        # compute the reward
        if not valid_partition:
            reward = -10000  # penalize invalid partitions
            done = True
        else:
            reward = -total_time  # negative completion time as reward
            done = self.current_step >= self.max_steps

        # update the state
        self.state = np.array([v_cuts, h_cuts, total_time])

        return self.state, reward, done, {}

    def reset(self):
        # initialize the state
        self.state = np.array([1, 1, 0])
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

@@ -1,140 +0,0 @@
import numpy as np
import gym
from gym import spaces

class AllocationEnv(gym.Env):
    """Task-allocation environment (second layer)"""
    def __init__(self, subareas, num_systems):
        super(AllocationEnv, self).__init__()

        self.subareas = subareas        # list of sub-areas
        self.num_systems = num_systems  # number of systems

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # UAV computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car moving time
        self.comp_bs_factor = 5          # nest computation time

        # energy parameters
        self.flight_energy_factor = 0.05   # flight energy consumption
        self.comp_energy_factor = 0.05     # computation energy consumption
        self.trans_energy_factor = 0.0025  # transmission energy consumption
        self.battery_capacity = 30         # battery capacity

        # action space: which system each sub-area is assigned to
        self.action_space = spaces.MultiDiscrete([num_systems] * len(subareas))

        # observation space: [current load of each system]
        self.observation_space = spaces.Box(
            low=np.zeros(num_systems),
            high=np.ones(num_systems) * float('inf'),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def calculate_rho(self, area):
        """Compute the optimal offloading ratio"""
        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
            (self.comp_uav_factor - self.trans_time_factor)
        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \
            (self.comp_energy_factor * area - self.trans_energy_factor * area)
        if rho_energy_limit < 0:
            return None
        return min(rho_time_limit, rho_energy_limit)

    def step(self, action):
        self.current_step += 1

        # initialize the task list of each system
        system_tasks = {i: [] for i in range(self.num_systems)}

        # assign the tasks according to the action
        for i, system_id in enumerate(action):
            system_tasks[system_id].append(self.subareas[i])

        # completion time of each system
        system_times = []
        valid_allocation = True

        for system_id, tasks in system_tasks.items():
            if not tasks:  # the system got no task
                system_times.append(0)
                continue

            # call the third layer (route planning) for the result
            from env_routing import RoutingEnv
            route_env = RoutingEnv(tasks)
            completion_time, valid = route_env.optimize()

            if not valid:
                valid_allocation = False
                break

            system_times.append(completion_time)

        total_time = max(system_times) if system_times else 0

        # compute the reward
        if not valid_allocation:
            reward = -10000
            done = True
        else:
            reward = -total_time
            done = self.current_step >= self.max_steps

        # update the state (load of each system)
        self.state = np.array([len(tasks) for tasks in system_tasks.values()])

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.zeros(self.num_systems)
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

    def optimize(self):
        """Optimize the task allocation with DQN"""
        from dqn import Agent

        state_dim = self.observation_space.shape[0]
        action_dim = self.num_systems * len(self.subareas)

        agent = Agent(state_dim, action_dim)

        # training parameters
        episodes = 100  # fewer episodes, since this is only a sub-problem
        max_steps = 100

        best_reward = float('-inf')
        best_time = float('inf')
        valid_solution = False

        for episode in range(episodes):
            state = self.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_state, reward, done, _ = self.step(action)

                agent.store_transition(state, action, reward, next_state, done)
                agent.learn()

                episode_reward += reward
                state = next_state

                if done:
                    if reward != -10000:  # a valid solution
                        valid_solution = True
                        best_time = min(best_time, -reward)
                    break

        return best_time, valid_solution

@@ -1,88 +0,0 @@
import numpy as np
import gym
from gym import spaces

class PartitionEnv(gym.Env):
    """Area-partition environment (first layer)"""
    def __init__(self):
        super(PartitionEnv, self).__init__()

        # environment parameters
        self.H = 20  # height of the area
        self.W = 25  # width of the area
        self.k = 1   # number of systems

        # action space: [number of vertical cuts, number of horizontal cuts]
        self.action_space = spaces.Box(
            low=np.array([1, 1]),
            high=np.array([5, 5]),
            dtype=np.float32
        )

        # observation space: [current vertical cuts, current horizontal cuts, current max completion time]
        self.observation_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, float('inf')]),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def generate_subareas(self, v_cuts, h_cuts):
        """Generate the sub-area descriptions"""
        v_boundaries = np.linspace(0, self.H, v_cuts + 1)
        h_boundaries = np.linspace(0, self.W, h_cuts + 1)

        subareas = []
        for i in range(len(v_boundaries) - 1):
            for j in range(len(h_boundaries) - 1):
                height = v_boundaries[i+1] - v_boundaries[i]
                width = h_boundaries[j+1] - h_boundaries[j]
                center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2
                center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2

                subareas.append({
                    'height': height,
                    'width': width,
                    'area': height * width,
                    'center': (center_y, center_x)
                })
        return subareas

    def step(self, action):
        self.current_step += 1

        # parse the action
        v_cuts = int(action[0])  # number of vertical cuts
        h_cuts = int(action[1])  # number of horizontal cuts

        # generate the sub-areas
        subareas = self.generate_subareas(v_cuts, h_cuts)

        # call the second layer (task allocation) for the result
        from env_allocation import AllocationEnv
        alloc_env = AllocationEnv(subareas, self.k)
        total_time, valid = alloc_env.optimize()

        # compute the reward
        if not valid:
            reward = -10000  # penalize invalid partitions
            done = True
        else:
            reward = -total_time  # negative completion time as reward
            done = self.current_step >= self.max_steps

        # update the state
        self.state = np.array([v_cuts, h_cuts, total_time])

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.array([1, 1, 0])
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

@@ -1,152 +0,0 @@
import numpy as np
import gym
from gym import spaces

class RoutingEnv(gym.Env):
    """Route-planning environment (third layer)"""
    def __init__(self, tasks):
        super(RoutingEnv, self).__init__()

        self.tasks = tasks  # list of tasks
        self.H = 20         # height of the area
        self.W = 25         # width of the area
        self.region_center = (self.H/2, self.W/2)

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # UAV computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car moving time
        self.comp_bs_factor = 5          # nest computation time

        # action space: index of the next task to visit
        self.action_space = spaces.Discrete(len(tasks))

        # observation space: [current x, current y, mask of unvisited tasks]
        self.observation_space = spaces.Box(
            low=np.array([0, 0] + [0] * len(tasks)),
            high=np.array([self.H, self.W] + [1] * len(tasks)),
            dtype=np.float32
        )

        self.state = None
        self.current_position = self.region_center
        self.unvisited_mask = np.ones(len(tasks))
        self.total_flight_time = 0

    def calculate_task_time(self, task):
        """Compute the execution time of a single task"""
        area = task['area']

        # compute the optimal offloading ratio
        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
            (self.comp_uav_factor - self.trans_time_factor)
        rho_energy_limit = (30 - self.flight_time_factor * area - self.trans_time_factor * area) / \
            (self.comp_uav_factor * area - self.trans_time_factor * area)
        if rho_energy_limit < 0:
            return None, None
        rho = min(rho_time_limit, rho_energy_limit)

        # time of each stage
        flight_time = self.flight_time_factor * area
        comp_time = self.comp_uav_factor * rho * area
        trans_time = self.trans_time_factor * (1 - rho) * area
        comp_bs_time = self.comp_bs_factor * (1 - rho) * area

        task_time = max(flight_time, comp_bs_time)
        return task_time, rho

    def calculate_move_time(self, from_pos, to_pos):
        """Compute the moving time"""
        dist = np.sqrt((from_pos[0] - to_pos[0])**2 + (from_pos[1] - to_pos[1])**2)
        return dist * self.car_move_time_factor

    def step(self, action):
        # check whether the action is valid
        if self.unvisited_mask[action] == 0:
            return self.state, -10000, True, {}  # penalize picking an already-visited task

        # fetch the selected task
        task = self.tasks[action]
        task_center = task['center']

        # moving time
        move_time = self.calculate_move_time(self.current_position, task_center)

        # task execution time
        task_time, rho = self.calculate_task_time(task)
        if task_time is None:  # the task is infeasible
            return self.state, -10000, True, {}

        # update the state
        self.current_position = task_center
        self.unvisited_mask[action] = 0
        self.total_flight_time += task_time

        # build the new state
        self.state = np.concatenate([
            np.array(self.current_position),
            self.unvisited_mask
        ])

        # check whether all tasks are done
        done = np.sum(self.unvisited_mask) == 0

        # reward (negative total time)
        total_time = max(self.total_flight_time, move_time)
        reward = -total_time if done else -move_time

        return self.state, reward, done, {}

    def reset(self):
        self.current_position = self.region_center
        self.unvisited_mask = np.ones(len(self.tasks))
        self.total_flight_time = 0

        self.state = np.concatenate([
            np.array(self.current_position),
            self.unvisited_mask
        ])
        return self.state

    def render(self, mode='human'):
        pass

    def optimize(self):
        """Optimize the visiting order with DQN"""
        from dqn import Agent

        state_dim = self.observation_space.shape[0]
        action_dim = len(self.tasks)

        agent = Agent(state_dim, action_dim)

        # training parameters
        episodes = 50  # even fewer episodes, since this is the lowest-level sub-problem
        max_steps = len(self.tasks) + 1  # visit every task at most once, plus the return trip

        best_reward = float('-inf')
        best_time = float('inf')
        valid_solution = False

        for episode in range(episodes):
            state = self.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_state, reward, done, _ = self.step(action)

                agent.store_transition(state, action, reward, next_state, done)
                agent.learn()

                episode_reward += reward
                state = next_state

                if done:
                    if reward != -10000:  # a valid solution
                        valid_solution = True
                        best_time = min(best_time, -reward)
                    break

        return best_time, valid_solution

@@ -1,95 +0,0 @@
from env import Env
from dqn import Agent
import numpy as np
import matplotlib.pyplot as plt


def train():
    # create the environment and the agent
    env = Env()
    state_dim = env.observation_space.shape[0]
    action_dim = 10  # len(vertical cut options) + len(horizontal cut options)

    agent = Agent(state_dim, action_dim)

    # training parameters
    episodes = 1000
    max_steps = 1000

    # record the training process
    rewards_history = []
    best_reward = float('-inf')
    best_solution = None

    # start training
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            # choose an action
            action = agent.choose_action(state)

            # take the action
            next_state, reward, done, _ = env.step(action)

            # store the experience
            agent.store_transition(state, action, reward, next_state, done)

            # learn
            agent.learn()

            episode_reward += reward
            state = next_state

            if done:
                break

        # record the total reward of each episode
        rewards_history.append(episode_reward)

        # update the best solution
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_solution = {
                'vertical_cuts': int(action[0]),
                'horizontal_cuts': int(action[1]),
                # 'offload_ratio': action[2],
                'total_time': -reward if reward != -1000 else float('inf'),
                'episode': episode
            }

        # print the training progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return best_solution, rewards_history


def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()


def print_solution(solution):
    print("\nBest solution:")
    print(f"Found in episode {solution['episode']}")
    print(f"Vertical cuts: {solution['vertical_cuts']}")
    print(f"Horizontal cuts: {solution['horizontal_cuts']}")
    print(f"Offloading ratio: {solution['offload_ratio']:.2f}")
    print(f"Total completion time: {solution['total_time']:.2f} s")


if __name__ == "__main__":
    # train the model
    best_solution, rewards_history = train()

    # show the results
    plot_training_results(rewards_history)
    print_solution(best_solution)

@@ -1,118 +0,0 @@
from env_partition import PartitionEnv
from env_allocation import AllocationEnv
from env_routing import RoutingEnv
from dqn import Agent
import numpy as np
import matplotlib.pyplot as plt

def train_hierarchical():
    """Train the hierarchical reinforcement learning system"""
    # create the first-layer environment (area partition)
    partition_env = PartitionEnv()
    partition_state_dim = partition_env.observation_space.shape[0]
    partition_action_dim = 10  # 5 vertical cut options + 5 horizontal cut options

    partition_agent = Agent(partition_state_dim, partition_action_dim)

    # training parameters
    episodes = 1000
    max_steps = 1000

    # record the training process
    rewards_history = []
    best_reward = float('-inf')
    best_solution = None

    # start training
    print("Start training the hierarchical reinforcement learning system...")

    for episode in range(episodes):
        state = partition_env.reset()
        episode_reward = 0

        for step in range(max_steps):
            # choose an action
            action = partition_agent.choose_action(state)

            # take the action (this triggers the second- and third-layer optimization)
            next_state, reward, done, _ = partition_env.step(action)

            # store the experience
            partition_agent.store_transition(state, action, reward, next_state, done)

            # learn
            partition_agent.learn()

            episode_reward += reward
            state = next_state

            if done:
                break

        # record the total reward of each episode
        rewards_history.append(episode_reward)

        # update the best solution
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_solution = {
                'vertical_cuts': int(action[0]),
                'horizontal_cuts': int(action[1]),
                'total_time': -reward if reward != -10000 else float('inf'),
                'episode': episode
            }

        # print the training progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return best_solution, rewards_history

def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Hierarchical DQN Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()

def print_solution(solution):
    print("\nBest solution:")
    print(f"Found in episode {solution['episode']}")
    print(f"Vertical cuts: {solution['vertical_cuts']}")
    print(f"Horizontal cuts: {solution['horizontal_cuts']}")
    print(f"Total completion time: {solution['total_time']:.2f} s")

def visualize_partition(solution):
    """Visualize the partition result"""
    H, W = 20, 25
    v_cuts = solution['vertical_cuts']
    h_cuts = solution['horizontal_cuts']

    plt.figure(figsize=(10, 8))

    # draw the grid
    for i in range(v_cuts + 1):
        y = i * (H / v_cuts)
        plt.axhline(y=y, color='b', linestyle='-', alpha=0.5)

    for i in range(h_cuts + 1):
        x = i * (W / h_cuts)
        plt.axvline(x=x, color='b', linestyle='-', alpha=0.5)

    plt.title('Area Partition Visualization')
    plt.xlabel('Width')
    plt.ylabel('Height')
    plt.grid(True, alpha=0.3)
    plt.show()

if __name__ == "__main__":
    # train the model
    best_solution, rewards_history = train_hierarchical()

    # show the results
    plot_training_results(rewards_history)
    print_solution(best_solution)
    visualize_partition(best_solution)

DQN/run_this.py (58 lines, new file)
@@ -0,0 +1,58 @@
from RL_brain import DeepQNetwork
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


def run_maze():
    step = 0  # step counter: learning only starts after some transitions have accumulated in memory
    for episode in range(200):
        # initial observation
        observation = env.reset()

        while True:
            # refresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # !! store transition
            RL.store_transition(observation, action, reward, observation_)

            # after more than 200 transitions, learn once every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print("game over")
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = PartitionMazeEnv()

    # TODO: this script is not finished yet and does not run!!!
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
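
One reason for the TODO above: this loop was copied from the old tkinter maze tutorial (3-tuple step result, after()/mainloop()/destroy() calls), while PartitionMazeEnv, as seen in the env.py hunk below, returns a Gymnasium-style 5-tuple from step(). A rough sketch of how the loop could be adapted; this is illustrative only, not part of the commit, and it assumes reset() returns (observation, info):

def run_maze_gymnasium(env, RL, episodes=200):
    # sketch only: assumes env follows the Gymnasium reset()/step() API
    step = 0
    for episode in range(episodes):
        observation, info = env.reset()
        while True:
            action = RL.choose_action(observation)
            observation_, reward, terminated, truncated, info = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            # after enough transitions have accumulated, learn every 5 steps
            if step > 200 and step % 5 == 0:
                RL.learn()
            observation = observation_
            step += 1
            if terminated or truncated:
                break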

env.py (48 changed lines)
@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
        ##############################
        # hyperparameters that may need manual tuning
        ##############################
        self.CUT_NUM = 6   # half of the cuts horizontal, half vertical
        self.BASE_LINE = 12133.250161412347  # baseline time, computed by greedy or Monte Carlo
        self.CUT_NUM = 2   # half of the cuts horizontal, half vertical
        self.BASE_LINE = 4000  # baseline time, computed by greedy or Monte Carlo

        self.phase = 0  # phase control, 0: area partition, 1: maze initialization, 2: maze walking
        self.partition_step = 0  # step counter of the partition phase, range 0~4
@@ -168,30 +168,30 @@
            return state, reward, False, False, {}

        elif self.phase == 1:
            # Phase 1 (old): initialize the maze; each car leaves the area center for the center of a chosen region
            # make sure the action value is in [0, 1], then map it to an index in 0~(num_regions-1)
            num_regions = (len(self.col_cuts) - 1) * \
                (len(self.row_cuts) - 1)
            target_region_index = int(np.floor(a * num_regions))
            target_region_index = np.clip(
                target_region_index, 0, num_regions - 1)
            # map the index to Cartesian coordinates
            coord = (target_region_index // (len(self.col_cuts) - 1),
                     target_region_index % (len(self.col_cuts) - 1))
            self.car_pos[self.init_maze_step] = self.rectangles[coord]['center']
            self.car_traj[self.init_maze_step].append(coord)
            self.rectangles[coord]['is_visited'] = True
            # Phase 1 (new): initialize the maze; the cars leave the area center for the nearest region centers
            region_centers = [
                (i, j, self.rectangles[(i, j)]['center'])
                for i in range(len(self.row_cuts) - 1)
                for j in range(len(self.col_cuts) - 1)
            ]
            # sort by distance to the area center, nearest first
            region_centers.sort(
                key=lambda x: math.dist(x[2], (self.H / 2, self.W / 2))
            )

            # step counter (old)
            self.init_maze_step += 1
            # assign the nearest regions to the cars (new)
            for idx in range(self.num_cars):
                i, j, center = region_centers[idx]
                self.car_pos[idx] = center
                self.car_traj[idx].append((i, j))
                self.rectangles[(i, j)]['is_visited'] = True

            # enter phase 2: walk the maze
            self.phase = 2
            state = np.concatenate(
                [self.partition_values, np.array(self.car_pos).flatten()])
            if self.init_maze_step < self.num_cars:
                return state, 0.0, False, False, {}
            else:
                # enter phase 2: walk the maze
                self.phase = 2
                return state, 0.0, False, False, {}
                [self.partition_values, np.array(self.car_pos).flatten()]
            )
            return state, 0.0, False, False, {}

        elif self.phase == 2:
            # Phase 2: route planning (walk the maze)
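
The core of this commit is the new phase-1 logic above: instead of mapping an action value to a target region one car at a time, the environment now sorts all sub-region centers by their distance to the area center and sends each car to the nearest one in a single step. A standalone sketch of that assignment, with made-up sub-region centers for illustration; H, W and num_cars follow the defaults shown elsewhere in this diff:

import math

H, W = 20, 25
num_cars = 1
# hypothetical sub-region centers of a 2 x 2 partition, keyed by (row, col)
centers = {(0, 0): (5.0, 6.25), (0, 1): (5.0, 18.75),
           (1, 0): (15.0, 6.25), (1, 1): (15.0, 18.75)}

region_centers = [(i, j, c) for (i, j), c in centers.items()]
# sort by distance to the area center, nearest first
region_centers.sort(key=lambda x: math.dist(x[2], (H / 2, W / 2)))

# each car is assigned one of the nearest region centers, in order
car_pos = [region_centers[idx][2] for idx in range(num_cars)]
print(car_pos)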

@@ -1,6 +1,6 @@
H : 50 # height of the area; the distance between grid points is 25 m (one unit distance)
W : 50 # width of the area
num_cars : 3 # number of systems (number of car-nest-drone systems)
H : 20 # height of the area; the distance between grid points is 25 m (one unit distance)
W : 25 # width of the area
num_cars : 1 # number of systems (number of car-nest-drone systems)

# time coefficients (unit: seconds, one photo per grid cell)
flight_time_factor : 3 # flight time per photo; the UAV flies at 9.5 m/s and takes one photo every 3 s

solutions/best_solution_mtkl.json (54 lines, new file)
@@ -0,0 +1,54 @@
{
    "row_boundaries": [0.0, 0.2, 0.4, 0.7, 1.0],
    "col_boundaries": [0.0, 0.5, 1.0],
    "car_paths": {
        "0": [[15.0, 12.5], [5.0, 12.5]],
        "1": [[42.5, 12.5], [42.5, 37.5]],
        "2": [[27.5, 12.5], [27.5, 37.5], [15.0, 37.5], [5.0, 37.5]]
    }
}