Simplify the maze initialization
commit 343008bc9f
parent 55e45fe14e

.gitignore (vendored, 1 line changed)
@@ -9,7 +9,6 @@ __pycache__/
# Pytorch weights
weights/
solutions/
PPO_preTrained/
PPO_logs/
logs/

@@ -1,15 +1,14 @@
from env import PartitionMazeEnv
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent
import gymnasium as gym
import os
import shutil
import argparse
import torch

import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv
from utils import str2bool, evaluate_policy
from datetime import datetime
from DDPG import DDPG_agent

'''Hyperparameter Setting'''
parser = argparse.ArgumentParser()

DQN/RL_brain.py (254 lines, new file)
@@ -0,0 +1,254 @@
"""
Deep Q Network off-policy
"""
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(42)
torch.manual_seed(2)


class Network(nn.Module):
    """
    Network Structure
    """
    def __init__(self,
                 n_features,
                 n_actions,
                 n_neuron=10
                 ):
        super(Network, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_neuron, bias=True),
            nn.Linear(in_features=n_neuron, out_features=n_actions, bias=True),
            nn.ReLU()
        )

    def forward(self, s):
        """
        :param s: state
        :return: q values for every action
        """
        q = self.net(s)
        return q


class DeepQNetwork(nn.Module):
    """
    Q Learning Algorithm
    """
    def __init__(self,
                 n_actions,
                 n_features,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 replace_target_iter=300,
                 memory_size=500,
                 batch_size=32,
                 e_greedy_increment=None):
        super(DeepQNetwork, self).__init__()

        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        # the memory is a table built with pd.DataFrame
        # the number of rows is the memory size, i.e. the number of stored transitions
        # the number of columns is the length of one transition [s, a, r, s_]:
        # a and r are single numbers, while s and s_ each have length n_features
        self.memory = pd.DataFrame(np.zeros((self.memory_size, self.n_features*2+2)))

        # build two networks: eval_net and target_net
        self.eval_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.target_net = Network(n_features=self.n_features, n_actions=self.n_actions)
        self.loss_function = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=self.lr)

        # record the loss of every training step
        self.cost_his = []

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            # hasattr checks whether the object already has this attribute
            self.memory_counter = 0

        transition = np.hstack((s, [a, r], s_))

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory.iloc[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every action
            s = torch.FloatTensor(observation)
            actions_value = self.eval_net(s)
            action = [np.argmax(actions_value.detach().numpy())][0]
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        # copy the parameters of eval_net into target_net
        self.target_net.load_state_dict(self.eval_net.state_dict())

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget params replaced\n')

        # sample batch memory from all memory
        batch_memory = self.memory.sample(self.batch_size) \
            if self.memory_counter > self.memory_size \
            else self.memory.iloc[:self.memory_counter].sample(self.batch_size, replace=True)

        # run the network
        s = torch.FloatTensor(batch_memory.iloc[:, :self.n_features].values)
        s_ = torch.FloatTensor(batch_memory.iloc[:, -self.n_features:].values)
        q_eval = self.eval_net(s)
        q_next = self.target_net(s_)

        # change q_target w.r.t q_eval's action
        q_target = q_eval.clone()

        # update the target values of the chosen actions
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory.iloc[:, self.n_features].values.astype(int)
        reward = batch_memory.iloc[:, self.n_features + 1].values

        q_target[batch_index, eval_act_index] = torch.FloatTensor(reward) + self.gamma * q_next.max(dim=1).values

        # train eval network
        loss = self.loss_function(q_target, q_eval)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.cost_his.append(loss.detach().numpy())

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        plt.figure()
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.show()
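
For reference, a minimal standalone sketch (not part of this commit) of how the DataFrame replay memory described above is laid out: one row per transition, the first n_features columns hold s, then a, then r, then the last n_features columns hold s_. The numbers are made up for illustration, with n_features = 2 and memory_size = 5:

import numpy as np
import pandas as pd

n_features, memory_size = 2, 5
memory = pd.DataFrame(np.zeros((memory_size, n_features * 2 + 2)))

# pack one transition [s, a, r, s_] into a single row
s, a, r, s_ = np.array([0.1, 0.2]), 3, 1.0, np.array([0.3, 0.4])
memory.iloc[0, :] = np.hstack((s, [a, r], s_))

# unpack the columns again, mirroring the slicing used in learn()
row = memory.iloc[0]
print(row.iloc[:n_features].values)   # state       -> [0.1 0.2]
print(int(row.iloc[n_features]))      # action      -> 3
print(row.iloc[n_features + 1])       # reward      -> 1.0
print(row.iloc[-n_features:].values)  # next state  -> [0.3 0.4]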

DQN/dqn.py (94 lines removed)
@@ -1,94 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()

        self.network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.network(x)

class Agent:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim

        # DQN networks
        self.eval_net = DQN(state_dim, action_dim)
        self.target_net = DQN(state_dim, action_dim)
        self.target_net.load_state_dict(self.eval_net.state_dict())

        # training hyperparameters
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.memory = deque(maxlen=10000)
        self.batch_size = 64
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=self.learning_rate)

    def choose_action(self, state):
        if random.random() < self.epsilon:
            # explore: pick a random action
            return random.randint(0, self.action_dim - 1)
        else:
            # exploit: pick the action with the largest Q value
            state = torch.FloatTensor(state).unsqueeze(0)
            q_values = self.eval_net(state)
            return torch.argmax(q_values).item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        # sample a random batch
        batch = random.sample(self.memory, self.batch_size)
        states = torch.FloatTensor([x[0] for x in batch])
        actions = torch.LongTensor([x[1] for x in batch])
        rewards = torch.FloatTensor([x[2] for x in batch])
        next_states = torch.FloatTensor([x[3] for x in batch])
        dones = torch.FloatTensor([x[4] for x in batch])

        # current Q values
        current_q_values = self.eval_net(states).gather(1, actions.unsqueeze(1))

        # target Q values
        next_q_values = self.target_net(next_states).detach()
        max_next_q = torch.max(next_q_values, dim=1)[0]
        target_q_values = rewards + (1 - dones) * self.gamma * max_next_q

        # loss
        loss = nn.MSELoss()(current_q_values.squeeze(), target_q_values)

        # update the network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # periodically update the target network
        if self.learn.counter % 100 == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn.counter += 1

    # counter attribute attached to the learn function
    learn.counter = 0

DQN/env.py (134 lines removed)
@@ -1,134 +0,0 @@
import numpy as np
import gym
from gym import spaces


class Env(gym.Env):
    """Area-coverage environment for the multi car-nest-drone system"""

    def __init__(self):
        super(Env, self).__init__()

        # environment parameters
        self.H = 20  # height of the area
        self.W = 25  # width of the area
        self.k = 1   # number of systems

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # UAV computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car moving time
        self.comp_bs_factor = 5          # nest (base station) computation time

        # energy parameters
        self.flight_energy_factor = 0.05   # flight energy consumption
        self.comp_energy_factor = 0.05     # computation energy consumption
        self.trans_energy_factor = 0.0025  # transmission energy consumption
        self.battery_capacity = 30         # battery capacity

        # action space
        # [number of vertical cuts, number of horizontal cuts, offloading ratio]
        self.action_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, 1]),
            dtype=np.float32
        )

        # observation space
        # [current vertical cuts, current horizontal cuts, current max completion time]
        self.observation_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, float('inf')]),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def step(self, action):
        self.current_step += 1

        # parse the action
        v_cuts = int(action[0])  # number of vertical cuts
        h_cuts = int(action[1])  # number of horizontal cuts
        # rho = action[2]  # offloading ratio

        # TODO generate the cut positions; currently the cuts are uniform
        v_boundaries = np.linspace(0, self.H, v_cuts + 1)
        h_boundaries = np.linspace(0, self.W, h_cuts + 1)

        # compute the metrics of every sub-area
        total_time = 0
        valid_partition = True

        for i in range(len(v_boundaries) - 1):
            for j in range(len(h_boundaries) - 1):
                # size of the sub-area
                height = v_boundaries[i+1] - v_boundaries[i]
                width = h_boundaries[j+1] - h_boundaries[j]
                area = height * width

                # solve for rho
                rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
                    (self.comp_uav_factor - self.trans_time_factor)
                rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \
                    (self.comp_energy_factor * area - self.trans_energy_factor * area)
                if rho_energy_limit < 0:
                    valid_partition = False
                    break
                rho = min(rho_time_limit, rho_energy_limit)

                # time of each stage
                flight_time = self.flight_time_factor * area
                comp_time = self.comp_uav_factor * rho * area
                trans_time = self.trans_time_factor * (1 - rho) * area
                comp_bs_time = self.comp_bs_factor * (1 - rho) * area

                # # energy consumption
                # flight_energy = self.flight_energy_factor * area
                # comp_energy = self.comp_energy_factor * rho * area
                # trans_energy = self.trans_energy_factor * (1 - rho) * area
                # total_energy = flight_energy + comp_energy + trans_energy

                # # check the constraints
                # if total_energy > self.battery_capacity or (comp_time + trans_time > flight_time):
                #     valid_partition = False
                #     break

                # distance from the sub-area center to the area center
                center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2
                center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2
                dist_to_center = np.sqrt(
                    (center_y - self.H/2)**2 + (center_x - self.W/2)**2)
                car_time = dist_to_center * self.car_move_time_factor

                # update the total time
                task_time = max(flight_time + car_time, comp_bs_time)
                total_time = max(total_time, task_time)

            if not valid_partition:
                break

        # compute the reward
        if not valid_partition:
            reward = -10000  # penalize invalid partitions
            done = True
        else:
            reward = -total_time  # negative completion time as reward
            done = self.current_step >= self.max_steps

        # update the state
        self.state = np.array([v_cuts, h_cuts, total_time])

        return self.state, reward, done, {}

    def reset(self):
        # initialize the state
        self.state = np.array([1, 1, 0])
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

@@ -1,140 +0,0 @@
import numpy as np
import gym
from gym import spaces

class AllocationEnv(gym.Env):
    """Task-allocation environment (second layer)"""
    def __init__(self, subareas, num_systems):
        super(AllocationEnv, self).__init__()

        self.subareas = subareas        # list of sub-areas
        self.num_systems = num_systems  # number of systems

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # UAV computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car moving time
        self.comp_bs_factor = 5          # nest computation time

        # energy parameters
        self.flight_energy_factor = 0.05   # flight energy consumption
        self.comp_energy_factor = 0.05     # computation energy consumption
        self.trans_energy_factor = 0.0025  # transmission energy consumption
        self.battery_capacity = 30         # battery capacity

        # action space: which system each sub-area is assigned to
        self.action_space = spaces.MultiDiscrete([num_systems] * len(subareas))

        # observation space: [current load of each system]
        self.observation_space = spaces.Box(
            low=np.zeros(num_systems),
            high=np.ones(num_systems) * float('inf'),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def calculate_rho(self, area):
        """Compute the optimal offloading ratio"""
        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
            (self.comp_uav_factor - self.trans_time_factor)
        rho_energy_limit = (self.battery_capacity - self.flight_energy_factor * area - self.trans_energy_factor * area) / \
            (self.comp_energy_factor * area - self.trans_energy_factor * area)
        if rho_energy_limit < 0:
            return None
        return min(rho_time_limit, rho_energy_limit)

    def step(self, action):
        self.current_step += 1

        # initialize the task list of each system
        system_tasks = {i: [] for i in range(self.num_systems)}

        # assign the tasks according to the action
        for i, system_id in enumerate(action):
            system_tasks[system_id].append(self.subareas[i])

        # completion time of each system
        system_times = []
        valid_allocation = True

        for system_id, tasks in system_tasks.items():
            if not tasks:  # the system got no task
                system_times.append(0)
                continue

            # call the third layer (route planning) for the result
            from env_routing import RoutingEnv
            route_env = RoutingEnv(tasks)
            completion_time, valid = route_env.optimize()

            if not valid:
                valid_allocation = False
                break

            system_times.append(completion_time)

        total_time = max(system_times) if system_times else 0

        # compute the reward
        if not valid_allocation:
            reward = -10000
            done = True
        else:
            reward = -total_time
            done = self.current_step >= self.max_steps

        # update the state (load of each system)
        self.state = np.array([len(tasks) for tasks in system_tasks.values()])

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.zeros(self.num_systems)
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

    def optimize(self):
        """Optimize the task allocation with DQN"""
        from dqn import Agent

        state_dim = self.observation_space.shape[0]
        action_dim = self.num_systems * len(self.subareas)

        agent = Agent(state_dim, action_dim)

        # training parameters
        episodes = 100  # fewer episodes, since this is only a sub-problem
        max_steps = 100

        best_reward = float('-inf')
        best_time = float('inf')
        valid_solution = False

        for episode in range(episodes):
            state = self.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_state, reward, done, _ = self.step(action)

                agent.store_transition(state, action, reward, next_state, done)
                agent.learn()

                episode_reward += reward
                state = next_state

                if done:
                    if reward != -10000:  # a valid solution
                        valid_solution = True
                        best_time = min(best_time, -reward)
                    break

        return best_time, valid_solution

@@ -1,88 +0,0 @@
import numpy as np
import gym
from gym import spaces

class PartitionEnv(gym.Env):
    """Area-partition environment (first layer)"""
    def __init__(self):
        super(PartitionEnv, self).__init__()

        # environment parameters
        self.H = 20  # height of the area
        self.W = 25  # width of the area
        self.k = 1   # number of systems

        # action space: [number of vertical cuts, number of horizontal cuts]
        self.action_space = spaces.Box(
            low=np.array([1, 1]),
            high=np.array([5, 5]),
            dtype=np.float32
        )

        # observation space: [current vertical cuts, current horizontal cuts, current max completion time]
        self.observation_space = spaces.Box(
            low=np.array([1, 1, 0]),
            high=np.array([5, 5, float('inf')]),
            dtype=np.float32
        )

        self.state = None
        self.current_step = 0
        self.max_steps = 1000

    def generate_subareas(self, v_cuts, h_cuts):
        """Generate the sub-area descriptions"""
        v_boundaries = np.linspace(0, self.H, v_cuts + 1)
        h_boundaries = np.linspace(0, self.W, h_cuts + 1)

        subareas = []
        for i in range(len(v_boundaries) - 1):
            for j in range(len(h_boundaries) - 1):
                height = v_boundaries[i+1] - v_boundaries[i]
                width = h_boundaries[j+1] - h_boundaries[j]
                center_y = (v_boundaries[i] + v_boundaries[i+1]) / 2
                center_x = (h_boundaries[j] + h_boundaries[j+1]) / 2

                subareas.append({
                    'height': height,
                    'width': width,
                    'area': height * width,
                    'center': (center_y, center_x)
                })
        return subareas

    def step(self, action):
        self.current_step += 1

        # parse the action
        v_cuts = int(action[0])  # number of vertical cuts
        h_cuts = int(action[1])  # number of horizontal cuts

        # generate the sub-areas
        subareas = self.generate_subareas(v_cuts, h_cuts)

        # call the second layer (task allocation) for the result
        from env_allocation import AllocationEnv
        alloc_env = AllocationEnv(subareas, self.k)
        total_time, valid = alloc_env.optimize()

        # compute the reward
        if not valid:
            reward = -10000  # penalize invalid partitions
            done = True
        else:
            reward = -total_time  # negative completion time as reward
            done = self.current_step >= self.max_steps

        # update the state
        self.state = np.array([v_cuts, h_cuts, total_time])

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.array([1, 1, 0])
        self.current_step = 0
        return self.state

    def render(self, mode='human'):
        pass

@@ -1,152 +0,0 @@
import numpy as np
import gym
from gym import spaces

class RoutingEnv(gym.Env):
    """Route-planning environment (third layer)"""
    def __init__(self, tasks):
        super(RoutingEnv, self).__init__()

        self.tasks = tasks  # list of tasks
        self.H = 20         # height of the area
        self.W = 25         # width of the area
        self.region_center = (self.H/2, self.W/2)

        # time coefficients
        self.flight_time_factor = 3      # flight time per photo
        self.comp_uav_factor = 5         # UAV computation time
        self.trans_time_factor = 0.3     # transmission time
        self.car_move_time_factor = 100  # car moving time
        self.comp_bs_factor = 5          # nest computation time

        # action space: index of the next task to visit
        self.action_space = spaces.Discrete(len(tasks))

        # observation space: [current x, current y, mask of unvisited tasks]
        self.observation_space = spaces.Box(
            low=np.array([0, 0] + [0] * len(tasks)),
            high=np.array([self.H, self.W] + [1] * len(tasks)),
            dtype=np.float32
        )

        self.state = None
        self.current_position = self.region_center
        self.unvisited_mask = np.ones(len(tasks))
        self.total_flight_time = 0

    def calculate_task_time(self, task):
        """Compute the execution time of a single task"""
        area = task['area']

        # compute the optimal offloading ratio
        rho_time_limit = (self.flight_time_factor - self.trans_time_factor) / \
            (self.comp_uav_factor - self.trans_time_factor)
        rho_energy_limit = (30 - self.flight_time_factor * area - self.trans_time_factor * area) / \
            (self.comp_uav_factor * area - self.trans_time_factor * area)
        if rho_energy_limit < 0:
            return None, None
        rho = min(rho_time_limit, rho_energy_limit)

        # time of each stage
        flight_time = self.flight_time_factor * area
        comp_time = self.comp_uav_factor * rho * area
        trans_time = self.trans_time_factor * (1 - rho) * area
        comp_bs_time = self.comp_bs_factor * (1 - rho) * area

        task_time = max(flight_time, comp_bs_time)
        return task_time, rho

    def calculate_move_time(self, from_pos, to_pos):
        """Compute the moving time"""
        dist = np.sqrt((from_pos[0] - to_pos[0])**2 + (from_pos[1] - to_pos[1])**2)
        return dist * self.car_move_time_factor

    def step(self, action):
        # check whether the action is valid
        if self.unvisited_mask[action] == 0:
            return self.state, -10000, True, {}  # penalize picking an already-visited task

        # fetch the selected task
        task = self.tasks[action]
        task_center = task['center']

        # moving time
        move_time = self.calculate_move_time(self.current_position, task_center)

        # task execution time
        task_time, rho = self.calculate_task_time(task)
        if task_time is None:  # the task is infeasible
            return self.state, -10000, True, {}

        # update the state
        self.current_position = task_center
        self.unvisited_mask[action] = 0
        self.total_flight_time += task_time

        # build the new state
        self.state = np.concatenate([
            np.array(self.current_position),
            self.unvisited_mask
        ])

        # check whether all tasks are done
        done = np.sum(self.unvisited_mask) == 0

        # reward (negative total time)
        total_time = max(self.total_flight_time, move_time)
        reward = -total_time if done else -move_time

        return self.state, reward, done, {}

    def reset(self):
        self.current_position = self.region_center
        self.unvisited_mask = np.ones(len(self.tasks))
        self.total_flight_time = 0

        self.state = np.concatenate([
            np.array(self.current_position),
            self.unvisited_mask
        ])
        return self.state

    def render(self, mode='human'):
        pass

    def optimize(self):
        """Optimize the visiting order with DQN"""
        from dqn import Agent

        state_dim = self.observation_space.shape[0]
        action_dim = len(self.tasks)

        agent = Agent(state_dim, action_dim)

        # training parameters
        episodes = 50  # even fewer episodes, since this is the lowest-level sub-problem
        max_steps = len(self.tasks) + 1  # visit every task at most once, plus the return trip

        best_reward = float('-inf')
        best_time = float('inf')
        valid_solution = False

        for episode in range(episodes):
            state = self.reset()
            episode_reward = 0

            for step in range(max_steps):
                action = agent.choose_action(state)
                next_state, reward, done, _ = self.step(action)

                agent.store_transition(state, action, reward, next_state, done)
                agent.learn()

                episode_reward += reward
                state = next_state

                if done:
                    if reward != -10000:  # a valid solution
                        valid_solution = True
                        best_time = min(best_time, -reward)
                    break

        return best_time, valid_solution

@@ -1,95 +0,0 @@
from env import Env
from dqn import Agent
import numpy as np
import matplotlib.pyplot as plt


def train():
    # create the environment and the agent
    env = Env()
    state_dim = env.observation_space.shape[0]
    action_dim = 10  # len(vertical cut options) + len(horizontal cut options)

    agent = Agent(state_dim, action_dim)

    # training parameters
    episodes = 1000
    max_steps = 1000

    # record the training process
    rewards_history = []
    best_reward = float('-inf')
    best_solution = None

    # start training
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0

        for step in range(max_steps):
            # choose an action
            action = agent.choose_action(state)

            # take the action
            next_state, reward, done, _ = env.step(action)

            # store the experience
            agent.store_transition(state, action, reward, next_state, done)

            # learn
            agent.learn()

            episode_reward += reward
            state = next_state

            if done:
                break

        # record the total reward of each episode
        rewards_history.append(episode_reward)

        # update the best solution
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_solution = {
                'vertical_cuts': int(action[0]),
                'horizontal_cuts': int(action[1]),
                # 'offload_ratio': action[2],
                'total_time': -reward if reward != -1000 else float('inf'),
                'episode': episode
            }

        # print the training progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return best_solution, rewards_history


def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()


def print_solution(solution):
    print("\nBest solution:")
    print(f"Found in episode {solution['episode']}")
    print(f"Vertical cuts: {solution['vertical_cuts']}")
    print(f"Horizontal cuts: {solution['horizontal_cuts']}")
    print(f"Offloading ratio: {solution['offload_ratio']:.2f}")
    print(f"Total completion time: {solution['total_time']:.2f} s")


if __name__ == "__main__":
    # train the model
    best_solution, rewards_history = train()

    # show the results
    plot_training_results(rewards_history)
    print_solution(best_solution)

@@ -1,118 +0,0 @@
from env_partition import PartitionEnv
from env_allocation import AllocationEnv
from env_routing import RoutingEnv
from dqn import Agent
import numpy as np
import matplotlib.pyplot as plt

def train_hierarchical():
    """Train the hierarchical reinforcement learning system"""
    # create the first-layer environment (area partition)
    partition_env = PartitionEnv()
    partition_state_dim = partition_env.observation_space.shape[0]
    partition_action_dim = 10  # 5 vertical cut options + 5 horizontal cut options

    partition_agent = Agent(partition_state_dim, partition_action_dim)

    # training parameters
    episodes = 1000
    max_steps = 1000

    # record the training process
    rewards_history = []
    best_reward = float('-inf')
    best_solution = None

    # start training
    print("Start training the hierarchical reinforcement learning system...")

    for episode in range(episodes):
        state = partition_env.reset()
        episode_reward = 0

        for step in range(max_steps):
            # choose an action
            action = partition_agent.choose_action(state)

            # take the action (this triggers the second- and third-layer optimization)
            next_state, reward, done, _ = partition_env.step(action)

            # store the experience
            partition_agent.store_transition(state, action, reward, next_state, done)

            # learn
            partition_agent.learn()

            episode_reward += reward
            state = next_state

            if done:
                break

        # record the total reward of each episode
        rewards_history.append(episode_reward)

        # update the best solution
        if episode_reward > best_reward:
            best_reward = episode_reward
            best_solution = {
                'vertical_cuts': int(action[0]),
                'horizontal_cuts': int(action[1]),
                'total_time': -reward if reward != -10000 else float('inf'),
                'episode': episode
            }

        # print the training progress
        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(rewards_history[-10:])
            print(f"Episode {episode + 1}, Average Reward: {avg_reward:.2f}")

    return best_solution, rewards_history

def plot_training_results(rewards_history):
    plt.figure(figsize=(10, 5))
    plt.plot(rewards_history)
    plt.title('Hierarchical DQN Training Progress')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.grid(True)
    plt.show()

def print_solution(solution):
    print("\nBest solution:")
    print(f"Found in episode {solution['episode']}")
    print(f"Vertical cuts: {solution['vertical_cuts']}")
    print(f"Horizontal cuts: {solution['horizontal_cuts']}")
    print(f"Total completion time: {solution['total_time']:.2f} s")

def visualize_partition(solution):
    """Visualize the partition result"""
    H, W = 20, 25
    v_cuts = solution['vertical_cuts']
    h_cuts = solution['horizontal_cuts']

    plt.figure(figsize=(10, 8))

    # draw the grid
    for i in range(v_cuts + 1):
        y = i * (H / v_cuts)
        plt.axhline(y=y, color='b', linestyle='-', alpha=0.5)

    for i in range(h_cuts + 1):
        x = i * (W / h_cuts)
        plt.axvline(x=x, color='b', linestyle='-', alpha=0.5)

    plt.title('Area Partition Visualization')
    plt.xlabel('Width')
    plt.ylabel('Height')
    plt.grid(True, alpha=0.3)
    plt.show()

if __name__ == "__main__":
    # train the model
    best_solution, rewards_history = train_hierarchical()

    # show the results
    plot_training_results(rewards_history)
    print_solution(best_solution)
    visualize_partition(best_solution)

DQN/run_this.py (58 lines, new file)
@@ -0,0 +1,58 @@
from RL_brain import DeepQNetwork
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from env import PartitionMazeEnv


def run_maze():
    step = 0  # step counter: learning only starts after some transitions have accumulated in memory
    for episode in range(200):
        # initial observation
        observation = env.reset()

        while True:
            # refresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # !! store transition
            RL.store_transition(observation, action, reward, observation_)

            # after more than 200 transitions, learn once every 5 steps
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print("game over")
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = PartitionMazeEnv()

    # TODO: this script is not finished yet and does not run!!!
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
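
One reason for the TODO above: this loop was copied from the old tkinter maze tutorial (3-tuple step result, after()/mainloop()/destroy() calls), while PartitionMazeEnv, as seen in the env.py hunk below, returns a Gymnasium-style 5-tuple from step(). A rough sketch of how the loop could be adapted; this is illustrative only, not part of the commit, and it assumes reset() returns (observation, info):

def run_maze_gymnasium(env, RL, episodes=200):
    # sketch only: assumes env follows the Gymnasium reset()/step() API
    step = 0
    for episode in range(episodes):
        observation, info = env.reset()
        while True:
            action = RL.choose_action(observation)
            observation_, reward, terminated, truncated, info = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            # after enough transitions have accumulated, learn every 5 steps
            if step > 200 and step % 5 == 0:
                RL.learn()
            observation = observation_
            step += 1
            if terminated or truncated:
                break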

env.py (48 changed lines)
@@ -39,8 +39,8 @@ class PartitionMazeEnv(gym.Env):
        ##############################
        # hyperparameters that may need manual tuning
        ##############################
        self.CUT_NUM = 6   # half of the cuts horizontal, half vertical
        self.BASE_LINE = 12133.250161412347  # baseline time, computed by greedy or Monte Carlo
        self.CUT_NUM = 2   # half of the cuts horizontal, half vertical
        self.BASE_LINE = 4000  # baseline time, computed by greedy or Monte Carlo

        self.phase = 0  # phase control, 0: area partition, 1: maze initialization, 2: maze walking
        self.partition_step = 0  # step counter of the partition phase, range 0~4
@@ -168,30 +168,30 @@
            return state, reward, False, False, {}

        elif self.phase == 1:
            # Phase 1 (old): initialize the maze; each car leaves the area center for the center of a chosen region
            # make sure the action value is in [0, 1], then map it to an index in 0~(num_regions-1)
            num_regions = (len(self.col_cuts) - 1) * \
                (len(self.row_cuts) - 1)
            target_region_index = int(np.floor(a * num_regions))
            target_region_index = np.clip(
                target_region_index, 0, num_regions - 1)
            # map the index to Cartesian coordinates
            coord = (target_region_index // (len(self.col_cuts) - 1),
                     target_region_index % (len(self.col_cuts) - 1))
            self.car_pos[self.init_maze_step] = self.rectangles[coord]['center']
            self.car_traj[self.init_maze_step].append(coord)
            self.rectangles[coord]['is_visited'] = True
            # Phase 1 (new): initialize the maze; the cars leave the area center for the nearest region centers
            region_centers = [
                (i, j, self.rectangles[(i, j)]['center'])
                for i in range(len(self.row_cuts) - 1)
                for j in range(len(self.col_cuts) - 1)
            ]
            # sort by distance to the area center, nearest first
            region_centers.sort(
                key=lambda x: math.dist(x[2], (self.H / 2, self.W / 2))
            )

            # step counter (old)
            self.init_maze_step += 1
            # assign the nearest regions to the cars (new)
            for idx in range(self.num_cars):
                i, j, center = region_centers[idx]
                self.car_pos[idx] = center
                self.car_traj[idx].append((i, j))
                self.rectangles[(i, j)]['is_visited'] = True

            # enter phase 2: walk the maze
            self.phase = 2
            state = np.concatenate(
                [self.partition_values, np.array(self.car_pos).flatten()])
            if self.init_maze_step < self.num_cars:
                return state, 0.0, False, False, {}
            else:
                # enter phase 2: walk the maze
                self.phase = 2
                return state, 0.0, False, False, {}
                [self.partition_values, np.array(self.car_pos).flatten()]
            )
            return state, 0.0, False, False, {}

        elif self.phase == 2:
            # Phase 2: route planning (walk the maze)
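
The core of this commit is the new phase-1 logic above: instead of mapping an action value to a target region one car at a time, the environment now sorts all sub-region centers by their distance to the area center and sends each car to the nearest one in a single step. A standalone sketch of that assignment, with made-up sub-region centers for illustration; H, W and num_cars follow the defaults shown elsewhere in this diff:

import math

H, W = 20, 25
num_cars = 1
# hypothetical sub-region centers of a 2 x 2 partition, keyed by (row, col)
centers = {(0, 0): (5.0, 6.25), (0, 1): (5.0, 18.75),
           (1, 0): (15.0, 6.25), (1, 1): (15.0, 18.75)}

region_centers = [(i, j, c) for (i, j), c in centers.items()]
# sort by distance to the area center, nearest first
region_centers.sort(key=lambda x: math.dist(x[2], (H / 2, W / 2)))

# each car is assigned one of the nearest region centers, in order
car_pos = [region_centers[idx][2] for idx in range(num_cars)]
print(car_pos)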

@@ -1,6 +1,6 @@
H : 50 # height of the area; the distance between grid points is 25 m (one unit distance)
W : 50 # width of the area
num_cars : 3 # number of systems (number of car-nest-drone systems)
H : 20 # height of the area; the distance between grid points is 25 m (one unit distance)
W : 25 # width of the area
num_cars : 1 # number of systems (number of car-nest-drone systems)

# time coefficients (unit: seconds, one photo per grid cell)
flight_time_factor : 3 # flight time per photo; the UAV flies at 9.5 m/s and takes one photo every 3 s

solutions/best_solution_mtkl.json (54 lines, new file)
@@ -0,0 +1,54 @@
{
    "row_boundaries": [0.0, 0.2, 0.4, 0.7, 1.0],
    "col_boundaries": [0.0, 0.5, 1.0],
    "car_paths": {
        "0": [[15.0, 12.5], [5.0, 12.5]],
        "1": [[42.5, 12.5], [42.5, 37.5]],
        "2": [[27.5, 12.5], [27.5, 37.5], [15.0, 37.5], [5.0, 37.5]]
    }
}