HPCC2025/Q_learning/q_table.py

import random
import numpy as np
import json
import math
import yaml
# Parameter settings
STEP = 0.01
VALUES = [round(i * STEP, 2) for i in range(101)]  # 0.00 ~ 1.00
ACTION_DELTA = [STEP, -STEP]  # increase or decrease by 0.01
ACTIONS = []  # each action is (var_index, delta)
for i in range(3):
    for delta in ACTION_DELTA:
        ACTIONS.append((i, delta))

ALPHA = 0.1      # learning rate
GAMMA = 0.9      # discount factor
EPSILON = 0.2    # exploration rate
NUM_EPISODES = 100
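# The action space therefore has 3 * 2 = 6 discrete moves: nudge one of the three
# column cuts (x1, x2, x3) up or down by STEP. States are tuples of those three cuts.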
def f(state):
    """
    Compute the objective value T for the given column-cut state (placeholder objective).
    :param state: tuple (x1, x2, x3) of column-cut positions in (0, 1)
    :return: objective value T
    """
    # Note: the problem parameters are re-read from params2.yml on every call, and
    # `car_paths` is taken from the module-level variable loaded in __main__.
    with open('params2.yml', 'r', encoding='utf-8') as file:
        params = yaml.safe_load(file)

    H = params['H']
    W = params['W']
    num_cars = params['num_cars']

    flight_time_factor = params['flight_time_factor']
    comp_time_factor = params['comp_time_factor']
    trans_time_factor = params['trans_time_factor']
    car_time_factor = params['car_time_factor']
    bs_time_factor = params['bs_time_factor']

    flight_energy_factor = params['flight_energy_factor']
    comp_energy_factor = params['comp_energy_factor']
    trans_energy_factor = params['trans_energy_factor']
    battery_energy_capacity = params['battery_energy_capacity']

    col_cuts = list(state)
    col_cuts.insert(0, 0)
    col_cuts.append(1)
    row_cuts = [0, 0.5, 1]

    rectangles = []
    for i in range(len(row_cuts) - 1):
        for j in range(len(col_cuts) - 1):
            d = (col_cuts[j + 1] - col_cuts[j]) * W * \
                (row_cuts[i + 1] - row_cuts[i]) * H
            rho_time_limit = (flight_time_factor - trans_time_factor) / \
                (comp_time_factor - trans_time_factor)
            rho_energy_limit = (battery_energy_capacity - flight_energy_factor * d - trans_energy_factor * d) / \
                (comp_energy_factor * d - trans_energy_factor * d)
            if rho_energy_limit < 0:
                return 100000
            rho = min(rho_time_limit, rho_energy_limit)

            flight_time = flight_time_factor * d
            bs_time = bs_time_factor * (1 - rho) * d

            rectangles.append({
                'flight_time': flight_time,
                'bs_time': bs_time,
                'center': ((row_cuts[i] + row_cuts[i + 1]) / 2.0 * H,
                           (col_cuts[j] + col_cuts[j + 1]) / 2.0 * W)
            })

    mortorcade_time_lt = []
    for idx in range(num_cars):
        car_path = car_paths[idx]

        flight_time = sum(rectangles[point]['flight_time']
                          for point in car_path)
        bs_time = sum(rectangles[point]['bs_time'] for point in car_path)

        car_time = 0
        for i in range(len(car_path) - 1):
            first_point = car_path[i]
            second_point = car_path[i + 1]
            car_time += math.dist(
                rectangles[first_point]['center'], rectangles[second_point]['center']) * car_time_factor
        car_time += math.dist(rectangles[car_path[0]]['center'],
                              [H / 2, W / 2]) * car_time_factor
        car_time += math.dist(rectangles[car_path[-1]]['center'],
                              [H / 2, W / 2]) * car_time_factor

        mortorcade_time_lt.append(max(car_time + flight_time, bs_time))

    return max(mortorcade_time_lt)
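# Reading of f (summary of the code above): each rectangle's area d determines
# rho = min(rho_time_limit, rho_energy_limit); a car's completion time is
# max(ground travel time + flight time, total bs_time over its path), and T is the
# largest completion time across the num_cars cars.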
# Environment class: defines state transitions and rewards
class FunctionEnv:
    def __init__(self, initial_state):
        self.state = initial_state            # initial state (x1, x2, x3)
        self.best_value = float('inf')        # best objective value seen so far
        self.no_improvement_count = 0         # consecutive steps without improvement
        self.last_state = None                # previous state
        self.min_improvement = 0.001          # minimum improvement threshold
        self.max_no_improvement = 10          # max allowed consecutive non-improving steps
        self.target_threshold = 10000         # acceptable objective-value threshold

    def step(self, action):
        # action: (var_index, delta)
        var_index, delta = action
        new_state = list(self.state)
        new_state[var_index] = round(new_state[var_index] + delta, 2)

        # keep the value within [0, 1]
        if new_state[var_index] < 0 or new_state[var_index] > 1:
            return self.state, -10000.0, True  # end the episode

        # enforce the ordering constraint 0 < x1 < x2 < x3 < 1
        if not (0 < new_state[0] < new_state[1] < new_state[2] < 1):
            return self.state, -10000.0, True

        next_state = tuple(new_state)
        current_value = f(next_state)

        # check whether the target threshold has been reached
        if current_value < self.target_threshold:
            return next_state, 12000 - current_value, True

        # check whether the state barely changed
        if self.last_state is not None:
            state_diff = sum(abs(a - b) for a, b in zip(next_state, self.last_state))
            if state_diff < self.min_improvement:
                self.no_improvement_count += 1
            else:
                self.no_improvement_count = 0

        # check whether the objective improved
        if current_value < self.best_value:
            self.best_value = current_value
            self.no_improvement_count = 0
        else:
            self.no_improvement_count += 1

        # end the episode after too many consecutive steps without improvement
        if self.no_improvement_count >= self.max_no_improvement:
            return next_state, 12000 - current_value, True

        self.last_state = next_state
        self.state = next_state
        return next_state, 12000 - current_value, False

    def reset(self, state):
        self.state = state
        return self.state
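# Minimal usage sketch of the environment (assumes `car_paths` has already been
# loaded, since f() depends on it):
#   env = FunctionEnv((0.2, 0.4, 0.7))
#   next_state, reward, done = env.step((0, STEP))  # nudge x1 up by 0.01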
# Initialize the Q-table as a dict: key = state tuple, value = dict mapping action -> Q value
Q_table = {}

def get_Q(state, action):
    if state not in Q_table:
        Q_table[state] = {a: 0.0 for a in ACTIONS}
    return Q_table[state][action]

def set_Q(state, action, value):
    if state not in Q_table:
        Q_table[state] = {a: 0.0 for a in ACTIONS}
    Q_table[state][action] = value

def choose_action(state, epsilon):
    # ε-greedy policy
    if random.random() < epsilon:
        return random.choice(ACTIONS)
    else:
        if state not in Q_table:
            Q_table[state] = {a: 0.0 for a in ACTIONS}
        # return the action with the largest Q value
        return max(Q_table[state].items(), key=lambda x: x[1])[0]
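# Note: unseen states are initialized lazily with all-zero Q values, and ties in the
# greedy branch resolve to the first action in ACTIONS, i.e. (0, STEP).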
def load_initial_solution(file_path):
    """
    Load an initial solution from a JSON file.
    :param file_path: path to the JSON file
    :return: row-cut ratios, column-cut ratios, car paths
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    row_cuts = data['row_boundaries']
    col_cuts = data['col_boundaries']
    car_paths = data['car_paths']
    return row_cuts, col_cuts, car_paths
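# Expected JSON layout, inferred from the keys read above (illustrative values only):
# {
#   "row_boundaries": [0, 0.5, 1],
#   "col_boundaries": [0, 0.2, 0.4, 0.7, 1],
#   "car_paths": [[0, 1, 2], [3, 4, 5], ...]
# }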
if __name__ == "__main__":
    random.seed(42)

    # ---------------------------
    # Hyperparameters to adjust
    # ---------------------------
    solution_path = r"solutions\trav_ga_params2_parallel.json"
    params_file = r"params2"  # note: f() currently reads 'params2.yml' directly

    initial_row_cuts, initial_col_cuts, car_paths = load_initial_solution(
        solution_path)
    initial_state = (0.2, 0.4, 0.7)

    # Q-learning main loop
    env = FunctionEnv(initial_state)
    for episode in range(NUM_EPISODES):
        print(f"Episode {episode + 1} of {NUM_EPISODES}")
        state = env.reset(initial_state)
        done = False

        while not done:
            # choose an action
            action = choose_action(state, EPSILON)
            # take the action in the environment
            next_state, reward, done = env.step(action)
            # Q-learning update: Q(s,a) = Q(s,a) + α [r + γ * max_a' Q(s', a') - Q(s,a)]
            if next_state not in Q_table:
                Q_table[next_state] = {a: 0.0 for a in ACTIONS}
            max_next_Q = max(Q_table[next_state].values())
            current_Q = get_Q(state, action)
            new_Q = current_Q + ALPHA * (reward + GAMMA * max_next_Q - current_Q)
            set_Q(state, action, new_Q)
            state = next_state

        # gradually decay the exploration rate
        EPSILON = max(0.01, EPSILON * 0.999)

    # Report the best state recorded in the Q-table and its objective value
    best_state = None
    best_value = float('inf')
    for state in Q_table:
        # solutions are ranked by their objective value here
        state_value = f(state)
        if state_value < best_value:
            best_value = state_value
            best_state = state
    print("Best state found:", best_state, "objective value:", best_value)