Add tuning baseline
.claude/settings.local.json (new file, +7 lines)
@@ -0,0 +1,7 @@
{
  "permissions": {
    "allow": [
      "Bash(python:*)"
    ]
  }
}
Binary file not shown.
@@ -626,3 +626,64 @@
  publisher = {{IEEE/ACM}},
  year      = {2012}
}

@inproceedings{Chen98SA,
  author    = {Ying Chen and
               Marianne Winslett and
               Yong Cho and
               Szu{-}Wen Kuo},
  editor    = {Gary L. Miller and
               Phillip B. Gibbons},
  title     = {Automatic Parallel {I/O} Performance Optimization in Panda},
  booktitle = {Proceedings of the Tenth Annual {ACM} Symposium on Parallel Algorithms
               and Architectures, {SPAA} '98, Puerto Vallarta, Mexico, June 28 -
               July 2, 1998},
  pages     = {108--118},
  publisher = {{ACM}},
  year      = {1998}
}

@inproceedings{Robert20SA,
  author    = {Sophie Robert and
               Soraya Zertal and
               Philippe Couv{\'{e}}e},
  editor    = {Maria Carla Calzarossa and
               Erol Gelenbe and
               Krzysztof Grochla and
               Ricardo Lent and
               Tadeusz Czach{\'{o}}rski},
  title     = {SHAMan: {A} Flexible Framework for Auto-tuning {HPC} Systems},
  booktitle = {Modelling, Analysis, and Simulation of Computer and Telecommunication
               Systems - 28th International Symposium, {MASCOTS} 2020, Nice, France,
               November 17-19, 2020, Revised Selected Papers},
  series    = {Lecture Notes in Computer Science},
  volume    = {12527},
  pages     = {147--158},
  publisher = {Springer},
  year      = {2020}
}

@inproceedings{Agarwal19TPE,
  author    = {Megha Agarwal and
               Divyansh Singhvi and
               Preeti Malakar and
               Suren Byna},
  title     = {Active Learning-based Automatic Tuning and Prediction of Parallel
               {I/O} Performance},
  booktitle = {{IEEE/ACM} Fourth International Parallel Data Systems Workshop, PDSW@SC
               2019, Denver, CO, USA, November 18, 2019},
  pages     = {20--29},
  publisher = {{IEEE}},
  year      = {2019}
}

@inproceedings{Bagbaba20RF,
  author    = {Ayse Bagbaba},
  title     = {Improving Collective {I/O} Performance with Machine Learning Supported
               Auto-tuning},
  booktitle = {2020 {IEEE} International Parallel and Distributed Processing Symposium
               Workshops, {IPDPSW} 2020, New Orleans, LA, USA, May 18-22, 2020},
  pages     = {814--821},
  publisher = {{IEEE}},
  year      = {2020}
}
@@ -141,6 +141,11 @@
\@writefile{toc}{\contentsline {subsubsection}{\numberline {\mbox {VIII-C}2}Storage-Level Effects and Request Collapse}{14}{}\protected@file@percent }
\newlabel{sec:ModeSwitch}{{\mbox {VIII-C}3}{14}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {\mbox {VIII-C}3}Deterministic and Non-Deterministic Modes}{14}{}\protected@file@percent }
\citation{Behzad13HDF5}
\citation{Chen98SA,Robert20SA}
\citation{Agarwal19TPE}
\citation{Bagbaba20RF}
\citation{Rajesh24TunIO}
\newlabel{fig:cc_exp1_3}{{12(a)}{15}}
\newlabel{sub@fig:cc_exp1_3}{{(a)}{15}}
\newlabel{fig:cc_exp1_2}{{12(b)}{15}}
@@ -160,24 +165,18 @@
\newlabel{fig:cc_exp3}{{13}{15}}
\@writefile{lof}{\contentsline {figure}{\numberline {14}{\ignorespaces Mode Switching}}{15}{}\protected@file@percent }
\newlabel{fig:cc_exp4}{{14}{15}}
\newlabel{fig:tune_exp1_1}{{\mbox {VIII-D}1}{15}}
\newlabel{fig:tune_exp1_2}{{\mbox {VIII-D}1}{15}}
\@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces Efficiency analysis of the tuning framework.}}{15}{}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Tuning steps}}}{15}{}\protected@file@percent }
\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Time (mins)}}}{15}{}\protected@file@percent }
\newlabel{fig:tune_exp1}{{15}{15}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {VIII-D}}Evaluating the I/O Tuning}{15}{}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {\mbox {VIII-D}1}Convergence Speed and Tuning Cost}{15}{}\protected@file@percent }
\citation{Rajesh24TunIO}
\bibstyle{IEEEtran}
\bibdata{IEEEabrv,references}
\bibcite{Ma15RS_bigdata}{1}
\newlabel{eq:roti}{{12}{16}}
\@writefile{lof}{\contentsline {figure}{\numberline {15}{\ignorespaces Efficiency analysis of the tuning framework.}}{16}{}\protected@file@percent }
\newlabel{fig:tune_exp1}{{15}{16}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {\mbox {VIII-D}1}Convergence Speed and Tuning Cost}{16}{}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {\mbox {VIII-D}2}Adaptation to Workload Shifts}{16}{}\protected@file@percent }
\@writefile{lof}{\contentsline {figure}{\numberline {16}{\ignorespaces Mode Switching}}{16}{}\protected@file@percent }
\newlabel{fig:tune_exp3}{{16}{16}}
\@writefile{toc}{\contentsline {section}{\numberline {IX}Conclusions}{16}{}\protected@file@percent }
\newlabel{sec:Con}{{IX}{16}}
\bibstyle{IEEEtran}
\bibdata{IEEEabrv,references}
\bibcite{Ma15RS_bigdata}{1}
\bibcite{Haut21DDL_RS}{2}
\bibcite{LEWIS17datacube}{3}
\bibcite{Yan21RS_manage1}{4}
@@ -208,5 +207,9 @@
\bibcite{Behzad13HDF5}{29}
\bibcite{Wang26RethinkingTuning}{30}
\bibcite{Xie12supercomputer}{31}
\bibcite{Chen98SA}{32}
\bibcite{Robert20SA}{33}
\bibcite{Agarwal19TPE}{34}
\bibcite{Bagbaba20RF}{35}
\@writefile{toc}{\contentsline {section}{References}{17}{}\protected@file@percent }
\gdef \@abspage@last{17}
@@ -208,4 +208,34 @@ B.~Xie, J.~S. Chase, D.~Dillow, O.~Drokin, S.~Klasky, S.~Oral, and
  J.~K. Hollingsworth, Ed.\hskip 1em plus 0.5em minus 0.4em\relax {IEEE/ACM},
  2012, p.~8.

\bibitem{Chen98SA}
Y.~Chen, M.~Winslett, Y.~Cho, and S.~Kuo, ``Automatic parallel {I/O}
  performance optimization in panda,'' in \emph{Proceedings of the Tenth Annual
  {ACM} Symposium on Parallel Algorithms and Architectures, {SPAA} '98, Puerto
  Vallarta, Mexico, June 28 - July 2, 1998}, G.~L. Miller and P.~B. Gibbons,
  Eds.\hskip 1em plus 0.5em minus 0.4em\relax {ACM}, 1998, pp. 108--118.

\bibitem{Robert20SA}
S.~Robert, S.~Zertal, and P.~Couv{\'{e}}e, ``Shaman: {A} flexible framework for
  auto-tuning {HPC} systems,'' in \emph{Modelling, Analysis, and Simulation of
  Computer and Telecommunication Systems - 28th International Symposium,
  {MASCOTS} 2020, Nice, France, November 17-19, 2020, Revised Selected Papers},
  ser. Lecture Notes in Computer Science, M.~C. Calzarossa, E.~Gelenbe,
  K.~Grochla, R.~Lent, and T.~Czach{\'{o}}rski, Eds., vol. 12527.\hskip 1em
  plus 0.5em minus 0.4em\relax Springer, 2020, pp. 147--158.

\bibitem{Agarwal19TPE}
M.~Agarwal, D.~Singhvi, P.~Malakar, and S.~Byna, ``Active learning-based
  automatic tuning and prediction of parallel {I/O} performance,'' in
  \emph{{IEEE/ACM} Fourth International Parallel Data Systems Workshop, PDSW@SC
  2019, Denver, CO, USA, November 18, 2019}.\hskip 1em plus 0.5em minus
  0.4em\relax {IEEE}, 2019, pp. 20--29.

\bibitem{Bagbaba20RF}
A.~Bagbaba, ``Improving collective {I/O} performance with machine learning
  supported auto-tuning,'' in \emph{2020 {IEEE} International Parallel and
  Distributed Processing Symposium Workshops, {IPDPSW} 2020, New Orleans, LA,
  USA, May 18-22, 2020}.\hskip 1em plus 0.5em minus 0.4em\relax {IEEE}, 2020,
  pp. 814--821.

\end{thebibliography}
@@ -20,45 +20,45 @@ Warning--empty author in LEWIS17datacube
Warning--empty booktitle in Lim17OCC

Done.
You've used 31 entries,
You've used 35 entries,
4087 wiz_defined-function locations,
1821 strings with 30947 characters,
and the built_in function-call counts, 29476 in all, are:
= -- 2286
> -- 900
< -- 263
+ -- 484
- -- 165
* -- 1465
:= -- 4128
add.period$ -- 74
call.type$ -- 31
change.case$ -- 45
chr.to.int$ -- 615
cite$ -- 34
duplicate$ -- 2045
empty$ -- 2263
format.name$ -- 197
if$ -- 6923
1851 strings with 32312 characters,
and the built_in function-call counts, 33516 in all, are:
= -- 2590
> -- 1011
< -- 298
+ -- 542
- -- 184
* -- 1661
:= -- 4680
add.period$ -- 86
call.type$ -- 35
change.case$ -- 53
chr.to.int$ -- 705
cite$ -- 38
duplicate$ -- 2325
empty$ -- 2614
format.name$ -- 221
if$ -- 7884
int.to.chr$ -- 0
int.to.str$ -- 31
missing$ -- 383
newline$ -- 118
num.names$ -- 40
pop$ -- 910
int.to.str$ -- 35
missing$ -- 436
newline$ -- 130
num.names$ -- 48
pop$ -- 1041
preamble$ -- 1
purify$ -- 0
quote$ -- 2
skip$ -- 2224
skip$ -- 2527
stack$ -- 0
substring$ -- 1485
swap$ -- 1763
text.length$ -- 58
substring$ -- 1699
swap$ -- 1994
text.length$ -- 63
text.prefix$ -- 0
top$ -- 5
type$ -- 31
type$ -- 35
warning$ -- 3
while$ -- 136
width$ -- 33
write$ -- 335
while$ -- 158
width$ -- 37
write$ -- 375
(There were 3 warnings)
@@ -1,4 +1,4 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (MiKTeX 23.4) (preloaded format=pdflatex 2025.10.23) 5 FEB 2026 17:24
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (MiKTeX 23.4) (preloaded format=pdflatex 2025.10.23) 11 FEB 2026 11:08
entering extended mode
restricted \write18 enabled.
%&-line parsing enabled.
@@ -647,22 +647,12 @@ Package pdftex.def Info: exp/cc_exp4.pdf used on input line 714.
[14]
Underfull \vbox (badness 10000) has occurred while \output is active []

Underfull \hbox (badness 1874) in paragraph at lines 734--735
\OT1/ptm/m/n/10 high-impact pa-ram-e-ter se-lec-tion and Re-in-force-ment
[]

<exp/tune_exp1_1.pdf, id=242, 253.94875pt x 208.78pt>
<exp/tune_exp1_1.pdf, id=242, 253.94875pt x 224.84pt>
File: exp/tune_exp1_1.pdf Graphic file (type pdf)
<use exp/tune_exp1_1.pdf>
Package pdftex.def Info: exp/tune_exp1_1.pdf used on input line 745.
(pdftex.def) Requested size: 110.10678pt x 90.52228pt.
<exp/tune_exp1_2.pdf, id=243, 253.94875pt x 204.765pt>
File: exp/tune_exp1_2.pdf Graphic file (type pdf)
<use exp/tune_exp1_2.pdf>
Package pdftex.def Info: exp/tune_exp1_2.pdf used on input line 751.
(pdftex.def) Requested size: 113.62068pt x 91.61536pt.
[15 <./exp/cc_exp1_3.pdf> <./exp/cc_exp1_2.pdf
Package pdftex.def Info: exp/tune_exp1_1.pdf used on input line 749.
(pdftex.def) Requested size: 130.08621pt x 115.17805pt.
[15 <./exp/cc_exp1_3.pdf> <./exp/cc_exp1_2.pdf

pdfTeX warning: pdflatex.exe (file ./exp/cc_exp1_2.pdf): PDF inclusion: multipl
e pdfs with page group included in a single page
@@ -682,46 +672,42 @@ e pdfs with page group included in a single page

pdfTeX warning: pdflatex.exe (file ./exp/cc_exp4.pdf): PDF inclusion: multiple
pdfs with page group included in a single page
> <./exp/tune_exp1_1.pdf

pdfTeX warning: pdflatex.exe (file ./exp/tune_exp1_1.pdf): PDF inclusion: multi
ple pdfs with page group included in a single page
> <./exp/tune_exp1_2.pdf

pdfTeX warning: pdflatex.exe (file ./exp/tune_exp1_2.pdf): PDF inclusion: multi
ple pdfs with page group included in a single page
>]
<exp/tune_exp3_1.pdf, id=332, 253.94875pt x 216.81pt>
<exp/tune_exp3_1.pdf, id=311, 253.94875pt x 216.81pt>
File: exp/tune_exp3_1.pdf Graphic file (type pdf)
<use exp/tune_exp3_1.pdf>
Package pdftex.def Info: exp/tune_exp3_1.pdf used on input line 774.
Package pdftex.def Info: exp/tune_exp3_1.pdf used on input line 764.
(pdftex.def) Requested size: 130.08621pt x 111.06456pt.
[16 <./exp/tune_exp1_1.pdf> <./exp/tune_exp3_1.pdf

Underfull \hbox (badness 2495) in paragraph at lines 789--790
pdfTeX warning: pdflatex.exe (file ./exp/tune_exp3_1.pdf): PDF inclusion: multi
ple pdfs with page group included in a single page
>]
Underfull \hbox (badness 2495) in paragraph at lines 779--780
[]\OT1/ptm/m/n/10 This work is sup-ported by the Na-tional Key R&D
[]

Underfull \hbox (badness 2799) in paragraph at lines 789--790
Underfull \hbox (badness 2799) in paragraph at lines 779--780
\OT1/ptm/m/n/10 Pro-gram of China ``In-ter-gov-ern-men-tal In-ter-na-tional Sci
-
[]

Underfull \hbox (badness 7576) in paragraph at lines 789--790
Underfull \hbox (badness 7576) in paragraph at lines 779--780
\OT1/ptm/m/n/10 ence and Tech-nol-ogy In-no-va-tion Co-op-er-a-tion" (Grant
[]

(rs_retrieval.bbl [16 <./exp/tune_exp3_1.pdf>]) [17] (rs_retrieval.aux)
(rs_retrieval.bbl) [17] (rs_retrieval.aux)

LaTeX Warning: There were multiply-defined labels.

)
Here is how much of TeX's memory you used:
5773 strings out of 476331
98594 string characters out of 5797649
5769 strings out of 476331
98432 string characters out of 5797649
1882660 words of memory out of 5000000
26076 multiletter control sequences out of 15000+600000
26073 multiletter control sequences out of 15000+600000
561830 words of font info for 131 fonts, out of 8000000 for 9000
1145 hyphenation exceptions out of 8191
62i,17n,67p,2484b,497s stack positions out of 10000i,1000n,20000p,200000b,200000s
@@ -743,9 +729,9 @@ urier/ucrr8a.pfb><D:/software/ctex/MiKTeX/fonts/type1/urw/times/utmb8a.pfb><D:/
software/ctex/MiKTeX/fonts/type1/urw/times/utmbi8a.pfb><D:/software/ctex/MiKTeX
/fonts/type1/urw/times/utmr8a.pfb><D:/software/ctex/MiKTeX/fonts/type1/urw/time
s/utmri8a.pfb>
Output written on rs_retrieval.pdf (17 pages, 2533024 bytes).
Output written on rs_retrieval.pdf (17 pages, 2530708 bytes).
PDF statistics:
440 PDF objects out of 1000 (max. 8388607)
432 PDF objects out of 1000 (max. 8388607)
0 named destinations out of 1000 (max. 500000)
121 words of extra memory for PDF output out of 10000 (max. 10000000)
116 words of extra memory for PDF output out of 10000 (max. 10000000)
rs_retrieval.pdf (binary file, not shown)
@@ -531,7 +531,7 @@ The comparative methods are categorized as follows:

\item \textbf{OpenDataCube (Window-based I/O):} A state-of-the-art data cube system that couples PostGIS indexes with windowed I/O via rasterio, enabling partial reads from monolithic image files. By leveraging GeoBox-based ROI computation and automatic overview selection, OpenDataCube represents the theoretical optimum for I/O selectivity but incurs runtime geospatial computation overhead to resolve pixel-to-geographic mappings.

\item \textbf{rio-tiler (Window-based I/O):} A lightweight raster reading engine optimized for dynamic tile generation. Similar to OpenDataCube, it employs PostGIS for spatial indexing and windowed I/O for partial data access, but features a streamlined execution path with minimal abstraction layers, resulting in lower per-query overhead. rio-tiler serves as a high-performance baseline for windowed reading without the complexity of full data cube management.
\item \textbf{Rio-tiler (Window-based I/O):} A lightweight raster reading engine optimized for dynamic tile generation. Similar to OpenDataCube, it employs PostGIS for spatial indexing and windowed I/O for partial data access, but features a streamlined execution path with minimal abstraction layers, resulting in lower per-query overhead. Rio-tiler serves as a high-performance baseline for windowed reading without the complexity of full data cube management.

\item \textbf{Ours (I/O-aware Indexing):} The proposed approach leverages a dual-layer inverted index structure comprising Grid-to-Image (G2I) and Image-to-Grid (I2G) mappings. By pre-materializing grid-to-pixel correspondences at ingestion time, our method translates spatio-temporal predicates directly into byte-level read plans, completely eliminating runtime geometric computations while preserving minimal I/O volume through precise windowed access (a simplified lookup sketch follows this list).
\end{enumerate}
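To make the index structure concrete, the following is a highly simplified sketch of the dual-layer lookup in Python. The container layout, key types, and byte-range arithmetic are hypothetical placeholders for the metadata materialized at ingestion time; the actual on-disk structures are not reproduced here.
\begin{verbatim}
from collections import defaultdict

# Hypothetical simplification of the dual-layer inverted index:
#   G2I: grid cell -> list of (image_id, byte_offset, byte_length)
#   I2G: image_id  -> set of grid cells the image covers
g2i = defaultdict(list)
i2g = defaultdict(set)

def ingest(image_id, grid_cell, byte_offset, byte_length):
    # Pre-materialize the grid-to-pixel correspondence at ingestion time.
    g2i[grid_cell].append((image_id, byte_offset, byte_length))
    i2g[image_id].add(grid_cell)

def read_plan(query_cells, time_filter):
    # Translate a spatio-temporal predicate into byte-level reads.
    # time_filter(image_id) -> bool is a placeholder temporal predicate.
    plan = []
    for cell in query_cells:
        for image_id, offset, length in g2i.get(cell, []):
            if time_filter(image_id):
                plan.append((image_id, offset, length))
    return plan
\end{verbatim}
Because both maps are filled during ingestion, answering a query reduces to dictionary lookups plus a temporal filter, which is the property the comparison above relies on.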
@@ -727,46 +727,36 @@ Our hybrid approach successfully combines the benefits of both worlds. As shown
\subsection{Evaluating the I/O Tuning}
In this section, we evaluate the effectiveness of the proposed SA-GMAB tuning framework. The experiments are designed to verify four key properties: fast convergence speed, robustness against stochastic noise, adaptability to workload shifts, and tangible end-to-end performance gains.

For comparison, we benchmark against three representative tuning strategies:
To comprehensively assess SA-GMAB across different optimization paradigms, we benchmark against five representative tuning strategies spanning heuristic search, probabilistic modeling, simulation-based prediction, and reinforcement learning approaches:

\begin{enumerate}
\item \textbf{Genetic algorithm (GA):} The standard genetic algorithm to explore the configuration space, serving as the basic algorithm in the TunIO.
\item \textbf{TunIO:} A state-of-the-art framework that integrates high-impact parameter selection and Reinforcement Learning (RL)-driven early stopping to balance tuning cost and performance in complex HPC I/O stacks.
\item \textbf{SA-GMAB (Ours):} The proposed framework combining surrogate modeling with a Genetic Multi-Armed Bandit strategy, explicitly designed to accelerate convergence and handle the stochastic performance fluctuations of concurrent workloads.
\begin{enumerate}
\item \textbf{Genetic Algorithm (GA):} A canonical evolutionary search method that explores the configuration space through selection, crossover, and mutation operators \cite{Behzad13HDF5}. GA serves as the foundational algorithm in TunIO and represents the baseline heuristic approach.

\item \textbf{Simulated Annealing (SA):} A classical stochastic optimization technique inspired by metallurgical annealing \cite{Chen98SA, Robert20SA}. SA has been widely applied in HPC I/O tuning for over two decades and provides a mature baseline for convergence analysis (a minimal search-loop sketch follows this list).

\item \textbf{Bayesian Optimization with TPE:} A model-based sequential optimization method that constructs a surrogate using Tree-structured Parzen Estimators and selects candidates via Expected Improvement \cite{Agarwal19TPE}. TPE represents state-of-the-art probabilistic optimization and achieves rapid convergence in recent HPC I/O studies.

\item \textbf{Random Forest Regression (RF):} A simulation-based approach that trains an ensemble predictor on historical execution logs to rank candidate configurations offline \cite{Bagbaba20RF}. RF drastically reduces tuning time from hours to seconds by avoiding repeated real-system evaluations.

\item \textbf{TunIO:} A recent framework integrating high-impact parameter selection with Reinforcement Learning-driven early stopping \cite{Rajesh24TunIO}. TunIO balances tuning cost and performance in complex HPC I/O stacks and represents the state-of-the-art RL-based approach.

\item \textbf{SA-GMAB (Ours):} The proposed framework combining surrogate modeling with a Genetic Multi-Armed Bandit strategy, explicitly designed to accelerate convergence and handle stochastic performance fluctuations in concurrent workloads.
\end{enumerate}
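To make the heuristic baselines concrete, the following is a minimal sketch of the simulated-annealing search loop referenced above. The parameter names, value ranges, and the \texttt{evaluate} callback (one real-system measurement per step) are hypothetical placeholders rather than the configuration space used in our experiments.
\begin{verbatim}
import math, random

# Hypothetical I/O parameter space; placeholders, not the tuned parameters.
SPACE = {
    "stripe_size_kb": [64, 128, 256, 512, 1024],
    "read_ahead_kb":  [0, 128, 512, 2048],
    "io_threads":     [1, 2, 4, 8, 16],
}

def neighbor(cfg):
    # Move one randomly chosen parameter to an adjacent value.
    cfg = dict(cfg)
    key = random.choice(list(SPACE))
    vals = SPACE[key]
    i = vals.index(cfg[key]) + random.choice([-1, 1])
    cfg[key] = vals[min(max(i, 0), len(vals) - 1)]
    return cfg

def simulated_annealing(evaluate, steps=100, t0=50.0, alpha=0.95):
    # evaluate(cfg) -> measured query latency in ms (lower is better).
    current = {k: random.choice(v) for k, v in SPACE.items()}
    cur_lat = evaluate(current)
    best, best_lat, temp = dict(current), cur_lat, t0
    for _ in range(steps):
        cand = neighbor(current)
        cand_lat = evaluate(cand)
        delta = cand_lat - cur_lat
        # Always accept improvements; accept regressions with a
        # probability that decays as the temperature cools.
        if delta < 0 or random.random() < math.exp(-delta / temp):
            current, cur_lat = cand, cand_lat
        if cand_lat < best_lat:
            best, best_lat = dict(cand), cand_lat
        temp *= alpha
    return best, best_lat
\end{verbatim}
The probabilistic acceptance of worse configurations is what lets SA escape local optima, but each accepted regression still costs a full real-system evaluation, which is relevant to the convergence behavior discussed below.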

\subsubsection{Convergence Speed and Tuning Cost}
\begin{figure}[tb]
\begin{figure}
\centering
\subfigure[Tuning steps]{
\begin{minipage}[b]{0.227\textwidth}
\includegraphics[width=0.94\textwidth]{exp/tune_exp1_1.pdf}
\end{minipage}
}
\label{fig:tune_exp1_1}
\subfigure[Time (mins)]{
\begin{minipage}[b]{0.227\textwidth}
\includegraphics[width=0.97\textwidth]{exp/tune_exp1_2.pdf}
\end{minipage}
}
\label{fig:tune_exp1_2}
\includegraphics[width=1.8in]{exp/tune_exp1_1.pdf}
\caption{Efficiency analysis of the tuning framework.}
\label{fig:tune_exp1}
\end{figure}

We first initiated a cold-start tuning session to evaluate how efficiently each method identifies high-quality configurations starting from a default, unoptimized state. Fig.~\ref{fig:tune_exp1}(a) reports the convergence trajectory of the best-observed latency over tuning steps.
We conduct a cold-start tuning experiment to evaluate how efficiently each method identifies high-performance I/O configurations from an unoptimized initial state. All methods start from the same default configuration with an initial latency of 834 ms. Each tuning step corresponds to evaluating one candidate configuration on the actual system, and we record the best-observed latency over 100 steps.
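This measurement protocol can be summarized by the following minimal harness; the tuner's propose and observe calls and the \texttt{evaluate} function are hypothetical stand-ins for the tuning method under test and the real-system measurement.
\begin{verbatim}
def run_cold_start(tuner, evaluate, steps=100, initial_latency=834.0):
    # Record the best-observed latency after each tuning step.
    best = initial_latency
    trace = []
    for step in range(1, steps + 1):
        cfg = tuner.propose()          # hypothetical: next candidate config
        latency = evaluate(cfg)        # one measurement on the real system
        tuner.observe(cfg, latency)    # hypothetical: feed the result back
        best = min(best, latency)
        trace.append((step, best))     # best-observed latency so far
    return trace
\end{verbatim}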

As illustrated in Fig.~\ref{fig:tune_exp1}(a), the three methods exhibit distinct search behaviors. The GA baseline demonstrates the slowest convergence. It exhibits a staircase-like descent with prolonged plateaus, requiring over 100 steps to reduce latency significantly. This sluggishness is attributed to its mutation mechanism, which lacks historical memory and repeatedly explores ineffective parameter spaces. The RL-based TunIO outperforms GA but still suffers from a slow start. While it eventually reaches a competitive latency ($\approx 277$ ms at step 140), its exploration phase is costly. The reinforcement learning agent requires a substantial number of interaction samples to learn the complex mapping between I/O parameters and reward signals. Our method achieves the fastest latency drop, rapidly decreasing from $500$ ms to a near-optimal zone ($\approx 315$ ms) within a short window. Unlike GA and TunIO, SA-GMAB leverages the surrogate model to pre-screen candidates. By effectively pruning unpromising configurations before they incur actual execution costs, SA-GMAB maximizes the information gain per step, making it particularly suitable for online scenarios where tuning overhead must be minimized.
\par
The convergence trajectories in Fig.~\ref{fig:tune_exp1} show that the six methods follow distinct patterns, which fall into three groups. SA exhibits the poorest performance, with latency initially surging to 1,009 ms at step~3 before gradually declining to 536 ms. Its non-monotonic acceptance of worse configurations proves detrimental in expensive I/O tuning scenarios. GA demonstrates steady but slow improvement, following a characteristic staircase-like descent with prolonged plateaus. GA requires over 100 steps to reach 394 ms. The mutation operator repeatedly explores ineffective regions, resulting in low information gain per evaluation. RF achieves rapid initial descent, dropping to approximately 480 ms within the first 10 steps and eventually reaching 336 ms. By constructing a surrogate model from historical execution data, RF can rank candidates without direct system evaluation. However, the plateau observed after step~15 suggests that the surrogate's predictive accuracy becomes a bottleneck: the model cannot extrapolate beyond the training distribution, limiting further improvement. BO-TPE exhibits the best performance among model-based methods, converging to 310 ms by step~100. BO-TPE effectively balances exploration and exploitation by maintaining a probabilistic surrogate and selecting candidates via expected improvement.

To strictly quantify the cost-effectiveness of the tuning process, we adopt the \textit{Return on Tuning Investment} (RoTI) metric proposed in TunIO \cite{Rajesh24TunIO}. We define the application performance $\mathcal{P}$ as the reciprocal of the query latency (i.e., $\mathcal{P} \propto 1/\mathcal{L}$). The RoTI metric is formalized as follows:

\begin{equation}
\label{eq:roti}
RoTI(t) = \frac{\mathcal{P}_{achieved}(t) - \mathcal{P}_{initial}}{t},
\end{equation}
where $t$ denotes the cumulative tuning time (overhead). $\mathcal{P}_{initial} = 1 / \mathcal{L}_{0}$ represents the baseline performance derived from the default configuration, and $\mathcal{P}_{achieved}(t) = 1 / \mathcal{L}_{t}$ represents the maximum performance achieved up to time $t$. Functionally, this metric represents the performance gain purchased per unit of tuning time. A higher RoTI value signifies that the optimizer rapidly identifies low-latency configurations with minimal computational overhead.
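A small worked example of Eq.~\ref{eq:roti} in Python is given below; apart from the 834 ms default latency taken from the setup above, the values are illustrative placeholders, and the absolute RoTI scale depends on the chosen latency and time units.
\begin{verbatim}
def roti(perf_achieved, perf_initial, tuning_time):
    # RoTI(t) = (P_achieved(t) - P_initial) / t, with performance P = 1/L.
    return (perf_achieved - perf_initial) / tuning_time

initial_latency = 834.0   # latency of the default configuration (ms)
best_latency = 315.0      # placeholder best-observed latency at time t
t = 825.0                 # placeholder cumulative tuning time
score = roti(1.0 / best_latency, 1.0 / initial_latency, t)
\end{verbatim}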

Fig.~\ref{fig:tune_exp1}(b) plots the RoTI curves over time. Our method (SA-GMAB) reaches a remarkable RoTI peak ($\approx 100$) at the early stage ($t=825$). This indicates that SA-GMAB yields the highest immediate return on investment, successfully locating high-quality configurations when the tuning budget is strictly limited. In contrast, TunIO peaks at a significantly lower value ($\approx 68$), while GA remains flat and inefficient ($\approx 46$). This confirms that the surrogate-assisted mechanism effectively amplifies the value of each exploration step. All curves exhibit a decaying trend as time progresses ($t \rightarrow \infty$). This is expected behavior: as the system converges to the global optimum, the marginal performance gain ($\Delta \mathcal{P}$) saturates while the accumulated time $t$ continues to grow. Notably, SA-GMAB's RoTI decays faster in the late stages simply because it has already exhausted the potential for improvement much earlier than the baselines.
The RL-based TunIO outperforms the above baselines but still suffers from a slow start. While it eventually reaches a competitive latency ($\approx 266$ ms at step~71), its exploration phase is costly. The RL agent requires a substantial number of interaction samples to learn the complex mapping between I/O parameters and reward signals. Our method achieves the fastest latency drop, rapidly decreasing from the initial latency to a near-optimal zone ($\approx 277$ ms) within a short time. SA-GMAB leverages the surrogate model to pre-screen candidates. Its permanent memory mechanism enables more efficient candidate pruning, making it particularly suitable for online scenarios where tuning overhead must be minimized.

\subsubsection{Adaptation to Workload Shifts}
\begin{figure}