%
\documentclass[9pt,journal,cspaper,compsoc]{IEEEtran}
\linespread{0.885}
%


% *** CITATION PACKAGES ***
%
\ifCLASSOPTIONcompsoc
  % IEEE Computer Society needs nocompress option
  % requires cite.sty v4.0 or later (November 2003)
  % \usepackage[nocompress]{cite}
\else
  % normal IEEE
  % \usepackage{cite}
\fi


% *** GRAPHICS RELATED PACKAGES ***
%
\ifCLASSINFOpdf
  \usepackage[pdftex]{graphicx}

  % declare the path(s) where your graphic files are
  \graphicspath{{../pdf/}{../jpeg/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\else
  % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
  % will default to the driver specified in the system graphics.cfg if no
  % driver is specified.
  \usepackage[dvips]{graphicx}

  % declare the path(s) where your graphic files are
  \graphicspath{{../eps/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  \DeclareGraphicsExtensions{.eps}
\fi
% graphicx was written by David Carlisle and Sebastian Rahtz. It is


% *** MATH PACKAGES ***
%
\usepackage[cmex10]{amsmath}
% A popular package from the American Mathematical Society that provides
% many useful and powerful commands for dealing with mathematics. If using
% it, be sure to load this package with the cmex10 option to ensure that
% only type 1 fonts will utilized at all point sizes. Without this option,
% it is possible that some math symbols, particularly those within
% footnotes, will be rendered in bitmap form which will result in a
% document that can not be IEEE Xplore compliant!
%
% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
% thus preventing page breaks from occurring within multiline equations. Use:
\interdisplaylinepenalty=2500
% after loading amsmath to restore such page breaks as IEEEtran.cls normally
% does. amsmath.sty is already installed on most LaTeX systems. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/


% *** SPECIALIZED LIST PACKAGES ***
%
\usepackage{algorithmic}
% algorithmic.sty was written by Peter Williams and Rogerio Brito.
% This package provides an algorithmic environment fo describing algorithms.
% You can use the algorithmic environment in-text or within a figure
% environment to provide for a floating algorithm. Do NOT use the algorithm
% floating environment provided by algorithm.sty (by the same authors) or
% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
% algorithm float types and packages that provide these will not provide
% correct IEEE style captions. The latest version and documentation of
% algorithmic.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
% There is also a support site at:
% http://algorithms.berlios.de/index.html
% Also of interest may be the (relatively newer and more customizable)
% algorithmicx.sty package by Szasz Janos:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/


% *** ALIGNMENT PACKAGES ***
%
\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
% the standard LaTeX2e array and tabular environments to provide better
% appearance and additional user controls. As the default LaTeX2e table
% generation code is lacking to the point of almost being broken with
% respect to the quality of the end results, all users are strongly
% advised to use an enhanced (at the very least that provided by array.sty)
% set of table tools. array.sty is already installed on most systems. The
% latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/


%\usepackage{mdwmath}
%\usepackage{mdwtab}
% Also highly recommended is Mark Wooding's extremely powerful MDW tools,
% especially mdwmath.sty and mdwtab.sty which are used to format equations
% and tables, respectively. The MDWtools set is already installed on most
% LaTeX systems. The lastest version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/


% IEEEtran contains the IEEEeqnarray family of commands that can be used to
% generate multiline equations as well as matrices, tables, etc., of high
% quality.

\usepackage{makecell}


\usepackage{eqparbox}
% Also of notable interest is Scott Pakin's eqparbox package for creating
% (automatically sized) equal width boxes - aka "natural width parboxes".
% Available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/


% *** SUBFIGURE PACKAGES ***
\ifCLASSOPTIONcompsoc
\usepackage[tight,normalsize,sf,SF]{subfigure}
\else
\usepackage[tight,footnotesize]{subfigure}
\fi


% special characters
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}


% for algorithm
\usepackage[linesnumbered,lined,ruled]{algorithm2e}

\usepackage{enumerate}
\usepackage{amssymb}

\usepackage{xcolor}   % 加载颜色宏包

% correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor}


\newcommand\toPrintComments{true}
\newcommand{\kol}[1]{
\ifthenelse{\equal{\toPrintComments}{true}}{
{\em#1}
}{}
}


\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
\title{An I/O efficient approach for concurrent spatio-temporal range retrievals over large-scale remote sensing image data}
%
%
% author names and IEEE memberships


\author{Ze~Deng,
        Yue Wang,
        Tao Liu,
        Schahram Dustdar,\IEEEmembership{Fellow,~IEEE,}
        Rajiv Ranjan,
        Albert Zomaya, \IEEEmembership{Fellow,~IEEE,}
        Yizhi Liu
        and  Lizhe~Wang$^{\dagger}$, ~\IEEEmembership{Fellow,~IEEE,}

        % <-this % stops a space

\IEEEcompsocitemizethanks{
\IEEEcompsocthanksitem Z. Deng, L. Wang (Corresponding author, lizhe.Wang@gmail.com), Y. Wang, T. Liu, and Y. Liu are with the School of Computer Science, China University of Geosciences, Wuhan, 430078, P.R.China.

\IEEEcompsocthanksitem Z. Deng, and L. Wang (Corresponding author, lizhe.Wang@gmail.com) are also with Hubei Key Laboratory of Intelligent Geo-Information Processing, China University of Geosciences, Wuhan 430074, China.

\IEEEcompsocthanksitem Schahram Dustdar is with the Technische Universit$\ddot{a}$t Wien, Austria.

\IEEEcompsocthanksitem R. Ranjan is with School of Computing, Newcastle University, U.K.

\IEEEcompsocthanksitem A. Zomaya is with the School of Information Technologies, The University of Sydney, Sydney, Australia.


%%\IEEEcompsocthanksitem M. Shell is with the Department
%%of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta,
%%GA, 30332.\protect\\
% note need leading \protect in front of \\ to get a newline within \thanks as
% \\ is fragile and will error, could use \hfil\break instead.
%%%E-mail: see http://www.michaelshell.org/contact.html
%%\IEEEcompsocthanksitem J. Doe and J. Doe are with Anonymous %%University.
}% <-this % stops a space
\thanks{}
}

% note the % following the last \IEEEmembership and also \thanks -
% these prevent an unwanted space from occurring between the last author name
% and the end of the author line. i.e., if you had this:
%
% \author{....lastname \thanks{...} \thanks{...} }
%                     ^------------^------------^----Do not want these spaces!
%
% a space would be appended to the last name and could cause every name on that
% line to be shifted left slightly. This is one of those "LaTeX things". For
% instance, "\textbf{A} \textbf{B}" will typeset as "A B" not "AB". To get
% "AB" then you have to do: "\textbf{A}\textbf{B}"
% \thanks is no different in this regard, so shield the last } of each \thanks
% that ends a line with a % and do not let a space in before the next \thanks.
% Spaces after \IEEEmembership other than the last one are OK (and needed) as
% you are supposed to have spaces between the names. For what it is worth,
% this is a minor point as most people would not even notice if the said evil
% space somehow managed to creep in.


% The paper headers
\markboth{IEEE Transactions on Computers,~Vol.~XX, No.~X, January~2014}%
{Shell \MakeLowercase{\textit{et al.}}: Bare Demo of IEEEtran.cls for Computer Society Journals}
% The only time the second header will appear is for the odd numbered pages


% for Computer Society papers, we must declare the abstract and index terms
% PRIOR to the title within the \IEEEcompsoctitleabstractindextext IEEEtran
% command as these need to go into the title area created by \maketitle.
\IEEEcompsoctitleabstractindextext{%
\begin{abstract}
%\boldmath
High-performance remote sensing analytics workflows require ingesting and retrieving massive image archives to support real-time spatio-temporal applications. While modern systems utilize window-based I/O reading to reduce data transfer, they face a dual bottleneck: the prohibitive overhead of runtime geospatial computations caused by the decoupling of logical indexing from physical storage, and severe storage-level I/O contention triggered by uncoordinated concurrent reads. To address these limitations, we present a comprehensive I/O-aware retrieval processing approach based on a novel "Index-as-an-Execution-Plan" paradigm. We introduce a dual-layer inverted structure that serves as a deterministic I/O planner, pre-materializing grid-to-pixel mappings to completely eliminate runtime geometric calculations. Furthermore, we design a hybrid concurrency-aware I/O coordination protocol that adaptively integrates Calvin-style deterministic ordering with optimistic execution, effectively converting I/O contention into request merging opportunities. To handle fluctuating workloads, we incorporate a Surrogate-Assisted Genetic Multi-Armed Bandit mechanism for automatic parameter tuning. Evaluated on a distributed cluster with Sentinel-2 datasets, our approach reduces end-to-end latency by an order of magnitude compared to standard window-based reading, achieves linear throughput scaling under high concurrency, and demonstrates superior convergence speed in automatic tuning.

\end{abstract}
% IEEEtran.cls defaults to using nonbold math in the Abstract.
% This preserves the distinction between vectors and scalars. However,
% if the journal you are submitting to favors bold math in the abstract,
% then you can use LaTeX's standard command \boldmath at the very start
% of the abstract to achieve this. Many IEEE journals frown on math
% in the abstract anyway. In particular, the Computer Society does
% not want either math or citations to appear in the abstract.

% Note that keywords are not normally used for peer review papers.
\begin{keywords}
Remote sensing data management, Spatio-temporal range retrievals, I/O-aware indexing, Concurrency control, I/O tuning
\end{keywords}}


% make the title area
\maketitle


% To allow for easy dual compilation without having to reenter the
% abstract/keywords data, the \IEEEcompsoctitleabstractindextext text will
% not be used in maketitle, but will appear (i.e., to be "transported")
% here as \IEEEdisplaynotcompsoctitleabstractindextext when compsoc mode
% is not selected <OR> if conference mode is selected - because compsoc
% conference papers position the abstract like regular (non-compsoc)
% papers do!
\IEEEdisplaynotcompsoctitleabstractindextext
% \IEEEdisplaynotcompsoctitleabstractindextext has no effect when using
% compsoc under a non-conference mode.


% For peer review papers, you can put extra information on the cover
% page as needed:
% \ifCLASSOPTIONpeerreview
% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
% \fi
%
% For peerreview papers, this IEEEtran command inserts a page break and
% creates the second title. It will be ignored for other modes.
\IEEEpeerreviewmaketitle


\section{Introduction}


%%\hfill mds

%%\hfill January 11, 2007
\IEEEPARstart{A} massive amount of remote sensing (RS) data, characterized by high spatial, temporal, and spectral resolutions, is being generated at an unprecedented speed due to the rapid advancement of Earth observation missions \cite{Ma15RS_bigdata}. For instance, NASA's AVIRIS-NG acquires nearly 9 GB of data per hour, while the EO-1 Hyperion sensor generates over 1.6 TB daily \cite{Haut21DDL_RS}. Beyond the sheer volume of data, these datasets are increasingly subjected to intensive concurrent access from global research communities and real-time emergency response systems (e.g., multi-departmental coordination during natural disasters). Consequently, modern RS platforms are required to provide not only massive storage capacity but also high-throughput retrieval capabilities to satisfy the simultaneous demands of numerous spatio-temporal analysis tasks.

\par
Existing RS data management systems \cite{LEWIS17datacube, Yan21RS_manage1, liu24mstgi} typically decompose a spatio-temporal range retrieval into a decoupled two-phase execution model. The first phase is the metadata filtering phase, which utilizes spatio-temporal metadata (e.g., footprints, timestamps) to identify candidate image files that intersect the retrieval predicate. Recent advancements have transitioned from traditional tree-based indexes \cite{Strobl08PostGIS, Simoes16PostGIST} to scalable distributed schemes based on grid encodings and space-filling curves, such as GeoHash \cite{suwardi15geohash}, GeoSOT \cite{Yan21RS_manage1}, and GeoMesa \cite{hughes15geomesa}. By leveraging these high-dimensional indexing structures, the search complexity of the first phase has been effectively reduced to $O(\log N)$ or even $O(1)$, making metadata discovery extremely efficient even for billion-scale datasets.

\par
The second phase is the data extraction phase, where the system reads the actual pixel data from the identified raw image files stored in distributed file systems or object stores. A critical observation in modern high-performance RS analytics is that the primary system bottleneck has fundamentally shifted from the first phase to the second. While the metadata search completes in milliseconds, the end-to-end retrieval latency is now dominated by the massive I/O overhead required to fetch, decompress, and process large-scale raw images. Traditional systems attempted to reduce I/O overhead by pre-slicing tiles and building pyramids (e.g., approaches used in Google Earth Engine \cite{gorelick17GEE} that store metadata in HBase and serve pre-tiled image pyramids), but aggressive tiling increases management complexity and produces many small files. More recent Cloud-Optimized GeoTIFF (COG) formats and COG-aware frameworks \cite{LEWIS17datacube}, \cite{riotiler25riotiler} exploit internal overviews and window-based I/O to read only the portions of files that spatially intersect a retrieval. 

While window-based I/O effectively reduces raw data transfer, it introduces a new "computation wall" due to the decoupling of logical indexing from physical storage. Current state-of-the-art systems operate on a "Search-then-Compute-then-Read" model: after identifying candidate files, they must perform fine-grained, per-image geospatial computations at runtime to map retrieval coordinates to precise file offsets and clip boundaries. This runtime geometric resolution ($C_{geo}$) becomes computationally prohibitive when processing a large volume of candidate images, often negating the benefits of I/O reduction. Moreover, under concurrent workloads, the lack of coordination among these independent read requests leads to severe I/O contention and storage thrashing, rendering traditional indexing-centric optimizations insufficient for real-time applications.

To address the problems above, we propose a novel "Index-as-an-Execution-Plan" paradigm to strictly bound the retrieval latency. Unlike conventional approaches that treat indexing and I/O execution as separate stages, our approach integrates fine-grained partial retrieval directly into the indexing structure. By pre-materializing the mapping between logical spatial grids and physical pixel windows, our system enables deterministic I/O planning without runtime geometric computation. To further ensure scalability, we introduce a concurrency control protocol tailored for spatio-temporal range retrievals and an automatic I/O tuning mechanism. The principal contributions of this paper are summarized as follows:

\begin{enumerate}
	\item We propose an I/O-aware "Index-as-an-Execution-Plan" schema. Instead of merely returning candidate image identifiers, our index directly translates high-level spatio-temporal predicates into concrete, byte-level windowed read plans. This design bridges the semantic gap between logical retrievals and physical storage, eliminating expensive runtime geospatial computations and ensuring that I/O cost is proportional strictly to the retrieval footprint.

	\item We propose a hybrid concurrency-aware I/O coordination protocol. This protocol adapts transaction processing principles by integrating Calvin-style deterministic ordering \cite{Thomson12Calvin} with optimistic execution \cite{Lim17OCC}. It shifts the focus from protecting database rows to coordinating shared I/O flows. This protocol dynamically switches strategies based on spatial contention, effectively converting "I/O contention" into "request merging opportunities."

	\item We proposed an automatic I/O tuning method to improve the I/O performance of spatio-temporal range retrievals over remote sensing data. The method extends an existing AI-powered I/O tuning framework \cite{Rajesh24TunIO} based on a surrogate-assisted genetic multi-armed bandits algorithm \cite{Preil25GMAB}.
\end{enumerate}

\par
The remainder of this paper is organized as follows:
Section~\ref{sec:RW} presents the related work.
Section~\ref{sec:DF} proposes the definition concerning the spatio-temporal range retrieval problem.
Section~\ref{sec:Index} proposes the indexing structre.
Section~\ref{sec:CC} proposes the hybrid concurrency control protocol.
Section~\ref{sec:Tuning} proposes the method of I/O stack tuning.
Section~\ref{sec:EXP}  presents the experiments and results.
Section~\ref{sec:Con} concludes this paper with a summary.

\section{Related Work}\label{sec:RW}
This section describes the most salient studies of I/O-efficient spatio-temporal retrieval processing, concurrency control and I/O Performance Tuning.

\subsection{I/O-Efficient Spatio-Temporal Retrieval Processing}
Efficient spatio-temporal query processing for remote sensing data has been extensively studied, with early efforts primarily focusing on metadata organization and index-level pruning in relational database systems. Traditional approaches typically extend tree-based spatial indexes, such as R-tree \cite{Strobl08PostGIS}, quadtree \cite{Tang12Quad-Tree}, and their spatio-temporal variants \cite{Simoes16PostGIST}, to organize image footprints together with temporal attributes, and are commonly implemented on relational backends (e.g., MySQL and PostgreSQL). These methods provide efficient range filtering for moderate-scale datasets, but their reliance on balanced tree structures often leads to high maintenance overhead and limited scalability as the volume of remote sensing metadata grows rapidly. With the continuous increase in data volume and ingestion rate, recent systems have gradually shifted toward grid-based spatio-temporal indexing schemes deployed on distributed NoSQL stores. By encoding spatial footprints into uniform spatial grids using GeoHash \cite{suwardi15geohash}, GeoSOT \cite{Yan21RS_manage1}, or space-filling curves \cite{hughes15geomesa}, \cite{liu24mstgi}, and combining them with temporal identifiers, these approaches enable lightweight index construction and better horizontal scalability on backends such as HBase and Elasticsearch. Such grid-based indexes can effectively reduce the candidate search space through coarse-grained pruning and are more suitable for large-scale, continuously growing remote sensing archives. 

\par
However, index pruning alone is insufficient to guarantee end-to-end retrieval efficiency for remote sensing workloads, where individual images are usually large and retrieval results require further pixel-level processing. To reduce the amount of raw I/O, Google Earth system \cite{gorelick17GEE} rely on tiling and multi-resolution pyramids that physically split images into small blocks. While more recent solutions leverage COG and window-based I/O to enable partial reads from monolithic image files. Frameworks such as OpenDataCube \cite{LEWIS17datacube} exploit these features to read only the image regions intersecting a retrieval window, thereby reducing unnecessary data transfer. Nevertheless, after candidate images are identified, most systems still perform fine-grained geospatial computations for each image, including coordinate transformations and precise pixel-window derivation, which may incur substantial overhead when many images are involved.

\subsection{Concurrency Control}
Concurrency control has long been studied to provide correctness and high throughput in multi-user database and storage systems, with two broad paradigms dominating the literature: deterministic scheduling \cite{Thomson12Calvin} and non-deterministic schemes \cite{Bernstein812PL}, \cite{KungR81OCC}. Hybrid approaches \cite{WangK16MVOCC}, \cite{Hong25HDCC} that adaptively combine these paradigms seek to exploit the low-conflict efficiency of deterministic execution while retaining the flexibility of optimistic techniques. More recent proposals such as OOCC target read-heavy, disaggregated settings by reducing validation and round-trips for read-only transactions, achieving low latency under OLTP-like workloads \cite{Wu25OOCC}. These CC families are primarily optimized for record- or key-level access patterns: their metrics and designs emphasize transaction latency, abort rates, and throughput under workloads with small, well-defined read/write sets.

\par
Overall, existing concurrency control mechanisms are largely designed around transaction-level correctness and throughput, assuming record- or key-based access patterns and treating storage I/O as a black box. Their optimization objectives rarely account for I/O amplification or fine-grained storage contention induced by concurrent range retrievals. Consequently, these approaches are ill-suited for data-intensive spatio-temporal workloads, where coordinating overlapping window reads and mitigating storage-level interference are critical to achieving scalable performance under multi-user access.

\subsection{I/O Performance Tuning in Storage Systems}
I/O performance tuning has been extensively studied in the context of HPC and data-intensive storage systems, where complex multi-layer I/O stacks expose a large number of tunable parameters. These parameters span different layers, including application-level I/O libraries, middleware, and underlying storage systems, and their interactions often lead to highly non-linear performance behaviors. As a result, manual tuning is time-consuming and error-prone, motivating a wide range of auto-tuning approaches.

\par
Several studies focus on improving the efficiency of the tuning pipeline itself by reformulating the search space or optimization objectives. Chen et al. \cite{Chen21Tuning1} proposed a meta multi-objectivization (MMO) model that introduces auxiliary performance objectives to mitigate premature convergence to local optima. While such techniques can improve optimization robustness, they are largely domain-agnostic and do not explicitly account for the characteristics of I/O-intensive workloads. Other works, such as the contextual bandit-based approach by Bez et al. \cite{Bez20TuningLayer}, optimize specific layers of the I/O stack (e.g., I/O forwarding) by exploiting observed access patterns. However, these methods are primarily designed for administrator-level tuning and target isolated components rather than end-to-end application I/O behavior.

\par
User-level I/O tuning has also been explored, most notably by H5Tuner \cite{Behzad13HDF5}, which employs genetic algorithms to optimize the configuration of the HDF5 I/O library. Although effective for single-layer tuning, H5Tuner does not consider cross-layer interactions and lacks mechanisms for reducing tuning cost, such as configuration prioritization or early stopping.

\par
More recently, TunIO \cite{Rajesh24TunIO} proposed an AI-powered I/O tuning framework that explicitly targets the growing configuration spaces of modern I/O stacks. TunIO integrates several advanced techniques, including I/O kernel extraction, smart selection of high-impact parameters, and reinforcement learning–driven early stopping, to balance tuning cost and performance gain across multiple layers. Despite its effectiveness, TunIO and related frameworks primarily focus on single-application or isolated workloads, assuming stable access patterns during tuning. Retrieval-level I/O behaviors, such as fine-grained window access induced by spatio-temporal range retrievals, as well as interference among concurrent users, are generally outside the scope of existing I/O tuning approaches.

\section{Definition}\label{sec:DF}
This section formalizes the spatio-temporal range retrieval problem and establishes the cost models for retrieval execution. We assume a distributed storage environment where large-scale remote sensing images are stored as objects or files.

\par
Definition~1 (Spatio-temporal Remote Sensing Image). A remote sensing image $R$ is defined as a tuple:
\vspace{-0.05in}
\begin{equation}
	\label{eqn:pre_rs}
	R=\langle id, \Omega, \mathcal{D}, t \rangle,
\end{equation}
where $id$ is the unique identifier; $\Omega = [0, W] \times [0, H]$ denotes the pixel coordinate space; $\mathcal{D}$ represents the raw pixel data; and $t$ is the temporal validity interval. The image is associated with a spatial footprint $MBR(R)$ in the global coordinate reference system.

\par
Definition 2 (Spatio-temporal Range Retrieval). Given a dataset $\mathbb{R}$, a retrieval $Q$ is defined by a spatio-temporal predicate $Q = \langle S, T \rangle$, where $S$ is the spatial bounding box and $T$ is the time interval. The retrieval result set $\mathcal{R}_Q$ is defined as:
\vspace{-0.05in}
\begin{equation}
	\label{eqn:pre_st_query}
	\mathcal{R}_Q=R\in \mathbb{R}\mid MBR\left( R \right) \cap S\ne \emptyset \land R.t\cap T\ne \emptyset .
\end{equation}

\par
For each $R \in \mathcal{R}_Q$, the system must return the pixel matrix corresponding to the intersection region $MBR(R) \cap S$.

\par
Definition 3 (Retrieval Execution Cost Model). The execution latency of a retrieval $Q$, denoted as $Cost(Q)$, is composed of two phases: metadata filtering and data extraction.
\begin{equation}
	\label{eqn:cost_total}
	Cost\left( Q \right) =C_{meta}\left( Q \right) +\sum_{R\in \mathcal{R}_Q}{\left( C_{geo}\left( R,Q \right) +C_{io}\left( R,Q \right) \right)}.
\end{equation}

\par
Here, $C_{meta}(Q)$ is the cost of identifying candidate images $\mathcal{R}_Q$ using indices. The data extraction cost for each image consists of two components: geospatial computation cost ($C_{geo}$) and I/O access cost ($C_{io}$). $C_{geo}$ is the CPU time required to calculate the pixel-to-geographic mapping, determine the exact read windows (offsets and lengths), and handle boundary clipping. In window-based partial reading schemes, this cost is non-negligible due to the complexity of coordinate transformations. $C_{io}$ is the latency to fetch the actual binary data from storage.

\par
Definition ~4 (Concurrent Spatio-temporal Retrievals). Let $\mathcal{Q} = \{Q_1, Q_2, \ldots, Q_N\}$ denote a set of spatio-temporal range retrievals issued concurrently by multiple users.
Each retrieval $Q_i$ independently specifies a spatio-temporal window $\langle S_i, T_i \rangle$ and may overlap with others in both spatial and temporal dimensions. Concurrent execution of $\mathcal{Q}$ may induce overlapping partial reads over the same images or image regions, leading to redundant I/O and storage-level contention if retrievals are processed independently.

\par
\textbf{Problem Statement (Latency-Optimized Concurrent Retrieval Processing).} Given a dataset $\mathbb{R}$ and a concurrent workload $\mathcal{Q}$, the objective is to minimize the total execution latency:
\vspace{-0.05in}
\begin{equation}
	\label{eqn_pre_objective}
	\min \sum_{Q_i\in \mathcal{Q}}{\left( C_{meta}\left( Q_i \right) +\sum_{R\in \mathcal{R}_{Q_i}}{\left( C_{geo}\left( R,Q_i \right) +C_{io}\left( R,Q_i \right) \right)} \right)},
\end{equation}
subject to:
\begin{enumerate}
	\item \textit{Correctness:} The returned data must strictly match the spatio-temporal predicate defined in Eq. (\ref{eqn:pre_st_query}).
	\item \textit{Isolation:} Concurrent reads must effectively share I/O bandwidth without causing starvation or excessive thrashing.
\end{enumerate}

\section{I/O-aware Indexing stucture}\label{sec:Index}
This section introduces the details of indexing structre for spatio-temporal range retrieval over remote sensing image data.

\begin{figure*}[htb]
	\centering
	\includegraphics[width=0.90\textwidth]{fig/index.png}
	\caption{Index schema design.}
	\label{fig:index}
\end{figure*}

\subsection{Index schema design}
\par
To enable I/O-efficient spatio-temporal query processing, we first decompose the global spatial domain into a uniform grid that serves as the basic unit for query pruning and data access coordination. Specifically, we adopt a fixed-resolution global tiling scheme based on the Web Mercator (or EPSG:4326) coordinate system, using zoom level 14 to partition the Earth’s surface into fine-grained grid cells (experiments show that the 14-level grid has the highest indexing efficiency which can be referred to Section~\ref{sec:Index_exp_3}). This resolution strikes a practical balance between spatial selectivity and index size: finer levels would significantly increase metadata volume and maintenance cost, while coarser levels would reduce pruning effectiveness and lead to unnecessary image I/O. At this scale, each grid cell typically corresponds to a spatial extent comparable to common query footprints and to the internal tiling granularity used by modern raster formats, making it well suited for partial data access.

\par
\textbf{Grid-to-Image Mapping (G2I).} 
Based on the grid decomposition, we construct a grid-centric inverted index to associate spatial units with covering images. In our system, each grid cell is assigned a unique \emph{GridKey}, encoded as a 64-bit Z-order value to preserve spatial locality and enable efficient range scans in key-value stores such as HBase. The \emph{G2I table} stores one row per grid cell, where the row key is the GridKey and the value maintains the list of image identifiers (ImageKeys) whose spatial footprints intersect the corresponding cell, as illustrated in Fig.~\ref{fig:index}(a).

\par
This grid-to-image mapping allows retrieval processing to begin with a lightweight enumeration of grid cells covered by a retrieval region, followed by direct lookups of candidate images via exact GridKey matches. By treating each grid cell as an independent spatial bucket, the G2I table provides efficient metadata-level pruning and avoids costly geometric intersection tests over large image footprints.

\par
However, the G2I table alone is insufficient for I/O-efficient retrieval execution. While it identifies which images are relevant to a given grid cell, it does not capture how the grid cell maps to pixel regions within each image. As a result, a grid-only representation cannot directly guide partial reads and would still require per-image geospatial computations at retrieval time. Therefore, the G2I table functions as a coarse spatial filter and must be complemented by an image-centric structure that materializes the correspondence between grid cells and pixel windows, enabling fine-grained, window-based I/O.

\par
\textbf{Image-to-Grid Mapping (I2G).}
To complement the grid-centric G2I table and enable fine-grained, I/O-efficient data access, we introduce an image-centric inverted structure, referred to as the Image-to-Grid mapping (I2G). In contrast to G2I, which organizes metadata by spatial grids, the I2G table stores all grid-level access information of a remote sensing image in a single row. Each image therefore occupies exactly one row in the table, significantly improving locality during retrieval execution.

\par
As illustrated in Fig.~\ref{fig:index}(b), the row key of the I2G table is the \emph{ImageKey}, i.e., the unique identifier of a remote sensing image. The row value is organized into three column families, each serving a distinct role in retrieval-time pruning and I/O coordination:

\par
\textit{Grid–Window Mapping.}
This column family records the list of grid cells intersected by the image together with their corresponding pixel windows in the image coordinate space. Each entry has the form
\[
\langle \textit{GridKey}, W_{ImageKey\_GridKey} \rangle,
\]
where \textit{GridKey} identifies a grid cell at the chosen global resolution, and $W_{ImageKey\_GridKey}$ denotes the minimal pixel bounding rectangle within the image that exactly covers that grid cell.

\par
These precomputed window offsets allow the retrieval executor to directly issue windowed reads on large raster files without loading entire images into memory or recomputing geographic-to-pixel transformations at retrieval time. As a result, grid cells become the smallest unit of coordinated I/O, enabling precise partial reads and effective elimination of redundant disk accesses.

\par
\textit{Temporal Metadata.}
To support spatio-temporal range retrievals, each image row includes a lightweight temporal column family that stores its acquisition time information, such as the sensing timestamp or time interval. This metadata enables efficient temporal filtering to be performed jointly with spatial grid matching, without consulting external catalogs or secondary indexes.

\par
\textit{Storage Pointer.}
This column family contains the information required to retrieve image data from the underlying storage system. It stores a stable file identifier, such as an object key in an object store (e.g., MinIO/S3) or an absolute path in a POSIX-compatible file system. By decoupling logical image identifiers from physical storage locations, this design supports flexible deployment across heterogeneous storage backends while allowing the retrieval engine to directly access image files once relevant pixel windows have been identified.

\par
The I2G table offers several advantages. First, all grid-level access information for the same image is colocated in a single row, avoiding repeated random lookups and improving cache locality during retrieval execution. Second, by materializing grid-to-window correspondences at ingestion time, the system completely avoids expensive per-retrieval geometric computations and directly translates spatial overlap into byte-range I/O requests. Third, the number of rows in the I2G table scales with the number of images rather than the number of grid cells, substantially reducing metadata volume and maintenance overhead.

\par
During data ingestion, the grid–window mappings are generated by projecting grid boundaries into the image coordinate system using the image’s georeferencing parameters. This process requires only lightweight affine or RPC transformations and does not involve storing explicit geometries or performing polygon clipping. As a result, the I2G structure enables efficient partial reads while keeping metadata compact and ingestion costs manageable.

\subsection{Retrieval-time Execution}

\begin{figure}
	\centering
	\includegraphics[width=2.2in]{fig/st-query.png}
	\caption{Retrieval-time Execution}
	\label{fig_ST_Query}
\end{figure}

The I/O-aware index enables efficient spatio-temporal range retrievals by directly translating retrieval predicates into windowed read plans, while avoiding both full-image loading and expensive geometric computations. Given a user-specified spatio-temporal retrieval
$q = \langle [x_{\min}, y_{\min}, x_{\max}, y_{\max}], [t_s, t_e] \rangle$,
the system resolves the retrieval through three consecutive stages: \emph{Grid Enumeration}, \emph{Candidate Image Retrieval with Temporal Pruning}, and \emph{Windowed Read Plan Generation}. As illustrated in Fig.~\ref{fig_ST_Query}, this execution pipeline bridges high-level retrieval predicates and low-level I/O operations in a fully deterministic manner.

\par
\textbf{Grid Enumeration.}
As shown in Step~1 and Step~2 of Fig.~\ref{fig_ST_Query}, the retrieval execution starts by rasterizing the spatial footprint of $q$ into the fixed global grid at zoom level 14. Instead of performing recursive space decomposition as in quadtrees or hierarchical spatial indexes, our system enumerates the minimal set of grid cells
$\{g_1, \ldots, g_k\}$
whose footprints intersect the retrieval bounding box.

\par
Each grid cell corresponds to a unique 64-bit \textit{GridKey}, which directly matches the primary key of the G2I table. This design has important implications: grid enumeration has constant depth and low computational cost and the resulting GridKeys can be directly used as lookup keys without any geometric refinement. Consequently, spatial key generation is reduced to simple arithmetic operations on integer grid coordinates.

\par
\textbf{Candidate Image Retrieval with Temporal Pruning.}
Given the enumerated grid set $\{g_1, \ldots, g_k\}$, the retrieval processor performs a batched multi-get on the G2I table. Each G2I row corresponds to a single grid cell and stores the identifiers of all images whose spatial footprints intersect that cell. For each grid $g_i$, the lookup returns:
\[
G2I[g_i] = \{ imgKey_1, \ldots, imgKey_m \}.
\]

\par
All retrieved image identifiers are unioned to form the spatial candidate set
$C_s = \bigcup_{i=1}^{k} G2I[g_i]$.
This step eliminates the need for per-image polygon intersection tests that are commonly required in spatial databases and data cube systems.

\par
To incorporate the temporal constraint $[t_s, t_e]$, each candidate image in $C_s$ is further filtered using the temporal column family of the Image-to-Grid (I2G) table. Images whose acquisition time does not intersect the retrieval interval are discarded early, yielding the final candidate set $C$. This lightweight temporal pruning is performed without accessing any image data and introduces negligible overhead.

\par
\textbf{Windowed Read Plan Generation.}
As shown in Step~3 of Fig.~\ref{fig_ST_Query}, the final stage translates the candidate image set into a concrete I/O plan. For each image $I \in C$, the retrieval executor issues a selective range-get on the I2G table to retrieve only the grid–window mappings relevant to the retrieval grids:

\begin{equation}
	\label{eqn_pre_spatial_query}
I2G\left[ I,\{g_1,...,g_k\} \right] =\left\{ W_{I\_g_i}\mid g_i\cap I\ne \emptyset \right\} .
\end{equation}

\par
Each $W_{I\_g_i}$ specifies the exact pixel window in the original raster file that corresponds to grid cell $g_i$. Since these window offsets are precomputed during ingestion, retrieval execution requires only key-based lookups and arithmetic filtering. No geographic coordinate transformation, polygon clipping, or raster–vector intersection is performed at retrieval time.

\par
The resulting collection of pixel windows constitutes a \emph{windowed read plan}, which can be directly translated into byte-range I/O requests against the storage backend. This approach avoids loading entire scenes and ensures that the total I/O volume is proportional to the retrieved spatial extent rather than the image size.

\subsection{Why I/O-aware}
The key reason our indexing design is I/O-aware lies in the fact that the index lookup results are not merely candidate identifiers, but constitute a concrete I/O access plan. Unlike traditional spatial indexes, where retrieval processing yields a set of objects that must still be fetched through opaque storage accesses, our Grid-to-Image and Image-to-Grid lookups deterministically produce the exact pixel windows to be read from disk. As a result, the logical retrieval plan and the physical I/O plan are tightly coupled: resolving a spatio-temporal predicate directly specifies which byte ranges should be accessed and which can be skipped.

\par
This tight coupling fundamentally changes the optimization objective. Instead of minimizing index traversal cost or result-set size, the system explicitly minimizes data movement by ensuring that disk I/O is proportional to the retrieval's spatio-temporal footprint. Consequently, the index serves as an execution-aware abstraction that bridges retrieval semantics and storage behavior, enabling predictable, bounded I/O under both single-retrieval and concurrent workloads.

\par
\textbf{Theoretical Cost Analysis.}
To rigorously quantify the performance advantage, we revisit the retrieval cost model defined in Eq. (\ref{eqn:cost_total}):
\begin{equation*}
	Cost(Q) = C_{meta}(Q) + \sum_{R \in \mathcal{R}_Q} \left( C_{geo}(R, Q) + C_{io}(R, Q) \right).
\end{equation*}

\par
In traditional full-image reading systems, although the geospatial computation cost is negligible ($C_{geo} = 0$) as no clipping is performed, the I/O cost $C_{io}$ is determined by the full file size. Consequently, the total latency is entirely dominated by massive I/O overhead, rendering $C_{meta}$ (typically milliseconds) irrelevant.

\par
Existing window-based I/O systems (e.g., ODC or COG-aware libraries) successfully reduce the I/O cost to the size of the requested window. However, this reduction comes at the expense of a significant surge in $C_{geo}$. For every candidate image, the system must perform on-the-fly coordinate transformations and polygon clipping to calculate read offsets. When a retrieval involves thousands of images, the accumulated CPU time ($\sum C_{geo}$) becomes a new bottleneck (e.g., hundreds of milliseconds to seconds), often negating the benefits of I/O reduction (detailed quantitative comparisons are provided in Sec.~\ref{sec:Index_exp_2}).

\par
In contrast, our I/O-aware indexing approach fundamentally alters this trade-off. By materializing the grid-to-pixel mapping in the I2G table, we effectively shift the computational burden from retrieval time to ingestion time. Although the two-phase lookup (G2I and I2G) introduces a slight overhead compared to simple tree traversals, $C_{meta}$ remains in the order of milliseconds—orders of magnitude smaller than disk I/O latency. Since the precise pixel windows are pre-calculated and stored, the runtime geospatial computation is effectively eliminated, i.e., $C_{geo} = 0$. The system retains the minimal I/O cost characteristic of window-based approaches, fetching only relevant byte ranges. Therefore, our design achieves the theoretical minimum for both computation and I/O components within the retrieval execution critical path.

\section{Hybrid Concurrency-Aware I/O Coordination}\label{sec:CC}
In this section, we propose a hybrid coordination mechanism that adaptively employs either lock-free non-deterministic execution or deterministic coordinated scheduling based on the real-time contention level of spatio-temporal workloads.

\begin{figure}
	\centering
	\includegraphics[width=3.0in]{fig/cc.png}
	\caption{Hybrid Concurrency-Aware I/O Coordination.}
	\label{fig:cc}
\end{figure}


\subsection{Retrieval Admission and I/O Plan Generation}
When a spatio-temporal range retrieval $Q$ arrives, the system first performs index-driven plan generation. The retrieval footprint is rasterized into the global grid to enumerate the intersecting grid cells. The G2I table is then consulted to retrieve the set of candidate images, followed by selective lookups in the I2G table to obtain the corresponding pixel windows.

\par
As a result, each retrieval is translated into an explicit \emph{I/O access plan} consisting of image–window pairs:
\vspace{-0.05in}
\begin{equation}
	\label{eq:io_plan}
	Plan\left( Q \right) =\left\{ \left( img_1,w_1 \right) ,\left( img_1,w_2 \right) ,\left( img_3,w_5 \right) ,... \right\},
\end{equation}
where each window $w$ denotes a concrete pixel range to be accessed via byte-range I/O. Upon admission, the system assigns each retrieval a unique \emph{RetrievalID} and records its arrival timestamp.

\subsection{Contention Estimation and Path Selection}
To minimize the overhead of global ordering in low-contention scenarios, the system introduces a Contention-Aware Switch. Upon the arrival of a retrieval batch $\mathcal{Q} = \{Q_1, Q_2, ..., Q_n\}$, the system first estimates the Spatial Overlap Ratio ($\sigma$) among their generated I/O plans.

\par
Let $A(Plan(Q_i))$ be the aggregate spatial area of all pixel windows in the I/O plan of retrieval $Q_i$. The overlap ratio $\sigma$ for a batch is defined as:
\vspace{-0.05in}
\begin{equation}
	\vspace{-0.05in}
	\label{eqn_tuning_table}
	\sigma = 1 - \frac{\text{A}(\bigcup_{i=1}^n Plan(Q_i))}{\sum_{i=1}^n \text{A}(Plan(Q_i))},
\end{equation}
where $\sigma \in [0, 1]$. A high $\sigma$ indicates that multiple retrievals are competing for the same image regions, leading to high I/O amplification if executed independently.

\par
The system utilizes a rule-based assignment mechanism similar to HDCC \cite{Hong25HDCC} to select the execution path:
\begin{enumerate}
	\item Path A (Non-deterministic/OCC-style): If $\sigma < \tau$ (where $\tau$ is a configurable threshold), retrievals proceed directly to execution to maximize concurrency.
	\item Path B (Deterministic/Calvin-style): If $\sigma \ge \tau$, retrievals are routed to the Global I/O Plan Queue for coordinated merging.
\end{enumerate}

\subsection{Deterministic Coordinated and Non-deterministic Execution}
When $\sigma \ge \tau$, the system switches to a deterministic path to mitigate storage-level contention and I/O amplification, as shown in Fig.~\ref{fig:cc}. To coordinate concurrent access to shared storage resources, we introduce a \emph{Global I/O Plan Queue} that enforces a deterministic ordering over all admitted I/O plans. Each windowed access $(img, w)$ derived from incoming retrievals is inserted into this queue according to a predefined policy, such as FIFO based on arrival time or lexicographic ordering by $(timestamp, RetrievalID)$.

\par
This design is inspired by deterministic scheduling in systems such as Calvin, but differs fundamentally in its scope: the ordering is imposed on \emph{window-level I/O operations} rather than on transactions. As a result, accesses to the same image region across different retrievals follow a globally consistent order, preventing uncontrolled interleaving of reads and reducing contention at the storage layer. The deterministic ordering also provides a stable foundation for subsequent I/O coordination and sharing.

\par
The core of our approach lies in coordinating concurrent windowed reads at the image level. Windows originating from different retrievals may overlap spatially, be adjacent, or even be identical. Executing these requests independently would lead to redundant reads and excessive I/O amplification.

\par
To address this, the system performs three coordination steps within each scheduling interval. Stage 1: Global De-duplication. The system first extracts all windowed access pairs $(img, w)$ from the admitted retrievals and inserts them into a global window set ($\mathcal{W}_{total}$). If multiple retrievals $Q_1, Q_2, ..., Q_n$ request the same pixel window $w$ from image $img$, the system retains only one unique entry in $\mathcal{W}_{total}$. This stage ensures that any specific byte range is identified as a single logical requirement, effectively preventing the redundant retrieval of overlapping spatial grids. Stage 2: Range Merging. After de-duplication, the system analyzes the physical disk offsets of all unique windows in $\mathcal{W}_{total}$. Following the principle of improving access locality, windows that are physically contiguous or separated by a gap smaller than a threshold $\theta$ are merged into a single read. Stage 3: Dispatching. This stage maintains a mapping between the physical byte-offsets in the buffer and the logical window requirements of each active retrieval. Each retrieval $Q_i$ receives only the exact pixel windows $w \in Plan(Q_i)$ it originally requested. This is achieved via zero-copy memory mapping where possible, or by slicing the shared system buffer into local thread-wise structures. This ensures that while the physical I/O is shared to reduce amplification, the logical execution of each retrieval remains independent and free from irrelevant data interference.

\par
For example, when $Q_1$ requests grids $\{1, 2\}$ and $Q_2$ requests grids $\{2, 3\}$, Stage 1 identifies the unique requirement set $\{1, 2, 3\}$. Stage 2 then merges these into a single contiguous I/O operation covering the entire range $[1, 3]$. In Stage 3, the dispatcher identifies memory offsets corresponding to grids $1$ and $2$ within the buffer and maps these slices to the private cache of $Q_1$. For $Q_2$, similarly, the dispatcher extracts and delivers slices for grids $2$ and $3$ to $Q_2$.

\par
Through these mechanisms, concurrent retrievals collaboratively share I/O, and the execution unit becomes a coordinated window read rather than an isolated request. Importantly, this coordination operates entirely at the I/O planning level and does not require any form of locking or transaction-level synchronization.

\par
When contention remains below the threshold ($\sigma < \tau$), the system prioritizes low latency over merging efficiency by adopting an optimistic dispatch mechanism, as shown in Fig.~\ref{fig:cc}. Instead of undergoing heavy-weight sorting, I/O plans are immediately offloaded to the execution engine. By utilizing thread-local sublists, each thread independently handles its byte-range requests.

\subsection{Optimistic Read Execution and Completion}
Once a coordinated window read is scheduled, the system issues the corresponding byte-range I/O request immediately. Read execution is fully optimistic: there is no validation phase, no abort, and no rollback. This is enabled by the immutability of remote-sensing imagery and by the deterministic ordering of I/O plans, which together ensure consistent and repeatable read behavior.

\par
A retrieval is considered complete when all windows in its I/O plan have been served and the associated local processing (e.g., reprojection or mosaicking) has finished. By eliminating validation overhead and allowing read execution to proceed independently once scheduled, the system achieves low-latency retrieval completion while maintaining predictable I/O behavior under concurrency.

\par
Overall, this concurrency-aware I/O coordination mechanism reinterprets concurrency control as a problem of \emph{coordinating shared I/O flows}. By operating at the granularity of windowed reads and leveraging deterministic ordering and optimistic execution, it effectively reduces redundant I/O and improves scalability for multi-user spatio-temporal retrieval workloads.

\section{I/O Stack Tuning}\label{sec:Tuning}
We first describe an I/O stack tuning problem and then the surrogate-assisted GMAB algorithm is proposed to solve the problem.

\subsection{Formulation of Online I/O Tuning}
% TODO 这一节的小标题：Tuning Model？
We study a concurrency spatio-temporal retrieval engine that processes many range retrievals at the same time. The system works on large remote sensing images stored in shared storage. Different from traditional HPC jobs or single-application I/O workloads, the system does not run one fixed job. Instead, it keeps receiving a stream of user retrievals. Each retrieval is turned into many small I/O operations that often touch overlapping regions in large raster files.

\par
Let $\mathcal{Q} = \{Q_1, Q_2, \ldots, Q_N\}$ denote a stream of spatio-temporal range retrievals submitted by multiple users. Each retrieval $q$ is decomposed by the I/O-aware index into a set of grid-aligned spatial windows based on a predefined global grid system. These windows are further mapped to sub-regions of one or more large remote sensing images. In this way, every retrieval produces an I/O execution context $c= \langle W,M,S \rangle$, where $W$ describes the set of image windows to be accessed, including their sizes, spatial overlap, and distribution across images. $M$ captures window-level coordination opportunities, such as window merging, deduplication, or shared reads across concurrent retrievals. $S$ represents system-level execution decisions, including batching strategies, I/O scheduling order, and concurrency limits. Importantly, the I/O behavior of the system is not determined solely by static application code, but emerges dynamically from the interaction between retrieval workloads, execution plans, and system policies.

\par
The goal of I/O tuning in this system is to optimize the performance of retrieval-induced I/O execution under continuous, concurrent workloads. We focus on minimizing the observed I/O cost per retrieval, which may be measured by metrics such as average retrieval latency, effective I/O throughput, or amortized disk read time. Let $\theta \in \varTheta$ denote a tuning configuration, where each configuration specifies a combination of system-level I/O control parameters, including window batching size, merge thresholds, queue depth, concurrency limits, and selected storage-level parameters exposed to the engine. Unlike traditional I/O tuning frameworks, the decision variables $\theta$ are applied at the retrieval execution level, rather than at application startup or compilation time.

\par
For a given tuning configuration $\theta $ and execution context $c$, the observed I/O performance is inherently stochastic due to: interference among concurrent retrievals; shared storage contention; variability in window overlap and access locality. We model the observed performance outcome as a random variable:
\vspace{-0.05in}
\begin{equation}
	\vspace{-0.05in}
	\label{eqn_tuning_table}
	Y\left( \theta ,c \right) =f\left( \theta ,c \right) +\epsilon ,
\end{equation}
where $f\left( \cdot \right) $ is an unknown performance function and $\epsilon$ captures stochastic noise. Moreover, as retrieval workloads evolve over time, the distribution of execution contexts $c$ may change, making the tuning problem non-stationary.

\par
Given a stream of retrievals $\mathcal{Q}$ and the resulting sequence of execution contexts $\left\{ c_t \right\} $, the problem is to design an online tuning strategy that adaptively selects tuning configurations $\theta _t$ for retrieval execution, so as to minimize the long-term expected I/O cost:
\vspace{-0.05in}
\begin{equation}
	\vspace{-0.05in}
	\label{eqn_tuning_table}
	\min_{\left\{ \theta _t \right\}}\mathbb{E}\left[ \sum_{t=1}^T{Y}\left( \theta _t,c_t \right) \right] ,
\end{equation}
subject to practical constraints on tuning overhead and system stability.

% TODO：加个限制条件的表格

\subsection{Surrogate-Assisted GMAB for Online I/O Tuning}

\begin{algorithm}[!htb]
	\caption{Surrogate-Assisted Genetic Multi-Armed Bandit (SA-GMAB)}
	\label{alg:sa-gmab}
	\SetKwInOut{Input}{Input}\SetKwInOut{Output}{Output}
	
	\Input{Configuration space $\Theta$, Initial population size $P$, Exploration parameter $\alpha$, Surrogate update interval $\Delta$}
	\Output{Online selection of I/O coordination configuration $\theta_t$}
	
	\BlankLine
	\tcp{Initialization}
	Initialize memory table $\mathcal{M} = \emptyset$\;
	Initialize surrogate model $\tilde{f}$ with empty training data\;
	Generate an initial population $\mathcal{P}_0 \subset \Theta$\;
	Set tuning step counter $t \leftarrow 0$\;
	
	\BlankLine
	\tcp{Online Tuning Loop}
	\While{arrival of retrieval $q_t$ with execution context $c_t$}{

		\tcp{Candidate Generation}
		Apply genetic operators (selection, crossover, mutation) on current population to generate candidate set $\mathcal{C}_t \subset \Theta$\;

		\tcp{Surrogate-based Pre-evaluation}
		\ForEach{$\theta \in \mathcal{C}_t$}{
			$\hat{r}_\theta \leftarrow \tilde{f}(\theta, c_t)$\;
		}

		\tcp{Candidate Filtering}
		Select top-$K$ configurations $\mathcal{C}'_t \subset \mathcal{C}_t$ based on $\hat{r}_\theta$ or uncertainty\;

		\tcp{Bandit-based Selection}
		\ForEach{$\theta \in \mathcal{C}'_t$}{
			$\text{Score}(\theta) = \hat{\mu}_\theta + \alpha \sqrt{\frac{\log(t+1)}{n_\theta + 1}}$\;
		}
		Select configuration: $\theta_t = \arg\max_{\theta \in \mathcal{C}'_t} \text{Score}(\theta)$\;

		\tcp{Retrieval Execution \& Reward Observation}
		Execute retrieval $q_t$ using I/O coordination policy $\theta_t$\;
		Measure performance outcome and compute reward $r_t$\;
		
		\tcp{State Update}
		Update memory entry for $\theta_t$: $n_{\theta_t} \leftarrow n_{\theta_t} + 1$\;
		$\hat{\mu}_{\theta_t} \leftarrow \hat{\mu}_{\theta_t} + \frac{r_t - \hat{\mu}_{\theta_t}}{n_{\theta_t}}$\;
		Update population $\mathcal{P}$ by inserting $\theta_t$ (optionally evicting low-performing ones)\;
		
		\If{$t \bmod \Delta = 0$}{
			Retrain surrogate model $\tilde{f}$ using observations in $\mathcal{M}$\;
		}
		$t \leftarrow t + 1$\;
	}
\end{algorithm}

\par
To address the online I/O tuning problem, we use a Surrogate-Assisted Genetic Multi-Armed Bandit (SA-GMAB) framework. It combines genetic search, bandit-style exploration, and a simple performance model. The goal is to handle workloads where behavior changes over time, where results are random, and where retrievals may affect each other. The main steps of this framework are shown in Algorithm~\ref{alg:sa-gmab}.

\par
We first initialize the memory table and the surrogate model, and then generate an initial population of configurations (lines 1-–4). In our system, each arm is an I/O tuning configuration $\theta \in \varTheta$. A configuration is a group of I/O control parameters, such as merge thresholds, batch size, queue depth, and limits on parallel requests. The space of possible configurations is large and discrete. It is not possible to list or test all of them. So we do not fix all arms in advance. Instead, new configurations are created dynamically by genetic operators during candidate generation (line 6). Each configuration acts as a policy that tells the system how to run I/O plans during a scheduling period.

\par
When a retrieval $q_t$ with context $c_t$ arrives, the framework enters the online tuning loop (line 5). For this retrieval, a set of candidate configurations is created through selection, crossover, and mutation (line 6). For every candidate configuration, the surrogate model predicts its reward under the current context (lines 7–-9). These predicted rewards are then used to filter and keep only the top promising configurations, or those with high uncertainty (line 10).

\par
When a configuration $\theta$ is used to process a retrieval $q_t$ with context $c_t$, the system observes a random performance result $Y_t=Y\left( \theta ,c_t \right)$. We define the reward as a simple transformation of I/O cost so that a higher reward means better performance. A common form is the negative latency of the retrieval, or the negative I/O time per unit work. Because other retrievals run at the same time, the reward may change even for the same configuration. Thus, many samples are needed to estimate the expected reward.

\par
For the remaining candidates, the framework computes a bandit score using both historical average reward and exploration term (lines 11--13), and then selects the configuration with the highest score (line 14). In this way, the method prefers configurations that have performed well before, but it also tries configurations that have been used only a few times.

\par
The selected configuration is then applied to execute the retrieval (line 15). After execution, the system observes the performance result and converts it into a reward value (line 16). For each configuration $\theta$, the system keeps a memory entry that records how many times it has been used and its average reward. These values are updated after each execution (lines 17-–18). This keeps all historical observations instead of discarding older ones, so estimates become more accurate over time, and poor configurations are not repeatedly tried.

The selected configuration may also be added into the population, while poor ones may be removed (line 19). The surrogate model is retrained periodically using data stored in memory (lines 20-–22), so that its predictions follow the most recent workload. The tuning step counter is then increased (line 23), and the framework continues with the next retrieval (line 24).

\section{Performance Evaluation}\label{sec:EXP}
First, we introduce the experimental setup, covering the dataset characteristics, retrieval workload generation, and the distributed cluster environment. Then, we present the experimental results evaluating the proposed I/O-aware indexing structure, the hybrid concurrency-aware I/O coordination mechanism, and the online I/O tuning framework, respectively.

\subsection{Experimental Setup}

\subsubsection{Dataset}
We employed a large-scale real-world remote sensing dataset derived from the Sentinel-2 mission \footnote[1]{https://sentinel.esa.int/web/sentinel/missions/sentinel-2}, specifically the Level-2A atmospherically corrected products. The dataset comprises multi-spectral images covering global land surfaces from 2019 to 2023. To simulate a cloud-native storage environment, all images are converted into Cloud-Optimized GeoTIFF (COG) format and stored in a distributed object store. The statistics of the dataset are summarized in Table~\ref{table_dataset}.

% table 1: Dataset
\begin{table}
	\renewcommand{\arraystretch}{1.3}
	\caption{Dataset Statistics}
	\label{table_dataset}
	\vspace{-0.13in}
	\centering
	\begin{tabular}{|m{1.5cm}|m{1.5cm}|m{1.5cm}|m{2.0cm}|}
		\hline
		\makecell[c]{\textbf{Dataset}} &\bfseries Resolution & \bfseries Time Span & \bfseries Total Volume \\
		\hline
		\hline
		\makecell[c]{Sentinel-2}&\makecell[c]{10m - 60m} & \makecell[c]{2019--2023} & \makecell[c]{15.4 TB}\\
		\hline
		\makecell[c]{Landsat-8}&\makecell[c]{30m} & \makecell[c]{2020--2022} & \makecell[c]{4.2 TB}\\
		\hline
	\end{tabular}
\end{table}

\subsubsection{Retrieval Workload}
\par
To evaluate the system performance under diverse scenarios, we developed a synthetic workload generator that simulates concurrent spatio-temporal range retrievals. The retrieval parameters are configured as follows:
\begin{itemize}
	\item \textbf{Spatial Extent:} The spatial range of retrievals follows a log-uniform distribution, ranging from small tile-level access ($0.001\%$ of the scene) to large-scale regional mosaics ($1\%$ to $100\%$ of the scene).
	\item \textbf{Temporal Range:} Each retrieval specifies a time interval randomly chosen between 1 day and 1 month.
	\item \textbf{Concurrency \& Contention:} The number of concurrent clients $N$ varies from 1 to 64. To test the coordination mechanism, we control the Spatial Overlap Ratio $\sigma \in [0, 0.9]$ to simulate workloads ranging from disjoint access to highly concentrated hotspots.
\end{itemize}

\subsubsection{Experimental Environment}
\label{sec_exp_env}
All experiments are conducted on a cluster with 9 homogenous nodes (1 master node and 8 worker nodes). The cluster is connected via a 10Gbps high-speed Ethernet to ensure that network bandwidth is not the primary bottleneck compared to storage I/O. Table \ref{table_config} lists the detailed hardware and software configurations. The I/O-aware index (G2I/I2G) is deployed on HBase, while the raw image data is served by a MinIO distributed object storage cluster.

% table 2: Environment
\begin{table}
	\renewcommand{\arraystretch}{1.3}
	\caption{Cluster Configurations}
	\label{table_config}
	\vspace{-0.13in}
	\centering
	\begin{tabular}{|m{2.2cm}|m{5.5cm}|}
		\hline
		\multicolumn{2}{|c|}{\textbf{Hardware Configuration (Per Node)}} \\
		\hline
		\makecell[c]{CPU} & Dual Intel Xeon Gold 6248 (20 cores, 2.50GHz)\\
		\hline
		\makecell[c]{Memory} & \makecell[c]{128GB DDR4 ECC}\\
		\hline
		\makecell[c]{Storage} & \makecell[c]{4TB NVMe SSD (Data) + 500GB SSD (OS)}\\
		\hline
		\makecell[c]{Network} & \makecell[c]{10 Gigabit Ethernet (10GbE)}\\
		\hline\hline
		
		\multicolumn{2}{|c|}{\textbf{Software Stack}} \\
		\hline
		\makecell[c]{OS} & \makecell[c]{Ubuntu 20.04 LTS} \\
		\hline
		\makecell[c]{Storage} & \makecell[c]{Hadoop 3.3.1, HBase 2.4.5, Lustre}\\
		\hline
		\makecell[c]{Framework} & \makecell[c]{OpenJDK 11, Spark 3.2.1}\\
		\hline
	\end{tabular}
\end{table}


\subsection{Evaluating the data indexing structure}
In the following experiments, we measured the indexing on a single node in the cluster, bacause each nodes needs to the indexing for spatial retrieval. We investigated of retrieval performance of the indexing for remote sensing images.

\subsubsection{I/O Selectivity Analysis}\label{sec:Index_exp_1}

\begin{figure}[tb]
	\centering
	\subfigure[I/O Selectivity]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.98\textwidth]{exp/index_exp1_1.pdf}
		\end{minipage}
	}
	\label{fig:index_exp1_1}
	\subfigure[[Unnecessary data fraction]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.905\textwidth]{exp/index_exp1_2.pdf}
		\end{minipage}
	}
	\label{fig:index_exp1_2}
	\caption{The computing cost of spatial-keyword skylines}
	\label{fig:index_exp1}
\end{figure}

\par
First, we evaluated the effectiveness of data reduction by measuring the I/O selectivity, defined as the ratio of the retrieved data volume to the total file size. Fig.~\ref{fig:index_exp1} compares our method against Baseline 1 (full-file retrieval) and Baseline 2 (exact window-based reading, e.g., OpenDataCube). As illustrated in Fig.~\ref{fig:index_exp1}(a), Baseline 1 exhibits a linear increase in I/O volume proportional to the file size, resulting in poor selectivity regardless of the retrieval footprint. In contrast, both Baseline 2 and Ours significantly reduce I/O traffic by enabling partial reads. It is worth noting that our method incurs slightly higher I/O volume (approximately $16\%-23\%$ of the file size for small retrievals) compared to the theoretically optimal Baseline 2 ($10\%-20\%$). This marginal data redundancy is attributed to the grid alignment effect: our index retrieves pixel blocks based on fixed grid boundaries, whereas Baseline 2 performs precise geospatial clipping. Fig.~\ref{fig:index_exp1}(b) further presents the distribution of unnecessary data fraction. While our method introduces a small amount of "over-reading" due to grid padding, it successfully avoids the massive data waste observed in Baseline 1. As we will demonstrate in the next section, this slight compromise in I/O precision is a strategic trade-off that eliminates expensive runtime computations.

\subsubsection{End-to-End Retrieval Latency}\label{sec:Index_exp_2}

\begin{figure}[tb]
	\centering
	\subfigure[The retrieval latency]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.98\textwidth]{exp/index_exp2_1.pdf}
		\end{minipage}
	}
	\label{fig:index_exp2_1}
	\subfigure[Latency Breakdownn]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.905\textwidth]{exp/index_exp2_2.pdf}
		\end{minipage}
	}
	\label{fig:index_exp2_2}
	\caption{End-to-End Retrieval Latency}
	\label{fig:index_exp2}
\end{figure}

\par
We next measured the end-to-end retrieval latency to verify whether the I/O reduction translates into time efficiency. Fig.~\ref{fig:index_exp2}(a) reports the mean and 95th percentile (P95) latency across varying retrieval footprint ratios (log scale).The results reveal three distinct performance behaviors:Baseline 1 shows a high and flat latency curve ($\approx 4500$ ms), dominated by the cost of transferring entire images.Baseline 2, despite its optimal I/O selectivity, exhibits a significant latency floor ($\approx 380$ ms for small retrievals). This overhead stems from the on-the-fly geospatial computations required to calculate precise read windows.Ours achieves the lowest latency, ranging from 34 ms to 59 ms for typical tile-level retrievals ($10^{-4}$ coverage).Crucially, for small-to-medium retrievals, our method outperforms Baseline 2 by an order of magnitude. The gap between the two curves highlights the advantage of our deterministic indexing approach: by pre-materializing grid-to-window mappings, we eliminate runtime coordinate transformations. Although our I/O volume is slightly larger (as shown in Sec.~\ref{sec:Index_exp_1}), the time saved by avoiding computational overhead far outweighs the cost of transferring a few extra kilobytes of padding data.

To empirically validate the cost model proposed in Eq.~\ref{eqn:cost_total}, we further decomposed the retrieval latency into three components: metadata lookup ($C_{meta}$), geospatial computation ($C_{geo}$), and I/O access ($C_{io}$). Fig.~\ref{fig:index_exp2}(b) presents the time consumption breakdown for a representative medium-scale retrieval (involving approx. 50 image tiles). As expected, the latency of Baseline 1 is entirely dominated by $C_{io}$ ($>99\%$), rendering $C_{meta}$ and $C_{geo}$ negligible. The massive data transfer masks all other overheads. While $C_{io}$ of Baseline 2 is successfully reduced to the window size, a new bottleneck emerges in $C_{geo}$. The runtime coordinate transformations and polygon clipping consume nearly $70\%$ of the total execution time (approx. 350 ms). This observation confirms our theoretical analysis that window-based I/O shifts the bottleneck from storage to CPU. The proposed method exhibits a balanced profile. Although $C_{meta}$ increases slightly (approx. 60 ms) due to the two-phase index lookup (G2I + I2G), this cost is well-amortized. Crucially, $C_{geo}$ is effectively eliminated ($<1$ ms) thanks to the pre-computed grid-window mappings. Consequently, our approach achieves a total latency of approx. 150 ms, providing a $3\times$ speedup over Baseline 2 by removing the computational bottleneck without regressing on I/O performance.

\subsubsection{Ablation Study}\label{sec:Index_exp_3}
\begin{figure}[tb]
	\centering
	\subfigure[I/O Reduction Analysis]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/index_exp3_1.pdf}
		\end{minipage}
	}
	\label{fig:index_exp3_1}
	\subfigure[Latency Breakdown]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/index_exp3_2.pdf}
		\end{minipage}
	}
	\label{fig:index_exp3_2}
	\caption{Ablation Analysis}
	\label{fig:index_exp3}
\end{figure}

\begin{figure}
	\centering
	\includegraphics[width=1.8in]{exp/index_exp3_3.pdf}
	\caption{Impact of grid resolution on query latency}
	\label{fig:index_exp3_3}
\end{figure}

\par
To quantify the individual contributions of the G2I (coarse filtering) and I2G (fine-grained access) components, we decomposed the system into four variants. Fig.~\ref{fig:index_exp3} breaks down the performance in terms of I/O volume and latency components (Metadata Lookup vs. Storage I/O). Fig.~\ref{fig:index_exp3}(a) confirms that removing either component leads to suboptimal I/O behavior. The "No Index" and "G2I Only" variants result in 100\% I/O volume (full-file reads), as they lack the window information required for partial access. Conversely, "I2G Only" and "Full" (Ours) achieve minimal I/O volume ($\approx 10\%$).However, I/O volume alone does not tell the full story. Fig.~\ref{fig:index_exp3}(b) reveals the latency breakdown:No Index: Suffers from both high metadata scanning cost (full table scan) and high storage I/O cost.G2I Only: Efficiently reduces metadata lookup time ($\approx 50$ ms) but fails to reduce storage I/O ($\approx 8000$ ms).I2G Only: Although it minimizes storage I/O ($\approx 100$ ms), it incurs prohibitive metadata lookup overhead ($\approx 1500$ ms) because the system must scan the entire I2G table to identify relevant images without spatial pruning.G2I + I2G (Ours): Achieves the "best of both worlds," maintaining low metadata latency ($\approx 60$ ms) via G2I pruning while ensuring minimal storage I/O ($\approx 100$ ms) via I2G windowing.

Moreover, the choice of grid resolution (Zoom Level) is a critical parameter that dictates the trade-off between metadata management overhead ($C_{meta}$) and I/O precision ($C_{io}$). To justify our selection of Zoom Level 14, we conducted a sensitivity analysis by varying the grid resolution from Level 12 to Level 16 under a fixed workload of medium-scale range queries. Fig.~\ref{fig:index_exp3_3} illustrates the latency breakdown across different resolutions. The results reveal a clear convex trajectory in total query latency, driven by two opposing forces. For coarse-grained grids (Level $\le 13$), while metadata lookup is extremely fast ($C_{meta} < 30$ ms) due to the small number of grid keys, the I/O cost ($C_{io}$) is prohibitively high. Large grid cells force the system to read significant amounts of irrelevant pixel data outside the actual query boundary (high read amplification), serving as the dominant bottleneck. Conversely, finer grids (Level 15, 16) maximize I/O precision, reducing $C_{io}$ to its theoretical minimum. However, this comes at the cost of an explosion in metadata volume. A single query may intersect thousands of Level 16 micro-grids, causing $C_{meta}$ to surge drastically ($>280$ ms) due to the overhead of scanning and processing massive key lists in the G2I/I2G tables. As evidenced by the trough in the total latency curve, Zoom Level 14 represents the optimal "sweet spot" for our dataset. At this resolution, the grid cell size (approx. $20 \times 20$ meters at the equator) roughly matches the typical internal tile size of remote sensing images, keeping I/O waste low while maintaining a manageable number of index keys. Consequently, our system adopts Level 14 as the default global configuration.

\subsubsection{Index Construction and Storage Overhead}
\begin{figure}[tb]
	\centering
	\subfigure[Ingestion Scalability]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/index_exp4_1.pdf}
		\end{minipage}
	}
	\label{fig:index_exp4_1}
	\subfigure[Storage Consumption Overhead]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/index_exp4_2.pdf}
		\end{minipage}
	}
	\label{fig:index_exp4_2}
	\caption{Index Construction and Storage Overhead}
	\label{fig:index_exp4}
\end{figure}

\par
Finally, we evaluated the scalability and cost of maintaining the index. Fig.~\ref{fig:index_exp4} compares our method against PostGIS (R-tree) and GeoMesa (Z-order) during the ingestion of $10^6$ images.Fig.~\ref{fig:index_exp4}(a) illustrates the ingestion throughput. PostGIS exhibits a degrading trend as the dataset grows, bottlenecked by the logarithmic cost of R-tree rebalancing. In contrast, Ours maintains a stable throughput ($\approx 2100$ img/sec). Although slightly lower than the lightweight GeoMesa ($\approx 2500$ img/sec) due to the dual-table write overhead, our method demonstrates linear scalability suitable for high-velocity streaming data.Regarding storage cost (Fig.~\ref{fig:index_exp4}(b)), our index occupies approximately 0.83\% of the raw data size. While this is higher than GeoMesa (0.15\%) and PostGIS (0.51\%) due to the storage of grid-window mappings, it remains strictly below the 1\% threshold. This result validates that the proposed method achieves significant performance gains with a negligible storage penalty.

\subsection{Evaluating the Concurrency Control}
In this section, we evaluate the proposed hybrid coordination mechanism on a distributed storage cluster to assess its scalability, robustness under contention, and internal storage efficiency. We investigated end-to-end latency, throughput, tail latency, and I/O amplification under varying degrees of concurrency and spatial contention.

\par
To systematically control the workload characteristics, we developed a synthetic workload generator. We define the \textit{Spatial Overlap Ratio} ($\sigma$) to quantify the extent of shared data regions among concurrent queries, ranging from $\sigma=0$ (disjoint) to $\sigma=0.9$ (highly concentrated hotspots). The number of concurrent clients varies from $N=1$ to $N=64$.
For comparison, we evaluate the following execution schemes:
\begin{enumerate}
	\item \textbf{Baseline A (Naive):} Queries function as isolated threads with independent I/O execution.
	\item \textbf{Baseline B (Shared Index):} Metadata access is shared, but data retrieval remains uncoordinated, representing the state-of-the-practice in systems like GeoMesa.
	\item \textbf{Ours:} The proposed mechanism featuring contention-aware switching, global I/O plan ordering, and window merging.
\end{enumerate}

\subsubsection{Concurrency Scalability}
\begin{figure}[tb]
	\centering
	\subfigure[The query latency]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.98\textwidth]{exp/cc_exp1_1.pdf}
		\end{minipage}
	}
	\label{fig:cc_exp1_1}
	\subfigure[Aggregate Throughput]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.905\textwidth]{exp/cc_exp1_2.pdf}
		\end{minipage}
	}
	\label{fig:cc_exp1_2}
	\caption{The computing cost of spatial-keyword skylines}
	\label{fig:cc_exp1}
\end{figure}

\par
First, we investigated the system scalability by increasing the number of concurrent clients from 1 to 64 under a high-overlap scenario ($\sigma \approx 0.8$). Fig.~\ref{fig:cc_exp1} reports the mean latency, P95 tail latency, and aggregate throughput. Note that the latency axes in Fig.~\ref{fig:cc_exp1}(a) are plotted on a log scale to visualize the orders-of-magnitude difference.

\par
As shown in Fig.~\ref{fig:cc_exp1}(a), both Baseline A and Baseline B exhibit exponential latency degradation. At 64 clients, the mean latency of Baseline A spikes to 12,000 ms, indicating severe storage saturation. This bottleneck arises from the ``I/O blender effect,'' where randomized concurrent reads trigger severe disk seek thrashing. In contrast, Ours maintains a stable latency profile, increasing only marginally to 110 ms at 64 clients.

\par
Fig.~\ref{fig:cc_exp1}(b) further demonstrates the throughput advantage. While Baselines saturate at approximately 16--32 clients, Ours demonstrates super-linear throughput scaling relative to logical requests. This is attributed to the request collapse mechanism, where higher concurrency increases the probability of window merging, thereby reducing the physical I/O cost per query.

\subsubsection{Tail Latency and Contention Sensitivity}
\begin{figure}[tb]
	\centering
	\subfigure[Tail Latency Sensitivity]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/cc_exp2_1.pdf}
		\end{minipage}
	}
	\label{fig:cc_exp2_1}
	\subfigure[Fairness under Contention]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/cc_exp2_2.pdf}
		\end{minipage}
	}
	\label{fig:cc_exp2_2}
	\caption{Tail Latency and Contention Sensitivity}
	\label{fig:cc_exp2}
\end{figure}

\par
Next, we fixed the concurrency at $N=32$ and swept the Spatial Overlap Ratio $\sigma$ from 0 to 0.9 to evaluate the system's resilience to hotspots. Fig.~\ref{fig:cc_exp2} depicts the P95 latency and fairness index.

\par
Intuitively, higher contention typically degrades performance. However, Fig.~\ref{fig:cc_exp2}(a) reveals a \emph{counter-intuitive} phenomenon for our system: the P95 latency remains flat ($\approx 48$ ms) even as $\sigma$ approaches 0.9. This indicates that our coordination mechanism successfully converts contention'' into optimization opportunities'' via window merging. Conversely, both Baselines exhibit a sharp ``performance cliff'' when $\sigma > 0.5$, with Baseline A reaching 8,500 ms at $\sigma=0.9$.

\par
Furthermore, Fig.~\ref{fig:cc_exp2}(b) shows that our system maintains a Jain’s Fairness Index near 1.0, whereas Baselines drop to 0.25--0.35. This confirms that the deterministic plan queue effectively prevents the starvation of queries accessing contended regions.

\subsubsection{Storage-Level Effects and Request Collapse}
\begin{figure}[tb]
	\centering
	\subfigure[Data Volume Reduction]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/cc_exp3_1.pdf}
		\end{minipage}
	}
	\label{fig:cc_exp3_1}
	\subfigure[Request Collapse (IOPS)]{
		\begin{minipage}[b]{0.227\textwidth}
			\includegraphics[width=0.9\textwidth]{exp/cc_exp3_2.pdf}
		\end{minipage}
	}
	\label{fig:cc_exp3_2}
	\caption{Storage-Level Effects and Request Collapse}
	\label{fig:cc_exp3}
\end{figure}

\begin{figure}
	\centering
	\includegraphics[width=1.8in]{exp/cc_exp3_3.pdf}
	\caption{Merging Efficiency}
	\label{fig:cc_exp3_3}
\end{figure}

\par
To explain the performance gains observed above, we analyzed the internal I/O behavior. Fig.~\ref{fig:cc_exp3} compares the physical data movement against logical query demands. Note that in this experiment, Baseline A and Baseline B are grouped as a single baseline, as neither implements window-level coordination.

\par
Fig.~\ref{fig:cc_exp3}(a) and Fig.~\ref{fig:cc_exp3}(b) demonstrate the Request Collapse effect. While 64 concurrent clients generate 12,800 IOPS in the baseline, our system collapses them into fewer than 600 physical operations. Fig.~\ref{fig:cc_exp3_3} quantifies this using the Merging Efficiency. As the overlap ratio $\sigma$ increases, the I/O amplification factor of our system drops linearly from 1.0 to 0.15. This mathematically proves that the throughput gains are derived from a fundamental reduction in physical I/O volume rather than mere CPU scheduling.

\subsubsection{Deterministic vs Non-Deterministic Modes}
\begin{figure}
	\centering
	\includegraphics[width=1.8in]{exp/cc_exp4_1.pdf}
	\caption{Adaptive Performance Switching}
	\label{fig:cc_exp4}
\end{figure}

\par
We then validated the effectiveness of the hybrid switching logic by comparing it against static Forced Optimistic'' and Forced Deterministic'' policies. As shown in Fig.~\ref{fig:cc_exp4}, the static policies exhibit distinct weaknesses: the Deterministic mode incurs high coordination overhead ($\approx 60$ ms) at low $\sigma$, while the Optimistic mode suffers from exponential thrashing at high $\sigma$. The Hybrid curve successfully tracks the lower performance envelope of the two.

\subsubsection{Microbenchmark of Window Merging}
\begin{figure*}[htb]
	\centering
	\subfigure[Reduction Pipeline]{\label{fig:cc_exp5_1}
		\includegraphics[width=2.1in]{exp/cc_exp5_1.pdf}}
	\subfigure[Run Length Distribution]{\label{fig:cc_exp5_2}
		\includegraphics[width=2.1in]{exp/cc_exp5_2.pdf}}
	%\subfigure[]{\label{fig:trans_candidate}
		%\includegraphics[width=0.6in]{trans_candidate.eps}}
	\subfigure[Cost-Benefit Analysis]{\label{fig:cc_exp5_3}
		\includegraphics[width=2.1in]{exp/cc_exp5_3.pdf}}
	%\subfigure[]{\label{fig:diagram3}
		%\includegraphics[width=0.7in]{routing.eps}}
	\caption{% (a) Illustration of avoiding couplers.
		Microbenchmark of Window Merging}
	\label{fig:cc_exp5}
\end{figure*}

\par
Finally, we dissected the efficiency of the three-stage reduction pipeline. Fig.~\ref{fig:cc_exp5_1} shows that the combination of De-duplication (Stage 1) and Range Merging (Stage 2) achieves a cumulative reduction in request count consistent with the findings in Section 5.3.3.

\par
Fig.~\ref{fig:cc_exp5_2} presents the Run Length Distribution (CDF) of I/O requests. The proposed mechanism shifts the I/O pattern from small, fragmented reads (typical in baselines) to larger, sequential chunks, which significantly amortizes disk seek times.Fig.~\ref{fig:cc_exp5_3} presents the cost-benefit analysis. The CPU overhead of the dispatcher remains negligible ($< 2.5 \mu s$ per window) compared to the benefit of achieving a $>90\%$ zero-copy ratio, verifying that the algorithmic complexity of coordination yields a high return on investment in terms of system throughput.

\subsection{Evaluating the I/O tuning}
In this section, we evaluate the effectiveness of the proposed SA-GMAB tuning framework. The experiments are designed to verify four key properties: fast convergence speed, robustness against stochastic noise, adaptability to workload shifts, and tangible end-to-end performance gains.

\subsubsection{Convergence Speed and Tuning Cost}
\begin{figure*}[htb]
	\centering
	\subfigure[Convergence Speed]{\label{fig:tune_exp1_1}
		\includegraphics[width=2.1in]{exp/tune_exp1_1.pdf}}
	\subfigure[Cumulative Tuning Overhead]{\label{fig:tune_exp1_2}
		\includegraphics[width=2.1in]{exp/tune_exp1_2.pdf}}
	%\subfigure[]{\label{fig:trans_candidate}
		%\includegraphics[width=0.6in]{trans_candidate.eps}}
	\subfigure[Search Efficiency]{\label{fig:tune_exp1_3}
		\includegraphics[width=2.1in]{exp/tune_exp1_3.pdf}}
	%\subfigure[]{\label{fig:diagram3}
		%\includegraphics[width=0.7in]{routing.eps}}
	\caption{% (a) Illustration of avoiding couplers.
		Convergence Speed and Tuning Cost}
	\label{fig:tune_exp1}
\end{figure*}

\par
First, we initiated a cold-start tuning session to evaluate how efficiently each method identifies high-quality configurations. Fig.~\ref{fig:tune_exp1} reports the convergence trajectory, cumulative tuning cost, and search efficiency.

\par
As shown in Fig.~\ref{fig:tune_exp1_1}, the \textbf{Default} configuration remains trapped in a high-latency state ($\approx 450$ ms). While \textbf{H5Tuner} and \textbf{TunIO} gradually improve performance, they exhibit slow decay rates, requiring over 80 steps to stabilize. In contrast, \textbf{SA-GMAB} achieves a sharp drop in best-observed latency within the first 15--20 steps. This acceleration is attributed to the surrogate model, which effectively prunes unpromising configurations before costly execution.

\par
Fig.~\ref{fig:tune_exp1_2} plots the cumulative tuning overhead (regret). The steep slope of the GA-based baselines indicates that they repeatedly explore poor configurations due to their memory-less nature. Our method exhibits the flattest curve, minimizing the cumulative performance loss during exploration. Furthermore, Fig.~\ref{fig:tune_exp1_3} confirms the high sample efficiency: SA-GMAB reaches the near-optimal zone ($\approx 50$ ms) after evaluating significantly fewer unique configurations compared to H5Tuner and TunIO.

\subsubsection{Robustness under Stochastic Interference}
\begin{figure*}[htb]
	\centering
	\subfigure[Reward Stability under Noise]{\label{fig:tune_exp2_1}
		\includegraphics[width=2.1in]{exp/tune_exp2_1.pdf}}
	\subfigure[Regret Growth]{\label{fig:tune_exp2_2}
		\includegraphics[width=2.1in]{exp/tune_exp2_2.pdf}}
	%\subfigure[]{\label{fig:trans_candidate}
		%\includegraphics[width=0.6in]{trans_candidate.eps}}
	\subfigure[Configuration Stability]{\label{fig:tune_exp2_3}
		\includegraphics[width=2.1in]{exp/tune_exp2_3.pdf}}
	%\subfigure[]{\label{fig:diagram3}
		%\includegraphics[width=0.7in]{routing.eps}}
	\caption{% (a) Illustration of avoiding couplers.
		Robustness under Stochastic Interference}
	\label{fig:tune_exp2}
\end{figure*}

\par
In concurrent I/O environments, performance measurements are inherently noisy. Fig.~\ref{fig:tune_exp2} evaluates the robustness of the tuning algorithms under such stochastic interference.

\par
Fig.~\ref{fig:tune_exp2_1} tracks the instantaneous reward over time. \textbf{H5Tuner} exhibits high variance, frequently dropping to low-reward regions because it discards good configurations that perform poorly once due to transient noise. In contrast, \textbf{SA-GMAB} maintains a stable high-reward trajectory. By aggregating historical observations in the memory table, our method ``smooths out'' the noise and correctly identifies optimal configurations despite fluctuations. Fig.~\ref{fig:tune_exp2_3} further breaks down the decision quality. Our method selects the \textbf{Optimal Configuration} for \textbf{88\%} of the rounds, whereas H5Tuner selects it only \textbf{35\%} of the time, wasting the majority of its budget on suboptimal or poor parameters.

\subsubsection{Adaptation to Workload Shifts}
\begin{figure*}[htb]
	\centering
	\subfigure[Response to Workload Shift]{\label{fig:tune_exp3_1}
		\includegraphics[width=2.1in]{exp/tune_exp3_1.pdf}}
	\subfigure[Parameter Adaptation]{\label{fig:tune_exp3_2}
		\includegraphics[width=2.1in]{exp/tune_exp3_2.pdf}}
	%\subfigure[]{\label{fig:trans_candidate}
		%\includegraphics[width=0.6in]{trans_candidate.eps}}
	\subfigure[Speed of Adaptation]{\label{fig:tune_exp3_3}
		\includegraphics[width=2.1in]{exp/tune_exp3_3.pdf}}
	%\subfigure[]{\label{fig:diagram3}
		%\includegraphics[width=0.7in]{routing.eps}}
	\caption{% (a) Illustration of avoiding couplers.
		Adaptation to Workload Shifts}
	\label{fig:tune_exp3}
\end{figure*}

\par
We then investigated the system's ability to adapt to non-stationary environments. We introduced a sudden workload shift at $t=60$, changing the query pattern from sparse random access to dense sequential scans.

\par
As illustrated in Fig.~\ref{fig:tune_exp3_1}, the shift causes an immediate latency spike ($>300$ ms) for all methods. The \textbf{Default} policy fails to adapt. \textbf{H5Tuner} reacts sluggishly, requiring many generations to evolve parameters for the new regime. \textbf{SA-GMAB}, however, detects the context change and leverages its surrogate model to rapidly propose new candidates, achieving a full recovery to the new optimal latency ($\approx 80$ ms) within fewer than 15 batches (Fig.~\ref{fig:tune_exp3_3}).Fig.~\ref{fig:tune_exp3_2} traces the evolution of the \textit{Merge Threshold} parameter. While baselines drift slowly, our method executes a decisive shift from 0.2 to 0.8, effectively locking onto the new optimal region required by the sequential workload.

\subsubsection{Impact on End-to-End Query Performance}
\begin{figure*}[htb]
	\centering
	\subfigure[Steady-State Stability]{\label{fig:tune_exp4_1}
		\includegraphics[width=2.1in]{exp/tune_exp4_1.pdf}}
	\subfigure[End-to-End Throughput]{\label{fig:tune_exp4_2}
		\includegraphics[width=2.1in]{exp/tune_exp4_2.pdf}}
	%\subfigure[]{\label{fig:trans_candidate}
		%\includegraphics[width=0.6in]{trans_candidate.eps}}
	\subfigure[I/O Efficiency Tuning]{\label{fig:tune_exp4_3}
		\includegraphics[width=2.1in]{exp/tune_exp4_3.pdf}}
	%\subfigure[]{\label{fig:diagram3}
		%\includegraphics[width=0.7in]{routing.eps}}
	\caption{% (a) Illustration of avoiding couplers.
		Impact on End-to-End Query Performance}
	\label{fig:tune_exp4}
\end{figure*}

\par
Finally, we measured the steady-state performance of the fully optimized system. Fig.~\ref{fig:tune_exp4} compares the end-to-end metrics across different tuning methods.

\par
Fig.~\ref{fig:tune_exp4_1} presents a latency trace during steady-state operation. While \textbf{Default} suffers from high latency and \textbf{GA-based} methods exhibit jitter due to unstable exploration, \textbf{SA-GMAB} maintains a consistently low and smooth latency profile ($\approx 45$ ms). This stability is critical for meeting SLA requirements in real-time analytics. Fig.~\ref{fig:tune_exp4_2} summarizes the aggregate throughput gain. Our method achieves a \textbf{5.6$\times$} improvement over the default configuration. Fig.~\ref{fig:tune_exp4_3} reveals the underlying reason: under high contention, the tuner automatically selects aggressive batching and merging parameters, driving the I/O amplification factor down to \textbf{0.2}. This confirms that SA-GMAB effectively aligns the system configuration with real-time workload characteristics to maximize I/O efficiency.


\section{Conclusions}\label{sec:Con}
Modern high-performance remote sensing data management systems face a critical bottleneck shift from metadata discovery to data extraction, driven by prohibitive runtime geospatial computations ($C_{geo}$) and severe I/O contention under concurrent access. This paper presents a comprehensive I/O-aware retrieval processing framework designed to strictly bound retrieval latency and maximize throughput for large-scale spatio-temporal analytics. By introducing the "Index-as-an-Execution-Plan" paradigm and a dual-layer inverted structure (G2I and I2G), we bridge the semantic gap between logical indexing and physical storage, effectively shifting the computational burden from retrieval time to ingestion time.To address the scalability challenges in multi-user environments, we developed a hybrid concurrency-aware I/O coordination protocol that adaptively switches between deterministic ordering and optimistic execution based on spatial contention. Furthermore, to handle the complexity of parameter configuration in fluctuating workloads, we integrated a Surrogate-Assisted Genetic Multi-Armed Bandit (SA-GMAB) mechanism for online automatic I/O tuning.Our empirical evaluation on large-scale Sentinel-2 datasets demonstrates that the proposed I/O-aware index reduces end-to-end latency by an order of magnitude compared to standard window-based reading approaches. The hybrid coordination mechanism effectively converts I/O contention into request merging opportunities, achieving linear throughput scaling significantly superior to traditional isolated execution. Additionally, the SA-GMAB tuning method exhibits faster convergence speed and greater robustness against stochastic noise compared to existing genetic baselines. These findings provide a scalable and predictable path for next-generation remote sensing platforms to support real-time, data-intensive concurrent workloads.

% if have a single appendix:
%\appendix[Proof of the Zonklar Equations]
% or
%\appendix  % for no appendix heading
% do not use \section anymore after \appendix, only \section*
% is possibly needed

% use appendices with more than one appendix
% then use \section to start each appendix
% you must declare a \section before using any
% \subsection or using \label (\appendices by itself
% starts a section numbered zero.)
%


%%\appendices
%%\section{Proof of the First Zonklar Equation}
%%Appendix one text goes here.

% you can choose not to have a title for an appendix
% if you want by leaving the argument blank
%%\section{}
%%Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.Appendix two text goes here.


% use section* for acknowledgement
\ifCLASSOPTIONcompsoc
  % The Computer Society usually uses the plural form
  \section*{Acknowledgments}
This work is supported in part by the National Natural
Science Foundation of China (No. U21A2013, No. 41925007 and No. 62076224), Open Research Project of The Hubei Key Laboratory of Intelligent Geo-Information Processing(KLIGIP-2019B14).
\else
  % regular IEEE prefers the singular form
 % \section*{Acknowledgment}
%%Dr. L. Wang's work is funded by ``One-Hundred Talents Program'' of
%Chinese Academy of Sciences. Drs. X. Chen, Z. Deng and D. Chen were
%supported in part by the by the National Natural Science Foundation of
%China (No. 61272314), the Program for New Century Excellent Talents in
%University (NCET-11-0722), the Excellent Youth Foundation of Hubei
%Scientific Committee (No. 2012FFA025), the Natural Science Foundation
%of Hubei Province (No. 2011CDB159), the Specialized Research Fund for
%the Doctoral Program of Higher Education (20110145110010), the
%Fundamental Research Funds for the Central Universities, China University of Geosciences(Wuhan) (No.
%CUG120114, No. CUG130617), and Beijing Microelectronics Technology Institute under
%the University Research Programme (No. BM-KJ-FK-WX-20130731-0013).

\fi


% Can use something like this to put references on a page
% by themselves when using endfloat and the captionsoff option.
\ifCLASSOPTIONcaptionsoff
  \newpage
\fi


% trigger a \newpage just before the given reference
% number - used to balance the columns on the last page
% adjust value as needed - may need to be readjusted if
% the document is modified later
%\IEEEtriggeratref{8}
% The "triggered" command can be changed if desired:
%\IEEEtriggercmd{\enlargethispage{-5in}}

% references section

% can use a bibliography generated by BibTeX as a .bbl file
% BibTeX documentation can be easily obtained at:
% http://www.ctan.org/tex-archive/biblio/bibtex/contrib/doc/
% The IEEEtran BibTeX style support page is at:
% http://www.michaelshell.org/tex/ieeetran/bibtex/
\bibliographystyle{IEEEtran}
\bibliography{bib/references}

% argument is your BibTeX string definitions and bibliography database(s)
%\bibliography{IEEEabrv,../bib/paper}
%
% <OR> manually copy in the resultant .bbl file


% that's all folks
\end{document}