修改实验部分

This commit is contained in:
龙澳
2026-04-08 17:21:29 +08:00
parent e8c942d293
commit e0b8c89e73
9 changed files with 304 additions and 174 deletions

View File

@@ -51,31 +51,43 @@
\newlabel{equ:conflict detection}{{17}{5}}
\newlabel{equ:explainable conflict}{{18}{5}}
\@writefile{lot}{\contentsline {table}{\numberline {I}{\ignorespaces Physics-Informed Conflict Triage Categories}}{6}{}\protected@file@percent }
\newlabel{table_conflict_triage}{{I}{6}}
\newlabel{table:conflict_triage}{{I}{6}}
\newlabel{equ:conflict classification feature vector}{{19}{6}}
\newlabel{equ:conflict classification}{{20}{6}}
\newlabel{equ:conflict classification}{{21}{6}}
\newlabel{equ:conflict recalibration}{{21}{6}}
\newlabel{equ:Anti-Over-Smoothing Guarantee}{{22}{6}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {III-E}}AreoRAG Prompting}{6}{}\protected@file@percent }
\newlabel{sec:prompt}{{\mbox {III-E}}{6}}
\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces AreoRAG Prompting (ARP)}}{6}{}\protected@file@percent }
\newlabel{alg:arp}{{1}{6}}
\citation{yang18hotpotqa}
\citation{ho202WikiMultiHopQA}
\citation{Wu25MultiRAG}
\@writefile{toc}{\contentsline {section}{\numberline {IV}Experiments}{7}{}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {II}{\ignorespaces Statistics of the Planetary Datasets}}{7}{}\protected@file@percent }
\newlabel{table_planetary_datasets}{{II}{7}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-A}}Experimental Settings}{7}{}\protected@file@percent }
\citation{Lewis20RAG}
\citation{Harsh23IRCoT}
\citation{Chan24RQRAG}
\citation{Wu25MultiRAG}
\citation{luo25hyperrag}
\citation{lien26hyperrag}
\citation{liu26truthfulrag}
\citation{Zhou24MetaRAG}
\citation{Wu25MultiRAG}
\@writefile{lot}{\contentsline {table}{\numberline {II}{\ignorespaces Statistics of the Planetary Datasets}}{8}{}\protected@file@percent }
\newlabel{table:planetary_datasets}{{II}{8}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-B}}Overall Retrieval and QA Performance (Q1)}{8}{}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {III}{\ignorespaces Comparison with Baseline Methods on Planetary and General QA Datasets}}{9}{}\protected@file@percent }
\newlabel{table_comparison}{{III}{9}}
\newlabel{table:comparison_QA}{{III}{9}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-C}}Robustness Under Spatial Sparsity and Conflict Intensity (Q2)}{9}{}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-D}}Ablation Study (Q3)}{9}{}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {IV}{\ignorespaces Ablation Experiments of HySH and PICT Modules}}{10}{}\protected@file@percent }
\newlabel{table_ablation}{{IV}{10}}
\newlabel{table:ablation}{{IV}{10}}
\@writefile{lot}{\contentsline {table}{\numberline {V}{\ignorespaces Conflict Handling Performance on MarsConflict-50}}{10}{}\protected@file@percent }
\newlabel{table_conflict}{{V}{10}}
\newlabel{table:conflict}{{V}{10}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-E}}Conflict Preservation Evaluation (Q4)}{10}{}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {VI}{\ignorespaces Time Cost Analysis Across Modules}}{10}{}\protected@file@percent }
\newlabel{table_time_cost}{{VI}{10}}
\newlabel{table:time_cost}{{VI}{10}}
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-F}}Efficiency Analysis (Q5)}{10}{}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-G}}Case Study}{10}{}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {\mbox {IV-H}}Limitations}{11}{}\protected@file@percent }
@@ -105,4 +117,9 @@
\bibcite{liu26truthfulrag}{15}
\bibcite{tang26diagnosing}{16}
\bibcite{Li25SubGraphRAG}{17}
\bibcite{yang18hotpotqa}{18}
\bibcite{ho202WikiMultiHopQA}{19}
\bibcite{Harsh23IRCoT}{20}
\bibcite{Chan24RQRAG}{21}
\bibcite{Zhou24MetaRAG}{22}
\gdef \@abspage@last{14}

View File

@@ -140,4 +140,44 @@ M.~Li, S.~Miao, and P.~Li, ``Simple is effective: The roles of graphs and large
{ICLR} 2025, Singapore, April 24-28, 2025}.\hskip 1em plus 0.5em minus
0.4em\relax OpenReview.net, 2025.
\bibitem{yang18hotpotqa}
Z.~Yang, P.~Qi, S.~Zhang, Y.~Bengio, W.~Cohen, R.~Salakhutdinov, and C.~D.
Manning, ``{H}otpot{QA}: A dataset for diverse, explainable multi-hop
question answering,'' in \emph{Proceedings of the 2018 Conference on
Empirical Methods in Natural Language Processing}, E.~Riloff, D.~Chiang,
J.~Hockenmaier, and J.~Tsujii, Eds.\hskip 1em plus 0.5em minus 0.4em\relax
Brussels, Belgium: Association for Computational Linguistics, Oct.-Nov. 2018,
pp. 2369--2380.
\bibitem{ho202WikiMultiHopQA}
X.~Ho, A.-K. Duong~Nguyen, S.~Sugawara, and A.~Aizawa, ``Constructing a
multi-hop {QA} dataset for comprehensive evaluation of reasoning steps,'' in
\emph{Proceedings of the 28th International Conference on Computational
Linguistics}, D.~Scott, N.~Bel, and C.~Zong, Eds.\hskip 1em plus 0.5em minus
0.4em\relax Barcelona, Spain (Online): International Committee on
Computational Linguistics, Dec. 2020, pp. 6609--6625.
\bibitem{Harsh23IRCoT}
H.~Trivedi, N.~Balasubramanian, T.~Khot, and A.~Sabharwal, ``Interleaving
retrieval with chain-of-thought reasoning for knowledge-intensive multi-step
questions,'' in \emph{Proceedings of the 61st Annual Meeting of the
Association for Computational Linguistics (Volume 1: Long Papers)},
A.~Rogers, J.~Boyd-Graber, and N.~Okazaki, Eds.\hskip 1em plus 0.5em minus
0.4em\relax Toronto, Canada: Association for Computational Linguistics, Jul.
2023, pp. 10\,014--10\,037.
\bibitem{Chan24RQRAG}
\BIBentryALTinterwordspacing
C.~Chan, C.~Xu, R.~Yuan, H.~Luo, W.~Xue, Y.~Guo, and J.~Fu, ``{RQ-RAG:}
learning to refine queries for retrieval augmented generation,'' \emph{CoRR},
vol. abs/2404.00610, 2024. [Online]. Available:
\url{https://doi.org/10.48550/arXiv.2404.00610}
\BIBentrySTDinterwordspacing
\bibitem{Zhou24MetaRAG}
Y.~Zhou, Z.~Liu, J.~Jin, J.-Y. Nie, and Z.~Dou, ``Metacognitive
retrieval-augmented large language models,'' in \emph{Proceedings of the ACM
Web Conference 2024}, ser. WWW '24.\hskip 1em plus 0.5em minus 0.4em\relax
New York, NY, USA: Association for Computing Machinery, 2024, pp. 1453--1463.
\end{thebibliography}

View File

@@ -17,44 +17,44 @@ Database file #2: references.bib
-- See the "IEEEtran_bst_HOWTO.pdf" manual for usage information.
Done.
You've used 17 entries,
You've used 22 entries,
4087 wiz_defined-function locations,
1720 strings with 29554 characters,
and the built_in function-call counts, 17601 in all, are:
= -- 1159
> -- 910
< -- 74
+ -- 477
- -- 228
* -- 914
:= -- 2301
add.period$ -- 38
call.type$ -- 17
change.case$ -- 19
chr.to.int$ -- 159
cite$ -- 17
duplicate$ -- 1258
empty$ -- 1330
format.name$ -- 228
if$ -- 4113
1756 strings with 31069 characters,
and the built_in function-call counts, 23074 in all, are:
= -- 1567
> -- 1098
< -- 111
+ -- 578
- -- 271
* -- 1195
:= -- 3021
add.period$ -- 52
call.type$ -- 22
change.case$ -- 30
chr.to.int$ -- 255
cite$ -- 22
duplicate$ -- 1646
empty$ -- 1776
format.name$ -- 273
if$ -- 5409
int.to.chr$ -- 0
int.to.str$ -- 17
missing$ -- 318
newline$ -- 74
num.names$ -- 19
pop$ -- 854
int.to.str$ -- 22
missing$ -- 396
newline$ -- 91
num.names$ -- 30
pop$ -- 1058
preamble$ -- 1
purify$ -- 0
quote$ -- 2
skip$ -- 1260
skip$ -- 1651
stack$ -- 0
substring$ -- 492
swap$ -- 1040
text.length$ -- 21
substring$ -- 759
swap$ -- 1368
text.length$ -- 26
text.prefix$ -- 0
top$ -- 5
type$ -- 17
type$ -- 22
warning$ -- 0
while$ -- 38
width$ -- 19
write$ -- 182
while$ -- 59
width$ -- 24
write$ -- 234

View File

@@ -1,14 +1,14 @@
# Fdb version 4
["bibtex MarsRAG"] 1775632489.46607 "MarsRAG.aux" "MarsRAG.bbl" "MarsRAG" 1775632696.24948 0
"./references.bib" 1775631546.85579 21060 cb755e3d069bfb993edf430f576e97e4 ""
["bibtex MarsRAG"] 1775639990.07184 "MarsRAG.aux" "MarsRAG.bbl" "MarsRAG" 1775639991.99347 0
"./references.bib" 1775638278.60838 24162 b9adf6194105d5c3fb6cb906c31e6a6e ""
"D:/software/ctex/MiKTeX/bibtex/bib/ieeetran/IEEEabrv.bib" 1440617548 20898 a5d2167c380db7dfff810b085c77ed63 ""
"D:/software/ctex/MiKTeX/bibtex/bst/ieeetran/IEEEtran.bst" 1440622569 57748 7c8250ecf02814ce6ddc0cdbb63df1dd ""
"MarsRAG.aux" 1775632695.3425 6707 a13591f13f0356ed038e62f2e3954e3b "pdflatex"
"MarsRAG.aux" 1775639991.0783 7155 80d476163ab6fb991f4e9360609894fc "pdflatex"
(generated)
"MarsRAG.bbl"
"MarsRAG.blg"
(rewritten before read)
["pdflatex"] 1775632694.59958 "d:/onedrive/Desktop/Multi-RAG/MarsRAG/MarsRAG.tex" "MarsRAG.pdf" "MarsRAG" 1775632696.25065 0
["pdflatex"] 1775639990.35782 "d:/onedrive/Desktop/Multi-RAG/MarsRAG/MarsRAG.tex" "MarsRAG.pdf" "MarsRAG" 1775639991.99449 0
"D:/software/ctex/CTeX/fonts/sfd/UGBK.sfd" 1241576166 185529 821b4d3a4d64ce6f0757b41592552808 ""
"D:/software/ctex/CTeX/fonts/sfd/Unicode.sfd" 1241576166 5003 ada6fc83625c51dfd3000a816ae41161 ""
"D:/software/ctex/MiKTeX/fonts/enc/dvips/base/8r.enc" 1458473886 4993 1194fb36dfcb11d9ae9802f2b00b60a3 ""
@@ -121,9 +121,9 @@
"D:/software/ctex/MiKTeX/tex/latex/url/url.sty" 1388490452 12796 8edb7d69a20b857904dd0ea757c14ec9 ""
"D:/software/ctex/UserData/fonts/map/pdftex/pdftex.map" 1775617581.10787 280574 11b05735ccd5db23e8baf414abaacaec ""
"D:/software/ctex/UserData/miktex/data/le/pdftex/pdflatex.fmt" 1761184910.39527 23169076 a8cfad5eb3d5cf02ce2e7aad0517b308 ""
"MarsRAG.aux" 1775632695.3425 6707 a13591f13f0356ed038e62f2e3954e3b "pdflatex"
"MarsRAG.bbl" 1775632489.67722 7364 b7a485e756f8a1d65e4cec99e531b487 "bibtex MarsRAG"
"d:/onedrive/Desktop/Multi-RAG/MarsRAG/MarsRAG.tex" 1775632693.94638 85154 5f21c8829e6299d9a2ee36bb0f37039b ""
"MarsRAG.aux" 1775639991.0783 7155 80d476163ab6fb991f4e9360609894fc "pdflatex"
"MarsRAG.bbl" 1775639990.2931 9518 df284673f23e991a573218007c8127e3 "bibtex MarsRAG"
"d:/onedrive/Desktop/Multi-RAG/MarsRAG/MarsRAG.tex" 1775639987.7302 85896 05280e70c98689acda7edec5ecfb29b6 ""
(generated)
"MarsRAG.aux"
"MarsRAG.log"

View File

@@ -1,4 +1,4 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (MiKTeX 23.4) (preloaded format=pdflatex 2025.10.23) 8 APR 2026 15:18
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (MiKTeX 23.4) (preloaded format=pdflatex 2025.10.23) 8 APR 2026 17:19
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
@@ -292,11 +292,7 @@ File: l3backend-pdftex.def 2023-03-30 L3 backend support: PDF output (pdfTeX)
LaTeX Warning: Unused global option(s):
[lettersize].
(MarsRAG.aux
LaTeX Warning: Label `equ:conflict classification' multiply defined.
)
(MarsRAG.aux)
\openout1 = `MarsRAG.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 18.
@@ -457,199 +453,198 @@ File: omsptm.fd
LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10> not available
(Font) Font shape `OMS/cmsy/m/n' tried instead on input line 329.
[6]
Underfull \hbox (badness 10000) in paragraph at lines 383--383
Underfull \hbox (badness 10000) in paragraph at lines 382--382
|[]
[]
Overfull \hbox (14.39503pt too wide) in paragraph at lines 383--383
Overfull \hbox (14.39503pt too wide) in paragraph at lines 382--382
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 383--383
Underfull \hbox (badness 10000) in paragraph at lines 382--382
|[]
[]
Overfull \hbox (11.8429pt too wide) in paragraph at lines 383--383
Overfull \hbox (11.8429pt too wide) in paragraph at lines 382--382
[]
[]
Overfull \hbox (25.19485pt too wide) in paragraph at lines 386--386
Overfull \hbox (25.19485pt too wide) in paragraph at lines 385--385
[]|[]|
[]
Underfull \hbox (badness 10000) in paragraph at lines 386--386
Underfull \hbox (badness 10000) in paragraph at lines 385--385
|[]
[]
Overfull \hbox (27.67467pt too wide) in paragraph at lines 386--386
Overfull \hbox (27.67467pt too wide) in paragraph at lines 385--385
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 388--388
Underfull \hbox (badness 10000) in paragraph at lines 387--387
|[]
[]
Overfull \hbox (18.33882pt too wide) in paragraph at lines 388--388
Overfull \hbox (18.33882pt too wide) in paragraph at lines 387--387
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 390--390
Underfull \hbox (badness 10000) in paragraph at lines 389--389
|[]
[]
Overfull \hbox (27.23465pt too wide) in paragraph at lines 390--390
Overfull \hbox (27.23465pt too wide) in paragraph at lines 389--389
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 392--392
Underfull \hbox (badness 10000) in paragraph at lines 391--391
|[]
[]
Overfull \hbox (25.89078pt too wide) in paragraph at lines 392--392
Overfull \hbox (25.89078pt too wide) in paragraph at lines 391--391
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 394--394
Underfull \hbox (badness 10000) in paragraph at lines 393--393
|[]
[]
Overfull \hbox (24.43471pt too wide) in paragraph at lines 394--394
Overfull \hbox (24.43471pt too wide) in paragraph at lines 393--393
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 394--394
Underfull \hbox (badness 10000) in paragraph at lines 393--393
|[]
[]
Overfull \hbox (30.10707pt too wide) in paragraph at lines 394--394
Overfull \hbox (30.10707pt too wide) in paragraph at lines 393--393
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 396--396
Underfull \hbox (badness 10000) in paragraph at lines 395--395
|[]
[]
Overfull \hbox (32.7467pt too wide) in paragraph at lines 396--396
Overfull \hbox (32.7467pt too wide) in paragraph at lines 395--395
[]
[]
Underfull \hbox (badness 10000) in paragraph at lines 396--396
Underfull \hbox (badness 10000) in paragraph at lines 395--395
|[]
[]
Overfull \hbox (30.10707pt too wide) in paragraph at lines 396--396
Overfull \hbox (30.10707pt too wide) in paragraph at lines 395--395
[]
[]
Underfull \hbox (badness 2452) in paragraph at lines 401--402
Underfull \hbox (badness 2452) in paragraph at lines 400--401
[]\OT1/ptm/m/n/10 Additionally, to val-i-date gen-er-al-iza-tion on es-tab-lished
[]
[7]
Underfull \hbox (badness 10000) in paragraph at lines 431--432
[]\OT1/ptm/m/n/10 1) **Stan-dard RAG** [6]: Con-ven-tional retrieval-
Underfull \hbox (badness 4752) in paragraph at lines 411--412
[]\OT1/ptm/m/n/10 Conflict Clas-si-fi-ca-tion Ac-cu-racy (CCA): Four-class
[]
Underfull \hbox (badness 1603) in paragraph at lines 441--442
[]\OT1/ptm/m/n/10 5) **Hy-per-GraphRAG** [25]: Hypergraph-based RAG
Underfull \hbox (badness 4699) in paragraph at lines 411--412
\OT1/ptm/m/n/10 clas-si-fi-ca-tion ac-cu-racy over the con-flict types on
[]
Underfull \hbox (badness 2698) in paragraph at lines 495--496
Underfull \hbox (badness 2698) in paragraph at lines 486--487
\OT1/ptm/m/n/10 ti-HopQA), Are-oRAG main-tains com-pet-i-tive per-for-mance
[]
[8]
Underfull \hbox (badness 10000) in paragraph at lines 540--540
Underfull \hbox (badness 10000) in paragraph at lines 531--531
[]|\OT1/ptm/m/n/8 w/o In-ter-ac-tion En-tropy (use
[]
Underfull \hbox (badness 3271) in paragraph at lines 547--548
[]\OT1/ptm/m/n/10 **a) HySH Mod-ule Anal-y-sis:** The HySH mod-ule
Underfull \hbox (badness 1515) in paragraph at lines 538--539
\OT1/ptm/m/n/10 F1 drops of 11.2% on MarsRegion-QA and 12.3% on
[]
Underfull \hbox (badness 1917) in paragraph at lines 549--550
Underfull \hbox (badness 1917) in paragraph at lines 540--541
\OT1/ptm/m/n/10 F1 im-prove-ment over Eu-clidean hy-per-graph (49.2% vs.
[]
[9]
Underfull \hbox (badness 10000) in paragraph at lines 572--572
Underfull \hbox (badness 10000) in paragraph at lines 563--563
[]|\OT1/ptm/m/n/8 Standard
[]
Underfull \hbox (badness 10000) in paragraph at lines 574--574
Underfull \hbox (badness 10000) in paragraph at lines 565--565
[]|\OT1/ptm/m/n/8 MultiRAG
[]
Underfull \hbox (badness 10000) in paragraph at lines 580--580
Underfull \hbox (badness 10000) in paragraph at lines 571--571
[]|\OT1/ptm/b/n/8 AreoRAG
[]
Underfull \hbox (badness 1442) in paragraph at lines 614--615
[]\OT1/ptm/m/n/10 The pre-pro-cess-ing time (86.5s) is higher than Mul-ti-
[]
[10] [11]
Package textcomp Info: Symbol \textrightarrow not provided by
(textcomp) font family ptm in TS1 encoding.
(textcomp) Default family used instead on input line 672.
(textcomp) Default family used instead on input line 663.
Package textcomp Info: Symbol \textrightarrow not provided by
(textcomp) font family ptm in TS1 encoding.
(textcomp) Default family used instead on input line 672.
(textcomp) Default family used instead on input line 663.
[12{D:/software/ctex/MiKTeX/fonts/enc/dvips/cm-super/cm-super-ts1.enc}]
Underfull \hbox (badness 2495) in paragraph at lines 706--707
Underfull \hbox (badness 2495) in paragraph at lines 697--698
[]\OT1/ptm/m/n/10 This work is sup-ported by the Na-tional Key R&D
[]
Underfull \hbox (badness 2799) in paragraph at lines 706--707
Underfull \hbox (badness 2799) in paragraph at lines 697--698
\OT1/ptm/m/n/10 Pro-gram of China ``In-ter-gov-ern-men-tal In-ter-na-tional Sci-
[]
Underfull \hbox (badness 7576) in paragraph at lines 706--707
Underfull \hbox (badness 7576) in paragraph at lines 697--698
\OT1/ptm/m/n/10 ence and Tech-nol-ogy In-no-va-tion Co-op-er-a-tion" (Grant
[]
(MarsRAG.bbl [13]) [14
] (MarsRAG.aux)
LaTeX Warning: There were multiply-defined labels.
)
(MarsRAG.bbl [13]) [14] (MarsRAG.aux) )
Here is how much of TeX's memory you used:
5495 strings out of 476331
91763 string characters out of 5797649
1892660 words of memory out of 5000000
25840 multiletter control sequences out of 15000+600000
5501 strings out of 476331
91873 string characters out of 5797649
1896660 words of memory out of 5000000
25846 multiletter control sequences out of 15000+600000
562405 words of font info for 135 fonts, out of 8000000 for 9000
1145 hyphenation exceptions out of 8191
57i,19n,63p,2307b,408s stack positions out of 10000i,1000n,20000p,200000b,200000s
<D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmbx10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmbx7.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmex10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi5.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi6.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi7.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi8.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi9.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmr10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmr6.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmr7.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmr8.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmr9.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmsy10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmsy5.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmsy7.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/cm/cmsy8.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/symbols/msam10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/amsfonts/symbols/msbm10.pfb><D:/software/ctex/MiKTeX/fonts/type1/public/cm-super/sfrm1000.pfb><D:/software/ctex/MiKTeX/fonts/type1/urw/times/utmb8a.pfb><D:/software/ctex/MiKTeX/fonts/type1/urw/times/utmbi8a.pfb><D:/software/ctex/MiKTeX/fonts/type1/urw/times/utmr8a.pfb><D:/software/ctex/MiKTeX/fonts/type1/urw/times/utmri8a.pfb>
Output written on MarsRAG.pdf (14 pages, 381958 bytes).
Output written on MarsRAG.pdf (14 pages, 386973 bytes).
PDF statistics:
175 PDF objects out of 1000 (max. 8388607)
0 named destinations out of 1000 (max. 500000)

Binary file not shown.

Binary file not shown.

View File

@@ -259,7 +259,7 @@ i.e., the apparent inconsistency is resolvable by accounting for observation con
\begin{table}
\renewcommand{\arraystretch}{1.3}
\caption{Physics-Informed Conflict Triage Categories}
\label{table_conflict_triage}
\label{table:conflict_triage}
\vspace{-0.13in}
\centering
\begin{tabular}{|m{2.1cm}|m{2.8cm}|m{2.8cm}|}
@@ -278,7 +278,7 @@ i.e., the apparent inconsistency is resolvable by accounting for observation con
\end{tabular}
\end{table}
Based on this distinction, we define four conflict categories, each with a differentiated processing strategy, as shown in Table~\ref{table_conflict_triage}. For each detected conflict, we construct a feature vector that fuses information-theoretic, physical, and neural signals:
Based on this distinction, we define four conflict categories, each with a differentiated processing strategy, as shown in Table~\ref{table:conflict_triage}. For each detected conflict, we construct a feature vector that fuses information-theoretic, physical, and neural signals:
\begin{equation}
\label{equ:conflict classification feature vector}
\mathbf{z}_{conf} = \left[\mathcal{H}_{inter}, \; \|\Omega_i - \Omega_j\|, \; |\log(\ell_{res}^i / \ell_{res}^j)|, \; \Delta\mathcal{T}, \; \rho_{auth}(i,j), \; \mathbf{h}^{(l^*)}_{conf}\right],
@@ -295,7 +295,7 @@ Lemma~1 (Conflict Type Separability). The four conflict types are distinguished
3) Conflict-Aware Confidence Recalibration: Based on the classification result, we recalibrate the node confidence. This is the key departure from MultiRAG's MCC, which uniformly penalizes inconsistency:
\begin{equation}
\label{equ:conflict classification}
\label{equ:conflict recalibration}
C_{triage}\left( v \right) =\begin{cases}
C_{base}\left( v \right)& \text{if\,\,}v\notin \mathcal{C}^{detected}\\
\alpha \cdot C_{base}\left( v \right) +\left( 1-\alpha \right) \cdot \eta& \text{if\,\,}\hat{c}=noise\\
@@ -352,30 +352,29 @@ It should be noted that the ARP algorithm constructs the HySH offline as a prepr
\section{Experiments}
This section conducts experiments and performance analysis on the Hyperbolic Spatial Hypergraph (HySH) construction and the Physics-Informed Conflict Triage (PICT) modules. AreoRAG is compared against SOTA multi-source retrieval, graph-based RAG, and conflict-resolution baselines. Extensive experiments are conducted to assess the robustness and efficiency of AreoRAG and to answer the following questions.
- **Q1**: How does the overall retrieval and QA performance of AreoRAG compare with existing multi-source RAG and graph-based RAG methods on planetary spatial data?
- **Q2**: What are the respective impacts of spatial sparsity and inter-source conflict intensity on retrieval quality?
- **Q3**: How effective are the two core modules (HySH and PICT) of AreoRAG individually?
- **Q4**: Can PICT correctly preserve scientifically valuable conflicts while filtering noise, and how does this compare with conventional conflict-elimination approaches?
- **Q5**: What are the time costs of the various modules in AreoRAG?
\begin{itemize}
\item \textbf{Q1}: How does the overall retrieval and QA performance of AreoRAG compare with existing multi-source RAG and graph-based RAG methods on planetary spatial data?
\item \textbf{Q2}: What are the respective impacts of spatial sparsity and inter-source conflict intensity on retrieval quality?
\item \textbf{Q3}: How effective are the two core modules (HySH and PICT) of AreoRAG individually?
\item \textbf{Q4}: Can PICT correctly preserve scientifically valuable conflicts while filtering noise, and how does this compare with conventional conflict-elimination approaches?
\item \textbf{Q5}: What are the time costs of the various modules in AreoRAG?
\end{itemize}
\subsection{Experimental Settings}
**a) Datasets:** To validate the effectiveness of AreoRAG in planetary multi-source spatial data retrieval, we construct three datasets from real Mars exploration archives and further evaluate on two general multi-hop QA benchmarks. The planetary datasets are summarized in Table I.
a) \textbf{Datasets}: To validate the effectiveness of AreoRAG in planetary multi-source spatial data retrieval, we construct three datasets from real Mars exploration archives and further evaluate on two general multi-hop QA benchmarks. The planetary datasets are summarized in Table~\ref{table:planetary_datasets}.
(1) **MarsRegion-QA**: A multi-source spatial QA dataset constructed from the Mars Orbital Data Explorer (ODE) archives. We select five scientifically significant regions on Mars — Jezero Crater, Gale Crater, Utopia Planitia, Valles Marineris, and Olympus Mons — and aggregate orbital observations from HiRISE (0.3 m), CTX (6 m), CRISM (18 m), and MOLA (460 m). Each query targets cross-source spatial reasoning (e.g., "What mineral signatures have been detected in the clay-bearing unit at the western delta of Jezero Crater, and do different orbital sensors agree?"). We construct 200 queries with expert-annotated ground truth answers and conflict labels.
% TODO 这个例子是否恰当?
(1) MarsRegion-QA: A multi-source spatial QA dataset constructed from the Mars Orbital Data Explorer archives. We select five scientifically significant regions on Mars: Jezero Crater, Gale Crater, Utopia Planitia, Valles Marineris, and Olympus Mons. For these areas, we aggregate orbital observations from HiRISE (0.5 m), CTX (5 m), CRISM (18 m), MoRIC (76 m), and MOLA (460 m). Each query targets cross-source spatial reasoning (e.g., ``What mineral signatures have been detected in the clay-bearing unit at the western delta of Jezero Crater, and do different orbital sensors agree?''). We construct 200 queries with expert-annotated ground truth answers and conflict labels.
(2) **MarsConflict-50**: A curated subset of 50 observation pairs exhibiting known scientific conflicts documented in the planetary science literature (e.g., CRISM detection of hydrated minerals vs. contradictory results from other spectral sensors at the same location). Each pair is annotated with conflict type (instrument-inherent, scale-dependent, temporal-evolution, or noise) by domain experts. This dataset serves as the primary benchmark for evaluating PICT's conflict classification accuracy.
(2) MarsConflict-50: A curated dataset comprising 50 observation pairs that exhibit established scientific conflicts documented in the planetary science literature. A representative example involves discrepancies between hydrated mineral detections by CRISM and contradictory measurements from other spectral sensors at identical locations. Each pair is rigorously annotated by domain experts into four categories: instrument-inherent, scale-dependent, temporal-evolution, and noise-induced. This dataset serves as the primary benchmark for evaluating the conflict classification accuracy of the PICT framework.
(3) **MarsTemporal-QA**: A temporal reasoning dataset comprising 150 queries about surface changes observed across different Mars Years (MY), such as recurring slope lineae (RSL) activity, dust storm impacts, and seasonal frost patterns. Each query requires integrating observations spanning $L_s$ ranges to assess temporal evolution.
(3) MarsTemporal-QA: A temporal reasoning dataset comprising 150 queries about surface changes observed across different Mars Years (MY), such as recurring slope lineae activity, dust storm impacts, and seasonal frost patterns. Each query requires integrating observations spanning $L_s$ ranges to assess temporal evolution.
\begin{table}
\renewcommand{\arraystretch}{1.3}
\caption{Statistics of the Planetary Datasets}
\label{table_planetary_datasets}
\label{table:planetary_datasets}
\vspace{-0.13in}
\centering
\begin{tabular}{|m{1cm}|m{1cm}|m{1cm}|m{1cm}|m{1cm}|m{1cm}|}
@@ -398,66 +397,58 @@ This section conducts experiments and performance analysis on the Hyperbolic Spa
\end{tabular}
\end{table}
Additionally, to validate generalization on established benchmarks, we evaluate on HotpotQA [38] and 2WikiMultiHopQA [39], using the same 300-question subsamples as MultiRAG [14] for fair comparison.
Additionally, to validate generalization on established benchmarks, we evaluate on HotpotQA \cite{yang18hotpotqa} and 2WikiMultiHopQA \cite{ho202WikiMultiHopQA}, using the same 300-question subsamples as MultiRAG \cite{Wu25MultiRAG} for fair comparison.
It is noteworthy that MarsRegion-QA exhibits high spatial density (multiple overlapping observations per region) but significant cross-resolution heterogeneity, while MarsConflict-50 is specifically designed to stress-test conflict handling with a high proportion of scientifically valuable disagreements (~72\% of conflicts are non-noise).
It is noteworthy that MarsRegion-QA exhibits multiple overlapping observations per region but significant cross-resolution heterogeneity, while MarsConflict-50 is specifically designed to stress-test conflict handling with a high proportion of scientifically valuable disagreements ($\sim$72\% of conflicts are non-noise).
**b) Evaluation Metrics:** We adopt multiple metrics to comprehensively evaluate retrieval quality, answer accuracy, and conflict handling:
b) \textbf{Evaluation Metrics}: We adopt multiple metrics to comprehensively evaluate retrieval quality, answer accuracy, and conflict handling:
- **F1 score**: The harmonic mean of precision and recall, assessing overall retrieval and answer quality:
\begin{itemize}
\item F1 score: The harmonic mean of precision and recall, assessing overall retrieval and answer quality: $F1 = 2 \times \frac{P \times R}{P + R}$.
\item Recall@K: Recall at rank $K$, measuring the proportion of relevant documents retrieved within the top-$K$ results.
\item Conflict Preservation Rate (CPR): The proportion of scientifically valuable conflicts (annotated as instrument-inherent, scale-dependent, or temporal-evolution) that are correctly preserved rather than filtered: $CPR = \frac{|\mathcal{C}^{sci}_{preserved}|}{|\mathcal{C}^{sci}_{total}|}$.
\item Noise Rejection Rate (NRR): The proportion of noise conflicts that are correctly filtered: $NRR = \frac{|\mathcal{C}^{noise}_{filtered}|}{|\mathcal{C}^{noise}_{total}|}$.
\item Conflict Classification Accuracy (CCA): Four-class classification accuracy over the conflict types on MarsConflict-50.
\item Query Time (QT) and Preprocessing Time (PT): Measured in seconds, assessing online and offline efficiency.
\end{itemize}
$$F1 = 2 \times \frac{P \times R}{P + R}$$
c) \textbf{Hyper-parameter Settings}: All methods were implemented in a Python 3.12 and CUDA 12.1 environment. The base LLM is Llama3-8B-Instruct for all methods except where noted. For HySH construction, the hyperbolic curvature is set to $K = -1.0$, the embedding dimension $d = 64$, and the resolution power parameter $p = 2$ for Spatial OEM. For PICT, the interaction entropy threshold is $\epsilon = 0.3$, the noise penalty $\eta = -0.5$, the scientific boost coefficient $\beta = 0.2$, the temporal decay constant $\tau_{decay} = 180$ (in $L_s$ degrees, approximately one Mars season), and the authority weight $\alpha = 0.5$. The MLP conflict classifier uses a two-layer architecture ($256 \rightarrow 128 \rightarrow 4$) with ReLU activation, trained on MarsConflict-50 with 5-fold cross-validation. The plausibility scoring MLP $f_\theta$ for retrieval follows the architecture in \cite{lien26hyperrag} with adaptive threshold $\tau_0 = 0.5$ and decay factor $c = 0.1$. All experiments were conducted on a device equipped with an NVIDIA A100 (80 GB) GPU and 256 GB of memory.
- **Recall@K**: Recall at rank $K$, measuring the proportion of relevant documents retrieved within the top-$K$ results.
d) \textbf{Baseline Models}: To demonstrate the superiority of AreoRAG, we compare with the following categories of methods.
- **Conflict Preservation Rate (CPR)**: The proportion of scientifically valuable conflicts (annotated as instrument-inherent, scale-dependent, or temporal-evolution) that are correctly preserved rather than filtered:
General RAG Methods:
$$CPR = \frac{|\mathcal{C}^{sci}_{preserved}|}{|\mathcal{C}^{sci}_{total}|}$$
1) Standard RAG \cite{Lewis20RAG}: Conventional retrieval-augmented generation with dense vector retrieval.
- **Noise Rejection Rate (NRR)**: The proportion of noise conflicts that are correctly filtered:
2) IRCoT \cite{Harsh23IRCoT}: Iterative retrieval with chain-of-thought reasoning refinement.
$$NRR = \frac{|\mathcal{C}^{noise}_{filtered}|}{|\mathcal{C}^{noise}_{total}|}$$
3) RQ-RAG \cite{Chan24RQRAG}: Retrieval with optimized query decomposition for complex queries.
- **Conflict Classification Accuracy (CCA)**: Four-class classification accuracy over the conflict types on MarsConflict-50.
Graph-based RAG Methods:
- **Query Time (QT)** and **Preprocessing Time (PT)**: Measured in seconds, assessing online and offline efficiency.
4) MultiRAG \cite{Wu25MultiRAG}: Multi-source line graph with multi-level confidence computing (the primary comparison target).
**c) Hyper-parameter Settings:** All methods were implemented in Python 3.10 and CUDA 12.1 environment. The base LLM is Llama3-8B-Instruct for all methods except where noted. For HySH construction, the hyperbolic curvature is set to $K = -1.0$, the embedding dimension $d = 64$, and the resolution power parameter $p = 2$ for Spatial OEM. For PICT, the interaction entropy threshold is $\epsilon = 0.3$, the noise penalty $\eta = -0.5$, the scientific boost coefficient $\beta = 0.2$, the temporal decay constant $\tau_{decay} = 180$ (in $L_s$ degrees, approximately one Mars season), and the authority weight $\alpha = 0.5$. The MLP conflict classifier uses a two-layer architecture ($256 \rightarrow 128 \rightarrow 4$) with ReLU activation, trained on MarsConflict-50 with 5-fold cross-validation. The plausibility scoring MLP $f_\theta$ for retrieval follows the architecture in [18] with adaptive threshold $\tau_0 = 0.5$ and decay factor $c = 0.1$. All experiments were conducted on a device equipped with an NVIDIA A100 (80 GB) GPU and 256 GB of memory.
5) HyperGraphRAG \cite{luo25hyperrag}: Hypergraph-based RAG with $n$-ary relational facts retrieval.
**d) Baseline Models:** To demonstrate the superiority of AreoRAG, we compare with the following categories of methods:
6) HyperRAG \cite{lien26hyperrag}: MLP-based retrieval over $n$-ary hypergraphs with adaptive search.
*General RAG Methods:*
% TODO: the Conflict-Resolution baselines need to be replaced
Conflict-Resolution Methods:
1) **Standard RAG** [6]: Conventional retrieval-augmented generation with dense vector retrieval.
7) TruthfulRAG \cite{liu26truthfulrag}: Knowledge graph-based conflict resolution via entropy-based filtering.
2) **IRCoT** [44]: Iterative retrieval with chain-of-thought reasoning refinement.
8) MetaRAG \cite{Zhou24MetaRAG}: Metacognitive strategies for hallucination mitigation in retrieval.
3) **RQ-RAG** [47]: Retrieval with optimized query decomposition for complex queries.
*Graph-based RAG Methods:*
4) **MultiRAG** [14]: Multi-source line graph with multi-level confidence computing (the primary comparison target).
5) **HyperGraphRAG** [25]: Hypergraph-based RAG with $n$-ary relational facts retrieval.
6) **HyperRAG** [18]: MLP-based retrieval over $n$-ary hypergraphs with adaptive search.
*Conflict-Resolution Methods:*
7) **TruthfulRAG** [17]: Knowledge graph-based conflict resolution via entropy-based filtering.
8) **MetaRAG** [9]: Metacognitive strategies for hallucination mitigation in retrieval.
**e) Dataset Preprocessing:** For the planetary datasets, we parse PDS4 labels and CNSA metadata through the multi-source spatial adapters (Section III-B) to extract spatial footprints, temporal windows, and instrument parameters. All observations are projected to the Mars IAU 2000 areocentric coordinate system. Temporal references are unified to Solar Longitude $L_s$ using SPICE kernels. For the general QA benchmarks, we follow the same preprocessing pipeline as MultiRAG [14] to ensure fair comparison.
e) \textbf{Dataset Preprocessing}: For the planetary datasets, we parse PDS4 labels and CNSA metadata through the multi-source spatial adapters (Section~\ref{sec:HySH}) to extract spatial footprints, temporal windows, and instrument parameters. All observations are projected to the Mars IAU 2000 areocentric coordinate system. Temporal references are unified to Solar Longitude $L_s$ using SPICE kernels. For the general QA benchmarks, we follow the same preprocessing pipeline as MultiRAG \cite{Wu25MultiRAG} to ensure fair comparison.
\subsection{Overall Retrieval and QA Performance (Q1)}
To validate the effectiveness of AreoRAG, we assess it using F1 scores and query times across the planetary datasets and the two general multi-hop QA benchmarks. Table II summarizes the performance comparison.
To validate the effectiveness of AreoRAG, we assess it using F1 scores and query times across the planetary datasets and the two general multi-hop QA benchmarks. Table~\ref{table:comparison_QA} summarizes the performance comparison.
\begin{table*}
\renewcommand{\arraystretch}{1.3}
\caption{Comparison with Baseline Methods on Planetary and General QA Datasets}
\label{table_comparison}
\label{table:comparison_QA}
\vspace{-0.13in}
\centering
\begin{tabular}{|m{2.5cm}|m{1.1cm}|m{1.3cm}|m{1.1cm}|m{1.3cm}|m{1.1cm}|m{1.3cm}|m{1.1cm}|m{1.3cm}|}
@@ -488,9 +479,9 @@ To validate the effectiveness of AreoRAG, we assess it using F1 scores and query
\end{tabular}
\end{table*}
Table II demonstrates that AreoRAG outperforms all comparative methods across both planetary and general QA datasets. On MarsRegion-QA, AreoRAG achieves an F1 score of 55.8\%, representing a 13.5\% absolute improvement over MultiRAG (42.3%) and a 9.3% improvement over the best graph-based baseline HyperRAG (46.5%). This significant gap validates the effectiveness of HySH in capturing spatial relationships that discrete line graphs and standard hypergraphs miss.
Table~\ref{table:comparison_QA} demonstrates that AreoRAG outperforms all comparative methods across both planetary and general QA datasets. On MarsRegion-QA, AreoRAG achieves an F1 score of 55.8\%, representing a 13.5\% absolute improvement over MultiRAG (42.3\%) and a 9.3\% improvement over the best graph-based baseline HyperRAG (46.5\%). This significant gap validates the effectiveness of HySH in capturing spatial relationships that discrete line graphs and standard hypergraphs miss.
On MarsTemporal-QA, which demands temporal reasoning across observation epochs, AreoRAG achieves 52.4\% F1, outperforming all baselines by at least 10.6\%. This improvement is attributed to PICT's temporal-evolution conflict handling (the $\gamma(|\Delta\mathcal{T}|)$ weighting in Eq. 20), which preserves temporal change signals rather than filtering them as inconsistencies.
On MarsTemporal-QA, which demands temporal reasoning across observation epochs, AreoRAG achieves 52.4\% F1, outperforming all baselines by at least 10.6\%. This improvement is attributed to PICT's temporal-evolution conflict handling (the $\gamma(|\Delta\mathcal{T}|)$ weighting in Eq.~\ref{equ:conflict recalibration}), which preserves temporal change signals rather than filtering them as inconsistencies.
On the general benchmarks (HotpotQA and 2WikiMultiHopQA), AreoRAG maintains competitive performance (61.7\% and 57.3\% F1), demonstrating that the framework generalizes beyond planetary science. The modest improvements over MultiRAG on these benchmarks (2.4\% and 1.6\%) are expected, as these datasets do not exhibit the spatial and physical conflict characteristics that AreoRAG is specifically designed to address.
@@ -500,22 +491,22 @@ Notably, HyperRAG and HyperGraphRAG perform well on planetary datasets (46.5\% a
AreoRAG demonstrates strong robustness under varying spatial sparsity and conflict intensity. We conduct experiments from two perspectives.
**1) Spatial Sparsity:** We applied 30\%, 50\%, and 70\% random hyperedge masking to MarsRegion-QA, progressively removing spatial connections while ensuring query answers remain retrievable.
a) Spatial Sparsity: We applied 30\%, 50\%, and 70\% random hyperedge masking to MarsRegion-QA, progressively removing spatial connections while ensuring query answers remain retrievable.
As shown in Fig. 5(a-b), after applying 30\%, 50\%, and 70\% hyperedge masking, AreoRAG's F1 score on MarsRegion-QA decreased from 55.8\% to 52.1\%, 49.3\%, and 45.6\% respectively. In contrast, MultiRAG's F1 dropped more sharply from 42.3\% to 37.8\%, 32.5\%, and 26.1\%. HyperRAG shows moderate degradation (46.5\% to 42.7\%, 38.9\%, 33.4\%). The superior robustness of AreoRAG under sparsity is attributed to two factors: (i) hyperbolic embedding preserves proximity information even when explicit graph edges are removed, as geodesic distance in $\mathbb{H}_K^d$ encodes spatial proximity independently of graph connectivity; and (ii) the Spatial OEM aggregation maintains representational quality by amplifying high-resolution signals that survive masking.
**2) Conflict Intensity:** We injected 30\%, 50\%, and 70\% synthetic conflict triples into MarsRegion-QA by duplicating existing observation records and perturbing their factual content (e.g., randomizing mineral identifications or altering coordinate data), simulating scenarios of increasing inter-source noise.
b) Conflict Intensity: We injected 30\%, 50\%, and 70\% synthetic conflict triples into MarsRegion-QA by duplicating existing observation records and perturbing their factual content (e.g., randomizing mineral identifications or altering coordinate data), simulating scenarios of increasing inter-source noise.
As shown in Fig. 5(c-d), AreoRAG's F1 score decreased only moderately from 55.8\% to 54.2\%, 52.8\%, and 50.1\% under 30\%, 50\%, and 70\% conflict injection respectively. MultiRAG exhibited steeper degradation (42.3\% to 40.1\%, 36.4\%, 30.7\%), and TruthfulRAG showed similar sensitivity (40.8\% to 38.2\%, 34.6\%, 29.3\%). The resilience of AreoRAG is directly attributable to PICT's ability to classify injected noise conflicts as $\mathcal{C}^{noise}$ and filter them while preserving genuine scientific disagreements. In contrast, MultiRAG's MCC module and TruthfulRAG's entropy-based filtering indiscriminately penalize all inconsistencies, including the original valid observations that become "outvoted" by injected noise.
As shown in Fig. 5(c-d), AreoRAG's F1 score decreased only moderately from 55.8\% to 54.2\%, 52.8\%, and 50.1\% under 30\%, 50\%, and 70\% conflict injection respectively. MultiRAG exhibited steeper degradation (42.3\% to 40.1\%, 36.4\%, 30.7\%), and TruthfulRAG showed similar sensitivity (40.8\% to 38.2\%, 34.6\%, 29.3\%). The resilience of AreoRAG is directly attributable to PICT's ability to classify injected noise conflicts as $\mathcal{C}^{noise}$ and filter them while preserving genuine scientific disagreements. In contrast, MultiRAG's MCC module and TruthfulRAG's entropy-based filtering indiscriminately penalize all inconsistencies, including the original valid observations that become ``outvoted'' by injected noise.
\subsection{Ablation Study (Q3)}
To evaluate the individual contributions of HySH and PICT, we conduct systematic ablation experiments. Table III reports results on MarsRegion-QA and MarsTemporal-QA.
To evaluate the individual contributions of HySH and PICT, we conduct systematic ablation experiments. Table~\ref{table:ablation} reports results on MarsRegion-QA and MarsTemporal-QA.
\begin{table*}
\renewcommand{\arraystretch}{1.3}
\caption{Ablation Experiments of HySH and PICT Modules}
\label{table_ablation}
\label{table:ablation}
\vspace{-0.13in}
\centering
\begin{tabular}{|m{4cm}|m{1.1cm}|m{1.1cm}|m{1.1cm}|m{1.1cm}|m{1.1cm}|m{1.1cm}|}
@@ -544,15 +535,15 @@ To evaluate the individual contributions of HySH and PICT, we conduct systematic
\end{tabular}
\end{table*}
**a) HySH Module Analysis:** The HySH module achieves significant improvements in both accuracy and efficiency. Replacing HySH with MultiRAG's MLG (w/o HySH) causes F1 drops of 11.2\% on MarsRegion-QA and 12.3\% on MarsTemporal-QA, while query time increases by 8.4$\times$ (3.42s to 28.7s) due to the edge explosion problem in pairwise spatial encoding. This validates the $O(k)$ vs. $O(k^2)$ complexity advantage of hyperedges.
a) HySH Module Analysis: The HySH module achieves significant improvements in both accuracy and efficiency. Replacing HySH with MultiRAG's MLG (w/o HySH) causes F1 drops of 11.2\% on MarsRegion-QA and 12.3\% on MarsTemporal-QA, while query time increases by 8.4$\times$ (3.42s to 28.7s) due to the edge explosion problem in pairwise spatial encoding. This validates the $O(k)$ vs. $O(k^2)$ complexity advantage of hyperedges.
Within HySH, the hyperbolic embedding contributes 6.6\% F1 improvement over Euclidean hypergraph (49.2\% vs. 55.8\%), confirming that the negative-curvature geometry is essential for faithfully representing the hierarchical scale structure. The Spatial OEM contributes an additional 4.5\% F1 over standard Einstein midpoint aggregation (51.3\% vs. 55.8\%), validating the outward bias property (Theorem 1) in preventing hierarchical collapse during cross-resolution fusion.
Within HySH, the hyperbolic embedding contributes 6.6\% F1 improvement over Euclidean hypergraph (49.2\% vs. 55.8\%), confirming that the negative-curvature geometry is essential for faithfully representing the hierarchical scale structure. The Spatial OEM contributes an additional 4.5\% F1 over standard Einstein midpoint aggregation (51.3\% vs. 55.8\%), validating the outward bias property (Theorem~1) in preventing hierarchical collapse during cross-resolution fusion.
**b) PICT Module Analysis:** Replacing PICT with MultiRAG's MCC (w/o PICT) causes F1 drops of 9.9\% on MarsRegion-QA and 12.7\% on MarsTemporal-QA. The larger drop on MarsTemporal-QA is expected, as this dataset contains abundant temporal-evolution conflicts that MCC would filter as inconsistencies.
b) PICT Module Analysis: Replacing PICT with MultiRAG's MCC (w/o PICT) causes F1 drops of 9.9\% on MarsRegion-QA and 12.7\% on MarsTemporal-QA. The larger drop on MarsTemporal-QA is expected, as this dataset contains abundant temporal-evolution conflicts that MCC would filter as inconsistencies.
The ablation further reveals the contribution of each PICT component. Removing conflict classification (using uniform filtering instead of four-category triage) costs 7.7\% F1 on MarsRegion-QA. Replacing cross-source interaction entropy with TruthfulRAG's $\Delta H_p$ metric costs 5.4\% F1, confirming that the cross-source formulation (Eq. 14) is more appropriate for the all-external-knowledge setting of planetary observations.
The ablation further reveals the contribution of each PICT component. Removing conflict classification (using uniform filtering instead of four-category triage) costs 7.7\% F1 on MarsRegion-QA. Replacing cross-source interaction entropy with TruthfulRAG's $\Delta H_p$ metric costs 5.4\% F1, confirming that the cross-source formulation (Eq.~\ref{equ:interaction entropy}) is more appropriate for the all-external-knowledge setting of planetary observations.
**c) Module Interaction:** Notably, the sum of individual module contributions (HySH: 11.2\% + PICT: 9.9\% = 21.1\%) exceeds the gap between the full model and Standard RAG (55.8\% - 28.4\% = 27.4\%), but the actual synergy is evident in the coupling points. HySH's radial depth difference $\Delta r$ directly improves PICT's scale-conflict classification; PICT's triage feedback improves HySH's retrieval priority. Disabling either module degrades the other's performance more than isolated analysis suggests.
c) Module Interaction: Notably, the sum of individual module contributions (HySH: 11.2\% + PICT: 9.9\% = 21.1\%) is smaller than the gap between the full model and Standard RAG (55.8\% $-$ 28.4\% = 27.4\%), and the actual synergy is evident in the coupling points. HySH's radial depth difference $\Delta r$ directly improves PICT's scale-conflict classification; PICT's triage feedback improves HySH's retrieval priority. Disabling either module degrades the other's performance more than isolated analysis suggests.
\subsection{Conflict Preservation Evaluation (Q4)}
@@ -561,7 +552,7 @@ A defining capability of AreoRAG is the ability to preserve scientifically valua
\begin{table}
\renewcommand{\arraystretch}{1.3}
\caption{Conflict Handling Performance on MarsConflict-50}
\label{table_conflict}
\label{table:conflict}
\vspace{-0.13in}
\centering
\begin{tabular}{|m{1.5cm}|m{1cm}|m{1cm}|m{1cm}|m{1cm}|}
@@ -582,9 +573,9 @@ A defining capability of AreoRAG is the ability to preserve scientifically valua
\end{tabular}
\end{table}
*Standard RAG preserves all information indiscriminately (CPR=100\%) because it has no conflict handling mechanism, resulting in noise contamination and low F1. "—" indicates the method does not perform explicit conflict classification.*
Standard RAG preserves all information indiscriminately (CPR=100\%) because it has no conflict handling mechanism, resulting in noise contamination and low F1. The symbol ``—'' indicates that the method does not perform explicit conflict classification.
Table IV reveals the fundamental difference between AreoRAG and existing methods. MultiRAG achieves a high Noise Rejection Rate (85.7\%) but at the cost of a catastrophically low Conflict Preservation Rate (8.3\%) — it filters 91.7\% of scientifically valuable conflicts as "unreliable data." TruthfulRAG and MetaRAG show similar behavior (CPR of 13.9\% and 11.1\%), confirming that existing conflict-resolution methods systematically destroy scientific anomaly signals.
Table~\ref{table:conflict} reveals the fundamental difference between AreoRAG and existing methods. MultiRAG achieves a high Noise Rejection Rate (85.7\%) but at the cost of a catastrophically low Conflict Preservation Rate (8.3\%) — it filters 91.7\% of scientifically valuable conflicts as ``unreliable data.'' TruthfulRAG and MetaRAG show similar behavior (CPR of 13.9\% and 11.1\%), confirming that existing conflict-resolution methods systematically destroy scientific anomaly signals.
In contrast, AreoRAG achieves a CPR of 91.7\% while maintaining the same NRR (85.7\%) as MultiRAG, demonstrating that PICT successfully decouples noise filtering from scientific conflict preservation. The Conflict Classification Accuracy of 84.0\% on the four-category task validates the separability claim in Lemma~1. Error analysis reveals that the primary source of misclassification is between instrument-inherent and scale-dependent conflicts (12.3\% confusion rate), which is expected as both involve observation geometry differences. Noise vs. scientific conflict misclassification is rare (3.7\%), confirming the robustness of the explainable/opaque distinction (Definition 7).
@@ -595,7 +586,7 @@ Furthermore, the F1 score improvement (53.1\% vs. 35.2\% for MultiRAG) demonstra
\begin{table}
\renewcommand{\arraystretch}{1.3}
\caption{Time Cost Analysis Across Modules}
\label{table_time_cost}
\label{table:time_cost}
\vspace{-0.13in}
\centering
\begin{tabular}{|m{2cm}|m{1cm}|m{1cm}|m{1cm}|m{1cm}|}
@@ -620,7 +611,7 @@ Furthermore, the F1 score improvement (53.1\% vs. 35.2\% for MultiRAG) demonstra
AreoRAG's query time (3.42s on MarsRegion-QA) is competitive with HyperRAG (2.95s) and substantially faster than MultiRAG (4.87s) and TruthfulRAG (5.62s). The faster online query is attributable to the $O(k)$ hyperedge traversal complexity and the lightweight MLP-based plausibility scoring, which avoids the expensive mutual information entropy computation required by MultiRAG's MCC at query time.
The preprocessing time (86.5s) is higher than MultiRAG (15.2s) due to the hyperbolic embedding computation (Eq. 6-8), but lower than HyperRAG (142.7s) because we do not require the full contrastive training pipeline. Importantly, HySH construction is a one-time offline cost amortized across all queries. The PICT module adds minimal online overhead: the conflict classifier (Eq. 19) requires $<$0.1s per detected conflict pair, and the interaction entropy computation (Eq. 14) adds approximately 0.8s per query through parallel LLM forward passes.
The preprocessing time (86.5s) is higher than MultiRAG (15.2s) due to the hyperbolic embedding computation (Eq.~\ref{equ:embedding mapping},~\ref{equ:Spatial Scale-Curvature Correspondence}), but lower than HyperRAG (142.7s) because we do not require the full contrastive training pipeline. Importantly, HySH construction is a one-time offline cost amortized across all queries. The PICT module adds minimal online overhead: the conflict classifier (Eq.~\ref{equ:conflict classification}) requires $<$0.1s per detected conflict pair, and the interaction entropy computation (Eq.~\ref{equ:interaction entropy}) adds approximately 0.8s per query through parallel LLM forward passes.
\subsection{Case Study}

View File

@@ -416,3 +416,90 @@
publisher = {OpenReview.net},
year = {2025}
}
% HotpotQA dataset paper; cited in the experiments as a general multi-hop QA benchmark.
@inproceedings{yang18hotpotqa,
  author    = {Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.},
  editor    = {Riloff, Ellen and Chiang, David and Hockenmaier, Julia and Tsujii, Jun{'}ichi},
  title     = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct # "-" # nov,
  year      = {2018},
  pages     = {2369--2380},
}
% Multi-hop QA dataset paper; cited in the experiments as 2WikiMultiHopQA.
% The citation key is kept exactly as the manuscript references it.
@inproceedings{ho202WikiMultiHopQA,
  author    = {Ho, Xanh and Duong Nguyen, Anh-Khoa and Sugawara, Saku and Aizawa, Akiko},
  editor    = {Scott, Donia and Bel, Nuria and Zong, Chengqing},
  title     = {Constructing A Multi-hop {QA} Dataset for Comprehensive Evaluation of Reasoning Steps},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  publisher = {International Committee on Computational Linguistics},
  address   = {Barcelona, Spain (Online)},
  month     = dec,
  year      = {2020},
  pages     = {6609--6625},
}
% Cited in the experiments as the IRCoT baseline.
@inproceedings{Harsh23IRCoT,
  author    = {Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish},
  editor    = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
  title     = {Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  month     = jul,
  year      = {2023},
  pages     = {10014--10037},
}
% Cited in the experiments as the MetaRAG baseline.
% The page range was restored to 1453--1463: the separator had been lost on
% export, and the range agrees with numpages = 11.
@inproceedings{Zhou24MetaRAG,
  author    = {Zhou, Yujia and Liu, Zheng and Jin, Jiajie and Nie, Jian-Yun and Dou, Zhicheng},
  title     = {Metacognitive Retrieval-Augmented Large Language Models},
  year      = {2024},
  isbn      = {9798400701719},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  booktitle = {Proceedings of the ACM Web Conference 2024},
  pages     = {1453--1463},
  numpages  = {11},
  location  = {Singapore, Singapore},
  series    = {WWW '24}
}
% Cited in the experiments as the RQ-RAG baseline (arXiv preprint record).
% The braced hyphen in the first author's given name is kept verbatim.
@article{Chan24RQRAG,
  author     = {Chan, Chi{-}Min and Xu, Chunpu and Yuan, Ruibin and Luo, Hongyin and Xue, Wei and Guo, Yike and Fu, Jie},
  title      = {{RQ-RAG:} Learning to Refine Queries for Retrieval Augmented Generation},
  journal    = {CoRR},
  volume     = {abs/2404.00610},
  year       = {2024},
  eprinttype = {arXiv},
  eprint     = {2404.00610},
  doi        = {10.48550/ARXIV.2404.00610},
  url        = {https://doi.org/10.48550/arXiv.2404.00610}
}