\relax 
\providecommand\hyper@newdestlabel[2]{}
\bibstyle{acl_natbib}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax 
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\citation{vaswani2017attentionisallyouneed}
\citation{dai2019transformer}
\citation{kwon2023efficient}
\citation{li2024snapkv}
\citation{feng2026ada}
\citation{kim2026kvzip}
\citation{lin2011documentsummarization}
\@LN@col{1}
\@LN{0}{0}
\@LN{1}{0}
\@LN{2}{0}
\@LN{3}{0}
\@LN{4}{0}
\@LN{5}{0}
\@LN{6}{0}
\@LN{7}{0}
\@LN{8}{0}
\@LN{9}{0}
\@LN{10}{0}
\@LN{11}{0}
\@LN{12}{0}
\@LN{13}{0}
\@LN{14}{0}
\@LN{15}{0}
\@LN{16}{0}
\@LN{17}{0}
\@LN{18}{0}
\@LN{19}{0}
\@LN{20}{0}
\@LN{21}{0}
\@LN{22}{0}
\@LN{23}{0}
\@LN{24}{0}
\@LN{25}{0}
\@LN{26}{0}
\@LN{27}{0}
\@LN{28}{0}
\@LN{29}{0}
\@LN{30}{0}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}\protected@file@percent }
\@LN{31}{0}
\@LN{32}{0}
\@LN{33}{0}
\@LN{34}{0}
\@LN{35}{0}
\@LN{36}{0}
\@LN{37}{0}
\@LN{38}{0}
\@LN{39}{0}
\@LN{40}{0}
\@LN{41}{0}
\@LN{42}{0}
\@LN@col{2}
\@LN{43}{0}
\@LN{44}{0}
\@LN{45}{0}
\@LN{46}{0}
\@LN{47}{0}
\@LN{48}{0}
\@LN{49}{0}
\@LN{50}{0}
\@LN{51}{0}
\@LN{52}{0}
\@LN{53}{0}
\@LN{54}{0}
\@LN{55}{0}
\@LN{56}{0}
\@LN{57}{0}
\@LN{58}{0}
\@LN{59}{0}
\@LN{60}{0}
\@LN{61}{0}
\@LN{62}{0}
\@LN{63}{0}
\@LN{64}{0}
\@LN{65}{0}
\@LN{66}{0}
\@LN{67}{0}
\@LN{68}{0}
\@LN{69}{0}
\@LN{70}{0}
\@LN{71}{0}
\@LN{72}{0}
\@LN{73}{0}
\@LN{74}{0}
\@LN{75}{0}
\@LN{76}{0}
\@LN{77}{0}
\@LN{78}{0}
\@LN{79}{0}
\@LN{80}{0}
\@LN{81}{0}
\@LN{82}{0}
\@LN{83}{0}
\citation{krause2014submodular}
\citation{nemhauser1978analysis}
\citation{li2024snapkv,feng2026ada,kim2026kvzip,fastkv2026}
\citation{kim2026kvzip}
\@LN@col{1}
\@LN{84}{1}
\@LN{85}{1}
\@LN{86}{1}
\@LN{87}{1}
\@LN{88}{1}
\@LN{89}{1}
\@LN{90}{1}
\@LN{91}{1}
\@LN{92}{1}
\@LN{93}{1}
\@LN{94}{1}
\@LN{95}{1}
\@LN{96}{1}
\@LN{97}{1}
\@LN{98}{1}
\@LN{99}{1}
\@LN{100}{1}
\@LN{101}{1}
\@LN{102}{1}
\@LN{103}{1}
\@LN{104}{1}
\@LN{105}{1}
\@LN{106}{1}
\@LN{107}{1}
\@LN{108}{1}
\@LN{109}{1}
\@LN{110}{1}
\@LN{111}{1}
\@LN{112}{1}
\@LN{113}{1}
\@LN{114}{1}
\@LN{115}{1}
\@LN{116}{1}
\@LN{117}{1}
\@LN{118}{1}
\@LN{119}{1}
\@LN{120}{1}
\@LN{121}{1}
\@LN{122}{1}
\@LN{123}{1}
\@LN{124}{1}
\@LN{125}{1}
\@LN{126}{1}
\@LN{127}{1}
\@LN{128}{1}
\@LN{129}{1}
\@LN{130}{1}
\@LN{131}{1}
\@LN{132}{1}
\@LN{133}{1}
\@LN{134}{1}
\@LN@col{2}
\@LN{135}{1}
\@LN{136}{1}
\@LN{137}{1}
\@LN{138}{1}
\@LN{139}{1}
\@LN{140}{1}
\@LN{141}{1}
\@LN{142}{1}
\@LN{143}{1}
\@LN{144}{1}
\@LN{145}{1}
\@LN{146}{1}
\@LN{147}{1}
\@LN{148}{1}
\@writefile{toc}{\contentsline {section}{\numberline {2}Preliminaries}{2}{section.2}\protected@file@percent }
\@LN{149}{1}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Notation and Problem Formulation}{2}{subsection.2.1}\protected@file@percent }
\@LN{150}{1}
\@LN{151}{1}
\@LN{152}{1}
\@LN{153}{1}
\@LN{154}{1}
\@LN{155}{1}
\@LN{156}{1}
\@LN{157}{1}
\@LN{158}{1}
\@LN{159}{1}
\@LN{160}{1}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Submodular Set Functions}{2}{subsection.2.2}\protected@file@percent }
\@LN{161}{1}
\@LN{162}{1}
\@LN{163}{1}
\@LN{164}{1}
\@LN{165}{1}
\@LN{166}{1}
\@LN{167}{1}
\@LN{168}{1}
\@LN{169}{1}
\@LN{170}{1}
\@LN{171}{1}
\@LN{172}{1}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Empirical Analysis and Observations}{2}{subsection.2.3}\protected@file@percent }
\newlabel{sec:empirical}{{2.3}{2}{Empirical Analysis and Observations}{subsection.2.3}{}}
\@LN{173}{1}
\@LN{174}{1}
\@LN{175}{1}
\@LN{176}{1}
\@LN{177}{1}
\citation{lin2011documentsummarization}
\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
\newlabel{fig:empirical_a}{{1a}{3}{Marginal Spatial Gain\relax }{figure.caption.1}{}}
\newlabel{sub@fig:empirical_a}{{a}{3}{Marginal Spatial Gain\relax }{figure.caption.1}{}}
\newlabel{fig:empirical_b}{{1b}{3}{Locality of Importance\relax }{figure.caption.1}{}}
\newlabel{sub@fig:empirical_b}{{b}{3}{Locality of Importance\relax }{figure.caption.1}{}}
\newlabel{fig:empirical_c}{{1c}{3}{Cumulative Context Coverage\relax }{figure.caption.1}{}}
\newlabel{sub@fig:empirical_c}{{c}{3}{Cumulative Context Coverage\relax }{figure.caption.1}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces \textbf  {Empirical Motivation for Redundancy-Penalized Eviction.} \textbf  {(a)} The marginal spatial coverage of Top-K declines due to score clustering. \textbf  {(b)} Autocorrelation analysis indicates that proxy importance scores are statistically localized. \textbf  {(c)} By suppressing local score clusters, 1D-NMS improves global segment coverage.\relax }}{3}{figure.caption.1}\protected@file@percent }
\newlabel{fig:empirical_analysis}{{1}{3}{\textbf {Empirical Motivation for Redundancy-Penalized Eviction.} \textbf {(a)} The marginal spatial coverage of Top-K declines due to score clustering. \textbf {(b)} Autocorrelation analysis indicates that proxy importance scores are statistically localized. \textbf {(c)} By suppressing local score clusters, 1D-NMS improves global segment coverage.\relax }{figure.caption.1}{}}
\@LN@col{1}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces \textbf  {Visualization of Localized Attention mass.} Raw attention weights across heads reveal clustered high-attention anchors (vertical stripes, highlighted by red dashed boxes) versus diffuse background attention (horizontal variance), motivating head-aware scoring.\relax }}{3}{figure.caption.2}\protected@file@percent }
\newlabel{fig:raw_attention_heatmaps}{{2}{3}{\textbf {Visualization of Localized Attention mass.} Raw attention weights across heads reveal clustered high-attention anchors (vertical stripes, highlighted by red dashed boxes) versus diffuse background attention (horizontal variance), motivating head-aware scoring.\relax }{figure.caption.2}{}}
\@LN{178}{2}
\@LN{179}{2}
\@LN{180}{2}
\@LN{181}{2}
\@LN{182}{2}
\@LN{183}{2}
\@LN{184}{2}
\@LN{185}{2}
\@LN{186}{2}
\@LN{187}{2}
\@LN{188}{2}
\@LN{189}{2}
\@LN{190}{2}
\@LN{191}{2}
\@LN{192}{2}
\@LN{193}{2}
\@LN{194}{2}
\@LN{195}{2}
\@LN{196}{2}
\@LN{197}{2}
\@LN{198}{2}
\@LN@col{2}
\@LN{199}{2}
\@LN{200}{2}
\@LN{201}{2}
\@LN{202}{2}
\@LN{203}{2}
\@LN{204}{2}
\@LN{205}{2}
\@LN{206}{2}
\@LN{207}{2}
\@LN{208}{2}
\@LN{209}{2}
\@LN{210}{2}
\@LN{211}{2}
\@LN{212}{2}
\@LN{213}{2}
\@LN{214}{2}
\@LN{215}{2}
\@LN{216}{2}
\@LN{217}{2}
\@LN{218}{2}
\@LN{219}{2}
\@LN{220}{2}
\@LN{221}{2}
\@LN{222}{2}
\@LN{223}{2}
\@LN{224}{2}
\@LN{225}{2}
\@LN{226}{2}
\@LN{227}{2}
\@LN{228}{2}
\@LN{229}{2}
\@LN{230}{2}
\@LN{231}{2}
\@LN{232}{2}
\@LN{233}{2}
\@LN{234}{2}
\@LN{235}{2}
\citation{li2024snapkv}
\citation{kim2026kvzip}
\citation{nemhauser1978analysis}
\citation{lin2011documentsummarization}
\citation{cornuejols1983uncapacitatedfacilitylocationproblem}
\@LN@col{1}
\@LN{236}{3}
\@LN{237}{3}
\@LN{238}{3}
\@LN{239}{3}
\@LN{240}{3}
\@LN{241}{3}
\@LN{242}{3}
\@LN{243}{3}
\@LN{244}{3}
\@writefile{toc}{\contentsline {section}{\numberline {3}Methodology}{4}{section.3}\protected@file@percent }
\@LN{245}{3}
\@LN{246}{3}
\@LN{247}{3}
\@LN{248}{3}
\@LN{249}{3}
\@LN{250}{3}
\@LN{251}{3}
\@LN{252}{3}
\@LN{253}{3}
\@LN{254}{3}
\@LN{255}{3}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}KV Cache Eviction with Capacity Constraints}{4}{subsection.3.1}\protected@file@percent }
\newlabel{sec:formulation}{{3.1}{4}{KV Cache Eviction with Capacity Constraints}{subsection.3.1}{}}
\@LN{256}{3}
\@LN{257}{3}
\@LN{258}{3}
\@LN{259}{3}
\@LN{260}{3}
\@LN{261}{3}
\@LN{262}{3}
\@LN{263}{3}
\@LN{264}{3}
\newlabel{eq:problem1}{{3}{4}{KV Cache Eviction with Capacity Constraints}{equation.3.3}{}}
\@LN{265}{3}
\@LN{266}{3}
\@LN{267}{3}
\@LN{268}{3}
\@LN{269}{3}
\@LN{270}{3}
\@LN{271}{3}
\@LN{272}{3}
\@LN{273}{3}
\@LN{274}{3}
\@LN{275}{3}
\@LN{276}{3}
\@LN{277}{3}
\@LN{278}{3}
\@LN{279}{3}
\@LN{280}{3}
\@LN{281}{3}
\@LN{282}{3}
\@LN@col{2}
\@LN{283}{3}
\@LN{284}{3}
\@LN{285}{3}
\@LN{286}{3}
\@LN{287}{3}
\@LN{288}{3}
\@LN{289}{3}
\@LN{290}{3}
\@LN{291}{3}
\@LN{292}{3}
\@LN{293}{3}
\@LN{294}{3}
\@LN{295}{3}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Score-Weighted Local Coverage Objective}{4}{subsection.3.2}\protected@file@percent }
\newlabel{sec:objective}{{3.2}{4}{Score-Weighted Local Coverage Objective}{subsection.3.2}{}}
\@LN{296}{3}
\@LN{297}{3}
\@LN{298}{3}
\@LN{299}{3}
\@LN{300}{3}
\@LN{301}{3}
\@LN{302}{3}
\@LN{303}{3}
\newlabel{eq:facility}{{4}{4}{Score-Weighted Local Coverage Objective}{equation.3.4}{}}
\@LN{304}{3}
\@LN{305}{3}
\@LN{306}{3}
\@LN{307}{3}
\@LN{308}{3}
\@LN{309}{3}
\@LN{310}{3}
\@LN{311}{3}
\@LN{312}{3}
\@LN{313}{3}
\@LN{314}{3}
\@LN{315}{3}
\@LN{316}{3}
\@LN{317}{3}
\@LN{318}{3}
\@LN{319}{3}
\@LN{320}{3}
\@LN{321}{3}
\@LN{322}{3}
\@LN{323}{3}
\@LN{324}{3}
\@LN{325}{3}
\@LN{326}{3}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces \textbf  {Main prefill results on Qwen3-8B.} We compare HubKV-refined KVZip/FastKVZip scores against the corresponding base scorers, SnapKV, Expected Attention, and the full-cache reference across 16 tasks. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right.\relax }}{5}{figure.caption.3}\protected@file@percent }
\newlabel{fig:prefill_main_results}{{3}{5}{\textbf {Main prefill results on Qwen3-8B.} We compare HubKV-refined KVZip/FastKVZip scores against the corresponding base scorers, SnapKV, Expected Attention, and the full-cache reference across 16 tasks. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right.\relax }{figure.caption.3}{}}
\@LN@col{1}
\@LN{327}{4}
\@LN{328}{4}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}One-Pass Marginal-Gain Proxy via HubKV}{5}{subsection.3.3}\protected@file@percent }
\newlabel{sec:smd}{{3.3}{5}{One-Pass Marginal-Gain Proxy via HubKV}{subsection.3.3}{}}
\@LN{329}{4}
\@LN{330}{4}
\@LN{331}{4}
\@LN{332}{4}
\@LN{333}{4}
\@LN{334}{4}
\@LN{335}{4}
\@LN{336}{4}
\@LN{337}{4}
\@LN{338}{4}
\@LN{339}{4}
\@LN{340}{4}
\@LN{341}{4}
\@LN{342}{4}
\@LN{343}{4}
\@LN{344}{4}
\@LN{345}{4}
\@LN{346}{4}
\@LN{347}{4}
\@LN@col{2}
\@LN{348}{4}
\@LN{349}{4}
\@LN{350}{4}
\@LN{351}{4}
\@LN{352}{4}
\@LN{353}{4}
\@LN{354}{4}
\@LN{355}{4}
\@LN{356}{4}
\@LN{357}{4}
\@LN{358}{4}
\@LN{359}{4}
\@LN{360}{4}
\@LN{361}{4}
\@LN{362}{4}
\@LN{363}{4}
\@LN{364}{4}
\@LN{365}{4}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces \textbf  {LongBench average across model backbones.} We report LongBench average results for Qwen3-8B, Qwen2.5-7B-Instruct-1M, Qwen3-14B, and Llama-3.1-8B-Instruct. Each panel reports the compression ratios available for that backbone and includes the full-cache reference at $r=0.00$.\relax }}{6}{figure.caption.4}\protected@file@percent }
\newlabel{fig:longbench_average_model_comparison}{{4}{6}{\textbf {LongBench average across model backbones.} We report LongBench average results for Qwen3-8B, Qwen2.5-7B-Instruct-1M, Qwen3-14B, and Llama-3.1-8B-Instruct. Each panel reports the compression ratios available for that backbone and includes the full-cache reference at $r=0.00$.\relax }{figure.caption.4}{}}
\@LN@col{1}
\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces \textbf  {Decoding-stage results on Qwen3-8B.} We evaluate AIME25 and MATH with a cache update interval of 128 decoding steps and target KV lengths of 1024, 2048, 4096, and 6144. HubKV denotes HubKV(+FastKVZip), which refines FastKVZip scores before the same target-length pruning step is applied.\relax }}{6}{figure.caption.5}\protected@file@percent }
\newlabel{fig:decoding_results}{{5}{6}{\textbf {Decoding-stage results on Qwen3-8B.} We evaluate AIME25 and MATH with a cache update interval of 128 decoding steps and target KV lengths of 1024, 2048, 4096, and 6144. HubKV denotes HubKV(+FastKVZip), which refines FastKVZip scores before the same target-length pruning step is applied.\relax }{figure.caption.5}{}}
\@LN{366}{5}
\@LN{367}{5}
\@LN{368}{5}
\@LN{369}{5}
\@LN{370}{5}
\@LN{371}{5}
\@LN{372}{5}
\@LN{373}{5}
\@LN{374}{5}
\@LN{375}{5}
\@LN{376}{5}
\@LN{377}{5}
\@LN{378}{5}
\@LN{379}{5}
\@LN{380}{5}
\@LN@col{2}
\@LN{381}{5}
\@LN{382}{5}
\@LN{383}{5}
\@LN{384}{5}
\@LN{385}{5}
\@LN{386}{5}
\@LN{387}{5}
\@LN{388}{5}
\@LN{389}{5}
\@LN{390}{5}
\@LN{391}{5}
\@LN{392}{5}
\@LN{393}{5}
\@LN{394}{5}
\@LN{395}{5}
\@LN{396}{5}
\@LN{397}{5}
\@LN{398}{5}
\@LN{399}{5}
\@LN{400}{5}
\@LN{401}{5}
\@LN{402}{5}
\@LN{403}{5}
\@LN{404}{5}
\@LN{405}{5}
\@LN{406}{5}
\@LN{407}{5}
\@LN{408}{5}
\@LN{409}{5}
\@LN{410}{5}
\@LN{411}{5}
\@LN{412}{5}
\citation{yang2025qwen3}
\citation{devoto2025expected}
\citation{hendrycks2021MATH}
\@LN@col{1}
\@LN{413}{6}
\@LN{414}{6}
\@LN{415}{6}
\@LN{416}{6}
\@LN{417}{6}
\@LN{418}{6}
\@LN{419}{6}
\@LN{420}{6}
\@LN{421}{6}
\@LN{422}{6}
\@LN{423}{6}
\@LN{424}{6}
\@LN{425}{6}
\@LN{426}{6}
\@LN{427}{6}
\@LN{428}{6}
\@LN{429}{6}
\@LN{430}{6}
\@LN{431}{6}
\@LN{432}{6}
\@LN{433}{6}
\@writefile{toc}{\contentsline {section}{\numberline {4}Experiments}{7}{section.4}\protected@file@percent }
\newlabel{sec:experiments}{{4}{7}{Experiments}{section.4}{}}
\@LN{434}{6}
\@LN{435}{6}
\@LN{436}{6}
\@LN{437}{6}
\@LN{438}{6}
\@LN{439}{6}
\@LN{440}{6}
\@LN{441}{6}
\@LN{442}{6}
\@LN{443}{6}
\@LN{444}{6}
\@LN{445}{6}
\@LN{446}{6}
\@LN{447}{6}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Experimental Setup}{7}{subsection.4.1}\protected@file@percent }
\@LN{448}{6}
\@LN{449}{6}
\@LN{450}{6}
\@LN{451}{6}
\@LN{452}{6}
\@LN{453}{6}
\@LN{454}{6}
\@LN{455}{6}
\@LN{456}{6}
\@LN{457}{6}
\@LN{458}{6}
\@LN{459}{6}
\@LN{460}{6}
\@LN{461}{6}
\@LN@col{2}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces \textbf  {Paired average gain by task family and compression ratio.} Each value is the mean score-point difference between HubKV-refined scores and the corresponding base scorer, averaged over all tasks in the family at the specified compression ratio in Figure\nobreakspace  {}\ref  {fig:prefill_main_results}.\relax }}{7}{table.caption.6}\protected@file@percent }
\newlabel{tab:paired_prefill_gain}{{1}{7}{\textbf {Paired average gain by task family and compression ratio.} Each value is the mean score-point difference between HubKV-refined scores and the corresponding base scorer, averaged over all tasks in the family at the specified compression ratio in Figure~\ref {fig:prefill_main_results}.\relax }{table.caption.6}{}}
\@LN{462}{6}
\@LN{463}{6}
\@LN{464}{6}
\@LN{465}{6}
\@LN{466}{6}
\@LN{467}{6}
\@LN{468}{6}
\@LN{469}{6}
\@LN{470}{6}
\@LN{471}{6}
\@LN{472}{6}
\@LN{473}{6}
\@LN{474}{6}
\@LN{475}{6}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Main Results}{7}{subsection.4.2}\protected@file@percent }
\@LN{476}{6}
\@LN{477}{6}
\@LN{478}{6}
\@LN{479}{6}
\@LN{480}{6}
\@LN{481}{6}
\@LN{482}{6}
\@LN{483}{6}
\@LN{484}{6}
\@LN{485}{6}
\@LN{486}{6}
\@LN{487}{6}
\@LN{488}{6}
\@LN{489}{6}
\@LN{490}{6}
\@LN{491}{6}
\@LN{492}{6}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Decoding-Stage Results}{7}{subsection.4.3}\protected@file@percent }
\newlabel{sec:decoding_results}{{4.3}{7}{Decoding-Stage Results}{subsection.4.3}{}}
\@LN{493}{6}
\@LN{494}{6}
\@LN{495}{6}
\@LN{496}{6}
\citation{kwon2023efficient}
\citation{xiao2024efficient}
\citation{han2024lm}
\citation{zhang2024h2o}
\citation{liu2024scissorhands}
\citation{ge2023model}
\citation{li2024snapkv}
\citation{zhang2024pyramidkv}
\citation{zhao2024duoattention}
\citation{devoto2025expected}
\citation{kim2026kvzip}
\citation{liu2026chunkkv}
\citation{fastkv2026}
\bibdata{custom}
\bibcite{bai2023longbench}{{1}{2024}{{Bai et~al.}}{{Bai, Lv, Zhang, Lyu, Tang, Huang, Du, Liu, Zeng, Hou et~al.}}}
\@LN@col{1}
\@LN{497}{7}
\@LN{498}{7}
\@LN{499}{7}
\@LN{500}{7}
\@LN{501}{7}
\@LN{502}{7}
\@LN{503}{7}
\@LN{504}{7}
\@LN{505}{7}
\@LN{506}{7}
\@LN{507}{7}
\@writefile{toc}{\contentsline {section}{\numberline {5}Related Work}{8}{section.5}\protected@file@percent }
\newlabel{sec:related_works}{{5}{8}{Related Work}{section.5}{}}
\@writefile{toc}{\contentsline {paragraph}{KV Cache Eviction and Sparsification.}{8}{section*.7}\protected@file@percent }
\@LN{508}{7}
\@LN{509}{7}
\@LN{510}{7}
\@LN{511}{7}
\@LN{512}{7}
\@LN{513}{7}
\@LN{514}{7}
\@LN{515}{7}
\@LN{516}{7}
\@LN{517}{7}
\@LN{518}{7}
\@LN{519}{7}
\@LN{520}{7}
\@writefile{toc}{\contentsline {paragraph}{Prefill-based and Query-Agnostic Compression.}{8}{section*.8}\protected@file@percent }
\@LN{521}{7}
\@LN{522}{7}
\@LN{523}{7}
\@LN{524}{7}
\@LN{525}{7}
\@LN{526}{7}
\@LN{527}{7}
\@LN{528}{7}
\@LN{529}{7}
\@LN{530}{7}
\@LN{531}{7}
\@LN{532}{7}
\@LN{533}{7}
\@LN{534}{7}
\@LN{535}{7}
\@LN{536}{7}
\@LN{537}{7}
\@LN{538}{7}
\@LN{539}{7}
\@LN{540}{7}
\@LN{541}{7}
\@LN{542}{7}
\@LN{543}{7}
\@LN{544}{7}
\@LN{545}{7}
\@LN{546}{7}
\@LN@col{2}
\@LN{547}{7}
\@LN{548}{7}
\@LN{549}{7}
\@LN{550}{7}
\@LN{551}{7}
\@LN{552}{7}
\@LN{553}{7}
\@LN{554}{7}
\@LN{555}{7}
\@LN{556}{7}
\@LN{557}{7}
\@LN{558}{7}
\@LN{559}{7}
\@LN{560}{7}
\@LN{561}{7}
\@LN{562}{7}
\@writefile{toc}{\contentsline {section}{\numberline {6}Conclusion}{8}{section.6}\protected@file@percent }
\newlabel{sec:conclusion}{{6}{8}{Conclusion}{section.6}{}}
\@LN{563}{7}
\@LN{564}{7}
\@LN{565}{7}
\@LN{566}{7}
\@LN{567}{7}
\@LN{568}{7}
\@LN{569}{7}
\@LN{570}{7}
\@LN{571}{7}
\@LN{572}{7}
\@LN{573}{7}
\@LN{574}{7}
\@LN{575}{7}
\@LN{576}{7}
\@writefile{toc}{\contentsline {section}{\numberline {7}Limitations}{8}{section.7}\protected@file@percent }
\newlabel{sec:limitations}{{7}{8}{Limitations}{section.7}{}}
\@LN{577}{7}
\@LN{578}{7}
\@LN{579}{7}
\@LN{580}{7}
\@LN{581}{7}
\@LN{582}{7}
\@LN{583}{7}
\@LN{584}{7}
\@LN{585}{7}
\@LN{586}{7}
\@LN{587}{7}
\@LN{588}{7}
\@LN{589}{7}
\@LN{590}{7}
\@LN{591}{7}
\@LN{592}{7}
\@LN{593}{7}
\@LN{594}{7}
\bibcite{zhang2024pyramidkv}{{2}{2024}{{Cai et~al.}}{{Cai, Zhang, Gao, Liu, Li, Liu, Lu, Xiong, Dong, Hu et~al.}}}
\bibcite{cobbe2021gsm8k}{{3}{2021}{{Cobbe et~al.}}{{Cobbe, Kosaraju, Bavarian, Chen, Jun, Kaiser, Plappert, Tworek, Hilton, Nakano et~al.}}}
\bibcite{cornuejols1983uncapacitatedfacilitylocationproblem}{{4}{1983}{{Cornu{\'e}jols et~al.}}{{Cornu{\'e}jols, Nemhauser, and Wolsey}}}
\bibcite{dai2019transformer}{{5}{2019}{{Dai et~al.}}{{Dai, Yang, Yang, Carbonell, Le, and Salakhutdinov}}}
\bibcite{devoto2025expected}{{6}{2025}{{Devoto et~al.}}{{Devoto, Jeblick, and J{\'e}gou}}}
\bibcite{feng2026ada}{{7}{2026}{{Feng et~al.}}{{Feng, Lv, Cao, Xie, and Zhou}}}
\bibcite{ge2023model}{{8}{2024}{{Ge et~al.}}{{Ge, Zhang, Liu, Zhang, Han, and Gao}}}
\bibcite{han2024lm}{{9}{2024}{{Han et~al.}}{{Han, Wang, Peng, Xiong, Chen, Ji, and Wang}}}
\bibcite{hendrycks2021MATH}{{10}{2021}{{Hendrycks et~al.}}{{Hendrycks, Burns, Kadavath, Arora, Basart, Tang, Song, and Steinhardt}}}
\bibcite{hooper2024kvquant}{{11}{2024}{{Hooper et~al.}}{{Hooper, Kim, Mohammadzadeh, Mahoney, Shao, Keutzer, and Gholami}}}
\bibcite{hsieh2024ruler}{{12}{2024}{{Hsieh et~al.}}{{Hsieh, Sun, Kriman, Acharya, Rekesh, Jia, Zhang, and Ginsburg}}}
\bibcite{fastkv2026}{{13}{2026{a}}{{Kim et~al.}}{{Kim, Han, and Yun}}}
\bibcite{kim2026kvzip}{{14}{2026{b}}{{Kim et~al.}}{{Kim, Kim, Kwon, Lee, Yun, and Song}}}
\bibcite{krause2014submodular}{{15}{2014}{{Krause and Golovin}}{{}}}
\bibcite{kwon2023efficient}{{16}{2023}{{Kwon et~al.}}{{Kwon, Li, Zhuang, Sheng, Zheng, Yu, Gonzalez, Zhang, and Stoica}}}
\bibcite{li2025scbench}{{17}{2025}{{Li et~al.}}{{Li, Jiang, Wu, Luo, Ahn, Zhang, Abdi, Li, Gao, Yang et~al.}}}
\bibcite{li2024snapkv}{{18}{2024}{{Li et~al.}}{{Li, Huang, Yang, Venkitesh, Locatelli, Ye, Cai, Lewis, and Chen}}}
\bibcite{lin2011documentsummarization}{{19}{2011}{{Lin and Bilmes}}{{}}}
\bibcite{liu2026chunkkv}{{20}{2026}{{Liu et~al.}}{{Liu, Tang, Dong, Li, Li, Hu, and Chu}}}
\bibcite{liu2024scissorhands}{{21}{2023}{{Liu et~al.}}{{Liu, Desai, Liao, Wang, Xie, Xu, Kyrillidis, and Shrivastava}}}
\bibcite{liu2024kivi}{{22}{2024}{{Liu et~al.}}{{Liu, Yuan, Jin, Zhong, Xu, Braverman, Chen, and Hu}}}
\@LN@col{1}
\@LN{595}{8}
\@LN{596}{8}
\@LN{597}{8}
\@LN{598}{8}
\@LN{599}{8}
\@LN{600}{8}
\@LN{601}{8}
\@LN{602}{8}
\@LN{603}{8}
\@LN{604}{8}
\@LN{605}{8}
\@LN{606}{8}
\@LN{607}{8}
\@LN{608}{8}
\@LN{609}{8}
\@LN{610}{8}
\@LN{611}{8}
\@LN{612}{8}
\@LN{613}{8}
\@LN{614}{8}
\@LN{615}{8}
\@LN{616}{8}
\@LN{617}{8}
\@LN{618}{8}
\@LN{619}{8}
\@LN{620}{8}
\@LN{621}{8}
\@LN{622}{8}
\@LN{623}{8}
\@LN{624}{8}
\@LN{625}{8}
\@LN{626}{8}
\@LN{627}{8}
\@LN{628}{8}
\@LN{629}{8}
\@LN{630}{8}
\@LN{631}{8}
\@LN{632}{8}
\@LN{633}{8}
\@LN{634}{8}
\@LN{635}{8}
\@LN{636}{8}
\@LN{637}{8}
\@LN{638}{8}
\@LN{639}{8}
\@LN{640}{8}
\@LN{641}{8}
\@LN{642}{8}
\@LN{643}{8}
\@LN{644}{8}
\@LN{645}{8}
\@LN{646}{8}
\@LN{647}{8}
\@LN{648}{8}
\@LN{649}{8}
\@LN{650}{8}
\@LN@col{2}
\@LN{651}{8}
\@LN{652}{8}
\@LN{653}{8}
\@LN{654}{8}
\@LN{655}{8}
\@LN{656}{8}
\@LN{657}{8}
\@LN{658}{8}
\@LN{659}{8}
\@LN{660}{8}
\@LN{661}{8}
\@LN{662}{8}
\@LN{663}{8}
\@LN{664}{8}
\@LN{665}{8}
\@LN{666}{8}
\@LN{667}{8}
\@LN{668}{8}
\@LN{669}{8}
\@LN{670}{8}
\@LN{671}{8}
\@LN{672}{8}
\@LN{673}{8}
\@LN{674}{8}
\@LN{675}{8}
\@LN{676}{8}
\@LN{677}{8}
\@LN{678}{8}
\@LN{679}{8}
\@LN{680}{8}
\@LN{681}{8}
\@LN{682}{8}
\@LN{683}{8}
\@LN{684}{8}
\@LN{685}{8}
\@LN{686}{8}
\@LN{687}{8}
\@LN{688}{8}
\@LN{689}{8}
\@LN{690}{8}
\@LN{691}{8}
\@LN{692}{8}
\@LN{693}{8}
\@LN{694}{8}
\@LN{695}{8}
\@LN{696}{8}
\@LN{697}{8}
\@LN{698}{8}
\@LN{699}{8}
\@LN{700}{8}
\@LN{701}{8}
\@LN{702}{8}
\@LN{703}{8}
\@LN{704}{8}
\@LN{705}{8}
\bibcite{nemhauser1978analysis}{{23}{1978}{{Nemhauser et~al.}}{{Nemhauser, Wolsey, and Fisher}}}
\bibcite{rajpurkar2016squad}{{24}{2016}{{Rajpurkar et~al.}}{{Rajpurkar, Zhang, Lopyrev, and Liang}}}
\bibcite{vaswani2017attentionisallyouneed}{{25}{2017}{{Vaswani et~al.}}{{Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin}}}
\bibcite{zhao2024duoattention}{{26}{2025}{{Xiao et~al.}}{{Xiao, Tang, Zuo, Guo, Yang, Tang, Fu, and Han}}}
\bibcite{xiao2024efficient}{{27}{2024}{{Xiao et~al.}}{{Xiao, Tian, Chen, Han, and Lewis}}}
\bibcite{yang2025qwen3}{{28}{2025}{{Yang et~al.}}{{Yang, Li, Yang, Zhang, Hui, Zheng, Yu, Gao, Huang, Lv et~al.}}}
\bibcite{zhang2024h2o}{{29}{2023}{{Zhang et~al.}}{{Zhang, Sheng, Zhou, Chen, Zheng, Cai, Song, Tian, R{\'e}, Barrett et~al.}}}
\@LN@col{1}
\@LN{706}{9}
\@LN{707}{9}
\@LN{708}{9}
\@LN{709}{9}
\@LN{710}{9}
\@LN{711}{9}
\@LN{712}{9}
\@LN{713}{9}
\@LN{714}{9}
\@LN{715}{9}
\@LN{716}{9}
\@LN{717}{9}
\@LN{718}{9}
\@LN{719}{9}
\@LN{720}{9}
\@LN{721}{9}
\@LN{722}{9}
\@LN{723}{9}
\@LN{724}{9}
\@LN{725}{9}
\@LN{726}{9}
\@LN{727}{9}
\@LN{728}{9}
\@LN{729}{9}
\@LN{730}{9}
\@LN{731}{9}
\@LN{732}{9}
\@LN{733}{9}
\@LN{734}{9}
\@LN{735}{9}
\@LN{736}{9}
\@LN{737}{9}
\@LN{738}{9}
\@LN{739}{9}
\@LN{740}{9}
\@LN{741}{9}
\@LN{742}{9}
\@LN{743}{9}
\@LN{744}{9}
\@LN{745}{9}
\@LN{746}{9}
\@LN{747}{9}
\@LN@col{2}
\citation{bai2023longbench}
\citation{li2025scbench}
\citation{hsieh2024ruler}
\citation{rajpurkar2016squad}
\citation{cobbe2021gsm8k}
\citation{fastkv2026}
\citation{kim2026kvzip}
\citation{li2024snapkv}
\@LN@col{1}
\@LN{748}{10}
\@writefile{toc}{\contentsline {section}{\numberline {A}Experimental Details}{11}{appendix.A}\protected@file@percent }
\newlabel{app:experimental_details}{{A}{11}{Experimental Details}{appendix.A}{}}
\@LN{749}{10}
\@LN{750}{10}
\@LN{751}{10}
\@LN{752}{10}
\@LN{753}{10}
\@LN{754}{10}
\@LN{755}{10}
\@LN{756}{10}
\@LN{757}{10}
\@LN{758}{10}
\@LN{759}{10}
\@LN{760}{10}
\@LN{761}{10}
\@LN{762}{10}
\@LN{763}{10}
\@LN{764}{10}
\@LN{765}{10}
\@LN{766}{10}
\@LN{767}{10}
\@LN{768}{10}
\@LN{769}{10}
\@LN{770}{10}
\@LN{771}{10}
\@LN{772}{10}
\@LN{773}{10}
\@LN{774}{10}
\@LN{775}{10}
\@LN{776}{10}
\@LN{777}{10}
\@LN{778}{10}
\@LN{779}{10}
\@LN{780}{10}
\@LN{781}{10}
\@LN{782}{10}
\@LN{783}{10}
\@LN{784}{10}
\@LN{785}{10}
\@LN{786}{10}
\@LN{787}{10}
\@LN{788}{10}
\@LN{789}{10}
\@LN{790}{10}
\@LN{791}{10}
\@LN{792}{10}
\@LN{793}{10}
\@LN{794}{10}
\@LN{795}{10}
\@LN{796}{10}
\@LN{797}{10}
\@LN{798}{10}
\@LN@col{2}
\@LN{799}{10}
\@LN{800}{10}
\@LN{801}{10}
\@LN{802}{10}
\@LN{803}{10}
\@LN{804}{10}
\@LN{805}{10}
\@LN{806}{10}
\@LN{807}{10}
\@writefile{toc}{\contentsline {subsection}{\numberline {A.1}Protected-Token Handling}{11}{subsection.A.1}\protected@file@percent }
\newlabel{app:protected_tokens}{{A.1}{11}{Protected-Token Handling}{subsection.A.1}{}}
\@LN{808}{10}
\@LN{809}{10}
\@LN{810}{10}
\@LN{811}{10}
\@LN{812}{10}
\@LN{813}{10}
\@LN{814}{10}
\@LN{815}{10}
\@LN{816}{10}
\@LN{817}{10}
\@LN{818}{10}
\@LN{819}{10}
\@LN{820}{10}
\@LN{821}{10}
\@LN{822}{10}
\@LN{823}{10}
\@LN{824}{10}
\@LN{825}{10}
\@LN{826}{10}
\@LN{827}{10}
\@LN{828}{10}
\@LN{829}{10}
\@LN{830}{10}
\@LN{831}{10}
\@LN{832}{10}
\@LN{833}{10}
\@LN{834}{10}
\@LN{835}{10}
\@LN{836}{10}
\@LN{837}{10}
\@LN{838}{10}
\@LN{839}{10}
\@LN{840}{10}
\@LN{841}{10}
\@LN{842}{10}
\@LN{843}{10}
\@LN{844}{10}
\@LN@col{1}
\@LN{845}{11}
\@LN{846}{11}
\@LN{847}{11}
\@LN{848}{11}
\@LN{849}{11}
\@LN{850}{11}
\@LN{851}{11}
\@LN{852}{11}
\@LN{853}{11}
\@LN{854}{11}
\@LN{855}{11}
\@LN{856}{11}
\@LN{857}{11}
\@LN{858}{11}
\@LN{859}{11}
\@LN{860}{11}
\@LN{861}{11}
\@LN{862}{11}
\@LN{863}{11}
\@LN{864}{11}
\@LN{865}{11}
\@LN{866}{11}
\@LN{867}{11}
\@LN{868}{11}
\@LN{869}{11}
\@LN{870}{11}
\@LN{871}{11}
\@LN{872}{11}
\@LN{873}{11}
\@LN{874}{11}
\@LN{875}{11}
\@LN{876}{11}
\@LN{877}{11}
\@LN{878}{11}
\@LN{879}{11}
\@LN{880}{11}
\@LN{881}{11}
\@LN{882}{11}
\@LN{883}{11}
\@LN{884}{11}
\@LN{885}{11}
\@LN{886}{11}
\@LN{887}{11}
\@LN{888}{11}
\@LN{889}{11}
\@LN{890}{11}
\@writefile{toc}{\contentsline {section}{\numberline {B}Ablation Results}{12}{appendix.B}\protected@file@percent }
\newlabel{app:ablation_results}{{B}{12}{Ablation Results}{appendix.B}{}}
\@LN{891}{11}
\@LN{892}{11}
\@LN{893}{11}
\@LN{894}{11}
\@LN@col{2}
\@LN{895}{11}
\@LN{896}{11}
\@LN{897}{11}
\@LN{898}{11}
\@LN{899}{11}
\@LN{900}{11}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces \textbf  {Ablation of HubKV design choices on Qwen3-8B.} The component block reports average scores and gains over the paired base scorer on six shared ablation tasks; ``Ungated'' applies the local hub discounting and head-wise calibration without the compression-ratio gate. The head-budget block reports gains over the paired base scorer on six shared ablation tasks.\relax }}{12}{table.caption.10}\protected@file@percent }
\newlabel{tab:hubkv_ablation}{{2}{12}{\textbf {Ablation of HubKV design choices on Qwen3-8B.} The component block reports average scores and gains over the paired base scorer on six shared ablation tasks; ``Ungated'' applies the local hub discounting and head-wise calibration without the compression-ratio gate. The head-budget block reports gains over the paired base scorer on six shared ablation tasks.\relax }{table.caption.10}{}}
\@LN{901}{11}
\@writefile{toc}{\contentsline {section}{\numberline {C}Detailed Results Analysis}{12}{appendix.C}\protected@file@percent }
\newlabel{app:detailed_results_analysis}{{C}{12}{Detailed Results Analysis}{appendix.C}{}}
\@LN{902}{11}
\@LN{903}{11}
\@LN{904}{11}
\@LN{905}{11}
\@LN{906}{11}
\@LN{907}{11}
\@LN{908}{11}
\@LN{909}{11}
\@LN{910}{11}
\@LN{911}{11}
\@LN{912}{11}
\@LN{913}{11}
\@LN{914}{11}
\@LN{915}{11}
\@LN{916}{11}
\@LN{917}{11}
\@LN{918}{11}
\@LN{919}{11}
\@LN{920}{11}
\@LN{921}{11}
\@LN{922}{11}
\@LN{923}{11}
\@LN{924}{11}
\@LN{925}{11}
\@LN{926}{11}
\@LN{927}{11}
\@LN{928}{11}
\@LN{929}{11}
\@LN{930}{11}
\@LN@col{1}
\@LN{931}{12}
\@LN{932}{12}
\@LN{933}{12}
\@LN{934}{12}
\@LN{935}{12}
\@LN{936}{12}
\@LN{937}{12}
\@LN{938}{12}
\@LN{939}{12}
\@LN{940}{12}
\@LN{941}{12}
\@LN{942}{12}
\@LN{943}{12}
\@LN{944}{12}
\@LN{945}{12}
\@LN{946}{12}
\@LN{947}{12}
\@LN{948}{12}
\@LN{949}{12}
\@LN{950}{12}
\@LN{951}{12}
\@LN{952}{12}
\@LN{953}{12}
\@writefile{toc}{\contentsline {section}{\numberline {D}Code and Math Locality Stress Test}{13}{appendix.D}\protected@file@percent }
\newlabel{app:code_math_stress}{{D}{13}{Code and Math Locality Stress Test}{appendix.D}{}}
\@LN{954}{12}
\@LN{955}{12}
\@LN{956}{12}
\@LN{957}{12}
\@LN{958}{12}
\@LN{959}{12}
\@LN{960}{12}
\@LN{961}{12}
\@LN{962}{12}
\@LN{963}{12}
\@LN{964}{12}
\@LN{965}{12}
\@LN{966}{12}
\@LN{967}{12}
\@LN{968}{12}
\@LN{969}{12}
\@LN{970}{12}
\@LN{971}{12}
\@LN{972}{12}
\@LN{973}{12}
\@LN{974}{12}
\@LN{975}{12}
\@LN{976}{12}
\@LN{977}{12}
\@LN{978}{12}
\@LN{979}{12}
\@LN{980}{12}
\@LN@col{2}
\@LN{981}{12}
\@LN{982}{12}
\@LN{983}{12}
\@LN{984}{12}
\@LN{985}{12}
\@LN{986}{12}
\@LN{987}{12}
\@LN{988}{12}
\@LN{989}{12}
\@LN{990}{12}
\@LN{991}{12}
\@LN{992}{12}
\@LN{993}{12}
\@LN{994}{12}
\@LN{995}{12}
\@LN{996}{12}
\@LN{997}{12}
\@LN{998}{12}
\@LN{999}{12}
\@LN{1000}{12}
\@LN{1001}{12}
\@LN{1002}{12}
\@LN{1003}{12}
\@LN{1004}{12}
\@LN{1005}{12}
\@LN{1006}{12}
\@LN{1007}{12}
\@LN{1008}{12}
\@LN{1009}{12}
\@writefile{toc}{\contentsline {section}{\numberline {E}Efficiency Analysis}{13}{appendix.E}\protected@file@percent }
\newlabel{app:efficiency}{{E}{13}{Efficiency Analysis}{appendix.E}{}}
\@LN{1010}{12}
\@LN{1011}{12}
\@LN{1012}{12}
\@LN{1013}{12}
\@LN{1014}{12}
\@LN{1015}{12}
\@LN{1016}{12}
\@LN{1017}{12}
\@LN{1018}{12}
\@LN{1019}{12}
\@LN{1020}{12}
\@LN{1021}{12}
\@LN{1022}{12}
\@LN{1023}{12}
\@LN{1024}{12}
\@LN{1025}{12}
\@LN{1026}{12}
\@LN{1027}{12}
\@LN{1028}{12}
\@LN{1029}{12}
\@LN{1030}{12}
\citation{kim2026kvzip}
\citation{hsieh2024ruler}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces \textbf  {Code and math gain-loss analysis on Qwen3-8B.} Each value is the paired score difference between HubKV-refined scores and the corresponding base scorer under the same prefill compression ratio. ``H'' denotes HubKV. Positive values indicate improvements; negative values identify cases where the locality prior can perturb useful local evidence chains.\relax }}{14}{table.caption.11}\protected@file@percent }
\newlabel{tab:code_math_gain_loss}{{3}{14}{\textbf {Code and math gain-loss analysis on Qwen3-8B.} Each value is the paired score difference between HubKV-refined scores and the corresponding base scorer under the same prefill compression ratio. ``H'' denotes HubKV. Positive values indicate improvements; negative values identify cases where the locality prior can perturb useful local evidence chains.\relax }{table.caption.11}{}}
\@LN@col{1}
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces \textbf  {Retained-token chain diagnostic for code and math spans.} Values are mean per-token retention fractions over annotated code/formula spans, averaged across examples and positions. The diagnostic is qualitative: lower span retention under HubKV is consistent with the small negative pockets in Table\nobreakspace  {}\ref  {tab:code_math_gain_loss}, but downstream benchmark deltas remain the primary evidence.\relax }}{14}{table.caption.12}\protected@file@percent }
\newlabel{tab:retained_chain_diagnostic}{{4}{14}{\textbf {Retained-token chain diagnostic for code and math spans.} Values are mean per-token retention fractions over annotated code/formula spans, averaged across examples and positions. The diagnostic is qualitative: lower span retention under HubKV is consistent with the small negative pockets in Table~\ref {tab:code_math_gain_loss}, but downstream benchmark deltas remain the primary evidence.\relax }{table.caption.12}{}}
\@LN{1031}{13}
\@LN{1032}{13}
\@LN{1033}{13}
\@LN{1034}{13}
\@LN{1035}{13}
\@LN{1036}{13}
\@LN{1037}{13}
\@LN{1038}{13}
\@LN{1039}{13}
\@LN{1040}{13}
\@LN{1041}{13}
\@LN{1042}{13}
\@LN{1043}{13}
\@LN{1044}{13}
\@LN{1045}{13}
\@LN{1046}{13}
\@LN{1047}{13}
\@LN{1048}{13}
\@LN{1049}{13}
\@LN{1050}{13}
\@writefile{toc}{\contentsline {section}{\numberline {F}Additional LongBench Results on Qwen2.5-7B-Instruct-1M}{14}{appendix.F}\protected@file@percent }
\newlabel{app:qwen25_longbench}{{F}{14}{Additional LongBench Results on Qwen2.5-7B-Instruct-1M}{appendix.F}{}}
\@LN{1051}{13}
\@LN{1052}{13}
\@LN{1053}{13}
\@LN{1054}{13}
\@LN{1055}{13}
\@LN{1056}{13}
\@LN{1057}{13}
\@LN@col{2}
\@LN{1058}{13}
\@LN{1059}{13}
\@LN{1060}{13}
\@LN{1061}{13}
\@LN{1062}{13}
\@LN{1063}{13}
\@writefile{toc}{\contentsline {section}{\numberline {G}Additional LongBench Results on Llama-3.1-8B-Instruct}{14}{appendix.G}\protected@file@percent }
\newlabel{app:llama31_longbench}{{G}{14}{Additional LongBench Results on Llama-3.1-8B-Instruct}{appendix.G}{}}
\@LN{1064}{13}
\@LN{1065}{13}
\@LN{1066}{13}
\@LN{1067}{13}
\@LN{1068}{13}
\@LN{1069}{13}
\@LN{1070}{13}
\@LN{1071}{13}
\@LN{1072}{13}
\@LN{1073}{13}
\@LN{1074}{13}
\@LN{1075}{13}
\@LN{1076}{13}
\@writefile{toc}{\contentsline {section}{\numberline {H}Additional LongBench Results on Qwen3-14B}{14}{appendix.H}\protected@file@percent }
\newlabel{app:qwen3_14b_longbench}{{H}{14}{Additional LongBench Results on Qwen3-14B}{appendix.H}{}}
\@LN{1077}{13}
\@LN{1078}{13}
\@LN{1079}{13}
\@LN{1080}{13}
\@LN{1081}{13}
\@LN{1082}{13}
\@LN{1083}{13}
\@LN{1084}{13}
\@LN{1085}{13}
\@LN{1086}{13}
\@LN{1087}{13}
\@LN{1088}{13}
\@LN{1089}{13}
\@writefile{toc}{\contentsline {section}{\numberline {I}Details of Observation Experiments}{14}{appendix.I}\protected@file@percent }
\newlabel{app:observation_details}{{I}{14}{Details of Observation Experiments}{appendix.I}{}}
\@LN{1090}{13}
\@LN{1091}{13}
\@LN{1092}{13}
\@LN{1093}{13}
\@LN{1094}{13}
\@LN{1095}{13}
\@writefile{lot}{\contentsline {table}{\numberline {5}{\ignorespaces \textbf  {Score-stage efficiency microbenchmark.} Synthetic score tensors match the Qwen3-8B layer/head shape and use \texttt  {bfloat16}. The table reports median latency over the measured repeats. Across all rows, the maximum relative overhead is 68.0\%, but the absolute refine-plus-select latency remains below 27 ms even at $N=32768$ and batch size 4.\relax }}{15}{table.caption.13}\protected@file@percent }
\newlabel{tab:efficiency_microbenchmark}{{5}{15}{\textbf {Score-stage efficiency microbenchmark.} Synthetic score tensors match the Qwen3-8B layer/head shape and use \texttt {bfloat16}. The table reports median latency over the measured repeats. Across all rows, the maximum relative overhead is 68.0\%, but the absolute refine-plus-select latency remains below 27 ms even at $N=32768$ and batch size 4.\relax }{table.caption.13}{}}
\@writefile{lot}{\contentsline {table}{\numberline {6}{\ignorespaces \textbf  {End-to-end efficiency on Qwen3-8B.} We compare FastKVZip and HubKV(+FastKVZip) at compression ratio $r=0.95$, batch size 1, and 32 generated tokens. TTFT denotes prefill plus the first decode step. The measured TTFT overheads for HubKV are +0.49\%, -0.89\%, +1.26\%, and -0.72\% for $N=4096,8192,16384,32768$, respectively.\relax }}{15}{table.caption.14}\protected@file@percent }
\newlabel{tab:efficiency_e2e}{{6}{15}{\textbf {End-to-end efficiency on Qwen3-8B.} We compare FastKVZip and HubKV(+FastKVZip) at compression ratio $r=0.95$, batch size 1, and 32 generated tokens. TTFT denotes prefill plus the first decode step. The measured TTFT overheads for HubKV are +0.49\%, -0.89\%, +1.26\%, and -0.72\% for $N=4096,8192,16384,32768$, respectively.\relax }{table.caption.14}{}}
\@LN@col{1}
\@LN{1096}{14}
\@LN{1097}{14}
\@LN{1098}{14}
\@LN{1099}{14}
\@LN{1100}{14}
\@LN{1101}{14}
\@LN{1102}{14}
\@LN{1103}{14}
\@LN{1104}{14}
\@LN{1105}{14}
\@LN{1106}{14}
\@LN{1107}{14}
\@LN{1108}{14}
\@LN{1109}{14}
\@LN{1110}{14}
\@LN{1111}{14}
\@LN{1112}{14}
\@LN{1113}{14}
\@LN{1114}{14}
\@LN{1115}{14}
\@LN{1116}{14}
\@LN{1117}{14}
\@LN{1118}{14}
\@LN{1119}{14}
\@LN{1120}{14}
\@LN@col{2}
\@LN{1121}{14}
\@LN{1122}{14}
\@LN{1123}{14}
\@LN{1124}{14}
\@LN{1125}{14}
\@LN{1126}{14}
\@LN{1127}{14}
\@LN{1128}{14}
\@LN{1129}{14}
\@LN{1130}{14}
\@LN{1131}{14}
\@LN{1132}{14}
\@LN{1133}{14}
\@LN{1134}{14}
\@LN{1135}{14}
\@LN{1136}{14}
\@LN{1137}{14}
\@LN{1138}{14}
\@LN{1139}{14}
\@LN{1140}{14}
\@LN{1141}{14}
\@LN{1142}{14}
\@LN{1143}{14}
\@LN{1144}{14}
\@LN{1145}{14}
\@writefile{lof}{\contentsline {figure}{\numberline {6}{\ignorespaces \textbf  {Additional LongBench results on Qwen2.5-7B-Instruct-1M.} We compare LongBench average scores from HubKV-refined KVZip/FastKVZip against the corresponding base scorers, SnapKV, Expected Attention, and the full-cache reference. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right.\relax }}{16}{figure.caption.15}\protected@file@percent }
\newlabel{fig:qwen25_longbench_appendix}{{6}{16}{\textbf {Additional LongBench results on Qwen2.5-7B-Instruct-1M.} We compare LongBench average scores from HubKV-refined KVZip/FastKVZip against the corresponding base scorers, SnapKV, Expected Attention, and the full-cache reference. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right.\relax }{figure.caption.15}{}}
\@LN@col{1}
\@LN{1146}{15}
\@LN{1147}{15}
\@LN{1148}{15}
\@LN{1149}{15}
\@LN{1150}{15}
\@LN{1151}{15}
\@LN{1152}{15}
\@LN{1153}{15}
\@LN{1154}{15}
\@LN{1155}{15}
\@LN{1156}{15}
\@LN{1157}{15}
\@LN{1158}{15}
\@LN{1159}{15}
\@LN{1160}{15}
\@LN{1161}{15}
\@writefile{toc}{\contentsline {section}{\numberline {J}Theoretical Analysis of the SMD Ranking Proxy}{16}{appendix.J}\protected@file@percent }
\newlabel{app:theory}{{J}{16}{Theoretical Analysis of the SMD Ranking Proxy}{appendix.J}{}}
\@LN{1162}{15}
\@LN{1163}{15}
\@LN{1164}{15}
\@LN{1165}{15}
\@LN{1166}{15}
\@LN{1167}{15}
\@LN{1168}{15}
\@LN{1169}{15}
\@LN{1170}{15}
\@LN{1171}{15}
\@LN{1172}{15}
\@writefile{toc}{\contentsline {subsection}{\numberline {J.1}Properties of the Score-Weighted Coverage Objective}{16}{subsection.J.1}\protected@file@percent }
\@LN{1173}{15}
\@LN{1174}{15}
\@LN@col{2}
\@LN{1175}{15}
\@LN{1176}{15}
\@LN{1177}{15}
\@LN{1178}{15}
\@LN{1179}{15}
\@LN{1180}{15}
\@LN{1181}{15}
\@LN{1182}{15}
\@LN{1183}{15}
\@LN{1184}{15}
\@LN{1185}{15}
\@LN{1186}{15}
\@LN{1187}{15}
\@LN{1188}{15}
\@LN{1189}{15}
\@writefile{toc}{\contentsline {subsection}{\numberline {J.2}Local Properties of SMD}{16}{subsection.J.2}\protected@file@percent }
\@LN{1190}{15}
\@LN{1191}{15}
\@LN{1192}{15}
\@LN{1193}{15}
\@LN{1194}{15}
\@LN{1195}{15}
\@LN{1196}{15}
\@LN{1197}{15}
\@LN{1198}{15}
\@LN{1199}{15}
\@LN{1200}{15}
\@LN{1201}{15}
\@LN{1202}{15}
\@LN{1203}{15}
\@writefile{lof}{\contentsline {figure}{\numberline {7}{\ignorespaces \textbf  {Additional LongBench results on Llama-3.1-8B-Instruct.} We compare LongBench average scores from HubKV-refined KVZip against KVZip, SnapKV, Expected Attention, and the full-cache reference. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right, covering compressed ratios $r \in \{0.50,0.75,0.80,0.85,0.88,0.90,0.95\}$ plus the full-cache point $r=0.00$.\relax }}{17}{figure.caption.16}\protected@file@percent }
\newlabel{fig:llama31_longbench_appendix}{{7}{17}{\textbf {Additional LongBench results on Llama-3.1-8B-Instruct.} We compare LongBench average scores from HubKV-refined KVZip against KVZip, SnapKV, Expected Attention, and the full-cache reference. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right, covering compressed ratios $r \in \{0.50,0.75,0.80,0.85,0.88,0.90,0.95\}$ plus the full-cache point $r=0.00$.\relax }{figure.caption.16}{}}
\@LN@col{1}
\newlabel{prop:sandwich}{{2}{17}{Local Marginal Sandwich}{proposition.2}{}}
\@LN{1204}{16}
\@LN{1205}{16}
\@LN{1206}{16}
\@LN{1207}{16}
\@LN{1208}{16}
\@LN{1209}{16}
\@LN{1210}{16}
\@LN{1211}{16}
\@LN{1212}{16}
\@LN{1213}{16}
\@LN{1214}{16}
\@LN{1215}{16}
\@LN{1216}{16}
\@LN{1217}{16}
\@LN{1218}{16}
\@LN{1219}{16}
\@LN{1220}{16}
\@LN{1221}{16}
\@LN{1222}{16}
\@LN{1223}{16}
\@LN{1224}{16}
\@LN{1225}{16}
\@LN{1226}{16}
\@LN{1227}{16}
\@LN{1228}{16}
\@LN{1229}{16}
\@writefile{toc}{\contentsline {subsection}{\numberline {J.3}Bounded Effect of Ratio-Gated Head Calibration}{17}{subsection.J.3}\protected@file@percent }
\@LN{1230}{16}
\@LN{1231}{16}
\@LN{1232}{16}
\@LN@col{2}
\@LN{1233}{16}
\@LN{1234}{16}
\@LN{1235}{16}
\@LN{1236}{16}
\@LN{1237}{16}
\@LN{1238}{16}
\@LN{1239}{16}
\@LN{1240}{16}
\@LN{1241}{16}
\@LN{1242}{16}
\@LN{1243}{16}
\@LN{1244}{16}
\@LN{1245}{16}
\@LN{1246}{16}
\@LN{1247}{16}
\@LN{1248}{16}
\@LN{1249}{16}
\@LN{1250}{16}
\@LN{1251}{16}
\@LN{1252}{16}
\@LN{1253}{16}
\@LN{1254}{16}
\@LN{1255}{16}
\@LN{1256}{16}
\@LN{1257}{16}
\@LN{1258}{16}
\citation{bai2023longbench}
\@writefile{lof}{\contentsline {figure}{\numberline {8}{\ignorespaces \textbf  {Additional LongBench results on Qwen3-14B.} We compare LongBench average scores from HubKV-refined KVZip/FastKVZip against the corresponding base scorers, SnapKV, Expected Attention, and the full-cache reference. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right, covering compressed ratios $r \in \{0.50,0.75,0.80,0.85,0.88\}$ plus the full-cache point $r=0.00$.\relax }}{18}{figure.caption.17}\protected@file@percent }
\newlabel{fig:qwen3_14b_longbench_appendix}{{8}{18}{\textbf {Additional LongBench results on Qwen3-14B.} We compare LongBench average scores from HubKV-refined KVZip/FastKVZip against the corresponding base scorers, SnapKV, Expected Attention, and the full-cache reference. The x-axis reports the true KV compression ratio, ordered from aggressive compression on the left to full cache on the right, covering compressed ratios $r \in \{0.50,0.75,0.80,0.85,0.88\}$ plus the full-cache point $r=0.00$.\relax }{figure.caption.17}{}}
\@LN@col{1}
\@LN{1259}{17}
\@LN{1260}{17}
\@LN{1261}{17}
\@LN{1262}{17}
\@LN{1263}{17}
\@LN{1264}{17}
\@LN{1265}{17}
\@LN{1266}{17}
\@writefile{toc}{\contentsline {subsection}{\numberline {J.4}Global Robustness and Limitations}{18}{subsection.J.4}\protected@file@percent }
\@LN{1267}{17}
\@LN{1268}{17}
\@LN{1269}{17}
\@LN{1270}{17}
\@LN{1271}{17}
\@LN{1272}{17}
\@LN{1273}{17}
\@LN{1274}{17}
\@LN{1275}{17}
\@LN{1276}{17}
\@LN{1277}{17}
\@LN{1278}{17}
\@LN{1279}{17}
\@LN{1280}{17}
\@LN{1281}{17}
\@LN{1282}{17}
\@LN{1283}{17}
\@LN{1284}{17}
\@LN{1285}{17}
\@LN{1286}{17}
\@LN{1287}{17}
\@LN{1288}{17}
\@LN@col{2}
\@LN{1289}{17}
\@LN{1290}{17}
\@LN{1291}{17}
\@LN{1292}{17}
\@LN{1293}{17}
\@writefile{toc}{\contentsline {section}{\numberline {K}Empirical Validation via Raw Attention Analysis}{18}{appendix.K}\protected@file@percent }
\newlabel{app:raw_attention}{{K}{18}{Empirical Validation via Raw Attention Analysis}{appendix.K}{}}
\@LN{1294}{17}
\@LN{1295}{17}
\@LN{1296}{17}
\@LN{1297}{17}
\@LN{1298}{17}
\@writefile{lof}{\contentsline {figure}{\numberline {9}{\ignorespaces \textbf  {Spatial correlation of raw attention weights.} The key-side attention density exhibits significant local autocorrelation at short token distances, providing additional evidence that model-native attention mass is spatially clustered.\relax }}{18}{figure.caption.18}\protected@file@percent }
\newlabel{fig:raw_attention_locality}{{9}{18}{\textbf {Spatial correlation of raw attention weights.} The key-side attention density exhibits significant local autocorrelation at short token distances, providing additional evidence that model-native attention mass is spatially clustered.\relax }{figure.caption.18}{}}
\@LN{1299}{17}
\@LN{1300}{17}
\citation{zhang2024h2o,liu2024scissorhands,ge2023model}
\citation{li2024snapkv,zhang2024pyramidkv,devoto2025expected,kim2026kvzip,fastkv2026}
\citation{liu2026chunkkv}
\citation{liu2024kivi,hooper2024kvquant}
\@LN@col{1}
\@LN{1301}{18}
\@LN{1302}{18}
\@LN{1303}{18}
\@LN{1304}{18}
\@LN{1305}{18}
\@LN{1306}{18}
\@LN{1307}{18}
\@LN{1308}{18}
\@LN{1309}{18}
\@LN{1310}{18}
\@writefile{toc}{\contentsline {section}{\numberline {L}Additional Related Work and Baseline Scope}{19}{appendix.L}\protected@file@percent }
\newlabel{app:additional_related_work}{{L}{19}{Additional Related Work and Baseline Scope}{appendix.L}{}}
\@writefile{toc}{\contentsline {paragraph}{Token eviction and compression baselines.}{19}{section*.19}\protected@file@percent }
\@LN{1311}{18}
\@LN{1312}{18}
\@LN{1313}{18}
\@LN{1314}{18}
\@LN{1315}{18}
\@LN{1316}{18}
\@LN{1317}{18}
\@LN{1318}{18}
\@LN{1319}{18}
\@LN{1320}{18}
\@LN{1321}{18}
\@LN{1322}{18}
\@LN{1323}{18}
\@LN{1324}{18}
\@LN{1325}{18}
\@LN{1326}{18}
\@LN{1327}{18}
\@writefile{toc}{\contentsline {paragraph}{Semantic and chunk-level baselines.}{19}{section*.20}\protected@file@percent }
\@LN{1328}{18}
\@LN{1329}{18}
\@LN{1330}{18}
\@LN{1331}{18}
\@LN{1332}{18}
\@LN{1333}{18}
\@LN{1334}{18}
\@LN{1335}{18}
\@LN{1336}{18}
\@LN{1337}{18}
\@LN{1338}{18}
\@LN{1339}{18}
\@LN{1340}{18}
\@writefile{toc}{\contentsline {paragraph}{KV-cache quantization.}{19}{section*.21}\protected@file@percent }
\@LN{1341}{18}
\@LN{1342}{18}
\@LN{1343}{18}
\@LN{1344}{18}
\@LN{1345}{18}
\@LN{1346}{18}
\@LN{1347}{18}
\@LN{1348}{18}
\@LN{1349}{18}
\@LN@col{2}
\@LN{1350}{18}
\@LN{1351}{18}
\@LN{1352}{18}
\@LN{1353}{18}
\@LN{1354}{18}
\@writefile{toc}{\contentsline {section}{\numberline {M}Additional Discussion}{19}{appendix.M}\protected@file@percent }
\newlabel{app:additional_discussion}{{M}{19}{Additional Discussion}{appendix.M}{}}
\@LN{1355}{18}
\@LN{1356}{18}
\@LN{1357}{18}
\@LN{1358}{18}
\@LN{1359}{18}
\@LN{1360}{18}
\@LN{1361}{18}
\@LN{1362}{18}
\@LN{1363}{18}
\@LN{1364}{18}
\@LN{1365}{18}
\@LN{1366}{18}
\@LN{1367}{18}
\@LN{1368}{18}
\@LN{1369}{18}
\@LN{1370}{18}
\@LN{1371}{18}
\@LN{1372}{18}
\@LN{1373}{18}
\@LN{1374}{18}
\@LN{1375}{18}
\@LN{1376}{18}
\@LN{1377}{18}
\@LN{1378}{18}
\@LN{1379}{18}
\@LN{1380}{18}
\@LN{1381}{18}
\@writefile{toc}{\contentsline {section}{\numberline {N}Ethical Considerations}{19}{appendix.N}\protected@file@percent }
\newlabel{app:ethical_considerations}{{N}{19}{Ethical Considerations}{appendix.N}{}}
\@LN{1382}{18}
\@LN{1383}{18}
\@LN{1384}{18}
\@LN{1385}{18}
\@LN{1386}{18}
\@LN{1387}{18}
\@LN{1388}{18}
\@LN{1389}{18}
\@LN{1390}{18}
\@LN{1391}{18}
\@LN{1392}{18}
\@LN{1393}{18}
\@LN{1394}{18}
\@LN{1395}{18}
\@LN{1396}{18}
\@LN{1397}{18}
\@LN{1398}{18}
\@LN@col{1}
\@LN{1399}{19}
\@LN{1400}{19}
\@LN{1401}{19}
\@writefile{toc}{\contentsline {section}{\numberline {O}Reproducibility Statement}{20}{appendix.O}\protected@file@percent }
\newlabel{app:reproducibility_statement}{{O}{20}{Reproducibility Statement}{appendix.O}{}}
\@LN{1402}{19}
\@LN{1403}{19}
\@LN{1404}{19}
\@LN{1405}{19}
\@LN{1406}{19}
\@LN{1407}{19}
\@LN{1408}{19}
\@LN{1409}{19}
\@LN{1410}{19}
\@LN@col{2}
\gdef \@abspage@last{20}