\begin{thebibliography}{29}
\providecommand{\natexlab}[1]{#1}

\bibitem[{Bai et~al.(2024)Bai, Lv, Zhang, Lyu, Tang, Huang, Du, Liu, Zeng, Hou
  et~al.}]{bai2023longbench}
Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang,
  Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, and 1 others. 2024.
\newblock LongBench: A bilingual, multitask benchmark for long context
  understanding.
\newblock In \emph{Proceedings of the 62nd Annual Meeting of the Association
  for Computational Linguistics (Volume 1: Long Papers)}, pages 3119--3137.

\bibitem[{Cai et~al.(2024)Cai, Zhang, Gao, Liu, Li, Liu, Lu, Xiong, Dong, Hu
  et~al.}]{zhang2024pyramidkv}
Zefan Cai, Yichi Zhang, Bofei Gao, Yuliang Liu, Yucheng Li, Tianyu Liu, Keming
  Lu, Wayne Xiong, Yue Dong, Junjie Hu, and 1 others. 2024.
\newblock PyramidKV: Dynamic KV cache compression based on pyramidal
  information funneling.
\newblock \emph{arXiv preprint arXiv:2406.02069}.

\bibitem[{Cobbe et~al.(2021)Cobbe, Kosaraju, Bavarian, Chen, Jun, Kaiser,
  Plappert, Tworek, Hilton, Nakano et~al.}]{cobbe2021gsm8k}
Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, {\L}ukasz
  Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, and
  1 others. 2021.
\newblock Training verifiers to solve math word problems.
\newblock \emph{arXiv preprint arXiv:2110.14168}.

\bibitem[{Cornu{\'e}jols et~al.(1983)Cornu{\'e}jols, Nemhauser, and
  Wolsey}]{cornuejols1983uncapacitatedfacilitylocationproblem}
G{\'e}rard Cornu{\'e}jols, George Nemhauser, and Laurence Wolsey. 1983.
\newblock The uncapacitated facility location problem.
\newblock Technical report, Cornell University Operations Research and
  Industrial Engineering.

\bibitem[{Dai et~al.(2019)Dai, Yang, Yang, Carbonell, Le, and
  Salakhutdinov}]{dai2019transformer}
Zihang Dai, Zhilin Yang, Yiming Yang, Jaime~G Carbonell, Quoc Le, and Ruslan
  Salakhutdinov. 2019.
\newblock Transformer-XL: Attentive language models beyond a fixed-length
  context.
\newblock In \emph{Proceedings of the 57th Annual Meeting of the Association
  for Computational Linguistics}, pages 2978--2988.

\bibitem[{Devoto et~al.(2025)Devoto, Jeblick, and
  J{\'e}gou}]{devoto2025expected}
Alessio Devoto, Maximilian Jeblick, and Simon J{\'e}gou. 2025.
\newblock Expected attention: KV cache compression by estimating attention from
  future queries distribution.
\newblock \emph{arXiv preprint arXiv:2510.00636}.

\bibitem[{Feng et~al.(2026)Feng, Lv, Cao, Xie, and Zhou}]{feng2026ada}
Yuan Feng, Junlin Lv, Yukun Cao, Xike Xie, and S~Kevin Zhou. 2026.
\newblock Ada-KV: Optimizing KV cache eviction by adaptive budget allocation
  for efficient LLM inference.
\newblock \emph{Advances in Neural Information Processing Systems},
  38:113152--113188.

\bibitem[{Ge et~al.(2024)Ge, Zhang, Liu, Zhang, Han, and Gao}]{ge2023model}
Suyu Ge, Yunan Zhang, Liyuan Liu, Minjia Zhang, Jiawei Han, and Jianfeng Gao.
  2024.
\newblock Model tells you what to discard: Adaptive KV cache compression for
  LLMs.
\newblock In \emph{International Conference on Learning Representations},
  volume 2024, pages 22975--22988.

\bibitem[{Han et~al.(2024)Han, Wang, Peng, Xiong, Chen, Ji, and
  Wang}]{han2024lm}
Chi Han, Qifan Wang, Hao Peng, Wenhan Xiong, Yu~Chen, Heng Ji, and Sinong Wang.
  2024.
\newblock LM-Infinite: Zero-shot extreme length generalization for large
  language models.
\newblock In \emph{Proceedings of the 2024 Conference of the North American
  Chapter of the Association for Computational Linguistics: Human Language
  Technologies (Volume 1: Long Papers)}, pages 3991--4008.

\bibitem[{Hendrycks et~al.(2021)Hendrycks, Burns, Kadavath, Arora, Basart,
  Tang, Song, and Steinhardt}]{hendrycks2021MATH}
Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric
  Tang, Dawn Song, and Jacob Steinhardt. 2021.
\newblock Measuring mathematical problem solving with the MATH dataset.
\newblock \emph{arXiv preprint arXiv:2103.03874}.

\bibitem[{Hooper et~al.(2024)Hooper, Kim, Mohammadzadeh, Mahoney, Shao,
  Keutzer, and Gholami}]{hooper2024kvquant}
Coleman Hooper, Sehoon Kim, Hiva Mohammadzadeh, Michael~W. Mahoney,
  Yakun~Sophia Shao, Kurt Keutzer, and Amir Gholami. 2024.
\newblock KVQuant: Towards 10 million context length LLM inference with KV
  cache quantization.
\newblock In \emph{Advances in Neural Information Processing Systems}.

\bibitem[{Hsieh et~al.(2024)Hsieh, Sun, Kriman, Acharya, Rekesh, Jia, Zhang,
  and Ginsburg}]{hsieh2024ruler}
Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei
  Jia, Yang Zhang, and Boris Ginsburg. 2024.
\newblock RULER: What's the real context size of your long-context language
  models?
\newblock \emph{arXiv preprint arXiv:2404.06654}.

\bibitem[{Kim et~al.(2026{\natexlab{a}})Kim, Han, and Yun}]{fastkv2026}
Jang-Hyun Kim, Dongyoon Han, and Sangdoo Yun. 2026{\natexlab{a}}.
\newblock Fast KVzip: Efficient and accurate LLM inference with gated KV
  eviction.
\newblock \emph{arXiv preprint arXiv:2601.17668}.

\bibitem[{Kim et~al.(2026{\natexlab{b}})Kim, Kim, Kwon, Lee, Yun, and
  Song}]{kim2026kvzip}
Jang-Hyun Kim, Jinuk Kim, Sangwoo Kwon, Jae~W Lee, Sangdoo Yun, and Hyun~Oh
  Song. 2026{\natexlab{b}}.
\newblock KVzip: Query-agnostic KV cache compression with context
  reconstruction.
\newblock \emph{Advances in Neural Information Processing Systems},
  38:167563--167591.

\bibitem[{Krause and Golovin(2014)}]{krause2014submodular}
Andreas Krause and Daniel Golovin. 2014.
\newblock Submodular function maximization.
\newblock In \emph{Tractability: Practical Approaches to Hard Problems}, pages
  71--104. Cambridge University Press.

\bibitem[{Kwon et~al.(2023)Kwon, Li, Zhuang, Sheng, Zheng, Yu, Gonzalez, Zhang,
  and Stoica}]{kwon2023efficient}
Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody~Hao Yu,
  Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023.
\newblock Efficient memory management for large language model serving with
  PagedAttention.
\newblock In \emph{Proceedings of the 29th Symposium on Operating Systems
  Principles}, pages 611--626.

\bibitem[{Li et~al.(2025)Li, Jiang, Wu, Luo, Ahn, Zhang, Abdi, Li, Gao, Yang
  et~al.}]{li2025scbench}
Yucheng Li, Huiqiang Jiang, Qianhui Wu, Xufang Luo, Surin Ahn, Chengruidong
  Zhang, Amir Abdi, Dongsheng Li, Jianfeng Gao, Yuqing Yang, and 1 others.
  2025.
\newblock SCBench: A KV cache-centric analysis of long-context methods.
\newblock In \emph{International Conference on Learning Representations},
  volume 2025, pages 66063--66093.

\bibitem[{Li et~al.(2024)Li, Huang, Yang, Venkitesh, Locatelli, Ye, Cai, Lewis,
  and Chen}]{li2024snapkv}
Yuhong Li, Yingbing Huang, Bowen Yang, Bharat Venkitesh, Acyr Locatelli,
  Hanchen Ye, Tianle Cai, Patrick Lewis, and Deming Chen. 2024.
\newblock SnapKV: LLM knows what you are looking for before generation.
\newblock \emph{Advances in Neural Information Processing Systems},
  37:22947--22970.

\bibitem[{Lin and Bilmes(2011)}]{lin2011documentsummarization}
Hui Lin and Jeff Bilmes. 2011.
\newblock A class of submodular functions for document summarization.
\newblock In \emph{Proceedings of the 49th Annual Meeting of the Association
  for Computational Linguistics: Human Language Technologies}, pages 510--520.

\bibitem[{Liu et~al.(2026)Liu, Tang, Dong, Li, Li, Hu, and
  Chu}]{liu2026chunkkv}
Xiang Liu, Zhenheng Tang, Peijie Dong, Zeyu Li, Bo~Li, Xuming Hu, and Xiaowen
  Chu. 2026.
\newblock ChunkKV: Semantic-preserving KV cache compression for efficient
  long-context LLM inference.
\newblock \emph{Advances in Neural Information Processing Systems},
  38:28728--28778.

\bibitem[{Liu et~al.(2023)Liu, Desai, Liao, Wang, Xie, Xu, Kyrillidis, and
  Shrivastava}]{liu2024scissorhands}
Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu,
  Anastasios Kyrillidis, and Anshumali Shrivastava. 2023.
\newblock Scissorhands: Exploiting the persistence of importance hypothesis for
  LLM KV cache compression at test time.
\newblock \emph{Advances in Neural Information Processing Systems},
  36:52342--52364.

\bibitem[{Liu et~al.(2024)Liu, Yuan, Jin, Zhong, Xu, Braverman, Chen, and
  Hu}]{liu2024kivi}
Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir
  Braverman, Beidi Chen, and Xia Hu. 2024.
\newblock KIVI: A tuning-free asymmetric 2bit quantization for KV cache.
\newblock In \emph{International Conference on Machine Learning}.

\bibitem[{Nemhauser et~al.(1978)Nemhauser, Wolsey, and
  Fisher}]{nemhauser1978analysis}
George~L Nemhauser, Laurence~A Wolsey, and Marshall~L Fisher. 1978.
\newblock An analysis of approximations for maximizing submodular set
  functions---{I}.
\newblock \emph{Mathematical Programming}, 14(1):265--294.

\bibitem[{Rajpurkar et~al.(2016)Rajpurkar, Zhang, Lopyrev, and
  Liang}]{rajpurkar2016squad}
Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016.
\newblock SQuAD: 100,000+ questions for machine comprehension of text.
\newblock In \emph{Proceedings of the 2016 Conference on Empirical Methods in
  Natural Language Processing}, pages 2383--2392.

\bibitem[{Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones,
  Gomez, Kaiser, and Polosukhin}]{vaswani2017attentionisallyouneed}
Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
  Aidan~N Gomez, {\L}ukasz Kaiser, and Illia Polosukhin. 2017.
\newblock Attention is all you need.
\newblock \emph{Advances in Neural Information Processing Systems}, 30.

\bibitem[{Xiao et~al.(2025)Xiao, Tang, Zuo, Guo, Yang, Tang, Fu, and
  Han}]{zhao2024duoattention}
Guangxuan Xiao, Jiaming Tang, Jingwei Zuo, Junxian Guo, Shang Yang, Haotian
  Tang, Yao Fu, and Song Han. 2025.
\newblock DuoAttention: Efficient long-context LLM inference with retrieval and
  streaming heads.
\newblock In \emph{International Conference on Learning Representations},
  volume 2025, pages 37228--37253.

\bibitem[{Xiao et~al.(2024)Xiao, Tian, Chen, Han, and
  Lewis}]{xiao2024efficient}
Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. 2024.
\newblock Efficient streaming language models with attention sinks.
\newblock In \emph{International Conference on Learning Representations},
  volume 2024, pages 21875--21895.

\bibitem[{Yang et~al.(2025)Yang, Li, Yang, Zhang, Hui, Zheng, Yu, Gao, Huang,
  Lv et~al.}]{yang2025qwen3}
An~Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo~Zheng, Bowen
  Yu, Chang Gao, Chengen Huang, Chenxu Lv, and 1 others. 2025.
\newblock Qwen3 technical report.
\newblock \emph{arXiv preprint arXiv:2505.09388}.

\bibitem[{Zhang et~al.(2023)Zhang, Sheng, Zhou, Chen, Zheng, Cai, Song, Tian,
  R{\'e}, Barrett et~al.}]{zhang2024h2o}
Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai,
  Zhao Song, Yuandong Tian, Christopher R{\'e}, Clark Barrett, and 1 others.
  2023.
\newblock H2O: Heavy-hitter oracle for efficient generative inference of large
  language models.
\newblock \emph{Advances in Neural Information Processing Systems},
  36:34661--34710.

\end{thebibliography}