\begin{thebibliography}{60}
% Fallback definitions: \providecommand only takes effect when the command
% is not already defined.
% \natexlab: passes its argument (the year-suffix letter, as in 2024a) through unchanged.
\providecommand{\natexlab}[1]{#1}
% \url: fallback that typesets its argument in typewriter type.
\providecommand{\url}[1]{\texttt{#1}}
% \doi: pick a definition depending on whether \urlstyle is defined
% (\csname urlstyle\endcsname compares equal to \relax when it is not).
\expandafter\ifx\csname urlstyle\endcsname\relax
  % \urlstyle undefined: print "doi: " followed by the plain argument.
  \providecommand{\doi}[1]{doi: #1}\else
  % \urlstyle defined: print "doi: " and hand the argument to \Url in roman style.
  % NOTE(review): the \begingroup opened here is presumably closed inside \Url
  % (url package) -- confirm.
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Achiam et~al.(2023)Achiam, Adler, Agarwal, Ahmad, Akkaya, Aleman, Almeida, Altenschmidt, Altman, Anadkat, et~al.]{gpt4}
J.~Achiam, S.~Adler, S.~Agarwal, L.~Ahmad, I.~Akkaya, F.~L. Aleman, D.~Almeida, J.~Altenschmidt, S.~Altman, S.~Anadkat, et~al.
\newblock GPT-4 technical report.
\newblock \emph{arXiv preprint arXiv:2303.08774}, 2023.

\bibitem[Agrawal et~al.(2024)Agrawal, Kedia, Panwar, Mohan, Kwatra, Gulavani, Tumanov, and Ramjee]{prefill}
A.~Agrawal, N.~Kedia, A.~Panwar, J.~Mohan, N.~Kwatra, B.~Gulavani, A.~Tumanov, and R.~Ramjee.
\newblock Taming throughput-latency tradeoff in LLM inference with Sarathi-Serve.
\newblock In \emph{18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)}, 2024.

\bibitem[Ainslie et~al.(2023)Ainslie, Lee-Thorp, De~Jong, Zemlyanskiy, Lebr{\'o}n, and Sanghai]{gqa}
J.~Ainslie, J.~Lee-Thorp, M.~De~Jong, Y.~Zemlyanskiy, F.~Lebr{\'o}n, and S.~Sanghai.
\newblock GQA: Training generalized multi-query transformer models from multi-head checkpoints.
\newblock \emph{EMNLP}, 2023.

\bibitem[Anagnostidis et~al.(2023)Anagnostidis, Pavllo, Biggio, Noci, Lucchi, and Hofmann]{dynamicpruning}
S.~Anagnostidis, D.~Pavllo, L.~Biggio, L.~Noci, A.~Lucchi, and T.~Hofmann.
\newblock Dynamic context pruning for efficient and interpretable autoregressive transformers.
\newblock \emph{Advances in Neural Information Processing Systems}, 2023.

\bibitem[Bai et~al.(2024)Bai, Lv, Zhang, Lyu, Tang, et~al.]{longbench}
Y.~Bai, X.~Lv, J.~Zhang, H.~Lyu, J.~Tang, et~al.
\newblock LongBench: A bilingual, multitask benchmark for long context understanding.
\newblock \emph{ACL}, 2024.

\bibitem[Cai et~al.(2024)Cai, Zhang, Gao, Liu, Liu, Lu, et~al.]{pyramid}
Z.~Cai, Y.~Zhang, B.~Gao, Y.~Liu, T.~Liu, K.~Lu, et~al.
\newblock PyramidKV: Dynamic KV cache compression based on pyramidal information funneling.
\newblock \emph{arXiv preprint arXiv:2406.02069}, 2024.

\bibitem[Chan et~al.(2024)Chan, Chen, Cheng, and Huang]{cag}
B.~J. Chan, C.-T. Chen, J.-H. Cheng, and H.-H. Huang.
\newblock Don't do RAG: When cache-augmented generation is all you need for knowledge tasks.
\newblock \emph{arXiv preprint arXiv:2412.15605}, 2024.

\bibitem[Character.AI(2024)]{characterai}
Character.AI.
\newblock Optimizing AI inference at Character.AI, 2024.
\newblock URL \url{https://research.character.ai/optimizing-inference/}.

\bibitem[Chen et~al.(2024)Chen, Wang, Shang, Cui, Zhang, Liu, Wang, Sun, Yu, and Wu]{nacl}
Y.~Chen, G.~Wang, J.~Shang, S.~Cui, Z.~Zhang, T.~Liu, S.~Wang, Y.~Sun, D.~Yu, and H.~Wu.
\newblock NACL: A general and effective KV cache eviction framework for LLMs at inference time.
\newblock \emph{ACL}, 2024.

\bibitem[Chen et~al.(2025)Chen, Sadhukhan, Ye, Zhou, Zhang, et~al.]{magicpig}
Z.~Chen, R.~Sadhukhan, Z.~Ye, Y.~Zhou, J.~Zhang, et~al.
\newblock MagicPIG: LSH sampling for efficient LLM generation.
\newblock \emph{ICLR}, 2025.

\bibitem[Child et~al.(2019)Child, Gray, Radford, and Sutskever]{sparsetransformer}
R.~Child, S.~Gray, A.~Radford, and I.~Sutskever.
\newblock Generating long sequences with sparse transformers.
\newblock \emph{arXiv preprint arXiv:1904.10509}, 2019.

\bibitem[Cobbe et~al.(2021)Cobbe, Kosaraju, Bavarian, Chen, Jun, Kaiser, Plappert, Tworek, Hilton, Nakano, et~al.]{gsm}
K.~Cobbe, V.~Kosaraju, M.~Bavarian, M.~Chen, H.~Jun, L.~Kaiser, M.~Plappert, J.~Tworek, J.~Hilton, R.~Nakano, et~al.
\newblock Training verifiers to solve math word problems.
\newblock \emph{arXiv preprint arXiv:2110.14168}, 2021.

\bibitem[Corallo and Papotti(2024)]{corallo2024finch}
G.~Corallo and P.~Papotti.
\newblock FINCH: Prompt-guided key-value cache compression for large language models.
\newblock \emph{Transactions of the Association for Computational Linguistics}, 12, 2024.

\bibitem[Corallo et~al.(2025)Corallo, Weller, Petroni, and Papotti]{corallo2025beyond}
G.~Corallo, O.~Weller, F.~Petroni, and P.~Papotti.
\newblock Beyond RAG: Task-aware KV cache compression for comprehensive knowledge reasoning.
\newblock \emph{arXiv preprint arXiv:2503.04973}, 2025.

\bibitem[Dao(2024)]{flashattn}
T.~Dao.
\newblock FlashAttention-2: Faster attention with better parallelism and work partitioning.
\newblock \emph{ICLR}, 2024.

\bibitem[Devlin et~al.(2019)Devlin, Chang, Lee, and Toutanova]{bert}
J.~Devlin, M.-W. Chang, K.~Lee, and K.~Toutanova.
\newblock BERT: Pre-training of deep bidirectional transformers for language understanding.
\newblock \emph{NAACL}, 2019.

\bibitem[Feng et~al.(2024)Feng, Lv, Cao, Xie, and Zhou]{adakv}
Y.~Feng, J.~Lv, Y.~Cao, X.~Xie, and S.~K. Zhou.
\newblock Ada-KV: Optimizing KV cache eviction by adaptive budget allocation for efficient LLM inference.
\newblock \emph{arXiv preprint arXiv:2407.11550}, 2024.

\bibitem[Ge et~al.(2024)Ge, Zhang, Liu, Zhang, Han, and Gao]{fastgen}
S.~Ge, Y.~Zhang, L.~Liu, M.~Zhang, J.~Han, and J.~Gao.
\newblock Model tells you what to discard: Adaptive KV cache compression for LLMs.
\newblock \emph{ICLR}, 2024.

\bibitem[Goyal et~al.(2016)Goyal, Lamb, Zhang, Zhang, Courville, and Bengio]{teacherforcing}
A.~Goyal, A.~Lamb, Y.~Zhang, S.~Zhang, A.~Courville, and Y.~Bengio.
\newblock Professor forcing: A new algorithm for training recurrent networks.
\newblock \emph{Advances in Neural Information Processing Systems}, 29, 2016.

\bibitem[gradientAI(2024)]{gradientai}
gradientAI.
\newblock Llama-3 8B Gradient Instruct 1048k, 2024.
\newblock URL \url{https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k}.

\bibitem[Grattafiori et~al.(2024)Grattafiori, Dubey, Jauhri, Pandey, et~al.]{llama3}
A.~Grattafiori, A.~Dubey, A.~Jauhri, A.~Pandey, et~al.
\newblock The Llama 3 herd of models.
\newblock \emph{arXiv preprint arXiv:2407.21783}, 2024.

\bibitem[He et~al.(2022)He, Chen, Xie, Li, Doll{\'a}r, and Girshick]{mae}
K.~He, X.~Chen, S.~Xie, Y.~Li, P.~Doll{\'a}r, and R.~Girshick.
\newblock Masked autoencoders are scalable vision learners.
\newblock In \emph{CVPR}, 2022.

\bibitem[Hsieh et~al.(2024)Hsieh, Sun, Kriman, Acharya, Rekesh, Jia, Zhang, and Ginsburg]{ruler}
C.-P. Hsieh, S.~Sun, S.~Kriman, S.~Acharya, D.~Rekesh, F.~Jia, Y.~Zhang, and B.~Ginsburg.
\newblock RULER: What's the real context size of your long-context language models?
\newblock \emph{COLM}, 2024.

\bibitem[Jiang et~al.(2023)Jiang, Sablayrolles, Mensch, Bamford, et~al.]{mistral}
A.~Q. Jiang, A.~Sablayrolles, A.~Mensch, C.~Bamford, et~al.
\newblock Mistral 7B.
\newblock \emph{arXiv preprint arXiv:2310.06825}, 2023.

\bibitem[Jiang et~al.(2024)Jiang, Li, Zhang, Wu, Luo, et~al.]{minference}
H.~Jiang, Y.~Li, C.~Zhang, Q.~Wu, X.~Luo, et~al.
\newblock MInference 1.0: Accelerating pre-filling for long-context LLMs via dynamic sparse attention.
\newblock \emph{Advances in Neural Information Processing Systems}, 2024.

\bibitem[Kamradt(2023)]{needle}
G.~Kamradt.
\newblock Needle in a haystack -- pressure testing LLMs, 2023.
\newblock URL \url{https://github.com/gkamradt/LLMTest_NeedleInAHaystack}.

\bibitem[Katz(1989)]{zip}
P.~W. Katz.
\newblock Zip file format specification, 1989.
\newblock URL \url{https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT}.

\bibitem[Kim et~al.(2024{\natexlab{a}})Kim, Yeom, Yun, and Song]{ccm}
J.-H. Kim, J.~Yeom, S.~Yun, and H.~O. Song.
\newblock Compressed context memory for online language model interaction.
\newblock \emph{ICLR}, 2024{\natexlab{a}}.

\bibitem[Kim et~al.(2024{\natexlab{b}})Kim, Shim, Choi, and Chang]{kim2024infinipot}
M.~Kim, K.~Shim, J.~Choi, and S.~Chang.
\newblock InfiniPot: Infinite context processing on memory-constrained LLMs.
\newblock \emph{arXiv preprint arXiv:2410.01518}, 2024{\natexlab{b}}.

\bibitem[Kim et~al.(2022)Kim, Shen, Thorsley, Gholami, Kwon, Hassoun, and Keutzer]{ltp}
S.~Kim, S.~Shen, D.~Thorsley, A.~Gholami, W.~Kwon, J.~Hassoun, and K.~Keutzer.
\newblock Learned token pruning for transformers.
\newblock In \emph{Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining}, 2022.

\bibitem[Kwon et~al.(2023)Kwon, Li, Zhuang, Sheng, Zheng, Yu, Gonzalez, Zhang, and Stoica]{vllm}
W.~Kwon, Z.~Li, S.~Zhuang, Y.~Sheng, L.~Zheng, C.~H. Yu, J.~Gonzalez, H.~Zhang, and I.~Stoica.
\newblock Efficient memory management for large language model serving with PagedAttention.
\newblock In \emph{Proceedings of the 29th Symposium on Operating Systems Principles}, 2023.

\bibitem[Lee et~al.(2024)Lee, Lee, Seo, and Sim]{infinigen}
W.~Lee, J.~Lee, J.~Seo, and J.~Sim.
\newblock InfiniGen: Efficient generative inference of large language models with dynamic KV cache management.
\newblock In \emph{18th USENIX Symposium on Operating Systems Design and Implementation (OSDI)}, 2024.

\bibitem[Li et~al.(2024{\natexlab{a}})Li, Huang, Yang, Venkitesh, Locatelli, Ye, Cai, Lewis, and Chen]{snapkv}
Y.~Li, Y.~Huang, B.~Yang, B.~Venkitesh, A.~Locatelli, H.~Ye, T.~Cai, P.~Lewis, and D.~Chen.
\newblock SnapKV: LLM knows what you are looking for before generation.
\newblock \emph{Advances in Neural Information Processing Systems}, 2024{\natexlab{a}}.

\bibitem[Li et~al.(2024{\natexlab{b}})Li, Wen, Wang, Li, Yuan, Liu, et~al.]{personal}
Y.~Li, H.~Wen, W.~Wang, X.~Li, Y.~Yuan, G.~Liu, et~al.
\newblock Personal LLM agents: Insights and survey about the capability, efficiency and security.
\newblock \emph{arXiv preprint arXiv:2401.05459}, 2024{\natexlab{b}}.

\bibitem[Li et~al.(2025)Li, Jiang, Wu, Luo, Ahn, Zhang, Abdi, Li, Gao, Yang, et~al.]{scbench}
Y.~Li, H.~Jiang, Q.~Wu, X.~Luo, S.~Ahn, C.~Zhang, A.~H. Abdi, D.~Li, J.~Gao, Y.~Yang, et~al.
\newblock SCBench: A KV cache-centric analysis of long-context methods.
\newblock \emph{ICLR}, 2025.

\bibitem[Lin et~al.(2024)Lin, Tang, Yang, Zhang, Xiao, Gan, and Han]{qserve}
Y.~Lin, H.~Tang, S.~Yang, Z.~Zhang, G.~Xiao, C.~Gan, and S.~Han.
\newblock QServe: W4A8KV4 quantization and system co-design for efficient LLM serving.
\newblock \emph{arXiv preprint arXiv:2405.04532}, 2024.

\bibitem[Liu et~al.(2024{\natexlab{a}})Liu, Chen, Lu, Jiang, Han, Zhang, et~al.]{retrieval}
D.~Liu, M.~Chen, B.~Lu, H.~Jiang, Z.~Han, Q.~Zhang, et~al.
\newblock RetrievalAttention: Accelerating long-context LLM inference via vector retrieval.
\newblock \emph{arXiv preprint arXiv:2409.10516}, 2024{\natexlab{a}}.

\bibitem[Liu et~al.(2023{\natexlab{a}})Liu, Desai, Liao, Wang, Xie, Xu, Kyrillidis, and Shrivastava]{scissorhands}
Z.~Liu, A.~Desai, F.~Liao, W.~Wang, V.~Xie, Z.~Xu, A.~Kyrillidis, and A.~Shrivastava.
\newblock Scissorhands: Exploiting the persistence of importance hypothesis for LLM KV cache compression at test time.
\newblock \emph{Advances in Neural Information Processing Systems}, 2023{\natexlab{a}}.

\bibitem[Liu et~al.(2023{\natexlab{b}})Liu, Wang, Dao, Zhou, Yuan, Song, et~al.]{dejavu}
Z.~Liu, J.~Wang, T.~Dao, T.~Zhou, B.~Yuan, Z.~Song, et~al.
\newblock Deja vu: Contextual sparsity for efficient LLMs at inference time.
\newblock In \emph{International Conference on Machine Learning}, 2023{\natexlab{b}}.

\bibitem[Liu et~al.(2024{\natexlab{b}})Liu, Yuan, Jin, Zhong, Xu, Braverman, Chen, and Hu]{kivi}
Z.~Liu, J.~Yuan, H.~Jin, S.~Zhong, Z.~Xu, V.~Braverman, B.~Chen, and X.~Hu.
\newblock KIVI: A tuning-free asymmetric 2bit quantization for KV cache.
\newblock \emph{ICML}, 2024{\natexlab{b}}.

\bibitem[Oren et~al.(2024)Oren, Hassid, Yarden, Adi, and Schwartz]{tova}
M.~Oren, M.~Hassid, N.~Yarden, Y.~Adi, and R.~Schwartz.
\newblock Transformers are multi-state RNNs.
\newblock \emph{arXiv preprint arXiv:2401.06104}, 2024.

\bibitem[Peters et~al.(2018)Peters, Neumann, Iyyer, Gardner, Clark, Lee, and Zettlemoyer]{elmo}
M.~Peters, M.~Neumann, M.~Iyyer, M.~Gardner, C.~Clark, K.~Lee, and L.~Zettlemoyer.
\newblock Deep contextualized word representations.
\newblock In \emph{NAACL}, 2018.

\bibitem[Qi et~al.(2025)Qi, Panda, Lyu, Ma, Roy, Beirami, Mittal, and Henderson]{shallowalignment}
X.~Qi, A.~Panda, K.~Lyu, X.~Ma, S.~Roy, A.~Beirami, P.~Mittal, and P.~Henderson.
\newblock Safety alignment should be made more than just a few tokens deep.
\newblock \emph{ICLR}, 2025.

\bibitem[Radford et~al.(2018)Radford, Narasimhan, Salimans, Sutskever, et~al.]{gpt}
A.~Radford, K.~Narasimhan, T.~Salimans, I.~Sutskever, et~al.
\newblock Improving language understanding by generative pre-training, 2018.

\bibitem[Radford et~al.(2019)Radford, Wu, Child, Luan, Amodei, Sutskever, et~al.]{gpt2}
A.~Radford, J.~Wu, R.~Child, D.~Luan, D.~Amodei, I.~Sutskever, et~al.
\newblock Language models are unsupervised multitask learners.
\newblock \emph{OpenAI blog}, 2019.

\bibitem[Rae et~al.(2020)Rae, Potapenko, Jayakumar, and Lillicrap]{compressive}
J.~W. Rae, A.~Potapenko, S.~M. Jayakumar, and T.~P. Lillicrap.
\newblock Compressive transformers for long-range sequence modelling.
\newblock \emph{ICLR}, 2020.

\bibitem[Rajpurkar et~al.(2016)Rajpurkar, Zhang, Lopyrev, and Liang]{squad}
P.~Rajpurkar, J.~Zhang, K.~Lopyrev, and P.~Liang.
\newblock SQuAD: 100,000+ questions for machine comprehension of text.
\newblock \emph{EMNLP}, 2016.

\bibitem[Tang et~al.(2024)Tang, Zhao, Zhu, Xiao, Kasikci, and Han]{quest}
J.~Tang, Y.~Zhao, K.~Zhu, G.~Xiao, B.~Kasikci, and S.~Han.
\newblock Quest: Query-aware sparsity for efficient long-context LLM inference.
\newblock \emph{ICML}, 2024.

\bibitem[Gemma Team et~al.(2025)Gemma Team, Kamath, Ferret, Pathak, Vieillard, et~al.]{gemma3}
{Gemma Team}, A.~Kamath, J.~Ferret, S.~Pathak, N.~Vieillard, et~al.
\newblock Gemma 3 technical report.
\newblock \emph{arXiv preprint arXiv:2503.19786}, 2025.

\bibitem[Vaswani et~al.(2017)Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser, and Polosukhin]{transformer}
A.~Vaswani, N.~Shazeer, N.~Parmar, J.~Uszkoreit, L.~Jones, A.~N. Gomez, {\L}.~Kaiser, and I.~Polosukhin.
\newblock Attention is all you need.
\newblock \emph{Advances in Neural Information Processing Systems}, 2017.

\bibitem[Wang et~al.(2023)Wang, Chen, Pei, Xie, Kang, et~al.]{decodingtrust}
B.~Wang, W.~Chen, H.~Pei, C.~Xie, M.~Kang, et~al.
\newblock DecodingTrust: A comprehensive assessment of trustworthiness in GPT models.
\newblock In \emph{NeurIPS}, 2023.

\bibitem[Xiao et~al.(2024)Xiao, Tian, Chen, Han, and Lewis]{streaming}
G.~Xiao, Y.~Tian, B.~Chen, S.~Han, and M.~Lewis.
\newblock Efficient streaming language models with attention sinks.
\newblock \emph{ICLR}, 2024.

\bibitem[Xiao et~al.(2025)Xiao, Tang, Zuo, Guo, Yang, Tang, Fu, and Han]{duo}
G.~Xiao, J.~Tang, J.~Zuo, J.~Guo, S.~Yang, H.~Tang, Y.~Fu, and S.~Han.
\newblock DuoAttention: Efficient long-context LLM inference with retrieval and streaming heads.
\newblock \emph{ICLR}, 2025.

\bibitem[Yang et~al.(2025)Yang, Yu, Li, Liu, Huang, Huang, et~al.]{qwen}
A.~Yang, B.~Yu, C.~Li, D.~Liu, F.~Huang, H.~Huang, et~al.
\newblock Qwen2.5-1M technical report.
\newblock \emph{arXiv preprint arXiv:2501.15383}, 2025.

\bibitem[Yang et~al.(2024{\natexlab{a}})Yang, Han, Gao, Hu, Zhang, and Zhao]{yang2024pyramidinfer}
D.~Yang, X.~Han, Y.~Gao, Y.~Hu, S.~Zhang, and H.~Zhao.
\newblock PyramidInfer: Pyramid KV cache compression for high-throughput LLM inference.
\newblock \emph{arXiv preprint arXiv:2405.12532}, 2024{\natexlab{a}}.

\bibitem[Yang et~al.(2024{\natexlab{b}})Yang, Kim, Bae, Kwon, Park, Yang, Kwon, and Lee]{notoken}
J.~Y. Yang, B.~Kim, J.~Bae, B.~Kwon, G.~Park, E.~Yang, S.~J. Kwon, and D.~Lee.
\newblock No token left behind: Reliable KV cache compression via importance-aware mixed precision quantization.
\newblock \emph{arXiv preprint arXiv:2402.18096}, 2024{\natexlab{b}}.

\bibitem[Ye et~al.(2025)Ye, Dong, Xia, Sun, Zhu, Huang, and Wei]{differential}
T.~Ye, L.~Dong, Y.~Xia, Y.~Sun, Y.~Zhu, G.~Huang, and F.~Wei.
\newblock Differential transformer.
\newblock \emph{ICLR}, 2025.

\bibitem[Zaheer et~al.(2020)Zaheer, Guruganesh, Dubey, Ainslie, Alberti, Ontanon, Pham, Ravula, Wang, Yang, et~al.]{bigbird}
M.~Zaheer, G.~Guruganesh, K.~A. Dubey, J.~Ainslie, C.~Alberti, S.~Ontanon, P.~Pham, A.~Ravula, Q.~Wang, L.~Yang, et~al.
\newblock Big bird: Transformers for longer sequences.
\newblock \emph{Advances in Neural Information Processing Systems}, 2020.

\bibitem[Zhang et~al.(2024)Zhang, Chen, Hu, Xu, Chen, et~al.]{inftybench}
X.~Zhang, Y.~Chen, S.~Hu, Z.~Xu, J.~Chen, et~al.
\newblock $\infty$Bench: Extending long context evaluation beyond 100k tokens.
\newblock \emph{ACL}, 2024.

\bibitem[Zhang et~al.(2023)Zhang, Sheng, Zhou, Chen, Zheng, et~al.]{h2o}
Z.~Zhang, Y.~Sheng, T.~Zhou, T.~Chen, L.~Zheng, et~al.
\newblock H$_2$O: Heavy-hitter oracle for efficient generative inference of large language models.
\newblock \emph{Advances in Neural Information Processing Systems}, 2023.

\end{thebibliography}