% Vaswani et al. 2017, "Attention is all you need".
% Journal name is capitalised to match the other entries for this venue in the file.
@article{vaswani2017attentionisallyouneed,
  author  = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title   = {Attention is all you need},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {30},
  year    = {2017},
}

% Dai et al. 2019, Transformer-XL.
% The model name is brace-protected so sentence-casing styles keep "XL" upper-case;
% booktitle is written in title case because styles print it verbatim.
@inproceedings{dai2019transformer,
  author    = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime G. and Le, Quoc and Salakhutdinov, Ruslan},
  title     = {{Transformer-XL}: Attentive language models beyond a fixed-length context},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages     = {2978--2988},
  year      = {2019},
}

% Kwon et al. 2023, PagedAttention.
% The technique name is brace-protected to preserve its capitalisation;
% booktitle is written in title case because styles print it verbatim.
@inproceedings{kwon2023efficient,
  author    = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
  title     = {Efficient memory management for large language model serving with {PagedAttention}},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles},
  pages     = {611--626},
  year      = {2023},
}

% Kim et al., KVzip.
% Method name and the KV acronym are brace-protected to preserve capitalisation.
% Year set to 2025: volume 38 follows the volume 36 / 2023 and volume 37 / 2024
% entries for the same venue in this file. The citation key is kept unchanged so
% existing citations still resolve.
@article{kim2026kvzip,
  author  = {Kim, Jang-Hyun and Kim, Jinuk and Kwon, Sangwoo and Lee, Jae W. and Yun, Sangdoo and Song, Hyun Oh},
  title   = {{KVzip}: Query-agnostic {KV} cache compression with context reconstruction},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {38},
  pages   = {167563--167591},
  year    = {2025},
}

% Li et al. 2025, SCBench.
% Benchmark name and the KV acronym are brace-protected to preserve capitalisation.
@inproceedings{li2025scbench,
  author    = {Li, Yucheng and Jiang, Huiqiang and Wu, Qianhui and Luo, Xufang and Ahn, Surin and Zhang, Chengruidong and Abdi, Amir and Li, Dongsheng and Gao, Jianfeng and Yang, Yuqing and others},
  title     = {{SCBench}: A {KV} cache-centric analysis of long-context methods},
  booktitle = {International Conference on Learning Representations},
  volume    = {2025},
  pages     = {66063--66093},
  year      = {2025},
}

% Xiao et al. 2024, streaming language models with attention sinks.
@inproceedings{xiao2024efficient,
  author    = {Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike},
  title     = {Efficient streaming language models with attention sinks},
  booktitle = {International Conference on Learning Representations},
  volume    = {2024},
  pages     = {21875--21895},
  year      = {2024},
}

% Han et al. 2024, LM-Infinite.
% The method name is brace-protected so sentence-casing styles keep its capitalisation.
@inproceedings{han2024lm,
  author    = {Han, Chi and Wang, Qifan and Peng, Hao and Xiong, Wenhan and Chen, Yu and Ji, Heng and Wang, Sinong},
  title     = {{LM-Infinite}: Zero-shot extreme length generalization for large language models},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  pages     = {3991--4008},
  year      = {2024},
}

% Zhang et al. 2023, H2O heavy-hitter oracle.
% The method name is brace-protected to preserve its capitalisation.
% The key says 2024 while the year field is 2023; the key is kept for existing citations.
@article{zhang2024h2o,
  author  = {Zhang, Zhenyu and Sheng, Ying and Zhou, Tianyi and Chen, Tianlong and Zheng, Lianmin and Cai, Ruisi and Song, Zhao and Tian, Yuandong and R{\'e}, Christopher and Barrett, Clark and others},
  title   = {{H2O}: Heavy-hitter oracle for efficient generative inference of large language models},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {34661--34710},
  year    = {2023},
}

% Liu et al. 2023, Scissorhands.
% The LLM and KV acronyms are brace-protected to preserve capitalisation.
% The key says 2024 while the year field is 2023; the key is kept for existing citations.
@article{liu2024scissorhands,
  author  = {Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali},
  title   = {Scissorhands: Exploiting the persistence of importance hypothesis for {LLM} {KV} cache compression at test time},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {52342--52364},
  year    = {2023},
}

% Ge et al. 2024, adaptive KV cache compression.
% The KV and LLMs acronyms are brace-protected to preserve capitalisation.
% The key says 2023 while the year field is 2024; the key is kept for existing citations.
@inproceedings{ge2023model,
  author    = {Ge, Suyu and Zhang, Yunan and Liu, Liyuan and Zhang, Minjia and Han, Jiawei and Gao, Jianfeng},
  title     = {Model tells you what to discard: Adaptive {KV} cache compression for {LLMs}},
  booktitle = {International Conference on Learning Representations},
  volume    = {2024},
  pages     = {22975--22988},
  year      = {2024},
}

% Cai et al. 2024, PyramidKV (arXiv preprint).
% Method name and the KV acronym are brace-protected to preserve capitalisation.
% The key begins with "zhang" although the first listed author is Cai; it is kept
% unchanged for existing citations.
@article{zhang2024pyramidkv,
  author  = {Cai, Zefan and Zhang, Yichi and Gao, Bofei and Liu, Yuliang and Li, Yucheng and Liu, Tianyu and Lu, Keming and Xiong, Wayne and Dong, Yue and Hu, Junjie and others},
  title   = {{PyramidKV}: Dynamic {KV} cache compression based on pyramidal information funneling},
  journal = {arXiv preprint arXiv:2406.02069},
  year    = {2024},
}

% Xiao et al. 2025, DuoAttention.
% Method name and the LLM acronym are brace-protected to preserve capitalisation.
% The key ("zhao2024") does not match the first author or year fields; it is kept
% unchanged for existing citations.
@inproceedings{zhao2024duoattention,
  author    = {Xiao, Guangxuan and Tang, Jiaming and Zuo, Jingwei and Guo, Junxian and Yang, Shang and Tang, Haotian and Fu, Yao and Han, Song},
  title     = {{DuoAttention}: Efficient long-context {LLM} inference with retrieval and streaming heads},
  booktitle = {International Conference on Learning Representations},
  volume    = {2025},
  pages     = {37228--37253},
  year      = {2025},
}

% Liu et al., ChunkKV.
% Method name and the KV / LLM acronyms are brace-protected to preserve capitalisation.
% Year set to 2025: volume 38 follows the volume 36 / 2023 and volume 37 / 2024
% entries for the same venue in this file. The citation key is kept unchanged so
% existing citations still resolve.
@article{liu2026chunkkv,
  author  = {Liu, Xiang and Tang, Zhenheng and Dong, Peijie and Li, Zeyu and Li, Bo and Hu, Xuming and Chu, Xiaowen},
  title   = {{ChunkKV}: Semantic-preserving {KV} cache compression for efficient long-context {LLM} inference},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {38},
  pages   = {28728--28778},
  year    = {2025},
}

% Kim, Han and Yun 2026, Fast KVzip (arXiv preprint).
% The title is in title case; KVzip, LLM and KV are brace-protected so that
% sentence-casing styles do not lower-case them.
@article{fastkv2026,
  author  = {Kim, Jang-Hyun and Han, Dongyoon and Yun, Sangdoo},
  title   = {Fast {KVzip}: Efficient and Accurate {LLM} Inference with Gated {KV} Eviction},
  journal = {arXiv preprint arXiv:2601.17668},
  year    = {2026},
}

% Feng et al., Ada-KV.
% Method name and the KV / LLM acronyms are brace-protected to preserve capitalisation.
% Year set to 2025: volume 38 follows the volume 36 / 2023 and volume 37 / 2024
% entries for the same venue in this file. The citation key is kept unchanged so
% existing citations still resolve.
@article{feng2026ada,
  author  = {Feng, Yuan and Lv, Junlin and Cao, Yukun and Xie, Xike and Zhou, S. Kevin},
  title   = {{Ada-KV}: Optimizing {KV} cache eviction by adaptive budget allocation for efficient {LLM} inference},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {38},
  pages   = {113152--113188},
  year    = {2025},
}

% Li et al. 2024, SnapKV.
% Method name and the LLM acronym are brace-protected to preserve capitalisation.
@article{li2024snapkv,
  author  = {Li, Yuhong and Huang, Yingbing and Yang, Bowen and Venkitesh, Bharat and Locatelli, Acyr and Ye, Hanchen and Cai, Tianle and Lewis, Patrick and Chen, Deming},
  title   = {{SnapKV}: {LLM} knows what you are looking for before generation},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {22947--22970},
  year    = {2024},
}

% Sener and Savarese 2018, core-set approach to active learning.
@inproceedings{sener2018activelearning,
  author    = {Sener, Ozan and Savarese, Silvio},
  title     = {Active Learning for Convolutional Neural Networks: A Core-Set Approach},
  booktitle = {International Conference on Learning Representations},
  year      = {2018},
}

% Lin and Bilmes 2011, submodular functions for document summarization.
% booktitle is written in title case because styles print it verbatim.
@inproceedings{lin2011documentsummarization,
  author    = {Lin, Hui and Bilmes, Jeff},
  title     = {A class of submodular functions for document summarization},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  pages     = {510--520},
  year      = {2011},
}

% Nemhauser, Wolsey and Fisher 1978, approximations for submodular maximization (part I).
% The raw Unicode em dash is replaced by the ASCII ligature "---" so classic 8-bit
% BibTeX handles the title, and the roman numeral is brace-protected so
% sentence-casing styles do not lower-case it.
@article{nemhauser1978analysis,
  author    = {Nemhauser, George L. and Wolsey, Laurence A. and Fisher, Marshall L.},
  title     = {An analysis of approximations for maximizing submodular set functions---{I}},
  journal   = {Mathematical Programming},
  volume    = {14},
  number    = {1},
  pages     = {265--294},
  year      = {1978},
  publisher = {Springer},
}

% Cornuejols, Nemhauser and Wolsey 1983, technical report on uncapacitated facility location.
@techreport{cornuejols1983uncapacitatedfacilitylocationproblem,
  author      = {Cornu{\'e}jols, G{\'e}rard and Nemhauser, George and Wolsey, Laurence},
  title       = {The uncapacitated facility location problem},
  institution = {Cornell University Operations Research and Industrial Engineering},
  year        = {1983},
}

% Krause and Golovin 2014, survey chapter on submodular function maximization.
% The exported record was garbled: the page range sat in "number" with a single
% hyphen, the chapter index sat in "pages", and the book title sat in "journal".
% This is a chapter in an edited book, so it is recorded as incollection with the
% page range in "pages" and the chapter index in "chapter".
% NOTE(review): editor and publisher were filled in from the published book; confirm
% against the publisher's record.
@incollection{krause2014submodular,
  author    = {Krause, Andreas and Golovin, Daniel},
  title     = {Submodular function maximization},
  booktitle = {Tractability: Practical Approaches to Hard Problems},
  editor    = {Bordeaux, Lucas and Hamadi, Youssef and Kohli, Pushmeet},
  publisher = {Cambridge University Press},
  chapter   = {3},
  pages     = {71--104},
  year      = {2014},
}

% Hsieh et al. 2024, RULER benchmark (arXiv preprint).
% The benchmark name is brace-protected; otherwise sentence-casing styles would
% render it as "Ruler".
@article{hsieh2024ruler,
  author  = {Hsieh, Cheng-Ping and Sun, Simeng and Kriman, Samuel and Acharya, Shantanu and Rekesh, Dima and Jia, Fei and Zhang, Yang and Ginsburg, Boris},
  title   = {{RULER}: What's the real context size of your long-context language models?},
  journal = {arXiv preprint arXiv:2404.06654},
  year    = {2024},
}

% Bai et al. 2024, LongBench.
% The benchmark name is brace-protected to preserve its capitalisation;
% booktitle is written in title case because styles print it verbatim.
% The key says 2023 while the year field is 2024; the key is kept for existing citations.
@inproceedings{bai2023longbench,
  author    = {Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and Tang, Jiankai and Huang, Zhidian and Du, Zhengxiao and Liu, Xiao and Zeng, Aohan and Hou, Lei and others},
  title     = {{LongBench}: A bilingual, multitask benchmark for long context understanding},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {3119--3137},
  year      = {2024},
}

% Rajpurkar et al. 2016, SQuAD.
% The dataset name is brace-protected to preserve its mixed capitalisation;
% booktitle is written in title case because styles print it verbatim.
@inproceedings{rajpurkar2016squad,
  author    = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
  title     = {{SQuAD}: 100,000+ questions for machine comprehension of text},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  pages     = {2383--2392},
  year      = {2016},
}

% Cobbe et al. 2021, training verifiers for math word problems (arXiv preprint).
% Kaiser's given name uses the same {\L}ukasz spelling as the Vaswani et al. entry
% in this file, so the author is rendered consistently.
@article{cobbe2021gsm8k,
  author  = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, {\L}ukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
  title   = {Training verifiers to solve math word problems},
  journal = {arXiv preprint arXiv:2110.14168},
  year    = {2021},
}

% Yang et al. 2025, Qwen3 technical report (arXiv preprint).
@article{yang2025qwen3,
  author  = {Yang, An and Li, Anfeng and Yang, Baosong and Zhang, Beichen and Hui, Binyuan and Zheng, Bo and Yu, Bowen and Gao, Chang and Huang, Chengen and Lv, Chenxu and others},
  title   = {Qwen3 technical report},
  journal = {arXiv preprint arXiv:2505.09388},
  year    = {2025},
}

% Devoto, Jeblick and Jegou 2025, expected attention (arXiv preprint).
% The KV acronym is brace-protected to preserve its capitalisation.
@article{devoto2025expected,
  author  = {Devoto, Alessio and Jeblick, Maximilian and J{\'e}gou, Simon},
  title   = {Expected attention: {KV} cache compression by estimating attention from future queries distribution},
  journal = {arXiv preprint arXiv:2510.00636},
  year    = {2025},
}

% Hendrycks et al. 2021, the MATH dataset (arXiv preprint).
% The dataset name is upper-cased and brace-protected, as in the citation key, so
% it is not rendered as the common word "math".
@article{hendrycks2021MATH,
  author  = {Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
  title   = {Measuring mathematical problem solving with the {MATH} dataset},
  journal = {arXiv preprint arXiv:2103.03874},
  year    = {2021},
}

% Liu et al. 2024, KIVI.
% The title is in title case; KIVI and KV are brace-protected so that
% sentence-casing styles do not lower-case them.
@inproceedings{liu2024kivi,
  author    = {Liu, Zirui and Yuan, Jiayi and Jin, Hongye and Zhong, Shaochen and Xu, Zhaozhuo and Braverman, Vladimir and Chen, Beidi and Hu, Xia},
  title     = {{KIVI}: A Tuning-Free Asymmetric 2bit Quantization for {KV} Cache},
  booktitle = {International Conference on Machine Learning},
  year      = {2024},
}

% Hooper et al. 2024, KVQuant.
% The title is in title case; KVQuant, LLM and KV are brace-protected so that
% sentence-casing styles do not lower-case them.
@inproceedings{hooper2024kvquant,
  author    = {Hooper, Coleman and Kim, Sehoon and Mohammadzadeh, Hiva and Mahoney, Michael W. and Shao, Yakun Sophia and Keutzer, Kurt and Gholami, Amir},
  title     = {{KVQuant}: Towards 10 Million Context Length {LLM} Inference with {KV} Cache Quantization},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2024},
}

% Kocisky et al. 2018, NarrativeQA.
% The dataset name is brace-protected to preserve its capitalisation.
% The final letter of the first author's surname carries an acute accent
% (the exported record had a grave accent).
@article{kovcisky2018narrativeqa,
  author  = {Ko{\v{c}}isk{\'y}, Tom{\'a}{\v{s}} and Schwarz, Jonathan and Blunsom, Phil and Dyer, Chris and Hermann, Karl Moritz and Melis, G{\'a}bor and Grefenstette, Edward},
  title   = {The {NarrativeQA} reading comprehension challenge},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {6},
  pages   = {317--328},
  year    = {2018},
}