%%%% token eviction
%% training 
% Big Bird (NeurIPS 2020). Braces keep the model name capitalised under
% sentence-casing styles; journal spelling matches the other NeurIPS entries.
@article{bigbird,
  author  = {Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon, Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others},
  title   = {{Big Bird}: Transformers for longer sequences},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2020},
}

% Sparse Transformers (Child et al., arXiv preprint, 2019).
@article{sparsetransformer,
  author  = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya},
  title   = {Generating long sequences with sparse transformers},
  journal = {arXiv preprint arXiv:1904.10509},
  year    = {2019},
}
% Learned Token Pruning (Kim et al., ACM SIGKDD 2022).
@inproceedings{ltp,
  author    = {Kim, Sehoon and Shen, Sheng and Thorsley, David and Gholami, Amir and Kwon, Woosuk and Hassoun, Joseph and Keutzer, Kurt},
  title     = {Learned token pruning for transformers},
  booktitle = {Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
  year      = {2022},
}

%% inference
% Deja Vu (ICML 2023). Braces protect the method name and the LLMs acronym
% from being lowercased by sentence-casing styles.
@inproceedings{dejavu,
  author    = {Liu, Zichang and Wang, Jue and Dao, Tri and Zhou, Tianyi and Yuan, Binhang and Song, Zhao and others},
  title     = {{Deja Vu}: Contextual sparsity for efficient {LLMs} at inference time},
  booktitle = {International Conference on Machine Learning},
  year      = {2023},
}

% Scissorhands (NeurIPS 2023). Acronyms are brace-protected so they stay
% uppercase under sentence-casing styles.
@article{scissorhands,
  author  = {Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali},
  title   = {Scissorhands: Exploiting the persistence of importance hypothesis for {LLM} {KV} cache compression at test time},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2023},
}

% H2O heavy-hitter oracle (NeurIPS 2023). The method name is brace-protected
% so it is not rendered as "H2o".
@article{h2o,
  author  = {Zhang, Zhenyu and Sheng, Ying and Zhou, Tianyi and Chen, Tianlong and Zheng, Lianmin and others},
  title   = {{H2O}: Heavy-hitter oracle for efficient generative inference of large language models},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2023},
}

% Dynamic context pruning (Anagnostidis et al., NeurIPS 2023).
@article{dynamicpruning,
  author  = {Anagnostidis, Sotiris and Pavllo, Dario and Biggio, Luca and Noci, Lorenzo and Lucchi, Aurelien and Hofmann, Thomas},
  title   = {Dynamic context pruning for efficient and interpretable autoregressive transformers},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2023},
}

% NACL KV cache eviction (ACL 2024). A conference paper, so the venue lives in
% booktitle of an inproceedings entry; acronyms are brace-protected.
@inproceedings{nacl,
  author    = {Chen, Yilong and Wang, Guoxia and Shang, Junyuan and Cui, Shiyao and Zhang, Zhenyu and Liu, Tingwen and Wang, Shuohuan and Sun, Yu and Yu, Dianhai and Wu, Hua},
  title     = {{NACL}: A general and effective {KV} cache eviction framework for {LLMs} at inference time},
  booktitle = {ACL},
  year      = {2024},
}

% Transformers as multi-state RNNs (arXiv preprint, 2024). The RNNs acronym
% is brace-protected against lowercasing.
@article{tova,
  author  = {Oren, Matanel and Hassid, Michael and Yarden, Nir and Adi, Yossi and Schwartz, Roy},
  title   = {Transformers are multi-state {RNNs}},
  journal = {arXiv preprint arXiv:2401.06104},
  year    = {2024},
}

% Streaming LLMs with attention sinks (ICLR 2024). ICLR is a conference, so
% this is an inproceedings entry with the venue in booktitle.
@inproceedings{streaming,
  author    = {Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike},
  title     = {Efficient streaming language models with attention sinks},
  booktitle = {ICLR},
  year      = {2024},
}

% SnapKV (NeurIPS 2024). Method name and LLM acronym are brace-protected so
% sentence-casing styles keep their capitals.
@article{snapkv,
  author  = {Li, Yuhong and Huang, Yingbing and Yang, Bowen and Venkitesh, Bharat and Locatelli, Acyr and Ye, Hanchen and Cai, Tianle and Lewis, Patrick and Chen, Deming},
  title   = {{SnapKV}: {LLM} knows what you are looking for before generation},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2024},
}

% PyramidKV (arXiv preprint, 2024). Method name and KV acronym are
% brace-protected against lowercasing.
@article{pyramid,
  author  = {Cai, Zefan and Zhang, Yichi and Gao, Bofei and Liu, Yuliang and Liu, Tianyu and Lu, Keming and others},
  title   = {{PyramidKV}: Dynamic {KV} cache compression based on pyramidal information funneling},
  journal = {arXiv preprint arXiv:2406.02069},
  year    = {2024},
}

% PyramidInfer (arXiv preprint, 2024). Method name and acronyms are
% brace-protected against lowercasing.
@article{yang2024pyramidinfer,
  author  = {Yang, Dongjie and Han, XiaoDong and Gao, Yan and Hu, Yao and Zhang, Shilin and Zhao, Hai},
  title   = {{PyramidInfer}: Pyramid {KV} cache compression for high-throughput {LLM} inference},
  journal = {arXiv preprint arXiv:2405.12532},
  year    = {2024},
}

% Ada-KV (arXiv preprint, 2024). Method name and acronyms are brace-protected;
% the last author's initial carries its period.
@article{adakv,
  author  = {Feng, Yuan and Lv, Junlin and Cao, Yukun and Xie, Xike and Zhou, S. Kevin},
  title   = {{Ada-KV}: Optimizing {KV} cache eviction by adaptive budget allocation for efficient {LLM} inference},
  journal = {arXiv preprint arXiv:2407.11550},
  year    = {2024},
}


%%%%%% head profiling
% Adaptive KV cache compression (ICLR 2024). Conference paper, so the venue
% is in booktitle; acronyms are brace-protected.
@inproceedings{fastgen,
  author    = {Ge, Suyu and Zhang, Yunan and Liu, Liyuan and Zhang, Minjia and Han, Jiawei and Gao, Jianfeng},
  title     = {Model tells you what to discard: Adaptive {KV} cache compression for {LLMs}},
  booktitle = {ICLR},
  year      = {2024},
}

% DuoAttention (ICLR 2025). Conference paper, so the venue is in booktitle;
% method name and LLM acronym are brace-protected.
@inproceedings{duo,
  author    = {Xiao, Guangxuan and Tang, Jiaming and Zuo, Jingwei and Guo, Junxian and Yang, Shang and Tang, Haotian and Fu, Yao and Han, Song},
  title     = {{DuoAttention}: Efficient long-context {LLM} inference with retrieval and streaming heads},
  booktitle = {ICLR},
  year      = {2025},
}


%%%%% training
% Compressive Transformers (ICLR 2020). Conference paper, so the venue is in
% booktitle; author middle initials carry their periods.
@inproceedings{compressive,
  author    = {Rae, Jack W. and Potapenko, Anna and Jayakumar, Siddhant M. and Lillicrap, Timothy P.},
  title     = {Compressive transformers for long-range sequence modelling},
  booktitle = {ICLR},
  year      = {2020},
}

% GQA (EMNLP 2023). Conference paper, so the venue is in booktitle; the
% acronym is brace-protected against lowercasing.
@inproceedings{gqa,
  author    = {Ainslie, Joshua and Lee-Thorp, James and De Jong, Michiel and Zemlyanskiy, Yury and Lebr{\'o}n, Federico and Sanghai, Sumit},
  title     = {{GQA}: Training generalized multi-query transformer models from multi-head checkpoints},
  booktitle = {EMNLP},
  year      = {2023},
}

% Compressed Context Memory (ICLR 2024). Key sits on the header line, authors
% use the unambiguous "Last, First" form, and the conference is in booktitle.
@inproceedings{ccm,
  author    = {Kim, Jang-Hyun and Yeom, Junyoung and Yun, Sangdoo and Song, Hyun Oh},
  title     = {Compressed Context Memory for Online Language Model Interaction},
  booktitle = {ICLR},
  year      = {2024},
}


%%%%%% sparse attention and offloading
% MInference 1.0 (NeurIPS 2024). Method name and LLMs acronym are
% brace-protected against lowercasing.
@article{minference,
  author  = {Jiang, Huiqiang and Li, Yucheng and Zhang, Chengruidong and Wu, Qianhui and Luo, Xufang and others},
  title   = {{MInference} 1.0: Accelerating pre-filling for long-context {LLMs} via dynamic sparse attention},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2024},
}

% Quest query-aware sparsity (ICML 2024). Conference paper, so the venue is
% in booktitle; the LLM acronym is brace-protected.
@inproceedings{quest,
  author    = {Tang, Jiaming and Zhao, Yilong and Zhu, Kan and Xiao, Guangxuan and Kasikci, Baris and Han, Song},
  title     = {Quest: Query-aware sparsity for efficient long-context {LLM} inference},
  booktitle = {ICML},
  year      = {2024},
}

% InfiniGen (OSDI 2024). CamelCase name and KV acronym are brace-protected;
% booktitle matches the other OSDI 24 entry in this file.
@inproceedings{infinigen,
  author    = {Lee, Wonbeom and Lee, Jungi and Seo, Junghwan and Sim, Jaewoong},
  title     = {{InfiniGen}: Efficient generative inference of large language models with dynamic {KV} cache management},
  booktitle = {18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)},
  year      = {2024},
}

% MagicPIG LSH sampling (ICLR 2025). Conference paper, so the venue is in
% booktitle; method name and acronyms are brace-protected.
@inproceedings{magicpig,
  author    = {Chen, Zhuoming and Sadhukhan, Ranajoy and Ye, Zihao and Zhou, Yang and Zhang, Jianyu and others},
  title     = {{MagicPIG}: {LSH} sampling for efficient {LLM} generation},
  booktitle = {ICLR},
  year      = {2025},
}

% RetrievalAttention (arXiv preprint, 2024). Method name and LLM acronym are
% brace-protected against lowercasing.
@article{retrieval,
  author  = {Liu, Di and Chen, Meng and Lu, Baotong and Jiang, Huiqiang and Han, Zhenhua and Zhang, Qianxi and others},
  title   = {{RetrievalAttention}: Accelerating long-context {LLM} inference via vector retrieval},
  journal = {arXiv preprint arXiv:2409.10516},
  year    = {2024},
}


%%%% Quantization
% KIVI 2-bit KV cache quantization (ICML 2024). Conference paper, so the venue
% is in booktitle; method name and KV acronym are brace-protected.
@inproceedings{kivi,
  author    = {Liu, Zirui and Yuan, Jiayi and Jin, Hongye and Zhong, Shaochen and Xu, Zhaozhuo and Braverman, Vladimir and Chen, Beidi and Hu, Xia},
  title     = {{KIVI}: A tuning-free asymmetric 2bit quantization for {KV} cache},
  booktitle = {ICML},
  year      = {2024},
}

% KVQuant (NeurIPS 2024, vol. 37). Method name and acronyms are
% brace-protected; the author's middle initial carries its period.
@article{kvquant,
  author  = {Hooper, Coleman and Kim, Sehoon and Mohammadzadeh, Hiva and Mahoney, Michael W. and Shao, Sophia and Keutzer, Kurt and Gholami, Amir},
  title   = {{KVQuant}: Towards 10 million context length {LLM} inference with {KV} cache quantization},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {1270--1303},
  year    = {2024},
}

% QServe (arXiv preprint, 2024). The system name, the W4A8KV4 scheme label,
% and the LLM acronym are brace-protected against lowercasing.
@article{qserve,
  author  = {Lin, Yujun and Tang, Haotian and Yang, Shang and Zhang, Zhekai and Xiao, Guangxuan and Gan, Chuang and Han, Song},
  title   = {{QServe}: {W4A8KV4} quantization and system co-design for efficient {LLM} serving},
  journal = {arXiv preprint arXiv:2405.04532},
  year    = {2024},
}


%%%% model
% The original Transformer paper (NeurIPS 2017). Journal spelling matches the
% other NeurIPS entries; the middle initial carries its period.
@article{transformer,
  author  = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title   = {Attention is all you need},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2017},
}

% Qwen2.5-1M technical report (arXiv preprint, 2025). The model name is
% brace-protected so "1M" is not lowercased.
@article{qwen,
  author  = {Yang, An and Yu, Bowen and Li, Chengyuan and Liu, Dayiheng and Huang, Fei and Huang, Haoyan and others},
  title   = {{Qwen2.5-1M} Technical Report},
  journal = {arXiv preprint arXiv:2501.15383},
  year    = {2025},
}

% Llama 3 (arXiv preprint, 2024). The model name is brace-protected so it
% keeps its capital.
@article{llama3,
  author  = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and others},
  title   = {The {Llama} 3 herd of models},
  journal = {arXiv preprint arXiv:2407.21783},
  year    = {2024},
}

% Gemma 3 technical report (arXiv preprint, 2025). The team is a corporate
% author, so it is braced as one indivisible name rather than "Team, Gemma".
@article{gemma3,
  author  = {{Gemma Team} and Kamath, Aishwarya and Ferret, Johan and Pathak, Shreya and Vieillard, Nino and others},
  title   = {{Gemma} 3 technical report},
  journal = {arXiv preprint arXiv:2503.19786},
  year    = {2025},
}

% GPT-4 technical report (arXiv preprint, 2023). The model name is
% brace-protected so it is not rendered as "Gpt-4".
@article{gpt4,
  author  = {Achiam, Josh and Adler, Steven and Agarwal, Sandhini and Ahmad, Lama and Akkaya, Ilge and Aleman, Florencia Leoni and Almeida, Diogo and Altenschmidt, Janko and Altman, Sam and Anadkat, Shyamal and others},
  title   = {{GPT-4} technical report},
  journal = {arXiv preprint arXiv:2303.08774},
  year    = {2023},
}

% Mistral 7B (arXiv preprint, 2023). Uses the article type like the other
% arXiv entries in this file: standard styles ignore journal on a misc entry,
% which would drop the arXiv id. Authors are in "Last, First" form.
@article{mistral,
  author  = {Jiang, Albert Q. and Sablayrolles, Alexandre and Mensch, Arthur and Bamford, Chris and others},
  title   = {{Mistral} {7B}},
  journal = {arXiv preprint arXiv:2310.06825},
  year    = {2023},
}

%%%% dataset
% SCBench (ICLR 2025). Conference paper, so the venue is in booktitle;
% benchmark name and KV acronym are brace-protected.
@inproceedings{scbench,
  author    = {Li, Yucheng and Jiang, Huiqiang and Wu, Qianhui and Luo, Xufang and Ahn, Surin and Zhang, Chengruidong and Abdi, Amir H. and Li, Dongsheng and Gao, Jianfeng and Yang, Yuqing and others},
  title     = {{SCBench}: A {KV} cache-centric analysis of long-context methods},
  booktitle = {ICLR},
  year      = {2025},
}

% SQuAD (EMNLP 2016). Conference paper, so the venue is in booktitle; the
% dataset name is brace-protected to keep its mixed casing.
@inproceedings{squad,
  author    = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
  title     = {{SQuAD}: 100,000+ questions for machine comprehension of text},
  booktitle = {EMNLP},
  year      = {2016},
}

% Training verifiers for math word problems (Cobbe et al., arXiv preprint, 2021).
@article{gsm,
  author  = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
  title   = {Training verifiers to solve math word problems},
  journal = {arXiv preprint arXiv:2110.14168},
  year    = {2021},
}

% Needle-in-a-haystack test (Kamradt, 2023). Online resource: the url gives
% readers a locator, and the LLMs acronym is brace-protected.
% NOTE(review): confirm the repository URL before publication.
@misc{needle,
  author = {Kamradt, Greg},
  title  = {Needle in a haystack -- pressure testing {LLMs}},
  year   = {2023},
  url    = {https://github.com/gkamradt/LLMTest_NeedleInAHaystack},
}

% LongBench (ACL 2024). Conference paper, so the venue is in booktitle; the
% benchmark name is brace-protected to keep its CamelCase.
@inproceedings{longbench,
  author    = {Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and Tang, Jiankai and others},
  title     = {{LongBench}: A bilingual, multitask benchmark for long context understanding},
  booktitle = {ACL},
  year      = {2024},
}

% InfinityBench (ACL 2024). Conference paper, so the venue is in booktitle;
% the math symbol, the benchmark name, and "100K" are brace-protected.
@inproceedings{inftybench,
  author    = {Zhang, Xinrong and Chen, Yingfa and Hu, Shengding and Xu, Zihang and Chen, Junhao and others},
  title     = {{$\infty$Bench}: Extending Long Context Evaluation Beyond {100K} Tokens},
  booktitle = {ACL},
  year      = {2024},
}

% FlashAttention-2 (ICLR 2024). Conference paper, so the venue is in
% booktitle; the method name is brace-protected to keep its CamelCase.
@inproceedings{flashattn,
  author    = {Dao, Tri},
  title     = {{FlashAttention-2}: Faster attention with better parallelism and work partitioning},
  booktitle = {ICLR},
  year      = {2024},
}

% Cache-augmented generation (arXiv preprint, 2024). The RAG acronym is
% brace-protected; the author's middle initial carries its period.
@article{cag,
  author  = {Chan, Brian J. and Chen, Chao-Ting and Cheng, Jui-Hung and Huang, Hen-Hsen},
  title   = {Don't Do {RAG}: When Cache-Augmented Generation is All You Need for Knowledge Tasks},
  journal = {arXiv preprint arXiv:2412.15605},
  year    = {2024},
}

% Personal LLM agents survey (arXiv preprint, 2024). The LLM acronym is
% brace-protected against lowercasing.
@article{personal,
  author  = {Li, Yuanchun and Wen, Hao and Wang, Weijun and Li, Xiangyu and Yuan, Yizhen and Liu, Guohong and others},
  title   = {Personal {LLM} agents: Insights and survey about the capability, efficiency and security},
  journal = {arXiv preprint arXiv:2401.05459},
  year    = {2024},
}

% Generative pre-training (Radford et al., 2018).
@misc{gpt,
  author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya and others},
  title  = {Improving language understanding by generative pre-training},
  year   = {2018},
}

% Language models as unsupervised multitask learners (Radford et al., OpenAI blog, 2019).
@article{gpt2,
  author  = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others},
  title   = {Language models are unsupervised multitask learners},
  journal = {OpenAI blog},
  year    = {2019},
}

% BERT (NAACL 2019). Conference paper, so the venue is in booktitle; the
% acronym is brace-protected. The entry closes with a single brace.
@inproceedings{bert,
  author    = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title     = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  booktitle = {NAACL},
  year      = {2019},
}

% Hugging Face documentation page on KV cache strategies. The corporate
% author is double-braced so it is not split into first name "Hugging" and
% surname "Face"; the KV acronym is brace-protected.
@misc{staticcache,
  author = {{Hugging Face}},
  title  = {{KV} Cache Strategies},
  year   = {2024},
  url    = {https://huggingface.co/docs/transformers/en/kv_cache},
}

% Character.AI post on inference optimisation. The corporate author is braced
% as one name; "AI" and the company name are protected in the title.
@misc{characterai,
  author = {{Character.AI}},
  title  = {Optimizing {AI} Inference at {Character.AI}},
  year   = {2024},
  url    = {https://research.character.ai/optimizing-inference/},
}

% Gradient long-context Llama-3 8B model card. The title is entirely a model
% name, so it is braced as a whole to stop styles from recasing "8B" etc.;
% the corporate author is braced as one name.
@misc{gradientai,
  author = {{gradientAI}},
  title  = {{Llama-3 8B Gradient Instruct 1048k}},
  year   = {2024},
  url    = {https://huggingface.co/gradientai/Llama-3-8B-Instruct-Gradient-1048k},
}


% Sarathi-Serve (OSDI 2024). The system name and the LLM acronym are
% brace-protected so sentence-casing styles keep their capitals.
@inproceedings{prefill,
  author    = {Agrawal, Amey and Kedia, Nitin and Panwar, Ashish and Mohan, Jayashree and Kwatra, Nipun and Gulavani, Bhargav and Tumanov, Alexey and Ramjee, Ramachandran},
  title     = {Taming Throughput-Latency tradeoff in {LLM} inference with {Sarathi-Serve}},
  booktitle = {18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)},
  year      = {2024},
}

% PagedAttention memory management (Symposium on Operating Systems Principles,
% 2023). The method name is brace-protected to keep its CamelCase.
@inproceedings{vllm,
  author    = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and Gonzalez, Joseph and Zhang, Hao and Stoica, Ion},
  title     = {Efficient memory management for large language model serving with {PagedAttention}},
  booktitle = {Proceedings of the 29th Symposium on Operating Systems Principles},
  year      = {2023},
}

% Deep contextualized word representations (Peters et al., NAACL 2018).
@inproceedings{elmo,
  author    = {Peters, Matthew and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
  title     = {Deep Contextualized Word Representations},
  booktitle = {NAACL},
  year      = {2018},
}

% DecodingTrust (NeurIPS 2023). The title carries no trailing period because
% styles add their own; the benchmark name and GPT acronym are brace-protected.
@inproceedings{decodingtrust,
  author    = {Wang, Boxin and Chen, Weixin and Pei, Hengzhi and Xie, Chulin and Kang, Mintong and others},
  title     = {{DecodingTrust}: A Comprehensive Assessment of Trustworthiness in {GPT} Models},
  booktitle = {NeurIPS},
  year      = {2023},
}

% RULER (COLM 2024). Conference paper, so the venue is in booktitle; the
% benchmark acronym is brace-protected against lowercasing.
@inproceedings{ruler,
  author    = {Hsieh, Cheng-Ping and Sun, Simeng and Kriman, Samuel and Acharya, Shantanu and Rekesh, Dima and Jia, Fei and Zhang, Yang and Ginsburg, Boris},
  title     = {{RULER}: What's the Real Context Size of Your Long-Context Language Models?},
  booktitle = {COLM},
  year      = {2024},
}

% Safety alignment depth (ICLR 2025). ICLR is a conference, so this is an
% inproceedings entry with the venue in booktitle.
@inproceedings{shallowalignment,
  author    = {Qi, Xiangyu and Panda, Ashwinee and Lyu, Kaifeng and Ma, Xiao and Roy, Subhrajit and Beirami, Ahmad and Mittal, Prateek and Henderson, Peter},
  title     = {Safety alignment should be made more than just a few tokens deep},
  booktitle = {ICLR},
  year      = {2025},
}

% Differential Transformer (ICLR 2025). ICLR is a conference, so this is an
% inproceedings entry with the venue in booktitle.
@inproceedings{differential,
  author    = {Ye, Tianzhu and Dong, Li and Xia, Yuqing and Sun, Yutao and Zhu, Yi and Huang, Gao and Wei, Furu},
  title     = {Differential transformer},
  booktitle = {ICLR},
  year      = {2025},
}

% Masked autoencoders (He et al., CVPR 2022).
@inproceedings{mae,
  author    = {He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
  title     = {Masked autoencoders are scalable vision learners},
  booktitle = {CVPR},
  year      = {2022},
}

% Professor forcing (NeurIPS 2016, vol. 29). Journal spelling matches the
% other NeurIPS entries in this file.
@article{teacherforcing,
  author  = {Goyal, Anirudh and Lamb, Alex and Zhang, Ying and Zhang, Saizheng and Courville, Aaron and Bengio, Yoshua},
  title   = {Professor forcing: A new algorithm for training recurrent networks},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {29},
  year    = {2016},
}

% ZIP file format specification (APPNOTE.TXT).
@misc{zip,
  author = {Katz, Phillip W.},
  title  = {ZIP File Format Specification},
  year   = {1989},
  url    = {https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT},
}

% No Token Left Behind (arXiv preprint, 2024). The KV acronym is
% brace-protected against lowercasing.
@article{notoken,
  author  = {Yang, June Yong and Kim, Byeongwook and Bae, Jeongin and Kwon, Beomseok and Park, Gunho and Yang, Eunho and Kwon, Se Jung and Lee, Dongsoo},
  title   = {No token left behind: Reliable {KV} cache compression via importance-aware mixed precision quantization},
  journal = {arXiv preprint arXiv:2402.18096},
  year    = {2024},
}

% InfiniPot (arXiv preprint, 2024). Method name and LLMs acronym are
% brace-protected against lowercasing.
@article{kim2024infinipot,
  author  = {Kim, Minsoo and Shim, Kyuhong and Choi, Jungwook and Chang, Simyung},
  title   = {{InfiniPot}: Infinite context processing on memory-constrained {LLMs}},
  journal = {arXiv preprint arXiv:2410.01518},
  year    = {2024},
}

% Finch prompt-guided cache compression (Corallo and Papotti, TACL vol. 12, 2024).
@article{corallo2024finch,
  author  = {Corallo, Giulio and Papotti, Paolo},
  title   = {Finch: Prompt-guided key-value cache compression for large language models},
  journal = {Transactions of the Association for Computational Linguistics},
  volume  = {12},
  year    = {2024},
}

% Task-aware KV cache compression (arXiv preprint, 2025). The RAG and KV
% acronyms are brace-protected so sentence-casing styles keep them uppercase.
@article{corallo2025beyond,
  author  = {Corallo, Giulio and Weller, Orion and Petroni, Fabio and Papotti, Paolo},
  title   = {Beyond {RAG}: Task-Aware {KV} Cache Compression for Comprehensive Knowledge Reasoning},
  journal = {arXiv preprint arXiv:2503.04973},
  year    = {2025},
}