\begin{thebibliography}{101}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi: #1}\else
  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi

\bibitem[Brown et~al.(2020)Brown, Mann, Ryder, Subbiah, Kaplan, Dhariwal, Neelakantan, Shyam, Sastry, Askell, et~al.]{brown2020language}
Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared~D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et~al.
\newblock Language models are few-shot learners.
\newblock \emph{Advances in Neural Information Processing Systems}, 33:\penalty0 1877--1901, 2020.

\bibitem[Tay et~al.(2022)Tay, Dehghani, Tran, Garcia, Bahri, Schuster, Zheng, Houlsby, and Metzler]{tay2022unifying}
Yi~Tay, Mostafa Dehghani, Vinh~Q Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu~Steven Zheng, Neil Houlsby, and Donald Metzler.
\newblock Unifying language learning paradigms.
\newblock \emph{ArXiv preprint}, abs/2205.05131, 2022.
\newblock URL \url{https://arxiv.org/abs/2205.05131}.

\bibitem[Tang et~al.(2025{\natexlab{a}})Tang, Liu, Wang, Dong, He, Chu, and Li]{tang2025the}
Zhenheng Tang, Xiang Liu, Qian Wang, Peijie Dong, Bingsheng He, Xiaowen Chu, and Bo~Li.
\newblock The lottery {LLM} hypothesis, rethinking what abilities should {LLM} compression preserve?
\newblock In \emph{The Fourth Blogpost Track at ICLR 2025}, 2025{\natexlab{a}}.

\bibitem[Wang et~al.(2025{\natexlab{a}})Wang, Tang, Jiang, Chen, Wang, and He]{wang2025agenttaxo}
Qian Wang, Zhenheng Tang, Zichen Jiang, Nuo Chen, Tianyu Wang, and Bingsheng He.
\newblock {AgentTaxo}: Dissecting and benchmarking token distribution of {LLM} multi-agent systems.
\newblock In \emph{ICLR 2025 Workshop on Foundation Models in the Wild}, 2025{\natexlab{a}}.

\bibitem[Touvron et~al.(2023)Touvron, Martin, Stone, Albert, Almahairi, Babaei, Bashlykov, Batra, Bhargava, Bhosale, et~al.]{touvron2023llama2}
Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et~al.
\newblock Llama 2: Open foundation and fine-tuned chat models.
\newblock \emph{ArXiv preprint}, abs/2307.09288, 2023.
\newblock URL \url{https://arxiv.org/abs/2307.09288}.

\bibitem[Dao(2024)]{flash-attn2}
Tri Dao.
\newblock Flash{A}ttention-2: Faster attention with better parallelism and work partitioning.
\newblock In \emph{International Conference on Learning Representations (ICLR)}, 2024.

\bibitem[Jacobs et~al.(2023)]{jacobs2023deepspeed}
Sam~Ade Jacobs et~al.
\newblock {DeepSpeed Ulysses}: System optimizations for enabling training of extreme long sequence {Transformer} models.
\newblock \emph{ArXiv preprint}, abs/2309.14509, 2023.
\newblock URL \url{https://arxiv.org/abs/2309.14509}.

\bibitem[Xiao et~al.(2024)Xiao, Tian, Chen, Han, and Lewis]{xiao2024efficient}
Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis.
\newblock Efficient streaming language models with attention sinks.
\newblock In \emph{The Twelfth International Conference on Learning Representations}, 2024.
\newblock URL \url{https://openreview.net/forum?id=NG7sS51zVF}.

\bibitem[Liu et~al.(2024{\natexlab{a}})Liu, Yan, Zaharia, and Abbeel]{liu2024world}
Hao Liu, Wilson Yan, Matei Zaharia, and Pieter Abbeel.
\newblock World model on million-length video and language with {RingAttention}.
\newblock \emph{ArXiv preprint}, abs/2402.08268, 2024{\natexlab{a}}.
\newblock URL \url{https://arxiv.org/abs/2402.08268}.

\bibitem[Young et~al.(2024)Young, Chen, Li, Huang, Zhang, Zhang, Li, Zhu, Chen, Chang, et~al.]{young2024yi}
Alex Young, Bei Chen, Chao Li, Chengen Huang, Ge~Zhang, Guanwei Zhang, Heng Li, Jiangcheng Zhu, Jianqun Chen, Jing Chang, et~al.
\newblock Yi: Open foundation models by 01.{AI}.
\newblock \emph{ArXiv preprint}, abs/2403.04652, 2024.
\newblock URL \url{https://arxiv.org/abs/2403.04652}.

\bibitem[Zhang et~al.(2023)Zhang, Sheng, Zhou, Chen, Zheng, Cai, Song, Tian, R{\'e}, Barrett, et~al.]{zhang2024h2o}
Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R{\'e}, Clark Barrett, et~al.
\newblock {H2O}: Heavy-hitter oracle for efficient generative inference of large language models.
\newblock \emph{Advances in Neural Information Processing Systems}, 36:\penalty0 34661--34710, 2023.

\bibitem[Zhu et~al.(2025)Zhu, Tang, Liu, Li, Li, Chu, and Han]{zhu2025oraclekv}
Yuanbing Zhu, Zhenheng Tang, Xiang Liu, Ang Li, Bo~Li, Xiaowen Chu, and Bo~Han.
\newblock Oracle{KV}: Oracle guidance for question-independent {KV} cache compression.
\newblock In \emph{ICML 2025 Workshop on Long-Context Foundation Models}, 2025.
\newblock URL \url{https://openreview.net/forum?id=KHM2YOGgX9}.

\bibitem[Wang et~al.(2025{\natexlab{b}})Wang, Wang, Tang, Li, Chen, Liang, and He]{wang2025all}
Qian Wang, Tianyu Wang, Zhenheng Tang, Qinbin Li, Nuo Chen, Jingsheng Liang, and Bingsheng He.
\newblock {MegaAgent}: A large-scale autonomous {LLM}-based multi-agent system without predefined {SOP}s.
\newblock In \emph{The 63rd Annual Meeting of the Association for Computational Linguistics}, 2025{\natexlab{b}}.

\bibitem[Reid et~al.(2024)Reid, Savinov, Teplyashin, Lepikhin, Lillicrap, Alayrac, Soricut, Lazaridou, Firat, Schrittwieser, et~al.]{geminiteam2024gemini}
Machel Reid, Nikolay Savinov, Denis Teplyashin, Dmitry Lepikhin, Timothy Lillicrap, Jean-Baptiste Alayrac, Radu Soricut, Angeliki Lazaridou, Orhan Firat, Julian Schrittwieser, et~al.
\newblock Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context.
\newblock \emph{ArXiv preprint}, abs/2403.05530, 2024.
\newblock URL \url{https://arxiv.org/abs/2403.05530}.

\bibitem[Li et~al.(2024)Li, Huang, Yang, Venkitesh, Locatelli, Ye, Cai, Lewis, and Chen]{li2024snapkv}
Yuhong Li, Yingbing Huang, Bowen Yang, Bharat Venkitesh, Acyr Locatelli, Hanchen Ye, Tianle Cai, Patrick Lewis, and Deming Chen.
\newblock {SnapKV}: {LLM} knows what you are looking for before generation.
\newblock \emph{ArXiv preprint}, abs/2404.14469, 2024.
\newblock URL \url{https://arxiv.org/abs/2404.14469}.

\bibitem[Ge et~al.(2023)Ge, Zhang, Liu, Zhang, Han, and Gao]{ge2023model}
Suyu Ge, Yunan Zhang, Liyuan Liu, Minjia Zhang, Jiawei Han, and Jianfeng Gao.
\newblock Model tells you what to discard: Adaptive {KV} cache compression for {LLM}s.
\newblock \emph{ArXiv preprint}, abs/2310.01801, 2023.
\newblock URL \url{https://arxiv.org/abs/2310.01801}.

\bibitem[Cai et~al.(2024)Cai, Zhang, Gao, Liu, Liu, Lu, Xiong, Dong, Chang, Hu, et~al.]{zhang2024pyramidkv}
Zefan Cai, Yichi Zhang, Bofei Gao, Yuliang Liu, Tianyu Liu, Keming Lu, Wayne Xiong, Yue Dong, Baobao Chang, Junjie Hu, et~al.
\newblock {PyramidKV}: Dynamic {KV} cache compression based on pyramidal information funneling.
\newblock \emph{arXiv preprint arXiv:2406.02069}, 2024.

\bibitem[Fu et~al.(2024{\natexlab{a}})Fu, Cho, Merth, Mehta, Rastegari, and Najibi]{fu2024lazyllm}
Qichen Fu, Minsik Cho, Thomas Merth, Sachin Mehta, Mohammad Rastegari, and Mahyar Najibi.
\newblock Lazy{LLM}: Dynamic token pruning for efficient long context {LLM} inference.
\newblock In \emph{Workshop on Efficient Systems for Foundation Models II @ ICML2024}, 2024{\natexlab{a}}.
\newblock URL \url{https://openreview.net/forum?id=gGZD1dsJqZ}.

\bibitem[Yang et~al.(2024{\natexlab{a}})Yang, Han, Gao, Hu, Zhang, and Zhao]{yang2024pyramidinfer}
Dongjie Yang, Xiaodong Han, Yan Gao, Yao Hu, Shilin Zhang, and Hai Zhao.
\newblock {P}yramid{I}nfer: Pyramid {KV} cache compression for high-throughput {LLM} inference.
\newblock In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, \emph{Findings of the Association for Computational Linguistics ACL 2024}, pages 3258--3270, Bangkok, Thailand and virtual meeting, 2024{\natexlab{a}}. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.findings-acl.195}.
\newblock URL \url{https://aclanthology.org/2024.findings-acl.195}.

\bibitem[Liu et~al.(2024{\natexlab{b}})Liu, Desai, Liao, Wang, Xie, Xu, Kyrillidis, and Shrivastava]{liu2024scissorhands}
Zichang Liu, Aditya Desai, Fangshuo Liao, Weitao Wang, Victor Xie, Zhaozhuo Xu, Anastasios Kyrillidis, and Anshumali Shrivastava.
\newblock Scissorhands: Exploiting the persistence of importance hypothesis for {LLM} {KV} cache compression at test time.
\newblock \emph{Advances in Neural Information Processing Systems}, 36, 2024{\natexlab{b}}.

\bibitem[Tang et~al.(2024{\natexlab{a}})Tang, Zhao, Zhu, Xiao, Kasikci, and Han]{tang2024quest}
Jiaming Tang, Yilong Zhao, Kan Zhu, Guangxuan Xiao, Baris Kasikci, and Song Han.
\newblock Quest: Query-aware sparsity for efficient long-context {LLM} inference.
\newblock \emph{ArXiv preprint}, abs/2406.10774, 2024{\natexlab{a}}.
\newblock URL \url{https://arxiv.org/abs/2406.10774}.

\bibitem[Miller(1956)]{miller1956information}
George~A Miller.
\newblock Information and memory.
\newblock \emph{Scientific American}, 195\penalty0 (2):\penalty0 42--47, 1956.

\bibitem[Ramshaw and Marcus(1999)]{ramshaw1999text}
Lance~A Ramshaw and Mitchell~P Marcus.
\newblock Text chunking using transformation-based learning.
\newblock In \emph{Natural language processing using very large corpora}, pages 157--176. Springer, 1999.

\bibitem[Fang and Xie(2022)]{xie2022an}
Hongchao Fang and Pengtao Xie.
\newblock An end-to-end contrastive self-supervised learning framework for language understanding.
\newblock \emph{Transactions of the Association for Computational Linguistics}, 10:\penalty0 1324--1340, 2022.
\newblock \doi{10.1162/tacl_a_00521}.
\newblock URL \url{https://aclanthology.org/2022.tacl-1.76/}.

\bibitem[Bai et~al.(2024)Bai, Lv, Zhang, Lyu, Tang, Huang, Du, Liu, Zeng, Hou, Dong, Tang, and Li]{bai2023longbench}
Yushi Bai, Xin Lv, Jiajie Zhang, Hongchang Lyu, Jiankai Tang, Zhidian Huang, Zhengxiao Du, Xiao Liu, Aohan Zeng, Lei Hou, Yuxiao Dong, Jie Tang, and Juanzi Li.
\newblock {L}ong{B}ench: A bilingual, multitask benchmark for long context understanding.
\newblock In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, \emph{Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, pages 3119--3137, Bangkok, Thailand, August 2024. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.acl-long.172}.
\newblock URL \url{https://aclanthology.org/2024.acl-long.172}.

\bibitem[Kamradt(2023)]{needle}
Gregory Kamradt.
\newblock {Needle In A Haystack} -- pressure testing {LLM}s.
\newblock \emph{GitHub}, 2023.
\newblock URL \url{https://github.com/gkamradt/LLMTest_NeedleInAHaystack/tree/main}.

\bibitem[Cobbe et~al.(2021)Cobbe, Kosaraju, Bavarian, Chen, Jun, Kaiser, Plappert, Tworek, Hilton, Nakano, et~al.]{gsm8k}
Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et~al.
\newblock Training verifiers to solve math word problems.
\newblock \emph{ArXiv preprint}, abs/2110.14168, 2021.
\newblock URL \url{https://arxiv.org/abs/2110.14168}.

\bibitem[Luo et~al.(2024)Luo, Ma, Liu, Guo, and Xiao]{jailbreakv}
Weidi Luo, Siyuan Ma, Xiaogeng Liu, Xiaoyu Guo, and Chaowei Xiao.
\newblock {JailBreakV}: A benchmark for assessing the robustness of multimodal large language models against jailbreak attacks.
\newblock In \emph{First Conference on Language Modeling}, 2024.
\newblock URL \url{https://openreview.net/forum?id=GC4mXVfquq}.

\bibitem[Guo et~al.(2025)Guo, Yang, Zhang, Song, Zhang, Xu, Zhu, Ma, Wang, Bi, et~al.]{deepseekr1}
Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et~al.
\newblock {DeepSeek-R1}: Incentivizing reasoning capability in {LLM}s via reinforcement learning.
\newblock \emph{arXiv preprint arXiv:2501.12948}, 2025.

\bibitem[Meta(2024)]{meta2024llama3}
Meta.
\newblock Introducing {Meta} {Llama} 3: The most capable openly available {LLM} to date.
\newblock \url{https://ai.meta.com/blog/meta-llama-3/}, 2024.
\newblock Accessed: 2024-06-07.

\bibitem[Jiang et~al.(2023{\natexlab{a}})Jiang, Sablayrolles, Mensch, Bamford, Chaplot, de~las Casas, Bressand, Lengyel, Lample, Saulnier, Lavaud, Lachaux, Stock, Scao, Lavril, Wang, Lacroix, and Sayed]{jiang2023mistral7b}
Albert~Q. Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra~Singh Chaplot, Diego de~las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, L{\'e}lio~Renard Lavaud, Marie-Anne Lachaux, Pierre Stock, Teven~Le Scao, Thibaut Lavril, Thomas Wang, Timoth{\'e}e Lacroix, and William~El Sayed.
\newblock Mistral {7B}, 2023{\natexlab{a}}.
\newblock URL \url{https://arxiv.org/abs/2310.06825}.

\bibitem[Yang et~al.(2024{\natexlab{b}})Yang, Yang, Hui, Zheng, Yu, Zhou, Li, Li, Liu, Huang, et~al.]{qwen2}
An~Yang, Baosong Yang, Binyuan Hui, Bo~Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, et~al.
\newblock Qwen2 technical report.
\newblock \emph{ArXiv preprint}, abs/2407.10671, 2024{\natexlab{b}}.
\newblock URL \url{https://arxiv.org/abs/2407.10671}.

\bibitem[Han et~al.(2024)Han, Wang, Peng, Xiong, Chen, Ji, and Wang]{han2024lm}
Chi Han, Qifan Wang, Hao Peng, Wenhan Xiong, Yu~Chen, Heng Ji, and Sinong Wang.
\newblock {LM}-infinite: Zero-shot extreme length generalization for large language models.
\newblock In Kevin Duh, Helena Gomez, and Steven Bethard, editors, \emph{Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)}, pages 3991--4008, Mexico City, Mexico, 2024. Association for Computational Linguistics.
\newblock URL \url{https://aclanthology.org/2024.naacl-long.222}.

\bibitem[Liu et~al.(2025{\natexlab{a}})Liu, Chen, Hu, and Chu]{liu2025flowkv}
Xiang Liu, Hong Chen, Xuming Hu, and Xiaowen Chu.
\newblock Flow{KV}: Enhancing multi-turn conversational coherence in {LLM}s via isolated key-value cache management.
\newblock In \emph{First Workshop on Multi-Turn Interactions in Large Language Models}, 2025{\natexlab{a}}.
\newblock URL \url{https://openreview.net/forum?id=rZumU1owkr}.

\bibitem[Tang et~al.(2025{\natexlab{b}})Tang, Tang, Pan, Liu, Lai, Chu, and Li]{tang2025ghost}
Zichen Tang, Zhenheng Tang, Gaoning Pan, Buhua Liu, Kunfeng Lai, Xiaowen Chu, and Bo~Li.
\newblock Ghost in the cloud: Your geo-distributed large language models training is easily manipulated.
\newblock In \emph{ICML 2025 Workshop on Data in Generative Models - The Bad, the Ugly, and the Greats}, 2025{\natexlab{b}}.
\newblock URL \url{https://openreview.net/forum?id=dpDdqgfcTM}.

\bibitem[Wei et~al.(2025)Wei, Tang, Zeng, Liu, Zhang, Chu, and Han]{weifan2025jailbreaklora}
Fanjunduo Wei, Zhenheng Tang, Rongfei Zeng, Tongliang Liu, Chengqi Zhang, Xiaowen Chu, and Bo~Han.
\newblock Jailbreak{L}o{RA}: Your downloaded {L}o{RA} from sharing platforms might be unsafe.
\newblock In \emph{ICML 2025 Workshop on Data in Generative Models - The Bad, the Ugly, and the Greats}, 2025.
\newblock URL \url{https://openreview.net/forum?id=RjaeiNswGh}.

\bibitem[Wei et~al.(2022)Wei, Wang, Schuurmans, Bosma, Xia, Chi, Le, Zhou, et~al.]{wei2022chain}
Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Fei Xia, Ed~Chi, Quoc~V Le, Denny Zhou, et~al.
\newblock Chain-of-thought prompting elicits reasoning in large language models.
\newblock \emph{Advances in Neural Information Processing Systems}, 35:\penalty0 24824--24837, 2022.

\bibitem[Lai et~al.(2025)Lai, Tang, Pan, Dong, Liu, Chen, Shen, Li, and Chu]{lai2025mediatormemoryefficientllmmerging}
Kunfeng Lai, Zhenheng Tang, Xinglin Pan, Peijie Dong, Xiang Liu, Haolan Chen, Li~Shen, Bo~Li, and Xiaowen Chu.
\newblock Mediator: Memory-efficient {LLM} merging with less parameter conflicts and uncertainty based routing.
\newblock \emph{arXiv preprint arXiv:2502.04411}, 2025.

\bibitem[Diao et~al.(2024)Diao, Wang, Lin, Pan, Liu, and Zhang]{diao2023active}
Shizhe Diao, Pengcheng Wang, Yong Lin, Rui Pan, Xiang Liu, and Tong Zhang.
\newblock Active prompting with chain-of-thought for large language models.
\newblock In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, \emph{Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, pages 1330--1350, Bangkok, Thailand, August 2024. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.acl-long.73}.
\newblock URL \url{https://aclanthology.org/2024.acl-long.73}.

\bibitem[Pan et~al.(2024{\natexlab{a}})Pan, Xing, Diao, Sun, Liu, Shum, Zhang, Pi, and Zhang]{pan2023plum}
Rui Pan, Shuo Xing, Shizhe Diao, Wenhe Sun, Xiang Liu, KaShun Shum, Jipeng Zhang, Renjie Pi, and Tong Zhang.
\newblock Plum: Prompt learning using metaheuristics.
\newblock In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, \emph{Findings of the Association for Computational Linguistics ACL 2024}, pages 2177--2197, Bangkok, Thailand and virtual meeting, August 2024{\natexlab{a}}. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.findings-acl.129}.
\newblock URL \url{https://aclanthology.org/2024.findings-acl.129}.

\bibitem[Chowdhery et~al.(2022)Chowdhery, Narang, Devlin, Bosma, Mishra, Roberts, Barham, Chung, Sutton, Gehrmann, et~al.]{chowdhery2022palm}
Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung~Won Chung, Charles Sutton, Sebastian Gehrmann, et~al.
\newblock {PaLM}: Scaling language modeling with {Pathways}.
\newblock \emph{ArXiv preprint}, abs/2204.02311, 2022.
\newblock URL \url{https://arxiv.org/abs/2204.02311}.

\bibitem[Agarwal et~al.(2024)Agarwal, Singh, Zhang, Bohnet, Rosias, Chan, Zhang, Anand, Abbas, Nova, et~al.]{agarwal2024many}
Rishabh Agarwal, Avi Singh, Lei~M Zhang, Bernd Bohnet, Luis Rosias, Stephanie Chan, Biao Zhang, Ankesh Anand, Zaheer Abbas, Azade Nova, et~al.
\newblock Many-shot in-context learning.
\newblock \emph{arXiv preprint arXiv:2404.11018}, 2024.

\bibitem[Zheng et~al.(2023)Zheng, Chiang, Sheng, Zhuang, Wu, Zhuang, Lin, Li, Li, Xing, et~al.]{zheng2023judging}
Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi~Lin, Zhuohan Li, Dacheng Li, Eric Xing, et~al.
\newblock Judging {LLM}-as-a-judge with {MT-Bench} and {Chatbot Arena}.
\newblock \emph{Advances in Neural Information Processing Systems}, 36:\penalty0 46595--46623, 2023.

\bibitem[{OpenAI}(2023)]{openai2023gpt4omini}
{OpenAI}.
\newblock {GPT}-4o mini: Advancing cost-efficient intelligence, 2023.
\newblock Accessed: 2023-12-14.

\bibitem[Liu et~al.(2024{\natexlab{c}})Liu, Yuan, Jin, Zhong, Xu, Braverman, Chen, and Hu]{liu2024kivi}
Zirui Liu, Jiayi Yuan, Hongye Jin, Shaochen Zhong, Zhaozhuo Xu, Vladimir Braverman, Beidi Chen, and Xia Hu.
\newblock {KIVI}: A tuning-free asymmetric 2bit quantization for {KV} cache.
\newblock \emph{arXiv preprint arXiv:2402.02750}, 2024{\natexlab{c}}.

\bibitem[Chang et~al.(2024)Chang, Lin, Lin, Chen, Hu, Wang, Huang, Ceze, Abdelfattah, and Wu]{chang2024palu}
Chi-Chih Chang, Wei-Cheng Lin, Chien-Yu Lin, Chong-Yan Chen, Yu-Fang Hu, Pei-Shuo Wang, Ning-Chi Huang, Luis Ceze, Mohamed~S Abdelfattah, and Kai-Chiang Wu.
\newblock Palu: Compressing {KV}-cache with low-rank projection.
\newblock \emph{arXiv preprint arXiv:2407.21118}, 2024.

\bibitem[Fu et~al.(2024{\natexlab{b}})Fu, Bailis, Stoica, and Zhang]{fubreak}
Yichao Fu, Peter Bailis, Ion Stoica, and Hao Zhang.
\newblock Break the sequential dependency of {LLM} inference using lookahead decoding.
\newblock \emph{arXiv preprint arXiv:2402.02057}, 2024{\natexlab{b}}.

\bibitem[Zhou et~al.(2024)Zhou, Tao, Zhu, Luo, Wang, and Han]{zhoucan}
Zhanke Zhou, Rong Tao, Jianing Zhu, Yiwen Luo, Zengmao Wang, and Bo~Han.
\newblock Can language models perform robust reasoning in chain-of-thought prompting with noisy rationales?
\newblock In \emph{The Thirty-eighth Annual Conference on Neural Information Processing Systems}, 2024.

\bibitem[Steinwart(2007)]{Steinwart2007HowTC}
Ingo Steinwart.
\newblock How to compare different loss functions and their risks.
\newblock \emph{Constructive Approximation}, 26:\penalty0 225--287, 2007.
\newblock URL \url{https://api.semanticscholar.org/CorpusID:16660598}.

\bibitem[Pires and Szepesv{\'a}ri(2016)]{pires2016multiclass}
Bernardo~{\'A}vila Pires and Csaba Szepesv{\'a}ri.
\newblock Multiclass classification calibration functions.
\newblock \emph{arXiv preprint arXiv:1609.06385}, 2016.

\bibitem[Kleijn and van~der Vaart(2012)]{Kleijn2012TheBT}
B.~J.~K. Kleijn and A.~W. van~der Vaart.
\newblock The {B}ernstein-von-{M}ises theorem under misspecification.
\newblock \emph{Electronic Journal of Statistics}, 6:\penalty0 354--381, 2012.
\newblock URL \url{https://api.semanticscholar.org/CorpusID:85548207}.

\bibitem[Tjong Kim~Sang and Veenstra(1999)]{sang1999representing}
Erik~F. Tjong Kim~Sang and Jorn Veenstra.
\newblock Representing text chunks.
\newblock In Henry~S. Thompson and Alex Lascarides, editors, \emph{Ninth Conference of the {E}uropean Chapter of the Association for Computational Linguistics}, pages 173--179, Bergen, Norway, 1999. Association for Computational Linguistics.
\newblock URL \url{https://aclanthology.org/E99-1023}.

\bibitem[Shi et~al.(2024)Shi, Min, Lomeli, Zhou, Li, Lin, Smith, Zettlemoyer, Yih, and Lewis]{shicontext}
Weijia Shi, Sewon Min, Maria Lomeli, Chunting Zhou, Margaret Li, Xi~Victoria Lin, Noah~A Smith, Luke Zettlemoyer, Wen-tau Yih, and Mike Lewis.
\newblock In-context pretraining: Language modeling beyond document boundaries.
\newblock In \emph{The Twelfth International Conference on Learning Representations}, 2024.

\bibitem[Fei et~al.(2024)Fei, Niu, Zhou, Hou, Bai, Deng, and Han]{fei-etal-2024-extending}
Weizhi Fei, Xueyan Niu, Pingyi Zhou, Lu~Hou, Bo~Bai, Lei Deng, and Wei Han.
\newblock Extending context window of large language models via semantic compression.
\newblock In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, \emph{Findings of the Association for Computational Linguistics ACL 2024}, pages 5169--5181, Bangkok, Thailand and virtual meeting, August 2024. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2024.findings-acl.306}.
\newblock URL \url{https://aclanthology.org/2024.findings-acl.306}.

\bibitem[Yepes et~al.(2024)Yepes, You, Milczek, Laverde, and Li]{yepes2024financialreportchunkingeffective}
Antonio~Jimeno Yepes, Yao You, Jan Milczek, Sebastian Laverde, and Renyu Li.
\newblock Financial report chunking for effective retrieval augmented generation, 2024.
\newblock URL \url{https://arxiv.org/abs/2402.05131}.

\bibitem[Smith and Troynikov(2024)]{smith2024evaluating}
Brandon Smith and Anton Troynikov.
\newblock Evaluating chunking strategies for retrieval.
\newblock Technical report, Chroma, 2024.
\newblock URL \url{https://research.trychroma.com/evaluating-chunking}.

\bibitem[Anthropic(2024)]{anthropic_contextual_retrieval_2024}
Anthropic.
\newblock Introducing contextual retrieval, 2024.
\newblock URL \url{https://www.anthropic.com/news/contextual-retrieval}.

\bibitem[Pan et~al.(2024{\natexlab{b}})Pan, Liu, Diao, Pi, Zhang, Han, and Zhang]{pan2024lisa}
Rui Pan, Xiang Liu, Shizhe Diao, Renjie Pi, Jipeng Zhang, Chi Han, and Tong Zhang.
\newblock Lisa: Layerwise importance sampling for memory-efficient large language model fine-tuning.
\newblock \emph{ArXiv preprint}, abs/2403.17919, 2024{\natexlab{b}}.
\newblock URL \url{https://arxiv.org/abs/2403.17919}.

\bibitem[Hu et~al.(2022)Hu, Shen, Wallis, Allen{-}Zhu, Li, Wang, Wang, and Chen]{hu2021lora}
Edward~J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen{-}Zhu, Yuanzhi Li, Shean Wang, Lu~Wang, and Weizhu Chen.
\newblock {LoRA}: Low-rank adaptation of large language models.
\newblock In \emph{The Tenth International Conference on Learning Representations, {ICLR} 2022, Virtual Event, April 25-29, 2022}. OpenReview.net, 2022.
\newblock URL \url{https://openreview.net/forum?id=nZeVKeeFYf9}.

\bibitem[Tang et~al.(2024{\natexlab{b}})Tang, Kang, Yin, Pan, Wang, He, Wang, Zeng, Zhao, Shi, Zhou, Li, He, and Chu]{tang2024fusionllmdecentralizedllmtraining}
Zhenheng Tang, Xueze Kang, Yiming Yin, Xinglin Pan, Yuxin Wang, Xin He, Qiang Wang, Rongfei Zeng, Kaiyong Zhao, Shaohuai Shi, Amelie~Chi Zhou, Bo~Li, Bingsheng He, and Xiaowen Chu.
\newblock Fusion{LLM}: A decentralized {LLM} training system on geo-distributed {GPU}s with adaptive compression.
\newblock \emph{arXiv preprint arXiv:2410.12707}, 2024{\natexlab{b}}.

\bibitem[You et~al.(2020)You, Li, Reddi, Hseu, Kumar, Bhojanapalli, Song, Demmel, Keutzer, and Hsieh]{you2019lamb}
Yang You, Jing Li, Sashank~J. Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho{-}Jui Hsieh.
\newblock Large batch optimization for deep learning: Training {BERT} in 76 minutes.
\newblock In \emph{8th International Conference on Learning Representations, {ICLR} 2020, Addis Ababa, Ethiopia, April 26-30, 2020}. OpenReview.net, 2020.
\newblock URL \url{https://openreview.net/forum?id=Syx4wnEtvH}.

\bibitem[Chuang et~al.(2023)Chuang, Xie, Luo, Kim, Glass, and He]{chuang2023dola}
Yung-Sung Chuang, Yujia Xie, Hongyin Luo, Yoon Kim, James Glass, and Pengcheng He.
\newblock {DoLa}: Decoding by contrasting layers improves factuality in large language models.
\newblock \emph{ArXiv preprint}, abs/2309.03883, 2023.
\newblock URL \url{https://arxiv.org/abs/2309.03883}.

\bibitem[Wu and Tu(2024)]{wu2024layercondensedkvcacheefficient}
Haoyi Wu and Kewei Tu.
\newblock Layer-condensed {KV} cache for efficient inference of large language models, 2024.
\newblock URL \url{https://arxiv.org/abs/2405.10637}.

\bibitem[Sun et~al.(2024)Sun, Dong, Zhu, Huang, Wang, Ma, Zhang, Wang, and Wei]{sun2024yoco}
Yutao Sun, Li~Dong, Yi~Zhu, Shaohan Huang, Wenhui Wang, Shuming Ma, Quanlu Zhang, Jianyong Wang, and Furu Wei.
\newblock You only cache once: Decoder-decoder architectures for language models.
\newblock \emph{arXiv preprint arXiv:2405.05254}, 2024.

\bibitem[Brandon et~al.(2024)Brandon, Mishra, Nrusimha, Panda, and Kelly]{brandon2024reducing}
William Brandon, Mayank Mishra, Aniruddha Nrusimha, Rameswar Panda, and Jonathan~Ragan Kelly.
\newblock Reducing transformer key-value cache size with cross-layer attention.
\newblock \emph{arXiv preprint arXiv:2405.12981}, 2024.

\bibitem[Liu et~al.(2024{\natexlab{d}})Liu, Liu, Pan, He, Haffari, and Zhuang]{liu2024minicache}
Akide Liu, Jing Liu, Zizheng Pan, Yefei He, Gholamreza Haffari, and Bohan Zhuang.
\newblock {MiniCache}: {KV} cache compression in depth dimension for large language models.
\newblock \emph{arXiv preprint arXiv:2405.14366}, 2024{\natexlab{d}}.

\bibitem[Zhang et~al.(2024)Zhang, Chen, Hu, Xu, Chen, Hao, Han, Thai, Wang, Liu, et~al.]{zhang2024infty}
Xinrong Zhang, Yingfa Chen, Shengding Hu, Zihang Xu, Junhao Chen, Moo~Khai Hao, Xu~Han, Zhen~Leng Thai, Shuo Wang, Zhiyuan Liu, et~al.
\newblock $\infty$-bench: Extending long context evaluation beyond 100k tokens.
\newblock \emph{ArXiv preprint}, abs/2402.13718, 2024.
\newblock URL \url{https://arxiv.org/abs/2402.13718}.

\bibitem[Shaham et~al.(2023)Shaham, Ivgi, Efrat, Berant, and Levy]{shaham2023zeroscrolls}
Uri Shaham, Maor Ivgi, Avia Efrat, Jonathan Berant, and Omer Levy.
\newblock {Z}ero{SCROLLS}: A zero-shot benchmark for long text understanding.
\newblock In Houda Bouamor, Juan Pino, and Kalika Bali, editors, \emph{Findings of the Association for Computational Linguistics: EMNLP 2023}, pages 7977--7989, Singapore, 2023. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2023.findings-emnlp.536}.
\newblock URL \url{https://aclanthology.org/2023.findings-emnlp.536}.

\bibitem[An et~al.(2023)An, Gong, Zhong, Li, Zhang, Kong, and Qiu]{an2023eval}
Chenxin An, Shansan Gong, Ming Zhong, Mukai Li, Jun Zhang, Lingpeng Kong, and Xipeng Qiu.
\newblock {L-Eval}: Instituting standardized evaluation for long context language models.
\newblock \emph{ArXiv preprint}, abs/2307.11088, 2023.
\newblock URL \url{https://arxiv.org/abs/2307.11088}.

\bibitem[Mohtashami and Jaggi(2023)]{mohtashami2023landmark}
Amirkeivan Mohtashami and Martin Jaggi.
\newblock Landmark attention: Random-access infinite context length for transformers.
\newblock \emph{ArXiv preprint}, abs/2305.16300, 2023.
\newblock URL \url{https://arxiv.org/abs/2305.16300}.

\bibitem[Li et~al.(2023)Li, Shao, et~al.]{longchat}
Dacheng Li, Rulin Shao, et~al.
\newblock How long can open-source {LLMs} truly promise on context length?, 2023.
\newblock URL \url{https://lmsys.org/blog/2023-06-29-longchat}.

\bibitem[Liu et~al.(2024{\natexlab{e}})Liu, Lin, Hewitt, Paranjape, Bevilacqua, Petroni, and Liang]{liu2024lost}
Nelson~F. Liu, Kevin Lin, John Hewitt, Ashwin Paranjape, Michele Bevilacqua, Fabio Petroni, and Percy Liang.
\newblock Lost in the middle: How language models use long contexts.
\newblock \emph{Transactions of the Association for Computational Linguistics}, 12:\penalty0 157--173, 2024{\natexlab{e}}.
\newblock \doi{10.1162/tacl_a_00638}.
\newblock URL \url{https://aclanthology.org/2024.tacl-1.9}.

\bibitem[Hsieh et~al.(2024)Hsieh, Sun, Kriman, Acharya, Rekesh, Jia, Zhang, and Ginsburg]{hsieh2024ruler}
Cheng-Ping Hsieh, Simeng Sun, Samuel Kriman, Shantanu Acharya, Dima Rekesh, Fei Jia, Yang Zhang, and Boris Ginsburg.
\newblock {RULER}: What's the real context size of your long-context language models?
\newblock \emph{ArXiv preprint}, abs/2404.06654, 2024.
\newblock URL \url{https://arxiv.org/abs/2404.06654}.

\bibitem[Tay et~al.(2021)Tay, Dehghani, Abnar, Shen, Bahri, Pham, Rao, Yang, Ruder, and Metzler]{tay2020long}
Yi~Tay, Mostafa Dehghani, Samira Abnar, Yikang Shen, Dara Bahri, Philip Pham, Jinfeng Rao, Liu Yang, Sebastian Ruder, and Donald Metzler.
\newblock Long range arena: {A} benchmark for efficient transformers.
\newblock In \emph{9th International Conference on Learning Representations, {ICLR} 2021, Virtual Event, Austria, May 3-7, 2021}. OpenReview.net, 2021.
\newblock URL \url{https://openreview.net/forum?id=qVyeW-grC2k}.

\bibitem[Liu et~al.(2025{\natexlab{b}})Liu, Tang, Chen, Dong, Li, Zhou, Li, Hu, and Chu]{liu2025can}
Xiang Liu, Zhenheng Tang, Hong Chen, Peijie Dong, Zeyu Li, Xiuze Zhou, Bo~Li, Xuming Hu, and Xiaowen Chu.
\newblock Can {LLM}s maintain fundamental abilities under {KV} cache compression?
\newblock \emph{arXiv preprint arXiv:2502.01941}, 2025{\natexlab{b}}.

\bibitem[Liu et~al.(2024{\natexlab{f}})Liu, Dong, Hu, and Chu]{liu2024longgenbench}
Xiang Liu, Peijie Dong, Xuming Hu, and Xiaowen Chu.
\newblock {LongGenBench}: Long-context generation benchmark.
\newblock \emph{arXiv preprint arXiv:2410.04199}, 2024{\natexlab{f}}.

\bibitem[Wingate et~al.(2022)Wingate, Shoeybi, and Sorensen]{wingate-etal-2022-prompt}
David Wingate, Mohammad Shoeybi, and Taylor Sorensen.
\newblock Prompt compression and contrastive conditioning for controllability and toxicity reduction in language models.
\newblock In \emph{Findings of the Association for Computational Linguistics: EMNLP 2022}, pages 5621--5634, Abu Dhabi, United Arab Emirates, December 2022. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2022.findings-emnlp.412}.
\newblock URL \url{https://aclanthology.org/2022.findings-emnlp.412}.

\bibitem[Chevalier et~al.(2023)Chevalier, Wettig, Ajith, and Chen]{Chevalier2023AdaptingLM}
Alexis Chevalier, Alexander Wettig, Anirudh Ajith, and Danqi Chen.
\newblock Adapting language models to compress contexts.
\newblock In Houda Bouamor, Juan Pino, and Kalika Bali, editors, \emph{Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, pages 3829--3846, Singapore, December 2023. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2023.emnlp-main.232}.
\newblock URL \url{https://aclanthology.org/2023.emnlp-main.232}.

\bibitem[Zhou et~al.(2023)Zhou, Jiang, Cui, Wang, Xiao, Hou, Cotterell, and Sachan]{zhou2023recurrentgpt}
Wangchunshu Zhou, Yuchen~Eleanor Jiang, Peng Cui, Tiannan Wang, Zhenxin Xiao, Yifan Hou, Ryan Cotterell, and Mrinmaya Sachan.
\newblock {RecurrentGPT}: Interactive generation of (arbitrarily) long text, 2023.

\bibitem[Wang et~al.(2023)Wang, Ding, Cao, Tian, Wang, Tao, and Guo]{wang2023recursively}
Qingyue Wang, Liang Ding, Yanan Cao, Zhiliang Tian, Shi Wang, Dacheng Tao, and Li~Guo.
\newblock Recursively summarizing enables long-term dialogue memory in large language models.
\newblock \emph{arXiv preprint arXiv:2308.15022}, 2023.

\bibitem[Jiang et~al.(2023{\natexlab{b}})Jiang, Wu, Lin, Yang, and Qiu]{jiang-etal-2023-llmlingua}
Huiqiang Jiang, Qianhui Wu, Chin-Yew Lin, Yuqing Yang, and Lili Qiu.
\newblock {LLML}ingua: Compressing prompts for accelerated inference of large language models.
\newblock In Houda Bouamor, Juan Pino, and Kalika Bali, editors, \emph{Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing}, pages 13358--13376, Singapore, December 2023{\natexlab{b}}. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2023.emnlp-main.825}.
\newblock URL \url{https://aclanthology.org/2023.emnlp-main.825}.

\bibitem[Jiang et~al.(2024)Jiang, Wu, Luo, Li, Lin, Yang, and Qiu]{jiang-etal-2024-longllmlingua}
Huiqiang Jiang, Qianhui Wu, Xufang Luo, Dongsheng Li, Chin-Yew Lin, Yuqing Yang, and Lili Qiu.
\newblock {L}ong{LLML}ingua: Accelerating and enhancing {LLM}s in long context scenarios via prompt compression.
\newblock In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, \emph{Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, pages 1658--1677, Bangkok, Thailand, August 2024. Association for Computational Linguistics.
\newblock URL \url{https://aclanthology.org/2024.acl-long.91}.

\bibitem[Xiao et~al.(2023)Xiao, Lin, Seznec, Wu, Demouth, and Han]{xiao2023smoothquant}
Guangxuan Xiao, Ji~Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han.
\newblock {SmoothQuant}: Accurate and efficient post-training quantization for large language models.
\newblock In \emph{International Conference on Machine Learning}, pages 38087--38099. PMLR, 2023.

\bibitem[Zhao et~al.(2024)Zhao, Lin, Zhu, Ye, Chen, Zheng, Ceze, Krishnamurthy, Chen, and Kasikci]{zhao2024atom}
Yilong Zhao, Chien-Yu Lin, Kan Zhu, Zihao Ye, Lequn Chen, Size Zheng, Luis Ceze, Arvind Krishnamurthy, Tianqi Chen, and Baris Kasikci.
\newblock Atom: Low-bit quantization for efficient and accurate {LLM} serving.
\newblock \emph{Proceedings of Machine Learning and Systems}, 6:\penalty0 196--209, 2024.

\bibitem[Sheng et~al.(2023)Sheng, Zheng, Yuan, Li, Ryabinin, Chen, Liang, R{\'e}, Stoica, and Zhang]{flexgen}
Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R{\'e}, Ion Stoica, and Ce~Zhang.
\newblock {FlexGen}: High-throughput generative inference of large language models with a single {GPU}.
\newblock In \emph{International Conference on Machine Learning}, pages 31094--31116. PMLR, 2023.

\bibitem[Li et~al.(2025)Li, Xiao, Wang, Liu, Tang, Lu, Yang, Chen, and Chu]{li2025antkv}
Zeyu Li, Chuanfu Xiao, Yang Wang, Xiang Liu, Zhenheng Tang, Baotong Lu, Mao Yang, Xinyu Chen, and Xiaowen Chu.
\newblock {AntKV}: Anchor token-aware sub-bit vector quantization for {KV} cache in large language models, 2025.
\newblock URL \url{https://arxiv.org/abs/2506.19505}.

\bibitem[Ko{\v{c}}isk{\'y} et~al.(2018)Ko{\v{c}}isk{\'y}, Schwarz, Blunsom, Dyer, Hermann, Melis, and Grefenstette]{kovcisky2018narrativeqa}
Tom{\'a}{\v{s}} Ko{\v{c}}isk{\'y}, Jonathan Schwarz, Phil Blunsom, Chris Dyer, Karl~Moritz Hermann, G{\'a}bor Melis, and Edward Grefenstette.
\newblock The {N}arrative{QA} reading comprehension challenge.
\newblock \emph{Transactions of the Association for Computational Linguistics}, 6:\penalty0 317--328, 2018.
\newblock \doi{10.1162/tacl_a_00023}.
\newblock URL \url{https://aclanthology.org/Q18-1023}.

\bibitem[Dasigi et~al.(2021)Dasigi, Lo, Beltagy, Cohan, Smith, and Gardner]{dasigi2021dataset}
Pradeep Dasigi, Kyle Lo, Iz~Beltagy, Arman Cohan, Noah~A. Smith, and Matt Gardner.
\newblock A dataset of information-seeking questions and answers anchored in research papers.
\newblock In Kristina Toutanova, Anna Rumshisky, Luke Zettlemoyer, Dilek Hakkani-Tur, Iz~Beltagy, Steven Bethard, Ryan Cotterell, Tanmoy Chakraborty, and Yichao Zhou, editors, \emph{Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, pages 4599--4610, Online, 2021. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2021.naacl-main.365}.
\newblock URL \url{https://aclanthology.org/2021.naacl-main.365}.

\bibitem[Yang et~al.(2018)Yang, Qi, Zhang, Bengio, Cohen, Salakhutdinov, and Manning]{yang2018hotpotqa}
Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, William Cohen, Ruslan Salakhutdinov, and Christopher~D. Manning.
\newblock {H}otpot{QA}: A dataset for diverse, explainable multi-hop question answering.
\newblock In Ellen Riloff, David Chiang, Julia Hockenmaier, and Jun{'}ichi Tsujii, editors, \emph{Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, pages 2369--2380, Brussels, Belgium, 2018. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/D18-1259}.
\newblock URL \url{https://aclanthology.org/D18-1259}.

\bibitem[Ho et~al.(2020)Ho, Duong~Nguyen, Sugawara, and Aizawa]{ho2020constructing}
Xanh Ho, Anh-Khoa Duong~Nguyen, Saku Sugawara, and Akiko Aizawa.
\newblock Constructing a multi-hop {QA} dataset for comprehensive evaluation of reasoning steps.
\newblock In Donia Scott, Nuria Bel, and Chengqing Zong, editors, \emph{Proceedings of the 28th International Conference on Computational Linguistics}, pages 6609--6625, Barcelona, Spain (Online), 2020. International Committee on Computational Linguistics.
\newblock \doi{10.18653/v1/2020.coling-main.580}.
\newblock URL \url{https://aclanthology.org/2020.coling-main.580}.

\bibitem[Trivedi et~al.(2022)Trivedi, Balasubramanian, Khot, and Sabharwal]{trivedi2022musique}
Harsh Trivedi, Niranjan Balasubramanian, Tushar Khot, and Ashish Sabharwal.
\newblock {M}u{S}i{Q}ue: Multihop questions via single-hop question composition.
\newblock \emph{Transactions of the Association for Computational Linguistics}, 10:\penalty0 539--554, 2022.
\newblock \doi{10.1162/tacl_a_00475}.
\newblock URL \url{https://aclanthology.org/2022.tacl-1.31}.

\bibitem[He et~al.(2018)He, Liu, Liu, Lyu, Zhao, Xiao, Liu, Wang, Wu, She, Liu, Wu, and Wang]{he2017dureader}
Wei He, Kai Liu, Jing Liu, Yajuan Lyu, Shiqi Zhao, Xinyan Xiao, Yuan Liu, Yizhong Wang, Hua Wu, Qiaoqiao She, Xuan Liu, Tian Wu, and Haifeng Wang.
\newblock {D}u{R}eader: a {C}hinese machine reading comprehension dataset from real-world applications.
\newblock In Eunsol Choi, Minjoon Seo, Danqi Chen, Robin Jia, and Jonathan Berant, editors, \emph{Proceedings of the Workshop on Machine Reading for Question Answering}, pages 37--46, Melbourne, Australia, 2018. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/W18-2605}.
\newblock URL \url{https://aclanthology.org/W18-2605}.

\bibitem[Huang et~al.(2021)Huang, Cao, Parulian, Ji, and Wang]{huang2021efficient}
Luyang Huang, Shuyang Cao, Nikolaus Parulian, Heng Ji, and Lu~Wang.
\newblock Efficient attentions for long document summarization.
\newblock In Kristina Toutanova, Anna Rumshisky, Luke Zettlemoyer, Dilek Hakkani-Tur, Iz~Beltagy, Steven Bethard, Ryan Cotterell, Tanmoy Chakraborty, and Yichao Zhou, editors, \emph{Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, pages 1419--1436, Online, 2021. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2021.naacl-main.112}.
\newblock URL \url{https://aclanthology.org/2021.naacl-main.112}.

\bibitem[Zhong et~al.(2021)Zhong, Yin, Yu, Zaidi, Mutuma, Jha, Awadallah, Celikyilmaz, Liu, Qiu, and Radev]{zhong2021qmsum}
Ming Zhong, Da~Yin, Tao Yu, Ahmad Zaidi, Mutethia Mutuma, Rahul Jha, Ahmed~Hassan Awadallah, Asli Celikyilmaz, Yang Liu, Xipeng Qiu, and Dragomir Radev.
\newblock {QMS}um: A new benchmark for query-based multi-domain meeting summarization.
\newblock In Kristina Toutanova, Anna Rumshisky, Luke Zettlemoyer, Dilek Hakkani-Tur, Iz~Beltagy, Steven Bethard, Ryan Cotterell, Tanmoy Chakraborty, and Yichao Zhou, editors, \emph{Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, pages 5905--5921, Online, 2021. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2021.naacl-main.472}.
\newblock URL \url{https://aclanthology.org/2021.naacl-main.472}.

\bibitem[Fabbri et~al.(2019)Fabbri, Li, She, Li, and Radev]{fabbri2019multi}
Alexander Fabbri, Irene Li, Tianwei She, Suyi Li, and Dragomir Radev.
\newblock Multi-{N}ews: A large-scale multi-document summarization dataset and abstractive hierarchical model.
\newblock In Anna Korhonen, David Traum, and Llu{\'\i}s M{\`a}rquez, editors, \emph{Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, pages 1074--1084, Florence, Italy, 2019. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/P19-1102}.
\newblock URL \url{https://aclanthology.org/P19-1102}.

\bibitem[Wu et~al.(2023)Wu, Zhan, Tan, Hou, Liang, and Song]{wu2023vcsum}
Han Wu, Mingjie Zhan, Haochen Tan, Zhaohui Hou, Ding Liang, and Linqi Song.
\newblock {VCSUM}: A versatile {C}hinese meeting summarization dataset.
\newblock In Anna Rogers, Jordan Boyd-Graber, and Naoaki Okazaki, editors, \emph{Findings of the Association for Computational Linguistics: ACL 2023}, pages 6065--6079, Toronto, Canada, 2023. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/2023.findings-acl.377}.
\newblock URL \url{https://aclanthology.org/2023.findings-acl.377}.

\bibitem[Li and Roth(2002)]{li2002learning}
Xin Li and Dan Roth.
\newblock Learning question classifiers.
\newblock In \emph{{COLING} 2002: The 19th International Conference on Computational Linguistics}, 2002.
\newblock URL \url{https://aclanthology.org/C02-1150}.

\bibitem[Gliwa et~al.(2019)Gliwa, Mochol, Biesek, and Wawer]{gliwa2019samsum}
Bogdan Gliwa, Iwona Mochol, Maciej Biesek, and Aleksander Wawer.
\newblock {SAMS}um corpus: A human-annotated dialogue dataset for abstractive summarization.
\newblock In Lu~Wang, Jackie Chi~Kit Cheung, Giuseppe Carenini, and Fei Liu, editors, \emph{Proceedings of the 2nd Workshop on New Frontiers in Summarization}, pages 70--79, Hong Kong, China, 2019. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/D19-5409}.
\newblock URL \url{https://aclanthology.org/D19-5409}.

\bibitem[Joshi et~al.(2017)Joshi, Choi, Weld, and Zettlemoyer]{joshi2017triviaqa}
Mandar Joshi, Eunsol Choi, Daniel Weld, and Luke Zettlemoyer.
\newblock {T}rivia{QA}: A large scale distantly supervised challenge dataset for reading comprehension.
\newblock In Regina Barzilay and Min-Yen Kan, editors, \emph{Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, pages 1601--1611, Vancouver, Canada, 2017. Association for Computational Linguistics.
\newblock \doi{10.18653/v1/P17-1147}.
\newblock URL \url{https://aclanthology.org/P17-1147}.

\bibitem[Guo et~al.(2023)Guo, Xu, Duan, Yin, and McAuley]{guo2023longcoder}
Daya Guo, Canwen Xu, Nan Duan, Jian Yin, and Julian~J. McAuley.
\newblock {LongCoder}: {A} long-range pre-trained language model for code completion.
\newblock In Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett, editors, \emph{International Conference on Machine Learning, {ICML} 2023, 23-29 July 2023, Honolulu, Hawaii, {USA}}, volume 202 of \emph{Proceedings of Machine Learning Research}, pages 12098--12107. {PMLR}, 2023.
\newblock URL \url{https://proceedings.mlr.press/v202/guo23j.html}.

\bibitem[Liu et~al.(2024{\natexlab{g}})Liu, Xu, and McAuley]{liu2023repobench}
Tianyang Liu, Canwen Xu, and Julian McAuley.
\newblock {RepoBench}: Benchmarking repository-level code auto-completion systems.
\newblock In \emph{The Twelfth International Conference on Learning Representations}, 2024{\natexlab{g}}.
\newblock URL \url{https://openreview.net/forum?id=pPjZIOuQuF}.

\end{thebibliography}
