% FlowKV: workshop paper on isolated key-value cache management for multi-turn conversations.
@inproceedings{liu2025flowkv,
  author    = {Xiang Liu and Hong Chen and Xuming Hu and Xiaowen Chu},
  title     = {Flow{KV}: Enhancing Multi-Turn Conversational Coherence in {LLM}s via Isolated Key-Value Cache Management},
  booktitle = {First Workshop on Multi-Turn Interactions in Large Language Models},
  year      = {2025},
  url       = {https://openreview.net/forum?id=rZumU1owkr},
}

% Palu: KV-cache compression with low-rank projection (arXiv preprint).
% The KV acronym is braced so sentence-casing styles keep it upper-case.
@article{chang2024palu,
  author  = {Chang, Chi-Chih and Lin, Wei-Cheng and Lin, Chien-Yu and Chen, Chong-Yan and Hu, Yu-Fang and Wang, Pei-Shuo and Huang, Ning-Chi and Ceze, Luis and Abdelfattah, Mohamed S and Wu, Kai-Chiang},
  title   = {Palu: Compressing {KV}-Cache with Low-Rank Projection},
  journal = {arXiv preprint arXiv:2407.21118},
  year    = {2024},
}

% LongGenBench: long-context generation benchmark (arXiv preprint).
% The camel-case benchmark name is braced so styles cannot flatten it.
@article{liu2024longgenbench,
  author  = {Liu, Xiang and Dong, Peijie and Hu, Xuming and Chu, Xiaowen},
  title   = {{LongGenBench}: Long-Context Generation Benchmark},
  journal = {arXiv preprint arXiv:2410.04199},
  year    = {2024},
}

% AnTKV: anchor-token-aware sub-bit vector quantization of the KV cache (arXiv 2506.19505).
% Method name and KV acronym are braced against style-driven down-casing.
@misc{li2025antkv,
  author        = {Zeyu Li and Chuanfu Xiao and Yang Wang and Xiang Liu and Zhenheng Tang and Baotong Lu and Mao Yang and Xinyu Chen and Xiaowen Chu},
  title         = {{AnTKV}: Anchor Token-Aware Sub-Bit Vector Quantization for {KV} Cache in Large Language Models},
  year          = {2025},
  eprint        = {2506.19505},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2506.19505},
}

% Study of LLM abilities under KV cache compression (arXiv preprint).
% Acronyms are braced so sentence-casing styles leave them intact.
@article{liu2025can,
  author  = {Liu, Xiang and Tang, Zhenheng and Chen, Hong and Dong, Peijie and Li, Zeyu and Zhou, Xiuze and Li, Bo and Hu, Xuming and Chu, Xiaowen},
  title   = {Can {LLMs} Maintain Fundamental Abilities under {KV} Cache Compression?},
  journal = {arXiv preprint arXiv:2502.01941},
  year    = {2025},
}


% OracleKV: question-independent KV cache compression (ICML 2025 workshop paper).
@inproceedings{zhu2025oraclekv,
  author    = {Yuanbing Zhu and Zhenheng Tang and Xiang Liu and Ang Li and Bo Li and Xiaowen Chu and Bo Han},
  title     = {Oracle{KV}: Oracle Guidance for Question-Independent {KV} Cache Compression},
  booktitle = {ICML 2025 Workshop on Long-Context Foundation Models},
  year      = {2025},
  url       = {https://openreview.net/forum?id=KHM2YOGgX9},
}

% JailbreakLoRA: safety of LoRA adapters downloaded from sharing platforms (ICML 2025 workshop paper).
% Whole words are braced (not just "RA") so the capital "L" in LoRA also survives sentence casing.
@inproceedings{weifan2025jailbreaklora,
  author    = {Fanjunduo Wei and Zhenheng Tang and Rongfei Zeng and Tongliang Liu and Chengqi Zhang and Xiaowen Chu and Bo Han},
  title     = {{JailbreakLoRA}: Your Downloaded {LoRA} from Sharing Platforms might be Unsafe},
  booktitle = {ICML 2025 Workshop on Data in Generative Models - The Bad, the Ugly, and the Greats},
  year      = {2025},
  url       = {https://openreview.net/forum?id=RjaeiNswGh},
}


% Ghost in the Cloud: manipulation of geo-distributed LLM training (ICML 2025 workshop paper).
@inproceedings{tang2025ghost,
  author    = {Zichen Tang and Zhenheng Tang and Gaoning Pan and Buhua Liu and Kunfeng Lai and Xiaowen Chu and Bo Li},
  title     = {Ghost in the Cloud: Your Geo-Distributed Large Language Models Training is Easily Manipulated},
  booktitle = {ICML 2025 Workshop on Data in Generative Models - The Bad, the Ugly, and the Greats},
  year      = {2025},
  url       = {https://openreview.net/forum?id=dpDdqgfcTM},
}

% AgentTaxo: token distribution of LLM multi-agent systems (ICLR 2025 workshop paper).
% System name and LLM acronym are braced against style-driven down-casing.
@inproceedings{wang2025agenttaxo,
  author    = {Wang, Qian and Tang, Zhenheng and Jiang, Zichen and Chen, Nuo and Wang, Tianyu and He, Bingsheng},
  title     = {{AgentTaxo}: Dissecting and Benchmarking Token Distribution of {LLM} Multi-Agent Systems},
  booktitle = {ICLR 2025 Workshop on Foundation Models in the Wild},
  year      = {2025},
}

% MegaAgent: large-scale autonomous LLM-based multi-agent system (ACL 2025).
% System name and acronyms are braced against style-driven down-casing.
@inproceedings{wang2025all,
  author    = {Wang, Qian and Wang, Tianyu and Tang, Zhenheng and Li, Qinbin and Chen, Nuo and Liang, Jingsheng and He, Bingsheng},
  title     = {{MegaAgent}: A Large-Scale Autonomous {LLM}-based Multi-Agent System Without Predefined {SOPs}},
  booktitle = {The 63rd Annual Meeting of the Association for Computational Linguistics},
  year      = {2025},
}


% Mediator: memory-efficient LLM merging with uncertainty-based routing (arXiv preprint).
% "arXiv" is spelled with its canonical capitalisation; the LLM acronym is braced.
@article{lai2025mediatormemoryefficientllmmerging,
  author  = {Kunfeng Lai and Zhenheng Tang and Xinglin Pan and Peijie Dong and Xiang Liu and Haolan Chen and Li Shen and Bo Li and Xiaowen Chu},
  title   = {Mediator: Memory-efficient {LLM} Merging with Less Parameter Conflicts and Uncertainty Based Routing},
  journal = {arXiv preprint arXiv:2502.04411},
  year    = {2025},
}


% The Lottery LLM Hypothesis (ICLR 2025 Blogpost Track).
@inproceedings{tang2025the,
  author    = {Zhenheng Tang and Xiang Liu and Qian Wang and Peijie Dong and Bingsheng He and Xiaowen Chu and Bo Li},
  title     = {The Lottery {LLM} Hypothesis, Rethinking What Abilities Should {LLM} Compression Preserve?},
  booktitle = {The Fourth Blogpost Track at ICLR 2025},
  year      = {2025},
}

% FusionLLM: decentralized LLM training on geo-distributed GPUs (arXiv preprint).
% "arXiv" is spelled with its canonical capitalisation; system name and acronyms are braced.
@article{tang2024fusionllmdecentralizedllmtraining,
  author  = {Zhenheng Tang and Xueze Kang and Yiming Yin and Xinglin Pan and Yuxin Wang and Xin He and Qiang Wang and Rongfei Zeng and Kaiyong Zhao and Shaohuai Shi and Amelie Chi Zhou and Bo Li and Bingsheng He and Xiaowen Chu},
  title   = {{FusionLLM}: A Decentralized {LLM} Training System on Geo-distributed {GPUs} with Adaptive Compression},
  journal = {arXiv preprint arXiv:2410.12707},
  year    = {2024},
}


% Ramshaw and Marcus: text chunking with transformation-based learning (book chapter, Springer).
@incollection{ramshaw1999text,
  author    = {Ramshaw, Lance A and Marcus, Mitchell P},
  title     = {Text chunking using transformation-based learning},
  booktitle = {Natural language processing using very large corpora},
  publisher = {Springer},
  year      = {1999},
  pages     = {157--176},
}

% Miller: "Information and memory", Scientific American 195(2).
@article{miller1956information,
  author    = {Miller, George A},
  title     = {Information and memory},
  journal   = {Scientific American},
  volume    = {195},
  number    = {2},
  pages     = {42--47},
  publisher = {JSTOR},
  year      = {1956},
}

% Keyformer: KV cache reduction via key-token selection (MLSys, vol. 6).
% The KV acronym is upper-cased and braced so styles cannot render it as "Kv"/"kv".
@article{adnan2024keyformer,
  author  = {Adnan, Muhammad and Arunkumar, Akhil and Jain, Gaurav and Nair, Prashant and Soloveychik, Ilya and Kamath, Purushotham},
  title   = {Keyformer: {KV} Cache Reduction through Key Tokens Selection for Efficient Generative Inference},
  journal = {Proceedings of Machine Learning and Systems},
  volume  = {6},
  pages   = {114--127},
  year    = {2024},
}

% Many-shot in-context learning (arXiv preprint).
@article{agarwal2024many,
  author  = {Agarwal, Rishabh and Singh, Avi and Zhang, Lei M and Bohnet, Bernd and Rosias, Luis and Chan, Stephanie and Zhang, Biao and Anand, Ankesh and Abbas, Zaheer and Nova, Azade and others},
  title   = {Many-shot in-context learning},
  year    = {2024},
  journal = {arXiv preprint arXiv:2404.11018},
}

% L-Eval: standardized evaluation for long-context language models (arXiv preprint).
% The benchmark name is braced so its capital "E" survives sentence casing.
@article{an2023eval,
  author  = {An, Chenxin and Gong, Shansan and Zhong, Ming and Li, Mukai and Zhang, Jun and Kong, Lingpeng and Qiu, Xipeng},
  title   = {{L-Eval}: Instituting Standardized Evaluation for Long Context Language Models},
  journal = {ArXiv preprint},
  volume  = {abs/2307.11088},
  year    = {2023},
  url     = {https://arxiv.org/abs/2307.11088},
}

% Anthropic web post introducing Contextual Retrieval.
@misc{anthropic_contextual_retrieval_2024,
  title  = {Introducing Contextual Retrieval},
  author = {Anthropic},
  year   = {2024},
  url    = {https://www.anthropic.com/news/contextual-retrieval},
}

% LongBench: bilingual, multitask long-context benchmark (ACL 2024, long papers).
@inproceedings{bai2023longbench,
  author    = {Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and Tang, Jiankai and Huang, Zhidian and Du, Zhengxiao and Liu, Xiao and Zeng, Aohan and Hou, Lei and Dong, Yuxiao and Tang, Jie and Li, Juanzi},
  title     = {{L}ong{B}ench: A Bilingual, Multitask Benchmark for Long Context Understanding},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  pages     = {3119--3137},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2024.acl-long.172},
  url       = {https://aclanthology.org/2024.acl-long.172},
  abstract  = {Although large language models (LLMs) demonstrate impressive performance for many language tasks, most of them can only handle texts a few thousand tokens long, limiting their applications on longer sequence inputs, such as books, reports, and codebases. Recent works have proposed methods to improve LLMs{'} long context capabilities by extending context windows and more sophisticated memory mechanisms. However, comprehensive benchmarks tailored for evaluating long context understanding are lacking. In this paper, we introduce LongBench, the first bilingual, multi-task benchmark for long context understanding, enabling a more rigorous evaluation of long context understanding. LongBench comprises 21 datasets across 6 task categories in both English and Chinese, with an average length of 6,711 words (English) and 13,386 characters (Chinese). These tasks cover key long-text application areas including single-doc QA, multi-doc QA, summarization, few-shot learning, synthetic tasks, and code completion. All datasets in LongBench are standardized into a unified format, allowing for effortless automatic evaluation of LLMs. Upon comprehensive evaluation of 8 LLMs on LongBench, we find that: (1) Commercial model (GPT-3.5-Turbo-16k) outperforms other open-sourced models, but still struggles on longer contexts. (2) Scaled position embedding and fine-tuning on longer sequences lead to substantial improvement on long context understanding. (3) Context compression technique such as retrieval brings improvement for model with weak ability on long contexts, but the performance still lags behind models that have strong long context understanding capability.},
}

% Bengio and LeCun: chapter in "Large Scale Kernel Machines" (MIT Press).
@incollection{Bengio+chapter2007,
  author    = {Bengio, Yoshua and LeCun, Yann},
  title     = {Scaling Learning Algorithms Towards {AI}},
  booktitle = {Large Scale Kernel Machines},
  publisher = {MIT Press},
  year      = {2007},
}

% Cross-Layer Attention for reducing Transformer KV cache size (arXiv preprint).
% The last author's compound surname is written in "Last, First" form so BibTeX
% does not parse "Ragan" as a given name.
@article{brandon2024reducing,
  author  = {Brandon, William and Mishra, Mayank and Nrusimha, Aniruddha and Panda, Rameswar and Ragan-Kelley, Jonathan},
  title   = {Reducing Transformer Key-Value Cache Size with Cross-Layer Attention},
  journal = {arXiv preprint arXiv:2405.12981},
  year    = {2024},
}
% Brown et al.: "Language models are few-shot learners" (NeurIPS, vol. 33).
@article{brown2020language,
  author  = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title   = {Language models are few-shot learners},
  year    = {2020},
  journal = {Advances in neural information processing systems},
  volume  = {33},
  pages   = {1877--1901},
}


% Positional interpolation for extending LLM context windows (arXiv preprint).
@article{chen2023extending,
  author  = {Chen, Shouyuan and Wong, Sherman and Chen, Liangjian and Tian, Yuandong},
  title   = {Extending context window of large language models via positional interpolation},
  journal = {ArXiv preprint},
  volume  = {abs/2306.15595},
  year    = {2023},
  url     = {https://arxiv.org/abs/2306.15595},
}

% LongLoRA: efficient fine-tuning of long-context LLMs (Twelfth ICLR).
% Year is that of the Twelfth ICLR (2024); the citation key is kept for
% compatibility with existing \cite commands. Method name is braced.
@inproceedings{chen2023longlora,
  author    = {Chen, Yukang and Qian, Shengju and Tang, Haotian and Lai, Xin and Liu, Zhijian and Han, Song and Jia, Jiaya},
  title     = {{LongLoRA}: Efficient Fine-tuning of Long-Context Large Language Models},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
}

% AutoCompressors: adapting language models to compress contexts (EMNLP 2023).
@inproceedings{Chevalier2023AdaptingLM,
  author    = {Chevalier, Alexis and Wettig, Alexander and Ajith, Anirudh and Chen, Danqi},
  title     = {Adapting Language Models to Compress Contexts},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  pages     = {3829--3846},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2023.emnlp-main.232},
  url       = {https://aclanthology.org/2023.emnlp-main.232},
  abstract  = {Transformer-based language models (LMs) are powerful and widely-applicable tools, but their usefulness is constrained by a finite context window and the expensive computational cost of processing long text documents. We propose to adapt pre-trained LMs into AutoCompressors. These language models are capable of compressing long contexts into summary vectors, which are then accessible to the model as soft prompts. Summary vectors are trained with an unsupervised objective, whereby long documents are processed in segments, and summary vectors from all previous segments are used in language modeling. We fine-tune OPT and Llama-2 models on sequences of up to 30,720 tokens and show that AutoCompressors can utilize long contexts to improve perplexity. We evaluate AutoCompressors on in-context learning by compressing task demonstrations and find that summary vectors are good substitutes for plain-text demonstrations, increasing accuracy while reducing inference costs. Finally, we explore the benefits of pre-computing summary vectors for large corpora by applying summary vectors to retrieval-augmented language modeling and a passage re-ranking task. Overall, AutoCompressors emerge as a simple and inexpensive solution to extend the context window of LMs while speeding up inference over long contexts.},
}


% PaLM: scaling language modeling with Pathways (arXiv preprint).
% Model and system names are braced so their capitalisation survives sentence casing.
@article{chowdhery2022palm,
  author  = {Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and others},
  title   = {{PaLM}: Scaling Language Modeling with {Pathways}},
  journal = {ArXiv preprint},
  volume  = {abs/2204.02311},
  year    = {2022},
  url     = {https://arxiv.org/abs/2204.02311},
}


% DoLa: decoding by contrasting layers (arXiv preprint).
% The method name is braced so its internal capital "L" survives sentence casing.
@article{chuang2023dola,
  author  = {Chuang, Yung-Sung and Xie, Yujia and Luo, Hongyin and Kim, Yoon and Glass, James and He, Pengcheng},
  title   = {{DoLa}: Decoding by Contrasting Layers Improves Factuality in Large Language Models},
  journal = {ArXiv preprint},
  volume  = {abs/2309.03883},
  year    = {2023},
  url     = {https://arxiv.org/abs/2309.03883},
}

% Anthropic web announcement of the Claude 3 model family.
@misc{claude3,
  title  = {Introducing the next generation of Claude},
  author = {Anthropic},
  year   = {2024},
  url    = {https://www.anthropic.com/news/claude-3-family},
}

% Dataset of information-seeking questions anchored in research papers (NAACL 2021).
@inproceedings{dasigi2021dataset,
  author    = {Dasigi, Pradeep and Lo, Kyle and Beltagy, Iz and Cohan, Arman and Smith, Noah A. and Gardner, Matt},
  title     = {A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  editor    = {Toutanova, Kristina and Rumshisky, Anna and Zettlemoyer, Luke and Hakkani-Tur, Dilek and Beltagy, Iz and Bethard, Steven and Cotterell, Ryan and Chakraborty, Tanmoy and Zhou, Yichao},
  pages     = {4599--4610},
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2021.naacl-main.365},
  url       = {https://aclanthology.org/2021.naacl-main.365},
}

% DeepSeek-R1: reasoning capability via reinforcement learning (arXiv preprint).
% Model name and LLMs acronym are braced against style-driven down-casing.
@article{deepseekr1,
  author  = {Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
  title   = {{DeepSeek-R1}: Incentivizing Reasoning Capability in {LLMs} via Reinforcement Learning},
  journal = {arXiv preprint arXiv:2501.12948},
  year    = {2025},
}

% DeepSeek-V2 technical report (arXiv 2405.04434).
% The corporate author is double-braced so it is treated as one indivisible
% name; the model name in the title is braced against down-casing.
@misc{deepseekv2,
  author        = {{DeepSeek-AI}},
  title         = {{DeepSeek-V2}: A Strong, Economical, and Efficient Mixture-of-Experts Language Model},
  year          = {2024},
  eprint        = {2405.04434},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
}

% Active-Prompt: active prompting with chain-of-thought (ACL 2024, long papers).
@inproceedings{diao2023active,
  author    = {Diao, Shizhe and Wang, Pengcheng and Lin, Yong and Pan, Rui and Liu, Xiang and Zhang, Tong},
  title     = {Active Prompting with Chain-of-Thought for Large Language Models},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  pages     = {1330--1350},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2024.acl-long.73},
  url       = {https://aclanthology.org/2024.acl-long.73},
  abstract  = {The increasing scale of large language models (LLMs) brings emergent abilities to various complex tasks requiring reasoning, such as arithmetic and commonsense reasoning. It is known that the effective design of task-specific prompts is critical for LLMs{'} ability to produce high-quality answers. In particular, an effective approach for complex question-and-answering tasks is example-based prompting with chain-of-thought (CoT) reasoning, which significantly improves the performance of LLMs. However, current CoT methods rely on a fixed set of human-annotated exemplars, which are not necessarily the most effective examples for different tasks. This paper proposes a new method, Active-Prompt, to adapt LLMs to different tasks with task-specific example prompts (annotated with human-designed CoT reasoning). For this purpose, we propose a solution to the key problem of determining which questions are the most important and helpful to annotate from a pool of task-specific queries. By borrowing ideas from the related problem of uncertainty-based active learning, we introduce several metrics to characterize the uncertainty so as to select the most uncertain questions for annotation. Experimental results demonstrate the superiority of our proposed method, achieving superior performance on eight complex reasoning tasks. Further analyses of different uncertainty metrics, pool sizes, zero-shot learning, and accuracy-uncertainty relationships demonstrate the effectiveness of our method.},
}

% LESS: recurrence synthesized with KV cache compression (arXiv preprint).
% Acronyms are braced so sentence-casing styles keep them upper-case.
@article{dong2024get,
  author  = {Dong, Harry and Yang, Xinyu and Zhang, Zhenyu and Wang, Zhangyang and Chi, Yuejie and Chen, Beidi},
  title   = {Get More with {LESS}: Synthesizing Recurrence with {KV} Cache Compression for Efficient {LLM} Inference},
  journal = {ArXiv preprint},
  volume  = {abs/2402.09398},
  year    = {2024},
  url     = {https://arxiv.org/abs/2402.09398},
}
% Multi-News: multi-document summarization dataset (ACL 2019).
@inproceedings{fabbri2019multi,
  author    = {Fabbri, Alexander and Li, Irene and She, Tianwei and Li, Suyi and Radev, Dragomir},
  title     = {Multi-News: A Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  editor    = {Korhonen, Anna and Traum, David and M{\`a}rquez, Llu{\'\i}s},
  pages     = {1074--1084},
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/P19-1102},
  url       = {https://aclanthology.org/P19-1102},
}
% Semantic compression for extending LLM context windows (Findings of ACL 2024).
@inproceedings{fei-etal-2024-extending,
  author    = {Fei, Weizhi and Niu, Xueyan and Zhou, Pingyi and Hou, Lu and Bai, Bo and Deng, Lei and Han, Wei},
  title     = {Extending Context Window of Large Language Models via Semantic Compression},
  booktitle = {Findings of the Association for Computational Linguistics ACL 2024},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  pages     = {5169--5181},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand and virtual meeting},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2024.findings-acl.306},
  url       = {https://aclanthology.org/2024.findings-acl.306},
  abstract  = {Transformer based Large Language Models (LLMs) often impose limitations on the length of the text input to ensure the generation of fluent and relevant responses due to the quadratic complexity. These constraints restrict their applicability in long text scenarios. In this paper, we propose a novel semantic compression method that enables generalization to texts that are 6-8 times longer without incurring significant computational costs or requiring fine-tuning. Our proposed framework draws inspiration from source coding in information theory and employs a pre-trained model to reduce the semantic redundancy of long inputs before passing them to the LLMs for downstream tasks. Experimental results demonstrate that our method effectively extends the context window of LLMs across a range of tasks including question answering, summarization, few-shot learning, and information retrieval. Furthermore, the proposed semantic compression method exhibits consistent fluency in text generation while reducing the associated computational overhead.},
}
% FlashAttention: IO-aware exact attention (NeurIPS, vol. 35).
% Method name and the IO acronym are braced so styles cannot flatten them.
@article{flash-attn,
  author  = {Dao, Tri and Fu, Dan and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
  title   = {{FlashAttention}: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {16344--16359},
  year    = {2022},
}

% FlashAttention-2 (ICLR).
@inproceedings{flash-attn2,
  author    = {Dao, Tri},
  title     = {Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
  year      = {2024},
  booktitle = {International Conference on Learning Representations (ICLR)},
}

% FlexGen: high-throughput LLM inference on a single GPU (ICML).
% PMLR publishes the proceedings, so it is recorded in the publisher field;
% system name and GPU acronym are braced against down-casing.
@inproceedings{flexgen,
  author    = {Sheng, Ying and Zheng, Lianmin and Yuan, Binhang and Li, Zhuohan and Ryabinin, Max and Chen, Beidi and Liang, Percy and R{\'e}, Christopher and Stoica, Ion and Zhang, Ce},
  title     = {{FlexGen}: High-Throughput Generative Inference of Large Language Models with a Single {GPU}},
  booktitle = {International Conference on Machine Learning},
  pages     = {31094--31116},
  year      = {2023},
  publisher = {PMLR},
}

% LazyLLM: dynamic token pruning for long-context inference (ICML 2024 workshop paper).
@inproceedings{fu2024lazyllm,
  author    = {Qichen Fu and Minsik Cho and Thomas Merth and Sachin Mehta and Mohammad Rastegari and Mahyar Najibi},
  title     = {Lazy{LLM}: Dynamic Token Pruning for Efficient Long Context {LLM} Inference},
  booktitle = {Workshop on Efficient Systems for Foundation Models II @ ICML2024},
  year      = {2024},
  url       = {https://openreview.net/forum?id=gGZD1dsJqZ},
}
% Lookahead decoding for LLM inference (arXiv preprint).
% The LLM acronym is upper-cased and braced against style-driven down-casing.
@article{fubreak,
  author  = {Fu, Yichao and Bailis, Peter and Stoica, Ion and Zhang, Hao},
  title   = {Break the Sequential Dependency of {LLM} Inference Using Lookahead Decoding},
  journal = {arXiv preprint arXiv:2402.02057},
  year    = {2024},
}

% Adaptive KV cache compression for LLMs (arXiv preprint).
% Acronyms are upper-cased and braced against style-driven down-casing.
@article{ge2023model,
  author  = {Ge, Suyu and Zhang, Yunan and Liu, Liyuan and Zhang, Minjia and Han, Jiawei and Gao, Jianfeng},
  title   = {Model Tells You What to Discard: Adaptive {KV} Cache Compression for {LLMs}},
  journal = {ArXiv preprint},
  volume  = {abs/2310.01801},
  year    = {2023},
  url     = {https://arxiv.org/abs/2310.01801},
}


% Gemini 1.5 technical report (arXiv preprint).
@article{geminiteam2024gemini,
  author  = {Reid, Machel and Savinov, Nikolay and Teplyashin, Denis and Lepikhin, Dmitry and Lillicrap, Timothy and Alayrac, Jean-baptiste and Soricut, Radu and Lazaridou, Angeliki and Firat, Orhan and Schrittwieser, Julian and others},
  title   = {Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context},
  journal = {ArXiv preprint},
  volume  = {abs/2403.05530},
  year    = {2024},
  url     = {https://arxiv.org/abs/2403.05530},
}


% SAMSum: human-annotated dialogue summarization corpus (New Frontiers in Summarization workshop).
@inproceedings{gliwa2019samsum,
  author    = {Gliwa, Bogdan and Mochol, Iwona and Biesek, Maciej and Wawer, Aleksander},
  title     = {{SAMS}um Corpus: A Human-annotated Dialogue Dataset for Abstractive Summarization},
  booktitle = {Proceedings of the 2nd Workshop on New Frontiers in Summarization},
  editor    = {Wang, Lu and Cheung, Jackie Chi Kit and Carenini, Giuseppe and Liu, Fei},
  pages     = {70--79},
  year      = {2019},
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/D19-5409},
  url       = {https://aclanthology.org/D19-5409},
}


% Goodfellow, Bengio and Courville: the "Deep Learning" textbook (MIT Press).
% The record matches what the citation key names (Goodfellow, 2016), so
% citations of this key resolve to the textbook.
% NOTE(review): confirm no citation relies on the KDD 2014 tutorial by
% Salakhutdinov that was previously stored under this key.
@book{goodfellow2016deep,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep Learning},
  publisher = {{MIT} Press},
  year      = {2016},
  url       = {https://www.deeplearningbook.org},
}


% Web announcement of Grok-1.5.
@misc{grok,
  title  = {Announcing Grok-1.5},
  author = {X.AI},
  year   = {2024},
  url    = {https://x.ai/blog/grok-1.5},
}

% Cobbe et al.: training verifiers for math word problems (arXiv preprint).
@article{gsm8k,
  author  = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
  title   = {Training verifiers to solve math word problems},
  journal = {ArXiv preprint},
  volume  = {abs/2110.14168},
  year    = {2021},
  url     = {https://arxiv.org/abs/2110.14168},
}

% LongCoder: long-range pre-trained language model for code completion (ICML 2023, PMLR v202).
% The camel-case model name is braced so sentence casing cannot turn it into "Longcoder".
@inproceedings{guo2023longcoder,
  author    = {Daya Guo and Canwen Xu and Nan Duan and Jian Yin and Julian J. McAuley},
  title     = {{LongCoder}: {A} Long-Range Pre-trained Language Model for Code Completion},
  booktitle = {International Conference on Machine Learning, {ICML} 2023, 23-29 July 2023, Honolulu, Hawaii, {USA}},
  editor    = {Andreas Krause and Emma Brunskill and Kyunghyun Cho and Barbara Engelhardt and Sivan Sabato and Jonathan Scarlett},
  series    = {Proceedings of Machine Learning Research},
  volume    = {202},
  pages     = {12098--12107},
  publisher = {{PMLR}},
  year      = {2023},
  url       = {https://proceedings.mlr.press/v202/guo23j.html},
  timestamp = {Mon, 28 Aug 2023 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/icml/GuoXD0M23.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% LM-Infinite: zero-shot extreme length generalization (NAACL 2024, long papers).
@inproceedings{han2024lm,
  author    = {Han, Chi and Wang, Qifan and Peng, Hao and Xiong, Wenhan and Chen, Yu and Ji, Heng and Wang, Sinong},
  title     = {{LM}-Infinite: Zero-Shot Extreme Length Generalization for Large Language Models},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  editor    = {Duh, Kevin and Gomez, Helena and Bethard, Steven},
  pages     = {3991--4008},
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.naacl-long.222},
}

% DuReader: Chinese machine reading comprehension dataset (Machine Reading for QA workshop).
@inproceedings{he2017dureader,
  author    = {He, Wei and Liu, Kai and Liu, Jing and Lyu, Yajuan and Zhao, Shiqi and Xiao, Xinyan and Liu, Yuan and Wang, Yizhong and Wu, Hua and She, Qiaoqiao and Liu, Xuan and Wu, Tian and Wang, Haifeng},
  title     = {{D}u{R}eader: a {C}hinese Machine Reading Comprehension Dataset from Real-world Applications},
  booktitle = {Proceedings of the Workshop on Machine Reading for Question Answering},
  editor    = {Choi, Eunsol and Seo, Minjoon and Chen, Danqi and Jia, Robin and Berant, Jonathan},
  pages     = {37--46},
  year      = {2018},
  address   = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/W18-2605},
  url       = {https://aclanthology.org/W18-2605},
}

% Hinton, Osindero and Teh: fast learning algorithm for deep belief nets (Neural Computation 18).
@article{Hinton06,
  author  = {Hinton, Geoffrey E. and Osindero, Simon and Teh, Yee Whye},
  title   = {A Fast Learning Algorithm for Deep Belief Nets},
  journal = {Neural Computation},
  volume  = {18},
  pages   = {1527--1554},
  year    = {2006},
}

% Multi-hop QA dataset for evaluating reasoning steps (COLING 2020).
@inproceedings{ho2020constructing,
  author    = {Ho, Xanh and Duong Nguyen, Anh-Khoa and Sugawara, Saku and Aizawa, Akiko},
  title     = {Constructing A Multi-hop {QA} Dataset for Comprehensive Evaluation of Reasoning Steps},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  editor    = {Scott, Donia and Bel, Nuria and Zong, Chengqing},
  pages     = {6609--6625},
  year      = {2020},
  address   = {Barcelona, Spain (Online)},
  publisher = {International Committee on Computational Linguistics},
  doi       = {10.18653/v1/2020.coling-main.580},
  url       = {https://aclanthology.org/2020.coling-main.580},
}


% RULER: measuring the real context size of long-context language models (arXiv preprint).
% The all-caps benchmark name is braced so sentence casing cannot render it as "Ruler".
@article{hsieh2024ruler,
  author  = {Cheng-Ping Hsieh and Simeng Sun and Samuel Kriman and Shantanu Acharya and Dima Rekesh and Fei Jia and Yang Zhang and Boris Ginsburg},
  title   = {{RULER}: What's the Real Context Size of Your Long-Context Language Models?},
  journal = {ArXiv preprint},
  volume  = {abs/2404.06654},
  year    = {2024},
  url     = {https://arxiv.org/abs/2404.06654},
}

% LoRA: low-rank adaptation of large language models (ICLR 2022).
% The method name is braced so sentence casing cannot render it as "Lora".
@inproceedings{hu2021lora,
  author    = {Edward J. Hu and Yelong Shen and Phillip Wallis and Zeyuan Allen{-}Zhu and Yuanzhi Li and Shean Wang and Lu Wang and Weizhu Chen},
  title     = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  booktitle = {The Tenth International Conference on Learning Representations, {ICLR} 2022, Virtual Event, April 25-29, 2022},
  publisher = {OpenReview.net},
  year      = {2022},
  url       = {https://openreview.net/forum?id=nZeVKeeFYf9},
  timestamp = {Sat, 20 Aug 2022 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/HuSWALWWC22.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
}

% Efficient attentions for long document summarization (NAACL 2021).
@inproceedings{huang2021efficient,
  author    = {Huang, Luyang and Cao, Shuyang and Parulian, Nikolaus and Ji, Heng and Wang, Lu},
  title     = {Efficient Attentions for Long Document Summarization},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  editor    = {Toutanova, Kristina and Rumshisky, Anna and Zettlemoyer, Luke and Hakkani-Tur, Dilek and Beltagy, Iz and Bethard, Steven and Cotterell, Ryan and Chakraborty, Tanmoy and Zhou, Yichao},
  pages     = {1419--1436},
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2021.naacl-main.112},
  url       = {https://aclanthology.org/2021.naacl-main.112},
}

% DeepSpeed Ulysses: system optimizations for long-sequence Transformer training (arXiv preprint).
@article{jacobs2023deepspeed,
  author  = {Sam Ade Jacobs and others},
  title   = {{DeepSpeed Ulysses}: System Optimizations for Enabling Training of Extreme Long Sequence {Transformer} Models},
  journal = {ArXiv preprint},
  volume  = {abs/2309.14509},
  year    = {2023},
  url     = {https://arxiv.org/abs/2309.14509},
}


@inproceedings{jailbreakv,
  author    = {Luo, Weidi and Ma, Siyuan and Liu, Xiaogeng and Guo, Xiaoyu and Xiao, Chaowei},
  title     = {{JailBreakV}: A Benchmark for Assessing the Robustness of {MultiModal} Large Language Models against Jailbreak Attacks},
  booktitle = {First Conference on Language Modeling},
  year      = {2024},
  url       = {https://openreview.net/forum?id=GC4mXVfquq}
}
@misc{jamba,
  author = {{AI21}},
  title  = {Introducing {Jamba}: {AI21}'s Groundbreaking {SSM-Transformer} Model},
  year   = {2024},
  url    = {https://www.ai21.com/blog/announcing-jamba}
}

@inproceedings{jiang-etal-2023-llmlingua,
  author    = {Jiang, Huiqiang and
               Wu, Qianhui and
               Lin, Chin-Yew and
               Yang, Yuqing and
               Qiu, Lili},
  title     = {{LLML}ingua: Compressing Prompts for Accelerated Inference of Large Language Models},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  month     = dec,
  year      = {2023},
  pages     = {13358--13376},
  doi       = {10.18653/v1/2023.emnlp-main.825},
  url       = {https://aclanthology.org/2023.emnlp-main.825}
}

@inproceedings{jiang-etal-2024-longllmlingua,
  title     = {{L}ong{LLML}ingua: Accelerating and Enhancing {LLM}s in Long Context Scenarios via Prompt Compression},
  author    = {Jiang, Huiqiang and
               Wu, Qianhui and
               Luo, Xufang and
               Li, Dongsheng and
               Lin, Chin-Yew and
               Yang, Yuqing and
               Qiu, Lili},
  editor    = {Ku, Lun-Wei  and
               Martins, Andre  and
               Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.acl-long.91},
  doi       = {10.18653/v1/2024.acl-long.91},
  pages     = {1658--1677}
}

@article{jiang2023mistral7b,
  author  = {Jiang, Albert Q. and
             Sablayrolles, Alexandre and
             Mensch, Arthur and
             Bamford, Chris and
             Chaplot, Devendra Singh and
             de las Casas, Diego and
             Bressand, Florian and
             Lengyel, Gianna and
             Lample, Guillaume and
             Saulnier, Lucile and
             Lavaud, L{\'e}lio Renard and
             Lachaux, Marie-Anne and
             Stock, Pierre and
             Le Scao, Teven and
             Lavril, Thibaut and
             Wang, Thomas and
             Lacroix, Timoth{\'e}e and
             El Sayed, William},
  journal = {ArXiv preprint},
  title   = {{Mistral 7B}},
  url     = {https://arxiv.org/abs/2310.06825},
  volume  = {abs/2310.06825},
  year    = {2023}
}

@inproceedings{joshi2017triviaqa,
  author    = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel and Zettlemoyer, Luke},
  title     = {{T}rivia{QA}: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  editor    = {Barzilay, Regina and Kan, Min-Yen},
  publisher = {Association for Computational Linguistics},
  address   = {Vancouver, Canada},
  year      = {2017},
  pages     = {1601--1611},
  doi       = {10.18653/v1/P17-1147},
  url       = {https://aclanthology.org/P17-1147}
}

@article{Kleijn2012TheBT,
  title   = {The {Bernstein-Von-Mises} theorem under misspecification},
  author  = {Kleijn, B. J. K. and van der Vaart, A. W.},
  journal = {Electronic Journal of Statistics},
  year    = {2012},
  volume  = {6},
  pages   = {354--381},
  url     = {https://api.semanticscholar.org/CorpusID:85548207}
}

@article{kovcisky2018narrativeqa,
  author    = {Ko{\v{c}}isk{\'y}, Tom{\'a}{\v{s}} and Schwarz, Jonathan and Blunsom, Phil and Dyer, Chris and Hermann, Karl Moritz and Melis, G{\'a}bor and Grefenstette, Edward},
  title     = {The {N}arrative{QA} Reading Comprehension Challenge},
  journal   = {Transactions of the Association for Computational Linguistics},
  editor    = {Lee, Lillian and Johnson, Mark and Toutanova, Kristina and Roark, Brian},
  volume    = {6},
  pages     = {317--328},
  year      = {2018},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl_a_00023},
  url       = {https://aclanthology.org/Q18-1023}
}

@inproceedings{li2002learning,
  title     = {Learning Question Classifiers},
  author    = {Li, Xin and Roth, Dan},
  booktitle = {{COLING} 2002: The 19th International Conference on Computational Linguistics},
  year      = {2002},
  url       = {https://aclanthology.org/C02-1150}
}

@article{li2024snapkv,
  author  = {Li, Yuhong and Huang, Yingbing and Yang, Bowen and Venkitesh, Bharat and Locatelli, Acyr and Ye, Hanchen and Cai, Tianle and Lewis, Patrick and Chen, Deming},
  title   = {{SnapKV}: {LLM} Knows What You are Looking for Before Generation},
  journal = {ArXiv preprint},
  volume  = {abs/2404.14469},
  year    = {2024},
  url     = {https://arxiv.org/abs/2404.14469}
}

@inproceedings{liu2023repobench,
  author    = {Liu, Tianyang and Xu, Canwen and McAuley, Julian},
  title     = {{RepoBench}: Benchmarking Repository-Level Code Auto-Completion Systems},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
  url       = {https://openreview.net/forum?id=pPjZIOuQuF}
}

@article{liu2024kivi,
  title   = {{KIVI}: A Tuning-Free Asymmetric 2bit Quantization for {KV} Cache},
  author  = {Liu, Zirui and Yuan, Jiayi and Jin, Hongye and Zhong, Shaochen and Xu, Zhaozhuo and Braverman, Vladimir and Chen, Beidi and Hu, Xia},
  journal = {arXiv preprint arXiv:2402.02750},
  year    = {2024}
}

@article{liu2024lost,
  author    = {Liu, Nelson F. and Lin, Kevin and Hewitt, John and Paranjape, Ashwin and Bevilacqua, Michele and Petroni, Fabio and Liang, Percy},
  title     = {Lost in the Middle: How Language Models Use Long Contexts},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {12},
  pages     = {157--173},
  year      = {2024},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl_a_00638},
  url       = {https://aclanthology.org/2024.tacl-1.9}
}

@article{liu2024minicache,
  title   = {{MiniCache}: {KV} Cache Compression in Depth Dimension for Large Language Models},
  author  = {Liu, Akide and Liu, Jing and Pan, Zizheng and He, Yefei and Haffari, Gholamreza and Zhuang, Bohan},
  journal = {arXiv preprint arXiv:2405.14366},
  year    = {2024}
}

@article{liu2024scissorhands,
  title   = {Scissorhands: Exploiting the Persistence of Importance Hypothesis for {LLM} {KV} Cache Compression at Test Time},
  author  = {Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  year    = {2023}
}

@article{liu2024world,
  author  = {Liu, Hao and Yan, Wilson and Zaharia, Matei and Abbeel, Pieter},
  title   = {World Model on Million-Length Video And Language With {RingAttention}},
  journal = {ArXiv preprint},
  volume  = {abs/2402.08268},
  year    = {2024},
  url     = {https://arxiv.org/abs/2402.08268}
}

@misc{longchat,
  title  = {How Long Can Open-Source {LLMs} Truly Promise on Context Length?},
  author = {Li, Dacheng and Shao, Rulin and others},
  year   = {2023},
  url    = {https://lmsys.org/blog/2023-06-29-longchat}
}

@misc{meta2024llama3,
  author       = {Meta},
  title        = {Introducing {Meta Llama 3}: The most capable openly available {LLM} to date},
  howpublished = {\url{https://ai.meta.com/blog/meta-llama-3/}},
  year         = {2024},
  note         = {Accessed: 2024-06-07}
}

@article{mohtashami2023landmark,
  title   = {Landmark attention: Random-access infinite context length for transformers},
  author  = {Mohtashami, Amirkeivan and
             Jaggi, Martin},
  year    = {2023},
  journal = {ArXiv preprint},
  volume  = {abs/2305.16300},
  url     = {https://arxiv.org/abs/2305.16300}
}

@misc{needle,
  author       = {Kamradt, Gregory},
  title        = {{Needle In A Haystack} - Pressure Testing {LLM}s},
  howpublished = {GitHub repository},
  url          = {https://github.com/gkamradt/LLMTest_NeedleInAHaystack/tree/main},
  year         = {2023}
}


% NOTE(review): the access date in `note` predates the GPT-4o mini announcement (July 2024); confirm and correct it.
@misc{openai2023gpt4omini,
  author       = {{OpenAI}},
  title        = {{GPT-4o-mini}: Advancing Cost-Efficient Intelligence},
  organization = {OpenAI},
  url          = {https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/},
  note         = {Accessed: 2023-12-14},
  year         = {2024}
}

@inproceedings{pan-etal-2024-llmlingua,
  title     = {{LLML}ingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression},
  author    = {Zhuoshi Pan and Qianhui Wu and Huiqiang Jiang and Menglin Xia and Xufang Luo and Jue Zhang and Qingwei Lin and Victor Ruhle and Yuqing Yang and Chin-Yew Lin and H. Vicky Zhao and Lili Qiu and Dongmei Zhang},
  editor    = {Ku, Lun-Wei  and
               Martins, Andre  and
               Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.57},
  doi       = {10.18653/v1/2024.findings-acl.57},
  pages     = {963--981}
}

@inproceedings{pan2023plum,
  title     = {Plum: Prompt Learning using Metaheuristics},
  author    = {Pan, Rui and Xing, Shuo and Diao, Shizhe and Sun, Wenhe and Liu, Xiang and Shum, KaShun and Zhang, Jipeng and Pi, Renjie and Zhang, Tong},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.129},
  doi       = {10.18653/v1/2024.findings-acl.129},
  pages     = {2177--2197},
  abstract  = {Since the emergence of large language models, prompt learning has become a popular method for optimizing and customizing these models. Special prompts, such as Chain-of-Thought, have even revealed previously unknown reasoning capabilities within these models. However, the progress of discovering effective prompts has been slow, driving a desire for general prompt optimization methods. Unfortunately, few existing prompt learning methods satisfy the criteria of being truly {``}general{''}, i.e., automatic, discrete, black-box, gradient-free, and interpretable all at once. In this paper, we introduce metaheuristics, a branch of discrete non-convex optimization methods with over 100 options, as a promising approach to prompt learning. Within our paradigm, we test six typical methods: hill climbing, simulated annealing, genetic algorithms with/without crossover, tabu search, and harmony search, demonstrating their effectiveness in white-box and black-box prompt learning. Furthermore, we show that these methods can be used to discover more human-understandable prompts that were previously unknown in both reasoning and image generation tasks, opening the door to a cornucopia of possibilities in prompt optimization.}
}

@article{pan2024lisa,
  author  = {Pan, Rui and Liu, Xiang and Diao, Shizhe and Pi, Renjie and Zhang, Jipeng and Han, Chi and Zhang, Tong},
  title   = {{LISA}: Layerwise Importance Sampling for Memory-Efficient Large Language Model Fine-Tuning},
  journal = {ArXiv preprint},
  volume  = {abs/2403.17919},
  year    = {2024},
  url     = {https://arxiv.org/abs/2403.17919}
}


@inproceedings{peng2024yarn,
  title     = {Ya{RN}: Efficient Context Window Extension of Large Language Models},
  author    = {Peng, Bowen and
               Quesnelle, Jeffrey and
               Fan, Honglu and
               Shippole, Enrico},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
  url       = {https://openreview.net/forum?id=wHBfxhZu1u}
}
@article{pires2016multiclass,
  author  = {Pires, Bernardo {\'A}vila and
             Szepesv{\'a}ri, Csaba},
  title   = {Multiclass classification calibration functions},
  year    = {2016},
  journal = {arXiv preprint arXiv:1609.06385}
}

@article{qwen2,
  title   = {Qwen2 technical report},
  author  = {Yang, An and
             Yang, Baosong and
             Hui, Binyuan and
             Zheng, Bo and
             Yu, Bowen and
             Zhou, Chang and
             Li, Chengpeng and
             Li, Chengyuan and
             Liu, Dayiheng and
             Huang, Fei and
             others},
  year    = {2024},
  journal = {ArXiv preprint},
  volume  = {abs/2407.10671},
  url     = {https://arxiv.org/abs/2407.10671}
}

@article{raffel2020exploring,
  author    = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  title     = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  journal   = {J. Mach. Learn. Res.},
  volume    = {21},
  pages     = {140:1--140:67},
  year      = {2020},
  url       = {http://jmlr.org/papers/v21/20-074.html},
  timestamp = {Fri, 05 Feb 2021 00:00:00 +0100},
  biburl    = {https://dblp.org/rec/journals/jmlr/RaffelSRLNMZLL20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{sang1999representing,
  author    = {Tjong Kim Sang, Erik F. and Veenstra, Jorn},
  title     = {Representing Text Chunks},
  booktitle = {Ninth Conference of the {E}uropean Chapter of the Association for Computational Linguistics},
  editor    = {Thompson, Henry S. and Lascarides, Alex},
  publisher = {Association for Computational Linguistics},
  address   = {Bergen, Norway},
  year      = {1999},
  pages     = {173--179},
  url       = {https://aclanthology.org/E99-1023}
}

@inproceedings{shaham2023zeroscrolls,
  author    = {Shaham, Uri and Ivgi, Maor and Efrat, Avia and Berant, Jonathan and Levy, Omer},
  title     = {{Z}ero{SCROLLS}: A Zero-Shot Benchmark for Long Text Understanding},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  year      = {2023},
  pages     = {7977--7989},
  doi       = {10.18653/v1/2023.findings-emnlp.536},
  url       = {https://aclanthology.org/2023.findings-emnlp.536}
}

@inproceedings{shicontext,
  title     = {In-Context Pretraining: Language Modeling Beyond Document Boundaries},
  author    = {Shi, Weijia and
               Min, Sewon and
               Lomeli, Maria and
               Zhou, Chunting and
               Li, Margaret and
               Lin, Xi Victoria and
               Smith, Noah A and
               Zettlemoyer, Luke and
               Yih, Wen-tau and
               Lewis, Mike},
  year      = {2024},
  booktitle = {The Twelfth International Conference on Learning Representations}
}

@techreport{smith2024evaluating,
  title       = {Evaluating Chunking Strategies for Retrieval},
  author      = {Smith, Brandon and
                 Troynikov, Anton},
  year        = {2024},
  institution = {Chroma},
  url         = {https://research.trychroma.com/evaluating-chunking}
}

@article{Steinwart2007HowTC,
  author  = {Steinwart, Ingo},
  title   = {How to Compare Different Loss Functions and Their Risks},
  journal = {Constructive Approximation},
  volume  = {26},
  pages   = {225--287},
  year    = {2007},
  url     = {https://api.semanticscholar.org/CorpusID:16660598}
}

@article{sun2024yoco,
  author  = {Sun, Yutao and
             Dong, Li and
             Zhu, Yi and
             Huang, Shaohan and
             Wang, Wenhui and
             Ma, Shuming and
             Zhang, Quanlu and
             Wang, Jianyong and
             Wei, Furu},
  title   = {You only cache once: Decoder-decoder architectures for language models},
  year    = {2024},
  journal = {arXiv preprint arXiv:2405.05254}
}

@article{tang2024quest,
  author  = {Tang, Jiaming and Zhao, Yilong and Zhu, Kan and Xiao, Guangxuan and Kasikci, Baris and Han, Song},
  title   = {Quest: Query-Aware Sparsity for Efficient Long-Context {LLM} Inference},
  journal = {ArXiv preprint},
  volume  = {abs/2406.10774},
  year    = {2024},
  url     = {https://arxiv.org/abs/2406.10774}
}

@inproceedings{tay2020long,
  author    = {Yi Tay and
               Mostafa Dehghani and
               Samira Abnar and
               Yikang Shen and
               Dara Bahri and
               Philip Pham and
               Jinfeng Rao and
               Liu Yang and
               Sebastian Ruder and
               Donald Metzler},
  title     = {Long Range Arena: {A} Benchmark for Efficient Transformers},
  booktitle = {9th International Conference on Learning Representations, {ICLR} 2021,
               Virtual Event, Austria, May 3-7, 2021},
  publisher = {OpenReview.net},
  year      = {2021},
  url       = {https://openreview.net/forum?id=qVyeW-grC2k},
  timestamp = {Wed, 23 Jun 2021 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/Tay0ASBPRYRM21.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{tay2022unifying,
  title   = {Unifying Language Learning Paradigms},
  author  = {Tay, Yi and
             Dehghani, Mostafa and
             Tran, Vinh Q and
             Garcia, Xavier and
             Bahri, Dara and
             Schuster, Tal and
             Zheng, Huaixiu Steven and
             Houlsby, Neil and
             Metzler, Donald},
  year    = {2022},
  journal = {ArXiv preprint},
  volume  = {abs/2205.05131},
  url     = {https://arxiv.org/abs/2205.05131}
}

@article{touvron2023llama,
  title   = {Llama: Open and efficient foundation language models},
  author  = {Touvron, Hugo and
             Lavril, Thibaut and
             Izacard, Gautier and
             Martinet, Xavier and
             Lachaux, Marie-Anne and
             Lacroix, Timoth{\'e}e and
             Rozi{\`e}re, Baptiste and
             Goyal, Naman and
             Hambro, Eric and
             Azhar, Faisal and
             others},
  year    = {2023},
  journal = {ArXiv preprint},
  volume  = {abs/2302.13971},
  url     = {https://arxiv.org/abs/2302.13971}
}

@article{touvron2023llama2,
  title   = {Llama 2: Open foundation and fine-tuned chat models},
  author  = {Touvron, Hugo and
             Martin, Louis and
             Stone, Kevin and
             Albert, Peter and
             Almahairi, Amjad and
             Babaei, Yasmine and
             Bashlykov, Nikolay and
             Batra, Soumya and
             Bhargava, Prajjwal and
             Bhosale, Shruti and
             others},
  year    = {2023},
  journal = {ArXiv preprint},
  volume  = {abs/2307.09288},
  url     = {https://arxiv.org/abs/2307.09288}
}

@article{trivedi2022musique,
  author    = {Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish},
  title     = {{M}u{S}i{Q}ue: Multihop Questions via Single-hop Question Composition},
  journal   = {Transactions of the Association for Computational Linguistics},
  editor    = {Roark, Brian and Nenkova, Ani},
  volume    = {10},
  pages     = {539--554},
  year      = {2022},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl_a_00475},
  url       = {https://aclanthology.org/2022.tacl-1.31}
}

@article{wang2023recursively,
  author  = {Wang, Qingyue and
             Ding, Liang and
             Cao, Yanan and
             Tian, Zhiliang and
             Wang, Shi and
             Tao, Dacheng and
             Guo, Li},
  title   = {Recursively summarizing enables long-term dialogue memory in large language models},
  year    = {2023},
  journal = {arXiv preprint arXiv:2308.15022}
}

@article{wei2022chain,
  author  = {Wei, Jason and
             Wang, Xuezhi and
             Schuurmans, Dale and
             Bosma, Maarten and
             Xia, Fei and
             Chi, Ed and
             Le, Quoc V and
             Zhou, Denny and
             others},
  title   = {Chain-of-thought prompting elicits reasoning in large language models},
  year    = {2022},
  journal = {Advances in neural information processing systems},
  volume  = {35},
  pages   = {24824--24837}
}

@inproceedings{wingate-etal-2022-prompt,
  author    = {Wingate, David and Shoeybi, Mohammad and Sorensen, Taylor},
  title     = {Prompt Compression and Contrastive Conditioning for Controllability and Toxicity Reduction in Language Models},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2022},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates},
  month     = dec,
  year      = {2022},
  pages     = {5621--5634},
  doi       = {10.18653/v1/2022.findings-emnlp.412},
  url       = {https://aclanthology.org/2022.findings-emnlp.412},
  abstract  = {We explore the idea of compressing the prompts used to condition language models, and show that compressed prompts can retain a substantive amount of information about the original prompt. For severely compressed prompts, while fine-grained information is lost, abstract information and general sentiments can be retained with surprisingly few parameters, which can be useful in the context of decode-time algorithms for controllability and toxicity reduction. We find that some complex prompts can be effectively compressed into a single token to guide generation. We also show that compressed prompts are largely compositional, and can be constructed such that they can be used to control independent aspects of generated text.}
}

@inproceedings{wu2023vcsum,
  author    = {Wu, Han and Zhan, Mingjie and Tan, Haochen and Hou, Zhaohui and Liang, Ding and Song, Linqi},
  title     = {{VCSUM}: A Versatile {C}hinese Meeting Summarization Dataset},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  editor    = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  year      = {2023},
  pages     = {6065--6079},
  doi       = {10.18653/v1/2023.findings-acl.377},
  url       = {https://aclanthology.org/2023.findings-acl.377}
}

@misc{wu2024layercondensedkvcacheefficient,
  author        = {Wu, Haoyi and Tu, Kewei},
  title         = {Layer-Condensed {KV} Cache for Efficient Inference of Large Language Models},
  year          = {2024},
  eprint        = {2405.10637},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2405.10637}
}

@inproceedings{xiao2023smoothquant,
  title     = {{SmoothQuant}: Accurate and Efficient Post-Training Quantization for Large Language Models},
  author    = {Xiao, Guangxuan and Lin, Ji and Seznec, Mickael and Wu, Hao and Demouth, Julien and Han, Song},
  booktitle = {International Conference on Machine Learning},
  pages     = {38087--38099},
  year      = {2023},
  publisher = {PMLR}
}

@inproceedings{xiao2024efficient,
  title     = {Efficient Streaming Language Models with Attention Sinks},
  author    = {Xiao, Guangxuan and
               Tian, Yuandong and
               Chen, Beidi and
               Han, Song and
               Lewis, Mike},
  year      = {2024},
  booktitle = {The Twelfth International Conference on Learning Representations},
  url       = {https://openreview.net/forum?id=NG7sS51zVF}
}

@article{xie2022an,
  author    = {Fang, Hongchao and Xie, Pengtao},
  title     = {An End-to-End Contrastive Self-Supervised Learning Framework for Language Understanding},
  journal   = {Transactions of the Association for Computational Linguistics},
  editor    = {Roark, Brian and Nenkova, Ani},
  volume    = {10},
  pages     = {1324--1340},
  year      = {2022},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl_a_00521},
  url       = {https://aclanthology.org/2022.tacl-1.76/},
  abstract  = {Self-supervised learning (SSL) methods such as Word2vec, BERT, and GPT have shown great effectiveness in language understanding. Contrastive learning, as a recent SSL approach, has attracted increasing attention in NLP. Contrastive learning learns data representations by predicting whether two augmented data instances are generated from the same original data example. Previous contrastive learning methods perform data augmentation and contrastive learning separately. As a result, the augmented data may not be optimal for contrastive learning. To address this problem, we propose a four-level optimization framework that performs data augmentation and contrastive learning end-to-end, to enable the augmented data to be tailored to the contrastive learning task. This framework consists of four learning stages, including training machine translation models for sentence augmentation, pretraining a text encoder using contrastive learning, finetuning a text classification model, and updating weights of translation data by minimizing the validation loss of the classification model, which are performed in a unified way. Experiments on datasets in the GLUE benchmark (Wang et al., 2018a) and on datasets used in Gururangan et al. (2020) demonstrate the effectiveness of our method.}
}

@inproceedings{xiong2023effective,
  author    = {Xiong, Wenhan and Liu, Jingyu and Molybog, Igor and Zhang, Hejia and Bhargava, Prajjwal and Hou, Rui and Martin, Louis and Rungta, Rashi and Sankararaman, Karthik Abinav and Oguz, Barlas and Khabsa, Madian and Fang, Han and Mehdad, Yashar and Narang, Sharan and Malik, Kshitiz and Fan, Angela and Bhosale, Shruti and Edunov, Sergey and Lewis, Mike and Wang, Sinong and Ma, Hao},
  title     = {Effective Long-Context Scaling of Foundation Models},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  editor    = {Duh, Kevin and Gomez, Helena and Bethard, Steven},
  publisher = {Association for Computational Linguistics},
  address   = {Mexico City, Mexico},
  year      = {2024},
  pages     = {4643--4663},
  url       = {https://aclanthology.org/2024.naacl-long.260}
}
@inproceedings{yang2018hotpotqa,
  author    = {Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D.},
  title     = {{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  editor    = {Riloff, Ellen and Chiang, David and Hockenmaier, Julia and Tsujii, Jun{'}ichi},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  year      = {2018},
  pages     = {2369--2380},
  doi       = {10.18653/v1/D18-1259},
  url       = {https://aclanthology.org/D18-1259}
}

@inproceedings{yang2024pyramidinfer,
  author    = {Yang, Dongjie and Han, Xiaodong and Gao, Yan and Hu, Yao and Zhang, Shilin and Zhao, Hai},
  title     = {{P}yramid{I}nfer: Pyramid {KV} Cache Compression for High-throughput {LLM} Inference},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand and virtual meeting},
  year      = {2024},
  pages     = {3258--3270},
  doi       = {10.18653/v1/2024.findings-acl.195},
  url       = {https://aclanthology.org/2024.findings-acl.195},
  abstract  = {Large Language Models (LLMs) have shown remarkable comprehension abilities but face challenges in GPU memory usage during inference, hindering their scalability for real-time applications like chatbots. To accelerate inference, we store computed keys and values (KV cache) in the GPU memory. Existing methods study the KV cache compression to reduce memory by pruning the pre-computed KV cache. However, they neglect the inter-layer dependency between layers and huge memory consumption in pre-computation. To explore these deficiencies, we find that the number of crucial keys and values that influence future generations decreases layer by layer and we can extract them by the consistency in attention weights. Based on the findings, we propose PyramidInfer, a method that compresses the KV cache by layer-wise retaining crucial context. PyramidInfer saves significant memory by computing fewer keys and values without sacrificing performance. Experimental results show PyramidInfer improves 2.2x throughput compared to Accelerate with over 54{\%} GPU memory reduction in KV cache.}
}

@article{yepes2024financialreportchunkingeffective,
  author  = {Antonio Jimeno Yepes and Yao You and Jan Milczek and Sebastian Laverde and Renyu Li},
  title   = {Financial Report Chunking for Effective Retrieval Augmented Generation},
  journal = {ArXiv preprint},
  volume  = {abs/2402.05131},
  year    = {2024},
  url     = {https://arxiv.org/abs/2402.05131}
}

@inproceedings{you2019lamb,
  author    = {You, Yang and Li, Jing and Reddi, Sashank J. and Hseu, Jonathan and Kumar, Sanjiv and Bhojanapalli, Srinadh and Song, Xiaodan and Demmel, James and Keutzer, Kurt and Hsieh, Cho{-}Jui},
  title     = {Large Batch Optimization for Deep Learning: Training {BERT} in 76 minutes},
  booktitle = {8th International Conference on Learning Representations, {ICLR} 2020, Addis Ababa, Ethiopia, April 26-30, 2020},
  publisher = {OpenReview.net},
  year      = {2020},
  url       = {https://openreview.net/forum?id=Syx4wnEtvH},
  timestamp = {Thu, 07 May 2020 01:00:00 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/YouLRHKBSDKH20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@article{young2024yi,
  author  = {Young, Alex and Chen, Bei and Li, Chao and Huang, Chengen and Zhang, Ge and Zhang, Guanwei and Li, Heng and Zhu, Jiangcheng and Chen, Jianqun and Chang, Jing and others},
  title   = {Yi: Open Foundation Models by {01.AI}},
  journal = {ArXiv preprint},
  volume  = {abs/2403.04652},
  year    = {2024},
  url     = {https://arxiv.org/abs/2403.04652}
}

@article{zhang2024h2o,
  title   = {{H2O}: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models},
  author  = {Zhang, Zhenyu and Sheng, Ying and Zhou, Tianyi and Chen, Tianlong and Zheng, Lianmin and Cai, Ruisi and Song, Zhao and Tian, Yuandong and R{\'e}, Christopher and Barrett, Clark and others},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {34661--34710},
  year    = {2023}
}

@article{zhang2024infty,
  author  = {Zhang, Xinrong and Chen, Yingfa and Hu, Shengding and Xu, Zihang and Chen, Junhao and Hao, Moo Khai and Han, Xu and Thai, Zhen Leng and Wang, Shuo and Liu, Zhiyuan and others},
  title   = {{$\infty$-Bench}: Extending Long Context Evaluation Beyond {100K} Tokens},
  journal = {ArXiv preprint},
  volume  = {abs/2402.13718},
  year    = {2024},
  url     = {https://arxiv.org/abs/2402.13718}
}

% KV cache compression method, cited as an arXiv preprint (2406.02069).
% NOTE(review): the citation key starts with zhang although the first listed author
% is Cai; the key is left unchanged so existing citations keep resolving.
@article{zhang2024pyramidkv,
  author  = {Cai, Zefan and Zhang, Yichi and Gao, Bofei and Liu, Yuliang and Liu, Tianyu and Lu, Keming and Xiong, Wayne and Dong, Yue and Chang, Baobao and Hu, Junjie and others},
  title   = {{PyramidKV}: Dynamic {KV} Cache Compression Based on Pyramidal Information Funneling},
  journal = {arXiv preprint arXiv:2406.02069},
  year    = {2024}
}

% Low-bit quantization system for LLM serving (Proceedings of Machine Learning and Systems, vol. 6).
% The acronym is brace-protected so sentence-casing styles keep it uppercase.
@article{zhao2024atom,
  author  = {Zhao, Yilong and Lin, Chien-Yu and Zhu, Kan and Ye, Zihao and Chen, Lequn and Zheng, Size and Ceze, Luis and Krishnamurthy, Arvind and Chen, Tianqi and Kasikci, Baris},
  title   = {Atom: Low-Bit Quantization for Efficient and Accurate {LLM} Serving},
  journal = {Proceedings of Machine Learning and Systems},
  volume  = {6},
  pages   = {196--209},
  year    = {2024}
}


% LLM-as-a-judge evaluation paper (Advances in Neural Information Processing Systems, vol. 36).
% The acronym and the benchmark/platform names are brace-protected against sentence casing.
@article{zheng2023judging,
  author  = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric and others},
  title   = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {36},
  pages   = {46595--46623},
  year    = {2023}
}

% Query-based meeting summarization benchmark, published in the NAACL-HLT 2021 proceedings.
% Field values are unchanged; only field order and name-list layout differ.
@inproceedings{zhong2021qmsum,
  author    = {Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and
               Mutuma, Mutethia and Jha, Rahul and Awadallah, Ahmed Hassan and
               Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and Radev, Dragomir},
  title     = {{QMS}um: A New Benchmark for Query-based Multi-domain Meeting Summarization},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  editor    = {Toutanova, Kristina and Rumshisky, Anna and Zettlemoyer, Luke and
               Hakkani-Tur, Dilek and Beltagy, Iz and Bethard, Steven and
               Cotterell, Ryan and Chakraborty, Tanmoy and Zhou, Yichao},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  year      = {2021},
  pages     = {5905--5921},
  doi       = {10.18653/v1/2021.naacl-main.472},
  url       = {https://aclanthology.org/2021.naacl-main.472}
}

% arXiv preprint 2305.13304 (cs.CL) on interactive long-text generation.
% The camel-case system name is brace-protected, and authors use the unambiguous
% Last, First form so multi-part given names cannot be mis-split.
@misc{zhou2023recurrentgpt,
  author        = {Zhou, Wangchunshu and Jiang, Yuchen Eleanor and Cui, Peng and Wang, Tiannan and Xiao, Zhenxin and Hou, Yifan and Cotterell, Ryan and Sachan, Mrinmaya},
  title         = {{RecurrentGPT}: Interactive Generation of (Arbitrarily) Long Text},
  year          = {2023},
  eprint        = {2305.13304},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL}
}


% Conference paper on chain-of-thought prompting with noisy rationales (2024).
% Field values are unchanged; only field order and alignment differ.
@inproceedings{zhoucan,
  author = {Zhou, Zhanke and Tao, Rong and Zhu, Jianing and Luo, Yiwen and Wang, Zengmao and Han, Bo},
  title = {Can Language Models Perform Robust Reasoning in Chain-of-thought Prompting with Noisy Rationales?},
  booktitle = {The Thirty-eighth Annual Conference on Neural Information Processing Systems},
  year = {2024}
}