\newpage
\appendix

\section{Implementation Details}\label{appendix:implement}
\paragraph{Pseudo Code.}

\Cref{algo} details the pseudo code for our KV importance scoring algorithm.

\begin{algorithm}[h]
    \caption{KV Importance Scoring}
    \label{algo}
    \begin{algorithmic}
    \STATE \textbf{Input:} Transformer $\lm$, context $c$ (token length $n_c$), chunk size $m$ (fixed to 2K)
    \STATE \textcolor{gray!80}{\# $\lm$ has $L$ layers, $H$ KV heads, $G$ grouped-query size, $d$ feature dimension}
    \STATE $\kvc \leftarrow$ Prefill cache by forwarding $c$ through $\lm$
    \STATE $c_1,\ldots,c_T \leftarrow$ Partition $c$ into $T = \lceil\frac{n_c}{m}\rceil$ chunks, each of token length $m$
    \STATE $S \leftarrow 0^{L\times H\times n_c}$ \hfill\textcolor{gray!80}{\# placeholder}
    \FOR{$t = 1,\ldots,T$}
        \IF{$t = 1$}
            \STATE input $\leftarrow$ ``Repeat the previous context:'' $+\ c_t$
        \ELSE
            \STATE $c_{t-1,\text{last}} \leftarrow$ A trailing span of $c_{t-1}$ with 8 tokens
            \STATE input $\leftarrow$ ``Repeat the previous context starting with'' $+\ c_{t-1,\text{last}} + \text{``:''} +\ c_t$
        \ENDIF
        \STATE Forward the input (token length $n_\text{in}$) through $\lm$ with $\kvc$
        \FOR{$l = 1,\ldots,L$}
            \STATE $Q \leftarrow$ Queries in the $l$-th attention layer \hfill\textcolor{gray!80}{\# shape: {\small$G\times H\times n_\text{in}\times d$}}
            \STATE $K \leftarrow$ Keys in the $l$-th attention layer \hfill\textcolor{gray!80}{\# shape: {\small$H\times (n_c+n_\text{in})\times d$}}
            \STATE $\bar{K} \leftarrow$ Subsample keys in $\kvc$ corresponding to $c_t$ \hfill\textcolor{gray!80}{\# shape: {\footnotesize$H\times (m+n_\text{in})\times d$}}
            \STATE $A \leftarrow \text{Softmax}(Q\bar{K}^\intercal)$ \hfill\textcolor{gray!80}{\# broadcast over $G$ groups; shape: {\footnotesize$G\times H\times n_\text{in}\times (m+n_\text{in})$}}
            \STATE $\bar{A} \leftarrow A[\ldots,:m]$ \hfill\textcolor{gray!80}{\# attention received by keys in $\kvc$; shape: {\footnotesize$G\times H\times n_\text{in}\times m$}}
            \STATE $S_{l,t} \leftarrow \max_{g = 1,\ldots,G;\ i = 1,\ldots,n_\text{in}} \bar{A}[g,:,i]$ \hfill\textcolor{gray!80}{\# shape: {\footnotesize$H\times m$}}
            \STATE $S[$\texttt{\footnotesize$l,:,(t{-}1)m:tm$}$] \leftarrow S_{l,t}$
        \ENDFOR
    \ENDFOR
    \STATE $S_\text{head} \leftarrow \max_{i = 1,\ldots,n_c} S[:,:,i]$ \hfill\textcolor{gray!80}{\# shape: {\footnotesize$L\times H$}}
    \STATE \textbf{Output:} Score $S$, Head-level score $S_\text{head}$
    \end{algorithmic}
\end{algorithm}


\paragraph{Baseline Methods.}
We implement SnapKV and PyramidKV following their official GitHub implementations \citep{snapkv,pyramid}. We apply max pooling with a kernel size of 7 and an observation window size of 32, consistent with original hyperparameters \citep{snapkv}. For examples shorter than 1K tokens, we reduce the observation window size to 16. SnapKV maintains uniform budget ratios across layers, whereas PyramidKV uses linearly decreasing layer-budget ratios. In the main experiments (\Cref{sec:exp_benchmark}), we adopt a non-uniform head-budget allocation strategy, which demonstrates superior performance over uniform head-budget allocation \citep{adakv}. Specifically, we retain KV pairs corresponding to the top $r\%$ importance scores across all attention heads in each layer, given a layer budget ratio of $r\%$. \Cref{appendix:uniform} provides results with uniform head-budget allocation.

We implement the prefill version of $\text{H}_2\text{O}$ based on the official GitHub code provided by PyramidKV\footnote{\url{https://github.com/Zefan-Cai/KVCache-Factory}}. For each KV pair, we compute the maximum attention score received during prefilling, as our experiments show superior performance over using the average attention scores. This result aligns with observations by \citet{tova}. $\text{H}_2\text{O}$ serves as a counterpart to KVzip by utilizing self-attention scores from prefilling, while our method employs self-attention scores from reconstruction.


\paragraph{Datasets.}
In our main experiment described in \Cref{sec:exp_benchmark}, we consider nine English tasks from SCBench \citep{scbench}. Additionally, SCBench provides multi-task datasets, \textit{i.e.}, Mix.Sum+NIAH and Mix.RepoQA+KV, each composed of two distinct tasks. As performance patterns for these multi-task datasets closely resemble our main results on individual tasks, we present their results separately in \Cref{appendix:individual}. Considering the 128K context length limitation of LLaMA3.1 and Gemma3, we exclude data examples from the En.QA and En.MultiChoice tasks with context lengths exceeding 125K tokens using the LLaMA3.1 tokenizer. For synthetic tasks such as Retr.KV, context lengths span up to 125K tokens with the LLaMA3.1 tokenizer and up to 170K tokens with the Qwen2.5 tokenizer. 

SnapKV retains KV pairs in a trailing context window \citep{snapkv}, notably biasing shorter contexts toward recent tokens which results in degraded performance. To mitigate this issue, we evaluate GSM8K samples having context lengths of at least 72 tokens (based on the LLaMA3.1 tokenizer) \citep{gsm}, aligning with SnapKV's observation window size of 16.
For the Needle-in-a-Haystack (NIAH) task \citep{needle}, we utilize the published GitHub repository\footnote{\url{https://github.com/FranxYao/Long-Context-Data-Engineering}}. Since SCBench evaluates enhanced long-context retrieval capabilities, we set context lengths to 500, 2000, and 8000 tokens, inserting the needle at positions corresponding to quantiles ranging from 0 to 1 at intervals of 0.1 for a comprehensive evaluation.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Broader Impacts and Limitations}\label{appendix:impacts}

\paragraph{Broader Impacts.}
Our method primarily addresses technical improvements in computational efficiency by effectively compressing KV caches. Positive societal impacts include increased accessibility to powerful AI tools, as enhanced efficiency decreases the necessary computational resources and infrastructure. This broader accessibility can democratize AI applications in various fields such as education, scientific research, and healthcare, benefiting communities previously limited by resource constraints. While our method specifically targets technical efficiency, we acknowledge potential changes in model behavior due to compression, as analyzed in \Cref{tab:behavior}. 

\paragraph{Limitations.}
Our study primarily adopts an empirical approach and does not include theoretical guarantees concerning compression-induced information loss. As noted in \Cref{tab:behavior}, KV eviction might raise potential concerns regarding privacy leakage. Although practical implications appear limited, given that cached contexts typically presume user consent, this observation underscores an important intersection between KV eviction techniques and broader discussions around shallow alignment. Finally, our approach involves a compression overhead, as detailed in \Cref{sec:method_complexity}. This overhead can be amortized over multiple queries. While context-independent head-level eviction strategies can effectively eliminate overhead at deployment, their compression efficiency generally falls short compared to context-dependent approaches, as shown in \Cref{fig:pruning_structure}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Analysis and Experiments}

\subsection{Reconstruction Chunk Size}\label{appendix:chunk_size}
\Cref{fig:chunk_exp} analyzes how scoring chunk size $m$ influences performance. Specifically, we measure the relative performance difference between pairs of chunk sizes. For instance, the relative difference between chunk sizes 1K and 2K equals $|p_{\text{1k}} - p_{\text{2k}}| / p_{\text{2k}}$, where $p$ denotes performance at each chunk size. Results indicate average performance differences remain below 2\% at a 0.3 KV cache ratio, confirming negligible impact. Given these results, we adopt a chunk size of 2K for all experiments, as this achieves optimal computational efficiency while negligibly affecting the token position index limit (\Cref{fig:complexity}).

\begin{figure}[!ht]
    \centering
    \input{figure/g-chunk}
    \vspace{-0.2em}
    \caption{
    Relative performance differences for varying scoring chunk sizes, averaged over SCBench datasets with LLaMA3.1-8B.}
    \label{fig:chunk_exp}
\end{figure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Repeat Prompts}\label{appendix:prompt}
In our experiment, we use the repeat prompt: ``Repeat the previous context:''. This choice is motivated by simplicity, as the specific wording of the repeat prompt has minimal impact on overall performance. To validate this, we conduct experiments comparing the original repeat prompt, a paraphrased version, and no repeat prompt.
\Cref{tab:prompt} shows that our method is robust to variations in the repeat prompt; even without the repeat prompt, context reconstruction remains effective. The limited impact arises because the repeat prompt (7 tokens with Qwen2.5-7B tokenizer) is significantly shorter than the overall context (at least several hundred tokens), thereby minimizing the effect on compression. 

To further clarify this, we analyze attention patterns. Specifically, we measure the proportion of prefilled KV pairs whose maximum cross-attention scores during reconstruction originated from the repeated context rather than the repeat prompt (see \Cref{fig:method}). For a 2K token-length context from NIAH, 98.1\% of KV pairs have their maximum attention from the repeated context. 
Among the KV pairs retained after 30\% compression, 99.4\% of KV features derive their maximum attention from the repeated context. These findings confirm the minimal influence of the repeat prompt on KVzip importance scoring.

\begin{table}[h]
\centering
\caption{Test performance of Qwen2.5-7B on SQuAD at a 30\% KV cache ratio. Note, SnapKV achieves 32.15\% in this setting.}
\vspace{0.5em}
\label{tab:prompt}
\begin{tabular}{l c}
\toprule
\textbf{Repeat prompt type} & \textbf{Accuracy (\%)} \\
\midrule
Original (``Repeat the previous context:'')     & 94.37 \\
Paraphrased (``Reproduce the preceding context without any changes.'')  & 94.45 \\
No (``\textbackslash n\textbackslash n'')           & 94.25 \\
\bottomrule
\end{tabular}
\end{table}


\subsection{Softmax-Free Importance Scoring}\label{appendix:logit}
In \Cref{algo}, we use the Softmax-normalized attention scores as the KV importance scores. To obtain query and key vectors at each layer, we forward the repeated input through $\lm$ using FlashAttention. Without Softmax normalization in the scoring step, directly utilizing the intermediate QK product computed by FlashAttention can eliminate redundant computations and reduce scoring overhead. Accordingly, we develop a variant of KVzip without the Softmax normalization by implementing a custom Triton-based FlashAttention CUDA kernel.

In \Cref{algo}, the scoring procedure accounts for approximately 10\% of the total forward computation time using $\lm$. Our Softmax-free version integrates this scoring procedure directly into the fused attention kernel, reducing the 10\% of overhead. However, as illustrated in \Cref{fig:logit}, omitting Softmax normalization results in approximately a 10\% degradation in compression ratios. Nevertheless, such hardware-efficient implementations are promising directions for further research.

\begin{figure}[!ht]
    \centering
    \input{figure/g-logit}
    \vspace{-0.3em}
    \caption{Performance of the Softmax-free variant of KVzip (\textit{logit}) on Retr.KV in SCBench with LLaMA3.1-8B.}
    \label{fig:logit}
\end{figure}


\subsection{Uniform KV Head Budgets}\label{appendix:uniform}
\Cref{fig:uniform} compares the performance of uniform head-budget allocation with the non-uniform allocation adopted in the main experiments. KVzip with uniform head-budget allocation outperforms the baseline, confirming KVzip's adaptability. However, non-uniform allocation achieves superior compression performance—consistent with previous findings by \citet{adakv}—by more effectively capturing variations in importance across heads, as illustrated in \Cref{fig:visual_kv}.

\begin{figure}[!ht]
    \centering
    \input{figure/g-uniform}
    \vspace{-0.3em}
    \caption{Performance comparison using non-uniform and uniform head-budget allocations on SQuAD with LLaMA3.1-8B. \textit{Unif.} refers to the uniform allocation.}
    \label{fig:uniform}
\end{figure}

% \section{Score Visualization}\label{appendix:visualization}

% \Cref{fig:visual_kv} provides visualization of KV importance scores by KVzip, while comparing other types of importance scores obtained through different tasks and obtained during prefill (\Cref{sec:observation}--\Cref{fig:observation,fig:observation-heat}). For explanation, please refer to the caption. \Cref{tab:task-inputs,tab:task-inputs-gsm} provides text inputs for the tasks considered in the visualization. \Cref{fig:visual_head} illustrates head-scores comparing DuoAttention's head-scores (\Cref{sec:exp_benchmark}--\Cref{fig:pruning_structure}).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Individual Dataset Performance}\label{appendix:individual}

\paragraph{Model Scale and Architecture.} \Cref{fig:qwen14b,fig:llama8b,fig:gemma12b,fig:quant} presents performance results on individual datasets for the models Qwen2.5-14B-1M \citep{qwen}, LLaMA3.1-8B \citep{llama3}, Gemma3-12B \citep{gemma3}, and LLaMA3-8B-W8A8KV4 \citep{qserve}.

For the Gemma model, Retr.KV and Retr.Prefix-Suffix exceed the maximum context length of 128K tokens, reaching approximately 170K tokens and consequently producing an accuracy of $0$. Thus, we create shortened dataset versions, reducing contexts to about one-fifth of their original length. 

Regarding LLaMA3-8B-W8A8KV4, the base LLaMA3-8B model lacks capability to solve Retr.KV, Retr.Prefix-Suffix, and Math.Find tasks, resulting in near-zero accuracy. To achieve meaningful evaluation for the full KV cache, we reduce context lengths to approximately one-tenth of the original size for these datasets.


\paragraph{Multi-Task Datasets.} \Cref{fig:multi} presents evaluation results on multi-task datasets from SCBench, \textit{i.e.}, Mix.Sum+NIAH and Mix.RepoQA+KV, each composed of two distinct tasks \citep{scbench}. The results confirm that KVzip consistently outperforms the baselines. \Cref{fig:llama3b} presents results for LLaMA3.1-3B \citep{llama3}, demonstrating the superior performance of KVzip on this smaller-scale model.


\paragraph{RULER Benchmark.} 
To further highlight KVzip's effectiveness, we present results on the RULER benchmark \citep{ruler}. 
These results are publicly available by the NVIDIA KVPress repository\footnote{\url{https://huggingface.co/spaces/nvidia/kvpress-leaderboard}}. 
\Cref{fig:ruler} demonstrates that KVzip significantly outperforms current state-of-the-art KV eviction methods, maintaining performance at a 25\% compression rate, whereas others experience significant performance degradation.

\begin{figure}[!ht]
    \centering
    \input{figure/g-ruler}
    \vspace{-0.3em}
    \caption{Average performance on the RULER benchmark using Qwen3-8B.}
    \label{fig:ruler}
\end{figure}


\newpage
\begin{figure}[!ht]
    \centering
    \input{figure/a-qwen14b}
    \vspace{-1em}
    \caption{Benchmark results using Qwen2.5-14B-1M \citep{qwen} across compression ratios from 0.1 to 1.0.}
    \label{fig:qwen14b}

    \vspace{3em}
    \centering
    \input{figure/a-llama8b}
    \vspace{-1em}
    \caption{Benchmark results using LLaMA3.1-8B \citep{llama3} across compression ratios from 0.1 to 1.0.}
    \label{fig:llama8b}
\end{figure}


\newpage
\begin{figure}[!ht]
    \centering
    \input{figure/a-gemma12b}
    \vspace{-1em}
    \caption{Benchmark results using Gemma3-12B \citep{gemma3} across compression ratios from 0.1 to 1.0.}
    \label{fig:gemma12b}

    \vspace{3em}
    \centering
    \input{figure/a-quant}
    \vspace{-1em}
    \caption{Benchmark results using LLaMA3-8B-W8A8KV4 \citep{qserve} across compression ratios from 0.1 to 1.0.}
    \label{fig:quant}
\end{figure}


\newpage
\begin{figure}[!ht]
    \vspace{7em}
    \centering
    \input{figure/a-multi}
    \caption{Benchmark results on SCBench multi-task datasets using Qwen2.5-7B-1M \citep{qwen} across compression ratios from 0.1 to 1.0.}
    \label{fig:multi}

    \vspace{9em}
    \centering
    \input{figure/a-llama3b}
    \vspace{-0.5em}
    \caption{Benchmark results for LLaMA3.1-3B \citep{llama3} across compression ratios ranging from 0.1 to 1.0. The evaluation focuses on shorter contexts, as LLaMA3.1-3B lacks the capability to solve SCBench tasks, resulting in near-zero accuracy.}
    \label{fig:llama3b}
\end{figure}


\newpage
\begin{table}[!ht]
    \vspace{4em}
    \centering
    \caption{Inputs for KV cache importance scoring from a SQuAD example (used in the visualizations in \Cref{fig:observation-heat} and \Cref{fig:visual_kv}). The context is included in the input of the repeat task.}
    \vspace{0.5em}
    \input{table/task}
    \label{tab:task-inputs}

    \vspace{4em}
    \centering
    \caption{Inputs for importance scoring from a GSM8K example used in the visualization in \Cref{fig:observation-heat}, a reasoning task. The context is included in the input of the repeat task.}
    \vspace{0.5em}
    \input{table/task-gsm}
    \label{tab:task-inputs-gsm}
    \vspace{2em}
\end{table}


\newpage
\begin{figure}[!ht]
    \vspace{4cm}
    \centering
    \input{figure/v-head}
    \vspace{-0.5em}
    \caption{\textbf{Visualization of head-level importance scores} for context-independent compression in \Cref{sec:exp_benchmark}.
    We use the head scores obtained from an En.QA example in our primary experiments (\Cref{fig:pruning_structure}). For reference, (c)-(e) show head scores derived from alternative data sources from SCBench \citep{scbench}. Our scoring method yields a more uniformly distributed importance pattern compared to DuoAttention. We select the En.QA sample for our main experiments due to its comprehensive overlap with importance patterns from other data sources, whereas Retr.KV, composed of synthetic passkeys, exhibits sparser importance patterns.}
    \label{fig:visual_head}
    \vspace{2em}
\end{figure}