Fix: Discard KV cache for last token before reusing prompt cache for prompt + suffix #79

Status: Open · wants to merge 2 commits into base: main
Fix: Discard KV cache for last token before reusing prompt cache with suffix added to prompt
sannat17 committed Oct 21, 2024 · commit 6606949bb0b42d534bec129897787aae7990b19a
2 changes: 2 additions & 0 deletions performance_optimization/prompt_reuse.py
@@ -18,6 +18,8 @@
 inputs = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
 with torch.no_grad():
     prompt_cache = model(**inputs, past_key_values = prompt_cache).past_key_values
+prompt_cache.key_cache = [x[:, :, :-1] for x in prompt_cache.key_cache]
+prompt_cache.value_cache = [x[:, :, :-1] for x in prompt_cache.value_cache]
 
 
 prompt = "Why are french people obsessed with french?"
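
For context, here is a minimal end-to-end sketch of the prompt-reuse pattern this patch targets, assuming the rest of prompt_reuse.py follows the standard transformers DynamicCache prompt-reuse example. The model name, the exact INITIAL_PROMPT text, and the generate call are illustrative assumptions, not taken from the PR. The fix crops the last sequence position from every layer's key/value tensors (shaped [batch, num_heads, seq_len, head_dim]) so that the final prompt token is re-processed together with the appended suffix on the next forward pass.

# Minimal sketch of prompt-cache reuse with the last-token crop applied.
# Model name and prompt strings are illustrative assumptions, not from the PR.
import copy

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "meta-llama/Llama-3.1-8B-Instruct"  # assumed; any causal LM works
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

INITIAL_PROMPT = "You are a helpful assistant. "

# Pre-fill the cache once with the shared prompt prefix.
prompt_cache = DynamicCache()
inputs = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
with torch.no_grad():
    prompt_cache = model(**inputs, past_key_values=prompt_cache).past_key_values

# The fix: drop the last sequence position from each layer's key/value
# tensors ([batch, num_heads, seq_len, head_dim]) so the final prompt
# token is re-processed together with whatever suffix is appended.
prompt_cache.key_cache = [k[:, :, :-1] for k in prompt_cache.key_cache]
prompt_cache.value_cache = [v[:, :, :-1] for v in prompt_cache.value_cache]

# Reuse a copy of the cropped cache for each prompt + suffix request,
# leaving the original cache intact for subsequent requests.
prompt = INITIAL_PROMPT + "Why are french people obsessed with french?"
new_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **new_inputs,
    past_key_values=copy.deepcopy(prompt_cache),
    max_new_tokens=20,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])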