fix: Display performance metrics by default
shakalaca committed Jan 18, 2025
1 parent 83ed554 commit d2f0d12
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions llama_cpp/llama.py
@@ -94,6 +94,7 @@ def __init__(
offload_kqv: bool = True,
flash_attn: bool = False,
# Sampling Params
no_perf: bool = False,
last_n_tokens_size: int = 64,
# LoRA Params
lora_base: Optional[str] = None,
@@ -173,6 +174,7 @@ def __init__(
embedding: Embedding mode only.
offload_kqv: Offload K, Q, V to GPU.
flash_attn: Use flash attention.
no_perf: Disable performance timing measurements (timings are collected by default).
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
lora_path: Path to a LoRA file to apply to the model.
@@ -351,6 +353,7 @@ def __init__(
if type_v is not None:
self.context_params.type_v = type_v
# Sampling Params
self.context_params.no_perf = no_perf
self.last_n_tokens_size = last_n_tokens_size

self.cache: Optional[BaseLlamaCache] = None
@@ -2093,6 +2096,7 @@ def __getstate__(self):
offload_kqv=self.context_params.offload_kqv,
flash_attn=self.context_params.flash_attn,
# Sampling Params
no_perf=self.context_params.no_perf,
last_n_tokens_size=self.last_n_tokens_size,
# LoRA Params
lora_base=self.lora_base,
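The new keyword argument is forwarded to llama.cpp's llama_context_params.no_perf, so with the default no_perf=False performance timings are measured and reported as before this option existed. A minimal usage sketch, assuming a build of llama-cpp-python that includes this patch; the model path and prompt below are illustrative only:

from llama_cpp import Llama

# Default (no_perf=False): llama.cpp collects performance timings
# (e.g. prompt-eval and eval times) for the context.
llm = Llama(model_path="./model.gguf")  # illustrative path

# Pass no_perf=True to skip collecting performance timings.
quiet_llm = Llama(model_path="./model.gguf", no_perf=True)

out = llm("Q: What is llama.cpp? A:", max_tokens=32)
print(out["choices"][0]["text"])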
