From 95bf5f7d87ad34e8419ab7b4ad5fd336d9068fe1 Mon Sep 17 00:00:00 2001
From: Flipbook <26732651+TheFlipbook@users.noreply.github.com>
Date: Sat, 6 Apr 2024 18:29:05 -0700
Subject: [PATCH] llama_sampling_sample with default args is more naively
 usable

* Batches populated by either llama_batch_get_one or llama_batch_add work with default args
  * Previously get_one could use the default argument
  * Previously add should usually have used the last index where logits[idx] == true
* This hopefully encourages the use of llama_batch_add
  * By giving expected results when using default arguments.
* Adds "negative indexing" feature to llama_get_logits_ith and llama_get_embeddings_ith
* Believed to work with any currently well behaved program
  * Default arg now works for both cases (previously would give strange results for add case)
  * Any non-negative number is unaffected and behaves as previously
  * Negative arguments were previously invalid.
* Implemented as a special case of indexing as suggested by @compilade in https://github.com/ggerganov/llama.cpp/pull/6519
---
 common/sampling.h |  2 +-
 llama.cpp         | 38 +++++++++++++++++++++++++++++---------
 llama.h           |  6 ++++--
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/common/sampling.h b/common/sampling.h
index 56ed991b8478a..639b819ab4fb2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -129,7 +129,7 @@ llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx = -1);
 
 // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
 llama_token_data_array llama_sampling_prepare(
diff --git a/llama.cpp b/llama.cpp
index 217726184879f..d700a53fe811b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2177,7 +2177,7 @@ struct llama_context {
 
     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
     size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
-    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch
+    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
 
     bool logits_all = false;
 
@@ -10411,6 +10411,9 @@ static int llama_decode_internal(
         n_outputs_prev += lctx.n_outputs;
     }
 
+    // set to total number of outputs in the batch, for use in llama_get_logits_ith
+    lctx.n_outputs = n_outputs;
+
     // wait for the computation to finish (automatically done when obtaining the model output)
     //llama_synchronize(&lctx);
 
@@ -15511,23 +15514,31 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
     llama_synchronize(ctx);
 
     try {
         if (ctx->logits == nullptr) {
             throw std::runtime_error("no logits");
         }
-        if ((size_t) i >= ctx->output_ids.size()) {
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];
 
         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if ((size_t) j >= ctx->output_size) {
+        if ((size_t) j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%lu)", j, ctx->n_outputs));
         }
 
         return ctx->logits + j*ctx->model.hparams.n_vocab;
@@ -15547,23 +15558,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 }
 
 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
     llama_synchronize(ctx);
 
     try {
         if (ctx->embd == nullptr) {
             throw std::runtime_error("no embeddings");
         }
-        if ((size_t) i >= ctx->output_ids.size()) {
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];
 
         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if ((size_t) j >= ctx->output_size) {
+        if ((size_t) j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%lu)", j, ctx->n_outputs));
         }
 
         return ctx->embd + j*ctx->model.hparams.n_embd;
diff --git a/llama.h b/llama.h
index 036b3268533cf..bb84c2a9a8f50 100644
--- a/llama.h
+++ b/llama.h
@@ -684,8 +684,9 @@ extern "C" {
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
+    // Logits for the ith token. For positive indices, Equivalent to:
     // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
 
@@ -697,8 +698,9 @@ extern "C" {
     // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token. Equivalent to:
+    // Get the embeddings for the ith token. For positive indices, Equivalent to:
     // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);