server: passkey challenge / self-extend with context shift demo #5832

Merged: 28 commits, Mar 2, 2024

Changes from 10 commits

Commits (28)
73a7e42
server: tests: add models endpoint scenario
phymbert Mar 2, 2024
0f774a8
server: /v1/models add some metadata
phymbert Mar 2, 2024
1780d96
server: tests: add debug field in context before scenario
phymbert Mar 2, 2024
319ded7
server: tests: download model from HF, add batch size
phymbert Mar 2, 2024
18e739d
server: tests: add passkey test
phymbert Mar 2, 2024
ab5b06b
server: logs: do not truncate log values
phymbert Mar 2, 2024
60113da
server: tests: add group attention params
phymbert Mar 2, 2024
616d7e9
server: do not truncate prompt tokens if self-extend through group at…
phymbert Mar 2, 2024
2495f72
server: logs: do not truncate log values
phymbert Mar 2, 2024
af82fb4
server: revert change on slot n_ctx
phymbert Mar 2, 2024
3b8242a
server: tests - missing EOL at EOF
phymbert Mar 2, 2024
ed60b97
server: tests - fix passkey not using pre/suffix
phymbert Mar 2, 2024
cf4c86e
server: tests - passkey - first good working value of nga
phymbert Mar 2, 2024
f8773f7
server: tests - passkey - limit the number of max tokens to predix
phymbert Mar 2, 2024
a80533e
server: tests - passkey - limit the number of max tokens to predix
phymbert Mar 2, 2024
8abf8d3
server: tests: fix server timeout
phymbert Mar 2, 2024
407cc60
server: tests: fix passkey, add doc, fix regex content matching, fix …
phymbert Mar 2, 2024
178b0c6
server: tests: fix regex content matching
phymbert Mar 2, 2024
9ab72d7
server: tests: schedule slow tests on master
phymbert Mar 2, 2024
9fcfa63
server: tests: schedule slow tests on master
phymbert Mar 2, 2024
61b9791
server: metrics: fix when no prompt processed
phymbert Mar 2, 2024
763ae0a
Merge remote-tracking branch 'origin/tests/server/passkey' into tests…
phymbert Mar 2, 2024
830d0ef
server: tests: CI workflow failed on first scenario failed
phymbert Mar 2, 2024
1aa5ad9
server: tests: fix re content
phymbert Mar 2, 2024
c1f66f0
server: tests: self-extend add llama-2-7B and Mixtral-8x7B-v0.1
phymbert Mar 2, 2024
2cdd21e
server: tests: increase timeout for completion
phymbert Mar 2, 2024
a6ea725
server: tests: keep only the PHI-2 test
phymbert Mar 2, 2024
0c7f5b2
server: tests: passkey add a negative test
phymbert Mar 2, 2024
6 changes: 0 additions & 6 deletions .github/workflows/server.yml
@@ -70,12 +70,6 @@ jobs:
run: |
pip install -r examples/server/tests/requirements.txt

- name: Download models
id: download_models
run: |
cd examples/server/tests
../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf

- name: Tests
id: server_integration_test
run: |
39 changes: 27 additions & 12 deletions examples/server/server.cpp
@@ -441,8 +441,8 @@ struct llama_server_context
const int ga_w = params.grp_attn_w;

if (ga_n != 1) {
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT

@@ -1709,8 +1709,8 @@ struct llama_server_context
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

// if input prompt is too big, truncate it
if (slot.n_prompt_tokens >= slot.n_ctx)
// if input prompt is too big, truncate it, if group attention self-extend is disabled
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2;
@@ -1785,9 +1785,11 @@
}

LOG_INFO("slot progression", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "n_past", slot.n_past },
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "n_past", slot.n_past },
{ "n_past_se", slot.n_past_se },
{ "ga_i", slot.ga_i },
{ "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
});
}
@@ -2001,6 +2003,17 @@ struct llama_server_context
LOG_VERBOSE("slots updated", {});
return true;
}

json model_meta() {
return json{
{"vocab_type", llama_vocab_type(model)},
{"n_vocab", llama_n_vocab(model)},
{"n_ctx_train", llama_n_ctx_train(model)},
{"n_embd", llama_n_embd(model)},
{"n_params", llama_model_n_params(model)},
{"size", llama_model_size(model)},
};
}
};

static void server_print_usage(const char *argv0, const gpt_params &params,
@@ -2994,6 +3007,7 @@ int main(int argc, char **argv)
state.store(SERVER_STATE_READY);
LOG_INFO("model loaded", {});
}
const auto model_meta = llama.model_meta();

if (sparams.chat_template.empty()) { // custom chat template is not supplied
// check if the template comes with the model is supported by us
@@ -3143,7 +3157,7 @@ int main(int argc, char **argv)
}
});

svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
{
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
std::time_t t = std::time(0);
@@ -3152,10 +3166,11 @@
{"object", "list"},
{"data", {
{
{"id", params.model_alias},
{"object", "model"},
{"created", t},
{"owned_by", "llamacpp"}
{"id", params.model_alias},
{"object", "model"},
{"created", t},
{"owned_by", "llamacpp"},
{"meta", model_meta}
},
}}
};
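
For illustration, a minimal sketch of exercising this change once the branch is built: the binary path, model file, port, and self-extend values below are assumptions, and the metadata values depend on the loaded model.

```shell
# Start the server with group-attention self-extend enabled, so long prompts
# are no longer truncated to the slot context (sketch; the --grp-attn-n /
# --grp-attn-w values and the model path are illustrative assumptions).
./build/bin/server -m models/phi-2.Q4_K_M.gguf \
    -c 8192 --grp-attn-n 4 --grp-attn-w 2048 --port 8080 &

# The OpenAI-compatible models endpoint now exposes the fields built by
# model_meta() under a "meta" key (values shown are illustrative):
curl -s http://localhost:8080/v1/models
# {"object":"list","data":[{"id":"phi-2","object":"model","created":1709337600,
#   "owned_by":"llamacpp","meta":{"vocab_type":1,"n_vocab":51200,
#   "n_ctx_train":2048,"n_embd":2560,"n_params":2779683840,"size":1615568768}}]}
```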
50 changes: 35 additions & 15 deletions examples/server/tests/README.md
@@ -1,47 +1,67 @@
# Server tests

Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
* [issues.feature](./features/issues.feature) Pending issues scenario
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
Python-based server test scenarios using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
and [behave](https://behave.readthedocs.io/en/latest/):

* [issues.feature](./features/issues.feature) Pending issues scenario
* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
* [security.feature](./features/security.feature) Security, CORS and API Key
* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...

Tests target GitHub workflows job runners with 4 vCPU.

Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client.
Requests use an [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html) /
[asyncio](https://docs.python.org/fr/3/library/asyncio.html) based HTTP client.

Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`.
Note: If the host machine's inference speed is faster than the GitHub runners', the parallel scenario may fail randomly.
To mitigate this, you can increase the `n_predict` and `kv_size` values.

### Install dependencies

`pip install -r requirements.txt`

### Run tests

1. Build the server

```shell
cd ../../..
mkdir build
cd build
cmake ../
cmake --build . --target server
```
2. download required models:
1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
3. Start the test: `./tests.sh`

2. Start the test: `./tests.sh`

It's possible to override some scenario step values with environment variables, as shown in the example after the table:
- `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080`
- `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server`
- `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose`
- `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format

| variable | description |
|--------------------------|------------------------------------------------------------------------------------------------|
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
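
For example, a minimal sketch combining several of these variables in a single invocation (the port and layer count are arbitrary example values):

```shell
# Run the scenarios against a locally built server binary on a non-default
# port, with verbose step/server output and no GPU offload.
LLAMA_SERVER_BIN_PATH=../../../build/bin/server \
PORT=8888 \
DEBUG=ON \
N_GPU_LAYERS=0 \
./tests.sh
```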

### Run @bug, @wip or @wrong_usage annotated scenario

Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope.

- The `@bug` annotation links a scenario to a GitHub issue.
- `@wrong_usage` scenarios show user issues that are actually expected behavior
- `@wip` marks a scenario that is a work in progress
- `@slow` marks a heavy test, disabled by default

To run a scenario annotated with `@bug`, start:
`DEBUG=ON ./tests.sh --no-skipped --tags bug`

```shell
DEBUG=ON ./tests.sh --no-skipped --tags bug
```

After changing logic in `steps.py`, ensure that the `@bug` and `@wrong_usage` scenarios are updated.

```shell
./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile"
```
5 changes: 4 additions & 1 deletion examples/server/tests/features/environment.py
@@ -7,7 +7,10 @@


def before_scenario(context, scenario):
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
if context.debug:
print("DEBUG=ON\n")
print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
port = 8080
if 'PORT' in os.environ:
port = int(os.environ['PORT'])
5 changes: 3 additions & 2 deletions examples/server/tests/features/parallel.feature
@@ -1,11 +1,12 @@
@llama.cpp
@parallel
Feature: Parallel

Background: Server startup
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model alias tinyllama-2
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And 42 as server seed
And 512 as batch size
And 64 KV cache size
And 2 slots
And embeddings extraction
53 changes: 53 additions & 0 deletions examples/server/tests/features/passkey.feature
@@ -0,0 +1,53 @@
#@llama.cpp
@passkey
@wip
@slow
@bug
Feature: Passkey / Self-extend with context shift

Background: Server startup
Given a server listening on localhost:8080

# Generates a long text of junk and inserts a secret passkey number inside it.
# We process the entire prompt using batches of n_batch and shifting the cache
# when it is full and then we query the LLM for the secret passkey.
# see #3856 and #4810
Scenario Outline: Passkey
Given a model file <hf_file> from HF repo <hf_repo>
And <n_batch> as batch size
And <n_junk> as number of junk
And a self-extend context with a factor of <n_grp>
And <seed> as seed
And a KV cache size based on the model trained context <n_ctx_train> extended by <n_grp> with additional <n_keep> tokens
And <n_slots> slots
And <n_ga> group attention factor to extend context size through self-extend
And <n_ga_w> group attention width to extend context size through self-extend
    # Can be overridden with N_GPU_LAYERS
And <ngl> GPU offloaded layers
Then the server is starting
Then the server is healthy
Given available models
Then model 0 is trained on <n_ctx_train> tokens context
Given a prefix prompt:
"""
here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
"""
And a passkey prompt template:
"""
The pass key is <passkey> Remember it. <passkey> is the pass key.
"""
And a junk suffix prompt:
"""
The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
"""
And a suffix prompt:
"""
What is the pass key? The pass key is
"""
Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>

Examples:
| hf_repo | hf_file | n_ctx_train | ngl | n_batch | n_slots | n_ga | n_ga_w | n_junk | n_grp | i_pos | seed | n_keep | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 512 | 1 | 4 | 2048 | 250 | 4 | 50 | 86 | 32 | 42 | -1 | .*42.* |
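
As a sketch, the scenario outline above can be run on its own using the tag-based invocation documented in the tests README; `--no-skipped` is needed because the feature is also annotated `@wip` and `@slow`, which are excluded by default:

```shell
# Run only the passkey scenarios with verbose output (sketch, assuming the
# tags shown in this feature file).
DEBUG=ON ./tests.sh --no-skipped --tags passkey
```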
3 changes: 2 additions & 1 deletion examples/server/tests/features/security.feature
@@ -1,9 +1,10 @@
@llama.cpp
@security
Feature: Security

Background: Server startup with an api key defined
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a server api key llama.cpp
Then the server is starting
Then the server is healthy
11 changes: 9 additions & 2 deletions examples/server/tests/features/server.feature
@@ -1,15 +1,17 @@
@llama.cpp
@server
Feature: llama.cpp server

Background: Server startup
Given a server listening on localhost:8080
And a model file stories260K.gguf
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model alias tinyllama-2
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
And 32 KV cache size
And 512 as batch size
And 1 slots
And embeddings extraction
And 32 server max tokens to predict
@@ -75,10 +77,15 @@ Feature: llama.cpp server
When an OAI compatible embeddings computation request for multiple inputs
Then embeddings are generated


Scenario: Tokenize / Detokenize
When tokenizing:
"""
What is the capital of France ?
"""
Then tokens can be detokenize

Scenario: Models available
Given available models
Then 1 models are supported
Then model 0 is identified by tinyllama-2
Then model 0 is trained on 128 tokens context