From 41abfa3ea8d973b7e8847fc39755f8178f7e04f0 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 16 Nov 2024 00:28:59 -0800 Subject: [PATCH] Work around multiple image handling --- llamafile/server/main.1 | 4 ++ llamafile/server/main.1.asc | 103 +++++++++++++++++++----------------- llamafile/server/slot.cpp | 21 +++++++- llamafile/server/slots.h | 2 +- 4 files changed, 79 insertions(+), 51 deletions(-) diff --git a/llamafile/server/main.1 b/llamafile/server/main.1 index ebe6777e44..b821ac6371 100644 --- a/llamafile/server/main.1 +++ b/llamafile/server/main.1 @@ -53,6 +53,10 @@ resources, and control how much completion parallelism can happen. Please note that .Fl Fl ctx-size has a strong influence on how many slots can be created. +.It Fl p Ar TEXT , Fl Fl prompt Ar TEXT +Specifies system prompt. This value is passed along to the web frontend. +.It Fl Fl no-display-prompt Ar TEXT +Hide system prompt from web user interface. .It Fl Fl url-prefix Ar URLPREFIX Specifies a URL prefix (subdirectory) under which the HTTP server will make the API accessible, e.g. /lamafiler. Useful when running llamafiler diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc index 4fe4017a20..9b498f8ec7 100644 --- a/llamafile/server/main.1.asc +++ b/llamafile/server/main.1.asc @@ -59,9 +59,16 @@ note that --ctx-size has a strong influence on how many slots can be created. + -p TEXT, --prompt TEXT + Specifies system prompt. This value is passed along to the web + frontend. + + --no-display-prompt TEXT + Hide system prompt from web user interface. + --url-prefix URLPREFIX - Specifies a URL prefix (subdirectory) under which the HTTP - server will make the API accessible, e.g. /lamafiler. Useful + Specifies a URL prefix (subdirectory) under which the HTTP + server will make the API accessible, e.g. /lamafiler. Useful when running llamafiler behind a reverse proxy such as NGINX or Redbean. By default, this is set to / (root). 
@@ -69,80 +76,80 @@ Number of HTTP client handling threads. --trust CIDR - Adds a network to the trusted network list. This argument is - specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By + Adds a network to the trusted network list. This argument is + specified in the form IPV4/MASKBITS, e.g. 192.168.0.0/24. By default, all clients are untrusted, which means they're subject to token bucket throttling, and additional security precautions - that may cause request handling to go slightly slower. There‐ - fore this flag is important to use if you want to accurately - benchmark llamafiler, since the server will otherwise see the + that may cause request handling to go slightly slower. There‐ + fore this flag is important to use if you want to accurately + benchmark llamafiler, since the server will otherwise see the benchmark as a DDOS and deprioritize its traffic accordingly. --ip-header STR - If this flag is passed a value, e.g. X-Forwarded-For, then any + If this flag is passed a value, e.g. X-Forwarded-For, then any trusted may send this header to your llamafile server to let it - know what the true effective client IPv4 address actually is. - After this happens the default security restrictions, e.g. to‐ - ken bucket, will be measured and applied against that IPv4 ad‐ + know what the true effective client IPv4 address actually is. + After this happens the default security restrictions, e.g. to‐ + ken bucket, will be measured and applied against that IPv4 ad‐ dress and its adjacent networks. --token-rate N - Specifies how many times per second a token is dropped in each - bucket. This setting is used to define a limitation on how - many TCP connects and HTTP messages each chunk of the IPv4 ad‐ + Specifies how many times per second a token is dropped in each + bucket. 
This setting is used to define a limitation on how + many TCP connects and HTTP messages each chunk of the IPv4 ad‐ dress space is permitted to send to llamafiler over a sustained - period of time. The default token rate is 1, which means that, - on a long enough timeline, a class-C network will be depriori‐ - tized if it sends more than one request per second. No real - penalty actually applies though until the server runs out of + period of time. The default token rate is 1, which means that, + on a long enough timeline, a class-C network will be depriori‐ + tized if it sends more than one request per second. No real + penalty actually applies though until the server runs out of resources, e.g. HTTP request workers. --token-burst N Specifies how many HTTP requests and TCP connects a given slice - of the IPv4 address space is permitted to send within a short - period of time, before token bucket restrictions kick in, and + of the IPv4 address space is permitted to send within a short + period of time, before token bucket restrictions kick in, and cause the client to be deprioritized. By default, this value is - set to 100. It may be tuned to any value between 1 and 127 in‐ + set to 100. It may be tuned to any value between 1 and 127 in‐ clusive. --token-cidr N - Specifies IPv4 address space granularity of token bucket algo‐ - rithm, in network bits. By default, this value is set to 24 - which means individual IPv4 addresses are viewed as being rep‐ - resentative members of a class-C network, or in other words, - each group of 256 IPv4 addresses is lumped together. If one IP - in the group does something bad, then bad things happen to all - the other IPv4 addresses in that granule. This number may be - set to any integer between 3 and 32 inclusive. Specifying a + Specifies IPv4 address space granularity of token bucket algo‐ + rithm, in network bits. 
By default, this value is set to 24 + which means individual IPv4 addresses are viewed as being rep‐ + resentative members of a class-C network, or in other words, + each group of 256 IPv4 addresses is lumped together. If one IP + in the group does something bad, then bad things happen to all + the other IPv4 addresses in that granule. This number may be + set to any integer between 3 and 32 inclusive. Specifying a higher number will trade away system memory to increase network - specificity. For example, using 32 means that 4 billion indi‐ - vidual token buckets will be created. By default, a background - thread drops one token in each bucket every second, so that + specificity. For example, using 32 means that 4 billion indi‐ + vidual token buckets will be created. By default, a background + thread drops one token in each bucket every second, so that could potentially be a lot of busy work. A value of three means - that everyone on the Internet who talks to your server will + that everyone on the Internet who talks to your server will have to fight over only eight token buckets in total. --unsecure - Disables sandboxing. By default, llamafiler puts itself in a + Disables sandboxing. By default, llamafiler puts itself in a SECCOMP BPF sandbox, so that even if your server gets hacked in - the worst possible way (some kind of C++ memory bug) then + the worst possible way (some kind of C++ memory bug) then there's very little damage an attacker will be able to do. This works by restricting system calls using Cosmopolitan Libc's im‐ - plementation of pledge() which is currently only supported on - Linux (other OSes will simply be unsecured by default). The - pledge security policy that's used by default is "stdio anet" - which means that only relatively harmless system calls like - read(), write(), and accept() are allowed once the server has - finished initializing. 
It's not possible for remotely executed - code to do things like launch subprocesses, read or write to + plementation of pledge() which is currently only supported on + Linux (other OSes will simply be unsecured by default). The + pledge security policy that's used by default is "stdio anet" + which means that only relatively harmless system calls like + read(), write(), and accept() are allowed once the server has + finished initializing. It's not possible for remotely executed + code to do things like launch subprocesses, read or write to the filesystem, or initiate a new connection to a server. -k N, --keepalive N - Specifies the TCP keepalive interval in seconds. This value is - passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're - supported by the host operating system. If this value is - greater than 0, then the the SO_KEEPALIVE and TCP_NODELAY op‐ - tions are enabled on network sockets, if supported by the host + Specifies the TCP keepalive interval in seconds. This value is + passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're + supported by the host operating system. If this value is + greater than 0, then the SO_KEEPALIVE and TCP_NODELAY op‐ + tions are enabled on network sockets, if supported by the host operating system. The default keepalive is 5. --http-obuf-size N @@ -165,7 +172,7 @@ curl -v http://127.0.0.1:8080/embedding?content=hello+world DOCUMENTATION - Read our Markdown documentation for additional help and tutorials. See + Read our Markdown documentation for additional help and tutorials. See llamafile/server/doc/index.md in the source repository on GitHub. 
SEE ALSO diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp index 9628a81a5a..9bff95ab03 100644 --- a/llamafile/server/slot.cpp +++ b/llamafile/server/slot.cpp @@ -57,6 +57,22 @@ generate_system_fingerprint(const llama_context_params* cparams) return b; } +// having multiple images in the context window is janky right now, so +// let's erase old images from the chat history until we find out more +static std::vector +remove_old_image_atoms(const std::vector& atoms) +{ + int last_image_idx = -1; + for (int i = 0; i < atoms.size(); ++i) + if (atoms[i].is_image()) + last_image_idx = i; + std::vector result; + for (int i = 0; i < atoms.size(); i++) + if (!atoms[i].is_image() || i == last_image_idx) + result.emplace_back(atoms[i]); + return result; +} + const char* Slot::describe_error(int err) { @@ -245,10 +261,11 @@ Slot::eval_atoms(const std::vector& atoms) } int -Slot::prefill(const std::vector& atoms) +Slot::prefill(const std::vector& atoms_) { if (!ctx_) return uninitialized; + std::vector atoms = remove_old_image_atoms(atoms_); int used_tokens = ctx_used(); int reuse_atoms = 0; int reuse_tokens = 0; @@ -269,10 +286,10 @@ Slot::prefill(const std::vector& atoms) history_.resize(reuse_atoms); } else { SLOG("failed to remove tokens from KV cache"); - llama_kv_cache_clear(ctx_); reuse_atoms = 0; reuse_tokens = 0; erase_tokens = used_tokens; + llama_kv_cache_clear(ctx_); history_.clear(); } } diff --git a/llamafile/server/slots.h b/llamafile/server/slots.h index 64e6b77545..261090bb4d 100644 --- a/llamafile/server/slots.h +++ b/llamafile/server/slots.h @@ -39,7 +39,7 @@ struct Slots // first elements are most recently used // last elements are least recently used - Dll* free_slots_; + Dll* free_slots_ = nullptr; explicit Slots(llama_model*); ~Slots();