From 41abfa3ea8d973b7e8847fc39755f8178f7e04f0 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@mozilla.com>
Date: Sat, 16 Nov 2024 00:28:59 -0800
Subject: [PATCH] Work around multiple image handling

---
 llamafile/server/main.1     |   4 ++
 llamafile/server/main.1.asc | 103 +++++++++++++++++++-----------------
 llamafile/server/slot.cpp   |  21 +++++++-
 llamafile/server/slots.h    |   2 +-
 4 files changed, 79 insertions(+), 51 deletions(-)

diff --git a/llamafile/server/main.1 b/llamafile/server/main.1
index ebe6777e44..b821ac6371 100644
--- a/llamafile/server/main.1
+++ b/llamafile/server/main.1
@@ -53,6 +53,10 @@ resources, and control how much completion parallelism can happen.
 Please note that
 .Fl Fl ctx-size
 has a strong influence on how many slots can be created.
+.It Fl p Ar TEXT , Fl Fl prompt Ar TEXT
+Specifies system prompt. This value is passed along to the web frontend.
+.It Fl Fl no-display-prompt
+Hides system prompt from web user interface.
 .It Fl Fl url-prefix Ar URLPREFIX
 Specifies a URL prefix (subdirectory) under which the HTTP server will
 make the API accessible, e.g. /lamafiler. Useful when running llamafiler
diff --git a/llamafile/server/main.1.asc b/llamafile/server/main.1.asc
index 4fe4017a20..9b498f8ec7 100644
--- a/llamafile/server/main.1.asc
+++ b/llamafile/server/main.1.asc
@@ -59,9 +59,16 @@
                note  that  [1m--ctx-size [22mhas a strong influence on how many slots
                can be created.
 
+       [1m-p [4m[22mTEXT[24m, [1m--prompt [4m[22mTEXT[0m
+               Specifies system prompt. This value is passed along to the  web
+               frontend.
+
+       [1m--no-display-prompt [4m[22mTEXT[0m
+               Hide system prompt from web user interface.
+
        [1m--url-prefix [4m[22mURLPREFIX[0m
-               Specifies a URL prefix  (subdirectory)  under  which  the  HTTP
-               server  will  make  the API accessible, e.g. /lamafiler. Useful
+               Specifies  a  URL  prefix  (subdirectory)  under which the HTTP
+               server will make the API accessible,  e.g.  /lamafiler.  Useful
                when running llamafiler behind a reverse proxy such as NGINX or
                Redbean. By default, this is set to / (root).
 
@@ -69,80 +76,80 @@
                Number of HTTP client handling threads.
 
        [1m--trust [4m[22mCIDR[0m
-               Adds a network to the trusted network list.  This  argument  is
-               specified  in  the  form IPV4/MASKBITS, e.g. 192.168.0.0/24. By
+               Adds  a  network  to the trusted network list. This argument is
+               specified in the form IPV4/MASKBITS,  e.g.  192.168.0.0/24.  By
                default, all clients are untrusted, which means they're subject
                to token bucket throttling, and additional security precautions
-               that may cause request handling to go slightly  slower.  There‐
-               fore  this  flag  is important to use if you want to accurately
-               benchmark llamafiler, since the server will otherwise  see  the
+               that  may  cause request handling to go slightly slower. There‐
+               fore this flag is important to use if you  want  to  accurately
+               benchmark  llamafiler,  since the server will otherwise see the
                benchmark as a DDOS and deprioritize its traffic accordingly.
 
        [1m--ip-header [4m[22mSTR[0m
-               If  this flag is passed a value, e.g. X-Forwarded-For, then any
+               If this flag is passed a value, e.g. X-Forwarded-For, then  any
                trusted may send this header to your llamafile server to let it
-               know what the true effective client IPv4 address  actually  is.
-               After  this happens the default security restrictions, e.g. to‐
-               ken bucket, will be measured and applied against that IPv4  ad‐
+               know  what  the true effective client IPv4 address actually is.
+               After this happens the default security restrictions, e.g.  to‐
+               ken  bucket, will be measured and applied against that IPv4 ad‐
                dress and its adjacent networks.
 
        [1m--token-rate [4m[22mN[0m
-               Specifies  how many times per second a token is dropped in each
-               bucket.  This setting is used to define  a  limitation  on  how
-               many  TCP connects and HTTP messages each chunk of the IPv4 ad‐
+               Specifies how many times per second a token is dropped in  each
+               bucket.   This  setting  is  used to define a limitation on how
+               many TCP connects and HTTP messages each chunk of the IPv4  ad‐
                dress space is permitted to send to llamafiler over a sustained
-               period of time. The default token rate is 1, which means  that,
-               on  a long enough timeline, a class-C network will be depriori‐
-               tized if it sends more than one request  per  second.  No  real
-               penalty  actually  applies  though until the server runs out of
+               period  of time. The default token rate is 1, which means that,
+               on a long enough timeline, a class-C network will be  depriori‐
+               tized  if  it  sends  more than one request per second. No real
+               penalty actually applies though until the server  runs  out  of
                resources, e.g. HTTP request workers.
 
        [1m--token-burst [4m[22mN[0m
                Specifies how many HTTP requests and TCP connects a given slice
-               of the IPv4 address space is permitted to send within  a  short
-               period  of  time, before token bucket restrictions kick in, and
+               of  the  IPv4 address space is permitted to send within a short
+               period of time, before token bucket restrictions kick  in,  and
                cause the client to be deprioritized. By default, this value is
-               set to 100. It may be tuned to any value between 1 and 127  in‐
+               set  to 100. It may be tuned to any value between 1 and 127 in‐
                clusive.
 
        [1m--token-cidr [4m[22mN[0m
-               Specifies  IPv4 address space granularity of token bucket algo‐
-               rithm, in network bits. By default, this value  is  set  to  24
-               which  means individual IPv4 addresses are viewed as being rep‐
-               resentative members of a class-C network, or  in  other  words,
-               each  group of 256 IPv4 addresses is lumped together. If one IP
-               in the group does something bad, then bad things happen to  all
-               the  other  IPv4  addresses in that granule. This number may be
-               set to any integer between 3 and  32  inclusive.  Specifying  a
+               Specifies IPv4 address space granularity of token bucket  algo‐
+               rithm,  in  network  bits.  By default, this value is set to 24
+               which means individual IPv4 addresses are viewed as being  rep‐
+               resentative  members  of  a class-C network, or in other words,
+               each group of 256 IPv4 addresses is lumped together. If one  IP
+               in  the group does something bad, then bad things happen to all
+               the other IPv4 addresses in that granule. This  number  may  be
+               set  to  any  integer  between 3 and 32 inclusive. Specifying a
                higher number will trade away system memory to increase network
-               specificity.   For example, using 32 means that 4 billion indi‐
-               vidual token buckets will be created. By default, a  background
-               thread  drops  one  token  in each bucket every second, so that
+               specificity.  For example, using 32 means that 4 billion  indi‐
+               vidual  token buckets will be created. By default, a background
+               thread drops one token in each bucket  every  second,  so  that
                could potentially be a lot of busy work. A value of three means
-               that everyone on the Internet who talks  to  your  server  will
+               that  everyone  on  the  Internet who talks to your server will
                have to fight over only eight token buckets in total.
 
        [1m--unsecure[0m
-               Disables  sandboxing.  By  default, llamafiler puts itself in a
+               Disables sandboxing. By default, llamafiler puts  itself  in  a
                SECCOMP BPF sandbox, so that even if your server gets hacked in
-               the worst possible way (some  kind  of  C++  memory  bug)  then
+               the  worst  possible  way  (some  kind  of C++ memory bug) then
                there's very little damage an attacker will be able to do. This
                works by restricting system calls using Cosmopolitan Libc's im‐
-               plementation  of  pledge() which is currently only supported on
-               Linux (other OSes will simply be  unsecured  by  default).  The
-               pledge  security  policy that's used by default is "stdio anet"
-               which means that only relatively  harmless  system  calls  like
-               read(),  write(),  and accept() are allowed once the server has
-               finished initializing. It's not possible for remotely  executed
-               code  to  do  things like launch subprocesses, read or write to
+               plementation of pledge() which is currently only  supported  on
+               Linux  (other  OSes  will  simply be unsecured by default). The
+               pledge security policy that's used by default is  "stdio  anet"
+               which  means  that  only  relatively harmless system calls like
+               read(), write(), and accept() are allowed once the  server  has
+               finished  initializing. It's not possible for remotely executed
+               code to do things like launch subprocesses, read  or  write  to
                the filesystem, or initiate a new connection to a server.
 
        [1m-k [4m[22mN[24m, [1m--keepalive [4m[22mN[0m
-               Specifies the TCP keepalive interval in seconds. This value  is
-               passed  along to both TCP_KEEPIDLE and TCP_KEEPINTVL if they're
-               supported by the  host  operating  system.  If  this  value  is
-               greater  than  0, then the the SO_KEEPALIVE and TCP_NODELAY op‐
-               tions are enabled on network sockets, if supported by the  host
+               Specifies  the TCP keepalive interval in seconds. This value is
+               passed along to both TCP_KEEPIDLE and TCP_KEEPINTVL if  they're
+               supported  by  the  host  operating  system.  If  this value is
+               greater than 0, then the the SO_KEEPALIVE and  TCP_NODELAY  op‐
+               tions  are enabled on network sockets, if supported by the host
                operating system. The default keepalive is 5.
 
        [1m--http-obuf-size [4m[22mN[0m
@@ -165,7 +172,7 @@
              [1mcurl -v http://127.0.0.1:8080/embedding?content=hello+world[0m
 
 [1mDOCUMENTATION[0m
-       Read  our Markdown documentation for additional help and tutorials. See
+       Read our Markdown documentation for additional help and tutorials.  See
        llamafile/server/doc/index.md in the source repository on GitHub.
 
 [1mSEE ALSO[0m
diff --git a/llamafile/server/slot.cpp b/llamafile/server/slot.cpp
index 9628a81a5a..9bff95ab03 100644
--- a/llamafile/server/slot.cpp
+++ b/llamafile/server/slot.cpp
@@ -57,6 +57,22 @@ generate_system_fingerprint(const llama_context_params* cparams)
     return b;
 }
 
+// Multiple images in the context window are janky right now, so keep
+// only the newest image atom; all other atoms are copied in order.
+static std::vector<Atom>
+remove_old_image_atoms(const std::vector<Atom>& atoms)
+{
+    size_t last_image_idx = atoms.size(); // size() means no image seen
+    for (size_t i = 0; i < atoms.size(); ++i)
+        if (atoms[i].is_image())
+            last_image_idx = i;
+    std::vector<Atom> result;
+    for (size_t i = 0; i < atoms.size(); ++i)
+        if (!atoms[i].is_image() || i == last_image_idx)
+            result.emplace_back(atoms[i]);
+    return result;
+}
+
 const char*
 Slot::describe_error(int err)
 {
@@ -245,10 +261,11 @@ Slot::eval_atoms(const std::vector<Atom>& atoms)
 }
 
 int
-Slot::prefill(const std::vector<Atom>& atoms)
+Slot::prefill(const std::vector<Atom>& atoms_)
 {
     if (!ctx_)
         return uninitialized;
+    std::vector<Atom> atoms = remove_old_image_atoms(atoms_);
     int used_tokens = ctx_used();
     int reuse_atoms = 0;
     int reuse_tokens = 0;
@@ -269,10 +286,10 @@ Slot::prefill(const std::vector<Atom>& atoms)
             history_.resize(reuse_atoms);
         } else {
             SLOG("failed to remove tokens from KV cache");
-            llama_kv_cache_clear(ctx_);
             reuse_atoms = 0;
             reuse_tokens = 0;
             erase_tokens = used_tokens;
+            llama_kv_cache_clear(ctx_);
             history_.clear();
         }
     }
diff --git a/llamafile/server/slots.h b/llamafile/server/slots.h
index 64e6b77545..261090bb4d 100644
--- a/llamafile/server/slots.h
+++ b/llamafile/server/slots.h
@@ -39,7 +39,7 @@ struct Slots
 
     // first elements are most recently used
     // last elements are least recently used
-    Dll* free_slots_;
+    Dll* free_slots_ = nullptr;
 
     explicit Slots(llama_model*);
     ~Slots();