diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 04e3fc0c17cf9..e385e03f3a245 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -47,6 +47,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
- name: Dependencies
id: depends
@@ -58,7 +60,7 @@ jobs:
cmake \
python3-pip \
wget \
- psmisc
+ language-pack-en
- name: Build
id: cmake_build
@@ -89,3 +91,46 @@ jobs:
run: |
cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+
+
+ server-windows:
+ runs-on: windows-latest
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+ cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+
+ - name: Python setup
+ id: setup_python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.11'
+
+ - name: Tests dependencies
+ id: test_dependencies
+ run: |
+ pip install -r examples/server/tests/requirements.txt
+
+ - name: Tests
+ id: server_integration_tests
+ run: |
+ cd examples/server/tests
+ behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+
+ - name: Slow tests
+ id: server_integration_tests_slow
+ if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+ run: |
+ cd examples/server/tests
+ behave.exe --stop --no-skipped --no-capture --tags slow
diff --git a/.gitignore b/.gitignore
index 5edb75607fd66..1a45e7c082058 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
+/gritlm
/imatrix
/infill
/libllama.so
@@ -82,6 +83,8 @@ examples/jeopardy/results.txt
poetry.lock
poetry.toml
+ggml-metal-merged.metal
+
# Test binaries
tests/test-grammar-parser
/tests/test-llama-grammar
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 559929c78c434..abb2d5bbacb26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -427,7 +427,13 @@ add_library(common2
common/common.cpp
common/common.h
common/grammar-parser.h
- common/grammar-parser.cpp)
+ common/grammar-parser.cpp
+ examples/llava/llava.cpp
+ examples/llava/llava.h
+ examples/llava/clip.cpp
+ examples/llava/clip.h
+ unicode.h
+ unicode.cpp)
target_include_directories(common2 PUBLIC . ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./examples ./common)
target_compile_features(common2 PUBLIC cxx_std_11) # don't bump
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
diff --git a/Makefile b/Makefile
index 7e80687d62410..771204906a66e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2
-tools: quantize_gpt2 quantize_gptj quantize_llama quantize_neox quantize_mpt llama-bench perplexity
+tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip llama-bench perplexity
dev: koboldcpp_openblas
dev2: koboldcpp_clblast
@@ -223,8 +223,11 @@ ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
HCC := $(ROCM_PATH)/llvm/bin/clang
HCXX := $(ROCM_PATH)/llvm/bin/clang++
+ ifdef ALL_AMD_GPU
GPU_TARGETS ?= gfx803 gfx900 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031 gfx1032 gfx1100 gfx1101 gfx1102 $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-
+ else
+ GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+ endif
endif
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
@@ -266,6 +269,8 @@ ifdef LLAMA_METAL
OBJS += ggml-metal.o
ggml-metal.o: ggml-metal.m ggml-metal.h
+ @echo "== Preparing merged Metal file =="
+ @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-merged.metal
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_METAL
@@ -384,31 +389,31 @@ $(info )
# Build library
#
-ggml.o: ggml.c ggml.h ggml-cuda.h
+ggml.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
-ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
-ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
-ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@
-ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
-ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@
-ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h
+ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h ggml-common.h
$(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@
#quants
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
-ggml-quants_noavx2.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
+ggml-quants_noavx2.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
-ggml-quants_failsafe.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
+ggml-quants_failsafe.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h ggml-common.h
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
@@ -417,6 +422,12 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
$(CC) $(CFLAGS) -c $< -o $@
+llava.o: examples/llava/llava.cpp examples/llava/llava.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+llavaclip.o: examples/llava/clip.cpp examples/llava/clip.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+unicode.o: unicode.cpp unicode.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
#version 3 libs
ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
@@ -506,26 +517,26 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@
clean:
- rm -vf *.o main sdmain quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
+ rm -vf *.o main sdmain quantize_gguf quantize_clip quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_clip.exe quantize_gguf.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
# useful tools
-main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o unicode.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo '==== Run ./main -h for help. ===='
-sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
+sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-quants.o ggml-alloc.o unicode.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
+imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o unicode.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o unicode.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
#generated libraries
-koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(DEFAULT_BUILD)
ifdef OPENBLAS_BUILD
-koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(OPENBLAS_BUILD)
else
koboldcpp_openblas:
@@ -533,7 +544,7 @@ koboldcpp_openblas:
endif
ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_failsafe.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_failsafe.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(FAILSAFE_BUILD)
else
koboldcpp_failsafe:
@@ -541,7 +552,7 @@ koboldcpp_failsafe:
endif
ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_noavx2: ggml_v4_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(NOAVX2_BUILD)
else
koboldcpp_noavx2:
@@ -549,10 +560,10 @@ koboldcpp_noavx2:
endif
ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_clblast: ggml_v4_clblast.o ggml_v3_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(CLBLAST_BUILD)
ifdef NOAVX2_BUILD
-koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_clblast_noavx2: ggml_v4_clblast_noavx2.o ggml_v3_clblast_noavx2.o ggml_v2_clblast_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_clblast_noavx2.o ggml-opencl.o ggml_v3-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(CLBLAST_BUILD)
else
koboldcpp_clblast_noavx2:
@@ -566,7 +577,7 @@ koboldcpp_clblast_noavx2:
endif
ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_cublas.o $(CUBLAS_OBJS) $(OBJS)
+koboldcpp_cublas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_cublas.o $(CUBLAS_OBJS) $(OBJS)
$(CUBLAS_BUILD)
else
koboldcpp_cublas:
@@ -574,7 +585,7 @@ koboldcpp_cublas:
endif
ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_cublas.o $(HIP_OBJS) $(OBJS)
+koboldcpp_hipblas: ggml_v4_cublas.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_cublas.o $(HIP_OBJS) $(OBJS)
$(HIPBLAS_BUILD)
else
koboldcpp_hipblas:
@@ -582,10 +593,10 @@ koboldcpp_hipblas:
endif
ifdef VULKAN_BUILD
-koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-quants.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(VULKAN_BUILD)
ifdef NOAVX2_BUILD
-koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o sdcpp_default.o $(OBJS)
+koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o llava.o llavaclip.o unicode.o grammar-parser.o sdcpp_default.o $(OBJS)
$(VULKAN_BUILD)
else
koboldcpp_vulkan_noavx2:
@@ -599,15 +610,17 @@ koboldcpp_vulkan_noavx2:
endif
# tools
-quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o
+quantize_gguf: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_clip: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o unicode.o examples/llava/clip.cpp examples/llava/clip.h examples/llava/quantclip.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(HIPLDFLAGS)
diff --git a/class.py b/class.py
index 2e3d50e1f2c8e..c1cd4a81ae0bf 100644
--- a/class.py
+++ b/class.py
@@ -273,7 +273,7 @@ def _load(self, save_model: bool, initial_load: bool) -> None:
unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, usevulkan=self.kcpp_usevulkan, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
- onready='', multiuser=False, foreground=False, preloadstory=None, noshift=False, remotetunnel=False, ssl=False, benchmark=False, nocertify=False, sdconfig=None)
+ onready='', multiuser=False, foreground=False, preloadstory=None, noshift=False, remotetunnel=False, ssl=False, benchmark=None, nocertify=False, sdconfig=None, mmproj=None, password=None)
#koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
diff --git a/colab.ipynb b/colab.ipynb
index 0b6c12e1dcd1f..cc9d608a156fa 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,13 +48,18 @@
"source": [
"#@title v-- Enter your model below and then click this to start Koboldcpp\r\n",
"\r\n",
- "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_M.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/afrideva/phi-2-uncensored-GGUF/resolve/main/phi-2-uncensored.q3_k_m.gguf\"]{allow-input: true}\r\n",
+ "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/afrideva/phi-2-uncensored-GGUF/resolve/main/phi-2-uncensored.q3_k_m.gguf\"]{allow-input: true}\r\n",
"Layers = 99 #@param [99]{allow-input: true}\r\n",
"ContextSize = 4096 #@param [4096] {allow-input: true}\r\n",
"ForceRebuild = False #@param {type:\"boolean\"}\r\n",
- "LoadImageModel = False \r\n",
- "SDModel = \"\"\r\n",
- "SDCommand = \"\"\r\n",
+ "#@markdown
\r\n",
+ "LoadLLaVAmmproj = False #@param {type:\"boolean\"}\r\n",
+ "LLaVAmmproj = \"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-13b-mmproj-v1.5.Q4_1.gguf\" #@param [\"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-13b-mmproj-v1.5.Q4_1.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/mistral-7b-mmproj-v1.5-Q4_1.gguf\",\"https://huggingface.co/koboldcpp/mmproj/resolve/main/llama-7b-mmproj-v1.5-Q4_0.gguf\"]{allow-input: true}\r\n",
+ "VCommand = \"\"\r\n",
+ "#@markdown
\r\n",
+ "LoadImgModel = False #@param {type:\"boolean\"}\r\n",
+ "ImgModel = \"https://huggingface.co/koboldcpp/imgmodel/resolve/main/imgmodel_older_q4_0.gguf\" #@param [\"https://huggingface.co/koboldcpp/imgmodel/resolve/main/imgmodel_older_q4_0.gguf\"]{allow-input: true}\r\n",
+ "SCommand = \"\"\r\n",
"\r\n",
"import os\r\n",
"if not os.path.isfile(\"/opt/bin/nvidia-smi\"):\r\n",
@@ -67,20 +72,27 @@
"kvers = kvers[0]\r\n",
"if ForceRebuild:\r\n",
" kvers = \"force_rebuild\"\r\n",
- "if SDModel and LoadImageModel:\r\n",
- " SDCommand = \"--sdconfig sdmodel.safetensors clamped 4 quant\"\r\n",
+ "if LLaVAmmproj and LoadLLaVAmmproj:\r\n",
+ " VCommand = \"--mmproj vmodel.gguf\"\r\n",
"else:\r\n",
- " SDCommand = \"\"\r\n",
+ " SCommand = \"\"\r\n",
+ "if ImgModel and LoadImgModel:\r\n",
+ " SCommand = \"--sdconfig imodel.gguf clamped 4 quant\"\r\n",
+ "else:\r\n",
+ " SCommand = \"\"\r\n",
"!echo Finding prebuilt binary for {kvers}\r\n",
"!wget -O dlfile.tmp https://kcppcolab.concedo.workers.dev/?{kvers} && mv dlfile.tmp koboldcpp_cublas.so\r\n",
"!test -f koboldcpp_cublas.so && echo Prebuilt Binary Exists || echo Prebuilt Binary Does Not Exist\r\n",
"!test -f koboldcpp_cublas.so && echo Build Skipped || make koboldcpp_cublas LLAMA_CUBLAS=1 LLAMA_COLAB=1 LLAMA_PORTABLE=1\r\n",
"!cp koboldcpp_cublas.so koboldcpp_cublas.dat\r\n",
+ "!apt update\r\n",
"!apt install aria2 -y\r\n",
"!aria2c -x 10 -o model.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $Model\r\n",
- "if SDCommand:\r\n",
- " !aria2c -x 10 -o sdmodel.safetensors --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $SDModel\r\n",
- "!python koboldcpp.py model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --quiet --remotetunnel $SDCommand\r\n"
+ "if VCommand:\r\n",
+ " !aria2c -x 10 -o vmodel.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $LLaVAmmproj\r\n",
+ "if SCommand:\r\n",
+ " !aria2c -x 10 -o imodel.gguf --summary-interval=5 --download-result=default --allow-overwrite=true --file-allocation=none $ImgModel\r\n",
+ "!python koboldcpp.py model.gguf --usecublas 0 mmq --multiuser --gpulayers $Layers --contextsize $ContextSize --quiet --remotetunnel $VCommand $SCommand\r\n"
]
}
],
diff --git a/common/common.cpp b/common/common.cpp
index 857d4d2963aa1..ec92fa02e15da 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -484,6 +484,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_batch = std::stoi(argv[i]);
+ } else if (arg == "-ub" || arg == "--ubatch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_ubatch = std::stoi(argv[i]);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
@@ -978,7 +984,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
- printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+ printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+ printf(" -ub N, --ubatch-size N\n");
+ printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
@@ -1288,7 +1296,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
auto cparams = llama_context_default_params();
cparams.n_ctx = params.n_ctx;
+ cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
+ cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.seed = params.seed;
@@ -1379,6 +1389,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_clear(lctx);
+ llama_synchronize(lctx);
llama_reset_timings(lctx);
}
@@ -1786,17 +1797,17 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
- view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
int seq_count = 0;
- for (int j = 0; j < view.n_max_seq; j++) {
+ for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) { seq_count++; }
}
putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
@@ -1809,14 +1820,14 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
- view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+ view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
std::unordered_map<llama_seq_id, size_t> seqs;
llama_kv_cache_view_cell * c_curr = view.cells;
llama_seq_id * cs_curr = view.cells_sequences;
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
- for (int j = 0; j < view.n_max_seq; j++) {
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+ for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] < 0) { continue; }
if (seqs.find(cs_curr[j]) == seqs.end()) {
if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
@@ -1835,11 +1846,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
c_curr = view.cells;
cs_curr = view.cells_sequences;
- for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+ for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
if (i % row_size == 0) {
printf("\n%5d: ", i);
}
- for (int j = 0; j < view.n_max_seq; j++) {
+ for (int j = 0; j < view.n_seq_max; j++) {
if (cs_curr[j] >= 0) {
const auto & it = seqs.find(cs_curr[j]);
putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
@@ -1852,3 +1863,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n");
}
+
+void llama_embd_normalize(const float * inp, float * out, int n) {
+ double sum = 0.0;
+ for (int i = 0; i < n; i++) {
+ sum += inp[i] * inp[i];
+ }
+ sum = sqrt(sum);
+
+ const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+
+ for (int i = 0; i < n; i++) {
+ out[i] = inp[i] * norm;
+ }
+}
+
diff --git a/common/common.h b/common/common.h
index 411204dbf9eec..6dc9395a41c45 100644
--- a/common/common.h
+++ b/common/common.h
@@ -45,7 +45,8 @@ struct gpt_params {
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
- int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+ int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 5; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
@@ -274,3 +275,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+void llama_embd_normalize(const float * inp, float * out, int n);
+
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index bf89a96f3617f..2a1301569793a 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -278,6 +278,22 @@ namespace grammar_parser {
while (*pos) {
pos = parse_rule(state, pos);
}
+ // Validate the state to ensure that all rules are defined
+ for (const auto & rule : state.rules) {
+ for (const auto & elem : rule) {
+ if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+ // Ensure that the rule at that location exists
+ if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+ // Get the name of the rule that is missing
+ for (const auto & kv : state.symbol_ids) {
+ if (kv.second == elem.value) {
+ throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+ }
+ }
+ }
+ }
+ }
+ }
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
diff --git a/common/log.h b/common/log.h
index e4e1b9f4f01aa..eb111e784bc9f 100644
--- a/common/log.h
+++ b/common/log.h
@@ -297,7 +297,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else
- #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
+ #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// Main TEE macro.
@@ -311,7 +311,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#ifndef _MSC_VER
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else
- #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
+ #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif
// LOG macro variants with auto endline.
@@ -319,8 +319,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else
- #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
- #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
+ #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
+ #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
#endif
// INTERNAL, DO NOT USE
diff --git a/common/sampling.cpp b/common/sampling.cpp
index c2e952f42831a..da06f16f05de3 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -17,6 +17,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
return nullptr;
}
+ // Ensure that there is a "root" node.
+ if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+ fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+ delete result;
+ return nullptr;
+ }
+
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
result->grammar = llama_grammar_init(
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index f6369af38081d..5eee320163d29 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1847,6 +1847,124 @@ class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
+@Model.register("MambaForCausalLM", "MambaLMHeadModel")
+class MambaModel(Model):
+ model_arch = gguf.MODEL_ARCH.MAMBA
+
+ def set_vocab(self):
+ vocab_size = self.hparams["vocab_size"]
+ # Round vocab size to next multiple of 8
+ pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
+ # pad using ceiling division
+ # ref: https://stackoverflow.com/a/17511341/22827863
+ vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+ self.hparams["vocab_size"] = vocab_size
+
+ if (self.dir_model / "tokenizer.json").is_file():
+ self._set_vocab_gpt2()
+ else:
+ # Use the GPT-NeoX tokenizer when no tokenizer files are present
+ tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
+ print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
+ neox_reader = gguf.GGUFReader(tokenizer_path, "r")
+
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
+ self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
+ self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
+ self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
+ self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
+ self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
+ self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+ field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
+ self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
+
+ def set_gguf_parameters(self):
+ d_model = self.find_hparam(["hidden_size", "d_model"])
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+ d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+ # ceiling division
+ # ref: https://stackoverflow.com/a/17511341/22827863
+ # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+ dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+ # Fail early for models which don't have a block expansion factor of 2
+ assert d_inner == 2 * d_model
+
+ self.gguf_writer.add_name(self.dir_model.name)
+ self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+ self.gguf_writer.add_embedding_length(d_model)
+ self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
+ self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
+ self.gguf_writer.add_block_count(self.hparams["n_layer"])
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
+ self.gguf_writer.add_ssm_inner_size(d_inner)
+ self.gguf_writer.add_ssm_state_size(d_state)
+ self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_file_type(self.ftype)
+
+ def write_tensors(self):
+ block_count = self.hparams["n_layer"]
+ tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+ tok_embd = None
+ tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
+ output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
+
+ for name, data_torch in self.get_tensors():
+ old_dtype = data_torch.dtype
+
+ # convert any unsupported data types to float32
+ if data_torch.dtype not in (torch.float16, torch.float32):
+ data_torch = data_torch.to(torch.float32)
+
+ # map tensor names
+ new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+ if new_name is None:
+ print(f"Can not map tensor {name!r}")
+ sys.exit()
+
+ if name.endswith(".A_log"):
+ print("A_log --> A ==> " + new_name)
+ data_torch = -torch.exp(data_torch)
+
+ # assuming token_embd.weight is seen before output.weight
+ if tok_embd is not None and new_name == output_name:
+ if torch.equal(tok_embd, data_torch):
+ print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
+ continue
+ if new_name == tok_embd_name:
+ tok_embd = data_torch
+
+ data = data_torch.squeeze().numpy()
+
+ n_dims = len(data.shape)
+ data_dtype = data.dtype
+
+ # if f32 desired, convert any float16 to float32
+ if self.ftype == 0 and data_dtype == np.float16:
+ data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+ if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+ data = data.astype(np.float32)
+
+ # if f16 desired, convert big float32 2-dim weight tensors to float16
+ if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+ data = data.astype(np.float16)
+
+ print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+ self.gguf_writer.add_tensor(new_name, data)
+
+
###### CONVERSION LOGIC ######
diff --git a/convert.py b/convert.py
index 6e3a0319b1e46..c15f8c47ea4f7 100755
--- a/convert.py
+++ b/convert.py
@@ -1377,7 +1377,6 @@ def main(args_in: list[str] | None = None) -> None:
# We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0")
parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
- parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
@@ -1393,18 +1392,6 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
args = parser.parse_args(args_in)
- if args.awq_path:
- sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
- from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
- tmp_model_path = args.model / "weighted_model"
- if tmp_model_path.is_dir():
- print(f"{tmp_model_path} exists as a weighted model.")
- else:
- tmp_model_path.mkdir(parents=True, exist_ok=True)
- print("Saving new weighted model ...")
- add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
- print(f"Saved weighted model at {tmp_model_path}.")
- args.model = tmp_model_path
if args.dump_single:
model_plus = lazy_load_file(args.model)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 653abc73ac98f..e762cf8b9238b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ else()
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(finetune)
+ add_subdirectory(gritlm)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 19aff18aefde7..19674dfd36708 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -105,6 +105,9 @@ int main(int argc, char ** argv) {
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ // ensure enough sequences are available
+ ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
@@ -135,6 +138,8 @@ int main(int argc, char ** argv) {
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
+
+ llama_synchronize(ctx);
}
return true;
@@ -174,10 +179,10 @@ int main(int argc, char ** argv) {
llama_batch_clear(batch);
- const int n_tokens = is_pp_shared ? pp : pl*pp;
-
- for (int i = 0; i < n_tokens; ++i) {
- llama_batch_add(batch, 0, i, { 0 }, false);
+ for (int i = 0; i < pp; ++i) {
+ for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
+ llama_batch_add(batch, 0, i, { j }, false);
+ }
}
batch.logits[batch.n_tokens - 1] = true;
@@ -192,7 +197,7 @@ int main(int argc, char ** argv) {
if (is_pp_shared) {
for (int32_t i = 1; i < pl; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index 9be7eb56bcd8a..ee1f8f1bf5dd2 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -80,6 +80,7 @@ int main(int argc, char ** argv) {
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
+ ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -132,7 +133,7 @@ int main(int argc, char ** argv) {
// assign the system KV cache to all parallel sequences
// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
for (int32_t i = 1; i < n_parallel; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
if (n_parallel > 1) {
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 8c48df00e54cd..72fc025ed1a35 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -190,12 +190,10 @@ int main(int argc, char ** argv) {
int32_t nelements = sizex*sizey;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
- ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
+ ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
// Set up a the compute graph
// printf("Creating new tensor q31\n");
@@ -208,7 +206,7 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
- ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
+ ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 4da81998afc45..7b436556280fe 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -24,17 +24,6 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
}
}
-static void normalize(const float * vec, float * out, int n) {
- float norm = 0;
- for (int i = 0; i < n; i++) {
- norm += vec[i] * vec[i];
- }
- norm = sqrt(norm);
- for (int i = 0; i < n; i++) {
- out[i] = vec[i] / norm;
- }
-}
-
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
@@ -45,7 +34,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
fprintf(stderr, "%s : failed to decode\n", __func__);
}
- // normalize on copy
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
@@ -62,7 +50,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}
float * out = output + batch.seq_id[i][0] * n_embd;
- normalize(embd, out, n_embd);
+ llama_embd_normalize(embd, out, n_embd);
}
}
@@ -120,7 +108,7 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
- GGML_ASSERT(params.n_batch == params.n_ctx);
+ GGML_ASSERT(params.n_batch >= params.n_ctx);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt
new file mode 100644
index 0000000000000..ac4a5ae7937ea
--- /dev/null
+++ b/examples/gritlm/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET gritlm)
+add_executable(${TARGET} gritlm.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
new file mode 100644
index 0000000000000..3d4b085d69b6f
--- /dev/null
+++ b/examples/gritlm/gritlm.cpp
@@ -0,0 +1,229 @@
+#include "common.h"
+#include "llama.h"
+
+#include <string>
+#include <vector>
+
+// #define GRIT_DEBUG
+
+static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
+ float dot = 0.0f;
+ for (uint64_t i = 0; i < v1.size(); ++i) {
+ dot += v1[i] * v2[i];
+ }
+ return dot;
+}
+
+static float norm(const std::vector<float> & v) {
+ return std::sqrt(dot_product(v, v));
+}
+
+static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
+ return dot_product(v1, v2) / (norm(v1) * norm(v2));
+}
+
+static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
+ std::vector<std::vector<float>> result;
+
+ const llama_model * mdl = llama_get_model(ctx);
+
+ llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+ for (uint64_t i = 0; i < sentences.size(); i++) {
+ llama_batch_clear(batch);
+
+ const std::string input_string = instruction + sentences[i];
+
+ std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
+
+ const int32_t n_toks = inputs.size();
+
+ // GritLM seems to have EOS = ""
+ // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
+ // inputs.push_back(llama_token_eos(mdl));
+
+ // we want to ignore instruction tokens for mean pooling
+ const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
+
+#ifdef GRIT_DEBUG
+ // debug tokens - should be matching as referenced in the GritLM sample
+ std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
+ std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
+ });
+ std::printf("\n");
+#endif
+
+ // add input to batch (this increments n_tokens)
+ for (int32_t j = 0; j < n_toks; j++) {
+ llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+ }
+
+ // clear previous kv_cache values (irrelevant for embeddings)
+ llama_kv_cache_clear(ctx);
+ llama_set_causal_attn(ctx, false);
+
+ // run model
+ llama_decode(ctx, batch);
+
+ // get embedding dimensions
+ uint64_t n_embd = llama_n_embd(mdl);
+
+ // allocate embedding output
+ std::vector<float> emb_unorm(n_embd, 0.0f);
+
+ // sum up all token embeddings
+ for (int32_t k = n_inst; k < n_toks; k++) {
+ float * emb = llama_get_embeddings_ith(ctx, k);
+ for (uint64_t j = 0; j < n_embd; j++) {
+ emb_unorm[j] += emb[j];
+ }
+ }
+
+ // divide by number of tokens (mean pooling)
+ {
+ const uint64_t n_sent = n_toks - n_inst;
+
+ for (uint64_t j = 0; j < n_embd; j++) {
+ emb_unorm[j] /= n_sent;
+ }
+ }
+
+ std::vector<float> emb_norm(emb_unorm.size());
+ llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+ result.push_back(emb_norm);
+
+#ifdef GRIT_DEBUG
+ // print out emb_norm
+ std::printf("embedding %ld: ", i);
+ for (uint64_t j = 0; j < n_embd; j++) {
+ std::printf("%.5f ", emb_norm[j]);
+ }
+ std::printf("\n\n");
+#endif
+ }
+
+ llama_batch_free(batch);
+
+ return result;
+}
+
+static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
+ std::string result;
+
+ const llama_model * mdl = llama_get_model(ctx);
+ llama_token eos_token = llama_token_eos(mdl);
+
+ llama_kv_cache_clear(ctx);
+ llama_set_causal_attn(ctx, true);
+ llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+ std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
+ int32_t i_current_token = 0;
+
+ while (true) {
+ llama_batch_clear(bat);
+ auto n_inputs = (int32_t)inputs.size();
+ for (int32_t i = 0; i < n_inputs; i++) {
+ llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+ }
+ inputs.clear();
+
+ llama_decode(ctx, bat);
+ auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
+
+ auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
+ auto n_candidates = (int32_t)candidates.size();
+ for (int32_t token = 0; token < n_candidates; token++) {
+ candidates[token] = llama_token_data{ token, logits[token], 0.0f };
+ }
+ auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
+
+ llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
+ if (token == eos_token) {
+ break;
+ }
+
+ std::string piece = llama_token_to_piece(ctx, token);
+ if (stream) {
+ std::printf("%s", piece.c_str());
+ std::fflush(stdout);
+ }
+
+ inputs.push_back(token);
+
+ result += piece;
+ }
+
+ if (stream) {
+ std::printf("\n");
+ }
+
+ llama_batch_free(bat);
+
+ return result;
+}
+
+static std::string gritlm_instruction(const std::string & instruction) {
+ return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
+}
+
+int main(int argc, char * argv[]) {
+ gpt_params params;
+ if (!gpt_params_parse(argc, argv, params)) {
+ return 1;
+ }
+
+ llama_model_params mparams = llama_model_params_from_gpt_params(params);
+ llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+ llama_backend_init();
+
+ llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
+
+ // create new context - set to embedding mode
+ cparams.embeddings = true;
+ llama_context * ctx = llama_new_context_with_model(mdl, cparams);
+
+ // ### Embedding/Representation ###
+ // samples taken from: https://github.com/ContextualAI/gritlm#basic
+ {
+ const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
+
+ const std::vector<std::string> queries = {
+ "Bitcoin: A Peer-to-Peer Electronic Cash System",
+ "Generative Representational Instruction Tuning",
+ };
+
+ const std::vector<std::string> documents = {
+ "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
+ "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
+ };
+
+ // No need to add instruction for retrieval documents
+ const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
+ const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
+
+ const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
+ const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
+ const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
+ const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+
+ std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
+ std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
+ std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
+ std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
+ }
+
+ // ### Generation ###
+ // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
+ {
+ const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
+ std::string response = generate(ctx, prompt, true);
+ }
+
+ llama_free(ctx);
+ llama_free_model(mdl);
+ llama_backend_free();
+
+ return 0;
+}
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 1acb3cacb7fcb..a19a9d665a3f1 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -165,6 +165,7 @@ struct cmd_params {
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<int> n_batch;
+ std::vector<int> n_ubatch;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
@@ -174,6 +175,7 @@ struct cmd_params {
std::vector<bool> no_kv_offload;
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
+ std::vector<bool> embeddings;
int reps;
bool verbose;
output_formats output_format;
@@ -183,7 +185,8 @@ static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
- /* n_batch */ {512},
+ /* n_batch */ {2048},
+ /* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()},
@@ -193,6 +196,7 @@ static const cmd_params cmd_params_defaults = {
/* no_kv_offload */ {false},
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
+ /* embeddings */ {false},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@@ -207,6 +211,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+ printf(" -ub N, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
@@ -215,7 +220,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
- printf(" -ts, --tensor_split (default: 0)\n");
+ printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+ printf(" -ts, --tensor-split (default: 0)\n");
printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@@ -295,6 +301,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+ } else if (arg == "-ub" || arg == "--ubatch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split<int>(argv[i], split_delim);
+ params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
@@ -383,6 +396,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
+ } else if (arg == "-embd" || arg == "--embeddings") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto p = split<bool>(argv[i], split_delim);
+ params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
@@ -446,6 +466,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
+ if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
@@ -454,6 +475,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
+ if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
return params;
@@ -464,6 +486,7 @@ struct cmd_params_instance {
int n_prompt;
int n_gen;
int n_batch;
+ int n_ubatch;
ggml_type type_k;
ggml_type type_v;
int n_threads;
@@ -473,6 +496,7 @@ struct cmd_params_instance {
bool no_kv_offload;
std::vector<float> tensor_split;
bool use_mmap;
+ bool embeddings;
llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params();
@@ -500,9 +524,11 @@ struct cmd_params_instance {
cparams.n_ctx = n_prompt + n_gen;
cparams.n_batch = n_batch;
+ cparams.n_ubatch = n_ubatch;
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.offload_kqv = !no_kv_offload;
+ cparams.embeddings = embeddings;
return cparams;
}
@@ -518,7 +544,9 @@ static std::vector get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & mmp : params.use_mmap)
+ for (const auto & embd : params.embeddings)
for (const auto & nb : params.n_batch)
+ for (const auto & nub : params.n_ubatch)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
@@ -532,6 +560,7 @@ static std::vector get_cmd_params_instances(const cmd_param
/* .n_prompt = */ n_prompt,
/* .n_gen = */ 0,
/* .n_batch = */ nb,
+ /* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
@@ -541,6 +570,7 @@ static std::vector get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
+ /* .embeddings = */ embd,
};
instances.push_back(instance);
}
@@ -554,6 +584,7 @@ static std::vector get_cmd_params_instances(const cmd_param
/* .n_prompt = */ 0,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
+ /* .n_ubatch = */ nub,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
@@ -563,6 +594,7 @@ static std::vector get_cmd_params_instances(const cmd_param
/* .no_kv_offload= */ nkvo,
/* .tensor_split = */ ts,
/* .use_mmap = */ mmp,
+ /* .embeddings = */ embd,
};
instances.push_back(instance);
}
@@ -589,6 +621,7 @@ struct test {
uint64_t model_size;
uint64_t model_n_params;
int n_batch;
+ int n_ubatch;
int n_threads;
ggml_type type_k;
ggml_type type_v;
@@ -598,6 +631,7 @@ struct test {
bool no_kv_offload;
std::vector<float> tensor_split;
bool use_mmap;
+ bool embeddings;
int n_prompt;
int n_gen;
std::string test_time;
@@ -611,6 +645,7 @@ struct test {
model_size = llama_model_size(lmodel);
model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch;
+ n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
type_k = inst.type_k;
type_v = inst.type_v;
@@ -620,6 +655,7 @@ struct test {
no_kv_offload = inst.no_kv_offload;
tensor_split = inst.tensor_split;
use_mmap = inst.use_mmap;
+ embeddings = inst.embeddings;
n_prompt = inst.n_prompt;
n_gen = inst.n_gen;
// RFC 3339 date-time format
@@ -688,10 +724,11 @@ struct test {
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
- "n_batch", "n_threads", "type_k", "type_v",
+ "n_batch", "n_ubatch",
+ "n_threads", "type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload",
- "tensor_split", "use_mmap",
+ "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
@@ -702,7 +739,8 @@ struct test {
enum field_type {STRING, BOOL, INT, FLOAT};
static field_type get_field_type(const std::string & field) {
- if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+ if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
+ field == "n_threads" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
@@ -711,7 +749,7 @@ struct test {
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
- field == "use_mmap") {
+ field == "use_mmap" || field == "embeddings") {
return BOOL;
}
if (field == "avg_ts" || field == "stddev_ts") {
@@ -742,10 +780,11 @@ struct test {
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
- std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+ std::to_string(n_batch), std::to_string(n_ubatch),
+ std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload),
- tensor_split_str, std::to_string(use_mmap),
+ tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -915,6 +954,9 @@ struct markdown_printer : public printer {
if (field == "use_mmap") {
return "mmap";
}
+ if (field == "embeddings") {
+ return "embd";
+ }
if (field == "tensor_split") {
return "ts";
}
@@ -937,6 +979,9 @@ struct markdown_printer : public printer {
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch");
}
+ if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
+ fields.emplace_back("n_ubatch");
+ }
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
fields.emplace_back("type_k");
}
@@ -958,6 +1003,9 @@ struct markdown_printer : public printer {
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
fields.emplace_back("use_mmap");
}
+ if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
+ fields.emplace_back("embeddings");
+ }
fields.emplace_back("test");
fields.emplace_back("t/s");
@@ -1073,25 +1121,32 @@ struct sql_printer : public printer {
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+ llama_set_n_threads(ctx, n_threads, n_threads);
+
+ //std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
+ //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
+ //GGML_UNUSED(n_batch);
+
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
int n_processed = 0;
- llama_set_n_threads(ctx, n_threads, n_threads);
-
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
n_processed += n_tokens;
}
+
+ llama_synchronize(ctx);
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
- llama_token token = llama_token_bos(llama_get_model(ctx));
-
llama_set_n_threads(ctx, n_threads, n_threads);
+ llama_token token = llama_token_bos(llama_get_model(ctx));
+
for (int i = 0; i < n_gen; i++) {
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+ llama_synchronize(ctx);
}
}
@@ -1180,7 +1235,8 @@ int main(int argc, char ** argv) {
// warmup run
if (t.n_prompt > 0) {
- test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
+ //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
+ test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
@@ -1196,6 +1252,7 @@ int main(int argc, char ** argv) {
if (t.n_gen > 0) {
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
}
+
uint64_t t_ns = get_time_ns() - t_start;
t.samples_ns.push_back(t_ns);
}
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
index 2beb1e0d5321d..ce8ab3b709407 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -33,6 +33,45 @@ jclass la_int_var;
jmethodID la_int_var_value;
jmethodID la_int_var_inc;
+std::string cached_token_chars;
+
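+// Note: this only checks that the byte sequence is structurally valid UTF-8 (each lead byte is followed by the expected number of continuation bytes); it does not reject overlong encodings or surrogate code points.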
+bool is_valid_utf8(const char * string) {
+ if (!string) {
+ return true;
+ }
+
+ const unsigned char * bytes = (const unsigned char *)string;
+ int num;
+
+ while (*bytes != 0x00) {
+ if ((*bytes & 0x80) == 0x00) {
+ // U+0000 to U+007F
+ num = 1;
+ } else if ((*bytes & 0xE0) == 0xC0) {
+ // U+0080 to U+07FF
+ num = 2;
+ } else if ((*bytes & 0xF0) == 0xE0) {
+ // U+0800 to U+FFFF
+ num = 3;
+ } else if ((*bytes & 0xF8) == 0xF0) {
+ // U+10000 to U+10FFFF
+ num = 4;
+ } else {
+ return false;
+ }
+
+ bytes += 1;
+ for (int i = 1; i < num; ++i) {
+ if ((*bytes & 0xC0) != 0x80) {
+ return false;
+ }
+ bytes += 1;
+ }
+ }
+
+ return true;
+}
+
static void log_callback(ggml_log_level level, const char * fmt, void * data) {
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
@@ -295,6 +334,8 @@ Java_com_example_llama_Llm_completion_1init(
jint n_len
) {
+ cached_token_chars.clear();
+
const auto text = env->GetStringUTFChars(jtext, 0);
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
@@ -372,8 +413,16 @@ Java_com_example_llama_Llm_completion_1loop(
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);
- LOGi("new_token_chars: `%s`", new_token_chars.c_str());
- auto new_token = env->NewStringUTF(new_token_chars.c_str());
+ cached_token_chars += new_token_chars;
+
+ jstring new_token = nullptr;
+ if (is_valid_utf8(cached_token_chars.c_str())) {
+ new_token = env->NewStringUTF(cached_token_chars.c_str());
+ LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
+ cached_token_chars.clear();
+ } else {
+ new_token = env->NewStringUTF("");
+ }
llama_batch_clear(*batch);
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
index 5f32703724a49..d86afee379083 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
@@ -71,7 +71,7 @@ class Llm {
batch: Long,
nLen: Int,
ncur: IntVar
- ): String
+ ): String?
private external fun kv_cache_clear(context: Long)
@@ -115,7 +115,7 @@ class Llm {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
- if (str.isEmpty()) {
+ if (str == null) {
break
}
emit(str)
diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
index 58fcf40c6fb69..c249291aea110 100644
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -221,6 +221,7 @@ actor LlamaContext {
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during prompt")
}
+ llama_synchronize(context)
let t_pp_end = ggml_time_us()
@@ -240,6 +241,7 @@ actor LlamaContext {
if llama_decode(context, batch) != 0 {
print("llama_decode() failed during text generation")
}
+ llama_synchronize(context)
}
let t_tg_end = ggml_time_us()
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index ef9e4ba7a6b5a..6653b815d93a1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1862,7 +1862,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
std::vector<uint8_t> work(512);
std::vector<float> conv_buf(512);
- std::vector<int64_t> hist_all(1 << 4, 0);
size_t total_size_org = 0;
size_t total_size_new = 0;
@@ -1917,48 +1916,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
}
new_data = work.data();
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch (new_type) {
- case GGML_TYPE_Q4_0: {
- new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1: {
- new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q5_0: {
- new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q5_1: {
- new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q8_0: {
- new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q2_K: {
- new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q3_K: {
- new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_K: {
- new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q5_K: {
- new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q6_K: {
- new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data());
- } break;
- default: {
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type);
- return false;
- }
- }
-
- for (size_t j = 0; j < hist_cur.size(); ++j) {
- hist_all[j] += hist_cur[j];
- }
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
} else {
new_type = cur->type;
new_data = cur->data;
@@ -1993,17 +1951,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
{
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
-
- int64_t sum_all = 0;
- for (size_t i = 0; i < hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
-
- printf("%s: hist: ", __func__);
- for (size_t i = 0; i < hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
}
return true;
diff --git a/examples/llava/quantclip.cpp b/examples/llava/quantclip.cpp
new file mode 100644
index 0000000000000..ed6f9f1cb8c57
--- /dev/null
+++ b/examples/llava/quantclip.cpp
@@ -0,0 +1,30 @@
+#include "ggml.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+
+#include "base64.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+
+int main(int argc, char ** argv) {
+ ggml_time_init();
+
+ if (argc != 3) {
+ fprintf(stderr, "usage: %s mmproj-f16.gguf output-mmproj-quantized.gguf\n", argv[0]);
+ return 1;
+ }
+
+ const std::string fname_inp = argv[1];
+ const std::string fname_out = argv[2];
+
+ printf("quantizing mmproj clip model to q4_1... ");
+ clip_model_quantize(fname_inp.c_str(), fname_out.c_str(), GGML_TYPE_Q4_1);
+ printf("done\n");
+
+ return 0;
+}
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index d693cecc05c99..8293dfa5f224c 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -879,6 +879,7 @@ int main(int argc, char ** argv) {
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 3548cec0771cd..6c8a6661ebacc 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -109,6 +109,9 @@ int main(int argc, char ** argv) {
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
+ // dedicate one sequence to the system prompt
+ params.n_parallel += 1;
+
// requests to simulate
const int32_t n_seq = params.n_sequences;
@@ -198,8 +201,8 @@ int main(int argc, char ** argv) {
}
// assign the system KV cache to all parallel sequences
- for (int32_t i = 1; i < n_clients; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system);
+ for (int32_t i = 1; i <= n_clients; ++i) {
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("\n");
@@ -223,15 +226,17 @@ int main(int argc, char ** argv) {
client.i_batch = batch.n_tokens;
- llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
+ llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
client.n_decoded += 1;
}
if (batch.n_tokens == 0) {
// all sequences have ended - clear the entire KV cache
- for (int i = 0; i < n_clients; ++i) {
- llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1);
+ for (int i = 1; i <= n_clients; ++i) {
+ llama_kv_cache_seq_rm(ctx, i, -1, -1);
+ // but keep the system prompt
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
LOG_TEE("%s: clearing the KV cache\n", __func__);
@@ -257,7 +262,7 @@ int main(int argc, char ** argv) {
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
- llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
+ llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
}
// extract the logits only for the last token
@@ -368,7 +373,8 @@ int main(int argc, char ** argv) {
}
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
- llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
+ llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
+ llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
const auto t_main_end = ggml_time_us();
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 194743cf46888..322c0280d9aa1 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -443,7 +443,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
-static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) {
if (params.ppl_stride > 0) {
return perplexity_v2(ctx, params);
}
@@ -454,7 +454,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
- const int n_ctx = llama_n_ctx(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@@ -500,13 +499,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
double nll2 = 0.0;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+ const int n_seq = std::max(1, n_batch / n_ctx);
+
+ GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
+ GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
+
+ llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
- fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+ fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@@ -519,10 +524,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
log_probs.resize(n_ctx * nv);
}
- for (int i = 0; i < n_chunk; ++i) {
+ // We get the logits for all the tokens in the context window (params.n_ctx)
+ // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
+ // calculate the perplexity over the last half of the window (so the model always has
+ // some context to predict the token).
+ //
+ // We rely on the fact that attention in the forward pass only looks at previous
+ // tokens here, so the logits returned for each token are an accurate representation
+ // of what the model would have predicted at that point.
+ //
+ // Example, we have a context window of 512, we will compute perplexity for each of the
+ // last 256 tokens. Then, we split the input up into context window size chunks to
+ // process the entire prompt.
+ const int first = n_ctx/2;
+
+ for (int i = 0; i < n_chunk; i += n_seq) {
const int start = i * n_ctx;
const int end = start + n_ctx;
+ const int n_seq_batch = std::min(n_seq, n_chunk - i);
+
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
@@ -532,34 +553,50 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
- // save original token and restore it after eval
- const auto token_org = tokens[batch_start];
+ batch.n_tokens = 0;
+ for (int seq = 0; seq < n_seq_batch; seq++) {
+ int seq_start = batch_start + seq*n_ctx;
- // add BOS token for the first batch of each chunk
- if (add_bos && j == 0) {
- tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+ // save original token and restore it after eval
+ const auto token_org = tokens[seq_start];
+
+ // add BOS token for the first batch of each chunk
+ if (add_bos && j == 0) {
+ tokens[seq_start] = llama_token_bos(llama_get_model(ctx));
+ }
+
+ for (int k = 0; k < batch_size; ++k) {
+ const int idx = seq*n_ctx + k;
+ batch.token[idx] = tokens[seq_start + k];
+ batch.pos[idx] = j*n_batch + k;
+ batch.n_seq_id[idx] = 1;
+ batch.seq_id[idx][0] = seq;
+ batch.logits[idx] = batch.pos[idx] >= first ? 1 : 0;
+ }
+ batch.n_tokens += batch_size;
+
+ // restore the original token in case it was set to BOS
+ tokens[seq_start] = token_org;
}
- if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+ if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
- // restore the original token in case it was set to BOS
- tokens[batch_start] = token_org;
-
if (num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
}
- const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
+ llama_synchronize(ctx);
+ const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
- int total_seconds = (int)(t_total * n_chunk);
+ int total_seconds = (int)(t_total*n_chunk/n_seq);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
@@ -567,37 +604,31 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
- // We get the logits for all the tokens in the context window (params.n_ctx)
- // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
- // calculate the perplexity over the last half of the window (so the model always has
- // some context to predict the token).
- //
- // We rely on the fact that attention in the forward pass only looks at previous
- // tokens here, so the logits returned for each token are an accurate representation
- // of what the model would have predicted at that point.
- //
- // Example, we have a context window of 512, we will compute perplexity for each of the
- // last 256 tokens. Then, we split the input up into context window size chunks to
- // process the entire prompt.
- const int first = n_ctx/2;
- const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
- if (!params.logits_file.empty()) {
- process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
- workers, log_probs, nll, nll2);
- } else {
- process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
- workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
- }
- count += n_ctx - first - 1;
-
- // perplexity is e^(average negative log-likelihood)
- if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
- } else {
- double av = nll/count;
- double av2 = nll2/count - av*av;
- if (av2 > 0) av2 = sqrt(av2/(count-1));
- printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+ for (int seq = 0; seq < n_seq_batch; seq++) {
+ const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
+ llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
+ if (!params.logits_file.empty()) {
+ process_logits(logits_stream, n_vocab, all_logits + first*n_vocab,
+ tokens_data, n_ctx - 1 - first,
+ workers, log_probs, nll, nll2);
+ } else {
+ process_logits(n_vocab, all_logits + first*n_vocab,
+ tokens_data, n_ctx - 1 - first,
+ workers, nll, nll2,
+ logit_history.data() + start + seq*n_ctx + first,
+ prob_history.data() + start + seq*n_ctx + first);
+ }
+ count += n_ctx - first - 1;
+
+ // perplexity is e^(average negative log-likelihood)
+ if (params.ppl_output_type == 0) {
+ printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+ } else {
+ double av = nll/count;
+ double av2 = nll2/count - av*av;
+ if (av2 > 0) av2 = sqrt(av2/(count-1));
+ printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+ }
}
fflush(stdout);
@@ -616,6 +647,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
printf("Unexpected negative standard deviation of log(prob)\n");
}
+ llama_batch_free(batch);
+
return {tokens, ppl, logit_history, prob_history};
}
@@ -810,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
- const int max_seq = 4*max_tasks_per_batch;
+ const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1087,7 +1120,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 128;
- const int max_seq = 2*max_tasks_per_batch;
+ const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1439,7 +1472,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
- const int max_seq = 4*max_tasks_per_batch;
+ const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1783,13 +1816,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
gpt_params params;
- params.n_batch = 512;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
params.logits_all = true;
- params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+ const int32_t n_ctx = params.n_ctx;
+
+ const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
+ if (ppl) {
+ int n_seq = std::max(1, params.n_batch / n_ctx);
+ int32_t n_kv = n_seq * n_ctx;
+ params.n_parallel = n_seq;
+ params.n_ctx = n_kv;
+ params.n_batch = std::min(params.n_batch, n_kv);
+ } else {
+ params.n_batch = std::min(params.n_batch, params.n_ctx);
+ }
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
@@ -1816,6 +1860,9 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
+ // ensure there's at least enough seq_ids for HellaSwag
+ params.n_parallel = std::max(4, params.n_parallel);
+
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
@@ -1845,7 +1892,7 @@ int main(int argc, char ** argv) {
} else if (params.kl_divergence) {
kl_divergence(ctx, params);
} else {
- results = perplexity(ctx, params);
+ results = perplexity(ctx, params, n_ctx);
}
llama_print_timings(ctx);
diff --git a/examples/server-embd.py b/examples/server-embd.py
index c5c4ea87b09fc..118e042716c02 100644
--- a/examples/server-embd.py
+++ b/examples/server-embd.py
@@ -13,7 +13,7 @@ async def main():
model_url = "http://127.0.0.1:6900"
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
url= f"{model_url}/embedding",
- json= {"content": str(i)*1024}
+ json= {"content": str(0)*1024}
) for i in range(n)])
for response in responses:
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index cc13b2d630652..f94de1e99b7e9 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,12 +1,18 @@
set(TARGET server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+if (LLAMA_SERVER_SSL)
+ find_package(OpenSSL REQUIRED)
+ target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+ target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
+endif()
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
diff --git a/examples/server/README.md b/examples/server/README.md
index 21da7a0a04e23..8f8454affaecd 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -42,7 +42,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
-- `--path`: path from which to serve static files (default examples/server/public)
+- `--path`: path from which to serve static files (default: disabled)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
- `--embedding`: Enable embedding extraction, Default: disabled.
@@ -59,6 +59,10 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `--log-disable`: Output logs to stdout only, default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
+**If compiled with `LLAMA_SERVER_SSL=ON`**
+- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
+- `--ssl-cert-file FNAME`: path to a file containing a PEM-encoded SSL certificate
+
## Build
server is build alongside everything else from the root of the project
@@ -75,6 +79,28 @@ server is build alongside everything else from the root of the project
cmake --build . --config Release
```
+## Build with SSL
+
+server can also be built with SSL support using OpenSSL 3
+
+- Using `make`:
+
+ ```bash
+ # NOTE: For non-system openssl, use the following:
+ # CXXFLAGS="-I /path/to/openssl/include"
+ # LDFLAGS="-L /path/to/openssl/lib"
+ make LLAMA_SERVER_SSL=true server
+ ```
+
+- Using `CMake`:
+
+ ```bash
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_SERVER_SSL=ON
+ make server
+ ```
+
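+Once built with SSL support, the server can be pointed at a PEM key/certificate pair via the `--ssl-key-file` and `--ssl-cert-file` options described above. The snippet below is only a sketch: the file names, model path and port are placeholders, and the self-signed certificate is intended for local testing only.
+
+```bash
+# generate a self-signed key/certificate pair for local testing (file names are placeholders)
+openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -days 365 -nodes -subj "/CN=localhost"
+
+# serve the model over HTTPS
+./server -m models/7B/ggml-model.gguf --ssl-key-file key.pem --ssl-cert-file cert.pem --host 0.0.0.0 --port 8443
+```
+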
## Quick Start
To get started right away, run the following command, making sure to use the correct path for the model you have:
@@ -97,10 +123,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can
### Docker
```bash
-docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
# or, with CUDA:
-docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
```
## Testing with CURL
@@ -169,7 +195,11 @@ node index.js
*Options:*
- `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.
+ `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true:
+
+ - The prompt is a string or an array with the first element given as a string
+ - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
+ - The system prompt is empty
`temperature`: Adjust the randomness of the generated text (default: 0.8).
@@ -282,7 +312,7 @@ Notice that each `probs` is an array of length `n_probs`.
`content`: Set the text to tokenize.
- Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+ Note that a special `BOS` token is never inserted.
- **POST** `/detokenize`: Convert tokens to text.
@@ -436,7 +466,7 @@ Notice that each `probs` is an array of length `n_probs`.
"next_token": {
"has_next_token": true,
"n_remain": -1,
- "num_tokens_predicted": 0,
+ "n_decoded": 0,
"stopped_eos": false,
"stopped_limit": false,
"stopped_word": false,
@@ -526,13 +556,55 @@ Run with bash:
bash chat.sh
```
-### API like OAI
+### OAI-like API
+
+The HTTP server supports an OAI-like API: https://github.com/openai/openai-openapi
+
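+For example, a chat completion can be requested with an OAI-style `curl` call. This is only a sketch: it assumes the server is listening on the default `http://localhost:8080`, and the model name is a placeholder (the server answers for the loaded model or its `--alias`).
+
+```bash
+curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "my-model",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]
+    }'
+```
+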
+### API errors
+
+The server returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+
+Example of an error:
+
+```json
+{
+ "error": {
+ "code": 401,
+ "message": "Invalid API Key",
+ "type": "authentication_error"
+ }
+}
+```
+
+Apart from the error types supported by OAI, we also have custom error types specific to llama.cpp functionality:
+
+**When the `/metrics` or `/slots` endpoint is disabled**
-The HTTP server supports OAI-like API
+```json
+{
+ "error": {
+ "code": 501,
+ "message": "This server does not support metrics endpoint.",
+ "type": "not_supported_error"
+ }
+}
+```
+
+**When the server receives an invalid grammar via a `*/completions` endpoint**
+
+```json
+{
+ "error": {
+ "code": 400,
+ "message": "Failed to parse grammar",
+ "type": "invalid_request_error"
+ }
+}
+```
### Extending or building alternative Web Front End
-The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
+You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
Read the documentation in `/completion.js` to see convenient ways to access llama.
diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
new file mode 100644
index 0000000000000..a53ad64d7359b
--- /dev/null
+++ b/examples/server/bench/README.md
@@ -0,0 +1,88 @@
+### Server benchmark tools
+
+The benchmark uses [k6](https://k6.io/).
+
+#### Install k6
+
+Follow the instructions at: https://k6.io/docs/get-started/installation/
+
+Example for Ubuntu:
+```shell
+snap install k6
+```
+
+#### Download a dataset
+
+This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
+
+```shell
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+#### Download a model
+Example for PHI-2
+
+```shell
+../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
+```
+
+#### Start the server
+The server must answer OAI chat completion requests on `http://localhost:8080/v1`, or on the URL given by the `SERVER_BENCH_URL` environment variable.
+
+Example:
+```shell
+server --host localhost --port 8080 \
+ --model ggml-model-q4_0.gguf \
+ --cont-batching \
+ --metrics \
+ --parallel 8 \
+ --batch-size 512 \
+ --ctx-size 4096 \
+ --log-format text \
+ -ngl 33
+```
+
+#### Run the benchmark
+
+For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:
+```shell
+k6 run script.js --duration 10m --iterations 500 --vus 8
+```
+
+The benchmark values can be overridden with:
+- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
+- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
+- `SERVER_BENCH_DATASET` path to the benchmark dataset file
+- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens; dataset entries with longer prompts are filtered out, default `1024`
+- `SERVER_BENCH_MAX_CONTEXT` maximum context size of a completion request (prompt + predicted tokens); longer dataset entries are filtered out, default `2048`
+
+Note: the local tokenizer is just a whitespace split, so the real number of tokens will differ.
+
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```
+
+To [debug HTTP requests](https://k6.io/docs/using-k6/http-debugging/), use `--http-debug="full"`.
+
+#### Metrics
+
+The following metrics are computed from the `usage` field of the OAI chat completions response:
+- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration`
+- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens`
+- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens`
+- `llamacpp_completion_tokens` Trend of `usage.completion_tokens`
+- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens`
+- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
+- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'`
+
+The script will fail if too many completions are truncated; see `llamacpp_completions_truncated_rate`.
+
+K6 metrics can be compared against [server metrics](../README.md) with:
+
+```shell
+curl http://localhost:8080/metrics
+```
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
new file mode 100644
index 0000000000000..a4f5ac5ab22ad
--- /dev/null
+++ b/examples/server/bench/script.js
@@ -0,0 +1,120 @@
+import http from 'k6/http'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Rate, Trend} from 'k6/metrics'
+import exec from 'k6/execution';
+
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
+// Max prompt tokens
+const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024
+
+// Max slot context
+const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
+
+export function setup() {
+ console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
+}
+
+const data = new SharedArray('conversations', function () {
+ const tokenizer = (message) => message.split(/[\s,'".?]/)
+
+ return JSON.parse(open(dataset_path))
+ // Filter out the conversations with less than 2 turns.
+ .filter(data => data["conversations"].length >= 2)
+ .filter(data => data["conversations"][0]["from"] === "human")
+ .map(data => {
+ return {
+ prompt: data["conversations"][0]["value"],
+ n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length,
+ n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length,
+ }
+ })
+ // Filter out too short sequences
+ .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
+ // Filter out too long sequences.
+ .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
+ // Keep only first n prompts
+ .slice(0, n_prompt)
+})
+
+const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
+
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
+
+export const options = {
+ thresholds: {
+ llamacpp_completions_truncated_rate: [
+ // abort the test if more than 80% of the completions are truncated
+ {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
+ ],
+ },
+ duration: '10m',
+ vus: 8,
+}
+
+export default function () {
+ const conversation = data[exec.scenario.iterationInInstance % data.length]
+ const payload = {
+ "messages": [
+ {
+ "role": "system",
+ "content": "You are ChatGPT, an AI assistant.",
+ },
+ {
+ "role": "user",
+ "content": conversation.prompt,
+ }
+ ],
+ "model": model,
+ "stream": false,
+ "max_tokens": max_tokens
+ }
+
+ const body = JSON.stringify(payload)
+
+ let res = http.post(`${server_url}/chat/completions`, body, {
+ headers: {'Content-Type': 'application/json'},
+ timeout: '300s'
+ })
+
+ check(res, {'success completion': (r) => r.status === 200})
+
+ if (res.status === 200) {
+ const completions = res.json()
+
+ llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
+ llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+
+ llamacpp_completion_tokens.add(completions.usage.completion_tokens)
+ llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+
+ llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
+ llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+
+ llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
+ } else {
+ console.error(`response: ${res.body} request=${payload}`)
+ }
+
+ sleep(0.3)
+}
diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp
index f5e696e17edfe..10734091eeaf6 100644
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -231,255 +231,256 @@ unsigned char completion_js[] = {
0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
- 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d,
- 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
- 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
- 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
- 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
- 0x65, 0x6e, 0x74, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
- 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
- 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72,
+ 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
+ 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f,
+ 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75,
+ 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
+ 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
+ 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73, 0x28, 0x27,
+ 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c,
+ 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61,
0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62,
0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20,
0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c,
0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77,
- 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x27,
- 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c,
- 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c,
- 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
- 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
- 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
- 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
- 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
- 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72,
+ 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72,
+ 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
+ 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72,
+ 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
+ 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24, 0x7b, 0x72,
+ 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e,
+ 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b, 0x72, 0x65,
+ 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x74,
+ 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73,
+ 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65,
+ 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29,
+ 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+ 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d,
+ 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20,
+ 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72,
+ 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65,
- 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
- 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72,
- 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61,
- 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
- 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
- 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
- 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
- 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
- 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
- 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
- 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65,
- 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45,
- 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
- 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20,
- 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29,
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
- 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
- 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b,
- 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
- 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
- 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c,
- 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
- 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61,
- 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f,
- 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72,
- 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
- 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
- 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
- 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
- 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
- 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
- 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
- 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
- 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
- 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
- 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
- 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
- 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
- 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
- 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
- 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
- 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
- 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
- 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
- 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
- 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
- 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
- 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
- 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
- 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
- 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
- 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
- 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
- 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
- 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
- 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
- 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
- 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
- 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
- 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
- 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
- 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
- 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
- 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
- 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
- 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
+ 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20,
+ 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61,
+ 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72,
+ 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+ 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d,
+ 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20,
+ 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20,
+ 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79,
+ 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+ 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28,
+ 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65,
+ 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+ 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+ 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+ 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20,
+ 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20,
+ 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73,
+ 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a,
+ 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
+ 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70,
+ 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45,
+ 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d,
+ 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70,
+ 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f,
+ 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+ 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61,
+ 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
+ 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f,
+ 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64,
+ 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65,
+ 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c,
+ 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+ 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f,
+ 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65,
+ 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69,
+ 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
+ 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65,
+ 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+ 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
+ 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d,
+ 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
+ 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20,
+ 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+ 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
+ 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77,
+ 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+ 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
+ 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+ 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+ 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66,
+ 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f,
+ 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66,
+ 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
+ 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63,
+ 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
+ 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+ 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
+ 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
+ 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65,
+ 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73,
+ 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e,
+ 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65,
+ 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22,
+ 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20,
+ 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d,
+ 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68,
+ 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e,
+ 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
+ 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+ 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68,
+ 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75,
+ 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67,
+ 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
+ 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64,
+ 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+ 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+ 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+ 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+ 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
+ 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
- 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
+ 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20,
0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
- 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
- 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
- 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
- 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
- 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
- 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
- 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
- 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
- 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
- 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
- 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
- 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
- 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
- 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
- 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
- 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
- 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
- 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
- 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
- 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
- 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
- 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
- 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
- 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
- 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
- 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
- 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
- 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
- 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
- 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
- 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
- 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
- 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
- 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
- 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
- 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
- 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
- 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
- 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
- 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
- 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
- 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
- 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
- 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
- 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
- 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
- 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
- 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
- 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
- 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
- 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
- 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
- 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
- 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
- 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
- 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
- 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
- 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
- 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
- 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
- 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
- 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
- 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
- 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
- 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
- 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+ 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d,
+ 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+ 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68,
+ 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75,
+ 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64,
+ 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61,
+ 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+ 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
+ 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
+ 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
+ 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c,
+ 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74,
+ 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73,
+ 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c,
+ 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63,
+ 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78,
+ 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73,
+ 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74,
+ 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f,
+ 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65,
+ 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65,
+ 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65,
+ 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20,
+ 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
+ 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+ 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a,
+ 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72,
+ 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
+ 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+ 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61,
+ 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72,
+ 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
+ 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72,
+ 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+ 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63,
+ 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72,
+ 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f,
+ 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20,
+ 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
+ 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
+ 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77,
+ 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79,
+ 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c,
+ 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+ 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f,
+ 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69,
+ 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75,
+ 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28,
+ 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61,
+ 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29,
+ 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63,
+ 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68,
+ 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e,
+ 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+ 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f,
+ 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+ 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63,
+ 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74,
+ 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+ 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a,
+ 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70,
+ 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f,
+ 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+ 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c,
+ 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
+ 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
+ 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c,
+ 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
- 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
- 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
- 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
- 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
- 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
- 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
- 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
- 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
- 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
- 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
- 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
- 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
- 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
- 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
- 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
- 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
- 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
- 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
- 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
- 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
- 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
- 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
- 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
- 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
- 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
- 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
- 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
- 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
- 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
- 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
- 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
- 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
- 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
- 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
- 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
- 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
- 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
- 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
- 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
- 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
- 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20,
- 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63,
- 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, 0x29, 0x2e,
- 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e,
- 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
- 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70,
- 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74,
- 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
- 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20,
- 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
+ 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61,
+ 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
+ 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63,
+ 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29,
+ 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c,
+ 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b,
+ 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47,
+ 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c,
+ 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74,
+ 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54,
+ 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75,
+ 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e,
+ 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78,
+ 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64,
+ 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f,
+ 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61,
+ 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20,
+ 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d,
+ 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67,
0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
- 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
+ 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+ 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70,
+ 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65,
+ 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22,
+ 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20,
+ 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+ 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
+ 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d,
+ 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75,
+ 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
+ 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a,
+ 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
+ 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+ 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
};
-unsigned int completion_js_len = 5782;
+unsigned int completion_js_len = 5796;
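The hunk above only touches the generated completion.js.hpp header: an xxd-style byte dump of the public completion.js asset, which is why the diff is pure hex plus the `completion_js_len` bump from 5782 to 5796 bytes. For orientation, here is a minimal sketch of how such an embedded buffer is typically served; the symbol name `completion_js`, the route, and the MIME type are assumptions for illustration, not something this diff shows.

```cpp
// Hypothetical sketch: exposing the embedded completion.js bytes over HTTP with cpp-httplib.
// completion_js / completion_js_len are assumed to be the symbols defined by the generated header.
#include "httplib.h"
#include "completion.js.hpp"

static void serve_completion_js(httplib::Server & svr) {
    svr.Get("/completion.js", [](const httplib::Request &, httplib::Response & res) {
        // set_content copies the buffer; passing the length avoids relying on a NUL terminator
        res.set_content(reinterpret_cast<const char *>(completion_js), completion_js_len,
                        "application/javascript; charset=utf-8");
    });
}
```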
diff --git a/examples/server/oai.hpp b/examples/server/oai.hpp
deleted file mode 100644
index ff4ad69943552..0000000000000
--- a/examples/server/oai.hpp
+++ /dev/null
@@ -1,225 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "json.hpp"
-#include "utils.hpp"
-
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
-
-using json = nlohmann::json;
-
-inline static json oaicompat_completion_params_parse(
- const struct llama_model * model,
- const json &body, /* openai api json semantics */
- const std::string &chat_template)
-{
- json llama_params;
-
- llama_params["__oaicompat"] = true;
-
- // Map OpenAI parameters to llama.cpp parameters
- //
- // For parameters that are defined by the OpenAI documentation (e.g.
- // temperature), we explicitly specify OpenAI's intended default; we
- // need to do that because sometimes OpenAI disagrees with llama.cpp
- //
- // https://platform.openai.com/docs/api-reference/chat/create
- llama_sampling_params default_sparams;
- llama_params["model"] = json_value(body, "model", std::string("unknown"));
- llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
- llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
- llama_params["temperature"] = json_value(body, "temperature", 0.0);
- llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
- llama_params["top_p"] = json_value(body, "top_p", 1.0);
- llama_params["n_predict"] = json_value(body, "max_tokens", -1);
- llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
- llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
- llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
- llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
- llama_params["stream"] = json_value(body, "stream", false);
- llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
- llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
- llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
- llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
- llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
- llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
- llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
- llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
-
- if (body.count("grammar") != 0) {
- llama_params["grammar"] = json_value(body, "grammar", json::object());
- }
-
- // Handle 'stop' field
- if (body.contains("stop") && body["stop"].is_string()) {
- llama_params["stop"] = json::array({body["stop"].get()});
- } else {
- llama_params["stop"] = json_value(body, "stop", json::array());
- }
-
- // Ensure there is ChatML-specific end sequence among stop words
- llama_params["stop"].push_back("<|im_end|>");
-
- return llama_params;
-}
-
-inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
-{
- json result = response.result_json;
-
- bool stopped_word = result.count("stopped_word") != 0;
- bool stopped_eos = json_value(result, "stopped_eos", false);
- int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
- int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
- std::string content = json_value(result, "content", std::string(""));
-
- std::string finish_reason = "length";
- if (stopped_word || stopped_eos) {
- finish_reason = "stop";
- }
-
- json choices =
- streaming ? json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"delta", json::object()}}})
- : json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"message", json{{"content", content},
- {"role", "assistant"}}}}});
-
- std::time_t t = std::time(0);
-
- json res =
- json{{"choices", choices},
- {"created", t},
- {"model",
- json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
- {"usage",
- json{{"completion_tokens", num_tokens_predicted},
- {"prompt_tokens", num_prompt_tokens},
- {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
- {"id", gen_chatcmplid()}};
-
- if (server_verbose) {
- res["__verbose"] = result;
- }
-
- if (result.contains("completion_probabilities")) {
- res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
- }
-
- return res;
-}
-
-// the return value is a vector because there is one case where we might need to generate two responses
-inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
- json result = response.result_json;
-
- if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
- return std::vector<json>({response.result_json});
- }
-
- bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
- std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
-
- bool stopped_word = json_value(result, "stopped_word", false);
- bool stopped_eos = json_value(result, "stopped_eos", false);
- bool stopped_limit = json_value(result, "stopped_limit", false);
- std::string content = json_value(result, "content", std::string(""));
-
- std::string finish_reason;
- if (stopped_word || stopped_eos) {
- finish_reason = "stop";
- }
- if (stopped_limit) {
- finish_reason = "length";
- }
-
- std::time_t t = std::time(0);
-
- json choices;
-
- if (!finish_reason.empty()) {
- choices = json::array({json{{"finish_reason", finish_reason},
- {"index", 0},
- {"delta", json::object()}}});
- } else {
- if (first) {
- if (content.empty()) {
- choices = json::array({json{{"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{{"role", "assistant"}}}}});
- } else {
- // We have to send this as two updates to conform to openai behavior
- json initial_ret = json{{"choices", json::array({json{
- {"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{
- {"role", "assistant"}
- }}}})},
- {"created", t},
- {"id", gen_chatcmplid()},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- json second_ret = json{
- {"choices", json::array({json{{"finish_reason", nullptr},
- {"index", 0},
- {"delta", json{
- {"content", content}}}
- }})},
- {"created", t},
- {"id", gen_chatcmplid()},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- return std::vector<json>({initial_ret, second_ret});
- }
- } else {
- // Some idiosyncrasy in the task processing logic makes several trailing calls
- // with empty content; we ignore these at the callee site.
- if (content.empty()) {
- return std::vector<json>({json::object()});
- }
-
- choices = json::array({json{
- {"finish_reason", nullptr},
- {"index", 0},
- {"delta",
- json{
- {"content", content},
- }},
- }});
- }
- }
-
- json ret = json{{"choices", choices},
- {"created", t},
- {"id", gen_chatcmplid()},
- {"model", modelname},
- {"object", "chat.completion.chunk"}};
-
- return std::vector<json>({ret});
-}
-
-inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
-{
- json res =
- json{
- {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
- {"object", "list"},
- {"usage",
- json{{"prompt_tokens", 0},
- {"total_tokens", 0}}},
- {"data", embeddings}
- };
- return res;
-}
-
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index ab38a7b409df1..835ce6e68422a 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -96,18 +96,18 @@ export async function* llama(prompt, params = {}, config = {}) {
}
}
if (result.error) {
- result.error = JSON.parse(result.error);
- if (result.error.content.includes('slot unavailable')) {
- // Throw an error to be caught by upstream callers
- throw new Error('slot unavailable');
- } else {
- console.error(`llama.cpp error: ${result.error.content}`);
+ try {
+ result.error = JSON.parse(result.error);
+ if (result.error.message.includes('slot unavailable')) {
+ // Throw an error to be caught by upstream callers
+ throw new Error('slot unavailable');
+ } else {
+ console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
+ }
+ } catch(e) {
+ console.error(`llama.cpp error ${result.error}`)
}
}
- if (result.error) {
- result.error = JSON.parse(result.error);
- console.error(`llama.cpp error: ${result.error.content}`);
- }
}
}
}
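The rewritten handler above wraps the error path in a try/catch so a non-JSON error body no longer throws, and when parsing succeeds it reads `code`, `type`, and `message` from the error object instead of the old `content` field. Below is a hedged sketch of the payload shape this client code implies the server now sends; the helper name and the example values are illustrative only.

```cpp
// Sketch only: an error object with the fields the updated completion.js looks for.
// Uses nlohmann::json, as the server sources in this diff do; the function name is hypothetical.
#include <string>
#include "json.hpp"

using json = nlohmann::json;

static json make_error_payload(int code, const std::string & type, const std::string & message) {
    return json {
        {"code",    code},     // e.g. 503
        {"type",    type},     // e.g. "unavailable_error" (assumed value)
        {"message", message},  // e.g. "slot unavailable", which the client checks for
    };
}
```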
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2ee1e2a698847..69c6d97d4c2a3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,14 +1,9 @@
+#include "utils.hpp"
+
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"
-#include "utils.hpp"
-#include "oai.hpp"
-
-#include "../llava/clip.h"
-#include "../llava/llava.h"
-
-#include "stb_image.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
@@ -25,46 +20,77 @@
#include "completion.js.hpp"
#include "json-schema-to-grammar.mjs.hpp"
-#include
-#include
+#include
#include
#include
-#include
+#include
+#include
+#include
+#include
#include
+#include
using json = nlohmann::json;
-struct server_params {
- std::string hostname = "127.0.0.1";
- std::vector<std::string> api_keys;
- std::string public_path = "examples/server/public";
- std::string chat_template = "";
- int32_t port = 8080;
- int32_t read_timeout = 600;
- int32_t write_timeout = 600;
- bool slots_endpoint = true;
- bool metrics_endpoint = false;
- int n_threads_http = -1;
-};
-
bool server_verbose = false;
bool server_log_json = true;
enum stop_type {
- STOP_FULL,
- STOP_PARTIAL,
+ STOP_TYPE_FULL,
+ STOP_TYPE_PARTIAL,
};
-// TODO: can become bool if we can't find use of more states
enum slot_state {
- IDLE,
- PROCESSING,
+ SLOT_STATE_IDLE,
+ SLOT_STATE_PROCESSING,
};
enum slot_command {
- NONE,
- LOAD_PROMPT,
- RELEASE,
+ SLOT_COMMAND_NONE,
+ SLOT_COMMAND_LOAD_PROMPT,
+ SLOT_COMMAND_RELEASE,
+};
+
+enum server_state {
+ SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+ SERVER_STATE_READY, // Server is ready and model is loaded
+ SERVER_STATE_ERROR // An error occurred, load_model failed
+};
+
+enum server_task_type {
+ SERVER_TASK_TYPE_COMPLETION,
+ SERVER_TASK_TYPE_CANCEL,
+ SERVER_TASK_TYPE_NEXT_RESPONSE,
+ SERVER_TASK_TYPE_METRICS
+};
+
+struct server_task {
+ int id = -1; // to be filled by server_queue
+ int id_multi = -1;
+ int id_target = -1;
+
+ server_task_type type;
+ json data;
+
+ bool infill = false;
+ bool embedding = false;
+};
+
+struct server_task_result {
+ int id = -1;
+ int id_multi = -1;
+
+ json data;
+
+ bool stop;
+ bool error;
+};
+
+struct server_task_multi {
+ int id = -1;
+
+ std::set<int> subtasks_remaining;
+ std::vector<server_task_result> results;
};
struct slot_params {
@@ -81,26 +107,37 @@ struct slot_params {
json input_suffix;
};
-struct slot_image {
- int32_t id;
+struct server_params {
+ int32_t port = 8080;
+ int32_t read_timeout = 600;
+ int32_t write_timeout = 600;
+ int32_t n_threads_http = -1;
+
+ std::string hostname = "127.0.0.1";
+ std::string public_path = "";
+ std::string chat_template = "";
+ std::string system_prompt = "";
- bool request_encode_image = false;
- float * image_embedding = nullptr;
- int32_t image_tokens = 0;
+ std::vector<std::string> api_keys;
- clip_image_u8 * img_data;
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+ std::string ssl_key_file = "";
+ std::string ssl_cert_file = "";
+#endif
- std::string prefix_prompt; // prompt text that precedes this image
+ bool slots_endpoint = true;
+ bool metrics_endpoint = false;
};
struct server_slot {
int id;
- int task_id = -1;
+ int id_task = -1;
+ int id_multi = -1;
struct slot_params params;
- slot_state state = IDLE;
- slot_command command = NONE;
+ slot_state state = SLOT_STATE_IDLE;
+ slot_command command = SLOT_COMMAND_NONE;
// used to determine the slot that has been used the longest
int64_t t_last_used = -1;
@@ -111,33 +148,37 @@ struct server_slot {
int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
- int32_t n_predict = -1;
+ int32_t n_predict = -1; // TODO: disambiguate from params.n_predict
int32_t n_prompt_tokens = 0;
int32_t n_prompt_tokens_processed = 0;
json prompt;
+
+ // when a task is submitted, we first tokenize the prompt and store it here
+ std::vector<llama_token> prompt_tokens;
+
std::string generated_text;
- llama_token sampled;
std::vector<llama_token> cache_tokens;
std::vector<completion_token_output> generated_token_probs;
- bool infill = false;
- bool embedding = false;
+ bool infill = false;
+ bool embedding = false;
bool has_next_token = true;
- bool truncated = false;
- bool stopped_eos = false;
- bool stopped_word = false;
- bool stopped_limit = false;
+ bool truncated = false;
+ bool stopped_eos = false;
+ bool stopped_word = false;
+ bool stopped_limit = false;
bool oaicompat = false;
- std::string oaicompat_model;
+ std::string oaicompat_model;
std::string stopping_word;
// sampling
+ llama_token sampled;
struct llama_sampling_params sparams;
- llama_sampling_context *ctx_sampling = nullptr;
+ llama_sampling_context * ctx_sampling = nullptr;
int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
@@ -145,48 +186,32 @@ struct server_slot {
int32_t n_past_se = 0; // self-extend
- // multimodal
- std::vector<slot_image> images;
-
// stats
size_t n_sent_text = 0; // number of sent text character
size_t n_sent_token_probs = 0;
int64_t t_start_process_prompt;
- int64_t t_start_genereration;
+ int64_t t_start_generation;
double t_prompt_processing; // ms
double t_token_generation; // ms
- // multitasks
- int multitask_id = -1;
-
void reset() {
- n_prompt_tokens = 0;
- generated_text = "";
- truncated = false;
- stopped_eos = false;
- stopped_word = false;
- stopped_limit = false;
- stopping_word = "";
- n_past = 0;
- n_sent_text = 0;
- n_sent_token_probs = 0;
- infill = false;
- ga_i = 0;
- n_past_se = 0;
+ n_prompt_tokens = 0;
+ generated_text = "";
+ truncated = false;
+ stopped_eos = false;
+ stopped_word = false;
+ stopped_limit = false;
+ stopping_word = "";
+ n_past = 0;
+ n_sent_text = 0;
+ n_sent_token_probs = 0;
+ infill = false;
+ ga_i = 0;
+ n_past_se = 0;
generated_token_probs.clear();
-
- for (slot_image & img : images) {
- free(img.image_embedding);
- if (img.img_data) {
- clip_image_u8_free(img.img_data);
- }
- img.prefix_prompt = "";
- }
-
- images.clear();
}
bool has_budget(gpt_params &global_params) {
@@ -206,32 +231,29 @@ struct server_slot {
}
bool available() const {
- return state == IDLE && command == NONE;
+ return state == SLOT_STATE_IDLE && command == SLOT_COMMAND_NONE;
}
bool is_processing() const {
- return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING;
+ return (state == SLOT_STATE_IDLE && command == SLOT_COMMAND_LOAD_PROMPT) || state == SLOT_STATE_PROCESSING;
}
- void add_token_string(const completion_token_output &token) {
- if (command == RELEASE) {
+ void add_token_string(const completion_token_output & token) {
+ if (command == SLOT_COMMAND_RELEASE) {
return;
}
- cache_tokens.push_back(token.tok);
generated_token_probs.push_back(token);
}
void release() {
- if (state == PROCESSING)
- {
- t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
- command = RELEASE;
+ if (state == SLOT_STATE_PROCESSING) {
+ t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
+ command = SLOT_COMMAND_RELEASE;
}
}
- json get_formated_timings() {
- return json
- {
+ json get_formated_timings() const {
+ return json {
{"prompt_n", n_prompt_tokens_processed},
{"prompt_ms", t_prompt_processing},
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
@@ -244,16 +266,47 @@ struct server_slot {
};
}
+ size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) {
+ size_t stop_pos = std::string::npos;
+
+ for (const std::string & word : params.antiprompt) {
+ size_t pos;
+
+ if (type == STOP_TYPE_FULL) {
+ const size_t tmp = word.size() + last_token_size;
+ const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
+
+ pos = text.find(word, from_pos);
+ } else {
+ pos = find_partial_stop_string(word, text);
+ }
+
+ if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
+ if (type == STOP_TYPE_FULL) {
+ stopped_word = true;
+ stopping_word = word;
+ has_next_token = false;
+ }
+ stop_pos = pos;
+ }
+ }
+
+ return stop_pos;
+ }
+
void print_timings() const {
- char buffer[512];
+ char buffer[512];
+
double t_token = t_prompt_processing / n_prompt_tokens_processed;
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
- sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+
+ snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, n_prompt_tokens_processed,
t_token, n_tokens_second);
+
LOG_INFO(buffer, {
- {"slot_id", id},
- {"task_id", task_id},
+ {"id_slot", id},
+ {"id_task", id_task},
{"t_prompt_processing", t_prompt_processing},
{"n_prompt_tokens_processed", n_prompt_tokens_processed},
{"t_token", t_token},
@@ -262,22 +315,25 @@ struct server_slot {
t_token = t_token_generation / n_decoded;
n_tokens_second = 1e3 / t_token_generation * n_decoded;
- sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+
+ snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded,
t_token, n_tokens_second);
+
LOG_INFO(buffer, {
- {"slot_id", id},
- {"task_id", task_id},
+ {"id_slot", id},
+ {"id_task", id_task},
{"t_token_generation", t_token_generation},
{"n_decoded", n_decoded},
{"t_token", t_token},
{"n_tokens_second", n_tokens_second},
});
- sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+ snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
+
LOG_INFO(buffer, {
- {"slot_id", id},
- {"task_id", task_id},
+ {"id_slot", id},
+ {"id_task", id_task},
{"t_prompt_processing", t_prompt_processing},
{"t_token_generation", t_token_generation},
{"t_total", t_prompt_processing + t_token_generation},
@@ -286,26 +342,35 @@ struct server_slot {
};
struct server_metrics {
+ int64_t t_start = 0;
+
uint64_t n_prompt_tokens_processed_total = 0;
+ uint64_t t_prompt_processing_total = 0;
uint64_t n_tokens_predicted_total = 0;
+ uint64_t t_tokens_generation_total = 0;
uint64_t n_prompt_tokens_processed = 0;
uint64_t t_prompt_processing = 0;
- uint64_t n_tokens_predicted = 0;
- uint64_t t_tokens_generation = 0;
+ uint64_t n_tokens_predicted = 0;
+ uint64_t t_tokens_generation = 0;
+ void init() {
+ t_start = ggml_time_us();
+ }
- void on_prompt_eval(const server_slot &slot) {
+ void on_prompt_eval(const server_slot & slot) {
n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
t_prompt_processing += slot.t_prompt_processing;
+ t_prompt_processing_total += slot.t_prompt_processing;
}
- void on_prediction(const server_slot &slot) {
- n_tokens_predicted_total += slot.n_decoded;
- n_tokens_predicted += slot.n_decoded;
- t_tokens_generation += slot.t_token_generation;
+ void on_prediction(const server_slot & slot) {
+ n_tokens_predicted_total += slot.n_decoded;
+ n_tokens_predicted += slot.n_decoded;
+ t_tokens_generation += slot.t_token_generation;
+ t_tokens_generation_total += slot.t_token_generation;
}
void reset_bucket() {
@@ -316,23 +381,261 @@ struct server_metrics {
}
};
-struct llama_server_context
-{
- llama_model *model = nullptr;
- llama_context *ctx = nullptr;
+struct server_queue {
+ int id = 0;
+ bool running;
+
+ // queues
+ std::vector<server_task> queue_tasks;
+ std::vector<server_task> queue_tasks_deferred;
+
+ std::vector<server_task_multi> queue_multitasks;
+
+ std::mutex mutex_tasks;
+ std::condition_variable condition_tasks;
+
+ // callback functions
+ std::function<void(server_task &)> callback_new_task;
+ std::function<void(server_task_multi &)> callback_finish_multitask;
+ std::function<void(void)> callback_update_slots;
+
+ // Add a new task to the end of the queue
+ int post(server_task task) {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ if (task.id == -1) {
+ task.id = id++;
+ LOG_VERBOSE("new task id", {{"new_id", task.id}});
+ }
+ queue_tasks.push_back(std::move(task));
+ condition_tasks.notify_one();
+ return task.id;
+ }
+
+ // Add a new task, but defer until one slot is available
+ void defer(server_task task) {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ queue_tasks_deferred.push_back(std::move(task));
+ }
+
+ // Get the next id for creating a new task
+ int get_new_id() {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ int new_id = id++;
+ LOG_VERBOSE("new task id", {{"new_id", new_id}});
+ return new_id;
+ }
+
+ // Register function to process a new task
+ void on_new_task(std::function<void(server_task &)> callback) {
+ callback_new_task = std::move(callback);
+ }
+
+ // Register function to process a multitask when it is finished
+ void on_finish_multitask(std::function<void(server_task_multi &)> callback) {
+ callback_finish_multitask = std::move(callback);
+ }
+
+ // Register the function to be called when all slots data is ready to be processed
+ void on_update_slots(std::function<void(void)> callback) {
+ callback_update_slots = std::move(callback);
+ }
+
+ // Call when the state of one slot is changed
+ void notify_slot_changed() {
+ // move deferred tasks back to main loop
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ for (auto & task : queue_tasks_deferred) {
+ queue_tasks.push_back(std::move(task));
+ }
+ queue_tasks_deferred.clear();
+ }
+
+ // end the start_loop routine
+ void terminate() {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ running = false;
+ condition_tasks.notify_all();
+ }
+
+ /**
+ * Main loop consists of these steps:
+ * - Wait until a new task arrives
+ * - Process the task (i.e. maybe copy data into slot)
+ * - Check if multitask is finished
+ * - Update all slots
+ */
+ void start_loop() {
+ running = true;
+
+ while (true) {
+ LOG_VERBOSE("new task may arrive", {});
+
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ if (queue_tasks.empty()) {
+ lock.unlock();
+ break;
+ }
+ server_task task = queue_tasks.front();
+ queue_tasks.erase(queue_tasks.begin());
+ lock.unlock();
+ LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
+ callback_new_task(task);
+ }
+
+ LOG_VERBOSE("update_multitasks", {});
+
+ // check if we have any finished multitasks
+ auto queue_iterator = queue_multitasks.begin();
+ while (queue_iterator != queue_multitasks.end()) {
+ if (queue_iterator->subtasks_remaining.empty()) {
+ // all subtasks done == multitask is done
+ server_task_multi current_multitask = *queue_iterator;
+ callback_finish_multitask(current_multitask);
+ // remove this multitask
+ queue_iterator = queue_multitasks.erase(queue_iterator);
+ } else {
+ ++queue_iterator;
+ }
+ }
+
+ // all tasks in the current loop have been processed; the slots data is now ready
+ LOG_VERBOSE("callback_update_slots", {});
+
+ callback_update_slots();
+
+ LOG_VERBOSE("wait for new task", {});
+ {
+ std::unique_lock<std::mutex> lock(mutex_tasks);
+ if (queue_tasks.empty()) {
+ if (!running) {
+ LOG_VERBOSE("ending start_loop", {});
+ return;
+ }
+ condition_tasks.wait(lock, [&]{
+ return (!queue_tasks.empty() || !running);
+ });
+ }
+ }
+ }
+ }
+
+ //
+ // functions to manage multitasks
+ //
+
+ // add a multitask by specifying the ids of all its subtasks (each subtask is a server_task)
+ void add_multitask(int id_multi, std::vector<int> & sub_ids) {
+ std::lock_guard<std::mutex> lock(mutex_tasks);
+ server_task_multi multi;
+ multi.id = id_multi;
+ std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+ queue_multitasks.push_back(multi);
+ }
+
+ // update the remaining subtasks, appending the result to the multitask
+ void update_multitask(int id_multi, int id_sub, server_task_result & result) {
+ std::lock_guard<std::mutex> lock(mutex_tasks);
+ for (auto & multitask : queue_multitasks) {
+ if (multitask.id == id_multi) {
+ multitask.subtasks_remaining.erase(id_sub);
+ multitask.results.push_back(result);
+ }
+ }
+ }
+};
+
+struct server_response {
+ typedef std::function<void(int, int, server_task_result &)> callback_multitask_t;
+ callback_multitask_t callback_update_multitask;
- clip_ctx *clp_ctx = nullptr;
+ // for keeping track of all tasks waiting for the result
+ std::set<int> waiting_task_ids;
+
+ // the main result queue
+ std::vector<server_task_result> queue_results;
+
+ std::mutex mutex_results;
+ std::condition_variable condition_results;
+
+ // add the id_task to the list of tasks waiting for response
+ void add_waiting_task_id(int id_task) {
+ LOG_VERBOSE("waiting for task id", {{"id_task", id_task}});
+
+ std::unique_lock<std::mutex> lock(mutex_results);
+ waiting_task_ids.insert(id_task);
+ }
+
+ // when the request is finished, we can remove the task associated with it
+ void remove_waiting_task_id(int id_task) {
+ LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}});
+
+ std::unique_lock<std::mutex> lock(mutex_results);
+ waiting_task_ids.erase(id_task);
+ }
+
+ // This function blocks the thread until there is a response for this id_task
+ server_task_result recv(int id_task) {
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex_results);
+ condition_results.wait(lock, [&]{
+ return !queue_results.empty();
+ });
+
+ for (int i = 0; i < (int) queue_results.size(); i++) {
+ if (queue_results[i].id == id_task) {
+ assert(queue_results[i].id_multi == -1);
+ server_task_result res = queue_results[i];
+ queue_results.erase(queue_results.begin() + i);
+ return res;
+ }
+ }
+ }
+
+ // should never reach here
+ }
+
+ // Register the function to update multitask
+ void on_multitask_update(callback_multitask_t callback) {
+ callback_update_multitask = std::move(callback);
+ }
+
+ // Send a new result to a waiting id_task
+ void send(server_task_result result) {
+ LOG_VERBOSE("send new result", {{"id_task", result.id}});
+
+ std::unique_lock<std::mutex> lock(mutex_results);
+ for (const auto & id_task : waiting_task_ids) {
+ // LOG_TEE("waiting task id %i \n", id_task);
+ // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+ if (result.id_multi == id_task) {
+ LOG_VERBOSE("callback_update_multitask", {{"id_task", id_task}});
+ callback_update_multitask(id_task, result.id, result);
+ continue;
+ }
+
+ if (result.id == id_task) {
+ LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}});
+ queue_results.push_back(result);
+ condition_results.notify_all();
+ return;
+ }
+ }
+ }
+};
+
+struct server_context {
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
gpt_params params;
llama_batch batch;
- bool multimodal = false;
- bool clean_kv_cache = true;
- bool all_slots_are_idle = false;
- bool add_bos_token = true;
+ bool clean_kv_cache = true;
+ bool add_bos_token = true;
- int32_t n_ctx; // total context for all clients / slots
+ int32_t n_ctx; // total context for all clients / slots
// system prompt
bool system_need_update = false;
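The `server_queue` and `server_response` structs introduced above replace the old `llama_server_queue`/`llama_server_response` plumbing: an HTTP handler reserves a task id, registers itself as waiting on that id, posts a `server_task`, and then blocks in `recv()` until the processing loop pushes back a `server_task_result`. A minimal sketch of that round trip is shown below, using only the member functions declared above; `ctx_server`, the helper name, and the surrounding handler context are assumptions.

```cpp
// Sketch of the request/result round trip through the new queue structures.
// ctx_server is an assumed server_context instance that owns a server_queue (queue_tasks)
// and a server_response (queue_results); request_data is the parsed request body.
static json run_completion_sketch(server_context & ctx_server, const json & request_data) {
    const int id_task = ctx_server.queue_tasks.get_new_id();  // reserve an id up front
    ctx_server.queue_results.add_waiting_task_id(id_task);    // register before posting so the result is not dropped

    server_task task;
    task.id   = id_task;
    task.type = SERVER_TASK_TYPE_COMPLETION;
    task.data = request_data;

    ctx_server.queue_tasks.post(task);                         // hand the task to the processing loop

    server_task_result result = ctx_server.queue_results.recv(id_task); // blocks until the loop answers
    ctx_server.queue_results.remove_waiting_task_id(id_task);

    return result.data; // result.error / result.stop describe how the task ended
}
```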
@@ -347,60 +650,36 @@ struct llama_server_context
std::vector slots;
json default_generation_settings_for_props;
- llama_server_queue queue_tasks;
- llama_server_response queue_results;
+ server_queue queue_tasks;
+ server_response queue_results;
server_metrics metrics;
- ~llama_server_context()
- {
- if (ctx)
- {
+ ~server_context() {
+ if (ctx) {
llama_free(ctx);
ctx = nullptr;
}
- if (model)
- {
+
+ if (model) {
llama_free_model(model);
model = nullptr;
}
}
- bool load_model(const gpt_params ¶ms_)
- {
+ bool load_model(const gpt_params & params_) {
params = params_;
- if (!params.mmproj.empty()) {
- multimodal = true;
- LOG_INFO("Multi Modal Mode Enabled", {});
- clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
- if(clp_ctx == nullptr) {
- LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
- return false;
- }
- if (params.n_ctx < 2048) { // request larger context for the image embedding
- params.n_ctx = 2048;
- }
- }
+ // dedicate one sequence to the system prompt
+ params.n_parallel += 1;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (model == nullptr)
- {
+ params.n_parallel -= 1; // but be sneaky about it
+ if (model == nullptr) {
LOG_ERROR("unable to load model", {{"model", params.model}});
return false;
}
- if (multimodal) {
- const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
- const int n_embd_llm = llama_n_embd(model);
- if (n_embd_clip != n_embd_llm) {
- LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
- llama_free(ctx);
- llama_free_model(model);
- return false;
- }
- }
-
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_should_add_bos_token(model);
@@ -408,25 +687,20 @@ struct llama_server_context
return true;
}
- void validate_model_chat_template(server_params & sparams) {
+ bool validate_model_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}};
- std::vector<char> buf(1);
- int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
- if (res < 0) {
- LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- sparams.chat_template = "chatml";
- }
- }
- void initialize() {
- // create slots
- all_slots_are_idle = true;
+ const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+ return res > 0;
+ }
+
+ void init() {
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
- for (int i = 0; i < params.n_parallel; i++)
- {
+
+ for (int i = 0; i < params.n_parallel; i++) {
server_slot slot;
slot.id = i;
@@ -434,7 +708,7 @@ struct llama_server_context
slot.n_predict = params.n_predict;
LOG_INFO("new slot", {
- {"slot_id", slot.id},
+ {"id_slot", slot.id},
{"n_ctx_slot", slot.n_ctx}
});
@@ -448,9 +722,9 @@ struct llama_server_context
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
LOG_INFO("slot self-extend", {
- {"slot_id", slot.id},
- {"ga_n", ga_n},
- {"ga_w", ga_w}
+ {"id_slot", slot.id},
+ {"ga_n", ga_n},
+ {"ga_w", ga_w}
});
}
@@ -466,11 +740,18 @@ struct llama_server_context
default_generation_settings_for_props = get_formated_generation(slots.front());
default_generation_settings_for_props["seed"] = -1;
- batch = llama_batch_init(n_ctx, 0, params.n_parallel);
+ // the update_slots() logic will always submit a maximum of n_batch tokens
+ // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
+ {
+ const int32_t n_batch = llama_n_batch(ctx);
+
+ batch = llama_batch_init(n_batch, 0, params.n_parallel);
+ }
+
+ metrics.init();
}
- std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
- {
+ std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
// TODO: currently, we tokenize using special tokens by default
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
// but it's better compared to completely ignoring ChatML and other chat templates
@@ -480,38 +761,30 @@ struct llama_server_context
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
- if (json_prompt.is_array())
- {
+ if (json_prompt.is_array()) {
bool first = true;
- for (const auto& p : json_prompt)
- {
- if (p.is_string())
- {
+ for (const auto & p : json_prompt) {
+ if (p.is_string()) {
+ auto s = p.template get<std::string>();
+
+ std::vector<llama_token> p;
- if (first)
- {
+ if (first) {
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false;
- }
- else
- {
+ } else {
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
}
+
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
- }
- else
- {
- if (first)
- {
+ } else {
+ if (first) {
first = false;
}
+
+ prompt_tokens.push_back(p.template get<llama_token>());
}
}
- }
- else
- {
+ } else {
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
}
@@ -519,19 +792,18 @@ struct llama_server_context
return prompt_tokens;
}
- server_slot* get_slot(int id) {
+ server_slot * get_slot(int id) {
int64_t t_last = ggml_time_us();
- server_slot *last_used = nullptr;
- for (server_slot & slot : slots)
- {
- if (slot.id == id && slot.available())
- {
+ server_slot * last_used = nullptr;
+
+ for (server_slot & slot : slots) {
+ if (slot.id == id && slot.available()) {
return &slot;
}
- if (slot.available() && slot.t_last_used < t_last)
- {
+ // among all available slots, find the one that has been least recently used
+ if (slot.available() && slot.t_last_used < t_last) {
last_used = &slot;
t_last = slot.t_last_used;
}
@@ -540,295 +812,225 @@ struct llama_server_context
return last_used;
}
- bool launch_slot_with_data(server_slot* &slot, json data) {
+ bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot_params default_params;
llama_sampling_params default_sparams;
+ auto & data = task.data;
if (data.count("__oaicompat") != 0) {
- slot->oaicompat = true;
- slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+ slot.oaicompat = true;
+ slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
} else {
- slot->oaicompat = false;
- slot->oaicompat_model = "";
- }
-
- slot->params.stream = json_value(data, "stream", false);
- slot->params.cache_prompt = json_value(data, "cache_prompt", false);
- slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
- slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
- slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
- slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
- slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
- slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
- slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
- slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
- slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
- slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
- slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
- slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
- slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
- slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
- slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
- slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
- slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
- slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
- slot->params.seed = json_value(data, "seed", default_params.seed);
- slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
- slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
- slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
-
- if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+ slot.oaicompat = false;
+ slot.oaicompat_model = "";
+ }
+
+ slot.params.stream = json_value(data, "stream", false);
+ slot.params.cache_prompt = json_value(data, "cache_prompt", false);
+ slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+ slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+ slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+ slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
+ slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
+ slot.sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+ slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
+ slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
+ slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
+ slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+ slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+ slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+ slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
+ slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
+ slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+ slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+ slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+ slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
+ slot.params.seed = json_value(data, "seed", default_params.seed);
+ slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+ slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+ slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+
+ if (slot.params.cache_prompt && slot.ga_n != 1) {
+ LOG_WARNING("cache_prompt is not supported with group-attention", {});
+ slot.params.cache_prompt = false;
+ }
+
+ if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
// Might be better to reject the request with a 400 ?
LOG_WARNING("Max tokens to predict exceeds server configuration", {
- {"params.n_predict", slot->params.n_predict},
- {"slot.n_predict", slot->n_predict},
+ {"params.n_predict", slot.params.n_predict},
+ {"slot.n_predict", slot.n_predict},
});
- slot->params.n_predict = slot->n_predict;
+ slot.params.n_predict = slot.n_predict;
}
// infill
- if (data.count("input_prefix") != 0)
- {
- slot->params.input_prefix = data["input_prefix"];
- }
- else
- {
- slot->params.input_prefix = "";
- }
+ slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
+ slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
- if (data.count("input_suffix") != 0)
- {
- slot->params.input_suffix = data["input_suffix"];
- }
- else
+ // get prompt
{
- slot->params.input_suffix = "";
+ const auto & prompt = data.find("prompt");
+ if (prompt == data.end()) {
+ send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ } else {
+ slot.prompt = *prompt;
+ }
+ if (slot.prompt.is_array() && slot.prompt.size() == 0) {
+ send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ }
}
- if (data.count("prompt") != 0)
- {
- slot->prompt = data["prompt"];
- }
- else
+ // penalize user-provided tokens
{
- slot->prompt = "";
- }
+ slot.sparams.penalty_prompt_tokens.clear();
+ slot.sparams.use_penalty_prompt_tokens = false;
- slot->sparams.penalty_prompt_tokens.clear();
- slot->sparams.use_penalty_prompt_tokens = false;
- const auto &penalty_prompt = data.find("penalty_prompt");
- if (penalty_prompt != data.end())
- {
- if (penalty_prompt->is_string())
- {
- const auto penalty_prompt_string = penalty_prompt->get<std::string>();
- auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
- slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
- if (slot->params.n_predict > 0)
- {
- slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
+ const auto & penalty_prompt = data.find("penalty_prompt");
+
+ if (penalty_prompt != data.end()) {
+ if (penalty_prompt->is_string()) {
+ const auto penalty_prompt_string = penalty_prompt->get<std::string>();
+ slot.sparams.penalty_prompt_tokens = llama_tokenize(model, penalty_prompt_string, false);
+
+ if (slot.params.n_predict > 0) {
+ slot.sparams.penalty_prompt_tokens.reserve(slot.sparams.penalty_prompt_tokens.size() + slot.params.n_predict);
+ }
+ slot.sparams.use_penalty_prompt_tokens = true;
+
+ LOG_VERBOSE("penalty_prompt_tokens", {
+ {"id_slot", slot.id},
+ {"tokens", slot.sparams.penalty_prompt_tokens},
+ });
}
- slot->sparams.use_penalty_prompt_tokens = true;
- }
- else if (penalty_prompt->is_array())
- {
- const auto n_tokens = penalty_prompt->size();
- slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
- const int n_vocab = llama_n_vocab(model);
- for (const auto &penalty_token : *penalty_prompt)
- {
- if (penalty_token.is_number_integer())
- {
- const auto tok = penalty_token.get<llama_token>();
- if (tok >= 0 && tok < n_vocab)
- {
- slot->sparams.penalty_prompt_tokens.push_back(tok);
+ else if (penalty_prompt->is_array()) {
+ const auto n_tokens = penalty_prompt->size();
+ slot.sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot.params.n_predict));
+
+ const int n_vocab = llama_n_vocab(model);
+ for (const auto & penalty_token : *penalty_prompt) {
+ if (penalty_token.is_number_integer()) {
+ const auto tok = penalty_token.get<llama_token>();
+ if (tok >= 0 && tok < n_vocab) {
+ slot.sparams.penalty_prompt_tokens.push_back(tok);
+ }
}
}
+ slot.sparams.use_penalty_prompt_tokens = true;
+
+ LOG_VERBOSE("penalty_prompt_tokens", {
+ {"id_slot", slot.id},
+ {"tokens", slot.sparams.penalty_prompt_tokens},
+ });
}
- slot->sparams.use_penalty_prompt_tokens = true;
}
}
- slot->sparams.logit_bias.clear();
-
- if (json_value(data, "ignore_eos", false))
{
- slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
- }
+ slot.sparams.logit_bias.clear();
- const auto &logit_bias = data.find("logit_bias");
- if (logit_bias != data.end() && logit_bias->is_array())
- {
- const int n_vocab = llama_n_vocab(model);
- for (const auto &el : *logit_bias)
- {
- if (el.is_array() && el.size() == 2)
- {
- float bias;
- if (el[1].is_number())
- {
- bias = el[1].get<float>();
- }
- else if (el[1].is_boolean() && !el[1].get<bool>())
- {
- bias = -INFINITY;
- }
- else
- {
- continue;
- }
+ if (json_value(data, "ignore_eos", false)) {
+ slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+ }
- if (el[0].is_number_integer())
- {
- llama_token tok = el[0].get<llama_token>();
- if (tok >= 0 && tok < n_vocab)
- {
- slot->sparams.logit_bias[tok] = bias;
+ const auto & logit_bias = data.find("logit_bias");
+ if (logit_bias != data.end() && logit_bias->is_array()) {
+ const int n_vocab = llama_n_vocab(model);
+ for (const auto & el : *logit_bias) {
+ // TODO: we may want to throw errors here, in case "el" is incorrect
+ if (el.is_array() && el.size() == 2) {
+ float bias;
+ if (el[1].is_number()) {
+ bias = el[1].get<float>();
+ } else if (el[1].is_boolean() && !el[1].get<bool>()) {
+ bias = -INFINITY;
+ } else {
+ continue;
}
- }
- else if (el[0].is_string())
- {
- auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
- for (auto tok : toks)
- {
- slot->sparams.logit_bias[tok] = bias;
+
+ if (el[0].is_number_integer()) {
+ llama_token tok = el[0].get<llama_token>();
+ if (tok >= 0 && tok < n_vocab) {
+ slot.sparams.logit_bias[tok] = bias;
+ }
+ } else if (el[0].is_string()) {
+ auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+ for (auto tok : toks) {
+ slot.sparams.logit_bias[tok] = bias;
+ }
}
}
}
}
}
- slot->params.antiprompt.clear();
-
- const auto &stop = data.find("stop");
- if (stop != data.end() && stop->is_array())
{
- for (const auto &word : *stop)
- {
- if (!word.empty())
- {
- slot->params.antiprompt.push_back(word);
+ slot.params.antiprompt.clear();
+
+ const auto & stop = data.find("stop");
+ if (stop != data.end() && stop->is_array()) {
+ for (const auto & word : *stop) {
+ if (!word.empty()) {
+ slot.params.antiprompt.push_back(word);
+ }
}
}
}
- const auto &samplers_sequence = data.find("samplers");
- if (samplers_sequence != data.end() && samplers_sequence->is_array())
{
- std::vector<std::string> sampler_names;
- for (const auto &sampler_name : *samplers_sequence)
- {
- if (sampler_name.is_string())
- {
- sampler_names.emplace_back(sampler_name);
+ const auto & samplers_sequence = data.find("samplers");
+ if (samplers_sequence != data.end() && samplers_sequence->is_array()) {
+ std::vector<std::string> sampler_names;
+ for (const auto & sampler_name : *samplers_sequence) {
+ if (sampler_name.is_string()) {
+ sampler_names.emplace_back(sampler_name);
+ }
}
+ slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+ } else {
+ slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
}
- slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
}
- else
+
{
- slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+ if (slot.ctx_sampling != nullptr) {
+ llama_sampling_free(slot.ctx_sampling);
+ }
+ slot.ctx_sampling = llama_sampling_init(slot.sparams);
+ if (slot.ctx_sampling == nullptr) {
+ // for now, the only error that may happen here is invalid grammar
+ send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
+ return false;
+ }
+ llama_set_rng_seed(ctx, slot.params.seed);
}
- if (multimodal)
- {
- const auto &images_data = data.find("image_data");
- if (images_data != data.end() && images_data->is_array())
- {
- for (const auto &img : *images_data)
- {
- const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
-
- slot_image img_sl;
- img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
- img_sl.img_data = clip_image_u8_init();
- if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
- {
- LOG_ERROR("failed to load image", {
- {"slot_id", slot->id},
- {"img_sl_id", img_sl.id}
- });
- return false;
- }
- LOG_VERBOSE("image loaded", {
- {"slot_id", slot->id},
- {"img_sl_id", img_sl.id}
- });
- img_sl.request_encode_image = true;
- slot->images.push_back(img_sl);
- }
- // process prompt
- // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
- if (slot->images.size() > 0 && !slot->prompt.is_array())
- {
- std::string prompt = slot->prompt.get<std::string>();
- size_t pos = 0, begin_prefix = 0;
- std::string pattern = "[img-";
- while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
- size_t end_prefix = pos;
- pos += pattern.length();
- size_t end_pos = prompt.find(']', pos);
- if (end_pos != std::string::npos)
- {
- std::string image_id = prompt.substr(pos, end_pos - pos);
- try
- {
- int img_id = std::stoi(image_id);
- bool found = false;
- for (slot_image &img : slot->images)
- {
- if (img.id == img_id) {
- found = true;
- img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
- begin_prefix = end_pos + 1;
- break;
- }
- }
- if (!found) {
- LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
- slot->images.clear();
- return false;
- }
- } catch (const std::invalid_argument& e) {
- LOG_TEE("Invalid image number id in prompt\n");
- slot->images.clear();
- return false;
- }
- }
- }
- slot->prompt = "";
- slot->params.input_suffix = prompt.substr(begin_prefix);
- slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
- }
- }
- }
-
- if (slot->ctx_sampling != nullptr)
- {
- llama_sampling_free(slot->ctx_sampling);
- }
- slot->ctx_sampling = llama_sampling_init(slot->sparams);
- llama_set_rng_seed(ctx, slot->params.seed);
- slot->command = LOAD_PROMPT;
-
- all_slots_are_idle = false;
+ slot.command = SLOT_COMMAND_LOAD_PROMPT;
+ slot.prompt_tokens.clear();
LOG_INFO("slot is processing task", {
- {"slot_id", slot->id},
- {"task_id", slot->task_id},
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
});
return true;
}
void kv_cache_clear() {
+ LOG_VERBOSE("clearing KV cache", {});
+
// clear the entire KV cache
llama_kv_cache_clear(ctx);
clean_kv_cache = false;
}
void system_prompt_update() {
+ LOG_VERBOSE("system prompt update", {
+ {"system_prompt", system_prompt},
+ });
+
kv_cache_clear();
system_tokens.clear();
@@ -837,14 +1039,14 @@ struct llama_server_context
llama_batch_clear(batch);
- for (int i = 0; i < (int)system_tokens.size(); ++i)
- {
+ for (int i = 0; i < (int)system_tokens.size(); ++i) {
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
- {
- const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i));
+ const int32_t n_batch = llama_n_batch(ctx);
+
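+ // decode the system prompt in chunks of at most n_batch tokens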
+ for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
+ const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
llama_batch batch_view = {
n_tokens,
batch.token + i,
@@ -855,78 +1057,42 @@ struct llama_server_context
batch.logits + i,
0, 0, 0, // unused
};
- if (llama_decode(ctx, batch_view) != 0)
- {
+
+ if (llama_decode(ctx, batch_view) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return;
}
}
// assign the system KV cache to all parallel sequences
- for (int32_t i = 1; i < params.n_parallel; ++i)
- {
- llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+ for (int32_t i = 1; i <= params.n_parallel; ++i) {
+ llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
}
- LOG_TEE("system prompt updated\n");
system_need_update = false;
}
- void system_prompt_notify() {
- // release all slots
- for (server_slot &slot : slots)
- {
- slot.release();
- }
-
- system_need_update = true;
- }
-
- void system_prompt_process(const json &sys_props) {
+ void system_prompt_set(const json & sys_props) {
system_prompt = sys_props.value("prompt", "");
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");
+ LOG_VERBOSE("system prompt process", {
+ {"system_prompt", system_prompt},
+ {"name_user", name_user},
+ {"name_assistant", name_assistant},
+ });
- system_prompt_notify();
- }
-
- static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
- const stop_type type, server_slot &slot)
- {
- size_t stop_pos = std::string::npos;
-
- for (const std::string &word : slot.params.antiprompt)
- {
- size_t pos;
- if (type == STOP_FULL)
- {
- const size_t tmp = word.size() + last_token_size;
- const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
- pos = text.find(word, from_pos);
- }
- else
- {
- pos = find_partial_stop_string(word, text);
- }
- if (pos != std::string::npos &&
- (stop_pos == std::string::npos || pos < stop_pos))
- {
- if (type == STOP_FULL)
- {
- slot.stopped_word = true;
- slot.stopping_word = word;
- slot.has_next_token = false;
- }
- stop_pos = pos;
- }
+ // release all slots
+ for (server_slot & slot : slots) {
+ slot.release();
}
- return stop_pos;
+ system_need_update = true;
}
- bool process_token(completion_token_output &result, server_slot &slot) {
+ bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok);
slot.sampled = result.tok;
@@ -935,34 +1101,26 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;
- if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
- {
+ if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) {
// we can change penalty_prompt_tokens because it is always created from scratch each request
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
}
// check if there is incomplete UTF-8 character at the end
bool incomplete = false;
- for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
- {
+ for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) {
unsigned char c = slot.generated_text[slot.generated_text.size() - i];
- if ((c & 0xC0) == 0x80)
- {
+ if ((c & 0xC0) == 0x80) {
// continuation byte: 10xxxxxx
continue;
}
- if ((c & 0xE0) == 0xC0)
- {
+ if ((c & 0xE0) == 0xC0) {
// 2-byte character: 110xxxxx ...
incomplete = i < 2;
- }
- else if ((c & 0xF0) == 0xE0)
- {
+ } else if ((c & 0xF0) == 0xE0) {
// 3-byte character: 1110xxxx ...
incomplete = i < 3;
- }
- else if ((c & 0xF8) == 0xF0)
- {
+ } else if ((c & 0xF8) == 0xF0) {
// 4-byte character: 11110xxx ...
incomplete = i < 4;
}
@@ -970,206 +1128,192 @@ struct llama_server_context
break;
}
- if (!incomplete)
- {
+ if (!incomplete) {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
+
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
- size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
- if (stop_pos != std::string::npos)
- {
+
+ size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
+ if (stop_pos != std::string::npos) {
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
- }
- else
- {
+ } else {
is_stop_full = false;
- stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+ stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
}
// check if there is any token to predict
- if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
- {
+ if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
// do not send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
+
slot.add_token_string(result);
- if (slot.params.stream)
- {
+ if (slot.params.stream) {
send_partial_response(slot, result);
}
}
- if (incomplete)
- {
+ if (incomplete) {
slot.has_next_token = true;
}
// check the limits
- if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params))
- {
- slot.stopped_limit = true;
+ if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+ slot.stopped_limit = true;
slot.has_next_token = false;
+
+ LOG_VERBOSE("stopped by limit", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_decoded", slot.n_decoded},
+ {"n_predict", slot.params.n_predict},
+ });
}
- if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
- {
- slot.stopped_eos = true;
+ if (result.tok == llama_token_eos(model)) {
+ slot.stopped_eos = true;
slot.has_next_token = false;
+
LOG_VERBOSE("eos token found", {});
}
LOG_VERBOSE("next token", {
- {"token", result.tok},
- {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
- {"has_next_token", slot.has_next_token},
- {"n_remain", slot.n_remaining},
- {"num_tokens_predicted", slot.n_decoded},
- {"stopped_eos", slot.stopped_eos},
- {"stopped_word", slot.stopped_word},
- {"stopped_limit", slot.stopped_limit},
- {"stopping_word", slot.stopping_word},
- });
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"token", result.tok},
+ {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
+ {"has_next_token", slot.has_next_token},
+ {"n_remain", slot.n_remaining},
+ {"n_decoded", slot.n_decoded},
+ {"stopped_eos", slot.stopped_eos},
+ {"stopped_word", slot.stopped_word},
+ {"stopped_limit", slot.stopped_limit},
+ {"stopping_word", slot.stopping_word},
+ });
return slot.has_next_token; // continue
}
- bool process_images(server_slot &slot) const
- {
- for (slot_image &img : slot.images)
- {
- if (!img.request_encode_image)
- {
- continue;
- }
-
- if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
- LOG_TEE("Error processing the given image");
- return false;
- }
-
-
- img.request_encode_image = false;
- }
-
- return slot.images.size() > 0;
- }
-
- void send_error(task_server& task, const std::string &error)
- {
- LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
- task_result res;
- res.id = task.id;
- res.multitask_id = task.multitask_id;
- res.stop = false;
- res.error = true;
- res.result_json = { { "content", error } };
- queue_results.send(res);
- }
-
- json get_formated_generation(server_slot &slot)
- {
+ json get_formated_generation(const server_slot & slot) const {
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
- const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
- eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+ const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+
std::vector<std::string> samplers_sequence;
- for (const auto &sampler_type : slot.sparams.samplers_sequence)
- {
+ samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
+ for (const auto & sampler_type : slot.sparams.samplers_sequence) {
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
}
return json {
- {"n_ctx", slot.n_ctx},
- {"n_predict", slot.n_predict},
- {"model", params.model_alias},
- {"seed", slot.params.seed},
- {"temperature", slot.sparams.temp},
- {"dynatemp_range", slot.sparams.dynatemp_range},
- {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
- {"top_k", slot.sparams.top_k},
- {"top_p", slot.sparams.top_p},
- {"min_p", slot.sparams.min_p},
- {"tfs_z", slot.sparams.tfs_z},
- {"typical_p", slot.sparams.typical_p},
- {"repeat_last_n", slot.sparams.penalty_last_n},
- {"repeat_penalty", slot.sparams.penalty_repeat},
- {"presence_penalty", slot.sparams.penalty_present},
- {"frequency_penalty", slot.sparams.penalty_freq},
- {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
+ {"n_ctx", slot.n_ctx},
+ {"n_predict", slot.n_predict},
+ {"model", params.model_alias},
+ {"seed", slot.params.seed},
+ {"temperature", slot.sparams.temp},
+ {"dynatemp_range", slot.sparams.dynatemp_range},
+ {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
+ {"top_k", slot.sparams.top_k},
+ {"top_p", slot.sparams.top_p},
+ {"min_p", slot.sparams.min_p},
+ {"tfs_z", slot.sparams.tfs_z},
+ {"typical_p", slot.sparams.typical_p},
+ {"repeat_last_n", slot.sparams.penalty_last_n},
+ {"repeat_penalty", slot.sparams.penalty_repeat},
+ {"presence_penalty", slot.sparams.penalty_present},
+ {"frequency_penalty", slot.sparams.penalty_freq},
+ {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
- {"mirostat", slot.sparams.mirostat},
- {"mirostat_tau", slot.sparams.mirostat_tau},
- {"mirostat_eta", slot.sparams.mirostat_eta},
- {"penalize_nl", slot.sparams.penalize_nl},
- {"stop", slot.params.antiprompt},
- {"n_predict", slot.params.n_predict},
- {"n_keep", params.n_keep},
- {"ignore_eos", ignore_eos},
- {"stream", slot.params.stream},
- {"logit_bias", slot.sparams.logit_bias},
- {"n_probs", slot.sparams.n_probs},
- {"min_keep", slot.sparams.min_keep},
- {"grammar", slot.sparams.grammar},
- {"samplers", samplers_sequence}
+ {"mirostat", slot.sparams.mirostat},
+ {"mirostat_tau", slot.sparams.mirostat_tau},
+ {"mirostat_eta", slot.sparams.mirostat_eta},
+ {"penalize_nl", slot.sparams.penalize_nl},
+ {"stop", slot.params.antiprompt},
+ {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
+ {"n_keep", params.n_keep},
+ {"ignore_eos", ignore_eos},
+ {"stream", slot.params.stream},
+ {"logit_bias", slot.sparams.logit_bias},
+ {"n_probs", slot.sparams.n_probs},
+ {"min_keep", slot.sparams.min_keep},
+ {"grammar", slot.sparams.grammar},
+ {"samplers", samplers_sequence}
};
}
- void send_partial_response(server_slot &slot, completion_token_output tkn)
- {
- task_result res;
- res.id = slot.task_id;
- res.multitask_id = slot.multitask_id;
- res.error = false;
- res.stop = false;
+ void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
+ send_error(task.id, task.id_multi, error, type);
+ }
- res.result_json = json
- {
+ void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
+ send_error(slot.id_task, slot.id_multi, error, type);
+ }
+
+ void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
+ LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
+
+ server_task_result res;
+ res.id = id_task;
+ res.id_multi = id_multi;
+ res.stop = false;
+ res.error = true;
+ res.data = format_error_response(error, type);
+
+ queue_results.send(res);
+ }
+
+ void send_partial_response(server_slot & slot, completion_token_output tkn) {
+ server_task_result res;
+ res.id = slot.id_task;
+ res.id_multi = slot.id_multi;
+ res.error = false;
+ res.stop = false;
+ res.data = json {
{"content", tkn.text_to_send},
{"stop", false},
- {"slot_id", slot.id},
- {"multimodal", multimodal}
+ {"id_slot", slot.id},
+ {"multimodal", false}
};
- if (slot.sparams.n_probs > 0)
- {
- std::vector<completion_token_output> probs_output = {};
+ if (slot.sparams.n_probs > 0) {
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
- size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
- size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
- if (probs_pos < probs_stop_pos)
- {
- probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
+ const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
+ const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
+
+ std::vector<completion_token_output> probs_output;
+ if (probs_pos < probs_stop_pos) {
+ probs_output = std::vector<completion_token_output>(
+ slot.generated_token_probs.begin() + probs_pos,
+ slot.generated_token_probs.begin() + probs_stop_pos);
}
slot.n_sent_token_probs = probs_stop_pos;
- res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
+
+ res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
}
- if (slot.oaicompat)
- {
- res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
- res.result_json["model"] = slot.oaicompat_model;
+ if (slot.oaicompat) {
+ res.data["oaicompat_token_ctr"] = slot.n_decoded;
+ res.data["model"] = slot.oaicompat_model;
}
queue_results.send(res);
}
- void send_final_response(server_slot &slot)
- {
- task_result res;
- res.id = slot.task_id;
- res.multitask_id = slot.multitask_id;
- res.error = false;
- res.stop = true;
-
- res.result_json = json
- {
+ void send_final_response(const server_slot & slot) {
+ server_task_result res;
+ res.id = slot.id_task;
+ res.id_multi = slot.id_multi;
+ res.error = false;
+ res.stop = true;
+ res.data = json {
{"content", !slot.params.stream ? slot.generated_text : ""},
- {"slot_id", slot.id},
+ {"id_slot", slot.id},
{"stop", true},
{"model", params.model_alias},
{"tokens_predicted", slot.n_decoded},
@@ -1185,96 +1329,91 @@ struct llama_server_context
{"timings", slot.get_formated_timings()}
};
- if (slot.sparams.n_probs > 0)
- {
- std::vector<completion_token_output> probs = {};
- if (!slot.params.stream && slot.stopped_word)
- {
+ if (slot.sparams.n_probs > 0) {
+ std::vector<completion_token_output> probs;
+ if (!slot.params.stream && slot.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
- probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
- }
- else
- {
+
+ probs = std::vector<completion_token_output>(
+ slot.generated_token_probs.begin(),
+ slot.generated_token_probs.end() - stop_word_toks.size());
+ } else {
probs = std::vector<completion_token_output>(
- slot.generated_token_probs.begin(),
- slot.generated_token_probs.end());
+ slot.generated_token_probs.begin(),
+ slot.generated_token_probs.end());
}
- res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
+
+ res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs);
}
- if (slot.oaicompat)
- {
- res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
- res.result_json["model"] = slot.oaicompat_model;
+ if (slot.oaicompat) {
+ res.data["oaicompat_token_ctr"] = slot.n_decoded;
+ res.data["model"] = slot.oaicompat_model;
}
queue_results.send(res);
}
- void send_embedding(server_slot & slot, const llama_batch & batch)
- {
- task_result res;
- res.id = slot.task_id;
- res.multitask_id = slot.multitask_id;
- res.error = false;
- res.stop = true;
+ void send_embedding(const server_slot & slot, const llama_batch & batch) {
+ server_task_result res;
+ res.id = slot.id_task;
+ res.id_multi = slot.id_multi;
+ res.error = false;
+ res.stop = true;
const int n_embd = llama_n_embd(model);
- if (!params.embedding)
- {
- LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
- res.result_json = json
- {
- {"embedding", std::vector(n_embd, 0.0f)},
- };
- }
- else
- {
- for (int i = 0; i < batch.n_tokens; ++i) {
- if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
- continue;
- }
+ std::vector<float> embd_res(n_embd, 0.0f);
- const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
- if (embd == NULL) {
- embd = llama_get_embeddings_ith(ctx, i);
- if (embd == NULL) {
- LOG_ERROR("failed to get embeddings for token", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
- res.result_json = json
- {
- {"embedding", std::vector(n_embd, 0.0f)},
- };
- continue;
- }
- }
+ for (int i = 0; i < batch.n_tokens; ++i) {
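+ // sequence 0 is dedicated to the system prompt, so this slot's outputs are on seq_id slot.id + 1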
+ if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+ continue;
+ }
- res.result_json = json
- {
- {"embedding", std::vector(embd, embd + n_embd)},
+ const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+ if (embd == NULL) {
+ embd = llama_get_embeddings_ith(ctx, i);
+ }
+
+ if (embd == NULL) {
+ LOG_ERROR("failed to get embeddings", {
+ {"token", batch.token [i]},
+ {"seq_id", batch.seq_id[i][0]}
+ });
+
+ res.data = json {
+ {"embedding", std::vector(n_embd, 0.0f)},
};
+
+ continue;
}
+
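+ // normalize the extracted embedding before returning it to the client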
+ llama_embd_normalize(embd, embd_res.data(), n_embd);
+
+ res.data = json {
+ {"embedding", embd_res},
+ };
}
+
queue_results.send(res);
}
- void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
- {
- task_server task;
- task.id = task_id;
- task.target_id = 0;
- task.data = std::move(data);
- task.infill_mode = infill;
- task.embedding_mode = embedding;
- task.type = TASK_TYPE_COMPLETION;
- task.multitask_id = multitask_id;
+ void request_completion(int id_task, int id_multi, json data, bool infill, bool embedding) {
+ server_task task;
+ task.id = id_task;
+ task.id_multi = id_multi;
+ task.id_target = 0;
+ task.data = std::move(data);
+ task.infill = infill;
+ task.embedding = embedding;
+ task.type = SERVER_TASK_TYPE_COMPLETION;
// when a completion task's prompt array is not a singleton, we split it into multiple requests
// otherwise, it's a single-prompt task, we actually queue it
// if there's numbers in the prompt array it will be treated as an array of tokens
if (task.data.count("prompt") != 0 && task.data.at("prompt").size() > 1) {
bool numbers = false;
- for (const auto& e : task.data.at("prompt")) {
+ for (const auto & e : task.data.at("prompt")) {
if (e.is_number()) {
numbers = true;
break;
@@ -1289,106 +1428,23 @@ struct llama_server_context
if (numbers) {
queue_tasks.post(task);
} else {
- split_multiprompt_task(task_id, task);
+ split_multiprompt_task(id_task, task);
}
} else {
- // an empty prompt can make slot become buggy
- if (task.data.contains("prompt") && task.data["prompt"].is_string() && task.data["prompt"].get().empty()) {
- task.data["prompt"] = " "; // add a space so that we have one token
- }
queue_tasks.post(task);
}
}
- // for multiple images processing
- bool ingest_images(server_slot &slot, int n_batch)
- {
- int image_idx = 0;
-
- while (image_idx < (int) slot.images.size())
- {
- slot_image &img = slot.images[image_idx];
-
- // process prefix prompt
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
- {
- const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
- llama_batch batch_view = {
- n_tokens,
- batch.token + i,
- nullptr,
- batch.pos + i,
- batch.n_seq_id + i,
- batch.seq_id + i,
- batch.logits + i,
- 0, 0, 0, // unused
- };
- if (llama_decode(ctx, batch_view))
- {
- LOG_TEE("%s : failed to eval\n", __func__);
- return false;
- }
- }
-
- // process image with llm
- for (int i = 0; i < img.image_tokens; i += n_batch)
- {
- int n_eval = img.image_tokens - i;
- if (n_eval > n_batch)
- {
- n_eval = n_batch;
- }
-
- const int n_embd = llama_n_embd(model);
- llama_batch batch_img = {
- n_eval,
- nullptr,
- (img.image_embedding + i * n_embd),
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- slot.n_past,
- 1, 0
- };
- if (llama_decode(ctx, batch_img))
- {
- LOG_TEE("%s : failed to eval image\n", __func__);
- return false;
- }
- slot.n_past += n_eval;
- }
- image_idx++;
-
- llama_batch_clear(batch);
-
- // append prefix of next image
- const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
- slot.params.input_suffix : // no more images, then process suffix prompt
- (json)(slot.images[image_idx].prefix_prompt);
-
- std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
- for (int i = 0; i < (int) append_tokens.size(); ++i)
- {
- llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
- slot.n_past += 1;
- }
- }
-
- return true;
- }
+ void request_cancel(int id_task) {
+ server_task task;
+ task.type = SERVER_TASK_TYPE_CANCEL;
+ task.id_target = id_task;
- void request_cancel(int task_id)
- {
- task_server task;
- task.type = TASK_TYPE_CANCEL;
- task.target_id = task_id;
queue_tasks.post(task);
}
- void split_multiprompt_task(int multitask_id, task_server& multiprompt_task)
- {
- int prompt_count = multiprompt_task.data.at("prompt").size();
+ void split_multiprompt_task(int id_multi, const server_task & multiprompt_task) {
+ const int prompt_count = multiprompt_task.data.at("prompt").size();
if (prompt_count <= 1) {
send_error(multiprompt_task, "error while handling multiple prompts");
return;
@@ -1396,133 +1452,129 @@ struct llama_server_context
// generate all the ID for subtask
std::vector<int> subtask_ids(prompt_count);
- for (int i = 0; i < prompt_count; i++)
- {
+ for (int i = 0; i < prompt_count; i++) {
subtask_ids[i] = queue_tasks.get_new_id();
}
// queue up the multitask so we can track its subtask progression
- queue_tasks.add_multitask(multitask_id, subtask_ids);
+ queue_tasks.add_multitask(id_multi, subtask_ids);
// add subtasks
- for (int i = 0; i < prompt_count; i++)
- {
+ for (int i = 0; i < prompt_count; i++) {
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
- request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
+ request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
}
}
- void process_single_task(task_server& task)
- {
- switch (task.type)
- {
- case TASK_TYPE_COMPLETION: {
- server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
- if (slot == nullptr)
- {
- // if no slot is available, we defer this task for processing later
- LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
- queue_tasks.defer(task);
- break;
- }
-
- if (task.data.contains("system_prompt"))
+ void process_single_task(const server_task & task) {
+ switch (task.type) {
+ case SERVER_TASK_TYPE_COMPLETION:
{
- if (!all_slots_are_idle) {
- send_error(task, "system prompt can only be updated when all slots are idle");
+ server_slot * slot = get_slot(json_value(task.data, "id_slot", -1));
+ if (slot == nullptr) {
+ // if no slot is available, we defer this task for processing later
+ LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
+ queue_tasks.defer(task);
break;
}
- system_prompt_process(task.data["system_prompt"]);
- // reset cache_tokens for all slots
- for (server_slot &slot : slots)
- {
- slot.cache_tokens.clear();
- slot.n_past = 0;
- slot.n_past_se = 0;
+ if (task.data.contains("system_prompt")) {
+ system_prompt_set(task.data["system_prompt"]);
+
+ for (server_slot & slot : slots) {
+ slot.n_past = 0;
+ slot.n_past_se = 0;
+ }
}
- }
- slot->reset();
+ slot->reset();
- slot->infill = task.infill_mode;
- slot->embedding = task.embedding_mode;
- slot->task_id = task.id;
- slot->multitask_id = task.multitask_id;
+ slot->id_task = task.id;
+ slot->id_multi = task.id_multi;
+ slot->infill = task.infill;
+ slot->embedding = task.embedding;
- if (!launch_slot_with_data(slot, task.data))
- {
- // send error result
- send_error(task, "internal_error");
- break;
- }
- } break;
- case TASK_TYPE_CANCEL: { // release slot linked with the task id
- for (auto & slot : slots)
- {
- if (slot.task_id == task.target_id)
- {
- slot.release();
+ if (!launch_slot_with_task(*slot, task)) {
+ LOG_ERROR("error while launching slot", task.data);
break;
}
- }
- } break;
- case TASK_TYPE_NEXT_RESPONSE: {
- // do nothing
- } break;
- case TASK_TYPE_METRICS: {
- json slots_data = json::array();
- int n_idle_slots = 0;
- int n_processing_slots = 0;
-
- for (server_slot &slot: slots) {
- json slot_data = get_formated_generation(slot);
- slot_data["id"] = slot.id;
- slot_data["task_id"] = slot.task_id;
- slot_data["state"] = slot.state;
- slot_data["prompt"] = slot.prompt;
- slot_data["next_token"] = {
- {"has_next_token", slot.has_next_token},
- {"n_remain", slot.n_remaining},
- {"num_tokens_predicted", slot.n_decoded},
- {"stopped_eos", slot.stopped_eos},
- {"stopped_word", slot.stopped_word},
- {"stopped_limit", slot.stopped_limit},
- {"stopping_word", slot.stopping_word},
- };
- if (slot_data["state"] == IDLE) {
- n_idle_slots++;
- } else {
- n_processing_slots++;
+ } break;
+ case SERVER_TASK_TYPE_CANCEL:
+ {
+ // release slot linked with the task id
+ for (auto & slot : slots) {
+ if (slot.id_task == task.id_target) {
+ slot.release();
+ break;
+ }
}
- slots_data.push_back(slot_data);
- }
- LOG_INFO("slot data", {
- {"task_id", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots}
- });
- LOG_VERBOSE("slot data", {
- {"task_id", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots},
- {"slots", slots_data}
- });
- task_result res;
- res.id = task.id;
- res.multitask_id = task.multitask_id;
- res.stop = true;
- res.error = false;
- res.result_json = {
+ } break;
+ case SERVER_TASK_TYPE_NEXT_RESPONSE:
+ {
+ // do nothing
+ } break;
+ case SERVER_TASK_TYPE_METRICS:
+ {
+ json slots_data = json::array();
+
+ int n_idle_slots = 0;
+ int n_processing_slots = 0;
+
+ for (server_slot & slot : slots) {
+ json slot_data = get_formated_generation(slot);
+ slot_data["id"] = slot.id;
+ slot_data["id_task"] = slot.id_task;
+ slot_data["state"] = slot.state;
+ slot_data["prompt"] = slot.prompt;
+ slot_data["next_token"] = {
+ {"has_next_token", slot.has_next_token},
+ {"n_remain", slot.n_remaining},
+ {"n_decoded", slot.n_decoded},
+ {"stopped_eos", slot.stopped_eos},
+ {"stopped_word", slot.stopped_word},
+ {"stopped_limit", slot.stopped_limit},
+ {"stopping_word", slot.stopping_word},
+ };
+
+ if (slot_data["state"] == SLOT_STATE_IDLE) {
+ n_idle_slots++;
+ } else {
+ n_processing_slots++;
+ }
+
+ slots_data.push_back(slot_data);
+ }
+ LOG_INFO("slot data", {
+ {"id_task", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots}
+ });
+
+ LOG_VERBOSE("slot data", {
+ {"id_task", task.id},
+ {"n_idle_slots", n_idle_slots},
+ {"n_processing_slots", n_processing_slots},
+ {"slots", slots_data}
+ });
+
+ server_task_result res;
+ res.id = task.id;
+ res.id_multi = task.id_multi;
+ res.stop = true;
+ res.error = false;
+ res.data = {
{ "idle", n_idle_slots },
{ "processing", n_processing_slots },
{ "deferred", queue_tasks.queue_tasks_deferred.size() },
+ { "t_start", metrics.t_start},
{ "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
+ { "t_tokens_generation_total", metrics.t_tokens_generation_total},
{ "n_tokens_predicted_total", metrics.n_tokens_predicted_total},
+ { "t_prompt_processing_total", metrics.t_prompt_processing_total},
{ "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
{ "t_prompt_processing", metrics.t_prompt_processing},
@@ -1533,71 +1585,106 @@ struct llama_server_context
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
{ "slots", slots_data },
- };
- metrics.reset_bucket();
- queue_results.send(res);
- } break;
+ };
+
+ if (json_value(task.data, "reset_bucket", false)) {
+ metrics.reset_bucket();
+ }
+ queue_results.send(res);
+ } break;
}
}
- void on_finish_multitask(task_multi& multitask)
- {
+ void on_finish_multitask(const server_task_multi & multitask) {
// all subtasks done == multitask is done
- task_result result;
- result.id = multitask.id;
- result.stop = true;
+ server_task_result result;
+ result.id = multitask.id;
+ result.stop = true;
result.error = false;
// collect json results into one json result
std::vector<json> result_jsons;
- for (auto& subres : multitask.results)
- {
- result_jsons.push_back(subres.result_json);
+ for (const auto & subres : multitask.results) {
+ result_jsons.push_back(subres.data);
result.error = result.error && subres.error;
}
- result.result_json = json{ { "results", result_jsons } };
+ result.data = json {
+ { "results", result_jsons }
+ };
+
queue_results.send(result);
}
- bool update_slots() {
- if (system_need_update)
- {
- LOG_INFO("updating system prompt", {});
+ void update_slots() {
+ if (system_need_update) {
system_prompt_update();
}
- llama_batch_clear(batch);
+ // release slots
+ for (auto & slot : slots) {
+ if (slot.command == SLOT_COMMAND_RELEASE) {
+ slot.state = SLOT_STATE_IDLE;
+ slot.command = SLOT_COMMAND_NONE;
+ slot.t_last_used = ggml_time_us();
- if (all_slots_are_idle)
- {
- if (system_prompt.empty() && clean_kv_cache)
- {
- LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
- kv_cache_clear();
- }
- return true;
- }
+ LOG_INFO("slot released", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()},
+ {"truncated", slot.truncated}
+ });
- LOG_VERBOSE("posting NEXT_RESPONSE", {});
- task_server task;
- task.type = TASK_TYPE_NEXT_RESPONSE;
- task.target_id = -1;
- queue_tasks.post(task);
+ queue_tasks.notify_slot_changed();
+ }
+ }
- for (server_slot &slot : slots)
+ // check if all slots are idle
{
- if (slot.ga_n == 1)
- {
- if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
- {
+ bool all_idle = true;
+
+ for (auto & slot : slots) {
+ if (slot.state != SLOT_STATE_IDLE || slot.command != SLOT_COMMAND_NONE) {
+ all_idle = false;
+ break;
+ }
+ }
+
+ if (all_idle) {
+ LOG_INFO("all slots are idle", {});
+ if (system_prompt.empty() && clean_kv_cache) {
+ kv_cache_clear();
+ }
+
+ return;
+ }
+ }
+
+ {
+ LOG_VERBOSE("posting NEXT_RESPONSE", {});
+
+ server_task task;
+ task.type = SERVER_TASK_TYPE_NEXT_RESPONSE;
+ task.id_target = -1;
+
+ queue_tasks.post(task);
+ }
+
+ // apply context-shift if needed
+ // TODO: simplify and improve
+ for (server_slot & slot : slots) {
+ if (slot.ga_n == 1) {
+ if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2;
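// keep the first n_keep tokens, discard half of the remaining context, and shift the rest down by n_discard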
LOG_INFO("slot context shift", {
- {"slot_id", slot.id},
- {"task_id", slot.task_id},
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
{"n_keep", n_keep},
{"n_left", n_left},
{"n_discard", n_discard},
@@ -1606,15 +1693,17 @@ struct llama_server_context
{"n_system_tokens", system_tokens.size()},
{"n_cache_tokens", slot.cache_tokens.size()}
});
- llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
- for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++)
- {
- slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
- }
+ llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
+ llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+
+ if (slot.params.cache_prompt) {
+ for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
+ slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+ }
- slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+ }
slot.n_past -= n_discard;
@@ -1623,33 +1712,12 @@ struct llama_server_context
}
}
- // decode any currently ongoing sequences
- LOG_VERBOSE("decoding ongoing sequences", {});
- for (auto & slot : slots)
- {
- // release the slot
- if (slot.command == RELEASE)
- {
- slot.state = IDLE;
- slot.command = NONE;
- slot.t_last_used = ggml_time_us();
-
- LOG_INFO("slot released", {
- {"slot_id", slot.id},
- {"task_id", slot.task_id},
- {"n_ctx", n_ctx},
- {"n_past", slot.n_past},
- {"n_system_tokens", system_tokens.size()},
- {"n_cache_tokens", slot.cache_tokens.size()},
- {"truncated", slot.truncated}
- });
- queue_tasks.notify_slot_changed();
-
- continue;
- }
+ // start populating the batch for this iteration
+ llama_batch_clear(batch);
- if (slot.state == IDLE)
- {
+ // first, add sampled tokens from any ongoing sequences
+ for (auto & slot : slots) {
+ if (slot.state == SLOT_STATE_IDLE) {
continue;
}
@@ -1659,194 +1727,217 @@ struct llama_server_context
// TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow
- llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+ llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+
slot.n_past += 1;
+
+ if (slot.params.cache_prompt) {
+ slot.cache_tokens.push_back(slot.sampled);
+ }
+
+ LOG_VERBOSE("slot decode token", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", n_ctx},
+ {"n_past", slot.n_past},
+ {"n_system_tokens", system_tokens.size()},
+ {"n_cache_tokens", slot.cache_tokens.size()},
+ {"truncated", slot.truncated}
+ });
}
// process in chunks of params.n_batch
- int32_t n_batch = params.n_batch;
+ int32_t n_batch = llama_n_batch(ctx);
+ int32_t n_ubatch = llama_n_ubatch(ctx);
+
+ // next, batch any pending prompts without exceeding n_batch
+ if (params.cont_batching || batch.n_tokens == 0) {
+ for (auto & slot : slots) {
+ // this slot still has a prompt to be processed
+ if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) {
+ auto & prompt_tokens = slot.prompt_tokens;
+
+ // we haven't tokenized the prompt yet - do it now:
+ if (prompt_tokens.empty()) {
+ LOG_VERBOSE("tokenizing prompt", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task}
+ });
- // assign workload to the slots
- if (params.cont_batching || batch.n_tokens == 0)
- {
- for (auto & slot : slots)
- {
- const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
+ slot.t_start_process_prompt = ggml_time_us();
+ slot.t_start_generation = 0;
- // empty prompt passed -> release the slot and send empty response
- // note: infill mode allows empty prompt
- if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill)
- {
- slot.release();
- slot.print_timings();
- send_final_response(slot);
- continue;
- }
+ if (slot.infill) {
+ bool suff_rm_leading_spc = true;
+ if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
+ params.input_suffix.erase(0, 1);
+ suff_rm_leading_spc = false;
+ }
- // need process the prompt
- if (slot.state == IDLE && slot.command == LOAD_PROMPT)
- {
- slot.state = PROCESSING;
- slot.command = NONE;
- std::vector<llama_token> prompt_tokens;
- slot.t_start_process_prompt = ggml_time_us();
- slot.t_start_genereration = 0;
-
- if (slot.infill)
- {
- bool suff_rm_leading_spc = true;
- if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
- {
- params.input_suffix.erase(0, 1);
- suff_rm_leading_spc = false;
+ auto prefix_tokens = tokenize(slot.params.input_prefix, false);
+ auto suffix_tokens = tokenize(slot.params.input_suffix, false);
+
+ const int space_token = 29871; // TODO: this should not be hardcoded
+ if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
+ suffix_tokens.erase(suffix_tokens.begin());
+ }
+
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
+ prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
+ prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
+ prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+ prefix_tokens.push_back(llama_token_middle(model));
+ prompt_tokens = prefix_tokens;
+ } else {
+ prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
}
- auto prefix_tokens = tokenize(slot.params.input_prefix, false);
- auto suffix_tokens = tokenize(slot.params.input_suffix, false);
- const int space_token = 29871; // TODO: this should not be hardcoded
- if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
- suffix_tokens.erase(suffix_tokens.begin());
+ slot.n_past = 0;
+ slot.n_prompt_tokens = prompt_tokens.size();
+
+ LOG_VERBOSE("prompt tokenized", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", slot.n_ctx},
+ {"n_keep", slot.params.n_keep},
+ {"n_prompt_tokens", slot.n_prompt_tokens},
+ {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+ });
+
+ // empty prompt passed -> release the slot and send empty response
+ if (prompt_tokens.empty()) {
+ LOG_INFO("empty prompt - releasing slot", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task}
+ });
+
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
+ slot.release();
+ slot.print_timings();
+ send_final_response(slot);
+ continue;
}
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
- prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
- prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
- prefix_tokens.push_back(llama_token_middle(model));
- prompt_tokens = prefix_tokens;
- }
- else
- {
- prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
- }
+ if (slot.embedding) {
+ // this prompt is too large to process - discard it
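+ // (the prompt must fit within a single physical batch of n_ubatch tokens when computing embeddings)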
+ if (slot.n_prompt_tokens > n_ubatch) {
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
+ slot.release();
+ slot.print_timings();
+ send_final_response(slot);
+ continue;
+ }
+ } else {
+ if (slot.params.n_keep < 0) {
+ slot.params.n_keep = slot.n_prompt_tokens;
+ }
+ slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
- slot.n_prompt_tokens = prompt_tokens.size();
+ // if input prompt is too big, truncate it (if group attention self-extend is disabled)
+ if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
+ const int n_left = slot.n_ctx - slot.params.n_keep;
- if (slot.params.n_keep < 0)
- {
- slot.params.n_keep = slot.n_prompt_tokens;
- }
- slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
-
- // if input prompt is too big, truncate it, if group attention self-extend is disabled
- if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
- {
- const int n_left = slot.n_ctx - slot.params.n_keep;
- const int n_block_size = n_left / 2;
- const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
-
- std::vector<llama_token> new_tokens(
- prompt_tokens.begin(),
- prompt_tokens.begin() + slot.params.n_keep);
- new_tokens.insert(
- new_tokens.end(),
- prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
- prompt_tokens.end());
-
- LOG_VERBOSE("input truncated", {
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_left", n_left},
- {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
- });
- slot.truncated = true;
- prompt_tokens = new_tokens;
+ const int n_block_size = n_left / 2;
+ const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
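+ // truncation keeps the first n_keep tokens, drops erased_blocks blocks of n_block_size tokens right after them, and keeps the rest of the prompt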
- slot.n_prompt_tokens = prompt_tokens.size();
- GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
- }
+ std::vector<llama_token> new_tokens(
+ prompt_tokens.begin(),
+ prompt_tokens.begin() + slot.params.n_keep);
- if (!slot.params.cache_prompt)
- {
- llama_sampling_reset(slot.ctx_sampling);
+ new_tokens.insert(
+ new_tokens.end(),
+ prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+ prompt_tokens.end());
- slot.n_past = 0;
- slot.n_past_se = 0;
- slot.ga_i = 0;
- slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
- }
- else
- {
- // push the prompt into the sampling context (do not apply grammar)
- for (auto &token : prompt_tokens)
- {
- llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
- }
+ prompt_tokens = std::move(new_tokens);
- slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+ slot.truncated = true;
+ slot.n_prompt_tokens = prompt_tokens.size();
- // the last token of the cache is not in the KV cache until the next call to llama_decode
- // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
- if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size())
- {
- slot.n_past -= 1;
- }
+ LOG_VERBOSE("input truncated", {
+ {"id_slot", slot.id},
+ {"id_task", slot.id_task},
+ {"n_ctx", slot.n_ctx},
+ {"n_keep", slot.params.n_keep},
+ {"n_left", n_left},
+ {"n_prompt_tokens", slot.n_prompt_tokens},
+ {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
+ });
- slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
-
- if (slot.ga_n != 1)
- {
- int ga_i = 0;
- int32_t ga_n = slot.ga_n;
- int32_t ga_w = slot.ga_w;
- int32_t slot_npast = 0;
- for (int k = 0; k < slot.n_past; ++k)
- {
- while (slot_npast >= ga_i + ga_w) {
- const int bd = (ga_w/ga_n)*(ga_n - 1);
- slot_npast -= bd;
- ga_i += ga_w/ga_n;
+ GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
+ }
+
+ llama_sampling_reset(slot.ctx_sampling);
+
+ if (!slot.params.cache_prompt) {
+ slot.n_past_se = 0;
+ slot.ga_i = 0;
+ } else {
+ GGML_ASSERT(slot.ga_n == 1);
+
+ // reuse any previously computed tokens that are common with the new prompt
+ slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
+
+ // push the prompt into the sampling context (do not apply grammar)
+ for (int i = 0; i < slot.n_past; ++i) {
+ llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
}
- slot_npast++;
}
- slot.n_past_se = slot_npast;
- slot.ga_i = ga_i;
}
- LOG_INFO("slot progression", {
- { "slot_id", slot.id },
- { "task_id", slot.task_id },
- { "n_past", slot.n_past },
- { "n_past_se", slot.n_past_se },
- { "ga_i", slot.ga_i },
- { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
- });
- }
+ if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
+ // we have to evaluate at least 1 token to generate logits.
+ LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+ { "id_slot", slot.id },
+ { "id_task", slot.id_task }
+ });
- slot.cache_tokens = prompt_tokens;
+ slot.n_past--;
+ if (slot.ga_i > 0) {
+ slot.n_past_se--;
+ }
+ }
- if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
- {
- // we have to evaluate at least 1 token to generate logits.
- LOG_INFO("we have to evaluate at least 1 token to generate logits", {
- { "slot_id", slot.id },
- { "task_id", slot.task_id }
- });
- slot.n_past--;
- if (slot.ga_i > 0)
- {
- slot.n_past_se--;
+ slot.n_prompt_tokens_processed = 0;
+ }
+
+ if (slot.embedding) {
+ // cannot fit the prompt in the current batch - will try next iter
+ if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
+ continue;
}
}
+ // keep only the common part
int p0 = (int) system_tokens.size() + slot.n_past;
- LOG_INFO("kv cache rm [p0, end)", {
- { "slot_id", slot.id },
- { "task_id", slot.task_id },
- { "p0", p0 }
- });
- llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
+ if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+ // could not partially delete (likely using a non-Transformer model)
+ llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+
+ p0 = (int) system_tokens.size();
+ if (p0 != 0) {
+ // copy over the system prompt when there is one
+ llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+ }
- LOG_VERBOSE("prompt ingested", {
- {"n_past", slot.n_past},
- {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
- {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
- });
+ // there is no common part left (except for the system prompt)
+ slot.n_past = 0;
+ slot.n_past_se = 0;
+ slot.ga_i = 0;
+ // TODO: is the system prompt ever in the sampling context?
+ llama_sampling_reset(slot.ctx_sampling);
+ }
- const bool has_images = process_images(slot);
+ // remove the non-common part from the cache
+ slot.cache_tokens.resize(slot.n_past);
- // process the prefix of first image
- std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+ LOG_INFO("kv cache rm [p0, end)", {
+ { "id_slot", slot.id },
+ { "id_task", slot.id_task },
+ { "p0", p0 }
+ });
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
@@ -1854,61 +1945,81 @@ struct llama_server_context
int32_t ga_n = slot.ga_n;
int32_t ga_w = slot.ga_w;
- for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
- {
- if (slot.ga_n != 1)
- {
+ // add prompt tokens for processing in the current batch
+ // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow
+ for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
+ if (slot.ga_n != 1) {
while (slot_npast >= ga_i + ga_w) {
const int bd = (ga_w/ga_n)*(ga_n - 1);
slot_npast -= bd;
ga_i += ga_w/ga_n;
}
}
- llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
+
+ llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+
+ if (slot.params.cache_prompt) {
+ slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
+ }
+
+ slot.n_prompt_tokens_processed++;
slot_npast++;
}
- if (has_images && !ingest_images(slot, n_batch))
- {
- LOG_ERROR("failed processing images", {
- {"slot_id", slot.id},
- {"task_id", slot.task_id},
- });
- // FIXME @phymbert: to be properly tested
- // early returning without changing the slot state will block the slot for ever
- // no one at the moment is checking the return value
- return false;
- }
+ LOG_VERBOSE("prompt processing progress", {
+ {"id_slot", slot.id},
+ {"n_past", slot.n_past},
+ {"n_ctx", n_ctx},
+ {"n_tokens", batch.n_tokens},
+ {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
+ });
+
+ // entire prompt has been processed - start decoding new tokens
+ if (slot.n_past == slot.n_prompt_tokens) {
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
- // extract the logits only for the last token
- if (batch.n_tokens > 0)
- {
+ GGML_ASSERT(batch.n_tokens > 0);
+
+ // extract the logits only for the last token
batch.logits[batch.n_tokens - 1] = true;
+
+ slot.n_decoded = 0;
+ slot.i_batch = batch.n_tokens - 1;
+
+ LOG_VERBOSE("prompt done", {
+ {"id_slot", slot.id},
+ {"n_past", slot.n_past},
+ {"n_ctx", n_ctx},
+ {"n_tokens", batch.n_tokens},
+ });
}
+ }
- slot.n_decoded = 0;
- slot.i_batch = batch.n_tokens - 1;
+ if (batch.n_tokens >= n_batch) {
+ break;
}
}
}
- if (batch.n_tokens == 0)
- {
- all_slots_are_idle = true;
- return true;
+ if (batch.n_tokens == 0) {
+ LOG_VERBOSE("no tokens to decode", {});
+ return;
}
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
- {
+ LOG_VERBOSE("decoding batch", {
+ {"n_tokens", batch.n_tokens},
+ });
+
+ // process the created batch of tokens
+ for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
- for (auto & slot : slots)
- {
- if (slot.ga_n != 1)
- {
+ for (auto & slot : slots) {
+ if (slot.ga_n != 1) {
// context extension via Self-Extend
- while (slot.n_past_se >= slot.ga_i + slot.ga_w)
- {
+ // TODO: simplify and/or abstract this
+ while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
@@ -1918,9 +2029,9 @@ struct llama_server_context
LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
- llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
- llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+ llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+ llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+ llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd;
@@ -1928,12 +2039,12 @@ struct llama_server_context
LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
+
slot.n_past_se += n_tokens;
}
}
- llama_batch batch_view =
- {
+ llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
@@ -1946,13 +2057,17 @@ struct llama_server_context
const int ret = llama_decode(ctx, batch_view);
- if (ret != 0)
- {
- if (n_batch == 1 || ret < 0)
- {
+ if (ret != 0) {
+ if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
- return false;
+ for (auto & slot : slots) {
+ slot.state = SLOT_STATE_PROCESSING;
+ slot.command = SLOT_COMMAND_NONE;
+ slot.release();
+ send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
+ }
+ break; // break loop of n_batch
}
LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
@@ -1960,23 +2075,21 @@ struct llama_server_context
// retry with half the batch size to try to find a free slot in the KV cache
n_batch /= 2;
i -= n_batch;
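+ // note: i -= n_batch cancels the loop's upcoming i += n_batch, so the same chunk is retried with the halved batch size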
- continue;
+
+ continue; // continue loop of n_batch
}
- for (auto & slot : slots)
- {
- if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
- {
- continue;
+ for (auto & slot : slots) {
+ if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) {
+ continue; // continue loop of slots
}
// prompt evaluated for embedding
- if (slot.embedding)
- {
+ if (slot.embedding) {
send_embedding(slot, batch_view);
slot.release();
slot.i_batch = -1;
- continue;
+ continue; // continue loop of slots
}
completion_token_output result;
@@ -1985,10 +2098,9 @@ struct llama_server_context
llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
slot.n_decoded += 1;
- if (slot.n_decoded == 1)
- {
- slot.t_start_genereration = ggml_time_us();
- slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
+ if (slot.n_decoded == 1) {
+ slot.t_start_generation = ggml_time_us();
+ slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
metrics.on_prompt_eval(slot);
}
@@ -1996,19 +2108,19 @@ struct llama_server_context
result.tok = id;
const int32_t n_probs = slot.sparams.n_probs;
- if (slot.sparams.temp <= 0 && n_probs > 0)
- {
+ if (slot.sparams.temp <= 0 && n_probs > 0) {
// for llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &cur_p);
}
- for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
- {
- result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+ for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
+ result.probs.push_back({
+ cur_p.data[i].id,
+ cur_p.data[i].p
+ });
}
- if (!process_token(result, slot))
- {
+ if (!process_token(result, slot)) {
slot.release();
slot.print_timings();
send_final_response(slot);
@@ -2019,25 +2131,22 @@ struct llama_server_context
}
}
- LOG_VERBOSE("slots updated", {});
- return true;
+ LOG_VERBOSE("run slots completed", {});
}
- json model_meta() {
- return json{
- {"vocab_type", llama_vocab_type(model)},
- {"n_vocab", llama_n_vocab(model)},
- {"n_ctx_train", llama_n_ctx_train(model)},
- {"n_embd", llama_n_embd(model)},
- {"n_params", llama_model_n_params(model)},
- {"size", llama_model_size(model)},
+ json model_meta() const {
+ return json {
+ {"vocab_type", llama_vocab_type (model)},
+ {"n_vocab", llama_n_vocab (model)},
+ {"n_ctx_train", llama_n_ctx_train (model)},
+ {"n_embd", llama_n_embd (model)},
+ {"n_params", llama_model_n_params(model)},
+ {"size", llama_model_size (model)},
};
}
};
-static void server_print_usage(const char *argv0, const gpt_params &params,
- const server_params &sparams)
-{
+static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
printf("usage: %s [options]\n", argv0);
printf("\n");
printf("options:\n");
@@ -2055,17 +2164,17 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls}\n");
- printf(" pooling type for embeddings, use model default if unspecified\n");
- printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+ printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n");
+ printf(" -dt N, --defrag-thold N\n");
+ printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
+ printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+ printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
- if (llama_supports_mlock())
- {
+ if (llama_supports_mlock()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
- if (llama_supports_mmap())
- {
+ if (llama_supports_mmap()) {
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
@@ -2093,11 +2202,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
- printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+ printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+ printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n");
+ printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n");
+#endif
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
- printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+ printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
@@ -2106,7 +2219,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" KV cache data type for K (default: f16)\n");
printf(" -ctv TYPE, --cache-type-v TYPE\n");
printf(" KV cache data type for V (default: f16)\n");
- printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-format log output format: json or text (default: json)\n");
printf(" --log-disable disables logging to a file.\n");
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
@@ -2120,61 +2232,46 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
printf(" --chat-template JINJA_TEMPLATE\n");
printf(" set custom jinja chat template (default: template taken from model's metadata)\n");
- printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n");
+ printf(" only commonly used templates are accepted:\n");
+ printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n");
printf("\n");
}
-static void server_params_parse(int argc, char **argv, server_params &sparams,
- gpt_params &params, llama_server_context& llama)
-{
- gpt_params default_params;
+static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) {
+ gpt_params default_params;
server_params default_sparams;
+
std::string arg;
bool invalid_param = false;
- for (int i = 1; i < argc; i++)
- {
+ for (int i = 1; i < argc; i++) {
arg = argv[i];
- if (arg == "--port")
- {
- if (++i >= argc)
- {
+ if (arg == "--port") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.port = std::stoi(argv[i]);
- }
- else if (arg == "--host")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--host") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.hostname = argv[i];
- }
- else if (arg == "--path")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--path") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.public_path = argv[i];
- }
- else if (arg == "--api-key")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--api-key") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
- sparams.api_keys.emplace_back(argv[i]);
- }
- else if (arg == "--api-key-file")
- {
- if (++i >= argc)
- {
+ sparams.api_keys.push_back(argv[i]);
+ } else if (arg == "--api-key-file") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2191,53 +2288,53 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
key_file.close();
+
}
- else if (arg == "--timeout" || arg == "-to")
- {
- if (++i >= argc)
- {
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+ else if (arg == "--ssl-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ sparams.ssl_key_file = argv[i];
+ } else if (arg == "--ssl-cert-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ sparams.ssl_cert_file = argv[i];
+ }
+#endif
+ else if (arg == "--timeout" || arg == "-to") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.read_timeout = std::stoi(argv[i]);
sparams.write_timeout = std::stoi(argv[i]);
- }
- else if (arg == "-m" || arg == "--model")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-m" || arg == "--model") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
- }
- else if (arg == "-a" || arg == "--alias")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-a" || arg == "--alias") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.model_alias = argv[i];
- }
- else if (arg == "-h" || arg == "--help")
- {
+ } else if (arg == "-h" || arg == "--help") {
server_print_usage(argv[0], default_params, default_sparams);
exit(0);
- }
- else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
- }
- else if (arg == "--rope-scaling")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--rope-scaling") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2246,59 +2343,44 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
else { invalid_param = true; break; }
- }
- else if (arg == "--rope-freq-base")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--rope-freq-base") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
- }
- else if (arg == "--rope-freq-scale")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--rope-freq-scale") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
- }
- else if (arg == "--yarn-ext-factor")
- {
+ } else if (arg == "--yarn-ext-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
}
- else if (arg == "--yarn-attn-factor")
- {
+ else if (arg == "--yarn-attn-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
- }
- else if (arg == "--yarn-beta-fast")
- {
+ } else if (arg == "--yarn-beta-fast") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
- }
- else if (arg == "--yarn-beta-slow")
- {
+ } else if (arg == "--yarn-beta-slow") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
- }
- else if (arg == "--pooling")
- {
+ } else if (arg == "--pooling") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -2308,108 +2390,91 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else { invalid_param = true; break; }
- }
- else if (arg == "--threads" || arg == "-t")
- {
+ } else if (arg == "--defrag-thold" || arg == "-dt") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.defrag_thold = std::stof(argv[i]);
+ } else if (arg == "--threads" || arg == "-t") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
- }
- else if (arg == "--grp-attn-n" || arg == "-gan")
- {
+ } else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.grp_attn_n = std::stoi(argv[i]);
- }
- else if (arg == "--grp-attn-w" || arg == "-gaw")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--grp-attn-w" || arg == "-gaw") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.grp_attn_w = std::stoi(argv[i]);
- }
- else if (arg == "--threads-batch" || arg == "-tb")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--threads-batch" || arg == "-tb") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
- }
- else if (arg == "--threads-http")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--threads-http") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
sparams.n_threads_http = std::stoi(argv[i]);
- }
- else if (arg == "-b" || arg == "--batch-size")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-b" || arg == "--batch-size") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
- }
- else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-ub" || arg == "--ubatch-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_ubatch = std::stoi(argv[i]);
+ } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
if (llama_supports_gpu_offload()) {
params.n_gpu_layers = std::stoi(argv[i]);
} else {
- LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
- "See main README.md for information on enabling GPU BLAS support",
- {{"n_gpu_layers", params.n_gpu_layers}});
+ LOG_WARNING(
+ "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
+ "See main README.md for information on enabling GPU BLAS support",
+ {{"n_gpu_layers", params.n_gpu_layers}});
}
- }
- else if (arg == "--split-mode" || arg == "-sm")
- {
+ } else if (arg == "--split-mode" || arg == "-sm") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string arg_next = argv[i];
- if (arg_next == "none")
- {
+ if (arg_next == "none") {
params.split_mode = LLAMA_SPLIT_MODE_NONE;
- }
- else if (arg_next == "layer")
- {
+ } else if (arg_next == "layer") {
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
- }
- else if (arg_next == "row")
- {
+ } else if (arg_next == "row") {
params.split_mode = LLAMA_SPLIT_MODE_ROW;
- }
- else {
+ } else {
invalid_param = true;
break;
}
#ifndef GGML_USE_CUBLAS
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS
- }
- else if (arg == "--tensor-split" || arg == "-ts")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--tensor-split" || arg == "-ts") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2422,25 +2487,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= llama_max_devices());
- for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
- {
- if (i_device < split_arg.size())
- {
+ for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) {
+ if (i_device < split_arg.size()) {
params.tensor_split[i_device] = std::stof(split_arg[i_device]);
- }
- else
- {
+ } else {
params.tensor_split[i_device] = 0.0f;
}
}
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
#endif // GGML_USE_CUBLAS
- }
- else if (arg == "--main-gpu" || arg == "-mg")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--main-gpu" || arg == "-mg") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2449,98 +2507,70 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
#else
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
#endif
- }
- else if (arg == "--lora")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--lora") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false;
- }
- else if (arg == "--lora-scaled")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--lora-scaled") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
const char * lora_adapter = argv[i];
- if (++i >= argc)
- {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false;
- }
- else if (arg == "--lora-base")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--lora-base") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
- }
- else if (arg == "-v" || arg == "--verbose")
- {
+ } else if (arg == "-v" || arg == "--verbose") {
#if SERVER_VERBOSE != 1
LOG_WARNING("server.cpp is not built with verbose logging.", {});
#else
server_verbose = true;
#endif
- }
- else if (arg == "--mlock")
- {
+ } else if (arg == "--mlock") {
params.use_mlock = true;
- }
- else if (arg == "--no-mmap")
- {
+ } else if (arg == "--no-mmap") {
params.use_mmap = false;
- }
- else if (arg == "--numa") {
+ } else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
- else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
- else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+ else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+ else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
- }
- else if (arg == "--embedding")
- {
+ } else if (arg == "--embedding" || arg == "--embeddings") {
params.embedding = true;
- }
- else if (arg == "-cb" || arg == "--cont-batching")
- {
+ } else if (arg == "-cb" || arg == "--cont-batching") {
params.cont_batching = true;
- }
- else if (arg == "-np" || arg == "--parallel")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-np" || arg == "--parallel") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_parallel = std::stoi(argv[i]);
- } else if (arg == "-n" || arg == "--n-predict")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-n" || arg == "--n-predict") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
- } else if (arg == "-spf" || arg == "--system-prompt-file")
- {
- if (++i >= argc)
- {
+ } else if (arg == "-spf" || arg == "--system-prompt-file") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2550,67 +2580,39 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
- std::string systm_content;
+ std::string system_prompt;
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
- std::back_inserter(systm_content)
+ std::back_inserter(system_prompt)
);
- llama.system_prompt_process(json::parse(systm_content));
- }
- else if (arg == "-ctk" || arg == "--cache-type-k") {
+ sparams.system_prompt = system_prompt;
+ } else if (arg == "-ctk" || arg == "--cache-type-k") {
params.cache_type_k = argv[++i];
- }
- else if (arg == "-ctv" || arg == "--cache-type-v") {
+ } else if (arg == "-ctv" || arg == "--cache-type-v") {
params.cache_type_v = argv[++i];
- }
- else if(arg == "--mmproj")
- {
- if (++i >= argc)
- {
- invalid_param = true;
- break;
- }
- params.mmproj = argv[i];
- }
- else if (arg == "--log-format")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--log-format") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
- if (std::strcmp(argv[i], "json") == 0)
- {
+ if (std::strcmp(argv[i], "json") == 0) {
server_log_json = true;
- }
- else if (std::strcmp(argv[i], "text") == 0)
- {
+ } else if (std::strcmp(argv[i], "text") == 0) {
server_log_json = false;
- }
- else
- {
+ } else {
invalid_param = true;
break;
}
- }
- else if (arg == "--log-disable")
- {
+ } else if (arg == "--log-disable") {
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
- }
- else if (arg == "--slots-endpoint-disable")
- {
+ } else if (arg == "--slots-endpoint-disable") {
sparams.slots_endpoint = false;
- }
- else if (arg == "--metrics")
- {
+ } else if (arg == "--metrics") {
sparams.metrics_endpoint = true;
- }
- else if (arg == "--chat-template")
- {
- if (++i >= argc)
- {
+ } else if (arg == "--chat-template") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
@@ -2621,9 +2623,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
sparams.chat_template = argv[i];
- }
- else if (arg == "--override-kv")
- {
+ } else if (arg == "--override-kv") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -2634,6 +2634,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
+
struct llama_model_kv_override kvo;
std::strncpy(kvo.key, argv[i], sep - argv[i]);
kvo.key[sep - argv[i]] = 0;
@@ -2664,67 +2665,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.kv_overrides.push_back(kvo);
- }
- else
- {
+ } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}
+
if (!params.kv_overrides.empty()) {
params.kv_overrides.emplace_back();
params.kv_overrides.back().key[0] = 0;
}
- if (invalid_param)
- {
+ if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}
-/* llama.cpp completion api semantics */
-static json format_partial_response(
- llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
-) {
- json res = json
- {
- {"content", content },
- {"stop", false},
- {"slot_id", slot->id },
- {"multimodal", llama.multimodal }
- };
-
- if (slot->sparams.n_probs > 0)
- {
- res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
- }
-
- return res;
-}
-
-static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-{
- return json {
- {"tokens", tokens}
- };
-}
-
-static json format_detokenized_response(std::string content)
-{
- return json {
- {"content", content}
- };
-}
-
-
-static void log_server_request(const httplib::Request &req, const httplib::Response &res)
-{
+static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
// skip GH copilot requests when using default port
- if (req.path == "/v1/health" || req.path == "/v1/completions")
- {
+ if (req.path == "/v1/health" || req.path == "/v1/completions") {
return;
}
@@ -2743,24 +2705,9 @@ static void log_server_request(const httplib::Request &req, const httplib::Response &res)
});
}
-static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot)
-{
- auto & gtps = slot->generated_token_probs;
- auto translator = token_translator{llama.ctx};
- auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
- const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
- if (slot->generated_text.capacity() < slot->generated_text.size() + len)
- {
- slot->generated_text.reserve(slot->generated_text.size() + len);
- }
- for (const completion_token_output & cto : gtps)
- {
- slot->generated_text += translator(cto);
- }
-}
-
std::function<void(int)> shutdown_handler;
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+
inline void signal_handler(int signal) {
if (is_terminating.test_and_set()) {
// in case it hangs, we can force terminate the server by hitting Ctrl+C twice
@@ -2768,294 +2715,196 @@ inline void signal_handler(int signal) {
fprintf(stderr, "Received second interrupt, terminating immediately.\n");
exit(1);
}
+
shutdown_handler(signal);
}
-int main(int argc, char **argv)
-{
+int main(int argc, char ** argv) {
#if SERVER_VERBOSE != 1
log_disable();
#endif
// own arguments required by this example
- gpt_params params;
+ gpt_params params;
server_params sparams;
// struct that contains llama context and inference
- llama_server_context llama;
+ server_context ctx_server;
- server_params_parse(argc, argv, sparams, params, llama);
+ server_params_parse(argc, argv, sparams, params);
- if (params.model_alias == "unknown")
- {
+ if (!sparams.system_prompt.empty()) {
+ ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
+ }
+
+ if (params.model_alias == "unknown") {
params.model_alias = params.model;
}
llama_backend_init();
llama_numa_init(params.numa);
- LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
- {"commit", LLAMA_COMMIT}});
+ LOG_INFO("build info", {
+ {"build", LLAMA_BUILD_NUMBER},
+ {"commit", LLAMA_COMMIT}
+ });
LOG_INFO("system info", {
- {"n_threads", params.n_threads},
- {"n_threads_batch", params.n_threads_batch},
- {"total_threads", std::thread::hardware_concurrency()},
- {"system_info", llama_print_system_info()},
- });
+ {"n_threads", params.n_threads},
+ {"n_threads_batch", params.n_threads_batch},
+ {"total_threads", std::thread::hardware_concurrency()},
+ {"system_info", llama_print_system_info()},
+ });
- httplib::Server svr;
+ std::unique_ptr<httplib::Server> svr;
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+ if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") {
+ LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}});
+ svr.reset(
+ new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str())
+ );
+ } else {
+ LOG_INFO("Running without SSL", {});
+ svr.reset(new httplib::Server());
+ }
+#else
+ svr.reset(new httplib::Server());
+#endif
std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
- svr.set_default_headers({{"Server", "llama.cpp"}});
+ svr->set_default_headers({{"Server", "llama.cpp"}});
// CORS preflight
- svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res.set_header("Access-Control-Allow-Credentials", "true");
- res.set_header("Access-Control-Allow-Methods", "POST");
- res.set_header("Access-Control-Allow-Headers", "*");
+ res.set_header("Access-Control-Allow-Methods", "POST");
+ res.set_header("Access-Control-Allow-Headers", "*");
+ return res.set_content("", "application/json; charset=utf-8");
});
- svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
- server_state current_state = state.load();
- switch(current_state) {
- case SERVER_STATE_READY: {
- // request slots data using task queue
- task_server task;
- task.id = llama.queue_tasks.get_new_id();
- task.type = TASK_TYPE_METRICS;
- task.target_id = -1;
-
- llama.queue_results.add_waiting_task_id(task.id);
- llama.queue_tasks.post(task);
+ svr->set_logger(log_server_request);
- // get the result
- task_result result = llama.queue_results.recv(task.id);
- llama.queue_results.remove_waiting_task_id(task.id);
+ auto res_error = [](httplib::Response & res, json error_data) {
+ json final_response {{"error", error_data}};
+ res.set_content(final_response.dump(), "application/json; charset=utf-8");
+ res.status = json_value(error_data, "code", 500);
+ };
- int n_idle_slots = result.result_json["idle"];
- int n_processing_slots = result.result_json["processing"];
+ svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
+ std::string message;
+ try {
+ std::rethrow_exception(std::move(ep));
+ } catch (std::exception & e) {
+ message = e.what();
+ } catch (...) {
+ message = "Unknown Exception";
+ }
- json health = {
- {"status", "ok"},
- {"slots_idle", n_idle_slots},
- {"slots_processing", n_processing_slots}};
- res.status = 200; // HTTP OK
- if (sparams.slots_endpoint && req.has_param("include_slots")) {
- health["slots"] = result.result_json["slots"];
- }
+ json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+ LOG_VERBOSE("Got exception", formatted_error);
+ res_error(res, formatted_error);
+ });
- if (n_idle_slots == 0) {
- health["status"] = "no slot available";
- if (req.has_param("fail_on_no_slot")) {
- res.status = 503; // HTTP Service Unavailable
- }
- }
- res.set_content(health.dump(), "application/json");
- break;
- }
- case SERVER_STATE_LOADING_MODEL:
- res.set_content(R"({"status": "loading model"})", "application/json");
- res.status = 503; // HTTP Service Unavailable
- break;
- case SERVER_STATE_ERROR:
- res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json");
- res.status = 500; // HTTP Internal Server Error
- break;
+ svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) {
+ if (res.status == 404) {
+ res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND));
}
+ // for other error codes, we skip processing here because it's already done by res_error()
});
- if (sparams.slots_endpoint) {
- svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
- // request slots data using task queue
- task_server task;
- task.id = llama.queue_tasks.get_new_id();
- task.type = TASK_TYPE_METRICS;
- task.target_id = -1;
-
- llama.queue_results.add_waiting_task_id(task.id);
- llama.queue_tasks.post(task);
-
- // get the result
- task_result result = llama.queue_results.recv(task.id);
- llama.queue_results.remove_waiting_task_id(task.id);
-
- res.set_content(result.result_json["slots"].dump(), "application/json");
- res.status = 200; // HTTP OK
- });
- }
-
- if (sparams.metrics_endpoint) {
- svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) {
- // request slots data using task queue
- task_server task;
- task.id = llama.queue_tasks.get_new_id();
- task.type = TASK_TYPE_METRICS;
- task.target_id = -1;
-
- llama.queue_results.add_waiting_task_id(task.id);
- llama.queue_tasks.post(task);
-
- // get the result
- task_result result = llama.queue_results.recv(task.id);
- llama.queue_results.remove_waiting_task_id(task.id);
-
- json data = result.result_json;
-
- uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
- uint64_t t_prompt_processing = data["t_prompt_processing"];
-
- uint64_t n_tokens_predicted = data["n_tokens_predicted"];
- uint64_t t_tokens_generation = data["t_tokens_generation"];
-
- int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
-
- // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
- json all_metrics_def = json {
- {"counter", {{
- {"name", "prompt_tokens_total"},
- {"help", "Number of prompt tokens processed."},
- {"value", data["n_prompt_tokens_processed_total"]}
- }, {
- {"name", "tokens_predicted_total"},
- {"help", "Number of generation tokens processed."},
- {"value", data["n_tokens_predicted_total"]}
- }}},
- {"gauge", {{
- {"name", "prompt_tokens_seconds"},
- {"help", "Average prompt throughput in tokens/s."},
- {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0}
- },{
- {"name", "predicted_tokens_seconds"},
- {"help", "Average generation throughput in tokens/s."},
- {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0}
- },{
- {"name", "kv_cache_usage_ratio"},
- {"help", "KV-cache usage. 1 means 100 percent usage."},
- {"value", 1. * kv_cache_used_cells / params.n_ctx}
- },{
- {"name", "kv_cache_tokens"},
- {"help", "KV-cache tokens."},
- {"value", data["kv_cache_tokens_count"]}
- },{
- {"name", "requests_processing"},
- {"help", "Number of request processing."},
- {"value", data["processing"]}
- },{
- {"name", "requests_deferred"},
- {"help", "Number of request deferred."},
- {"value", data["deferred"]}
- }}}
- };
-
- std::stringstream prometheus;
- for (const auto& el : all_metrics_def.items()) {
- const auto& type = el.key();
- const auto& metrics_def = el.value();
- for (const auto& metric_def : metrics_def) {
- std::string name = metric_def["name"];
- std::string help = metric_def["help"];
- auto value = json_value(metric_def, "value", 0);
- prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
- << "# TYPE llamacpp:" << name << " " << type << "\n"
- << "llamacpp:" << name << " " << value << "\n";
- }
- }
-
- res.set_content(prometheus.str(), "text/plain; version=0.0.4");
- res.status = 200; // HTTP OK
- });
- }
-
- svr.set_logger(log_server_request);
-
- svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
- {
- const char fmt[] = "500 Internal Server Error\n%s";
- char buf[BUFSIZ];
- try
- {
- std::rethrow_exception(std::move(ep));
- }
- catch (std::exception &e)
- {
- snprintf(buf, sizeof(buf), fmt, e.what());
- }
- catch (...)
- {
- snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
- }
- res.set_content(buf, "text/plain; charset=utf-8");
- res.status = 500;
- });
-
- svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
- {
- if (res.status == 401)
- {
- res.set_content("Unauthorized", "text/plain; charset=utf-8");
- }
- if (res.status == 400)
- {
- res.set_content("Invalid request", "text/plain; charset=utf-8");
- }
- else if (res.status == 404)
- {
- res.set_content("File Not Found", "text/plain; charset=utf-8");
- res.status = 404;
- }
- });
-
// set timeouts and change hostname and port
- svr.set_read_timeout (sparams.read_timeout);
- svr.set_write_timeout(sparams.write_timeout);
+ svr->set_read_timeout (sparams.read_timeout);
+ svr->set_write_timeout(sparams.write_timeout);
- if (!svr.bind_to_port(sparams.hostname, sparams.port))
- {
+ if (!svr->bind_to_port(sparams.hostname, sparams.port)) {
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port);
return 1;
}
- // Set the base directory for serving static files
- svr.set_base_dir(sparams.public_path);
-
std::unordered_map<std::string, std::string> log_data;
+
log_data["hostname"] = sparams.hostname;
- log_data["port"] = std::to_string(sparams.port);
+ log_data["port"] = std::to_string(sparams.port);
if (sparams.api_keys.size() == 1) {
- log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4);
+ auto key = sparams.api_keys[0];
+ log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0));
} else if (sparams.api_keys.size() > 1) {
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
}
// load the model
- if (!llama.load_model(params))
- {
+ if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR);
return 1;
} else {
- llama.initialize();
+ ctx_server.init();
state.store(SERVER_STATE_READY);
- LOG_INFO("model loaded", {});
}
- const auto model_meta = llama.model_meta();
- if (sparams.chat_template.empty()) { // custom chat template is not supplied
- // check if the template comes with the model is supported by us
- llama.validate_model_chat_template(sparams);
+ LOG_INFO("model loaded", {});
+
+ const auto model_meta = ctx_server.model_meta();
+
+ // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
+ if (sparams.chat_template.empty()) {
+ if (!ctx_server.validate_model_chat_template()) {
+ LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+ sparams.chat_template = "chatml";
+ }
}
- // Middleware for API key validation
- auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
+ // print sample chat example to make it clear which template is used
+ {
+ json chat;
+ chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
+ chat.push_back({{"role", "user"}, {"content", "Hello"}});
+ chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
+ chat.push_back({{"role", "user"}, {"content", "How are you?"}});
+
+ const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
+
+ LOG_INFO("chat template", {
+ {"chat_example", chat_example},
+ {"built_in", sparams.chat_template.empty()},
+ });
+ }
+
+ //
+ // Middlewares
+ //
+
+ auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
+ static const std::set<std::string> protected_endpoints = {
+ "/props",
+ "/completion",
+ "/completions",
+ "/v1/completions",
+ "/chat/completions",
+ "/v1/chat/completions",
+ "/infill",
+ "/tokenize",
+ "/detokenize",
+ "/embedding",
+ "/embeddings",
+ "/v1/embeddings",
+ };
+
// If API key is not set, skip validation
if (sparams.api_keys.empty()) {
return true;
}
+ // If path is not in protected_endpoints list, skip validation
+ if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+ return true;
+ }
+
// Check for API key in the header
auto auth_header = req.get_header_value("Authorization");
+
std::string prefix = "Bearer ";
if (auth_header.substr(0, prefix.size()) == prefix) {
std::string received_api_key = auth_header.substr(prefix.size());
@@ -3065,188 +2914,341 @@ int main(int argc, char **argv)
}
// API key is invalid or not provided
- res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
- res.status = 401; // Unauthorized
+ // TODO: make another middleware for CORS related logic
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
LOG_WARNING("Unauthorized: Invalid API Key", {});
return false;
};
- // this is only called if no index.html is found in the public --path
- svr.Get("/", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
- return false;
- });
+ // register server middlewares
+ svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) {
+ if (!middleware_validate_api_key(req, res)) {
+ return httplib::Server::HandlerResponse::Handled;
+ }
+ return httplib::Server::HandlerResponse::Unhandled;
+ });
- // this is only called if no index.js is found in the public --path
- svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&index_js), index_js_len, "text/javascript; charset=utf-8");
- return false;
- });
+ //
+ // Route handlers (or controllers)
+ //
- // this is only called if no index.html is found in the public --path
- svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
- return false;
- });
+ const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
+ server_state current_state = state.load();
+ switch (current_state) {
+ case SERVER_STATE_READY:
+ {
+ // request slots data using task queue
+ server_task task;
+ task.id = ctx_server.queue_tasks.get_new_id();
+ task.type = SERVER_TASK_TYPE_METRICS;
+ task.id_target = -1;
- // this is only called if no index.html is found in the public --path
- svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
- {
- res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
- return false;
- });
+ ctx_server.queue_results.add_waiting_task_id(task.id);
+ ctx_server.queue_tasks.post(task);
- svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- json data = {
- { "user_name", llama.name_user.c_str() },
- { "assistant_name", llama.name_assistant.c_str() },
- { "default_generation_settings", llama.default_generation_settings_for_props },
- { "total_slots", llama.params.n_parallel }
- };
- res.set_content(data.dump(), "application/json; charset=utf-8");
- });
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(task.id);
+ ctx_server.queue_results.remove_waiting_task_id(task.id);
- svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- if (!validate_api_key(req, res)) {
- return;
- }
- json data = json::parse(req.body);
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, data, false, false, -1);
- if (!json_value(data, "stream", false)) {
- std::string completion_text;
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error && result.stop) {
- res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ const int n_idle_slots = result.data["idle"];
+ const int n_processing_slots = result.data["processing"];
+
+ json health = {
+ {"status", "ok"},
+ {"slots_idle", n_idle_slots},
+ {"slots_processing", n_processing_slots}
+ };
+
+ res.status = 200; // HTTP OK
+ if (sparams.slots_endpoint && req.has_param("include_slots")) {
+ health["slots"] = result.data["slots"];
}
- else
- {
- res.status = 404;
- res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
+
+ if (n_idle_slots == 0) {
+ health["status"] = "no slot available";
+ if (req.has_param("fail_on_no_slot")) {
+ res.status = 503; // HTTP Service Unavailable
+ }
}
- llama.queue_results.remove_waiting_task_id(task_id);
- } else {
- const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink)
- {
- while (true)
- {
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error) {
- const std::string str =
- "data: " +
- result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
- if (!sink.write(str.c_str(), str.size()))
- {
- llama.queue_results.remove_waiting_task_id(task_id);
- return false;
- }
- if (result.stop) {
- break;
- }
- } else {
- const std::string str =
- "error: " +
- result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
- if (!sink.write(str.c_str(), str.size()))
- {
- llama.queue_results.remove_waiting_task_id(task_id);
- return false;
- }
- break;
- }
+
+ res.set_content(health.dump(), "application/json");
+ break;
+ }
+ case SERVER_STATE_LOADING_MODEL:
+ {
+ res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+ } break;
+ case SERVER_STATE_ERROR:
+ {
+ res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
+ } break;
+ }
+ };
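+
+ // illustrative /health response while the model is ready (values are examples only):
+ //   {"status": "ok", "slots_idle": 1, "slots_processing": 0}
+ // ?include_slots embeds the per-slot state under "slots" when the slots endpoint is enabled,
+ // and ?fail_on_no_slot makes a fully busy server answer 503 instead of 200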
+
+ const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
+ if (!sparams.slots_endpoint) {
+ res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
+ return;
+ }
+
+ // request slots data using task queue
+ server_task task;
+ task.id = ctx_server.queue_tasks.get_new_id();
+ task.id_multi = -1;
+ task.id_target = -1;
+ task.type = SERVER_TASK_TYPE_METRICS;
+
+ ctx_server.queue_results.add_waiting_task_id(task.id);
+ ctx_server.queue_tasks.post(task);
+
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(task.id);
+ ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+ res.set_content(result.data["slots"].dump(), "application/json");
+ res.status = 200; // HTTP OK
+ };
+
+ const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
+ if (!sparams.metrics_endpoint) {
+ res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
+ return;
+ }
+
+ // request slots data using task queue
+ server_task task;
+ task.id = ctx_server.queue_tasks.get_new_id();
+ task.id_multi = -1;
+ task.id_target = -1;
+ task.type = SERVER_TASK_TYPE_METRICS;
+ task.data.push_back({{"reset_bucket", true}});
+
+ ctx_server.queue_results.add_waiting_task_id(task.id);
+ ctx_server.queue_tasks.post(task);
+
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(task.id);
+ ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+ json data = result.data;
+
+ const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
+ const uint64_t t_prompt_processing = data["t_prompt_processing"];
+
+ const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
+ const uint64_t t_tokens_generation = data["t_tokens_generation"];
+
+ const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
+
+ // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
+ json all_metrics_def = json {
+ {"counter", {{
+ {"name", "prompt_tokens_total"},
+ {"help", "Number of prompt tokens processed."},
+ {"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
+ }, {
+ {"name", "prompt_seconds_total"},
+ {"help", "Prompt process time"},
+ {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
+ }, {
+ {"name", "tokens_predicted_total"},
+ {"help", "Number of generation tokens processed."},
+ {"value", (uint64_t) data["n_tokens_predicted_total"]}
+ }, {
+ {"name", "tokens_predicted_seconds_total"},
+ {"help", "Predict process time"},
+ {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
+ }}},
+ {"gauge", {{
+ {"name", "prompt_tokens_seconds"},
+ {"help", "Average prompt throughput in tokens/s."},
+ {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.}
+ },{
+ {"name", "predicted_tokens_seconds"},
+ {"help", "Average generation throughput in tokens/s."},
+ {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.}
+ },{
+ {"name", "kv_cache_usage_ratio"},
+ {"help", "KV-cache usage. 1 means 100 percent usage."},
+ {"value", 1. * kv_cache_used_cells / params.n_ctx}
+ },{
+ {"name", "kv_cache_tokens"},
+ {"help", "KV-cache tokens."},
+ {"value", (uint64_t) data["kv_cache_tokens_count"]}
+ },{
+ {"name", "requests_processing"},
+ {"help", "Number of request processing."},
+ {"value", (uint64_t) data["processing"]}
+ },{
+ {"name", "requests_deferred"},
+ {"help", "Number of request deferred."},
+ {"value", (uint64_t) data["deferred"]}
+ }}}
+ };
+
+ std::stringstream prometheus;
+
+ for (const auto & el : all_metrics_def.items()) {
+ const auto & type = el.key();
+ const auto & metrics_def = el.value();
+
+ for (const auto & metric_def : metrics_def) {
+ const std::string name = metric_def["name"];
+ const std::string help = metric_def["help"];
+
+ auto value = json_value(metric_def, "value", 0.);
+ prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
+ << "# TYPE llamacpp:" << name << " " << type << "\n"
+ << "llamacpp:" << name << " " << value << "\n";
+ }
+ }
+
+ const int64_t t_start = data["t_start"];
+ res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
+
+ res.set_content(prometheus.str(), "text/plain; version=0.0.4");
+ res.status = 200; // HTTP OK
+ };
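+
+ // the exposition text produced above looks like the following (value is an example only):
+ //   # HELP llamacpp:prompt_tokens_total Number of prompt tokens processed.
+ //   # TYPE llamacpp:prompt_tokens_total counter
+ //   llamacpp:prompt_tokens_total 1024
+ // plus a Process-Start-Time-Unix response header carrying the server start time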
+
+ const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ json data = {
+ { "user_name", ctx_server.name_user.c_str() },
+ { "assistant_name", ctx_server.name_assistant.c_str() },
+ { "default_generation_settings", ctx_server.default_generation_settings_for_props },
+ { "total_slots", ctx_server.params.n_parallel }
+ };
+
+ res.set_content(data.dump(), "application/json; charset=utf-8");
+ };
+
+ const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+ json data = json::parse(req.body);
+
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, data, false, false);
+
+ if (!json_value(data, "stream", false)) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error && result.stop) {
+ res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ } else {
+ res_error(res, result.data);
+ }
+
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ } else {
+ const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
+ while (true) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error) {
+ const std::string str =
+ "data: " +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
+ "\n\n";
+
+ LOG_VERBOSE("data stream", {
+ { "to_send", str }
+ });
+
+ if (!sink.write(str.c_str(), str.size())) {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ return false;
}
- llama.queue_results.remove_waiting_task_id(task_id);
- sink.done();
- return true;
- };
+ if (result.stop) {
+ break;
+ }
+ } else {
+ const std::string str =
+ "error: " +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
+ "\n\n";
- auto on_complete = [task_id, &llama] (bool)
- {
- // cancel
- llama.request_cancel(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
- };
+ LOG_VERBOSE("data stream", {
+ { "to_send", str }
+ });
- res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+ if (!sink.write(str.c_str(), str.size())) {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ return false;
+ }
+
+ break;
+ }
}
- });
- svr.Get("/v1/models", [¶ms, &model_meta](const httplib::Request& req, httplib::Response& res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- std::time_t t = std::time(0);
-
- json models = {
- {"object", "list"},
- {"data", {
- {
- {"id", params.model_alias},
- {"object", "model"},
- {"created", t},
- {"owned_by", "llamacpp"},
- {"meta", model_meta}
- },
- }}
- };
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ sink.done();
- res.set_content(models.dump(), "application/json; charset=utf-8");
- });
+ return true;
+ };
- const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- if (!validate_api_key(req, res)) {
- return;
+ auto on_complete = [id_task, &ctx_server] (bool) {
+ // cancel
+ ctx_server.request_cancel(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ };
+
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
}
- json data = oaicompat_completion_params_parse(llama.model, json::parse(req.body), sparams.chat_template);
+ };
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, data, false, false, -1);
+ const auto handle_models = [&params, &model_meta](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+ json models = {
+ {"object", "list"},
+ {"data", {
+ {
+ {"id", params.model_alias},
+ {"object", "model"},
+ {"created", std::time(0)},
+ {"owned_by", "llamacpp"},
+ {"meta", model_meta}
+ },
+ }}
+ };
+ res.set_content(models.dump(), "application/json; charset=utf-8");
+ };
+
+ const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
+
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, data, false, false);
+
+ const auto completion_id = gen_chatcmplid();
if (!json_value(data, "stream", false)) {
- std::string completion_text;
- task_result result = llama.queue_results.recv(task_id);
+ server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error && result.stop) {
- json oaicompat_result = format_final_response_oaicompat(data, result);
+ json result_oai = format_final_response_oaicompat(data, result.data, completion_id);
- res.set_content(oaicompat_result.dump(-1, ' ', false,
- json::error_handler_t::replace),
- "application/json; charset=utf-8");
+ res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} else {
- res.status = 500;
- res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
+ res_error(res, result.data);
}
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
} else {
- const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
+ const auto chunked_content_provider = [id_task, &ctx_server, completion_id](size_t, httplib::DataSink & sink) {
while (true) {
- task_result llama_result = llama.queue_results.recv(task_id);
- if (!llama_result.error) {
- std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error) {
+ std::vector<json> result_array = format_partial_response_oaicompat(result.data, completion_id);
- for (auto it = result_array.begin(); it != result_array.end(); ++it)
- {
+ for (auto it = result_array.begin(); it != result_array.end(); ++it) {
if (!it->empty()) {
const std::string str =
"data: " +
@@ -3254,288 +3256,264 @@ int main(int argc, char **argv)
"\n\n";
LOG_VERBOSE("data stream", {{"to_send", str}});
if (!sink.write(str.c_str(), str.size())) {
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
return false;
}
}
}
- if (llama_result.stop) {
+ if (result.stop) {
break;
}
} else {
const std::string str =
"error: " +
- llama_result.result_json.dump(-1, ' ', false,
- json::error_handler_t::replace) +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {{"to_send", str}});
if (!sink.write(str.c_str(), str.size())) {
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
return false;
}
break;
}
}
sink.done();
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
return true;
};
- auto on_complete = [task_id, &llama](bool) {
+ auto on_complete = [id_task, &ctx_server](bool) {
// cancel request
- llama.request_cancel(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
+ ctx_server.request_cancel(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
};
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
}
};
- svr.Post("/chat/completions", chat_completions);
- svr.Post("/v1/chat/completions", chat_completions);
-
- svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- if (!validate_api_key(req, res)) {
- return;
- }
- json data = json::parse(req.body);
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, data, true, false, -1);
- if (!json_value(data, "stream", false)) {
- std::string completion_text;
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error && result.stop)
- {
- res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
- }
- else
- {
- res.status = 404;
- res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
- }
- llama.queue_results.remove_waiting_task_id(task_id);
- } else {
- const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink & sink) {
- while (true)
- {
- task_result result = llama.queue_results.recv(task_id);
- if (!result.error) {
- const std::string str =
- "data: " +
- result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
- if (!sink.write(str.c_str(), str.size()))
- {
- llama.queue_results.remove_waiting_task_id(task_id);
- return false;
- }
- if (result.stop)
- {
- break;
- }
- }
- else
- {
- break;
- }
- }
+ const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- llama.queue_results.remove_waiting_task_id(task_id);
- sink.done();
- return true;
- };
+ json data = json::parse(req.body);
- auto on_complete = [task_id, &llama] (bool)
- {
- // cancel
- llama.request_cancel(task_id);
- };
+ const int id_task = ctx_server.queue_tasks.get_new_id();
- res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
- }
- });
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, data, true, false);
- svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
- { return res.set_content("", "application/json; charset=utf-8"); });
+ if (!json_value(data, "stream", false)) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error && result.stop) {
+ res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
+ } else {
+ res_error(res, result.data);
+ }
- svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
- std::vector<llama_token> tokens;
- if (body.count("content") != 0)
- {
- tokens = llama.tokenize(body["content"], false);
- }
- const json data = format_tokenizer_response(tokens);
- return res.set_content(data.dump(), "application/json; charset=utf-8");
- });
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ } else {
+ const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
+ while (true) {
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ if (!result.error) {
+ const std::string str =
+ "data: " +
+ result.data.dump(-1, ' ', false, json::error_handler_t::replace) +
+ "\n\n";
- svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
- std::string content;
- if (body.count("tokens") != 0)
- {
- const std::vector<llama_token> tokens = body["tokens"];
- content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
- }
+ LOG_VERBOSE("data stream", {
+ { "to_send", str }
+ });
- const json data = format_detokenized_response(content);
- return res.set_content(data.dump(), "application/json; charset=utf-8");
- });
+ if (!sink.write(str.c_str(), str.size())) {
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ return false;
+ }
- svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
- json prompt;
- if (body.count("content") != 0)
- {
- prompt = body["content"];
- }
- else
- {
- prompt = "";
+ if (result.stop) {
+ break;
+ }
+ } else {
+ break;
+ }
}
- json image_data;
- if (body.count("image_data") != 0) {
- image_data = body["image_data"];
- }
- else
- {
- image_data = "";
- }
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ sink.done();
- // create and queue the task
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
+ return true;
+ };
- // get the result
- task_result result = llama.queue_results.recv(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
+ auto on_complete = [id_task, &ctx_server] (bool) {
+ ctx_server.request_cancel(id_task);
+ };
- // send the result
- return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
- });
+ res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+ }
+ };
- svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
- {
- res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
- const json body = json::parse(req.body);
+ const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ const json body = json::parse(req.body);
- json prompt;
- if (body.count("input") != 0)
- {
- prompt = body["input"];
- // batch
- if(prompt.is_array()) {
- json data = json::array();
- int i = 0;
- for (const json &elem : prompt) {
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, { {"prompt", elem}, { "n_predict", 0} }, false, true, -1);
-
- // get the result
- task_result result = llama.queue_results.recv(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
-
- json embedding = json{
- {"embedding", json_value(result.result_json, "embedding", json::array())},
- {"index", i++},
- {"object", "embedding"}
- };
- data.push_back(embedding);
- }
- json result = format_embeddings_response_oaicompat(body, data);
- return res.set_content(result.dump(), "application/json; charset=utf-8");
- }
- }
- else
- {
- prompt = "";
- }
+ std::vector<llama_token> tokens;
+ if (body.count("content") != 0) {
+ tokens = ctx_server.tokenize(body["content"], false);
+ }
+ const json data = format_tokenizer_response(tokens);
+ return res.set_content(data.dump(), "application/json; charset=utf-8");
+ };
- // create and queue the task
- const int task_id = llama.queue_tasks.get_new_id();
- llama.queue_results.add_waiting_task_id(task_id);
- llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}}, false, true, -1);
+ const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ const json body = json::parse(req.body);
- // get the result
- task_result result = llama.queue_results.recv(task_id);
- llama.queue_results.remove_waiting_task_id(task_id);
+ std::string content;
+ if (body.count("tokens") != 0) {
+ const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
+ }
- json data = json::array({json{
- {"embedding", json_value(result.result_json, "embedding", json::array())},
- {"index", 0},
- {"object", "embedding"}
- }}
- );
+ const json data = format_detokenized_response(content);
+ return res.set_content(data.dump(), "application/json; charset=utf-8");
+ };
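+
+ // illustrative round trip through the two endpoints above (token ids depend on the model):
+ //   POST /tokenize   {"content": "hello world"}  ->  {"tokens": [...]}
+ //   POST /detokenize {"tokens": [...]}           ->  {"content": "hello world"}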
- json root = format_embeddings_response_oaicompat(body, data);
+ const auto handle_embeddings = [&params, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
+ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+ if (!params.embedding) {
+ res.status = 501;
+ res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
+ return;
+ }
- // send the result
- return res.set_content(root.dump(), "application/json; charset=utf-8");
- });
+ const json body = json::parse(req.body);
+ bool is_openai = false;
- // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
- // "Bus error: 10" - this is on macOS, it does not crash on Linux
- //std::thread t2([&]()
- /*{
- bool running = true;
- while (running)
+ // an input prompt can be a string or a list of tokens (integer)
+ json prompt;
+ if (body.count("input") != 0) {
+ is_openai = true;
+ prompt = body["input"];
+ } else if (body.count("content") != 0) {
+ // with "content", we only support single prompt
+ prompt = std::vector<json>{body["content"]};
+ } else {
+ res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
+ return;
+ }
+
+ // create and queue the task
+ json responses;
{
- running = llama.update_slots();
+ const int id_task = ctx_server.queue_tasks.get_new_id();
+ ctx_server.queue_results.add_waiting_task_id(id_task);
+ ctx_server.request_completion(id_task, -1, {{"prompt", prompt}}, false, true);
+
+ // get the result
+ server_task_result result = ctx_server.queue_results.recv(id_task);
+ ctx_server.queue_results.remove_waiting_task_id(id_task);
+ if (!result.error) {
+ if (result.data.count("results")) {
+ // result for multi-task
+ responses = result.data["results"];
+ } else {
+ // result for single task
+ responses = std::vector<json>{result.data};
+ }
+ } else {
+ // error received, ignore everything else
+ res_error(res, result.data);
+ return;
+ }
}
- }*/
- //);
+ // write JSON response
+ json root = is_openai
+ ? format_embeddings_response_oaicompat(body, responses)
+ : responses[0];
+ return res.set_content(root.dump(), "application/json; charset=utf-8");
+ };
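+
+ // illustrative request bodies accepted above:
+ //   native:        {"content": "What is the capital of Bulgaria ?"}
+ //   OpenAI-style:  {"input": ["first prompt", "second prompt"]}
+ // the "input" form is wrapped with format_embeddings_response_oaicompat(), while the
+ // "content" form returns the single result object directly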
+
+ auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
+ return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
+ res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
+ return false;
+ };
+ };
+
+ //
+ // Router
+ //
+
+ // register static assets routes
+ if (!sparams.public_path.empty()) {
+ // Set the base directory for serving static files
+ svr->set_base_dir(sparams.public_path);
+ }
+
+ // using embedded static files
+ svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+ svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+ svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
+ json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+ // register API routes
+ svr->Get ("/health", handle_health);
+ svr->Get ("/slots", handle_slots);
+ svr->Get ("/metrics", handle_metrics);
+ svr->Get ("/props", handle_props);
+ svr->Get ("/v1/models", handle_models);
+ svr->Post("/completion", handle_completions); // legacy
+ svr->Post("/completions", handle_completions);
+ svr->Post("/v1/completions", handle_completions);
+ svr->Post("/chat/completions", handle_chat_completions);
+ svr->Post("/v1/chat/completions", handle_chat_completions);
+ svr->Post("/infill", handle_infill);
+ svr->Post("/embedding", handle_embeddings); // legacy
+ svr->Post("/embeddings", handle_embeddings);
+ svr->Post("/v1/embeddings", handle_embeddings);
+ svr->Post("/tokenize", handle_tokenize);
+ svr->Post("/detokenize", handle_detokenize);
+
+ //
+ // Start the server
+ //
if (sparams.n_threads_http < 1) {
// +2 threads for monitoring endpoints
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
}
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
- svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+ svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
LOG_INFO("HTTP server listening", log_data);
+
// run the HTTP server in a thread - see comment below
- std::thread t([&]()
- {
- if (!svr.listen_after_bind())
- {
- state.store(SERVER_STATE_ERROR);
- return 1;
- }
+ std::thread t([&]() {
+ if (!svr->listen_after_bind()) {
+ state.store(SERVER_STATE_ERROR);
+ return 1;
+ }
- return 0;
- });
+ return 0;
+ });
- llama.queue_tasks.on_new_task(std::bind(
- &llama_server_context::process_single_task, &llama, std::placeholders::_1));
- llama.queue_tasks.on_finish_multitask(std::bind(
- &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
- llama.queue_tasks.on_run_slots(std::bind(
- &llama_server_context::update_slots, &llama));
- llama.queue_results.on_multitask_update(std::bind(
- &llama_server_queue::update_multitask,
- &llama.queue_tasks,
+ ctx_server.queue_tasks.on_new_task(std::bind(
+ &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_finish_multitask(std::bind(
+ &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_update_slots(std::bind(
+ &server_context::update_slots, &ctx_server));
+ ctx_server.queue_results.on_multitask_update(std::bind(
+ &server_queue::update_multitask,
+ &ctx_server.queue_tasks,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3
));
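+
+ // the HTTP handlers above only enqueue tasks and wait on queue_results; the actual
+ // inference work happens in queue_tasks.start_loop() further down, which dispatches to
+ // process_single_task() and update_slots() on the main thread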
shutdown_handler = [&](int) {
- llama.queue_tasks.terminate();
+ ctx_server.queue_tasks.terminate();
};
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -3550,10 +3528,13 @@ int main(int argc, char **argv)
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- llama.queue_tasks.start_loop();
- svr.stop();
+
+ ctx_server.queue_tasks.start_loop();
+
+ svr->stop();
t.join();
llama_backend_free();
+
return 0;
}
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
new file mode 100644
index 0000000000000..57359b267a668
--- /dev/null
+++ b/examples/server/tests/features/embeddings.feature
@@ -0,0 +1,95 @@
+@llama.cpp
+@embeddings
+Feature: llama.cpp server
+
+ Background: Server startup
+ Given a server listening on localhost:8080
+ And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+ And a model alias bert-bge-small
+ And 42 as server seed
+ And 2 slots
+ And 1024 as batch size
+ And 1024 as ubatch size
+ And 2048 KV cache size
+ And embeddings extraction
+ Then the server is starting
+ Then the server is healthy
+
+ Scenario: Embedding
+ When embeddings are computed for:
+ """
+ What is the capital of Bulgaria ?
+ """
+ Then embeddings are generated
+
+ Scenario: OAI Embeddings compatibility
+ Given a model bert-bge-small
+ When an OAI compatible embeddings computation request for:
+ """
+ What is the capital of Spain ?
+ """
+ Then embeddings are generated
+
+ Scenario: OAI Embeddings compatibility with multiple inputs
+ Given a model bert-bge-small
+ Given a prompt:
+ """
+ In which country Paris is located ?
+ """
+ And a prompt:
+ """
+ Is Madrid the capital of Spain ?
+ """
+ When an OAI compatible embeddings computation request for multiple inputs
+ Then embeddings are generated
+
+ Scenario: Multi users embeddings
+ Given a prompt:
+ """
+ Write a very long story about AI.
+ """
+ And a prompt:
+ """
+ Write another very long music lyrics.
+ """
+ And a prompt:
+ """
+ Write a very long poem.
+ """
+ And a prompt:
+ """
+ Write a very long joke.
+ """
+ Given concurrent embedding requests
+ Then the server is busy
+ Then the server is idle
+ Then all embeddings are generated
+
+ Scenario: Multi users OAI compatibility embeddings
+ Given a prompt:
+ """
+ In which country Paris is located ?
+ """
+ And a prompt:
+ """
+ Is Madrid the capital of Spain ?
+ """
+ And a prompt:
+ """
+ What is the biggest US city ?
+ """
+ And a prompt:
+ """
+ What is the capital of Bulgaria ?
+ """
+ And a model bert-bge-small
+ Given concurrent OAI embedding requests
+ Then the server is busy
+ Then the server is idle
+ Then all embeddings are generated
+
+ Scenario: All embeddings should be the same
+ Given 10 fixed prompts
+ And a model bert-bge-small
+ Given concurrent OAI embedding requests
+ Then all embeddings are the same
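+
+# this feature is tagged @embeddings, so it can be run on its own with an invocation like
+# `./tests.sh --no-skipped --tags embeddings` (tests.sh forwards its arguments to behave)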
diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py
index 9fd330db6ddc9..8ad987e1bb618 100644
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -1,9 +1,10 @@
+import errno
import os
import socket
import subprocess
import time
from contextlib import closing
-from signal import SIGKILL
+import signal
def before_scenario(context, scenario):
@@ -29,44 +30,71 @@ def after_scenario(context, scenario):
for line in f:
print(line)
if not is_server_listening(context.server_fqdn, context.server_port):
- print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")
+ print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
if not pid_exists(context.server_process.pid):
assert False, f"Server not running pid={context.server_process.pid} ..."
- print(f"stopping server pid={context.server_process.pid} ...")
- context.server_process.kill()
+ server_graceful_shutdown(context)
+
# Wait few for socket to free up
time.sleep(0.05)
attempts = 0
- while is_server_listening(context.server_fqdn, context.server_port):
- print(f"stopping server pid={context.server_process.pid} ...")
- os.kill(context.server_process.pid, SIGKILL)
+ while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
+ server_kill(context)
time.sleep(0.1)
attempts += 1
if attempts > 5:
- print(f"Server dangling exits, killing all {context.server_path} ...")
- process = subprocess.run(['killall', '-9', context.server_path],
- stderr=subprocess.PIPE,
- universal_newlines=True)
- print(process)
+ server_kill_hard(context)
+
+
+def server_graceful_shutdown(context):
+ print(f"shutting down server pid={context.server_process.pid} ...\n")
+ if os.name == 'nt':
+ os.kill(context.server_process.pid, signal.CTRL_C_EVENT)
+ else:
+ os.kill(context.server_process.pid, signal.SIGINT)
+
+
+def server_kill(context):
+ print(f"killing server pid={context.server_process.pid} ...\n")
+ context.server_process.kill()
+
+
+def server_kill_hard(context):
+ pid = context.server_process.pid
+ path = context.server_path
+
+ print(f"Server dangling exits, hard killing force {pid}={path}...\n")
+ if os.name == 'nt':
+ process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
+ print(process)
+ else:
+ os.kill(-pid, signal.SIGKILL)
def is_server_listening(server_fqdn, server_port):
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
result = sock.connect_ex((server_fqdn, server_port))
- return result == 0
+ _is_server_listening = result == 0
+ if _is_server_listening:
+ print(f"server is listening on {server_fqdn}:{server_port}...\n")
+ return _is_server_listening
def pid_exists(pid):
"""Check whether pid exists in the current process table."""
- import errno
if pid < 0:
return False
- try:
- os.kill(pid, 0)
- except OSError as e:
- return e.errno == errno.EPERM
+ if os.name == 'nt':
+ output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
+ print(output)
+ return "No tasks are running" not in output
else:
- return True
+ try:
+ os.kill(pid, 0)
+ except OSError as e:
+ return e.errno == errno.EPERM
+ else:
+ return True
diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature
index 86cdf72829f8c..a66fed626619d 100644
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -6,10 +6,9 @@ Feature: Parallel
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And 42 as server seed
- And 512 as batch size
- And 64 KV cache size
+ And 128 as batch size
+ And 256 KV cache size
And 2 slots
- And embeddings extraction
And continuous batching
Then the server is starting
Then the server is healthy
@@ -77,6 +76,7 @@ Feature: Parallel
| disabled | 128 |
| enabled | 64 |
+
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
Given a prompt:
"""
@@ -99,48 +99,3 @@ Feature: Parallel
Then the server is busy
Then the server is idle
Then all prompts are predicted
-
- Scenario: Multi users embeddings
- Given a prompt:
- """
- Write a very long story about AI.
- """
- And a prompt:
- """
- Write another very long music lyrics.
- """
- And a prompt:
- """
- Write a very long poem.
- """
- And a prompt:
- """
- Write a very long joke.
- """
- Given concurrent embedding requests
- Then the server is busy
- Then the server is idle
- Then all embeddings are generated
-
- Scenario: Multi users OAI compatibility embeddings
- Given a prompt:
- """
- In which country Paris is located ?
- """
- And a prompt:
- """
- Is Madrid the capital of Spain ?
- """
- And a prompt:
- """
- What is the biggest US city ?
- """
- And a prompt:
- """
- What is the capital of Bulgaria ?
- """
- And a model tinyllama-2
- Given concurrent OAI embedding requests
- Then the server is busy
- Then the server is idle
- Then all embeddings are generated
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
index 42a6709a53380..1d6aa40ea6985 100644
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -39,8 +39,9 @@ Feature: Security
Scenario Outline: CORS Options
- When an OPTIONS request is sent from <origin>
- Then CORS header <cors_header> is set to <cors_header_value>
+ Given a user api key llama.cpp
+ When an OPTIONS request is sent from <origin>
+ Then CORS header <cors_header> is set to <cors_header_value>
Examples: Headers
| origin | cors_header | cors_header_value |
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 7c977bccecaad..5014f326dc050 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -10,11 +10,10 @@ Feature: llama.cpp server
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
# see --ctx-size and #5568
- And 32 KV cache size
- And 512 as batch size
- And 1 slots
- And embeddings extraction
- And 32 server max tokens to predict
+ And 256 KV cache size
+ And 32 as batch size
+ And 2 slots
+ And 64 server max tokens to predict
And prometheus compatible metrics exposed
Then the server is starting
Then the server is healthy
@@ -23,17 +22,35 @@ Feature: llama.cpp server
Then the server is ready
And all slots are idle
+
Scenario Outline: Completion
Given a prompt <prompt>
And <n_predict> max tokens to predict
And a completion request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
+ And the completion is <truncated> truncated
+ And <n_prompt> prompt tokens are processed
And prometheus metrics are exposed
+ And metric llamacpp:tokens_predicted is <n_predicted>
Examples: Prompts
- | prompt | n_predict | re_content | n_predicted |
- | I believe the meaning of life is | 8 | (read\|going)+ | 8 |
- | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
+ | prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
+ | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
+ | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids)+ | 46 | 64 | not |
+
+ Scenario: Completion prompt truncated
+ Given a prompt:
+ """
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+ """
+ And a completion request with no api error
+ Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry
+ And the completion is truncated
+ And 109 prompt tokens are processed
+
Scenario Outline: OAI Compatibility
Given a model <model>
@@ -43,39 +60,14 @@ Feature: llama.cpp server
And streaming is <enable_streaming>
Given an OAI compatible chat completions request with no api error
Then <n_predicted> tokens are predicted matching <re_content>
+ And <n_prompt> prompt tokens are processed
+ And the completion is <truncated> truncated
Examples: Prompts
- | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
- | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
- | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
+ | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
+ | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
+ | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird)+ | -1 | 64 | enabled | |
- Scenario: Embedding
- When embeddings are computed for:
- """
- What is the capital of Bulgaria ?
- """
- Then embeddings are generated
-
- Scenario: OAI Embeddings compatibility
- Given a model tinyllama-2
- When an OAI compatible embeddings computation request for:
- """
- What is the capital of Spain ?
- """
- Then embeddings are generated
-
- Scenario: OAI Embeddings compatibility with multiple inputs
- Given a model tinyllama-2
- Given a prompt:
- """
- In which country Paris is located ?
- """
- And a prompt:
- """
- Is Madrid the capital of Spain ?
- """
- When an OAI compatible embeddings computation request for multiple inputs
- Then embeddings are generated
Scenario: Tokenize / Detokenize
When tokenizing:
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 3195278022ffb..cfa9f96ec5306 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -10,6 +10,7 @@
from re import RegexFlag
import aiohttp
+import numpy as np
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
@@ -17,23 +18,28 @@
from prometheus_client import parser
-@step(u"a server listening on {server_fqdn}:{server_port}")
+@step("a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn, server_port):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
if 'PORT' in os.environ:
context.server_port = int(os.environ['PORT'])
print(f"$PORT set, overriding server port with to {context.server_port}")
+ if 'FQDN' in os.environ:
+ context.server_fqdn = os.environ['FQDN']
+ print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
context.model_alias = None
context.n_batch = None
+ context.n_ubatch = None
context.n_ctx = None
context.n_ga = None
context.n_ga_w = None
context.n_gpu_layer = None
context.n_predict = None
+ context.n_prompts = 0
context.n_server_predict = None
context.n_slots = None
context.prompt_prefix = None
@@ -52,24 +58,24 @@ def step_server_config(context, server_fqdn, server_port):
context.prompts = []
-@step(u'a model file {hf_file} from HF repo {hf_repo}')
+@step('a model file {hf_file} from HF repo {hf_repo}')
def step_download_hf_model(context, hf_file, hf_repo):
context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
if context.debug:
print(f"model file: {context.model_file}\n")
-@step(u'a model alias {model_alias}')
+@step('a model alias {model_alias}')
def step_model_alias(context, model_alias):
context.model_alias = model_alias
-@step(u'{seed:d} as server seed')
+@step('{seed:d} as server seed')
def step_seed(context, seed):
context.server_seed = seed
-@step(u'{ngl:d} GPU offloaded layers')
+@step('{ngl:d} GPU offloaded layers')
def step_n_gpu_layer(context, ngl):
if 'N_GPU_LAYERS' in os.environ:
new_ngl = int(os.environ['N_GPU_LAYERS'])
@@ -79,37 +85,37 @@ def step_n_gpu_layer(context, ngl):
context.n_gpu_layer = ngl
-@step(u'{n_ctx:d} KV cache size')
+@step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx):
context.n_ctx = n_ctx
-@step(u'{n_slots:d} slots')
+@step('{n_slots:d} slots')
def step_n_slots(context, n_slots):
context.n_slots = n_slots
-@step(u'{n_predict:d} server max tokens to predict')
+@step('{n_predict:d} server max tokens to predict')
def step_server_n_predict(context, n_predict):
context.n_server_predict = n_predict
-@step(u'continuous batching')
+@step('continuous batching')
def step_server_continuous_batching(context):
context.server_continuous_batching = True
-@step(u'embeddings extraction')
+@step('embeddings extraction')
def step_server_embeddings(context):
context.server_embeddings = True
-@step(u'prometheus compatible metrics exposed')
+@step('prometheus compatible metrics exposed')
def step_server_metrics(context):
context.server_metrics = True
-@step(u"the server is starting")
+@step("the server is starting")
def step_start_server(context):
start_server_background(context)
attempts = 0
@@ -126,7 +132,7 @@ def step_start_server(context):
time.sleep(0.1)
-@step(u"the server is {expecting_status}")
+@step("the server is {expecting_status}")
@async_run_until_complete
async def step_wait_for_the_server_to_be_started(context, expecting_status):
match expecting_status:
@@ -155,7 +161,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
assert False, "unknown status"
-@step(u'all slots are {expected_slot_status_string}')
+@step('all slots are {expected_slot_status_string}')
@async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string):
match expected_slot_status_string:
@@ -171,7 +177,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
await request_slots_status(context, expected_slots)
-@step(u'a completion request with {api_error} api error')
+@step('a completion request with {api_error} api error')
@async_run_until_complete
async def step_request_completion(context, api_error):
expect_api_error = api_error == 'raised'
@@ -189,108 +195,138 @@ async def step_request_completion(context, api_error):
assert completion == 401, f"completion must be an 401 status code: {completion}"
-@step(u'{predicted_n:d} tokens are predicted matching {re_content}')
+@step('{predicted_n:d} tokens are predicted matching {re_content}')
def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
- assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content)
+ context.completion = context.tasks_result.pop()
+ assert_n_tokens_predicted(context.completion, predicted_n, re_content)
-@step(u'{predicted_n:d} tokens are predicted')
+@step('{predicted_n:d} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
- assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n)
+ context.completion = context.tasks_result.pop()
+ assert_n_tokens_predicted(context.completion, predicted_n)
-@step(u'a user prompt {user_prompt}')
+@step('the completion is truncated')
+def step_assert_completion_truncated(context):
+ step_assert_completion_truncated(context, '')
+
+
+@step('the completion is {truncated} truncated')
+def step_assert_completion_truncated(context, truncated):
+ truncated = truncated != "not"
+ assert context.completion['truncated'] == truncated, f'{context.completion}'
+
+
+@step('{n_prompt:d} prompt tokens are processed')
+def step_impl(context, n_prompt):
+ assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}"
+
+
+@step('a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
context.prompts.append(user_prompt)
+ context.n_prompts = len(context.prompts)
-@step(u'a system prompt {system_prompt}')
+@step('a system prompt {system_prompt}')
def step_system_prompt(context, system_prompt):
context.system_prompt = system_prompt
-@step(u'a model {model}')
+@step('a model {model}')
def step_model(context, model):
context.model = model
-@step(u'{max_tokens:d} max tokens to predict')
+@step('{max_tokens:d} max tokens to predict')
def step_max_tokens(context, max_tokens):
context.n_predict = max_tokens
-@step(u'streaming is {enable_streaming}')
+@step('streaming is {enable_streaming}')
def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled'
-@step(u'a user api key {user_api_key}')
+@step('a user api key {user_api_key}')
def step_user_api_key(context, user_api_key):
context.user_api_key = user_api_key
-@step(u'no user api key')
+@step('no user api key')
def step_no_user_api_key(context):
context.user_api_key = None
-@step(u'a user api key ')
+@step('a user api key ')
def step_no_user_api_key_space(context):
context.user_api_key = None
-@step(u'a server api key {server_api_key}')
+@step('a server api key {server_api_key}')
def step_server_api_key(context, server_api_key):
context.server_api_key = server_api_key
-@step(u'{n_junk:d} as number of junk')
+@step('{n_junk:d} as number of junk')
def step_n_junk(context, n_junk):
context.n_junk = n_junk
-@step(u'{n_batch:d} as batch size')
+@step('{n_batch:d} as batch size')
def step_n_batch(context, n_batch):
context.n_batch = n_batch
-@step(u'{seed:d} as seed')
+@step('{n_ubatch:d} as ubatch size')
+def step_n_ubatch(context, n_ubatch):
+ context.n_ubatch = n_ubatch
+
+
+@step('{seed:d} as seed')
def step_seed(context, seed):
context.seed = seed
-@step(u'a prefix prompt')
+@step('a prefix prompt')
def step_prompt_prefix(context):
- context.prompt_prefix = context.text
+ context.prompt_prefix = context_text(context)
-@step(u'a junk suffix prompt')
+@step('a junk suffix prompt')
def step_prompt_junk_suffix(context):
- context.prompt_junk_suffix = context.text
+ context.prompt_junk_suffix = context_text(context)
-@step(u'a suffix prompt')
+@step('a suffix prompt')
def step_prompt_suffix(context):
- context.prompt_suffix = context.text
+ context.prompt_suffix = context_text(context)
-@step(u'{n_ga:d} group attention factor'
- u' to extend context size through self-extend')
+@step('{n_ga:d} group attention factor'
+ ' to extend context size through self-extend')
def step_impl(context, n_ga):
context.n_ga = n_ga
-@step(u'{n_ga_w:d} group attention width to extend context size through self-extend')
+@step('{n_ga_w:d} group attention width to extend context size through self-extend')
def step_impl(context, n_ga_w):
context.n_ga_w = n_ga_w
-@step(u'a passkey prompt template')
+@step('a passkey prompt template')
def step_prompt_passkey(context):
- context.prompt_passkey = context.text
+ context.prompt_passkey = context_text(context)
-@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
+@step('{n_prompts:d} fixed prompts')
+def step_fixed_prompts(context, n_prompts):
+ context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
+ context.n_prompts = n_prompts
+
+
+@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
def step_prompt_passkey(context, passkey, i_pos):
prompt = ""
for i in range(context.n_junk):
@@ -301,9 +337,10 @@ def step_prompt_passkey(context, passkey, i_pos):
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
+ context.n_prompts = len(context.prompts)
-@step(u'an OAI compatible chat completions request with {api_error} api error')
+@step('an OAI compatible chat completions request with {api_error} api error')
@async_run_until_complete
async def step_oai_chat_completions(context, api_error):
if context.debug:
@@ -338,17 +375,19 @@ async def step_oai_chat_completions(context, api_error):
print(f"Completion response: {completion}")
-@step(u'a prompt')
+@step('a prompt')
def step_a_prompt(context):
- context.prompts.append(context.text)
+ context.prompts.append(context_text(context))
+ context.n_prompts = len(context.prompts)
-@step(u'a prompt {prompt}')
+@step('a prompt {prompt}')
def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt)
+ context.n_prompts = len(context.prompts)
-@step(u'concurrent completion requests')
+@step('concurrent completion requests')
@async_run_until_complete()
async def step_concurrent_completion_requests(context):
await concurrent_requests(context,
@@ -364,7 +403,7 @@ async def step_concurrent_completion_requests(context):
'user_api_key') else None)
-@step(u'concurrent OAI completions requests')
+@step('concurrent OAI completions requests')
@async_run_until_complete
async def step_oai_chat_completions(context):
await concurrent_requests(context, oai_chat_completions,
@@ -384,7 +423,7 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'user_api_key') else None)
-@step(u'concurrent OAI completions requests no v1')
+@step('concurrent OAI completions requests no v1')
@async_run_until_complete
async def step_oai_chat_completions(context):
await concurrent_requests(context, oai_chat_completions,
@@ -407,13 +446,13 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'user_api_key') else None)
-@step(u'all prompts are predicted')
+@step('all prompts are predicted')
@async_run_until_complete
async def step_all_prompts_are_predicted(context):
await all_prompts_are_predicted(context)
-@step(u'all prompts are predicted with {n_expected_predicted:d} tokens')
+@step('all prompts are predicted with {n_expected_predicted:d} tokens')
@async_run_until_complete
async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted):
await all_prompts_are_predicted(context, n_expected_predicted)
@@ -427,44 +466,68 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests"
-@step(u'embeddings are computed for')
+@step('embeddings are computed for')
@async_run_until_complete
async def step_compute_embedding(context):
- context.embeddings = await request_embedding(context.text, base_url=context.base_url)
+ context.n_prompts = 1
+ context.embeddings = await request_embedding(context_text(context), base_url=context.base_url)
+
+
+@step('all embeddings are the same')
+@async_run_until_complete
+async def step_all_embeddings_are_the_same(context):
+ n_embedding_requests = await gather_tasks_results(context)
+ assert n_embedding_requests > 0
+ embeddings = []
+ for i in range(n_embedding_requests):
+ embedding = context.tasks_result.pop().pop()
+ embeddings.append(embedding)
+ assert_embeddings(embedding)
+ n = len(embeddings)
+ for i in range(n-1):
+ for j in range(i+1, n):
+ embedding1 = np.array(embeddings[i])
+ embedding2 = np.array(embeddings[j])
+ if context.debug:
+ print(f"embedding1: {embedding1[-8:]}\n")
+ print(f"embedding2: {embedding2[-8:]}\n")
+ similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
+ msg = f"Similarity between {i} and {j}: {similarity:.10f}"
+ if context.debug:
+ print(f"{msg}\n")
+ assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg
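+ # note: dot(e1, e2) / (|e1| * |e2|) is the cosine similarity, so asserting it is ~1.0
+ # checks that the embeddings returned for the identical fixed prompts all point in the
+ # same direction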
-@step(u'embeddings are generated')
+@step('embeddings are generated')
def step_assert_embeddings(context):
- if len(context.prompts) == 0:
- assert_embeddings(context.embeddings)
- else:
- assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n"
- f"context.prompts={context.prompts}\n"
- f"context.embeddings={context.embeddings}")
- for embedding in context.embeddings:
- context.prompts.pop()
- assert_embeddings(embedding)
+ assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n"
+ f"context.n_prompts={context.n_prompts}\n"
+ f"context.embeddings={context.embeddings}")
+ for embedding in context.embeddings:
+ assert_embeddings(embedding)
-@step(u'an OAI compatible embeddings computation request for')
+@step('an OAI compatible embeddings computation request for')
@async_run_until_complete
async def step_oai_compute_embeddings(context):
- context.embeddings = await request_oai_embeddings(context.text,
+ context.n_prompts = 1
+ context.embeddings = await request_oai_embeddings(context_text(context),
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
-@step(u'an OAI compatible embeddings computation request for multiple inputs')
+@step('an OAI compatible embeddings computation request for multiple inputs')
@async_run_until_complete
async def step_oai_compute_embeddings_multiple_inputs(context):
context.embeddings = await request_oai_embeddings(context.prompts,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
+ context.prompts.clear()
-@step(u'concurrent embedding requests')
+@step('concurrent embedding requests')
@async_run_until_complete()
async def step_concurrent_embedding_requests(context):
await concurrent_requests(context,
@@ -473,7 +536,7 @@ async def step_concurrent_embedding_requests(context):
base_url=context.base_url)
-@step(u'concurrent OAI embedding requests')
+@step('concurrent OAI embedding requests')
@async_run_until_complete()
async def step_concurrent_oai_embedding_requests(context):
await concurrent_requests(context,
@@ -484,19 +547,19 @@ async def step_concurrent_oai_embedding_requests(context):
model=context.model)
-@step(u'all embeddings are generated')
+@step('all embeddings are generated')
@async_run_until_complete()
async def all_embeddings_are_generated(context):
n_embedding_requests = await gather_tasks_results(context)
- assert n_embedding_requests > 0
+ assert n_embedding_requests == context.n_prompts
for i in range(n_embedding_requests):
- assert_embeddings(context.tasks_result.pop())
+ assert_embeddings(context.tasks_result.pop().pop())
-@step(u'tokenizing')
+@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
- context.tokenized_text = context.text
+ context.tokenized_text = context_text(context)
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/tokenize',
json={
@@ -507,7 +570,7 @@ async def step_tokenize(context):
context.tokens = tokenize_json['tokens']
-@step(u'tokens can be detokenize')
+@step('tokens can be detokenize')
@async_run_until_complete
async def step_detokenize(context):
assert len(context.tokens) > 0
@@ -522,22 +585,23 @@ async def step_detokenize(context):
assert context.tokenized_text == detokenize_json['content'].strip()
-@step(u'an OPTIONS request is sent from {origin}')
+@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):
async with aiohttp.ClientSession() as session:
+ headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
async with session.options(f'{context.base_url}/v1/chat/completions',
- headers={"Origin": origin}) as response:
+ headers=headers) as response:
assert response.status == 200
context.options_response = response
-@step(u'CORS header {cors_header} is set to {cors_header_value}')
+@step('CORS header {cors_header} is set to {cors_header_value}')
def step_check_options_header_value(context, cors_header, cors_header_value):
assert context.options_response.headers[cors_header] == cors_header_value
-@step(u'prometheus metrics are exposed')
+@step('prometheus metrics are exposed')
@async_run_until_complete
async def step_prometheus_metrics_exported(context):
async with aiohttp.ClientSession() as session:
@@ -548,15 +612,25 @@ async def step_prometheus_metrics_exported(context):
metric_exported = False
if context.debug:
print(f"/metrics answer:\n{metrics_raw}\n")
+ context.metrics = {}
for metric in parser.text_string_to_metric_families(metrics_raw):
match metric.name:
case "llamacpp:kv_cache_usage_ratio":
assert len(metric.samples) > 0
metric_exported = True
+ context.metrics[metric.name] = metric
+ assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time"
assert metric_exported, "No metrics exported"
-@step(u'available models')
+@step('metric {metric_name} is {metric_value:d}')
+def step_assert_metric_value(context, metric_name, metric_value):
+ if metric_name not in context.metrics:
+ assert False, f"no metric {metric_name} in {context.metrics.keys()}"
+ assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}"
+
+
+@step('available models')
def step_available_models(context):
# openai client always expects an api_key
openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
@@ -564,14 +638,14 @@ def step_available_models(context):
context.models = openai.Model.list().data
-@step(u'{n_model:d} models are supported')
+@step('{n_model:d} models are supported')
def step_supported_models(context, n_model):
if context.debug:
print("server models available:", context.models)
assert len(context.models) == n_model
-@step(u'model {i_model:d} is {param} {preposition} {param_value}')
+@step('model {i_model:d} is {param} {preposition} {param_value}')
def step_supported_models(context, i_model, param, preposition, param_value):
assert i_model < len(context.models)
model = context.models[i_model]
@@ -588,11 +662,11 @@ def step_supported_models(context, i_model, param, preposition, param_value):
async def concurrent_requests(context, f_completion, *args, **kwargs):
- n_prompts = len(context.prompts)
+ context.n_prompts = len(context.prompts)
if context.debug:
- print(f"starting {n_prompts} concurrent completion requests...")
- assert n_prompts > 0
- for prompt_no in range(n_prompts):
+ print(f"starting {context.n_prompts} concurrent completion requests...")
+ assert context.n_prompts > 0
+ for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1)
@@ -674,7 +748,8 @@ async def oai_chat_completions(user_prompt,
completion_response = {
'content': '',
'timings': {
- 'predicted_n': 0
+ 'predicted_n': 0,
+ 'prompt_n': 0
}
}
if async_client:
@@ -715,7 +790,8 @@ async def oai_chat_completions(user_prompt,
completion_response = {
'content': chat_completion_raw['choices'][0]['message'],
'timings': {
- 'predicted_n': chat_completion_raw['usage']['completion_tokens']
+ 'predicted_n': chat_completion_raw['usage']['completion_tokens'],
+ 'prompt_n': chat_completion_raw['usage']['prompt_tokens']
}
}
else:
@@ -731,7 +807,7 @@ async def oai_chat_completions(user_prompt,
stream=enable_streaming,
seed=seed
)
- except openai.error.APIError as e:
+ except openai.error.AuthenticationError as e:
if expect_api_error is not None and expect_api_error:
return 401
else:
@@ -744,13 +820,16 @@ async def oai_chat_completions(user_prompt,
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
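+ # a finish_reason other than 'stop' (e.g. 'length') means the streamed completion was truncated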
+ completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
else:
assert len(chat_completion.choices) == 1
completion_response = {
'content': chat_completion.choices[0].message.content,
'timings': {
- 'predicted_n': chat_completion.usage.completion_tokens
- }
+ 'predicted_n': chat_completion.usage.completion_tokens,
+ 'prompt_n': chat_completion.usage.prompt_tokens
+ },
+ 'truncated': chat_completion.choices[0].finish_reason != 'stop'
}
if debug:
print("OAI response formatted to llama.cpp:", completion_response)
@@ -765,7 +844,7 @@ async def request_embedding(content, base_url=None):
}) as response:
assert response.status == 200
response_json = await response.json()
- return response_json['embedding']
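+ # wrap the single embedding in a list so it matches the multi-prompt response shape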
+ return [response_json['embedding']]
async def request_oai_embeddings(input,
@@ -775,6 +854,7 @@ async def request_oai_embeddings(input,
user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client:
origin = 'llama.cpp'
+ headers = {}
if user_api_key is not None:
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
async with aiohttp.ClientSession() as session:
@@ -783,14 +863,21 @@ async def request_oai_embeddings(input,
"input": input,
"model": model,
},
- headers=headers) as response:
+ headers=headers,
+ timeout=3600) as response:
assert response.status == 200, f"received status code not expected: {response.status}"
assert response.headers['Access-Control-Allow-Origin'] == origin
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list'
- return response_json['data']
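+ # normalize to a plain list of embedding vectors, whether the input was a batch or a single string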
+ if isinstance(input, collections.abc.Sequence):
+ embeddings = []
+ for an_oai_embeddings in response_json['data']:
+ embeddings.append(an_oai_embeddings['embedding'])
+ else:
+ embeddings = [response_json['data']['embedding']]
+ return embeddings
else:
openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1'
@@ -804,7 +891,7 @@ async def request_oai_embeddings(input,
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
- embeddings = oai_embeddings.data.embedding
+ embeddings = [oai_embeddings.data.embedding]
return embeddings
@@ -833,7 +920,6 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
f' {n_predicted} <> {expected_predicted_n}')
-
async def gather_tasks_results(context):
n_tasks = len(context.concurrent_tasks)
if context.debug:
@@ -899,6 +985,8 @@ def assert_embeddings(embeddings):
assert len(embeddings) > 0
embeddings_computed = False
for emb in embeddings:
+ if not isinstance(emb, float):
+ assert False, f"Bad embeddings: {embeddings}"
if emb != 0:
embeddings_computed = True
assert embeddings_computed, f"Embeddings: {embeddings}"
@@ -926,17 +1014,29 @@ async def completions_seed(context):
else context.server_seed if hasattr(context, 'server_seed') else None
+def context_text(context):
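+ # strip carriage returns that a Windows checkout may introduce into the docstring text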
+ return context.text.replace('\r', '')
+
+
def start_server_background(context):
- context.server_path = '../../../build/bin/server'
+ if os.name == 'nt':
+ context.server_path = '../../../build/bin/Release/server.exe'
+ else:
+ context.server_path = '../../../build/bin/server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+ server_listen_addr = context.server_fqdn
+ if os.name == 'nt':
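+ # on Windows, listen on all interfaces; binding the configured fqdn directly can be unreliable there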
+ server_listen_addr = '0.0.0.0'
server_args = [
- '--host', context.server_fqdn,
+ '--host', server_listen_addr,
'--port', context.server_port,
'--model', context.model_file
]
if context.n_batch:
server_args.extend(['--batch-size', context.n_batch])
+ if context.n_ubatch:
+ server_args.extend(['--ubatch-size', context.n_ubatch])
if context.n_gpu_layer:
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
if context.server_continuous_batching:
@@ -964,7 +1064,16 @@ def start_server_background(context):
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
print(f"starting server with: {context.server_path} {server_args}\n")
+ flags = 0
+ if 'nt' == os.name:
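+ # detach the server from the behave console: no new window, and Ctrl+C sent to the test runner does not reach it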
+ flags |= subprocess.DETACHED_PROCESS
+ flags |= subprocess.CREATE_NEW_PROCESS_GROUP
+ flags |= subprocess.CREATE_NO_WINDOW
+
+ pkwargs = {
+ 'creationflags': flags,
+ }
context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]],
- close_fds=True)
- print(f"server pid={context.server_process.pid}")
+ **pkwargs)
+ print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index 5d4210164a50a..2e4f42ad28c23 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,5 +1,6 @@
aiohttp~=3.9.3
behave~=1.2.6
huggingface_hub~=0.20.3
+numpy~=1.24.4
openai~=0.25.0
prometheus-client~=0.20.0
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index b6e49d8b98a2a..2ddb2cd21f8d6 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1,18 +1,30 @@
#pragma once
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-#include <condition_variable>
-#include <unordered_map>
+#include "llama.h"
+#include "common.h"
#include "json.hpp"
-#include "../llava/clip.h"
+#include <string>
+#include <vector>
+#include <sstream>
+#include <random>
+
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
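+// model name reported in OAI-compatible responses when the request does not specify one (assumed from the macro name)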
using json = nlohmann::json;
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+ ERROR_TYPE_INVALID_REQUEST,
+ ERROR_TYPE_AUTHENTICATION,
+ ERROR_TYPE_SERVER,
+ ERROR_TYPE_NOT_FOUND,
+ ERROR_TYPE_PERMISSION,
+ ERROR_TYPE_UNAVAILABLE, // custom error
+ ERROR_TYPE_NOT_SUPPORTED, // custom error
+};
+
extern bool server_verbose;
extern bool server_log_json;
@@ -37,83 +49,35 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-enum server_state {
- SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
- SERVER_STATE_READY, // Server is ready and model is loaded
- SERVER_STATE_ERROR // An error occurred, load_model failed
-};
-
-enum task_type {
- TASK_TYPE_COMPLETION,
- TASK_TYPE_CANCEL,
- TASK_TYPE_NEXT_RESPONSE,
- TASK_TYPE_METRICS
-};
-
-struct task_server {
- int id = -1; // to be filled by llama_server_queue
- int target_id;
- task_type type;
- json data;
- bool infill_mode = false;
- bool embedding_mode = false;
- int multitask_id = -1;
-};
-
-struct task_result {
- int id;
- int multitask_id = -1;
- bool stop;
- bool error;
- json result_json;
-};
-
-struct task_multi {
- int id;
- std::set<int> subtasks_remaining{};
- std::vector<task_result> results{};
-};
-
-// completion token output with probabilities
-struct completion_token_output {
- struct token_prob
- {
- llama_token tok;
- float prob;
- };
-
- std::vector<token_prob> probs;
- llama_token tok;
- std::string text_to_send;
-};
-
-struct token_translator {
- llama_context * ctx;
- std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
- std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
-};
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value) {
+ // Fallback null to default value
+ return body.contains(key) && !body.at(key).is_null()
+ ? body.value(key, default_value)
+ : default_value;
+}
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{
- {"tid", ss_tid.str()},
+ {"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
- log.merge_patch(
- {
- {"level", level},
- {"function", function},
- {"line", line},
- {"msg", message},
- });
+ log.merge_patch( {
+ {"level", level},
+ {"function", function},
+ {"line", line},
+ {"msg", message},
+ });
+
if (!extra.empty()) {
log.merge_patch(extra);
}
- std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
+ printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
} else {
char buf[1024];
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
@@ -136,22 +100,13 @@ static inline void server_log(const char *level, const char *function, int line,
}
//
-// server utils
+// chat template utils
//
-template <typename T>
-static T json_value(const json &body, const std::string &key, const T &default_value) {
- // Fallback null to default value
- return body.contains(key) && !body.at(key).is_null()
- ? body.value(key, default_value)
- : default_value;
-}
-
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
inline bool verify_custom_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
- std::vector<char> buf(1);
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, buf.data(), buf.size());
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
@@ -163,7 +118,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
std::vector<llama_chat_message> chat(messages.size());
for (size_t i = 0; i < messages.size(); ++i) {
- auto &curr_msg = messages[i];
+ const auto & curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
@@ -183,261 +138,13 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
- std::string formatted_chat(buf.data(), res);
+ const std::string formatted_chat(buf.data(), res);
+
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}
-//
-// work queue utils
-//
-
-struct llama_server_queue {
- int id = 0;
- std::mutex mutex_tasks;
- bool running;
- // queues
- std::vector<task_server> queue_tasks;
- std::vector<task_server> queue_tasks_deferred;
- std::vector<task_multi> queue_multitasks;
- std::condition_variable condition_tasks;
- // callback functions
- std::function<void(task_server &)> callback_new_task;
- std::function