# Build stage.
# Using a lightweight Alpine Linux base image for a minimal final image size.
FROM alpine:3.20 AS build
# The target platform, set by Docker Buildx.
ARG TARGETPLATFORM
# The LLaMA.cpp git tag to check out and compile, passed by the GitHub Action.
ARG LLAMA_GIT_TAG
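# Illustrative build command (an assumption about how the GitHub Action invokes Buildx, not part of
# the image itself). TARGETPLATFORM is filled in automatically from --platform; the image name and
# tag value are placeholders:
#   docker buildx build \
#     --platform linux/amd64,linux/arm64 \
#     --build-arg LLAMA_GIT_TAG=<git-tag> \
#     -t <registry>/<image>:<version> \
#     --push .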
# Set the working directory.
WORKDIR /opt/llama.cpp
# Compile the official LLaMA.cpp HTTP Server.
RUN \
# Since my GitHub Action uses QEMU for the ARM64 build, I need to disable the native build.
# Otherwise, the build will fail. See: https://github.com/ggerganov/llama.cpp/issues/10933
# The CPU architecture is set to "armv8-a" because it's the one used by the Raspberry Pi 3+.
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
export GGML_NATIVE=OFF; \
export GGML_CPU_ARM_ARCH="armv8-a"; \
else \
export GGML_NATIVE=ON; \
export GGML_CPU_ARM_ARCH=""; \
fi && \
# Install build dependencies.
# Using --no-cache to avoid caching the Alpine package index
# and --virtual to group build dependencies for easier cleanup.
apk add --no-cache --virtual .build-deps \
git=~2.45 \
g++=~13.2 \
make=~4.4 \
cmake=~3.29 \
linux-headers=~6.6 \
curl-dev=~8.11 && \
# Check out the llama.cpp repository at the wanted version (git tag).
# A shallow clone (--depth 1) is used to minimize the data transfer.
git clone -b ${LLAMA_GIT_TAG} --depth 1 https://github.com/ggerganov/llama.cpp . && \
# To save space, empty the index.html that will be embedded in the llama-server executable.
: > examples/server/public/index.html && \
gzip -f examples/server/public/index.html && \
# Configure the CMake build system.
cmake -B build \
# On AMD64, use the native build; on ARM64, disable it.
-DGGML_NATIVE=${GGML_NATIVE} \
# On ARM64, set the CPU architecture to "armv8-a". See the note above.
-DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH} \
# Enable building the llama-server executable (only that target is compiled below).
-DLLAMA_BUILD_SERVER=ON \
# Enable cURL support so the server can download a GGUF model at first run.
-DLLAMA_CURL=ON \
# Statically link the GGML library into this executable to ease deployment...
-DBUILD_SHARED_LIBS=OFF && \
# Compile the llama-server target in Release mode for production use.
cmake --build build --target llama-server --config Release && \
# Remove non-essential symbols from this executable to save some space.
strip build/bin/llama-server && \
# Copy this executable to a safe place...
cp build/bin/llama-server /opt && \
# before cleaning all other build files.
rm -rf /opt/llama.cpp/* && \
# Remove the build dependencies since they are no longer needed.
apk del .build-deps
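# Debugging note (a sketch, not used by the CI): since this is a multi-stage build, the build stage
# can be rebuilt and inspected on its own, e.g. to reproduce the QEMU/ARM64 issue mentioned above:
#   docker buildx build --platform linux/arm64 --target build \
#     --build-arg LLAMA_GIT_TAG=<git-tag> -t llama-server-build --load .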
# Final stage.
# Starting a new stage to create a smaller, cleaner image containing only the runtime environment.
FROM alpine:3.20
# Set the working directory.
WORKDIR /opt/llama.cpp
# Install runtime dependencies: the C++ standard library, cURL & the OpenMP runtime.
RUN apk add --no-cache \
libstdc++=~13.2 \
libcurl=~8.11 \
libgomp=~13.2
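# Sanity check (assuming ldd is available in the Alpine base image, as it usually is): listing the
# dynamic dependencies of the final binary shows why exactly these three libraries are needed:
#   docker run --rm <image> ldd /opt/llama.cpp/llama-server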
# Copy the compiled llama-server executable from the build stage to the current working directory.
COPY --from=build /opt/llama-server .
# The server will listen on port 8080.
EXPOSE 8080
# Run the LLaMA.cpp HTTP Server.
CMD [ "sh", "-c", "/opt/llama.cpp/llama-server --host 0.0.0.0 --no-webui" ]
# Notes for the above command:
# - The host is set to 0.0.0.0 to allow HTTP access outside the container.
# - The --no-webui flag disables the embedded web interface (cf. the emptied-out index.html in the build stage).
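# Example usage (illustrative): no model is baked into the image and the web UI is disabled, so a
# model must be provided at runtime. Assuming the chosen llama.cpp tag supports --model-url (which
# is why cURL support is enabled above), the default command can be overridden; the image name and
# model URL below are placeholders:
#   docker run -p 8080:8080 <image> \
#     /opt/llama.cpp/llama-server --host 0.0.0.0 --no-webui \
#     --model-url https://example.com/model.gguf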