Skip to content

Commit

Permalink
Merge branch 'master' into compilade/bitnet-ternary
Browse files Browse the repository at this point in the history
  • Loading branch information
compilade committed Aug 11, 2024
2 parents 96b3d41 + 4134999 commit d911cd1
Show file tree
Hide file tree
Showing 138 changed files with 6,990 additions and 1,862 deletions.
4 changes: 2 additions & 2 deletions .devops/llama-server.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev curl
apt-get install -y build-essential git libcurl4-openssl-dev

WORKDIR /app

Expand All @@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION AS runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1
apt-get install -y libcurl4-openssl-dev libgomp1 curl

COPY --from=build /app/llama-server /llama-server

Expand Down
13 changes: 3 additions & 10 deletions .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -126,16 +126,9 @@ let
++ optionals useMetalKit [ MetalKit ];

cudaBuildInputs = with cudaPackages; [
cuda_cccl.dev # <nv/target>

# A temporary hack for reducing the closure size, remove once cudaPackages
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
cuda_cudart.dev
cuda_cudart.lib
cuda_cudart.static
libcublas.dev
libcublas.lib
libcublas.static
cuda_cudart
cuda_cccl # <nv/target>
libcublas
];

rocmBuildInputs = with rocmPackages; [
Expand Down
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ models-mnt
!models/ggml-vocab-*.gguf*

# Zig

zig-out/
zig-cache/

Expand Down
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- Consider allowing write access to your branch for faster review
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments

# Pull requests (for collaborators)
Expand Down
67 changes: 37 additions & 30 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ BUILD_TARGETS = \
llama-imatrix \
llama-infill \
llama-llava-cli \
llama-minicpmv-cli\
llama-lookahead \
llama-lookup \
llama-lookup-create \
Expand Down Expand Up @@ -888,15 +889,16 @@ ggml/src/ggml-metal-embed.o: \
ggml/src/ggml-common.h
@echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY}
endif
endif # GGML_METAL

Expand Down Expand Up @@ -1205,6 +1207,7 @@ clean:
rm -rvf ggml/*.dll
rm -rvf ggml/*.so
rm -vrf ggml/src/*.o
rm -rvf ggml/src/llamafile/*.o
rm -rvf common/build-info.cpp
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
Expand Down Expand Up @@ -1451,15 +1454,20 @@ libllava.a: examples/llava/llava.cpp \
$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

llama-llava-cli: examples/llava/llava-cli.cpp \
examples/llava/clip.h \
examples/llava/clip.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
Expand Down Expand Up @@ -1605,42 +1613,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server

# Define the object file target
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@

# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these target from building all the time.
main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
main: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."

server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/deprecation-warning/deprecation-warning.o
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."

quantize: examples/deprecation-warning/deprecation-warning.cpp
quantize: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif

perplexity: examples/deprecation-warning/deprecation-warning.cpp
perplexity: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif

embedding: examples/deprecation-warning/deprecation-warning.cpp
embedding: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
- [x] [Smaug](https://huggingface.co/models?search=Smaug)
- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

Expand Down Expand Up @@ -145,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
Expand Down
Loading

0 comments on commit d911cd1

Please sign in to comment.