diff --git a/Dockerfile b/Dockerfile
index 15796446473b3..4d4b0b430e9db 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -123,6 +123,11 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     --mount=type=cache,target=/root/.cache/pip \
     pip install dist/*.whl --verbose

+# UPSTREAM SYNC: Install sparsity extras
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple
+
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
     --mount=type=cache,target=/root/.cache/pip \
     pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
@@ -135,10 +140,6 @@ FROM vllm-base AS test

 ADD . /vllm-workspace/

-# UPSTREAM SYNC: Install sparsity extras
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip3 install nm-magic-wand-nightly
-
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
diff --git a/README.md b/README.md
index 10473d910c554..d8d5c4ef1d70e 100644
--- a/README.md
+++ b/README.md
@@ -12,12 +12,12 @@ The [nm-vllm PyPi package](https://pypi.org/project/nm-vllm/) includes pre-compi
 Install it using pip:

 ```bash
-pip install nm-vllm
+pip install nm-vllm --extra-index-url https://pypi.neuralmagic.com/simple
 ```

 For utilizing weight-sparsity kernels, such as through `sparsity="sparse_w16a16"`, you can extend the installation with the `sparsity` extras:

 ```bash
-pip install nm-vllm[sparse]
+pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple
 ```

 You can also build and install `nm-vllm` from source (this will take ~10 minutes):
diff --git a/examples-neuralmagic/deploy_compressed_huggingface_models/Deploy_Compressed_LLMs_from_Hugging_Face_with_nm_vllm.ipynb b/examples-neuralmagic/deploy_compressed_huggingface_models/Deploy_Compressed_LLMs_from_Hugging_Face_with_nm_vllm.ipynb
index a2ed6f485c23e..04f16768ca194 100644
--- a/examples-neuralmagic/deploy_compressed_huggingface_models/Deploy_Compressed_LLMs_from_Hugging_Face_with_nm_vllm.ipynb
+++ b/examples-neuralmagic/deploy_compressed_huggingface_models/Deploy_Compressed_LLMs_from_Hugging_Face_with_nm_vllm.ipynb
@@ -5386,7 +5386,7 @@
         }
       ],
       "source": [
-        "!pip install nm-vllm[sparse]"
+        "!pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple"
       ]
     },
     {
@@ -6219,4 +6219,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/examples-neuralmagic/marlin_quantization_and_deploy/Performantly_Quantize_LLMs_to_4_bits_with_Marlin_and_nm_vllm.ipynb b/examples-neuralmagic/marlin_quantization_and_deploy/Performantly_Quantize_LLMs_to_4_bits_with_Marlin_and_nm_vllm.ipynb
index 3107fae0a4c11..e973c6e4aaa58 100644
--- a/examples-neuralmagic/marlin_quantization_and_deploy/Performantly_Quantize_LLMs_to_4_bits_with_Marlin_and_nm_vllm.ipynb
+++ b/examples-neuralmagic/marlin_quantization_and_deploy/Performantly_Quantize_LLMs_to_4_bits_with_Marlin_and_nm_vllm.ipynb
@@ -1506,7 +1506,7 @@
         }
       ],
       "source": [
-        "!pip install nm-vllm"
+        "!pip install nm-vllm --extra-index-url https://pypi.neuralmagic.com/simple"
       ]
     },
     {
diff --git a/examples-neuralmagic/sparsegpt_compress_and_deploy/Apply_SparseGPT_to_LLMs_and_deploy_with_nm_vllm.ipynb b/examples-neuralmagic/sparsegpt_compress_and_deploy/Apply_SparseGPT_to_LLMs_and_deploy_with_nm_vllm.ipynb
index cd7fdde9ce11d..c68f2812b7bee 100644
--- a/examples-neuralmagic/sparsegpt_compress_and_deploy/Apply_SparseGPT_to_LLMs_and_deploy_with_nm_vllm.ipynb
+++ b/examples-neuralmagic/sparsegpt_compress_and_deploy/Apply_SparseGPT_to_LLMs_and_deploy_with_nm_vllm.ipynb
@@ -7899,7 +7899,7 @@
     {
       "cell_type": "code",
       "source": [
-        "!pip install nm-vllm[sparse]"
+        "!pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple"
       ],
       "metadata": {
         "id": "Sz2Cs4BtPa7_"
       },
@@ -8062,4 +8062,4 @@
       "outputs": []
     }
   ]
-}
\ No newline at end of file
+}