feat: Use secrets to configure HF_TOKEN #22

Merged 1 commit on Jan 27, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -93,3 +93,4 @@ perf-test.py
/try

values-*.yaml
helm/examples
2 changes: 1 addition & 1 deletion helm/Chart.yaml
@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.0.1
version: 0.0.2

maintainers:
- name: apostac
7 changes: 7 additions & 0 deletions helm/templates/deployment-vllm-multi.yaml
@@ -65,6 +65,13 @@ spec:
          env:
          - name: HF_HOME
            value: /data
          {{- if $modelSpec.hf_token }}
          - name: HF_TOKEN
            valueFrom:
              secretKeyRef:
                name: {{ .Release.Name }}-secrets
                key: hf_token_{{ $modelSpec.name }}
          {{- end }}
          {{- with $modelSpec.env }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
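With this change, any model spec that sets `hf_token` gets the token injected as an `HF_TOKEN` environment variable sourced from the chart's Secret. As a rough sketch of the rendered container env, assuming a release named `vllm` and a model spec named `llama3` (illustrative names, not from this PR):

```yaml
# Hypothetical rendered output for release "vllm" and modelSpec "llama3"
env:
  - name: HF_HOME
    value: /data
  - name: HF_TOKEN
    valueFrom:
      secretKeyRef:
        name: vllm-secrets     # {{ .Release.Name }}-secrets
        key: hf_token_llama3   # hf_token_{{ $modelSpec.name }}
```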
14 changes: 14 additions & 0 deletions helm/templates/secrets.yaml
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Secret
metadata:
  name: "{{ .Release.Name }}-secrets"
  namespace: {{ .Release.Namespace }}
type: Opaque
data:
{{- range $modelSpec := .Values.servingEngineSpec.modelSpec }}
{{- with $ -}}
{{- if $modelSpec.hf_token }}
  hf_token_{{ $modelSpec.name }}: {{ $modelSpec.hf_token | b64enc | quote }}
{{- end }}
{{- end }}
{{- end }}
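For each model spec that defines `hf_token`, this template emits one base64-encoded key in a single shared Secret. A sketch of the rendered object, assuming a release named `vllm` in the `default` namespace with two model specs `llama3` and `mistral` whose tokens are the placeholder strings `hf_abc` and `hf_xyz` (all illustrative values):

```yaml
# Hypothetical rendered Secret; token values are placeholders, not real tokens
apiVersion: v1
kind: Secret
metadata:
  name: "vllm-secrets"
  namespace: default
type: Opaque
data:
  hf_token_llama3: aGZfYWJj    # b64enc of "hf_abc"
  hf_token_mistral: aGZfeHl6   # b64enc of "hf_xyz"
```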
3 changes: 3 additions & 0 deletions helm/values.schema.json
@@ -53,6 +53,9 @@
},
"required": ["enabled", "cpuOffloadingBufferSize"]
},
"hf_token": {
"type": "string"
},
"env": {
"type": "array",
"items": {
2 changes: 2 additions & 0 deletions helm/values.yaml
@@ -35,6 +35,8 @@ servingEngineSpec:
# - enabled: (optional, bool) Enable LMCache, e.g., true
# - cpuOffloadingBufferSize: (optional, string) The CPU offloading buffer size, e.g., "30"
#
# - hf_token: (optional, string) The Hugging Face token for this model
#
# - env: (optional, list) The environment variables to set in the container, e.g., your HF_TOKEN
#
# - nodeSelectorTerms: (optional, list) The node selector terms to match the nodes
12 changes: 5 additions & 7 deletions tutorials/02-basic-vllm-config.md
@@ -12,13 +12,13 @@ This tutorial guides you through the basic configurations required to deploy a v
## Prerequisites
- A Kubernetes environment with GPU support, as set up in the [00-install-kubernetes-env tutorial](00-install-kubernetes-env.md).
- Helm installed on your system.
- Access to a Hugging Face token (`HF_TOKEN`).
- Access to a Hugging Face token (`HF_TOKEN`).

## Step 1: Preparing the Configuration File

1. Locate the example configuration file `tutorials/assets/values-02-basic-config.yaml`.
2. Open the file and update the following fields:
- Replace `<USERS SHOULD PUT THEIR HF_TOKEN HERE>` with your actual Hugging Face token.
- Set `hf_token: <YOUR HF TOKEN>` in the YAML file to your actual Hugging Face token.

### Explanation of Key Items in `values-02-basic-config.yaml`

@@ -37,7 +37,8 @@ This tutorial guides you through the basic configurations required to deploy a v
- `maxModelLen`: The maximum sequence length the model can handle.
- `dtype`: Data type for computations, e.g., `bfloat16` for faster performance on modern GPUs.
- `extraArgs`: Additional arguments passed to the vLLM engine for fine-tuning behavior.
- **`env`**: Environment variables such as `HF_TOKEN` for authentication with Hugging Face.
- **`hf_token`**: The Hugging Face token used to authenticate with the Hugging Face Hub; the chart stores it in a Kubernetes Secret and injects it into the container as `HF_TOKEN`.
- **`env`**: Extra environment variables to pass to the model-serving engine.

### Example Snippet
```yaml
@@ -62,10 +63,7 @@ servingEngineSpec:
dtype: "bfloat16"
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]

env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN>

hf_token: <YOUR HF TOKEN>
```

## Step 2: Applying the Configuration
6 changes: 2 additions & 4 deletions tutorials/03-load-model-from-pv.md
@@ -84,14 +84,12 @@ servingEngineSpec:
vllmConfig:
maxModelLen: 4096

env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN>
hf_token: <YOUR HF TOKEN>
```

> **Explanation:** The `pvcMatchLabels` field specifies the labels to match an existing Persistent Volume. In this example, it ensures that the deployment uses the PV with the label `model: "llama3-pv"`. This provides a way to link a specific PV to your application.

> **Note:** Make sure to replace `<YOUR_HF_TOKEN>` with your actual Hugging Face token in the `env` section.
> **Note:** Make sure to replace `<YOUR HF TOKEN>` with your actual Hugging Face token in the YAML.
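For readers unfamiliar with the label matching, a minimal sketch of a PersistentVolume that this `pvcMatchLabels` would select is shown below; the name, capacity, and hostPath are assumptions for illustration, not part of this PR:

```yaml
# Hypothetical PV carrying the label referenced by pvcMatchLabels
apiVersion: v1
kind: PersistentVolume
metadata:
  name: llama3-pv
  labels:
    model: "llama3-pv"
spec:
  capacity:
    storage: 50Gi
  accessModes:
    - ReadWriteOnce
  hostPath:
    path: /data/llama3    # local path holding the downloaded model
```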

2. Deploy the Helm chart:

10 changes: 3 additions & 7 deletions tutorials/04-launch-multiple-model.md
@@ -36,9 +36,7 @@ servingEngineSpec:
pvcStorage: "50Gi"
vllmConfig:
maxModelLen: 4096
env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN_FOR_LLAMA3.1>
hf_token: <YOUR HF TOKEN FOR LLAMA 3.1>

- name: "mistral"
repository: "vllm/vllm-openai"
@@ -51,12 +49,10 @@ servingEngineSpec:
pvcStorage: "50Gi"
vllmConfig:
maxModelLen: 4096
env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN_FOR_MISTRAL>
hf_token: <YOUR HF TOKEN FOR MISTRAL>
```

> **Note:** Replace `<YOUR_HF_TOKEN_FOR_LLAMA3.1>` and `<YOUR_HF_TOKEN_FOR_MISTRAL>` with your Hugging Face tokens.
> **Note:** Replace `<YOUR HF TOKEN FOR LLAMA 3.1>` and `<YOUR HF TOKEN FOR MISTRAL>` with your Hugging Face tokens.


## Step 2: Deploying the Helm Chart
6 changes: 2 additions & 4 deletions tutorials/05-offload-kv-cache.md
@@ -44,12 +44,10 @@ servingEngineSpec:
enabled: true
cpuOffloadingBufferSize: "20"

env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN_HERE>
hf_token: <YOUR HF TOKEN>
```

> **Note:** Replace `<YOUR_HF_TOKEN_HERE>` with your actual Hugging Face token.
> **Note:** Replace `<YOUR HF TOKEN>` with your actual Hugging Face token.

The `lmcacheConfig` field enables LMCache and sets the CPU offloading buffer size to `20`GB. You can adjust this value based on your workload.

4 changes: 1 addition & 3 deletions tutorials/assets/values-02-basic-config.yaml
@@ -19,6 +19,4 @@ servingEngineSpec:
dtype: "bfloat16"
extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]

env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN>
hf_token: <YOUR HF TOKEN>
4 changes: 1 addition & 3 deletions tutorials/assets/values-03-match-pv.yaml
@@ -17,6 +17,4 @@ servingEngineSpec:
vllmConfig:
maxModelLen: 4096

env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN>
hf_token: <YOUR HF TOKEN>
8 changes: 2 additions & 6 deletions tutorials/assets/values-04-multiple-models.yaml
@@ -11,9 +11,7 @@ servingEngineSpec:
pvcStorage: "50Gi"
vllmConfig:
maxModelLen: 4096
env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN_FOR_LLAMA3.1>
hf_token: <YOUR HF TOKEN FOR LLAMA3.1>

- name: "mistral"
repository: "vllm/vllm-openai"
@@ -26,6 +24,4 @@ servingEngineSpec:
pvcStorage: "50Gi"
vllmConfig:
maxModelLen: 4096
env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN_FOR_MISTRAL>
hf_token: <YOUR HF TOKEN FOR MISTRAL>
4 changes: 1 addition & 3 deletions tutorials/assets/values-05-cpu-offloading.yaml
@@ -18,6 +18,4 @@ servingEngineSpec:
enabled: true
cpuOffloadingBufferSize: "20"

env:
- name: HF_TOKEN
value: <YOUR_HF_TOKEN_HERE>
hf_token: <YOUR HF TOKEN>