diff --git a/README.md b/README.md
index 54d208fab..d97a14c0e 100644
--- a/README.md
+++ b/README.md
@@ -146,18 +146,17 @@ Within the deployment specification, locate and modify the command field.
 
 #### Original
 ```sh
-accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
+accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --torch_dtype bfloat16
 ```
 
 #### Modify to enable 4-bit Quantization
 ```sh
-accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16 --load_in_4bit
+accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --torch_dtype bfloat16 --load_in_4bit
 ```
 
 Currently, we allow users to change the following parameters manually:
 
-- `pipeline`: For text-generation models this can be either `text-generation` or `conversational`.
 - `load_in_4bit` or `load_in_8bit`: Model quantization resolution.
 
 Should you need to customize other parameters, kindly file an issue for potential future inclusion.
 
diff --git a/docs/custom-model-integration/custom-deployment-template.yaml b/docs/custom-model-integration/custom-deployment-template.yaml
index fe9c2c4ad..c369c9cdb 100644
--- a/docs/custom-model-integration/custom-deployment-template.yaml
+++ b/docs/custom-model-integration/custom-deployment-template.yaml
@@ -23,8 +23,6 @@ inference:
         - "--gpu_ids"
         - "all"
         - "tfs/inference_api.py"
-        - "--pipeline"
-        - "text-generation"
         - "--torch_dtype"
         - "float16" # Set to "float16" for compatibility with V100 GPUs; use "bfloat16" for A100, H100 or newer GPUs
       volumeMounts:
diff --git a/docs/custom-model-integration/reference-image-deployment.yaml b/docs/custom-model-integration/reference-image-deployment.yaml
index 3a77dba08..36d518638 100644
--- a/docs/custom-model-integration/reference-image-deployment.yaml
+++ b/docs/custom-model-integration/reference-image-deployment.yaml
@@ -23,8 +23,6 @@ inference:
         - "--gpu_ids"
         - "all"
         - "inference_api.py"
-        - "--pipeline"
-        - "text-generation"
         - "--trust_remote_code"
         - "--allow_remote_files"
         - "--pretrained_model_name_or_path"
diff --git a/presets/workspace/models/falcon/model.go b/presets/workspace/models/falcon/model.go
index 34aac7824..5cf07e221 100644
--- a/presets/workspace/models/falcon/model.go
+++ b/presets/workspace/models/falcon/model.go
@@ -48,7 +48,6 @@ var (
     baseCommandPresetFalconTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch"
     falconRunParams = map[string]string{
         "torch_dtype": "bfloat16",
-        "pipeline": "text-generation",
         "chat_template": "/workspace/chat_templates/falcon-instruct.jinja",
     }
     falconRunParamsVLLM = map[string]string{
diff --git a/presets/workspace/models/mistral/model.go b/presets/workspace/models/mistral/model.go
index b3b8497f0..54b2604e6 100644
--- a/presets/workspace/models/mistral/model.go
+++ b/presets/workspace/models/mistral/model.go
@@ -35,7 +35,6 @@ var (
     baseCommandPresetMistralTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch"
     mistralRunParams = map[string]string{
         "torch_dtype": "bfloat16",
-        "pipeline": "text-generation",
         "chat_template": "/workspace/chat_templates/mistral-instruct.jinja",
     }
     mistralRunParamsVLLM = map[string]string{
diff --git a/presets/workspace/models/phi2/model.go b/presets/workspace/models/phi2/model.go
index bb7989df9..8afa66f19 100644
--- a/presets/workspace/models/phi2/model.go
+++ b/presets/workspace/models/phi2/model.go
@@ -29,7 +29,6 @@ var (
     baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch"
     phiRunParams = map[string]string{
         "torch_dtype": "float16",
-        "pipeline": "text-generation",
     }
     phiRunParamsVLLM = map[string]string{
         "dtype": "float16",
diff --git a/presets/workspace/models/phi3/model.go b/presets/workspace/models/phi3/model.go
index c8c40e4d1..84eb4d544 100644
--- a/presets/workspace/models/phi3/model.go
+++ b/presets/workspace/models/phi3/model.go
@@ -53,7 +53,6 @@ var (
     baseCommandPresetPhiTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch"
     phiRunParams = map[string]string{
         "torch_dtype": "auto",
-        "pipeline": "text-generation",
         "trust_remote_code": "",
     }
     phiRunParamsVLLM = map[string]string{
diff --git a/presets/workspace/models/qwen/model.go b/presets/workspace/models/qwen/model.go
index 20a09df74..f03dfdde0 100644
--- a/presets/workspace/models/qwen/model.go
+++ b/presets/workspace/models/qwen/model.go
@@ -29,7 +29,6 @@ var (
    baseCommandPresetQwenTuning = "cd /workspace/tfs/ && python3 metrics_server.py & accelerate launch"
    qwenRunParams = map[string]string{
        "torch_dtype": "bfloat16",
-       "pipeline": "text-generation",
    }
    qwenRunParamsVLLM = map[string]string{
        "dtype": "float16",
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
index a44043894..25d8cd96a 100644
--- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
@@ -19,7 +19,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 2
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
index 514d12e60..446d0b00c 100644
--- a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
+++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
@@ -19,7 +19,7 @@ spec:
       command:
         - /bin/sh
        - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 2
diff --git a/presets/workspace/test/manifests/falcon-7b-adapter/falcon-7b-adapter.yaml b/presets/workspace/test/manifests/falcon-7b-adapter/falcon-7b-adapter.yaml
index c48a1c2cf..7a02f0633 100644
--- a/presets/workspace/test/manifests/falcon-7b-adapter/falcon-7b-adapter.yaml
+++ b/presets/workspace/test/manifests/falcon-7b-adapter/falcon-7b-adapter.yaml
@@ -30,7 +30,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 2
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml
index 1b2092b36..02b3bbb86 100644
--- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml
@@ -19,7 +19,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml
index 56a775fff..36a97b2a8 100644
--- a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml
+++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml
@@ -19,7 +19,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml
index 75179683f..9980fcf42 100644
--- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml
index 3eff5594f..7b810a353 100644
--- a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml
+++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml
index cbc6f94e7..9d382d96d 100644
--- a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml
+++ b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype bfloat16
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml
index 0adb122e4..5be9d124f 100644
--- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml
index 1d0d64e47..800b80886 100644
--- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml
index cf8898015..5f3759534 100644
--- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml
index 1d7069a38..fb7619d75 100644
--- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml
index 1827155f4..680d4efbe 100644
--- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml
+++ b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml
index 1f515cc6a..c693b97af 100644
--- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml
+++ b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 1
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
index e92d906d7..81f096b05 100644
--- a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
@@ -18,7 +18,7 @@ spec:
       command:
         - /bin/sh
         - -c
-        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --torch_dtype auto --trust_remote_code
       resources:
         requests:
           nvidia.com/gpu: 2
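
For context on the flags this patch leaves in place: below is a minimal, hypothetical sketch of how `--torch_dtype` and the `--load_in_4bit`/`--load_in_8bit` quantization flags are typically consumed on the Python side. It is not the repository's actual `inference_api.py`; only the flag names come from the commands above, and the default model path is an assumption. It illustrates why `--pipeline` can be dropped uniformly: once the serving mode is fixed, no pipeline argument needs parsing at all.

```python
# Hypothetical sketch, NOT the repo's inference_api.py: shows the usual way
# --torch_dtype and --load_in_4bit/--load_in_8bit map onto Hugging Face
# transformers loading options.
import argparse

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

parser = argparse.ArgumentParser()
# Assumed default path; the real script's default may differ.
parser.add_argument("--pretrained_model_name_or_path", default="/workspace/tfs/weights")
parser.add_argument("--torch_dtype", default="bfloat16")  # e.g. float16, bfloat16, auto
parser.add_argument("--load_in_4bit", action="store_true")
parser.add_argument("--load_in_8bit", action="store_true")
args, _ = parser.parse_known_args()  # tolerate flags this sketch doesn't model

# Quantization is optional; only build a config when a flag is set.
quant_config = None
if args.load_in_4bit:
    quant_config = BitsAndBytesConfig(load_in_4bit=True)
elif args.load_in_8bit:
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    args.pretrained_model_name_or_path,
    torch_dtype="auto" if args.torch_dtype == "auto" else getattr(torch, args.torch_dtype),
    quantization_config=quant_config,
)
```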