Manikandan-Thangaraj-ZS0321 · Manikandan-Thangaraj-ZS0321 · Aug 28, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
@@ -75,6 +75,7 @@ docker run \
         --network host \
         --shm-size=16gb \
         --rm \
+        -e HIP_VISIBLE_DEVICES=0 \
         -e HF_TOKEN \
         -v ${HF_CACHE}:${HF_MOUNT} \
         -e HF_HOME=${HF_MOUNT} \

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -233,12 +233,13 @@ steps:
   parallelism: 4
 
 - label: Tensorizer Test # 11min
+  mirror_hardwares: [amd]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
   commands:
-    - apt-get install -y curl libsodium23
+    - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s tensorizer_loader
 

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
@@ -35,7 +35,6 @@ jobs:
         mypy
         mypy tests --follow-imports skip
         mypy vllm/attention --follow-imports skip
-        mypy vllm/core --follow-imports skip
         mypy vllm/distributed --follow-imports skip
         mypy vllm/engine  --follow-imports skip
         mypy vllm/executor --follow-imports skip

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -296,6 +296,11 @@ set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/topk_softmax_kernels.cu")
 
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_MOE_EXT_SRC
+      "csrc/moe/marlin_moe_ops.cu")
+endif()
+
 define_gpu_extension_target(
   _moe_C
   DESTINATION vllm

diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
@@ -6,7 +6,7 @@ TOKENS=$2
 
 docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
            -v $PWD/data:/data \
-           ghcr.io/huggingface/text-generation-inference:1.4.0 \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
            --model-id $MODEL \
            --sharded false  \
            --max-input-length 1024 \

diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
@@ -387,7 +387,8 @@ class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType {
   // This needs to be implemented and throw a TypeError in order for
   // PyTorch's opcheck to work on ops that use ScalarTypes.
   int64_t len() const {
-    throw c10::TypeError("__len__ not implemented");
+    throw c10::TypeError({__func__, __FILE__, static_cast<uint32_t>(__LINE__)},
+                         "__len__ not implemented");
     return 0;
   }