diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index f92ab3fcf18a1..003e93282212a 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -206,7 +206,7 @@ COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    python setup.py bdist_wheel --dist-dir=dist
+    VLLM_USE_PRECOMPILED=1 python3 setup.py bdist_wheel --dist-dir=dist
 
 #################### FLASH_ATTENTION Build IMAGE ####################
 FROM dev as flash-attn-builder
@@ -258,7 +258,7 @@ RUN pip install \
     --no-binary="all" \
     --no-cache-dir \
     "vllm-nccl-cu12==2.18.1.0.4.0" && \
-    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/
+    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/
 
 # Install flash attention (from pre-built wheel)
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
@@ -277,7 +277,7 @@ ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     GRPC_PORT=8033 \
     HOME=/home/vllm \
-    VLLM_NCCL_SO_PATH=/opt/vllm/libnccl.so.2.18.1 \
+    VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
     VLLM_USAGE_SOURCE=production-docker-image
 
 # setup non-root user for OpenShift
diff --git a/vllm/tgis_utils/metrics.py b/vllm/tgis_utils/metrics.py
index 8867158c33dbf..d05e1d92b688a 100644
--- a/vllm/tgis_utils/metrics.py
+++ b/vllm/tgis_utils/metrics.py
@@ -71,7 +71,7 @@ def observe_queue_time(self, engine_output: RequestOutput):
             engine_output.metrics.time_in_queue)
 
     def count_request_failure(self, reason: FailureReasonLabel):
-        self.tgi_request_failure.labels({"err": reason}).inc(1)
+        self.tgi_request_failure.labels(err=reason).inc(1)
 
 
 class TGISStatLogger(StatLogger):
@@ -118,17 +118,13 @@ def log(self, stats: Stats) -> None:
         # Then log TGIS specific ones
         self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
         self.tgi_batch_current_size.set(stats.num_running_sys)
-        self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
-        self.tgi_batch_current_size.set(stats.num_running_sys)
 
         for ttft in stats.time_to_first_tokens_iter:
             self.tgi_batch_inference_duration.labels(
-                {"method": "prefill"}
-            ).observe(ttft)
+                method="prefill").observe(ttft)
         for tpot in stats.time_per_output_tokens_iter:
             self.tgi_batch_inference_duration.labels(
-                {"method": "next_token"}
-            ).observe(tpot)
+                method="next_token").observe(tpot)
 
         for input_len in stats.num_prompt_tokens_requests:
             self.tgi_request_input_length.observe(input_len)
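
Note on the metrics.py changes: with prometheus_client, a dict passed
positionally to labels() is treated as the *value* of the first declared
label rather than as a name-to-value mapping, so the exported label value
becomes the stringified dict. Keyword arguments are matched against the
declared label names, which is what this patch switches to. A minimal
sketch of the difference, calling prometheus_client directly (the metric
and label names below are illustrative, not necessarily those registered
in metrics.py):

    from prometheus_client import Counter

    failures = Counter("request_failure", "Failed requests", ["err"])

    # Buggy form: the dict is stringified into the label value, so the
    # series is exported with err="{'err': 'CONN_ERROR'}".
    failures.labels({"err": "CONN_ERROR"}).inc(1)

    # Fixed form: the keyword is matched against the declared label name,
    # exporting err="CONN_ERROR" as intended.
    failures.labels(err="CONN_ERROR").inc(1)

The same applies to the tgi_batch_inference_duration histogram: passing
{"method": "prefill"} positionally would mislabel the series, while
method="prefill" records observations under the intended label value.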