
Commit

Merge pull request #25 from z103cb/ibm_main_update_05162022
Ibm main update 2024-05-16
dtrifiro authored May 16, 2024
2 parents 059b81b + a72d13a commit 81954a7
Showing 2 changed files with 6 additions and 10 deletions.
Dockerfile.ubi: 6 changes (3 additions, 3 deletions)

@@ -206,7 +206,7 @@ COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    python setup.py bdist_wheel --dist-dir=dist
+    VLLM_USE_PRECOMPILED=1 python3 setup.py bdist_wheel --dist-dir=dist

 #################### FLASH_ATTENTION Build IMAGE ####################
 FROM dev as flash-attn-builder
@@ -258,7 +258,7 @@ RUN pip install \
     --no-binary="all" \
     --no-cache-dir \
     "vllm-nccl-cu12==2.18.1.0.4.0" && \
-    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/
+    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/

 # Install flash attention (from pre-built wheel)
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
@@ -277,7 +277,7 @@ ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     GRPC_PORT=8033 \
     HOME=/home/vllm \
-    VLLM_NCCL_SO_PATH=/opt/vllm/libnccl.so.2.18.1 \
+    VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
     VLLM_USAGE_SOURCE=production-docker-image

 # setup non-root user for OpenShift
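The two Dockerfile.ubi changes are related: the build stage now sets VLLM_USE_PRECOMPILED=1 when building the wheel, and the NCCL shared library is moved into /opt/vllm/lib/ so that the updated VLLM_NCCL_SO_PATH points at the file's real location. As a rough sketch (not the project's actual loader code), this is how a runtime component might consume that environment variable; the fallback path and the ctypes-based loading are assumptions for illustration only:

import ctypes
import os

# Resolve the NCCL shared object shipped in the image. The default mirrors
# the path set in Dockerfile.ubi above (assumed fallback, for illustration).
so_path = os.environ.get("VLLM_NCCL_SO_PATH",
                         "/opt/vllm/lib/libnccl.so.2.18.1")

if not os.path.isfile(so_path):
    # This is exactly the mismatch the PR fixes: the env var pointing at
    # /opt/vllm/ while the library actually lives under /opt/vllm/lib/.
    raise FileNotFoundError(f"NCCL library not found at {so_path}")

nccl = ctypes.CDLL(so_path)  # load the library for NCCL-backed communication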
vllm/tgis_utils/metrics.py: 10 changes (3 additions, 7 deletions)

@@ -71,7 +71,7 @@ def observe_queue_time(self, engine_output: RequestOutput):
             engine_output.metrics.time_in_queue)

     def count_request_failure(self, reason: FailureReasonLabel):
-        self.tgi_request_failure.labels({"err": reason}).inc(1)
+        self.tgi_request_failure.labels(err=reason).inc(1)


 class TGISStatLogger(StatLogger):
@@ -118,17 +118,13 @@ def log(self, stats: Stats) -> None:
         # Then log TGIS specific ones
         self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
         self.tgi_batch_current_size.set(stats.num_running_sys)
-        self.tgi_queue_size.set(stats.num_waiting_sys + stats.num_swapped_sys)
-        self.tgi_batch_current_size.set(stats.num_running_sys)

         for ttft in stats.time_to_first_tokens_iter:
             self.tgi_batch_inference_duration.labels(
-                {"method": "prefill"}
-            ).observe(ttft)
+                method="prefill").observe(ttft)
         for tpot in stats.time_per_output_tokens_iter:
             self.tgi_batch_inference_duration.labels(
-                {"method": "next_token"}
-            ).observe(tpot)
+                method="next_token").observe(tpot)

         for input_len in stats.num_prompt_tokens_requests:
             self.tgi_request_input_length.observe(input_len)
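Apart from dropping the duplicated gauge updates, the metrics.py edits all fix the same calling convention. With prometheus_client, labels() expects label values as keyword (or positional) arguments; a dict passed as a single positional argument would be stringified and recorded as the value of the first label rather than treated as a name/value mapping. A small self-contained sketch of the corrected pattern, assuming the tgi_* metrics are ordinary prometheus_client Counter/Histogram objects (metric and label names follow the diff; documentation strings and values are illustrative):

from prometheus_client import Counter, Histogram

# Illustrative stand-ins for the metrics defined in vllm/tgis_utils/metrics.py.
tgi_request_failure = Counter(
    "tgi_request_failure", "Count of failed requests", labelnames=["err"])
tgi_batch_inference_duration = Histogram(
    "tgi_batch_inference_duration", "Time spent per inference step",
    labelnames=["method"])

# Corrected form, as in the updated code: label values as keyword arguments.
tgi_request_failure.labels(err="GENERATE_ERROR").inc(1)  # placeholder reason
tgi_batch_inference_duration.labels(method="prefill").observe(0.120)
tgi_batch_inference_duration.labels(method="next_token").observe(0.045)

# The pre-PR form, labels({"err": reason}), would have recorded str(dict) as
# the label value instead of the failure reason itself.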
