Skip to content

Commit

Permalink
[Tests] Fix SkyServe Smoke Test (#4566)
Browse files Browse the repository at this point in the history
* [Tests] Fix SkyServe Smoke Test

* fix llm

* comment

* add wait provisioning in _SERVE_STATUS_WAIT

* fix

* only apply waiting for _check_replica_in_status

* fix `-` for endpoint output

* increase timeout

* increase initial delay
  • Loading branch information
cblmemo authored Jan 17, 2025
1 parent 9e1b4dd commit 6b23582
Show file tree
Hide file tree
Showing 12 changed files with 61 additions and 18 deletions.
2 changes: 1 addition & 1 deletion tests/skyserve/auto_restart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replicas: 1


Expand Down
5 changes: 4 additions & 1 deletion tests/skyserve/llm/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ setup: |
fi
# Install dependencies
pip install "fschat[model_worker,webui]==0.2.24"
# TODO(tian): Pinning transformers<4.48.0 is a temporary workaround for a
# breaking change in transformers 4.48.0. Update to the latest version when
# the issue is fixed. Ref: https://github.com/huggingface/transformers/issues/35639
pip install "fschat[model_worker,webui]==0.2.24" "transformers<4.48.0"
pip install sentencepiece protobuf
run: |
Expand Down
2 changes: 1 addition & 1 deletion tests/skyserve/restart/user_bug.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replicas: 1


Expand Down
3 changes: 2 additions & 1 deletion tests/skyserve/spot/base_ondemand_fallback.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ resources:
cpus: 2+
use_spot: true

workdir: examples/serve/http_server
setup: |
wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py
# Use port 8080 to test that the jupyter service is terminated
run: python3 server.py --port 8080
2 changes: 1 addition & 1 deletion tests/skyserve/update/bump_version_after.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ service:
replicas: 3

resources:
ports: 8080
ports: 8081
cpus: 2+

setup: |
Expand Down
2 changes: 1 addition & 1 deletion tests/skyserve/update/bump_version_before.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ service:
replicas: 2

resources:
ports: 8080
ports: 8081
cpus: 2+

setup: |
Expand Down
3 changes: 2 additions & 1 deletion tests/skyserve/update/new_autoscaler_after.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ resources:
use_spot: true
cpus: 2+

workdir: examples/serve/http_server
setup: |
wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py
run: |
if [ $SKYPILOT_SERVE_REPLICA_ID -eq 7 ]; then
Expand Down
5 changes: 3 additions & 2 deletions tests/skyserve/update/new_autoscaler_before.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replicas: 2

resources:
ports: 8081
cpus: 2+

workdir: examples/serve/http_server
setup: |
wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py
run: python3 server.py --port 8081
2 changes: 1 addition & 1 deletion tests/skyserve/update/num_min_one.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replica_policy:
min_replicas: 1

Expand Down
2 changes: 1 addition & 1 deletion tests/skyserve/update/num_min_two.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replica_policy:
min_replicas: 2

Expand Down
2 changes: 1 addition & 1 deletion tests/skyserve/update/old.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replicas: 2
load_balancing_policy: round_robin

Expand Down
49 changes: 43 additions & 6 deletions tests/smoke_tests/test_sky_serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,50 @@ def _get_service_name() -> str:
_SERVE_ENDPOINT_WAIT = (
'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; '
'endpoint=$(sky serve status --endpoint {name}); '
'until ! echo "$endpoint" | grep "Controller is initializing"; '
'until ! echo "$endpoint" | grep -qE "Controller is initializing|^-$"; '
'do echo "Waiting for serve endpoint to be ready..."; '
'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; '
'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"')

_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); '
'until ! echo "$s" | grep "Controller is initializing."; '
'do echo "Waiting for serve status to be ready..."; '
'sleep 5; s=$(sky serve status {name}); done; echo "$s"')
# Shell snippet template: stores the output of `sky serve status {name}` in
# the shell variable `$s`, re-polls every 5 seconds while that output still
# contains "Controller is initializing.", and finally prints the status.
# `{name}` is substituted via `str.format(name=...)`. `$s` stays set, so
# snippets appended after this one can reuse it.
_SERVE_STATUS_WAIT = (
's=$(sky serve status {name}); '
# Wait for "Controller is initializing." to disappear
'until ! echo "$s" | grep "Controller is initializing."; '
'do '
' echo "Waiting for serve status to be ready..."; '
' sleep 5; '
' s=$(sky serve status {name}); '
'done; '
'echo "$s"')

_WAIT_PROVISION_REPR = (
# Shell snippet template that waits until every PROVISIONING row in the
# `sky serve status` output also shows `vCPU=2`. It reads the shell variable
# `$s`, so it must run after a snippet that has already stored the status
# output there (e.g. `_SERVE_STATUS_WAIT`).
#
# This exists for `_check_replica_in_status`, which counts occurrences of
# `vCPU=2` in the `sky serve status` output to determine the number of
# replicas. A replica in the PROVISIONING state may have a resource repr of
# `-`, because its `launched_resources` has not been decided yet, and that
# would make the replica count wrong. To avoid this, we wait until the number
# of PROVISIONING rows containing `vCPU=2` equals the total number of
# PROVISIONING rows.
# NOTE(tian): This assumes the replica will not fail over, since the
# requested resources are only 2 vCPUs and are likely to be immediately
# available in every region. If a replica does go through failover, this
# assumption does not hold.
# Check #4565 for more information.
'num_provisioning=$(echo "$s" | grep "PROVISIONING" | wc -l); '
'num_vcpu_in_provision=$(echo "$s" | grep "PROVISIONING" | grep "vCPU=2" | wc -l); '
'until [ "$num_provisioning" -eq "$num_vcpu_in_provision" ]; '
'do '
' echo "Waiting for provisioning resource repr ready..."; '
' echo "PROVISIONING: $num_provisioning, vCPU: $num_vcpu_in_provision"; '
' sleep 2; '
# Refresh the status and recompute both counts before re-testing.
' s=$(sky serve status {name}); '
' num_provisioning=$(echo "$s" | grep "PROVISIONING" | wc -l); '
' num_vcpu_in_provision=$(echo "$s" | grep "PROVISIONING" | grep "vCPU=2" | wc -l); '
'done; '
# The loop exits once the two counts match, i.e. every PROVISIONING row now
# shows `vCPU=2` (this also holds when there are no PROVISIONING rows).
'echo "Provisioning complete. PROVISIONING: $num_provisioning, vCPU=2: $num_vcpu_in_provision"'
)


def _get_replica_ip(name: str, replica_id: int) -> str:
Expand Down Expand Up @@ -141,7 +176,9 @@ def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool,
resource_str = f'({spot_str}vCPU=2)'
check_cmd += (f' echo "$s" | grep "{resource_str}" | '
f'grep "{status}" | wc -l | grep {count} || exit 1;')
return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd)
return (f'{_SERVE_STATUS_WAIT.format(name=name)}; '
f'{_WAIT_PROVISION_REPR.format(name=name)}; '
f'echo "$s"; {check_cmd}')


def _check_service_version(service_name: str, version: str) -> str:
Expand Down

0 comments on commit 6b23582

Please sign in to comment.