From 6b23582d9f2285af2816999185947b6ce0df2c81 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 17 Jan 2025 15:22:48 -0800 Subject: [PATCH] [Tests] Fix SkyServe Smoke Test (#4566) * [Tests] Fix SkyServe Smoke Test * fix llm * comment * add wait provisioning in _SERVE_STATUS_WAIT * fix * only apply waiting for _check_replica_in_status * fix `-` for endpoint output * increase timeout * increase initial delay --- tests/skyserve/auto_restart.yaml | 2 +- tests/skyserve/llm/service.yaml | 5 +- tests/skyserve/restart/user_bug.yaml | 2 +- .../skyserve/spot/base_ondemand_fallback.yaml | 3 +- tests/skyserve/update/bump_version_after.yaml | 2 +- .../skyserve/update/bump_version_before.yaml | 2 +- .../skyserve/update/new_autoscaler_after.yaml | 3 +- .../update/new_autoscaler_before.yaml | 5 +- tests/skyserve/update/num_min_one.yaml | 2 +- tests/skyserve/update/num_min_two.yaml | 2 +- tests/skyserve/update/old.yaml | 2 +- tests/smoke_tests/test_sky_serve.py | 49 ++++++++++++++++--- 12 files changed, 61 insertions(+), 18 deletions(-) diff --git a/tests/skyserve/auto_restart.yaml b/tests/skyserve/auto_restart.yaml index 2a3a31051b9..055fd418f1b 100644 --- a/tests/skyserve/auto_restart.yaml +++ b/tests/skyserve/auto_restart.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 60 replicas: 1 diff --git a/tests/skyserve/llm/service.yaml b/tests/skyserve/llm/service.yaml index 48160e8f3db..dde5c9313b0 100644 --- a/tests/skyserve/llm/service.yaml +++ b/tests/skyserve/llm/service.yaml @@ -28,7 +28,10 @@ setup: | fi # Install dependencies - pip install "fschat[model_worker,webui]==0.2.24" + # TODO(tian): transformers<4.48.0 is a temporary solution for breaking + # change in transformers 4.48.0. Update to latest version when the issue + # is fixed. Ref: https://github.com/huggingface/transformers/issues/35639 + pip install "fschat[model_worker,webui]==0.2.24" "transformers<4.48.0" pip install sentencepiece protobuf run: | diff --git a/tests/skyserve/restart/user_bug.yaml b/tests/skyserve/restart/user_bug.yaml index 959e725d23d..0fc1fabfd54 100644 --- a/tests/skyserve/restart/user_bug.yaml +++ b/tests/skyserve/restart/user_bug.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 60 replicas: 1 diff --git a/tests/skyserve/spot/base_ondemand_fallback.yaml b/tests/skyserve/spot/base_ondemand_fallback.yaml index bee946dedee..7e53305b256 100644 --- a/tests/skyserve/spot/base_ondemand_fallback.yaml +++ b/tests/skyserve/spot/base_ondemand_fallback.yaml @@ -14,7 +14,8 @@ resources: cpus: 2+ use_spot: true -workdir: examples/serve/http_server +setup: | + wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py # Use 8080 to test jupyter service is terminated run: python3 server.py --port 8080 diff --git a/tests/skyserve/update/bump_version_after.yaml b/tests/skyserve/update/bump_version_after.yaml index 0f2c6925bc6..6e845f54b9e 100644 --- a/tests/skyserve/update/bump_version_after.yaml +++ b/tests/skyserve/update/bump_version_after.yaml @@ -16,7 +16,7 @@ service: replicas: 3 resources: - ports: 8080 + ports: 8081 cpus: 2+ setup: | diff --git a/tests/skyserve/update/bump_version_before.yaml b/tests/skyserve/update/bump_version_before.yaml index de922b66434..c9fd957e41a 100644 --- a/tests/skyserve/update/bump_version_before.yaml +++ b/tests/skyserve/update/bump_version_before.yaml @@ -16,7 +16,7 @@ service: replicas: 2 resources: - ports: 8080 + ports: 8081 cpus: 2+ setup: | diff --git a/tests/skyserve/update/new_autoscaler_after.yaml b/tests/skyserve/update/new_autoscaler_after.yaml index 2d12d3ef109..2fa5fab82db 100644 --- a/tests/skyserve/update/new_autoscaler_after.yaml +++ b/tests/skyserve/update/new_autoscaler_after.yaml @@ -12,7 +12,8 @@ resources: use_spot: true cpus: 2+ -workdir: examples/serve/http_server +setup: | + wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py run: | if [ $SKYPILOT_SERVE_REPLICA_ID -eq 7 ]; then diff --git a/tests/skyserve/update/new_autoscaler_before.yaml b/tests/skyserve/update/new_autoscaler_before.yaml index 793221080ae..19124f680f5 100644 --- a/tests/skyserve/update/new_autoscaler_before.yaml +++ b/tests/skyserve/update/new_autoscaler_before.yaml @@ -1,13 +1,14 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 60 replicas: 2 resources: ports: 8081 cpus: 2+ -workdir: examples/serve/http_server +setup: | + wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py run: python3 server.py --port 8081 diff --git a/tests/skyserve/update/num_min_one.yaml b/tests/skyserve/update/num_min_one.yaml index 12d47c5bff1..77eea55df92 100644 --- a/tests/skyserve/update/num_min_one.yaml +++ b/tests/skyserve/update/num_min_one.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 60 replica_policy: min_replicas: 1 diff --git a/tests/skyserve/update/num_min_two.yaml b/tests/skyserve/update/num_min_two.yaml index ebf52bd768e..a42584f4978 100644 --- a/tests/skyserve/update/num_min_two.yaml +++ b/tests/skyserve/update/num_min_two.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 60 replica_policy: min_replicas: 2 diff --git a/tests/skyserve/update/old.yaml b/tests/skyserve/update/old.yaml index 4cb19b8327b..b4a98a295fe 100644 --- a/tests/skyserve/update/old.yaml +++ b/tests/skyserve/update/old.yaml @@ -1,7 +1,7 @@ service: readiness_probe: path: /health - initial_delay_seconds: 20 + initial_delay_seconds: 60 replicas: 2 load_balancing_policy: round_robin diff --git a/tests/smoke_tests/test_sky_serve.py b/tests/smoke_tests/test_sky_serve.py index 3ba36d8a092..cb722ef52e8 100644 --- a/tests/smoke_tests/test_sky_serve.py +++ b/tests/smoke_tests/test_sky_serve.py @@ -84,15 +84,50 @@ def _get_service_name() -> str: _SERVE_ENDPOINT_WAIT = ( 'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; ' 'endpoint=$(sky serve status --endpoint {name}); ' - 'until ! echo "$endpoint" | grep "Controller is initializing"; ' + 'until ! echo "$endpoint" | grep -qE "Controller is initializing|^-$"; ' 'do echo "Waiting for serve endpoint to be ready..."; ' 'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; ' 'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"') -_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); ' - 'until ! echo "$s" | grep "Controller is initializing."; ' - 'do echo "Waiting for serve status to be ready..."; ' - 'sleep 5; s=$(sky serve status {name}); done; echo "$s"') +_SERVE_STATUS_WAIT = ( + 's=$(sky serve status {name}); ' + # Wait for "Controller is initializing." to disappear + 'until ! echo "$s" | grep "Controller is initializing."; ' + 'do ' + ' echo "Waiting for serve status to be ready..."; ' + ' sleep 5; ' + ' s=$(sky serve status {name}); ' + 'done; ' + 'echo "$s"') + +_WAIT_PROVISION_REPR = ( + # Once controller is ready, check provisioning vs. vCPU=2. This is for + # the `_check_replica_in_status`, which will check number of `vCPU=2` in the + # `sky serve status` output and use that to suggest the number of replicas. + # However, replicas in provisioning state is possible to have a repr of `-`, + # since the desired `launched_resources` is not decided yet. This would + # cause an error when counting desired number of replicas. We wait for the + # representation of `vCPU=2` the same with number of provisioning replicas + # to avoid this error. + # NOTE(tian): This assumes the replica will not do failover, as the + # requested resources is only 2 vCPU and likely to be immediately available + # on every region, hence no failover. If the replica will go through + # failover + # Check #4565 for more information. + 'num_provisioning=$(echo "$s" | grep "PROVISIONING" | wc -l); ' + 'num_vcpu_in_provision=$(echo "$s" | grep "PROVISIONING" | grep "vCPU=2" | wc -l); ' + 'until [ "$num_provisioning" -eq "$num_vcpu_in_provision" ]; ' + 'do ' + ' echo "Waiting for provisioning resource repr ready..."; ' + ' echo "PROVISIONING: $num_provisioning, vCPU: $num_vcpu_in_provision"; ' + ' sleep 2; ' + ' s=$(sky serve status {name}); ' + ' num_provisioning=$(echo "$s" | grep "PROVISIONING" | wc -l); ' + ' num_vcpu_in_provision=$(echo "$s" | grep "PROVISIONING" | grep "vCPU=2" | wc -l); ' + 'done; ' + # Provisioning is complete + 'echo "Provisioning complete. PROVISIONING: $num_provisioning, vCPU=2: $num_vcpu_in_provision"' +) def _get_replica_ip(name: str, replica_id: int) -> str: @@ -141,7 +176,9 @@ def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool, resource_str = f'({spot_str}vCPU=2)' check_cmd += (f' echo "$s" | grep "{resource_str}" | ' f'grep "{status}" | wc -l | grep {count} || exit 1;') - return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd) + return (f'{_SERVE_STATUS_WAIT.format(name=name)}; ' + f'{_WAIT_PROVISION_REPR.format(name=name)}; ' + f'echo "$s"; {check_cmd}') def _check_service_version(service_name: str, version: str) -> str: