Fix broken sanity tests + Add skip wait option in test teardown in model serving (red-hat-data-services#1157)

* fix canary test

* fix concurrency test clean up

* skip project deletion wait

* update serving platform names

* remove unused browser session

* update sr platform labels

* fix deploy form wait

* try to fix MM inference tests

* fix metrics test regression from other PRs

* add note about intermittent bug

* fix monitoring enablement due to other PRs

* apply suggested changes

* tentative workaround for failing MM sanity

* fix inference kw with retries

* restore @ before inference input filepath

* fix missing args

* add sleep between retries

* fix robocop alerts

* fix variable name for consistency in ods-2379

* fix robocop alert

---------

Co-authored-by: Jorge <[email protected]>
bdattoma and jgarciao authored Jan 26, 2024
Commit 29b646f (1 parent: db7d9c1)
Showing 7 changed files with 82 additions and 45 deletions.
22 changes: 13 additions & 9 deletions ods_ci/tests/Resources/CLI/ModelServing/llm.resource
@@ -16,8 +16,6 @@ ${DEFAULT_BUCKET_SA_NAME}= models-bucket-sa
${BUCKET_SECRET_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/bucket_secret.yaml
${BUCKET_SA_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/bucket_sa.yaml
${USE_BUCKET_HTTPS}= "1"
${UWM_ENABLE_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/uwm_cm_enable.yaml
${UWM_CONFIG_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/uwm_cm_conf.yaml
${MODELS_BUCKET}= ${S3.BUCKET_3}
${SERVICEMESH_CR_NS}= istio-system
&{RUNTIME_FLIEPATHS}= caikit-tgis-runtime=${LLM_RESOURCES_DIRPATH}/serving_runtimes/caikit_tgis_servingruntime_{{protocol}}.yaml
@@ -62,8 +60,8 @@ Set Project And Runtime
... region=${MODELS_BUCKET.REGION} namespace=${namespace}
Deploy Serving Runtime namespace=${namespace} runtime=${runtime} protocol=${protocol}
IF ${enable_metrics} == ${TRUE}
Oc Apply kind=ConfigMap src=${UWM_CONFIG_FILEPATH}
Oc Apply kind=ConfigMap src=${UWM_ENABLE_FILEPATH}
Enable User Workload Monitoring
Configure User Workload Monitoring
ELSE
Log message=Skipping UserWorkloadMonitoring enablement.
END
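
A hedged sketch of what the two new monitoring keywords might do, assuming they simply wrap the same ConfigMap applies this hunk removes; their real bodies are not shown in this commit, so the structure and file paths below are assumptions:

*** Keywords ***
Enable User Workload Monitoring
    # Hypothetical body: applies the ConfigMap that turns on user-workload
    # monitoring, equivalent to the inline apply of uwm_cm_enable.yaml removed above
    Oc Apply    kind=ConfigMap    src=${UWM_ENABLE_FILEPATH}

Configure User Workload Monitoring
    # Hypothetical body: applies the user-workload monitoring configuration,
    # equivalent to the inline apply of uwm_cm_conf.yaml removed above
    Oc Apply    kind=ConfigMap    src=${UWM_CONFIG_FILEPATH}
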
@@ -292,6 +290,7 @@ Compile Deploy And Query LLM model
[Documentation] Group together the test steps for preparing, deploying
... and querying a model
[Arguments] ${model_storage_uri} ${model_name} ${isvc_name}=${model_name}
... ${runtime}=caikit-tgis-runtime ${protocol}=grpc ${inference_type}=all-tokens
... ${canaryTrafficPercent}=${EMPTY} ${namespace}=${TEST_NS} ${sa_name}=${DEFAULT_BUCKET_SA_NAME}
... ${n_queries}=${1} ${query_idx}=${0} ${validate_response}=${TRUE}
Compile Inference Service YAML isvc_name=${isvc_name}
@@ -303,8 +302,9 @@ Compile Deploy And Query LLM model
Wait For Pods To Be Ready label_selector=serving.kserve.io/inferenceservice=${isvc_name}
... namespace=${namespace}
Query Model Multiple Times isvc_name=${isvc_name} model_name=${model_name}
... endpoint=${CAIKIT_ALLTOKENS_ENDPOINT} n_times=${n_queries} streamed_response=${FALSE}
... namespace=${namespace} query_idx=${query_idx} validate_response=${validate_response}
... n_times=${n_queries} namespace=${namespace} query_idx=${query_idx}
... validate_response=${validate_response} protocol=${protocol}
... runtime=${runtime} inference_type=${inference_type}
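
A hedged usage sketch of the updated keyword exercising the new runtime, protocol, and inference_type arguments; the storage URI and model name are illustrative:

# Deploys a model with an explicit runtime/protocol and queries it twice
Compile Deploy And Query LLM model    model_storage_uri=s3://models/flan-t5-small/
...    model_name=flan-t5-small    runtime=caikit-tgis-runtime
...    protocol=grpc    inference_type=all-tokens    n_queries=${2}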

Upgrade Caikit Runtime Image
[Documentation] Replaces the image URL of the Caikit Runtime with the given
@@ -567,7 +567,7 @@ Generate Client TLS Certificates
Clean Up Test Project
[Documentation] Deletes the given InferenceServices, checks the NS gets removed from ServiceMeshMemberRoll
... and deletes the DS Project
[Arguments] ${test_ns} ${isvc_names} ${isvc_delete}=${TRUE}
[Arguments] ${test_ns} ${isvc_names} ${isvc_delete}=${TRUE} ${wait_prj_deletion}=${TRUE}
IF ${isvc_delete} == ${TRUE}
FOR ${index} ${isvc_name} IN ENUMERATE @{isvc_names}
Log Deleting ${isvc_name}
@@ -580,5 +580,9 @@ Clean Up Test Project
... namespace=${test_ns}
${rc} ${out}= Run And Return Rc And Output oc delete project ${test_ns}
Should Be Equal As Integers ${rc} ${0}
${rc} ${out}= Run And Return Rc And Output oc wait --for=delete namespace ${test_ns} --timeout=300s
Should Be Equal As Integers ${rc} ${0}
IF ${wait_prj_deletion}
${rc} ${out}= Run And Return Rc And Output oc wait --for=delete namespace ${test_ns} --timeout=300s
Should Be Equal As Integers ${rc} ${0}
ELSE
Log Project deletion started, but not waiting for it to finish.
END
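
A hedged example of a suite teardown call that starts project deletion without blocking on it; the list variable name is illustrative:

# Deletes the InferenceServices and the project, skipping the namespace wait
Clean Up Test Project    test_ns=${TEST_NS}    isvc_names=${DEPLOYED_ISVC_NAMES}
...    isvc_delete=${TRUE}    wait_prj_deletion=${FALSE}
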
@@ -9,9 +9,9 @@ ${SUBMIT_RUNTIME_BTN_XP}= //button[text()="Create"]
${UPLOAD_RUNTIME_BTN_XP}= //button[text()="Upload files"]
${SCRATCH_RUNTIME_BTN_XP}= //button[text()="Start from scratch"]
${EDITOR_RUNTIME_BTN_XP}= //div[contains(@class, "odh-dashboard__code-editor")]
&{PLATFORM_NAMES_MAPPING}= single=Single model serving platform multi=Multi-model serving platform
... both=Both single and multi-model serving platforms
&{PLATFORM_LABELS_MAPPING}= single=Single model multi=Multi-model
&{PLATFORM_NAMES_MAPPING}= single=Single-model serving platform multi=Multi-model serving platform
... both=Single-model and multi-model serving platforms
&{PLATFORM_LABELS_MAPPING}= single=Single-model multi=Multi-model


*** Keywords ***
@@ -26,8 +26,6 @@ ${MS_TABLE_STATUS_FAILURE}= //span[contains(@class,"pf-v5-c-icon__content")]
${KSERVE_MODAL_HEADER}= //header[@class="pf-v5-c-modal-box__header"]/h1[.="Deploy model"]
${KSERVE_RUNTIME_DROPDOWN}= //span[.="Serving runtime"]/../../..//div[@id="serving-runtime-template-selection"]
${LLM_RESOURCES_DIRPATH}= ods_ci/tests/Resources/Files/llm
${UWM_ENABLE_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/uwm_cm_enable.yaml
${UWM_CONFIG_FILEPATH}= ${LLM_RESOURCES_DIRPATH}/uwm_cm_conf.yaml


*** Keywords ***
@@ -56,8 +54,8 @@ Serve Model
[Arguments] ${project_name} ${model_name} ${framework} ${data_connection_name} ${model_path}
... ${existing_data_connection}=${TRUE} ${model_server}=Model Serving Test
# TODO: Does not work if there's already a model deployed
SeleniumLibrary.Wait Until Page Does Not Contain Element //article[@id="multi-serving-platform-card"]
SeleniumLibrary.Wait Until Page Does Not Contain Element //article[@id="single-serving-platform-card"]
SeleniumLibrary.Wait Until Page Does Not Contain Element //div[@id="multi-serving-platform-card"]
SeleniumLibrary.Wait Until Page Does Not Contain Element //div[@id="single-serving-platform-card"]
SeleniumLibrary.Wait Until Page Contains Deploy model
SeleniumLibrary.Click Button Deploy model
SeleniumLibrary.Wait Until Page Contains Element xpath://h1[.="Deploy model"]
@@ -248,7 +246,7 @@ Get Model Inference
... ${project_title}=${NONE}
${self_managed} = Is RHODS Self-Managed
${url}= Get Model Route via UI ${model_name}
${curl_cmd}= Set Variable curl -s ${url} -d @${inference_input}
${curl_cmd}= Set Variable curl -s ${url} -d ${inference_input}
IF ${token_auth}
IF "${project_title}" == "${NONE}"
${project_title}= Get Model Project ${model_name}
@@ -276,6 +274,33 @@ Verify Model Inference
Fail msg=comparison between expected and actual failed, ${list}
END

Verify Model Inference With Retries
[Documentation] We often see the inference call failing in the tests. One possible cause might be
... timing: the model is not yet ready to reply, despite the pod being up and running and the
... endpoint being exposed.
... This is a temporary mitigation while we find a better way to check that the model is ready
[Arguments] ${model_name} ${inference_input} ${expected_inference_output}
... ${token_auth}=${FALSE} ${project_title}=${NONE} ${retries}=${5}
${status}= Run Keyword And Return Status Verify Model Inference
... ${model_name} ${inference_input} ${expected_inference_output} ${token_auth} ${project_title}
IF not ${status}
${retry}= Set Variable ${0}
WHILE ${retry} < ${retries}
IF ${retry} > 0
Log message=Modelmesh inference call failed ${retry}/${retries}.
... level=WARN
END
${status}= Run Keyword And Return Status Verify Model Inference
... ${model_name} ${inference_input} ${expected_inference_output} ${token_auth}
... ${project_title}
IF ${status}
BREAK
END
${retry}= Evaluate ${retry} + 1
Sleep 5s
END
END
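
A hedged call-site sketch for the new retry wrapper; the variables match those used in the token-authentication test below, and the retry count is illustrative:

Verify Model Inference With Retries    ${MODEL_NAME}    ${INFERENCE_INPUT}
...    ${EXPECTED_INFERENCE_OUTPUT}    token_auth=${TRUE}    retries=${3}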

Clean Up Model Serving Page
[Documentation] Deletes all currently deployed models, if any are present.
# Returns an empty list if no matching elements found
@@ -491,8 +516,8 @@ Set Up Project
... aws_bucket_name=${S3.BUCKET_3.NAME} aws_s3_endpoint=${S3.BUCKET_3.ENDPOINT}
... aws_region=${S3.BUCKET_3.REGION}
IF ${enable_metrics}
Oc Apply kind=ConfigMap src=${UWM_ENABLE_FILEPATH}
Oc Apply kind=ConfigMap src=${UWM_CONFIG_FILEPATH}
Enable User Workload Monitoring
Configure User Workload Monitoring
ELSE
Log message=Skipping UserWorkloadMonitoring enablement.
END
@@ -63,7 +63,9 @@ Test Inference With Token Authentication
[Documentation] Test the inference result after having deployed a model that requires Token Authentication
[Tags] Sanity Tier1
... ODS-1920
Run Keyword And Continue On Failure Verify Model Inference ${MODEL_NAME} ${INFERENCE_INPUT} ${EXPECTED_INFERENCE_OUTPUT} token_auth=${TRUE}
# Run Keyword And Continue On Failure Verify Model Inference ${MODEL_NAME} ${INFERENCE_INPUT} ${EXPECTED_INFERENCE_OUTPUT} token_auth=${TRUE} # robocop: disable
Run Keyword And Continue On Failure Verify Model Inference With Retries
... ${MODEL_NAME} ${INFERENCE_INPUT} ${EXPECTED_INFERENCE_OUTPUT} token_auth=${TRUE}
# Testing the same endpoint without token auth, should receive login page
Open Model Serving Home Page
${out}= Get Model Inference ${MODEL_NAME} ${INFERENCE_INPUT} token_auth=${FALSE}
@@ -80,12 +80,14 @@ Verify RHODS Users Can Deploy A Model Using A Custom Serving Runtime
... aws_access_key=${S3.AWS_ACCESS_KEY_ID} aws_secret_access=${S3.AWS_SECRET_ACCESS_KEY}
... aws_bucket_name=ods-ci-s3
Create Model Server server_name=${MODEL_SERVER_NAME} runtime=${UPLOADED_OVMS_DISPLAYED_NAME}
Serve Model project_name=${PRJ_TITLE} model_name=${model_name} framework=onnx existing_data_connection=${TRUE}
Serve Model project_name=${PRJ_TITLE} model_name=${model_name} framework=onnx
... existing_data_connection=${TRUE}
... data_connection_name=model-serving-connection model_path=mnist-8.onnx
Wait Until Runtime Pod Is Running server_name=${MODEL_SERVER_NAME}
... project_title=${PRJ_TITLE} timeout=15s
... project_title=${PRJ_TITLE} timeout=40s
Verify Model Status ${model_name} success
Verify Model Inference ${model_name} ${inference_input} ${exp_inference_output} token_auth=${TRUE}
Verify Model Inference With Retries ${model_name} ${inference_input} ${exp_inference_output}
... token_auth=${TRUE}
... project_title=${PRJ_TITLE}


@@ -94,7 +96,6 @@ Custom Serving Runtime Suite Setup
[Documentation] Suite setup steps for testing DSG. It creates some test variables
... and runs RHOSi setup
Set Library Search Order SeleniumLibrary
Launch Data Science Project Main Page username=${TEST_USER_3.USERNAME}
RHOSi Setup
Fetch CA Certificate If RHODS Is Self-Managed

@@ -148,7 +148,7 @@ Send Batch Inference Data to Model
[Documentation] Send Batch Inference data to the already deployed model using Curl commands
[Arguments] ${model_name} ${project_name} ${lower_range}=1 ${upper_range}=5
FOR ${counter} IN RANGE ${lower_range} ${upper_range}
${inference_input}= Set Variable ods_ci/tests/Resources/Files/TrustyAI/loan_default_batched/batch_${counter}.json
${inference_input}= Set Variable @ods_ci/tests/Resources/Files/TrustyAI/loan_default_batched/batch_${counter}.json
${inference_output} = Get Model Inference ${model_name} ${inference_input} token_auth=${FALSE}
... project_title=${project_name}
Should Contain ${inference_output} model_name
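
A hedged note on the restored "@": curl's -d flag treats "@path" as an instruction to read the request body from that file, while a bare path would be sent as the literal payload string; since Get Model Inference now passes ${inference_input} to -d verbatim, callers must add the prefix themselves. A minimal sketch, with an illustrative batch file:

# "-d @batch_1.json" sends the JSON file contents as the request body;
# "-d batch_1.json" would send the literal string "batch_1.json" instead
${inference_input}=    Set Variable    @ods_ci/tests/Resources/Files/TrustyAI/loan_default_batched/batch_1.json
${inference_output}=    Get Model Inference    ${model_name}    ${inference_input}    token_auth=${FALSE}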