Merge branch 'master' into update_hive_wa_master
kobihk authored Mar 14, 2024
2 parents 0357f31 + ac2e7a7 commit b42c85e
Showing 9 changed files with 80 additions and 32 deletions.
5 changes: 4 additions & 1 deletion ods_ci/tests/Resources/Files/llm/download_model_in_pvc.yaml
@@ -31,7 +31,10 @@ spec:
         - mountPath: "/mnt/models/"
           name: model-volume
       containers:
-        - name: download-model
+        - resources:
+            requests:
+              memory: 40Gi
+          name: download-model
           imagePullPolicy: IfNotPresent
           image: quay.io/modh/kserve-storage-initializer@sha256:330af2d517b17dbf0cab31beba13cdbe7d6f4b9457114dea8f8485a011e3b138
           args:
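Note: the hunk above only adds a memory reservation for the model-download container; nothing else changes. As a hedged sketch (fields outside the hunk, including the truncated args, are assumptions rather than part of this commit), the patched container plausibly reads:

containers:
  - resources:
      requests:
        memory: 40Gi    # new in this commit: reserve enough RAM for large model downloads
    name: download-model
    imagePullPolicy: IfNotPresent
    image: quay.io/modh/kserve-storage-initializer@sha256:330af2d517b17dbf0cab31beba13cdbe7d6f4b9457114dea8f8485a011e3b138
    # args are truncated in the diff and left elided here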
@@ -25,7 +25,6 @@ ${regex_pattern}    level=([Ee]rror).*|([Ff]ailed) to list .*
 *** Test Cases ***
 Verify RHODS Operator log
     [Tags]    Sanity    Tier1
-    ...    ProductBug
     ...    ODS-1007
     #Get the POD name
     ${data}    Run keyword    Oc Get    kind=Pod    namespace=${namespace}    label_selector=name=rhods-operator
@@ -80,7 +80,6 @@ Verify Disabled Cards Can Be Removed
     ...    for ods-ci
     [Tags]    Sanity    Tier1
     ...    ODS-1081    ODS-1092
-    ...    AutomationBug
     # Enable Custom App
     # Remove Custom App From Dashboard
     Run Keyword And Warn On Failure    Success Message Should Contain    ODS-CI Custom App
@@ -25,7 +25,7 @@ ${DOCS_ENDPOINT}=    api/docs
 ${GETTING_STARTED_ENDPOINT}=    api/getting-started
 ${QUICKSTARTS_ENDPOINT}=    api/quickstarts
 ${SEGMENT_KEY_ENDPOINT}=    api/segment-key
-${GPU_ENDPOINT}=    api/gpu
+${GPU_ENDPOINT}=    api/accelerators

 ${NOTEBOOK_NS}=    ${NOTEBOOKS_NAMESPACE}
 ${DASHBOARD_NS}=    ${APPLICATIONS_NAMESPACE}
@@ -472,9 +472,11 @@ Verify Access To groups-config API Endpoint
 Verify Access To images API Endpoint
     [Documentation]    Verifies the endpoint "images" works as expected
     ...    based on the permissions of the users who query the endpoint
+    ...    ProductBug RHOAIENG-4469
     [Tags]    ODS-1724
     ...    Tier1    Sanity
     ...    Security
+    ...    ProductBug
     Perform Dashboard API Endpoint POST Call    endpoint=${IMG_ENDPOINT_PT0}    token=${BASIC_USER_TOKEN}
     ...    body=${IMG_ENDPOINT_BODY}
     Operation Should Be Unauthorized
@@ -266,8 +266,8 @@ Verify User Can Access Model Metrics From UWM Using The UI    # robocop: disable
 Verify User With Edit Permission Can Deploy Query And Delete A LLM
     [Documentation]    This test case verifies that a user with Edit permission on a DS Project can still deploy, query
     ...    and delete a LLM served with caikit
-    ...    ProductBug: https://issues.redhat.com/browse/RHOAIENG-548
-    [Tags]    Sanity    Tier1    ODS-2581    ProductBug
+    ...    Issue reported for this test in the past: https://issues.redhat.com/browse/RHOAIENG-548
+    [Tags]    Sanity    Tier1    ODS-2581
     [Setup]    Set Up Project    namespace=${TEST_NS}-edit-permission
     ${test_namespace}=    Set Variable    ${TEST_NS}-edit-permission
     ${flan_model_name}=    Set Variable    flan-t5-small-caikit
@@ -5,7 +5,7 @@ Resource    ../../../../Resources/OCP.resource
 Resource    ../../../../Resources/CLI/ModelServing/llm.resource
 Library    OpenShiftLibrary
 Suite Setup    Suite Setup
-Suite Teardown    RHOSi Teardown
+Suite Teardown    Suite Teardown
 Test Tags    KServe-LLM


@@ -28,11 +28,12 @@ Verify User Can Serve And Query A bigscience/mt0-xxl Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=70Gi
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
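The same two-line pattern repeats verbatim in every remaining hunk of this file: build a ${requests} dictionary, then pass it as requests_dict so the compiled InferenceService asks the scheduler for 40Gi of memory up front instead of relying on limits alone. A minimal, hedged sketch of the resources block this plausibly renders into (the template and the contents of ${limits} live outside this diff, so the limit value below is a placeholder):

resources:
  requests:
    memory: 40Gi    # from the new ${requests} dictionary
  limits:
    memory: 48Gi    # placeholder only; the real value comes from ${limits}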
@@ -72,11 +73,12 @@ Verify User Can Serve And Query A google/flan-t5-xl Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=70Gi
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
@@ -116,11 +118,12 @@ Verify User Can Serve And Query A google/flan-t5-xxl Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=70Gi
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
@@ -159,11 +162,12 @@ Verify User Can Serve And Query A elyza/elyza-japanese-llama-2-7b-instruct Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=70Gi    model_path=${model_path}
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
@@ -203,11 +207,12 @@ Verify User Can Serve And Query A ibm/mpt-7b-instruct2 Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=20Gi
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
@@ -247,11 +252,12 @@ Verify User Can Serve And Query A google/flan-ul-2 Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=70Gi    model_path=${model_path}
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
@@ -326,11 +332,12 @@ Verify User Can Serve And Query A meta-llama/llama-2-13b-chat Model
     Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=${test_namespace}
     ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}
     ...    storage_size=70Gi    model_path=${model_path}
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
     Wait For Pods To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
@@ -374,11 +381,12 @@ Verify User Can Serve And Query A google/flan-t5-xl Prompt Tuned Model
     ...    bucket_name=${MODELS_BUCKET.NAME}    use_https=${USE_BUCKET_HTTPS}
     ...    storage_size=10Gi    model_path=${model_path}
     ${overlays}=    Create List    prompt-tuned
+    ${requests}=    Create Dictionary    memory=40Gi
     Compile Inference Service YAML    isvc_name=${model_name}
     ...    sa_name=${EMPTY}
     ...    model_storage_uri=${storage_uri}
     ...    model_format=pytorch    serving_runtime=${TGIS_RUNTIME_NAME}
-    ...    limits_dict=${limits}    kserve_mode=${KSERVE_MODE}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
     ...    overlays=${overlays}
     Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
     ...    namespace=${test_namespace}
@@ -426,6 +434,28 @@ Suite Setup
     RHOSi Setup
     Load Expected Responses
     Run    git clone https://github.com/IBM/text-generation-inference/
+    Set Default Storage Class In GCP    default=ssd-csi
+
+Suite Teardown
+    Set Default Storage Class In GCP    default=standard-csi
+    RHOSi Teardown
+
+Set Default Storage Class In GCP
+    [Documentation]    If the storage class exists we can assume we are in GCP. We force ssd-csi to be the default class
+    ...    for the duration of this test suite.
+    [Arguments]    ${default}
+    ${rc}=    Run And Return Rc    oc get storageclass ${default}
+    IF    ${rc} == ${0}
+        IF    "${default}" == "ssd-csi"
+            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
+            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
+        ELSE
+            Run    oc patch storageclass ssd-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}'    #robocop: disable
+            Run    oc patch storageclass standard-csi -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'    #robocop: disable
+        END
+    ELSE
+        Log    Proceeding with default storage class because we're not in GCP
+    END

 Setup Test Variables
     [Arguments]    ${model_name}    ${kserve_mode}=Serverless    ${use_pvc}=${FALSE}    ${use_gpu}=${FALSE}
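For reference, the oc patch calls in the new Set Default Storage Class In GCP keyword toggle a single annotation; Kubernetes treats the StorageClass whose annotation is "true" as the default for any PVC that omits storageClassName, which is why the keyword flips it on one class and off the other. A sketch of the intended state after Suite Setup runs (the provisioner field is an assumption, typical for GCP PD CSI, and is not part of this diff):

apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ssd-csi
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"    # standard-csi is flipped to "false" at the same time
provisioner: pd.csi.storage.gke.io    # assumption; not shown in this diff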
3 changes: 1 addition & 2 deletions ods_ci/tests/Tests/500__jupyterhub/custom-image.robot
@@ -94,10 +94,9 @@ Test Bad Image URL

 Test Image From Local registry
     [Documentation]    Try creating a custom image using a local registry URL (i.e. OOTB image)
-    ...    ProductBug - https://github.com/opendatahub-io/odh-dashboard/issues/2185
+    ...    Issue reported for this test in the past - https://github.com/opendatahub-io/odh-dashboard/issues/2185
     [Tags]    Sanity    Tier1
     ...    ODS-2470
-    ...    ProductBug
     ${CLEANUP}=    Set Variable    False
     Open Notebook Images Page
     ${local_url} =    Get Standard Data Science Local Registry URL
@@ -9,8 +9,7 @@ Resource    ../../../tasks/Resources/RHODS_OLM/install/oc_install.robot

 *** Variables ***
 ${CODEFLARE_DIR}    codeflare-operator
-${CODEFLARE_REPO_URL}    %{CODEFLARE_REPO_URL=https://github.com/opendatahub-io/codeflare-operator.git}
-${CODEFLARE_REPO_BRANCH}    %{CODEFLARE_REPO_BRANCH=main}
+${CODEFLARE_RELEASE_ASSETS}    %{CODEFLARE_RELEASE_ASSETS=https://github.com/opendatahub-io/codeflare-operator/releases/latest/download}
 ${ODH_NAMESPACE}    %{ODH_NAMESPACE=redhat-ods-applications}
 ${NOTEBOOK_IMAGE_STREAM_NAME}    %{NOTEBOOK_IMAGE_STREAM_NAME=s2i-generic-data-science-notebook}

@@ -39,7 +38,7 @@ Run TestMCADRay ODH test
     ...    Tier2
     ...    DistributedWorkloads
     ...    CodeflareOperator
-    Skip    "Skip because of test failures. Currently being investigated"
+    Skip    "Skip because of https://issues.redhat.com/browse/RHOAIENG-3981"
     Run Codeflare ODH Test    TestMCADRay

 Run TestMnistPyTorchMCAD ODH test
@@ -53,31 +52,41 @@ Run TestMnistPyTorchMCAD ODH test

 *** Keywords ***
 Prepare Codeflare E2E Test Suite
-    ${result} =    Run Process    git clone -b ${CODEFLARE_REPO_BRANCH} ${CODEFLARE_REPO_URL} ${CODEFLARE_DIR}
-    ...    shell=true    stderr=STDOUT
-    Log To Console    ${result.stdout}
-    IF    ${result.rc} != 0
-        FAIL    Unable to clone Codeflare repo ${CODEFLARE_REPO_URL}:${CODEFLARE_REPO_BRANCH}
-    END
-
     Enable Component    ray
     Enable Component    codeflare
     Wait Component Ready    ray
     Wait Component Ready    codeflare
     Create Directory    %{WORKSPACE}/codeflare-e2e-logs
     Create Directory    %{WORKSPACE}/codeflare-odh-logs
     RHOSi Setup

 Teardown Codeflare E2E Test Suite
+    Log To Console    "Removing test binaries"
+    ${result} =    Run Process    rm -f e2e odh
+    ...    shell=true
+    ...    stderr=STDOUT
+    Log To Console    ${result.stdout}
+    IF    ${result.rc} != 0
+        FAIL    Unable to remove compiled binaries
+    END
     Disable Component    codeflare
     Disable Component    ray
     RHOSi Teardown

 Run Codeflare E2E Test
     [Arguments]    ${TEST_NAME}
+    Log To Console    "Downloading compiled test binary e2e"
+    ${result} =    Run Process    curl --location --silent --output e2e ${CODEFLARE_RELEASE_ASSETS}/e2e && chmod +x e2e
+    ...    shell=true
+    ...    stderr=STDOUT
+    Log To Console    ${result.stdout}
+    IF    ${result.rc} != 0
+        FAIL    Unable to retrieve e2e compiled binary
+    END
     Log To Console    "Running test: ${TEST_NAME}"
-    ${result} =    Run Process    go test -timeout 30m -v ./test/e2e -run ${TEST_NAME}
+    ${result} =    Run Process    ./e2e -test.run ${TEST_NAME}
     ...    shell=true
     ...    stderr=STDOUT
-    ...    cwd=${CODEFLARE_DIR}
     ...    env:CODEFLARE_TEST_TIMEOUT_SHORT=5m
     ...    env:CODEFLARE_TEST_TIMEOUT_MEDIUM=10m
     ...    env:CODEFLARE_TEST_TIMEOUT_LONG=20m
@@ -89,11 +98,18 @@

 Run Codeflare ODH Test
     [Arguments]    ${TEST_NAME}
+    Log To Console    "Downloading compiled test binary odh"
+    ${result} =    Run Process    curl --location --silent --output odh ${CODEFLARE_RELEASE_ASSETS}/odh && chmod +x odh
+    ...    shell=true
+    ...    stderr=STDOUT
+    Log To Console    ${result.stdout}
+    IF    ${result.rc} != 0
+        FAIL    Unable to retrieve odh compiled binary
+    END
     Log To Console    "Running test: ${TEST_NAME}"
-    ${result} =    Run Process    go test -timeout 30m -v ./test/odh -run ${TEST_NAME}
+    ${result} =    Run Process    ./odh -test.run ${TEST_NAME}
     ...    shell=true
     ...    stderr=STDOUT
-    ...    cwd=${CODEFLARE_DIR}
     ...    env:CODEFLARE_TEST_TIMEOUT_SHORT=5m
     ...    env:CODEFLARE_TEST_TIMEOUT_MEDIUM=10m
     ...    env:CODEFLARE_TEST_TIMEOUT_LONG=20m
@@ -48,7 +48,7 @@ Prepare Kueue E2E Test Suite

     # Add label instance-type=on-demand on worker node
     Log To Console    Add label on worker node ...
-    ${return_code}    ${output}    Run And Return Rc And Output    oc get nodes -o name --selector=node-role.kubernetes.io/worker | tail -n1
+    ${return_code}    ${output}    Run And Return Rc And Output    oc get nodes -o name --selector='node-role.kubernetes.io/worker,node-role.kubernetes.io notin (infra)' | tail -n1
     Set Suite Variable    ${WORKER_NODE}    ${output}
     ${return_code} =    Run And Return Rc    oc label ${WORKER_NODE} instance-type=on-demand
     Should Be Equal As Integers    ${return_code}    0    msg=Fail to label worker node with instance-type=on-demand
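The tightened selector keeps infra nodes from being chosen as the node to label. The instance-type=on-demand label is the kind of node label a Kueue ResourceFlavor selects on; a hedged sketch, assuming a flavor name the e2e suite might use (not shown in this diff):

apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: on-demand    # assumption: the actual flavor name is not shown in this diff
spec:
  nodeLabels:
    instance-type: on-demand    # matches the label applied to ${WORKER_NODE} above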
