From 688d66b38802d513cf556e818b55c75c7cdd5fb0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 May 2024 15:49:56 -0700 Subject: [PATCH 1/9] [CI] Add nightly CI job to test against dev version of deps --- tests/buildkite/pipeline-nightly.yml | 30 ++++++++++++++++ tests/buildkite/test-python-gpu.sh | 9 ++++- tests/ci_build/Dockerfile.gpu_dev_ver | 51 +++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 tests/buildkite/pipeline-nightly.yml create mode 100644 tests/ci_build/Dockerfile.gpu_dev_ver diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml new file mode 100644 index 000000000000..adc09f5c0533 --- /dev/null +++ b/tests/buildkite/pipeline-nightly.yml @@ -0,0 +1,30 @@ +# Nightly CI pipeline, to test against dev versions of dependencies + +env: + DOCKER_CACHE_ECR_ID: "492475357299" + DOCKER_CACHE_ECR_REGION: "us-west-2" + DISABLE_RELEASE: "1" + # Skip uploading artifacts to S3 bucket + # Also, don't build all CUDA archs; just build sm_75 + USE_DEPS_DEV_VER: "1" + # Use dev versions of RAPIDS and other dependencies +steps: + #### -------- CONTAINER BUILD -------- + - label: ":docker: Build containers" + commands: + - "tests/buildkite/build-containers.sh gpu_build_centos7" + - "tests/buildkite/build-containers.sh gpu_dev_ver" + key: build-containers + agents: + queue: linux-amd64-cpu + - wait + - label: ":console: Test Python package, single GPU" + command: "tests/buildkite/test-python-gpu.sh gpu" + key: test-python-gpu + agents: + queue: linux-amd64-gpu + - label: ":console: Test Python package, 4 GPUs" + command: "tests/buildkite/test-python-gpu.sh mgpu" + key: test-python-mgpu + agents: + queue: linux-amd64-mgpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh index bb61a980de11..51ab6198dee6 100755 --- a/tests/buildkite/test-python-gpu.sh +++ b/tests/buildkite/test-python-gpu.sh @@ -22,7 +22,14 @@ chmod +x build/testxgboost # Allocate extra space in /dev/shm to enable NCCL export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' -command_wrapper="tests/ci_build/ci_build.sh gpu --use-gpus --build-arg "` +if [[ -z "${USE_DEPS_DEV_VER}" ]] +then + container_tag='gpu' +else + container_tag='gpu_dev_ver' +fi + +command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "` `"NCCL_VERSION_ARG=$NCCL_VERSION" diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver new file mode 100644 index 000000000000..010df4c01966 --- /dev/null +++ b/tests/ci_build/Dockerfile.gpu_dev_ver @@ -0,0 +1,51 @@ +# Container to test XGBoost against dev versions of dependencies + +ARG CUDA_VERSION_ARG +FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 +ARG CUDA_VERSION_ARG +ARG RAPIDS_VERSION_ARG +ARG NCCL_VERSION_ARG + +# Environment +ENV DEBIAN_FRONTEND noninteractive +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + apt-get update && \ + apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ + # Python + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + bash conda.sh -b -p /opt/mambaforge + +ENV PATH=/opt/mambaforge/bin:$PATH + +# Create new Conda environment with cuDF, Dask, and cuPy +RUN \ + export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ + mamba create -y -n gpu_test -c rapidsai-nightly -c nvidia -c conda-forge \ + python=3.10 "cudf>$RAPIDS_VERSION_ARG" "rmm>$RAPIDS_VERSION_ARG" cudatoolkit=$CUDA_VERSION_ARG \ + "nccl>=${NCCL_SHORT_VER}" \ + dask \ + "dask-cuda>$RAPIDS_VERSION_ARG" "dask-cudf>$RAPIDS_VERSION_ARG" cupy \ + numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ + "pyspark>=3.4.0" cloudpickle cuda-python && \ + mamba clean --all && \ + conda run --no-capture-output -n gpu_test pip install buildkite-test-collector + +ENV GOSU_VERSION 1.10 +ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] From b85a3604352c70a3620bae8a613f7745ead0b1b6 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 May 2024 16:53:02 -0700 Subject: [PATCH 2/9] Update build-containers.sh --- tests/buildkite/build-containers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index 9aec33d1ffc4..f19000edaa51 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -20,7 +20,7 @@ case "${container}" in cpu) ;; - gpu) + gpu | gpu_dev_ver) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" From 6640b6537992b0b750865b59ccfb02af635c87ac Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 May 2024 17:11:01 -0700 Subject: [PATCH 3/9] Add build step --- tests/buildkite/pipeline-nightly.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml index adc09f5c0533..2c04addf5595 100644 --- a/tests/buildkite/pipeline-nightly.yml +++ b/tests/buildkite/pipeline-nightly.yml @@ -18,6 +18,12 @@ steps: agents: queue: linux-amd64-cpu - wait + + - label: ":console: Build CUDA" + command: "tests/buildkite/build-cuda.sh" + key: build-cuda + agents: + queue: linux-amd64-cpu - label: ":console: Test Python package, single GPU" command: "tests/buildkite/test-python-gpu.sh gpu" key: test-python-gpu From 3f7794387c1580129f6361e70b3e515d4720f91c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 May 2024 18:29:32 -0700 Subject: [PATCH 4/9] Wait for build artifact --- tests/buildkite/pipeline-nightly.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml index 2c04addf5595..495d1b3e48a9 100644 --- a/tests/buildkite/pipeline-nightly.yml +++ b/tests/buildkite/pipeline-nightly.yml @@ -24,6 +24,7 @@ steps: key: build-cuda agents: queue: linux-amd64-cpu + - wait - label: ":console: Test Python package, single GPU" command: "tests/buildkite/test-python-gpu.sh gpu" key: test-python-gpu From 96591bd746b5c26ff029c9e03dd6c98b2ed03668 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 May 2024 19:37:06 -0700 Subject: [PATCH 5/9] Try pinning dask --- tests/ci_build/Dockerfile.gpu_dev_ver | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver index 010df4c01966..1efcdaf4ceb5 100644 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ b/tests/ci_build/Dockerfile.gpu_dev_ver @@ -27,7 +27,7 @@ RUN \ mamba create -y -n gpu_test -c rapidsai-nightly -c nvidia -c conda-forge \ python=3.10 "cudf>$RAPIDS_VERSION_ARG" "rmm>$RAPIDS_VERSION_ARG" cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ - dask \ + dask=2024.1.1 \ "dask-cuda>$RAPIDS_VERSION_ARG" "dask-cudf>$RAPIDS_VERSION_ARG" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ From 7f3c66613403538dbc53d76eac2bfbcbd2562757 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 May 2024 10:48:55 -0700 Subject: [PATCH 6/9] Address reviewers' comments --- tests/buildkite/build-containers.sh | 8 +------- tests/ci_build/Dockerfile.aarch64 | 4 ++-- tests/ci_build/Dockerfile.cpu | 4 ++-- tests/ci_build/Dockerfile.gpu | 7 +++---- tests/ci_build/Dockerfile.gpu_build_centos7 | 2 +- tests/ci_build/Dockerfile.gpu_build_r_centos7 | 2 +- tests/ci_build/Dockerfile.gpu_dev_ver | 10 +++++----- tests/ci_build/Dockerfile.jvm | 2 +- tests/ci_build/Dockerfile.jvm_cross | 2 +- tests/ci_build/Dockerfile.jvm_gpu_build | 2 +- 10 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index f19000edaa51..929817e1533e 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -20,13 +20,7 @@ case "${container}" in cpu) ;; - gpu | gpu_dev_ver) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - ;; - - gpu_build_centos7) + gpu*) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" diff --git a/tests/ci_build/Dockerfile.aarch64 b/tests/ci_build/Dockerfile.aarch64 index 9b06e1c83373..f028e635b1f8 100644 --- a/tests/ci_build/Dockerfile.aarch64 +++ b/tests/ci_build/Dockerfile.aarch64 @@ -10,7 +10,7 @@ RUN \ yum update -y && \ yum install -y devtoolset-9 && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-aarch64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-aarch64.sh && \ bash conda.sh -b -p /opt/mambaforge ENV PATH=/opt/mambaforge/bin:$PATH @@ -23,7 +23,7 @@ ENV GOSU_VERSION 1.10 COPY conda_env/aarch64_test.yml /scripts/ RUN mamba create -n aarch64_test && \ mamba env update -n aarch64_test --file=/scripts/aarch64_test.yml && \ - mamba clean --all + mamba clean --all --yes # Install lightweight sudo (not bound to TTY) RUN set -ex; \ diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index fa9ea772df5d..6ca574f513e0 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -12,7 +12,7 @@ RUN \ apt-get update && \ apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libidn12 cmake ninja-build gcc-9 g++-9 openjdk-8-jdk-headless && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge ENV PATH=/opt/mambaforge/bin:$PATH @@ -36,7 +36,7 @@ RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ COPY conda_env/linux_cpu_test.yml /scripts/ RUN mamba create -n linux_cpu_test && \ mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ - mamba clean --all && \ + mamba clean --all --yes && \ conda run --no-capture-output -n linux_cpu_test pip install buildkite-test-collector # Install lightweight sudo (not bound to TTY) diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 255dd9d71874..a9238b85b982 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -14,7 +14,7 @@ RUN \ apt-get update && \ apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge ENV PATH=/opt/mambaforge/bin:$PATH @@ -22,14 +22,13 @@ ENV PATH=/opt/mambaforge/bin:$PATH # Create new Conda environment with cuDF, Dask, and cuPy RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - mamba create -y -n gpu_test -c rapidsai -c nvidia -c conda-forge \ + mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ - dask=2024.1.1 \ dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all && \ + mamba clean --all --yes && \ conda run --no-capture-output -n gpu_test pip install buildkite-test-collector ENV GOSU_VERSION 1.10 diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 16445de2a704..9934027258ab 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -13,7 +13,7 @@ RUN \ yum -y update && \ yum install -y tar unzip wget xz git which ninja-build devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge && \ /opt/mambaforge/bin/python -m pip install awscli && \ # CMake diff --git a/tests/ci_build/Dockerfile.gpu_build_r_centos7 b/tests/ci_build/Dockerfile.gpu_build_r_centos7 index 7c95f09b59a9..bfe1cf221710 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7 @@ -35,7 +35,7 @@ RUN \ run \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge && \ /opt/mambaforge/bin/python -m pip install auditwheel awscli && \ # CMake diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver index 1efcdaf4ceb5..8feff4f3ed43 100644 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ b/tests/ci_build/Dockerfile.gpu_dev_ver @@ -16,22 +16,22 @@ RUN \ apt-get update && \ apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge ENV PATH=/opt/mambaforge/bin:$PATH -# Create new Conda environment with cuDF, Dask, and cuPy +# Create new Conda environment with dev versions of cuDF, Dask, and cuPy +# Use >$RAPIDS_VERSION_ARG here because RAPIDS_VERSION_ARG indicates the latest stable version RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - mamba create -y -n gpu_test -c rapidsai-nightly -c nvidia -c conda-forge \ + mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ python=3.10 "cudf>$RAPIDS_VERSION_ARG" "rmm>$RAPIDS_VERSION_ARG" cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ - dask=2024.1.1 \ "dask-cuda>$RAPIDS_VERSION_ARG" "dask-cudf>$RAPIDS_VERSION_ARG" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all && \ + mamba clean --all --yes && \ conda run --no-capture-output -n gpu_test pip install buildkite-test-collector ENV GOSU_VERSION 1.10 diff --git a/tests/ci_build/Dockerfile.jvm b/tests/ci_build/Dockerfile.jvm index a115fd52c2d9..4f447cbbab54 100644 --- a/tests/ci_build/Dockerfile.jvm +++ b/tests/ci_build/Dockerfile.jvm @@ -9,7 +9,7 @@ RUN \ devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ \ devtoolset-9-runtime devtoolset-9-libstdc++-devel && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge && \ # CMake wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ diff --git a/tests/ci_build/Dockerfile.jvm_cross b/tests/ci_build/Dockerfile.jvm_cross index 43988872d989..9ba7b6e69f6a 100644 --- a/tests/ci_build/Dockerfile.jvm_cross +++ b/tests/ci_build/Dockerfile.jvm_cross @@ -13,7 +13,7 @@ RUN \ apt-get update && \ apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge && \ /opt/mambaforge/bin/pip install awscli && \ # Maven diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build index cee41894266b..7bd49b5a9c70 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/tests/ci_build/Dockerfile.jvm_gpu_build @@ -12,7 +12,7 @@ RUN \ yum -y update && \ yum install -y tar unzip wget xz git which ninja-build java-1.8.0-openjdk-devel devtoolset-9-gcc devtoolset-9-binutils devtoolset-9-gcc-c++ && \ # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ + wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/mambaforge && \ # CMake wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ From a34cff866f6a9332d31b117d7d069faa821291a2 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 May 2024 15:49:08 -0700 Subject: [PATCH 7/9] Fix unbound variable error --- tests/buildkite/test-python-gpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh index 51ab6198dee6..bc5a0abd79ce 100755 --- a/tests/buildkite/test-python-gpu.sh +++ b/tests/buildkite/test-python-gpu.sh @@ -22,7 +22,7 @@ chmod +x build/testxgboost # Allocate extra space in /dev/shm to enable NCCL export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' -if [[ -z "${USE_DEPS_DEV_VER}" ]] +if [[ -z "${USE_DEPS_DEV_VER-}" ]] then container_tag='gpu' else From 45e46130ac5f065fdd5cf7b9dd79c2b6d128e9e9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 May 2024 16:27:41 -0700 Subject: [PATCH 8/9] Specify dev version exactly --- tests/buildkite/build-containers.sh | 8 +++++++- tests/buildkite/conftest.sh | 1 + tests/buildkite/test-python-gpu.sh | 6 ++++-- tests/buildkite/update-rapids.sh | 3 +++ tests/ci_build/Dockerfile.gpu_dev_ver | 6 +++--- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index 929817e1533e..1f8c587c9131 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -20,12 +20,18 @@ case "${container}" in cpu) ;; - gpu*) + gpu|gpu_build_centos7) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" ;; + gpu_dev_ver) + BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$DEV_RAPIDS_VERSION" + ;; + jvm_gpu_build) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 44043910bc96..ea7a5a00c02a 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -25,6 +25,7 @@ set -x CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 RAPIDS_VERSION=24.04 +DEV_RAPIDS_VERSION=24.06 SPARK_VERSION=3.4.0 JDK_VERSION=8 R_VERSION=4.3.2 diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh index bc5a0abd79ce..d7bd729a2e01 100755 --- a/tests/buildkite/test-python-gpu.sh +++ b/tests/buildkite/test-python-gpu.sh @@ -25,14 +25,16 @@ export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' if [[ -z "${USE_DEPS_DEV_VER-}" ]] then container_tag='gpu' + rapids_version=${RAPIDS_VERSION} else container_tag='gpu_dev_ver' + rapids_version=${DEV_RAPIDS_VERSION} fi command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" + `"RAPIDS_VERSION_ARG=${rapids_version} --build-arg "` + `"NCCL_VERSION_ARG=$NCCL_VERSION" # Run specified test suite case "$suite" in diff --git a/tests/buildkite/update-rapids.sh b/tests/buildkite/update-rapids.sh index f617ccd11f58..f6a2675bdfa9 100755 --- a/tests/buildkite/update-rapids.sh +++ b/tests/buildkite/update-rapids.sh @@ -4,7 +4,10 @@ set -euo pipefail LATEST_RAPIDS_VERSION=$(gh api repos/rapidsai/cuml/releases/latest --jq '.name' | sed -e 's/^v\([[:digit:]]\+\.[[:digit:]]\+\).*/\1/') echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION" +DEV_RAPIDS_VERSION=$(date +%Y-%m-%d -d "20${LATEST_RAPIDS_VERSION//./-}-01 + 2 month" | cut -c3-7 | tr - .) +echo "DEV_RAPIDS_VERSION = $DEV_RAPIDS_VERSION" PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh +sed -i "s/^DEV_RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/DEV_RAPIDS_VERSION=${DEV_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver index 8feff4f3ed43..a08834d88ba2 100644 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ b/tests/ci_build/Dockerfile.gpu_dev_ver @@ -4,6 +4,7 @@ ARG CUDA_VERSION_ARG FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG + # Should be first 4 digits of the dev version (e.g. 24.06) ARG NCCL_VERSION_ARG # Environment @@ -22,13 +23,12 @@ RUN \ ENV PATH=/opt/mambaforge/bin:$PATH # Create new Conda environment with dev versions of cuDF, Dask, and cuPy -# Use >$RAPIDS_VERSION_ARG here because RAPIDS_VERSION_ARG indicates the latest stable version RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ - python=3.10 "cudf>$RAPIDS_VERSION_ARG" "rmm>$RAPIDS_VERSION_ARG" cudatoolkit=$CUDA_VERSION_ARG \ + python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ - "dask-cuda>$RAPIDS_VERSION_ARG" "dask-cudf>$RAPIDS_VERSION_ARG" cupy \ + "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ mamba clean --all --yes && \ From c6521a76df2dddeac66a62d094661c9ffd5eafe6 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 3 Jun 2024 11:03:03 -0700 Subject: [PATCH 9/9] Pin dask=2024.1.1 --- tests/ci_build/Dockerfile.gpu | 1 + tests/ci_build/Dockerfile.gpu_dev_ver | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index a9238b85b982..f68ba9d6b14b 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -25,6 +25,7 @@ RUN \ mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ + dask=2024.1.1 \ dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \ diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver index a08834d88ba2..a592d4891093 100644 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ b/tests/ci_build/Dockerfile.gpu_dev_ver @@ -28,6 +28,7 @@ RUN \ mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \ "nccl>=${NCCL_SHORT_VER}" \ + dask=2024.1.1 \ "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ "pyspark>=3.4.0" cloudpickle cuda-python && \