From 1c14222c83f71f55fefaeb4ddd6e2f1c3a26576c Mon Sep 17 00:00:00 2001 From: Fabrice Normandin Date: Tue, 21 Jan 2025 16:32:12 -0500 Subject: [PATCH] Remove mock SLURM Cluster CI step (#145) * Try new version of ansible-slurm, continue on err Signed-off-by: Fabrice Normandin * Try to update role to match new version Signed-off-by: Fabrice Normandin * Add --diff -vvv to ansible-playbook command Signed-off-by: Fabrice Normandin * Disable the step for now Signed-off-by: Fabrice Normandin * Remove the mock slurm cluster CI step and files Signed-off-by: Fabrice Normandin --------- Signed-off-by: Fabrice Normandin --- .github/custom_setup_slurm_action/action.yml | 54 -------------- .../slurm-playbook.yml | 74 ------------------- .github/workflows/build.yml | 74 +------------------ 3 files changed, 1 insertion(+), 201 deletions(-) delete mode 100644 .github/custom_setup_slurm_action/action.yml delete mode 100644 .github/custom_setup_slurm_action/slurm-playbook.yml diff --git a/.github/custom_setup_slurm_action/action.yml b/.github/custom_setup_slurm_action/action.yml deleted file mode 100644 index f16e4665..00000000 --- a/.github/custom_setup_slurm_action/action.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: "setup-slurm-action" -description: "Setup slurm cluster on GitHub Actions using https://github.com/galaxyproject/ansible-slurm" -branding: - icon: arrow-down-circle - color: blue -runs: - using: "composite" - steps: - # prior to slurm-setup we need the podmand-correct command - # see https://github.com/containers/podman/issues/13338 - - name: Download slurm ansible roles - shell: bash -e {0} - # ansible-galaxy role install https://github.com/galaxyproject/ansible-slurm/archive/1.0.1.tar.gz - run: | - ansible-galaxy role install https://github.com/mila-iqia/ansible-slurm/archive/1.1.2.tar.gz - - - name: Apt prerequisites - shell: bash -e {0} - run: | - sudo apt-get update - sudo apt-get install retry - - - name: Set XDG_RUNTIME_DIR - shell: bash -e {0} - run: | - mkdir -p /tmp/1002-runtime # work around podman issue (https://github.com/containers/podman/issues/13338) - echo XDG_RUNTIME_DIR=/tmp/1002-runtime >> $GITHUB_ENV - - - name: Setup slurm - shell: bash -e {0} - run: | - ansible-playbook ./.github/custom_setup_slurm_action/slurm-playbook.yml || (journalctl -xe && exit 1) - - - name: Add Slurm Account - shell: bash -e {0} - run: | - sudo retry --until=success -- sacctmgr -i create account "Name=runner" - sudo sacctmgr -i create user "Name=runner" "Account=runner" - - - name: Test srun submission - shell: bash -e {0} - run: | - srun -vvvv echo "hello world" - sudo cat /var/log/slurm/slurmd.log - - - name: Show partition info - shell: bash -e {0} - run: | - scontrol show partition - - - name: Test sbatch submission - shell: bash -e {0} - run: | - sbatch -vvvv -N 1 --mem 5 --wrap "echo 'hello world'" diff --git a/.github/custom_setup_slurm_action/slurm-playbook.yml b/.github/custom_setup_slurm_action/slurm-playbook.yml deleted file mode 100644 index 3b87a135..00000000 --- a/.github/custom_setup_slurm_action/slurm-playbook.yml +++ /dev/null @@ -1,74 +0,0 @@ -- name: Slurm all in One - hosts: localhost - roles: - - role: 1.1.2 - become: true - vars: - slurm_upgrade: true - slurm_roles: ["controller", "exec", "dbd"] - slurm_config_dir: /etc/slurm - slurm_config: - ClusterName: cluster - SlurmctldLogFile: /var/log/slurm/slurmctld.log - SlurmctldPidFile: /run/slurmctld.pid - SlurmdLogFile: /var/log/slurm/slurmd.log - SlurmdPidFile: /run/slurmd.pid - SlurmdSpoolDir: /tmp/slurmd # the default /var/lib/slurm/slurmd does not work because of noexec mounting in github actions - StateSaveLocation: /var/lib/slurm/slurmctld - AccountingStorageType: accounting_storage/slurmdbd - SelectType: select/cons_res - slurmdbd_config: - StorageType: accounting_storage/mysql - PidFile: /run/slurmdbd.pid - LogFile: /var/log/slurm/slurmdbd.log - StoragePass: root - StorageUser: root - StorageHost: 127.0.0.1 # see https://stackoverflow.com/questions/58222386/github-actions-using-mysql-service-throws-access-denied-for-user-rootlocalh - StoragePort: 8888 - DbdHost: localhost - slurm_create_user: yes - #slurm_munge_key: "../../../munge.key" - slurm_nodes: - - name: localhost - State: UNKNOWN - Sockets: 1 - CoresPerSocket: 2 - RealMemory: 2000 - # - name: cn-a[001-011] - # NodeAddr: localhost - # Gres: gpu:rtx8000:8 - # CPUs: 40 - # Boards: 1 - # SocketsPerBoard: 2 - # CoresPerSocket: 20 - # ThreadsPerCore: 1 - # RealMemory: 386618 - # TmpDisk: 3600000 - # State: UNKNOWN - # Feature: x86_64,turing,48gb - # - name: "cn-c[001-010]" - # CoresPerSocket: 18 - # Gres: "gpu:rtx8000:8" - # Sockets: 2 - # ThreadsPerCore: 2 - slurm_partitions: - - name: long - Default: YES - MaxTime: UNLIMITED - Nodes: "localhost" - - name: main - Default: NO - MaxTime: UNLIMITED - Nodes: "localhost" - - name: unkillable - Default: NO - MaxTime: UNLIMITED - Nodes: "localhost" - slurm_user: - comment: "Slurm Workload Manager" - gid: 1002 - group: slurm - home: "/var/lib/slurm" - name: slurm - shell: "/bin/bash" - uid: 1002 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 55df961b..b5807b0e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -78,82 +78,10 @@ jobs: name: coverage-reports-unit-${{ matrix.platform }}-${{ matrix.python-version }} path: ./coverage.xml - mock-slurm-integration-tests: - name: integration tests with a mock slurm cluster - needs: [unit-tests] - runs-on: ${{ matrix.platform }} - - strategy: - max-parallel: 5 - matrix: - platform: [ubuntu-latest] - python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] - - # For the action to work, you have to supply a mysql - # service as defined below. - services: - mysql: - image: mysql:8.0 - env: - MYSQL_ROOT_PASSWORD: root - ports: - - "8888:3306" - options: --health-cmd="mysqladmin ping" --health-interval=10s --health-timeout=5s --health-retries=3 - - steps: - - uses: actions/checkout@v4 - - # NOTE: Replacing this with our customized version of - # - uses: koesterlab/setup-slurm-action@v1 - - uses: ./.github/custom_setup_slurm_action - timeout-minutes: 5 - - - name: Test if the slurm cluster is setup correctly - run: srun --nodes=1 --ntasks=1 --cpus-per-task=1 --mem=1G --time=00:01:00 hostname - - - name: Setup passwordless SSH access to localhost for tests - # Adapted from https://stackoverflow.com/a/60367309/6388696 - run: | - ssh-keygen -t ed25519 -f ~/.ssh/testkey -N '' - cat > ~/.ssh/config < ~/.ssh/authorized_keys - chmod og-rw ~ - ssh -o 'StrictHostKeyChecking no' localhost id - - - name: Install the latest version of uv - uses: astral-sh/setup-uv@v3 - with: - version: "latest" - enable-cache: true - # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token - github-token: ${{ secrets.GITHUB_TOKEN }} - cache-suffix: ${{ matrix.python-version }} - - name: Pin Python version to ${{ matrix.python-version }} - run: uv python pin ${{ matrix.python-version }} - - name: Install dependencies - run: uv sync - - - name: Launch integration tests - run: uv run pytest --slow --cov=milatools --cov-report=xml --cov-append -vvv --log-level=DEBUG - timeout-minutes: 15 - env: - SLURM_CLUSTER: localhost - - - name: Store coverage report as an artifact - uses: actions/upload-artifact@v4 - with: - name: coverage-reports-mock-${{ matrix.platform }}-${{ matrix.python-version }} - path: ./coverage.xml - real-slurm-integration-tests: name: integration tests with a real SLURM cluster - needs: [mock-slurm-integration-tests] + needs: [unit-tests] strategy: max-parallel: 1