From c3045e609316f9ad0a97383af821ea5588466057 Mon Sep 17 00:00:00 2001 From: Zarquan Date: Thu, 25 Jan 2024 12:25:26 +0000 Subject: [PATCH] Now it works .. now it doesn't --- .../ansible/config/deployments.yml | 10 +- deployments/common/ssh/aglais-team-keys | 2 + notes/zrq/20240124-01-jade-debug.txt | 367 +++++++++++++ notes/zrq/20240124-02-jade-flavors.txt | 411 +++++++++++++++ notes/zrq/20240125-01-jade-debug.txt | 490 ++++++++++++++++++ notes/zrq/20240125-02-jade-test.txt | 250 +++++++++ 6 files changed, 1525 insertions(+), 5 deletions(-) create mode 100644 notes/zrq/20240124-01-jade-debug.txt create mode 100644 notes/zrq/20240124-02-jade-flavors.txt create mode 100644 notes/zrq/20240125-01-jade-debug.txt create mode 100644 notes/zrq/20240125-02-jade-test.txt diff --git a/deployments/cluster-api/ansible/config/deployments.yml b/deployments/cluster-api/ansible/config/deployments.yml index 069ab28c..bd9d62dc 100644 --- a/deployments/cluster-api/ansible/config/deployments.yml +++ b/deployments/cluster-api/ansible/config/deployments.yml @@ -88,18 +88,18 @@ deployments: bootstrap: login: "fedora" - flavor: "qserv-jump-v2" + flavor: "gaia.vm.2vcpu" image: "gaia-dmp-fedora-cloud-38-1.6" controlnode: login: "ubuntu" - flavor: "qserv-jump-v2" + flavor: "gaia.vm.2vcpu" image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" - count: 1 + count: 3 clusternode: login: "ubuntu" - flavor: "qserv-worker-v2" + flavor: "gaia.vm.26vcpu" image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" - count: 3 + count: 6 diff --git a/deployments/common/ssh/aglais-team-keys b/deployments/common/ssh/aglais-team-keys index 3da8973f..9bb08908 100644 --- a/deployments/common/ssh/aglais-team-keys +++ b/deployments/common/ssh/aglais-team-keys @@ -3,3 +3,5 @@ ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3T5ShHZ+HQJ6LpPwgpqRK/U0SYnGLSGY7LtwPiDA4 ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDDzZl6gXOSEAKBaxiIb+YMZNc7UZmp/ruB9h/iJL/8fO6Y60JXlpnHW9hYv5ks/NFteokK9+5YSnW5WEVHZtLkCWRGSkAea2JrC64OSngGf4YPzLosxC3wDjjWwYPEya8nfkiVSSxWUXSNIfrdExDTaPEKB/IbFcHI6oJnDGtLEk9rdMQI+BKm5JPlg/gbOEBflAdB107dB5TSii9bj833oW1PyCOc8Ti+7JrH9qsxGUdjHhSYYw/bluxsMB+YoxA85Xu2uoikAW3LIIuNKfJGzdACvxLH40qnfYwbDrf5d7VHfWmYLT7x7ycKJiMbzmfNJMPfeeFMMldY02P+ubzT dmr@roe.ac.uk ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC7i4yiwoibOPn21ACNFfXZHQN2j7TzyuLLyaSoH97dUJyr0ImH5D/sVwcjK9O1+zagGuDMZGpBuP5Vi5tHQtvXu0nzwtOX1edX+/B07oTwvKCio8d5awhF0OxeA+YpSRbSKorrckPgcmXIxB2STKZeFxmm8OGrTyIA1gER1j2cgSlz2c+/ZYaO+9sOEX4WhnSA1MfiGLyqPJRvGQRXNeS3NhMRkw7dyi6ge2ohFrAN3miOuhW+vEjyc9yncjqyyfhI8l6FuJmaCpezX0dSLsR+TJBd9/OmsdUZTWB0uIMYDUAKQ2m5sglHMFiZ92F9XBYI0fcwMY1dgRc/F0qLRSmbSTTT1cCb9WRWlzcPrl4f4tcsu2RWChN6jMNTFp7yQ72dhKCqPrlkalk870ZRvXMANtNzL6Q4imeSMYPWAEH14p2N4SPRFEsTrH5iW39fThUzOCPbf9DLs+9FNmgk3SQkYlsQJZA2f8QDpd0joSLvBZER1BEr6M3xqvFTj/LqH88= nch@roe.ac.uk ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDJYb748Ma79ycA5C1Nt9FeqlU4idNE+1LnUrEGqFnkeLV5QFbRj5DRBev3xbulFmkV09ZsV58fKnATtLjTTpgYUjuAEGVdqmf9n69TJCeTkB06zYbPxOBmjkEDUEcQ7BYATvZfRm8aI3VWUvivdOrjmUSBjrPqkHOw+FhlfTbGKkLo0osycHtALZMst46RuUm7s/XfbBQAfAksK85yx4ni98LcTyJk698nwavDGDjWOk/hjNwcTqFs1xRVUZmBalPi85J0CFou59t2jP9eRF7c97obMwVvniLTT7Jnhe/cB/806oQKPZBa13m7yprn85vmm9Axje1e059bP8ZrG2PX3RTwsl42oyAtCIuvZQ3rkhCP2NFIWjbkCPKPOy6RKOWe7h7C8yViiXyAHro3PoyTTd+dfkp7NVRjcgBeamrVdpR8McUJRU72upN0lXurhlkD8Fvoo8ptbwjAOw/ud/+Z53pRr3j6sF5Zqa7znkMNvjM4/jbxdxyCH59Ff4nzGs8= a.krause@epcc.ed.ac.uk +ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABgQDJFJirE/fseIB0oNhuh6aVOiOPdsNgJe6CuUhXm9jkuy3aayBhg4Uxv5+ffBiMwZknxHMuU5v7Pp8SWbxFwXR1J4KbghkwoEP+NLIWY4V5nyporW02SPNcAy06ydNAUGHHClAzqFlXi0tJlA2K170Z3YgyNffyMWGf+znl1vlwSkYJxJtgX5ursgPzxz1ojFMvTJTNUM5diiRVzaRLbpJ3ivu9JneepHrHc1l3cgCrYp/dRgaYAdRPt7b2XAHYGXcCdFLGdeiLdcs+OhBU4DPW3k6pxYaAhBQLqLjZJ9hOMUsao0R9fFBTtyDUFI7mO+bgJ0Nm1qZ2qf71QZ3al5LCh4O2MXry46S/FkyzUVcdhekNircDFwwgX97egrcOiF9cOg8Wo3o+4l/Ce9YmMSU6q32yOfCckQywRegxsAnzIgea1cgJlF6xrjprpdft6KP2AlFdJz22qa7ibOea6MIl4AT3NduWghu3nrpTsTIEs9TcbbBPO775ImSeBQLTKi0= scott@stackhpc.com
+
diff --git a/notes/zrq/20240124-01-jade-debug.txt b/notes/zrq/20240124-01-jade-debug.txt
new file mode 100644
index 00000000..d2573508
--- /dev/null
+++ b/notes/zrq/20240124-01-jade-debug.txt
@@ -0,0 +1,367 @@
+#
+#
+#
+# Copyright (c) 2024, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+# AIMetrics: []
+#
+
+    Target:
+
+        Continuing our journey, figuring out why our deployment fails on Somerville.
+
+        Original plan:
+
+            Add public key for Scott Davidson to enable him to access our VMs.
+            Add security rules to allow ssh connections between our nodes.
+            Create a gateway VM inside the cluster network.
+
+        Original plan skipped because it seems to work now.
+
+    Result:
+
+        Now it works !?
+        Unknown cause.
+        Do we trust it ?
+
+    History:
+
+        Jan 10th pass
+        Jan 12th fail
+        Jan 16th fail
+        Jan 17th fail
+        Jan 18th pass
+        Jan 19th pass
+        Jan 23rd fail
+        Jan 24th pass
+
+
+# -----------------------------------------------------
+# Start a new branch.
+#[user@desktop]
+
+    branchname=jade-debug
+
+    source "${HOME:?}/aglais.env"
+    pushd "${AGLAIS_CODE}"
+
+        newbranch=$(date '+%Y%m%d')-zrq-${branchname:?}
+
+        git checkout master
+
+        git checkout -b "${newbranch:?}"
+
+    popd
+
+
+# -----------------------------------------------------
+# Add Scott's public key.
+#[user@desktop]
+
+    source "${HOME:?}/aglais.env"
+    pushd "${AGLAIS_CODE}"
+
+        vi deployments/common/ssh/aglais-team-keys
+
+    popd
+
+
+# -----------------------------------------------------
+# Run our local client.
+#[user@desktop]
+
+    source "${HOME:?}/aglais.env"
+    export PATH=${PATH}:${AGLAIS_CODE}/bin
+
+    agclient jade
+
+    > ....
+    > ....
+
+
+# -----------------------------------------------------
+# Delete and create everything.
+#[root@ansibler]
+
+    export cloudsite=somerville-jade
+
+    /deployments/openstack/bin/delete-all.sh \
+        "${cloudname:?}"
+
+    ansible-playbook \
+        --inventory 'bootstrap,' \
+        '/deployments/cluster-api/ansible/00-create-all.yml'
+
+    > ....
+    > ....
+ > PLAY RECAP ****************************************************************************************************************************************** + > bootstrap : ok=58 changed=45 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + > localhost : ok=35 changed=26 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + + +# ----------------------------------------------------- +# Check the cluster status. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240124-work True 5m55s + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane True 5m55s + > │ └─Machine/somerville-jade-20240124-work-control-plane-sm8rh True 7m11s + > └─Workers + > └─MachineDeployment/somerville-jade-20240124-work-md-0 True 2m5s + > └─3 Machines... True 5m18s See somerville-jade-20240124-work-md-0-jdjp8-s2clm, somerville-jade-20240124-work-md-0-jdjp8-stqxj, ... + + # + # Now it works ... + # Not sure if I trust it though. + # + + +# ----------------------------------------------------- +# Get the location of our cluster config files. +#[root@ansibler] + + # TODO something to put this into the PATH + export PATH=${PATH}:/deployments/cluster-api/ansible/files/aglais/bin + source loadconfig + + +# ----------------------------------------------------- +# Run a SOCKS proxy linking our client container to our bootstrap node. +# https://unix.stackexchange.com/questions/34004/how-does-tcp-keepalive-work-in-ssh +# https://unix.stackexchange.com/a/34201 +#[root@ansibler] + + ssh \ + -n \ + -f \ + -N \ + -D '*:3000' \ + -o ServerAliveInterval=10 \ + -o ServerAliveCountMax=12 \ + bootstrap + + > .... + > .... + + +# ----------------------------------------------------- +# Modify our kubectl config to add a SOCKS proxy. +#[root@ansibler] + + source loadconfig + vi "${workclusterconf:?}" + + apiVersion: v1 + kind: Config + clusters: + - cluster: + name: somerville-jade-20240118-work + .... + server: https://192.41.122.195:6443 + + proxy-url: socks5://localhost:3000/ + + +# ----------------------------------------------------- +# Check we can access the cluster-info. +#[root@ansibler] + + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + cluster-info + + > Kubernetes control plane is running at https://192.41.122.223:6443 + > CoreDNS is running at https://192.41.122.223:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy + + +# ----------------------------------------------------- +# Deploy our gaia-dmp Helm chart. +#[root@ansibler] + + source loadconfig + + helm dependency build \ + --kubeconfig "${workclusterconf:?}" \ + '/deployments/cluster-api/helm/gaia-dmp' + + > Saving 2 charts + > Deleting outdated charts + + + helm upgrade \ + --wait \ + --debug \ + --kubeconfig "${workclusterconf:?}" \ + 'gaia-dmp' \ + '/deployments/cluster-api/helm/gaia-dmp' \ + --install + + > history.go:56: [debug] getting history for release gaia-dmp + > Release "gaia-dmp" does not exist. Installing it now. + > install.go:194: [debug] Original chart version: "" + > install.go:211: [debug] CHART PATH: /deployments/cluster-api/helm/gaia-dmp + > .... + > .... + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 
0 out of 1 expected pods are ready + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > Error: timed out waiting for the condition + > helm.go:84: [debug] timed out waiting for the condition + + + helm upgrade \ + --wait \ + --debug \ + --kubeconfig "${workclusterconf:?}" \ + 'gaia-dmp' \ + '/deployments/cluster-api/helm/gaia-dmp' \ + --install + + > history.go:56: [debug] getting history for release gaia-dmp + > upgrade.go:144: [debug] preparing upgrade for gaia-dmp + > upgrade.go:152: [debug] performing update for gaia-dmp + > upgrade.go:324: [debug] creating upgraded release for gaia-dmp + > client.go:338: [debug] checking 10 resources for changes + > client.go:617: [debug] Looks like there are no changes for Namespace "gaia-dmp" + > client.go:617: [debug] Looks like there are no changes for ServiceAccount "dashboard-admin-account" + > client.go:617: [debug] Looks like there are no changes for ServiceAccount "zeppelin-server" + > client.go:617: [debug] Looks like there are no changes for ConfigMap "zeppelin-server-conf-map" + > client.go:617: [debug] Looks like there are no changes for ConfigMap "zeppelin-server-conf" + > client.go:617: [debug] Looks like there are no changes for ClusterRole "zeppelin-server-role" + > client.go:617: [debug] Looks like there are no changes for ClusterRoleBinding "dashboard-admin-binding" + > client.go:617: [debug] Looks like there are no changes for RoleBinding "zeppelin-server-role-binding" + > client.go:617: [debug] Looks like there are no changes for Service "zeppelin-server" + > client.go:626: [debug] Patch Deployment "zeppelin-server" in namespace default + > upgrade.go:396: [debug] waiting for release gaia-dmp resources (created: 0 updated: 10 deleted: 0) + > wait.go:48: [debug] beginning wait for 10 resources with timeout of 5m0s + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > .... + > .... + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > upgrade.go:159: [debug] updating status for upgraded release for gaia-dmp + > Release "gaia-dmp" has been upgraded. Happy Helming! + > NAME: gaia-dmp + > LAST DEPLOYED: Wed Jan 24 15:28:29 2024 + > NAMESPACE: default + > STATUS: deployed + > REVISION: 2 + > TEST SUITE: None + > .... + > .... + +# ----------------------------------------------------- +# Generate a dashboard token. +#[root@ansibler] + + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + --namespace "gaia-dmp" \ + create token \ + "dashboard-admin-account" + + > .... + > .... + + +# ----------------------------------------------------- +# Launch a kubectl proxy. +#[root@ansibler] + + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + --address 0.0.0.0 \ + proxy \ + & + + > Starting to serve on [::]:8001 + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Get the published port number for our agclient. 
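+#
+# (Aside: `podman port` may be a simpler way to read the same mapping,
+#  assuming the container publishes 8001/tcp - a hedged one-liner:
+#
+#      podman port "ansibler-${agcolour:?}" 8001/tcp
+#
+#  which should print something like 0.0.0.0:42159.)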
+#[user@desktop]
+
+    agcolour=jade
+
+    kubeport=$(
+        podman container \
+            inspect \
+            "ansibler-${agcolour:?}" \
+            --format json \
+        | jq -r '
+            .[0]
+            | .HostConfig.PortBindings
+            | ."8001/tcp"
+            | .[0].HostPort
+            '
+        )
+
+    echo "kubeport [${kubeport}]"
+
+    > kubeport [42159]
+
+
+# -----------------------------------------------------
+# Launch browser pointed at the dashboard.
+#[user@desktop]
+
+    firefox \
+        --new-window \
+        "http://localhost:${kubeport:?}/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:https/proxy/#/login" \
+        &
+
+    #
+    # Dashboard works :-)
+    #
+
+
+# -----------------------------------------------------
+# Launch browser pointed at Zeppelin.
+#[user@desktop]
+
+    firefox \
+        --new-window \
+        "http://localhost:${kubeport:?}/api/v1/namespaces/default/services/http:zeppelin-server:http/proxy/#/" \
+        &
+
+    #
+    # Zeppelin responds .. but only part of the front page is displayed.
+    # Suspect that stacking multiple proxies on top of each other is mangling the 'clever' JS UI app.
+    #
+
+
diff --git a/notes/zrq/20240124-02-jade-flavors.txt b/notes/zrq/20240124-02-jade-flavors.txt
new file mode 100644
index 00000000..cc6db448
--- /dev/null
+++ b/notes/zrq/20240124-02-jade-flavors.txt
@@ -0,0 +1,411 @@
+#
+#
+#
+# Copyright (c) 2024, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+# AIMetrics: []
+#
+
+    Target:
+
+        Test out our new flavors and quota.
+
+
+    Result:
+
+        Work in progress ...
+
+# -----------------------------------------------------
+# Run our local client.
+#[user@desktop]
+
+    source "${HOME:?}/aglais.env"
+    export PATH=${PATH}:${AGLAIS_CODE}/bin
+
+    agclient jade
+
+    > ....
+    > ....
+
+
+# -----------------------------------------------------
+# List our new flavors.
+#[root@ansibler]
+
+    openstack \
+        --os-cloud "${cloudname:?}" \
+        flavor list \
+        --format json \
+    | jq -r '
+        .[]
+        | select(
+            .Name | startswith("gaia")
+            )
+        | .Name
+        '
+
+    > gaia.vm.2vcpu
+    > gaia.vm.4vcpu
+    > gaia.vm.26vcpu
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Update our deployment configuration.
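+#
+# (Before wiring the new flavors in, a hedged sanity check of their specs,
+#  using the standard OpenStack CLI:
+#
+#      openstack \
+#          --os-cloud "${cloudname:?}" \
+#          flavor show 'gaia.vm.26vcpu' \
+#          --format json \
+#      | jq '{vcpus, ram, disk}'
+#
+#  should confirm the vcpu count, RAM and disk for the worker flavor.)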
+#[user@desktop] + + source "${HOME:?}/aglais.env" + pushd "${AGLAIS_CODE}" + + vi deployments/cluster-api/ansible/config/deployments.yml + + git diff deployments/cluster-api/ansible/config/deployments.yml + + popd + + > diff --git a/deployments/cluster-api/ansible/config/deployments.yml b/deployments/cluster-api/ansible/config/deployments.yml + > index 069ab28..bd9d62d 100644 + > --- a/deployments/cluster-api/ansible/config/deployments.yml + > +++ b/deployments/cluster-api/ansible/config/deployments.yml + > @@ -88,18 +88,18 @@ deployments: + > + > bootstrap: + > login: "fedora" + > - flavor: "qserv-jump-v2" + > + flavor: "gaia.vm.2vcpu" + > image: "gaia-dmp-fedora-cloud-38-1.6" + > + > controlnode: + > login: "ubuntu" + > - flavor: "qserv-jump-v2" + > + flavor: "gaia.vm.2vcpu" + > image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + > - count: 1 + > + count: 3 + > + > clusternode: + > login: "ubuntu" + > - flavor: "qserv-worker-v2" + > + flavor: "gaia.vm.26vcpu" + > image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + > - count: 3 + > + count: 6 + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + export cloudsite=somerville-jade + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... + > TASK [Update SSH fingerprints] ********************************************************************************************************************** + > fatal: [localhost]: FAILED! => { + > "changed": true, + > "cmd": " + > touch \"/root/.ssh/known_hosts\" + > sed --in-place '/^bootstrap/ d' \"/root/.ssh/known_hosts\" + > ssh-keyscan 'bootstrap' >> \"/root/.ssh/known_hosts\" + > ", + > "delta": "0:00:00.047044", + > "end": "2024-01-24 16:09:28.881519", + > "msg": "non-zero return code", + > "rc": 1, + > "start": "2024-01-24 16:09:28.834475", + > "stderr": "write (bootstrap): Connection refused ....", + > "stderr_lines": [ + > "write (bootstrap): Connection refused", + > "write (bootstrap): Connection refused", + > "write (bootstrap): Connection refused", + > "write (bootstrap): Connection refused", + > "write (bootstrap): Connection refused" + > ], + > "stdout": "", + > "stdout_lines": [] + > } + > + > PLAY RECAP ****************************************************************************************************************************************** + > localhost : ok=30 changed=21 unreachable=0 failed=1 skipped=0 rescued=0 ignored=0 + + # + # Is this just because the VM was slow starting up ? + # + +# ----------------------------------------------------- +# Try again ... +#[root@ansibler] + + export cloudsite=somerville-jade + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... + > PLAY RECAP ****************************************************************************************************************************************** + > bootstrap : ok=58 changed=45 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + > localhost : ok=35 changed=25 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 + + +# ----------------------------------------------------- +# Check the cluster status. 
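+#
+# (Aside on the keyscan failure above: it looks like a race against sshd
+#  coming up on the new VM. A hedged retry sketch, using the same
+#  'bootstrap' host alias:
+#
+#      until ssh-keyscan 'bootstrap' > '/tmp/bootstrap.keys' 2> /dev/null \
+#            && [ -s '/tmp/bootstrap.keys' ]
+#      do
+#          sleep 5
+#      done
+#      cat '/tmp/bootstrap.keys' >> "${HOME}/.ssh/known_hosts"
+#
+#  would poll until sshd answers before updating known_hosts.)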
+#[root@ansibler]
+
+    ssh bootstrap -t \
+        '
+        source loadconfig
+        watch \
+            clusterctl \
+                --kubeconfig "${kindclusterconf:?}" \
+                describe cluster \
+                    "${workclustername:?}"
+        '
+
+    > NAME READY SEVERITY REASON SINCE MESSAGE
+    > Cluster/somerville-jade-20240124-work False Warning ScalingUp 21m Scaling up control plane to 3 replicas (actual 1)
+    > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work
+    > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane False Warning ScalingUp 21m Scaling up control plane to 3 replicas (actual 1)
+    > │ └─Machine/somerville-jade-20240124-work-control-plane-rhstd False Warning NodeStartupTimeout 9m23s Node failed to report startup in 10m0s
+    > └─Workers
+    >   └─MachineDeployment/somerville-jade-20240124-work-md-0 False Warning WaitingForAvailableMachines 23m Minimum availability requires 5 replicas, current 0 available
+    >     └─6 Machines... True 8m54s See somerville-jade-20240124-work-md-0-jrm4b-c7c2m, somerville-jade-20240124-work-md-0-jrm4b-p6tf5, ...
+
+    #
+    # Back to not working ...
+    # Similar symptoms as before.
+    # So is this the new flavors or the previous issue coming back ?
+    #
+    # Try with just the new flavors but the original counts.
+    #
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Update our deployment configuration.
+#[user@desktop]
+
+    source "${HOME:?}/aglais.env"
+    pushd "${AGLAIS_CODE}"
+
+        vi deployments/cluster-api/ansible/config/deployments.yml
+
+        git diff deployments/cluster-api/ansible/config/deployments.yml
+
+    popd
+
+    > diff --git a/deployments/cluster-api/ansible/config/deployments.yml b/deployments/cluster-api/ansible/config/deployments.yml
+    > index 069ab28..a7758ac 100644
+    > --- a/deployments/cluster-api/ansible/config/deployments.yml
+    > +++ b/deployments/cluster-api/ansible/config/deployments.yml
+    > @@ -88,18 +88,18 @@ deployments:
+    >
+    >      bootstrap:
+    >        login:  "fedora"
+    > -      flavor: "qserv-jump-v2"
+    > +      flavor: "gaia.vm.2vcpu"
+    >        image:  "gaia-dmp-fedora-cloud-38-1.6"
+    >
+    >      controlnode:
+    >        login:  "ubuntu"
+    > -      flavor: "qserv-jump-v2"
+    > +      flavor: "gaia.vm.2vcpu"
+    >        image:  "gaia-dmp-ubuntu-2204-kube-v1.26.7"
+    >        count: 1
+    >
+    >      clusternode:
+    >        login:  "ubuntu"
+    > -      flavor: "qserv-worker-v2"
+    > +      flavor: "gaia.vm.26vcpu"
+    >        image:  "gaia-dmp-ubuntu-2204-kube-v1.26.7"
+    >        count: 3
+
+
+# -----------------------------------------------------
+# -----------------------------------------------------
+# Delete and create everything.
+#[root@ansibler]
+
+    export cloudsite=somerville-jade
+
+    /deployments/openstack/bin/delete-all.sh \
+        "${cloudname:?}"
+
+    ansible-playbook \
+        --inventory 'bootstrap,' \
+        '/deployments/cluster-api/ansible/00-create-all.yml'
+
+    > ....
+    > ....
+    > PLAY RECAP ******************************************************************************************************************************************
+    > bootstrap : ok=58 changed=45 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
+    > localhost : ok=35 changed=23 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
+
+
+# -----------------------------------------------------
+# Check the cluster status.
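+#
+# (If machines stall in NodeStartupTimeout again, the Cluster API
+#  MachineHealthCheck objects on the kind cluster may say why - a hedged
+#  sketch, following the same pattern as the other bootstrap commands:
+#
+#      ssh bootstrap -t \
+#          '
+#          source loadconfig
+#          kubectl \
+#              --kubeconfig "${kindclusterconf:?}" \
+#              get machinehealthchecks \
+#              --all-namespaces
+#          '
+#  )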
+#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240124-work True 2m19s + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane True 2m19s + > │ └─Machine/somerville-jade-20240124-work-control-plane-cdngh True 3m23s + > └─Workers + > └─MachineDeployment/somerville-jade-20240124-work-md-0 False Warning WaitingForAvailableMachines 5m5s Minimum availability requires 2 replicas, current 0 available + > └─3 Machines... True 119s See somerville-jade-20240124-work-md-0-wqz9h-84cqs, somerville-jade-20240124-work-md-0-wqz9h-bhztw, ... + + # + # Nope, still not working. + # Looks like the flavors are OK, but the healthcheck callbacks aren't. + # + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240124-work True 10m + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane True 10m + > │ └─Machine/somerville-jade-20240124-work-control-plane-cdngh True 11m + > └─Workers + > └─MachineDeployment/somerville-jade-20240124-work-md-0 True 6m25s + > └─3 Machines... True 10m See somerville-jade-20240124-work-md-0-wqz9h-84cqs, somerville-jade-20240124-work-md-0-wqz9h-bhztw, ... + > Connection to bootstrap closed. + + # + # Wait long enough and it works :-) + # Try with 3 control nodes .... + # + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Update our deployment configuration. +#[user@desktop] + + source "${HOME:?}/aglais.env" + pushd "${AGLAIS_CODE}" + + vi deployments/cluster-api/ansible/config/deployments.yml + + git diff deployments/cluster-api/ansible/config/deployments.yml + + popd + + > diff --git a/deployments/cluster-api/ansible/config/deployments.yml b/deployments/cluster-api/ansible/config/deployments.yml + > index 069ab28..e73ad20 100644 + > --- a/deployments/cluster-api/ansible/config/deployments.yml + > +++ b/deployments/cluster-api/ansible/config/deployments.yml + > @@ -88,18 +88,18 @@ deployments: + > + > bootstrap: + > login: "fedora" + > - flavor: "qserv-jump-v2" + > + flavor: "gaia.vm.2vcpu" + > image: "gaia-dmp-fedora-cloud-38-1.6" + > + > controlnode: + > login: "ubuntu" + > - flavor: "qserv-jump-v2" + > + flavor: "gaia.vm.2vcpu" + > image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + > - count: 1 + > + count: 3 + > + > clusternode: + > login: "ubuntu" + > - flavor: "qserv-worker-v2" + > + flavor: "gaia.vm.26vcpu" + > image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + > count: 3 + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + export cloudsite=somerville-jade + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... 
+    > PLAY RECAP ******************************************************************************************************************************************
+    > bootstrap : ok=58 changed=45 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
+    > localhost : ok=35 changed=23 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
+
+
+# -----------------------------------------------------
+# Check the cluster status.
+#[root@ansibler]
+
+    ssh bootstrap -t \
+        '
+        source loadconfig
+        watch \
+            clusterctl \
+                --kubeconfig "${kindclusterconf:?}" \
+                describe cluster \
+                    "${workclustername:?}"
+        '
+
+    > NAME READY SEVERITY REASON SINCE MESSAGE
+    > Cluster/somerville-jade-20240124-work False Warning ScalingUp 11h Scaling up control plane to 3 replicas (actual 1)
+    > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work
+    > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane False Warning ScalingUp 11h Scaling up control plane to 3 replicas (actual 1)
+    > │ └─Machine/somerville-jade-20240124-work-control-plane-k46g7 False Warning NodeStartupTimeout 11h Node failed to report startup in 10m0s
+    > └─Workers
+    >   └─MachineDeployment/somerville-jade-20240124-work-md-0 False Warning WaitingForAvailableMachines 11h Minimum availability requires 2 replicas, current 0 available
+    >     └─3 Machines... True 4m44s See somerville-jade-20240124-work-md-0-vqmxl-cvkm9, somerville-jade-20240124-work-md-0-vqmxl-shv4m, ...
+
+    #
+    # Nope.
+    #
+
diff --git a/notes/zrq/20240125-01-jade-debug.txt b/notes/zrq/20240125-01-jade-debug.txt
new file mode 100644
index 00000000..c05c62df
--- /dev/null
+++ b/notes/zrq/20240125-01-jade-debug.txt
@@ -0,0 +1,490 @@
+#
+#
+#
+# Copyright (c) 2024, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+# AIMetrics: []
+#
+
+    Target:
+
+        Try to summarise where we are.
+
+    Result:
+
+        Work in progress ...
+
+
+# -----------------------------------------------------
+# 20240124-02-jade-flavors.txt
+# FAIL : 3 control node, 6 workers
+#[root@ansibler]
+
+    # Where we want to be:
+    # New flavors, 3 control nodes, 6 worker nodes.
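+    #
+    # (Quota arithmetic, using the Horizon figures recorded further down:
+    #  1 bootstrap x 2 vcpu + 3 control x 2 vcpu + 6 workers x 26 vcpu
+    #  = 164 of the 180 core quota, so the full set should fit.)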
+ # + + vi /deployments/cluster-api/ansible/config/deployments.yml + + deployments: + somerville-jade: + bootstrap: + login: "fedora" + ~ flavor: "gaia.vm.2vcpu" + image: "gaia-dmp-fedora-cloud-38-1.6" + + controlnode: + login: "ubuntu" + ~ flavor: "gaia.vm.2vcpu" + image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + ~ count: 3 + + clusternode: + login: "ubuntu" + ~ flavor: "gaia.vm.26vcpu" + image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + ~ count: 6 + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240124-work False Warning ScalingUp 21m Scaling up control plane to 3 replicas (actual 1) + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane False Warning ScalingUp 21m Scaling up control plane to 3 replicas (actual 1) + > │ └─Machine/somerville-jade-20240124-work-control-plane-rhstd False Warning NodeStartupTimeout 9m23s Node failed to report startup in 10m0s + > └─Workers + > └─MachineDeployment/somerville-jade-20240124-work-md-0 False Warning WaitingForAvailableMachines 23m Minimum availability requires 5 replicas, current 0 available + > └─6 Machines... True 8m54s See somerville-jade-20240124-work-md-0-jrm4b-c7c2m, somerville-jade-20240124-work-md-0-jrm4b-p6tf5, ... + + # + # Fail - not working after 21m. + # + + +# ----------------------------------------------------- +# 20240124-02-jade-flavors.txt +# PASS : 1 control node, 3 workers +#[root@ansibler] + + # + # Try the original counts. + # New flavors, 1 control nodes, 3 worker nodes. + + vi /deployments/cluster-api/ansible/config/deployments.yml + + deployments: + somerville-jade: + bootstrap: + login: "fedora" + ~ flavor: "gaia.vm.2vcpu" + image: "gaia-dmp-fedora-cloud-38-1.6" + + controlnode: + login: "ubuntu" + ~ flavor: "gaia.vm.2vcpu" + image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + ~ count: 1 + + clusternode: + login: "ubuntu" + ~ flavor: "gaia.vm.26vcpu" + image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + ~ count: 3 + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240124-work True 10m + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane True 10m + > │ └─Machine/somerville-jade-20240124-work-control-plane-cdngh True 11m + > └─Workers + > └─MachineDeployment/somerville-jade-20240124-work-md-0 True 6m25s + > └─3 Machines... True 10m See somerville-jade-20240124-work-md-0-wqz9h-84cqs, somerville-jade-20240124-work-md-0-wqz9h-bhztw, ... + > Connection to bootstrap closed. + + # + # Pass - takes ~10m to get there. + # + + +# ----------------------------------------------------- +# 20240124-02-jade-flavors.txt +# FAIL : 3 control node, 3 workers +#[root@ansibler] + + # + # Try 3 control nodes. 
+    # New flavors, 3 control nodes, 3 worker nodes.
+
+    vi /deployments/cluster-api/ansible/config/deployments.yml
+
+        deployments:
+            somerville-jade:
+                bootstrap:
+                    login:  "fedora"
+        ~           flavor: "gaia.vm.2vcpu"
+                    image:  "gaia-dmp-fedora-cloud-38-1.6"
+
+                controlnode:
+                    login:  "ubuntu"
+        ~           flavor: "gaia.vm.2vcpu"
+                    image:  "gaia-dmp-ubuntu-2204-kube-v1.26.7"
+        ~           count: 3
+
+                clusternode:
+                    login:  "ubuntu"
+        ~           flavor: "gaia.vm.26vcpu"
+                    image:  "gaia-dmp-ubuntu-2204-kube-v1.26.7"
+        ~           count: 3
+
+    /deployments/openstack/bin/delete-all.sh \
+        "${cloudname:?}"
+
+    ansible-playbook \
+        --inventory 'bootstrap,' \
+        '/deployments/cluster-api/ansible/00-create-all.yml'
+
+    ssh bootstrap -t \
+        '
+        source loadconfig
+        watch \
+            clusterctl \
+                --kubeconfig "${kindclusterconf:?}" \
+                describe cluster \
+                    "${workclustername:?}"
+        '
+
+    > NAME READY SEVERITY REASON SINCE MESSAGE
+    > Cluster/somerville-jade-20240124-work False Warning ScalingUp 11h Scaling up control plane to 3 replicas (actual 1)
+    > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work
+    > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane False Warning ScalingUp 11h Scaling up control plane to 3 replicas (actual 1)
+    > │ └─Machine/somerville-jade-20240124-work-control-plane-k46g7 False Warning NodeStartupTimeout 11h Node failed to report startup in 10m0s
+    > └─Workers
+    >   └─MachineDeployment/somerville-jade-20240124-work-md-0 False Warning WaitingForAvailableMachines 11h Minimum availability requires 2 replicas, current 0 available
+    >     └─3 Machines... True 4m44s See somerville-jade-20240124-work-md-0-vqmxl-cvkm9, somerville-jade-20240124-work-md-0-vqmxl-shv4m, ...
+
+    #
+    # Fail - still not working after 11h.
+    #
+
+    #
+    # Summary so far:
+    # The new flavors seem to work OK.
+    # Machines are created and start up OK.
+    # More than 1 control node - fails.
+    # What about 1 control node and 6 workers ?
+    #
+
+
+# -----------------------------------------------------
+# 20240125-01-jade-debug.txt
+# PASS : 1 control node, 6 workers
+#[root@ansibler]
+
+    #
+    # New flavors, 1 control node, 6 worker nodes.
+
+    vi /deployments/cluster-api/ansible/config/deployments.yml
+
+        deployments:
+            somerville-jade:
+                bootstrap:
+                    login:  "fedora"
+        ~           flavor: "gaia.vm.2vcpu"
+                    image:  "gaia-dmp-fedora-cloud-38-1.6"
+
+                controlnode:
+                    login:  "ubuntu"
+        ~           flavor: "gaia.vm.2vcpu"
+                    image:  "gaia-dmp-ubuntu-2204-kube-v1.26.7"
+        ~           count: 1
+
+                clusternode:
+                    login:  "ubuntu"
+        ~           flavor: "gaia.vm.26vcpu"
+                    image:  "gaia-dmp-ubuntu-2204-kube-v1.26.7"
+        ~           count: 6
+
+    /deployments/openstack/bin/delete-all.sh \
+        "${cloudname:?}"
+
+    ansible-playbook \
+        --inventory 'bootstrap,' \
+        '/deployments/cluster-api/ansible/00-create-all.yml'
+
+    ssh bootstrap -t \
+        '
+        source loadconfig
+        watch \
+            clusterctl \
+                --kubeconfig "${kindclusterconf:?}" \
+                describe cluster \
+                    "${workclustername:?}"
+        '
+
+    > NAME READY SEVERITY REASON SINCE MESSAGE
+    > Cluster/somerville-jade-20240125-work True 4m50s
+    > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240125-work
+    > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240125-work-control-plane True 4m50s
+    > │ └─Machine/somerville-jade-20240125-work-control-plane-cxkkm True 6m21s
+    > └─Workers
+    >   └─MachineDeployment/somerville-jade-20240125-work-md-0 True 20s
+    >     └─6 Machines... True 4m17s See somerville-jade-20240125-work-md-0-gz6nl-6ds2n, somerville-jade-20240125-work-md-0-gz6nl-8bnpx, ...
+
+    #
+    # Yay - works.
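+    #
+    # (Cross-checking from the OpenStack side - a hedged sketch counting
+    #  the ACTIVE servers, assuming the same ${cloudname} as elsewhere:
+    #
+    #      openstack \
+    #          --os-cloud "${cloudname:?}" \
+    #          server list \
+    #          --status ACTIVE \
+    #          --format value \
+    #          --column Name \
+    #      | wc -l
+    #
+    #  should report 8: the bootstrap node, one control node, six workers.)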
+ # + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get machines \ + --all-namespaces + ' + + > NAMESPACE NAME CLUSTER NODENAME PROVIDERID PHASE AGE VERSION + > default somerville-jade-20240125-work-control-plane-cxkkm somerville-jade-20240125-work somerville-jade-20240125-work-control-plane-1ae41063-8wcdk openstack:///b81ed4b0-10cc-477d-903e-e9fc365972c3 Running 30m v1.26.7 + > default somerville-jade-20240125-work-md-0-gz6nl-6ds2n somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-6dsjm openstack:///9ac947d8-52d3-420c-965a-3c931819b3a0 Running 31m v1.26.7 + > default somerville-jade-20240125-work-md-0-gz6nl-8bnpx somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-k7bcc openstack:///e53a2842-fdb6-4bc1-a47d-f64e4e43335b Running 31m v1.26.7 + > default somerville-jade-20240125-work-md-0-gz6nl-gzrdr somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-gwfh8 openstack:///5071ea88-6109-4fad-9d11-33693119a6d8 Running 31m v1.26.7 + > default somerville-jade-20240125-work-md-0-gz6nl-m6nwj somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-pv8m4 openstack:///9a565461-89ae-4379-a803-e5bf7eab04e8 Running 31m v1.26.7 + > default somerville-jade-20240125-work-md-0-gz6nl-mqtgb somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-btcbl openstack:///f2af45da-dd0c-4611-b061-dd13bf909295 Running 31m v1.26.7 + > default somerville-jade-20240125-work-md-0-gz6nl-rhgzp somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-xcpkt openstack:///1a4e093f-3a75-4f9f-a4b4-c91cabb5ed74 Running 31m v1.26.7 + + + # + # Horizon overview: + # Used 8 of 20 instances + # Used 160 of 180 cpu cores + # Used 264GB of 320GB RAM + # + # Available: + # 12 instances + # 20 cpu cores + # 56G RAM + # + # It isn't a space issue. + # Adding another 2 * gaia.vm.2vcpu machines + # +2 instances + # +2 * 2 cpu cores + # +2 * 3GB RAM + # + +# ----------------------------------------------------- +# 20240125-01-jade-debug.txt +# TEST : 3 control node, 6 workers +#[root@ansibler] + + # + # New flavors, 3 control nodes, 6 worker nodes. + + vi /deployments/cluster-api/ansible/config/deployments.yml + + deployments: + somerville-jade: + bootstrap: + login: "fedora" + ~ flavor: "gaia.vm.2vcpu" + image: "gaia-dmp-fedora-cloud-38-1.6" + + controlnode: + login: "ubuntu" + ~ flavor: "gaia.vm.2vcpu" + image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + ~ count: 3 + + clusternode: + login: "ubuntu" + ~ flavor: "gaia.vm.26vcpu" + image: "gaia-dmp-ubuntu-2204-kube-v1.26.7" + ~ count: 6 + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240125-work True 36s + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240125-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240125-work-control-plane True 36s + > │ └─3 Machines... True 10m See somerville-jade-20240125-work-control-plane-chhwd, somerville-jade-20240125-work-control-plane-cnq4m, ... + > └─Workers + > └─MachineDeployment/somerville-jade-20240125-work-md-0 True 3m23s + > └─6 Machines... 
True 8m12s See somerville-jade-20240125-work-md-0-42mfs-99khf, somerville-jade-20240125-work-md-0-42mfs-cfcb4, ...
+    > Connection to bootstrap closed.
+
+    #
+    # Works, full set !!
+    # 3 control nodes, 6 workers
+    #
+
+    # 20240123-01-jade-debug.txt
+    # FAIL : 1 control node, 3 workers
+
+    # 20240124-01-jade-debug.txt
+    # PASS : 1 control node, 3 workers
+
+    # 20240124-02-jade-flavors.txt
+    # FAIL : 3 control node, 6 workers
+    # PASS : 1 control node, 3 workers
+    # FAIL : 3 control node, 3 workers
+
+    # 20240125-01-jade-debug.txt
+    # PASS : 1 control node, 6 workers
+    # PASS : 3 control node, 6 workers << works !!
+
+    # ---------------------------------
+
+    # Different results
+    # 20240123-01-jade-debug.txt
+    # 1 control node - FAIL
+
+    # 20240124-01-jade-debug.txt
+    # 1 control node - PASS
+
+    # 20240124-02-jade-flavors.txt
+    # 3 control node - FAIL
+    # 1 control node - PASS
+    # 3 control node - FAIL
+
+    # 20240125-01-jade-debug.txt
+    # 1 control node - PASS
+    # 3 control node - PASS << works !!
+
+
+# -----------------------------------------------------
+# List our servers in Openstack.
+#[root@ansibler]
+
+    openstack \
+        --os-cloud "${cloudname:?}" \
+        server list
+
+    > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+
+    > | ID | Name | Status | Networks | Image | Flavor |
+    > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+
+    > | 16adc559-d5ce-4468-8081-38394103dcfa | somerville-jade-20240125-work-control-plane-1ae41063-9qblb | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.73 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu |
+    > | e644c0c6-679f-45a5-949d-920eaffa515c | somerville-jade-20240125-work-control-plane-1ae41063-vh56j | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.11 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu |
+    > | 88caaac5-3b7a-4342-b7bf-0f4a781feaf7 | somerville-jade-20240125-work-md-0-de647233-22lc7 | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.214 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu |
+    > | c6e7944f-28e4-42d5-b32a-43ba8e2338e9 | somerville-jade-20240125-work-md-0-de647233-fxsln | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.98 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu |
+    > | cb96e588-eb0a-42c5-81b4-4cd872ba31ed | somerville-jade-20240125-work-md-0-de647233-6hq9q | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.188 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu |
+    > | 906d4342-9e47-42ba-b0a9-f8e6c761bb42 | somerville-jade-20240125-work-md-0-de647233-qrppj | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.253 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu |
+    > | 980b118f-15c9-41d2-9292-c816370fedf8 | somerville-jade-20240125-work-md-0-de647233-mtdcr | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.116 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu |
+    > | c900cf7d-6866-4efd-bdc4-b1430d94ec2c | somerville-jade-20240125-work-md-0-de647233-tzssf | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.105 |
gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | 454e4b80-54f0-468f-8628-6d3ea4f0ce64 | somerville-jade-20240125-work-control-plane-1ae41063-swqdl | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240125-work=192.168.3.95 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu | + > | 27c116e5-41aa-468a-94e6-410ccd4a69bc | somerville-jade-20240125-bootstrap-node | ACTIVE | somerville-jade-20240125-bootstrap-network=10.10.3.19, 192.41.122.84 | gaia-dmp-fedora-cloud-38-1.6 | gaia.vm.2vcpu | + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + + +# ----------------------------------------------------- +# List our machines and nodes in Kubernetes. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get machines \ + --all-namespaces + ' + + > NAMESPACE NAME CLUSTER NODENAME PROVIDERID PHASE AGE VERSION + > default somerville-jade-20240125-work-control-plane-chhwd somerville-jade-20240125-work somerville-jade-20240125-work-control-plane-1ae41063-vh56j openstack:///e644c0c6-679f-45a5-949d-920eaffa515c Running 11m v1.26.7 + > default somerville-jade-20240125-work-control-plane-cnq4m somerville-jade-20240125-work somerville-jade-20240125-work-control-plane-1ae41063-swqdl openstack:///454e4b80-54f0-468f-8628-6d3ea4f0ce64 Running 18m v1.26.7 + > default somerville-jade-20240125-work-control-plane-rw758 somerville-jade-20240125-work somerville-jade-20240125-work-control-plane-1ae41063-9qblb openstack:///16adc559-d5ce-4468-8081-38394103dcfa Running 9m28s v1.26.7 + > default somerville-jade-20240125-work-md-0-42mfs-99khf somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-tzssf openstack:///c900cf7d-6866-4efd-bdc4-b1430d94ec2c Running 20m v1.26.7 + > default somerville-jade-20240125-work-md-0-42mfs-cfcb4 somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-fxsln openstack:///c6e7944f-28e4-42d5-b32a-43ba8e2338e9 Running 20m v1.26.7 + > default somerville-jade-20240125-work-md-0-42mfs-gx2l2 somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-mtdcr openstack:///980b118f-15c9-41d2-9292-c816370fedf8 Running 20m v1.26.7 + > default somerville-jade-20240125-work-md-0-42mfs-j7cff somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-6hq9q openstack:///cb96e588-eb0a-42c5-81b4-4cd872ba31ed Running 20m v1.26.7 + > default somerville-jade-20240125-work-md-0-42mfs-kgdjp somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-qrppj openstack:///906d4342-9e47-42ba-b0a9-f8e6c761bb42 Running 20m v1.26.7 + > default somerville-jade-20240125-work-md-0-42mfs-trbsq somerville-jade-20240125-work somerville-jade-20240125-work-md-0-de647233-22lc7 openstack:///88caaac5-3b7a-4342-b7bf-0f4a781feaf7 Running 20m v1.26.7 + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get nodes \ + --all-namespaces + ' + + > NAME STATUS ROLES AGE VERSION + > somerville-jade-20240125-work-control-plane-1ae41063-9qblb Ready control-plane 9m18s v1.26.7 + > somerville-jade-20240125-work-control-plane-1ae41063-swqdl Ready control-plane 17m v1.26.7 + > somerville-jade-20240125-work-control-plane-1ae41063-vh56j Ready control-plane 11m v1.26.7 + > somerville-jade-20240125-work-md-0-de647233-22lc7 Ready 16m v1.26.7 + > 
somerville-jade-20240125-work-md-0-de647233-6hq9q Ready 16m v1.26.7
+    > somerville-jade-20240125-work-md-0-de647233-fxsln Ready 16m v1.26.7
+    > somerville-jade-20240125-work-md-0-de647233-mtdcr Ready 15m v1.26.7
+    > somerville-jade-20240125-work-md-0-de647233-qrppj Ready 15m v1.26.7
+    > somerville-jade-20240125-work-md-0-de647233-tzssf Ready 15m v1.26.7
+
diff --git a/notes/zrq/20240125-02-jade-test.txt b/notes/zrq/20240125-02-jade-test.txt
new file mode 100644
index 00000000..1380079b
--- /dev/null
+++ b/notes/zrq/20240125-02-jade-test.txt
@@ -0,0 +1,250 @@
+#
+#
+#
+# Copyright (c) 2024, ROE (http://www.roe.ac.uk/)
+#
+# This information is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This information is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#
+#zrq-notes-time
+#zrq-notes-indent
+#zrq-notes-crypto
+#zrq-notes-ansible
+#zrq-notes-osformat
+#zrq-notes-zeppelin
+#
+# AIMetrics: []
+#
+
+    Target:
+
+        We have a working cluster .. time to test it.
+
+    Result:
+
+        Work in progress ...
+
+# -----------------------------------------------------
+# Get the location of our cluster config files.
+#[root@ansibler]
+
+    # TODO something to put this into the PATH
+    export PATH=${PATH}:/deployments/cluster-api/ansible/files/aglais/bin
+    source loadconfig
+
+
+# -----------------------------------------------------
+# Run a SOCKS proxy linking our client container to our bootstrap node.
+# https://unix.stackexchange.com/questions/34004/how-does-tcp-keepalive-work-in-ssh
+# https://unix.stackexchange.com/a/34201
+#[root@ansibler]
+
+    ssh \
+        -n \
+        -f \
+        -N \
+        -D '*:3000' \
+        -o ServerAliveInterval=10 \
+        -o ServerAliveCountMax=12 \
+        bootstrap
+
+    > ....
+    > ....
+
+
+# -----------------------------------------------------
+# Modify our kubectl config to add a SOCKS proxy.
+#[root@ansibler]
+
+    source loadconfig
+    vi "${workclusterconf:?}"
+
+        apiVersion: v1
+        kind: Config
+        clusters:
+        - cluster:
+            name: somerville-jade-20240118-work
+            ....
+            server: https://192.41.122.195:6443
+
+            proxy-url: socks5://localhost:3000/
+
+
+# -----------------------------------------------------
+# Check we can access the cluster-info.
+#[root@ansibler]
+
+    source loadconfig
+    kubectl \
+        --kubeconfig "${workclusterconf:?}" \
+        cluster-info
+
+    > Kubernetes control plane is running at https://192.41.122.78:6443
+    > CoreDNS is running at https://192.41.122.78:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
+
+
+# -----------------------------------------------------
+# Deploy our gaia-dmp Helm chart.
+#[root@ansibler]
+
+    source loadconfig
+
+    helm dependency build \
+        --kubeconfig "${workclusterconf:?}" \
+        '/deployments/cluster-api/helm/gaia-dmp'
+
+    > Saving 2 charts
+    > Deleting outdated charts
+
+
+    helm upgrade \
+        --wait \
+        --debug \
+        --kubeconfig "${workclusterconf:?}" \
+        'gaia-dmp' \
+        '/deployments/cluster-api/helm/gaia-dmp' \
+        --install
+
+    > history.go:56: [debug] getting history for release gaia-dmp
+    > Release "gaia-dmp" does not exist. Installing it now.
+ > install.go:194: [debug] Original chart version: "" + > install.go:211: [debug] CHART PATH: /deployments/cluster-api/helm/gaia-dmp + > .... + > .... + > + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready + > Error: timed out waiting for the condition + > helm.go:84: [debug] timed out waiting for the condition + + + helm upgrade \ + --wait \ + --debug \ + --kubeconfig "${workclusterconf:?}" \ + 'gaia-dmp' \ + '/deployments/cluster-api/helm/gaia-dmp' \ + --install + + > history.go:56: [debug] getting history for release gaia-dmp + > upgrade.go:144: [debug] preparing upgrade for gaia-dmp + > upgrade.go:152: [debug] performing update for gaia-dmp + > upgrade.go:324: [debug] creating upgraded release for gaia-dmp + > client.go:338: [debug] checking 10 resources for changes + > client.go:617: [debug] Looks like there are no changes for Namespace "gaia-dmp" + > client.go:617: [debug] Looks like there are no changes for ServiceAccount "dashboard-admin-account" + > client.go:617: [debug] Looks like there are no changes for ServiceAccount "zeppelin-server" + > client.go:617: [debug] Looks like there are no changes for ConfigMap "zeppelin-server-conf-map" + > client.go:617: [debug] Looks like there are no changes for ConfigMap "zeppelin-server-conf" + > client.go:617: [debug] Looks like there are no changes for ClusterRole "zeppelin-server-role" + > client.go:617: [debug] Looks like there are no changes for ClusterRoleBinding "dashboard-admin-binding" + > client.go:617: [debug] Looks like there are no changes for RoleBinding "zeppelin-server-role-binding" + > client.go:617: [debug] Looks like there are no changes for Service "zeppelin-server" + > client.go:626: [debug] Patch Deployment "zeppelin-server" in namespace default + > upgrade.go:396: [debug] waiting for release gaia-dmp resources (created: 0 updated: 10 deleted: 0) + > wait.go:48: [debug] beginning wait for 10 resources with timeout of 5m0s + > upgrade.go:159: [debug] updating status for upgraded release for gaia-dmp + > Release "gaia-dmp" has been upgraded. Happy Helming! + > NAME: gaia-dmp + > LAST DEPLOYED: Thu Jan 25 12:20:23 2024 + > NAMESPACE: default + > STATUS: deployed + > REVISION: 2 + > TEST SUITE: None + > .... + > .... + + +# ----------------------------------------------------- +# Generate a dashboard token. +#[root@ansibler] + + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + --namespace "gaia-dmp" \ + create token \ + "dashboard-admin-account" + + > .... + > .... + + +# ----------------------------------------------------- +# Launch a kubectl proxy. +#[root@ansibler] + + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + --address 0.0.0.0 \ + proxy \ + & + + > Starting to serve on [::]:8001 + + +# ----------------------------------------------------- +# ----------------------------------------------------- +# Get the published port number for our agclient. +#[user@desktop] + + agcolour=jade + + kubeport=$( + podman container \ + inspect \ + "ansibler-${agcolour:?}" \ + --format json \ + | jq -r ' + .[0] + | .HostConfig.PortBindings + | ."8001/tcp" + | .[0].HostPort + ' + ) + + echo "kubeport [${kubeport}]" + + > kubeport [41667] + + +# ----------------------------------------------------- +# Launch browser pointed at the dashboard. 
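+#
+# (A hedged smoke test first - the kubectl proxy should answer on the
+#  published port before we bother the browser:
+#
+#      curl --silent "http://localhost:${kubeport:?}/version"
+#
+#  should return the Kubernetes version JSON.)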
+#[user@desktop]
+
+    firefox \
+        --new-window \
+        "http://localhost:${kubeport:?}/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:https/proxy/#/login" \
+        &
+
+    #
+    # Dashboard works :-)
+    #
+
+
+# -----------------------------------------------------
+# Launch browser pointed at Zeppelin.
+#[user@desktop]
+
+    firefox \
+        --new-window \
+        "http://localhost:${kubeport:?}/api/v1/namespaces/default/services/http:zeppelin-server:http/proxy/#/" \
+        &
+
+    #
+    # Zeppelin responds .. but only part of the front page is displayed.
+    # Suspect that stacking multiple proxies on top of each other is mangling the 'clever' JS UI app.
+    #
+
+
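+# -----------------------------------------------------
+# Probe the Zeppelin service through the proxy chain (sketch).
+#[user@desktop]
+
+    #
+    # A hedged follow-up for the mangled UI: Zeppelin's REST API has a
+    # /api/version endpoint, so fetching it through the same proxy chain
+    # should show whether plain HTTP survives even when the JS app does not.
+    # The URL below is the service proxy path used above; curl is assumed
+    # to be available on the desktop.
+    #
+
+    curl --silent \
+        "http://localhost:${kubeport:?}/api/v1/namespaces/default/services/http:zeppelin-server:http/proxy/api/version"
+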