Skip to content

Commit

Permalink
Merge pull request wfau#1260 from Zarquan/20240124-zrq-jade-debug
Browse files Browse the repository at this point in the history
Now it works .. now it doesn't .. now it does.
3 control nodes and 6 workers
  • Loading branch information
Zarquan authored Jan 25, 2024
2 parents b251a8f + c3045e6 commit ed7e8b8
Show file tree
Hide file tree
Showing 6 changed files with 1,525 additions and 5 deletions.
10 changes: 5 additions & 5 deletions deployments/cluster-api/ansible/config/deployments.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,18 @@ deployments:

bootstrap:
login: "fedora"
flavor: "qserv-jump-v2"
flavor: "gaia.vm.2vcpu"
image: "gaia-dmp-fedora-cloud-38-1.6"

controlnode:
login: "ubuntu"
flavor: "qserv-jump-v2"
flavor: "gaia.vm.2vcpu"
image: "gaia-dmp-ubuntu-2204-kube-v1.26.7"
count: 1
count: 3

clusternode:
login: "ubuntu"
flavor: "qserv-worker-v2"
flavor: "gaia.vm.26vcpu"
image: "gaia-dmp-ubuntu-2204-kube-v1.26.7"
count: 3
count: 6

2 changes: 2 additions & 0 deletions deployments/common/ssh/aglais-team-keys
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC3T5ShHZ+HQJ6LpPwgpqRK/U0SYnGLSGY7LtwPiDA4
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDDzZl6gXOSEAKBaxiIb+YMZNc7UZmp/ruB9h/iJL/8fO6Y60JXlpnHW9hYv5ks/NFteokK9+5YSnW5WEVHZtLkCWRGSkAea2JrC64OSngGf4YPzLosxC3wDjjWwYPEya8nfkiVSSxWUXSNIfrdExDTaPEKB/IbFcHI6oJnDGtLEk9rdMQI+BKm5JPlg/gbOEBflAdB107dB5TSii9bj833oW1PyCOc8Ti+7JrH9qsxGUdjHhSYYw/bluxsMB+YoxA85Xu2uoikAW3LIIuNKfJGzdACvxLH40qnfYwbDrf5d7VHfWmYLT7x7ycKJiMbzmfNJMPfeeFMMldY02P+ubzT [email protected]
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQC7i4yiwoibOPn21ACNFfXZHQN2j7TzyuLLyaSoH97dUJyr0ImH5D/sVwcjK9O1+zagGuDMZGpBuP5Vi5tHQtvXu0nzwtOX1edX+/B07oTwvKCio8d5awhF0OxeA+YpSRbSKorrckPgcmXIxB2STKZeFxmm8OGrTyIA1gER1j2cgSlz2c+/ZYaO+9sOEX4WhnSA1MfiGLyqPJRvGQRXNeS3NhMRkw7dyi6ge2ohFrAN3miOuhW+vEjyc9yncjqyyfhI8l6FuJmaCpezX0dSLsR+TJBd9/OmsdUZTWB0uIMYDUAKQ2m5sglHMFiZ92F9XBYI0fcwMY1dgRc/F0qLRSmbSTTT1cCb9WRWlzcPrl4f4tcsu2RWChN6jMNTFp7yQ72dhKCqPrlkalk870ZRvXMANtNzL6Q4imeSMYPWAEH14p2N4SPRFEsTrH5iW39fThUzOCPbf9DLs+9FNmgk3SQkYlsQJZA2f8QDpd0joSLvBZER1BEr6M3xqvFTj/LqH88= [email protected]
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDJYb748Ma79ycA5C1Nt9FeqlU4idNE+1LnUrEGqFnkeLV5QFbRj5DRBev3xbulFmkV09ZsV58fKnATtLjTTpgYUjuAEGVdqmf9n69TJCeTkB06zYbPxOBmjkEDUEcQ7BYATvZfRm8aI3VWUvivdOrjmUSBjrPqkHOw+FhlfTbGKkLo0osycHtALZMst46RuUm7s/XfbBQAfAksK85yx4ni98LcTyJk698nwavDGDjWOk/hjNwcTqFs1xRVUZmBalPi85J0CFou59t2jP9eRF7c97obMwVvniLTT7Jnhe/cB/806oQKPZBa13m7yprn85vmm9Axje1e059bP8ZrG2PX3RTwsl42oyAtCIuvZQ3rkhCP2NFIWjbkCPKPOy6RKOWe7h7C8yViiXyAHro3PoyTTd+dfkp7NVRjcgBeamrVdpR8McUJRU72upN0lXurhlkD8Fvoo8ptbwjAOw/ud/+Z53pRr3j6sF5Zqa7znkMNvjM4/jbxdxyCH59Ff4nzGs8= [email protected]
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDJFJirE/fseIB0oNhuh6aVOiOPdsNgJe6CuUhXm9jkuy3aayBhg4Uxv5+ffBiMwZknxHMuU5v7Pp8SWbxFwXR1J4KbghkwoEP+NLIWY4V5nyporW02SPNcAy06ydNAUGHHClAzqFlXi0tJlA2K170Z3YgyNffyMWGf+znl1vlwSkYJxJtgX5ursgPzxz1ojFMvTJTNUM5diiRVzaRLbpJ3ivu9JneepHrHc1l3cgCrYp/dRgaYAdRPt7b2XAHYGXcCdFLGdeiLdcs+OhBU4DPW3k6pxYaAhBQLqLjZJ9hOMUsao0R9fFBTtyDUFI7mO+bgJ0Nm1qZ2qf71QZ3al5LCh4O2MXry46S/FkyzUVcdhekNircDFwwgX97egrcOiF9cOg8Wo3o+4l/Ce9YmMSU6q32yOfCckQywRegxsAnzIgea1cgJlF6xrjprpdft6KP2AlFdJz22qa7ibOea6MIl4AT3NduWghu3nrpTsTIEs9TcbbBPO775ImSeBQLTKi0= [email protected]

367 changes: 367 additions & 0 deletions notes/zrq/20240124-01-jade-debug.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,367 @@
#
# <meta:header>
# <meta:licence>
# Copyright (c) 2024, ROE (http://www.roe.ac.uk/)
#
# This information is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This information is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# </meta:licence>
# </meta:header>
#
#zrq-notes-time
#zrq-notes-indent
#zrq-notes-crypto
#zrq-notes-ansible
#zrq-notes-osformat
#zrq-notes-zeppelin
#
# AIMetrics: []
#

Target:

Continuing our journey, figuring out why our deployment fails on Somerville.

Original plan:

Add public key for Scott Davidson to enable him to access our VMs.
Add security rules to allow ssh connections between our nodes.
Create a gateway VM inside the cluster network.

Original plan skipped because it seems to work now.

Result:

Now it works !?
Unknown cause.
Do we trust it ?

History:

Jan 10th pass
Jan 12th fail
Jan 16th fail
Jan 17th fail
Jan 18th pass
Jan 19th pass
Jan 23rd fail
Jan 24th pass


# -----------------------------------------------------
# Start a new branch.
#[user@desktop]

branchname=jade-debug

source "${HOME:?}/aglais.env"
pushd "${AGLAIS_CODE}"

newbranch=$(date '+%Y%m%d')-zrq-${branchname:?}

git checkout master

git checkout -b "${newbranch:?}"

popd


# -----------------------------------------------------
# Add Scott's public key.
#[user@desktop]

source "${HOME:?}/aglais.env"
pushd "${AGLAIS_CODE}"

vi deployments/common/ssh/aglais-team-keys

popd


# -----------------------------------------------------
# Run our local client.
#[user@desktop]

source "${HOME:?}/aglais.env"
export PATH=${PATH}:${AGLAIS_CODE}/bin

agclient jade

> ....
> ....


# -----------------------------------------------------
# Delete and create everything.
#[root@ansibler]

export cloudsite=somerville-jade

/deployments/openstack/bin/delete-all.sh \
"${cloudname:?}"

ansible-playbook \
--inventory 'bootstrap,' \
'/deployments/cluster-api/ansible/00-create-all.yml'

> ....
> ....
> PLAY RECAP ******************************************************************************************************************************************
> bootstrap : ok=58 changed=45 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
> localhost : ok=35 changed=26 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0


# -----------------------------------------------------
# Check the cluster status.
#[root@ansibler]

ssh bootstrap -t \
'
source loadconfig
watch \
clusterctl \
--kubeconfig "${kindclusterconf:?}" \
describe cluster \
"${workclustername:?}"
'

> NAME READY SEVERITY REASON SINCE MESSAGE
> Cluster/somerville-jade-20240124-work True 5m55s
> ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240124-work
> ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240124-work-control-plane True 5m55s
> │ └─Machine/somerville-jade-20240124-work-control-plane-sm8rh True 7m11s
> └─Workers
> └─MachineDeployment/somerville-jade-20240124-work-md-0 True 2m5s
> └─3 Machines... True 5m18s See somerville-jade-20240124-work-md-0-jdjp8-s2clm, somerville-jade-20240124-work-md-0-jdjp8-stqxj, ...

#
# Now it works ...
# Not sure if I trust it though.
#


# -----------------------------------------------------
# Get the location of our cluster config files.
#[root@ansibler]

# TODO something to put this into the PATH
export PATH=${PATH}:/deployments/cluster-api/ansible/files/aglais/bin
source loadconfig


# -----------------------------------------------------
# Run a SOCKS proxy linking our client container to our bootstrap node.
# https://unix.stackexchange.com/questions/34004/how-does-tcp-keepalive-work-in-ssh
# https://unix.stackexchange.com/a/34201
#[root@ansibler]

ssh \
-n \
-f \
-N \
-D '*:3000' \
-o ServerAliveInterval=10 \
-o ServerAliveCountMax=12 \
bootstrap

> ....
> ....


# -----------------------------------------------------
# Modify our kubectl config to add a SOCKS proxy.
#[root@ansibler]

source loadconfig
vi "${workclusterconf:?}"

apiVersion: v1
kind: Config
clusters:
- cluster:
name: somerville-jade-20240118-work
....
server: https://192.41.122.195:6443
+ proxy-url: socks5://localhost:3000/


# -----------------------------------------------------
# Check we can access the cluster-info.
#[root@ansibler]

source loadconfig
kubectl \
--kubeconfig "${workclusterconf:?}" \
cluster-info

> Kubernetes control plane is running at https://192.41.122.223:6443
> CoreDNS is running at https://192.41.122.223:6443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy


# -----------------------------------------------------
# Deploy our gaia-dmp Helm chart.
#[root@ansibler]

source loadconfig

helm dependency build \
--kubeconfig "${workclusterconf:?}" \
'/deployments/cluster-api/helm/gaia-dmp'

> Saving 2 charts
> Deleting outdated charts


helm upgrade \
--wait \
--debug \
--kubeconfig "${workclusterconf:?}" \
'gaia-dmp' \
'/deployments/cluster-api/helm/gaia-dmp' \
--install

> history.go:56: [debug] getting history for release gaia-dmp
> Release "gaia-dmp" does not exist. Installing it now.
> install.go:194: [debug] Original chart version: ""
> install.go:211: [debug] CHART PATH: /deployments/cluster-api/helm/gaia-dmp
> ....
> ....
> ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready
> ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready
> Error: timed out waiting for the condition
> helm.go:84: [debug] timed out waiting for the condition


helm upgrade \
--wait \
--debug \
--kubeconfig "${workclusterconf:?}" \
'gaia-dmp' \
'/deployments/cluster-api/helm/gaia-dmp' \
--install

> history.go:56: [debug] getting history for release gaia-dmp
> upgrade.go:144: [debug] preparing upgrade for gaia-dmp
> upgrade.go:152: [debug] performing update for gaia-dmp
> upgrade.go:324: [debug] creating upgraded release for gaia-dmp
> client.go:338: [debug] checking 10 resources for changes
> client.go:617: [debug] Looks like there are no changes for Namespace "gaia-dmp"
> client.go:617: [debug] Looks like there are no changes for ServiceAccount "dashboard-admin-account"
> client.go:617: [debug] Looks like there are no changes for ServiceAccount "zeppelin-server"
> client.go:617: [debug] Looks like there are no changes for ConfigMap "zeppelin-server-conf-map"
> client.go:617: [debug] Looks like there are no changes for ConfigMap "zeppelin-server-conf"
> client.go:617: [debug] Looks like there are no changes for ClusterRole "zeppelin-server-role"
> client.go:617: [debug] Looks like there are no changes for ClusterRoleBinding "dashboard-admin-binding"
> client.go:617: [debug] Looks like there are no changes for RoleBinding "zeppelin-server-role-binding"
> client.go:617: [debug] Looks like there are no changes for Service "zeppelin-server"
> client.go:626: [debug] Patch Deployment "zeppelin-server" in namespace default
> upgrade.go:396: [debug] waiting for release gaia-dmp resources (created: 0 updated: 10 deleted: 0)
> wait.go:48: [debug] beginning wait for 10 resources with timeout of 5m0s
> ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready
> ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready
> ....
> ....
> ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready
> ready.go:277: [debug] Deployment is not ready: default/zeppelin-server. 0 out of 1 expected pods are ready
> upgrade.go:159: [debug] updating status for upgraded release for gaia-dmp
> Release "gaia-dmp" has been upgraded. Happy Helming!
> NAME: gaia-dmp
> LAST DEPLOYED: Wed Jan 24 15:28:29 2024
> NAMESPACE: default
> STATUS: deployed
> REVISION: 2
> TEST SUITE: None
> ....
> ....

# -----------------------------------------------------
# Generate a dashboard token.
#[root@ansibler]

kubectl \
--kubeconfig "${workclusterconf:?}" \
--namespace "gaia-dmp" \
create token \
"dashboard-admin-account"

> ....
> ....


# -----------------------------------------------------
# Launch a kubectl proxy.
#[root@ansibler]

kubectl \
--kubeconfig "${workclusterconf:?}" \
--address 0.0.0.0 \
proxy \
&

> Starting to serve on [::]:8001


# -----------------------------------------------------
# -----------------------------------------------------
# Get the published port number for our agclient.
#[user@desktop]

agcolour=jade

kubeport=$(
podman container \
inspect \
"ansibler-${agcolour:?}" \
--format json \
| jq -r '
.[0]
| .HostConfig.PortBindings
| ."8001/tcp"
| .[0].HostPort
'
)

echo "kubeport [${kubeport}]"

> kubeport [42159]


# -----------------------------------------------------
# Launch browser pointed at the dashboard.
#[user@desktop]

firefox \
--new-window \
"http://localhost:${kubeport:?}/api/v1/namespaces/kubernetes-dashboard/services/https:kubernetes-dashboard:https/proxy/#/login" \
&

#
# Dashboard works :-)
#


# -----------------------------------------------------
# Launch browser pointed at Zeppelin.
#[user@desktop]

firefox \
--new-window \
"http://localhost:${kubeport:?}/api/v1/namespaces/default/services/http:zeppelin-server:http/proxy/#/" \
&

#
# Zeppelin responds .. but only part of the front page is displayed.
# Suspect that stacking multiple proxies on top of each other is mangling the 'clever' JS UI app.
#


Loading

0 comments on commit ed7e8b8

Please sign in to comment.