Skip to content

Commit

Permalink
ibu cnf: add 'rollback after a failed upgrade' test
Browse files Browse the repository at this point in the history
  • Loading branch information
mpmaruthu committed Jun 22, 2024
1 parent d0ade6d commit 92c550b
Show file tree
Hide file tree
Showing 2 changed files with 247 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ const (
LabelPrepAbortFlow = "ibu-prep-abort"
// LabelUpgradeAbortFlow represents upgrade-abort label that can be used for test cases selection.
LabelUpgradeAbortFlow = "ibu-upgrade-abort"
// LabelRollbackFlow represents rollback label that can be used for test cases selection.
LabelRollbackFlow = "ibu-auto-rollback"

// IbuCguNamespace is the namespace where IBU CGUs created on target hub.
IbuCguNamespace = "default"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
package upgrade_test

import (
"strings"
"time"

"k8s.io/utils/ptr"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/openshift-kni/eco-goinfra/pkg/cgu"
"github.com/openshift-kni/eco-goinfra/pkg/lca"
"github.com/openshift-kni/eco-goinfra/pkg/nodes"
"github.com/openshift-kni/eco-goinfra/pkg/reportxml"
"github.com/openshift-kni/eco-gotests/tests/internal/cluster"
"github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/cnf/internal/cnfclusterinfo"
"github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/cnf/internal/cnfhelper"
. "github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/cnf/internal/cnfinittools"
"github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/cnf/upgrade-talm/internal/tsparams"
"github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/internal/nodestate"
"github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/internal/safeapirequest"
"github.com/openshift-kni/eco-gotests/tests/lca/imagebasedupgrade/internal/seedimage"
)

// Package-level state shared between the setup, teardown and test steps below.
var (
// ibu is the ImageBasedUpgrade builder pulled from the target SNO cluster in BeforeEach.
ibu *lca.ImageBasedUpgradeBuilder
// seedImageVersion holds the seed image content returned by seedimage.GetContent in BeforeEach.
seedImageVersion *seedimage.SeedImageContent
// err is the shared error variable assigned by the setup, teardown and CGU steps.
err error
// lcaInitMonitorTimeout is set by the test to the time allowed for the upgrade CGU wait
// after the fault has been injected.
lcaInitMonitorTimeout time.Duration
)

// Rollback-after-failed-upgrade suite.
//
// The test drives the pre-prep, prep and upgrade CGUs on the target hub, waits
// for the target SNO node to reboot, runs a command that overwrites
// /etc/mco/proxy.env so the upgrade cannot complete, expects the upgrade CGU
// not to complete within the LCA init-monitor timeout, and then waits for the
// node to reboot a second time before comparing cluster versions.
var _ = Describe(
	"Validating rollback stage after a failed upgrade",
	Label(tsparams.LabelRollbackFlow), func() {
		// waitForRebootAndClusterAvailable blocks until every node listed on the
		// target SNO cluster has gone unreachable and come back on port 6443,
		// all nodes report Ready, and the IBU resource is available again.
		// It is shared by the "stateroot B" and "stateroot A" reboot steps,
		// which perform exactly the same sequence of waits.
		waitForRebootAndClusterAvailable := func() {
			By("Get list of node to be upgraded")

			ibuNode, err := nodes.List(TargetSNOAPIClient)
			Expect(err).NotTo(HaveOccurred(), "error listing node")

			By("Wait for node to become unreachable")

			for _, node := range ibuNode {
				unreachable, err := nodestate.WaitForNodeToBeUnreachable(node.Object.Name, "6443", time.Minute*15)

				Expect(err).To(BeNil(), "error waiting for %s node to shutdown", node.Object.Name)
				Expect(unreachable).To(BeTrue(), "error: node %s is still reachable", node.Object.Name)
			}

			By("Wait for node to become reachable")

			for _, node := range ibuNode {
				reachable, err := nodestate.WaitForNodeToBeReachable(node.Object.Name, "6443", time.Minute*20)

				Expect(err).To(BeNil(), "error waiting for %s node to become reachable", node.Object.Name)
				Expect(reachable).To(BeTrue(), "error: node %s is still unreachable", node.Object.Name)
			}

			By("Wait until node is reporting as Ready")

			// The readiness check is wrapped in safeapirequest.Do, as in the
			// original code; the wrapper's semantics are defined outside this file.
			err = safeapirequest.Do(func() error {
				_, err := nodes.WaitForAllNodesAreReady(TargetSNOAPIClient, time.Minute*10)

				return err
			})
			Expect(err).To(BeNil(), "error waiting for node to become ready")

			By("Wait for IBU resource to be available")

			err = nodestate.WaitForIBUToBeAvailable(TargetSNOAPIClient, ibu, time.Minute*10)
			Expect(err).NotTo(HaveOccurred(), "error waiting for ibu resource to become available")
		}

		BeforeEach(func() {
			By("Fetching target sno cluster name", func() {
				err = cnfclusterinfo.PreUpgradeClusterInfo.SaveClusterInfo()
				Expect(err).NotTo(HaveOccurred(), "Failed to extract target sno cluster name")

				tsparams.TargetSnoClusterName = cnfclusterinfo.PreUpgradeClusterInfo.Name
			})

			By("Retrieve seed image version", func() {
				ibu, err = lca.PullImageBasedUpgrade(TargetSNOAPIClient)
				Expect(err).NotTo(HaveOccurred(), "error pulling ibu resource from cluster")

				seedImageVersion, err = seedimage.GetContent(TargetSNOAPIClient, ibu.Definition.Spec.SeedImageRef.Version)
				Expect(err).NotTo(HaveOccurred(), "error getting seed image version info")
			})

			By("Setting LCA init-monitor watchdog timer to 5 minutes to trigger rollback upon upgrade failure", func() {
				ibu, err = lca.PullImageBasedUpgrade(TargetSNOAPIClient)
				Expect(err).NotTo(HaveOccurred(), "error pulling ibu resource from cluster")

				// NOTE(review): the return value is discarded and no Update() call
				// is visible here - confirm this call alone persists the 300s
				// timeout on the cluster.
				ibu.AutoRollbackOnFailureInitMonitorTimeoutSeconds(300)
			})
		})

		AfterEach(func() {
			// Deleting CGUs created for validating the test case.
			By("Deleting pre-prep cgu created on target hub cluster", func() {
				err = cnfhelper.DeleteIbuTestCguOnTargetHub(TargetHubAPIClient, tsparams.PrePrepCguName,
					tsparams.IbuCguNamespace)
				Expect(err).NotTo(HaveOccurred(), "Failed to delete pre-prep cgu on target hub cluster")
			})

			By("Deleting prep cgu created on target hub cluster", func() {
				err = cnfhelper.DeleteIbuTestCguOnTargetHub(TargetHubAPIClient, tsparams.PrepCguName,
					tsparams.IbuCguNamespace)
				Expect(err).NotTo(HaveOccurred(), "Failed to delete prep cgu on target hub cluster")
			})

			By("Deleting upgrade cgu created on target hub cluster", func() {
				err = cnfhelper.DeleteIbuTestCguOnTargetHub(TargetHubAPIClient, tsparams.UpgradeCguName,
					tsparams.IbuCguNamespace)
				Expect(err).NotTo(HaveOccurred(), "Failed to delete upgrade cgu on target hub cluster")
			})
		})

		It("Rollback after a failed upgrade", reportxml.ID("69054"), func() {
			By("Creating, enabling ibu pre-prep CGU and waiting for CGU status to report completed", func() {
				prePrepCguBuilder := cgu.NewCguBuilder(TargetHubAPIClient,
					tsparams.PrePrepCguName, tsparams.IbuCguNamespace, 1).
					WithCluster(tsparams.TargetSnoClusterName).
					WithManagedPolicy(tsparams.PrePrepPolicyName).
					WithCanary(tsparams.TargetSnoClusterName)
				prePrepCguBuilder.Definition.Spec.Enable = ptr.To(true)

				prePrepCguBuilder, err = prePrepCguBuilder.Create()
				Expect(err).NotTo(HaveOccurred(), "Failed to create pre-prep CGU.")

				_, err = prePrepCguBuilder.WaitUntilComplete(10 * time.Minute)
				Expect(err).NotTo(HaveOccurred(), "Pre-prep CGU did not complete in time.")
			})

			By("Creating, enabling ibu prep CGU and waiting for CGU status to report completed", func() {
				prepCguBuilder := cgu.NewCguBuilder(TargetHubAPIClient,
					tsparams.PrepCguName, tsparams.IbuCguNamespace, 1).
					WithCluster(tsparams.TargetSnoClusterName).
					WithManagedPolicy(tsparams.PrepPolicyName).
					WithCanary(tsparams.TargetSnoClusterName)
				prepCguBuilder.Definition.Spec.Enable = ptr.To(true)

				prepCguBuilder, err = prepCguBuilder.Create()
				Expect(err).NotTo(HaveOccurred(), "Failed to create prep CGU.")

				_, err = prepCguBuilder.WaitUntilComplete(25 * time.Minute)
				Expect(err).NotTo(HaveOccurred(), "Prep CGU did not complete in time.")
			})

			// The upgrade CGU is only created here; its outcome is checked
			// after the fault has been injected further below.
			By("Creating, and enabling ibu upgrade CGU", func() {
				upgradeCguBuilder := cgu.NewCguBuilder(TargetHubAPIClient,
					tsparams.UpgradeCguName, tsparams.IbuCguNamespace, 1).
					WithCluster(tsparams.TargetSnoClusterName).
					WithManagedPolicy(tsparams.UpgradePolicyName).
					WithCanary(tsparams.TargetSnoClusterName)
				upgradeCguBuilder.Definition.Spec.Enable = ptr.To(true)

				_, err = upgradeCguBuilder.Create()
				Expect(err).NotTo(HaveOccurred(), "Failed to create upgrade CGU.")
			})

			By("Verifying auto rollback triggered upon upgrade failure", func() {
				By("Waiting for node rebooted into stateroot B and cluster become available",
					waitForRebootAndClusterAvailable)

				By("Verifying current booted stateroot name on target sno cluster node", func() {
					getDeploymentIndexCmd := "rpm-ostree status --json | jq '.deployments[0].osname'"
					getDesiredStaterootName, err := cluster.ExecCmdWithStdout(TargetSNOAPIClient, getDeploymentIndexCmd)
					Expect(err).NotTo(HaveOccurred(), "could not execute command: %s", err)

					// NOTE(review): seedImageVersion is declared as
					// *seedimage.SeedImageContent while the value compared against it
					// is a string, and every piece produced by the split is asserted -
					// confirm this assertion can succeed as written.
					for _, stdout := range getDesiredStaterootName {
						for _, trimStaterootName := range strings.Split(stdout, "rhcos_") {
							bootedStaterootNameRes := strings.ReplaceAll(trimStaterootName, "_", "-")
							Expect(bootedStaterootNameRes).To(Equal(seedImageVersion),
								"Target cluster node booted into stateroot B")
						}
					}
				})

				By("Simulate a fault to make upgrade fail, waiting LCA init-monitor timeout, and check upgrade cgu status",
					func() {
						faultInjectCmd := "echo a > /etc/mco/proxy.env"
						faultInjectCmdRes, err := cluster.ExecCmdWithStdout(TargetSNOAPIClient, faultInjectCmd)
						Expect(err).NotTo(HaveOccurred(), "could not execute command: %s", faultInjectCmdRes)

						By("Waiting for LCA init-monitor timeout to trigger auto rollback")
						lcaInitMonitorTimeout = 5 * time.Minute

						By("Verifying upgrade cgu status on target hub cluster")
						upgradeCguStatusCheck, err := cgu.Pull(TargetHubAPIClient,
							tsparams.UpgradeCguName,
							tsparams.IbuCguNamespace)
						Expect(err).NotTo(HaveOccurred(), "Failed to pull upgrade cgu status")

						// An error is the expected outcome: the CGU must not reach
						// the completed state within the init-monitor timeout.
						_, err = upgradeCguStatusCheck.WaitUntilComplete(lcaInitMonitorTimeout)
						Expect(err).To(HaveOccurred(), "Upgrade CGU expected to report 'InProgress' state.")
					})

				By("Waiting for node rebooted into stateroot A and cluster become available",
					waitForRebootAndClusterAvailable)
			})

			// NOTE(review): PostUpgradeClusterInfo is not populated anywhere in this
			// file - confirm it is saved elsewhere before this comparison runs.
			By("Validating target sno cluster version after auto rollback", func() {
				Expect(cnfclusterinfo.PreUpgradeClusterInfo.Version).
					To(Equal(cnfclusterinfo.PostUpgradeClusterInfo.Version),
						"Target sno cluster reports old cluster version")
			})
		})
	})

0 comments on commit 92c550b

Please sign in to comment.