From 2d471130c64d3e7e1b2249c6412d841b3f1f5525 Mon Sep 17 00:00:00 2001 From: Ishan Tyagi <42602577+ishan16696@users.noreply.github.com> Date: Mon, 13 Feb 2023 09:13:03 +0530 Subject: [PATCH] Enhances the decision to take full snapshot during startup to avoid missing of any full-snapshot. (#574) * Enhances the decision to take full snapshot during startup to avoid missing of any full-snapshot. * Added unit tests. * Update comments. * Address review comments. * Address review comments. * Change one condition Added some more unit tests. * Address review comments. * Address review comments. Fixed unit tests. * Address review comments. * Address review comments 2. --- pkg/miscellaneous/miscellaneous.go | 16 +- pkg/server/backuprestoreserver.go | 63 +++---- pkg/snapshot/snapshotter/snapshotter.go | 71 ++++++- pkg/snapshot/snapshotter/snapshotter_test.go | 184 +++++++++++++++++++ 4 files changed, 295 insertions(+), 39 deletions(-) diff --git a/pkg/miscellaneous/miscellaneous.go b/pkg/miscellaneous/miscellaneous.go index 039f2fb49..a3cb5c1b8 100644 --- a/pkg/miscellaneous/miscellaneous.go +++ b/pkg/miscellaneous/miscellaneous.go @@ -357,7 +357,6 @@ func IsMultiNode(logger *logrus.Entry) bool { } config := map[string]interface{}{} - err = yaml.Unmarshal([]byte(configYML), &config) if err := yaml.Unmarshal([]byte(configYML), &config); err != nil { return false } @@ -550,3 +549,18 @@ func IsPeerURLTLSEnabled() (bool, error) { return peerURL.Scheme == https, nil } + +// GetPrevScheduledSnapTime returns the previous schedule snapshot time. +// TODO: Previous full snapshot time should be calculated on basis of previous cron schedule of full snapshot. +func GetPrevScheduledSnapTime(nextSnapSchedule time.Time, timeWindow float64) time.Time { + return time.Date( + nextSnapSchedule.Year(), + nextSnapSchedule.Month(), + nextSnapSchedule.Day(), + nextSnapSchedule.Hour()-int(timeWindow), + nextSnapSchedule.Minute(), + nextSnapSchedule.Second(), + nextSnapSchedule.Nanosecond(), + nextSnapSchedule.Location(), + ) +} diff --git a/pkg/server/backuprestoreserver.go b/pkg/server/backuprestoreserver.go index 630323eb1..f23bc0db4 100644 --- a/pkg/server/backuprestoreserver.go +++ b/pkg/server/backuprestoreserver.go @@ -307,8 +307,8 @@ func (b *BackupRestoreServer) runServer(ctx context.Context, restoreOpts *brtype // for the case when backup-restore becomes leading sidecar. func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Context, handler *HTTPHandler, ssr *snapshotter.Snapshotter, ss brtypes.SnapStore, ssrStopCh chan struct{}, ackCh chan struct{}) { var ( - err error - initialDeltaSnapshotTaken bool + err error + initialFullSnapshotTaken bool ) for { @@ -353,16 +353,32 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex // the delta snapshot memory limit), after which a full snapshot // is taken and the regular snapshot schedule comes into effect. - // TODO: write code to find out if prev full snapshot is older than it is - // supposed to be, according to the given cron schedule, instead of the - // hard-coded "24 hours" full snapshot interval + fullSnapshotMaxTimeWindowInHours := ssr.GetFullSnapshotMaxTimeWindow(b.config.SnapshotterConfig.FullSnapshotSchedule) + initialFullSnapshotTaken = false + if ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotMaxTimeWindowInHours) { + // need to take a full snapshot here + var snapshot *brtypes.Snapshot + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta}).Set(0) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindFull}).Set(1) + if snapshot, err = ssr.TakeFullSnapshotAndResetTimer(false); err != nil { + metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: err.Error()}).Inc() + b.logger.Errorf("Failed to take substitute first full snapshot: %v", err) + continue + } + initialFullSnapshotTaken = true + if b.config.HealthConfig.SnapshotLeaseRenewalEnabled { + leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration) + defer cancel() + if err = heartbeat.FullSnapshotCaseLeaseUpdate(leaseUpdatectx, b.logger, snapshot, ssr.K8sClientset, b.config.HealthConfig.FullSnapshotLeaseName, b.config.HealthConfig.DeltaSnapshotLeaseName); err != nil { + b.logger.Warnf("Snapshot lease update failed : %v", err) + } + } + if b.backoffConfig.Start { + b.backoffConfig.ResetExponentialBackoff() + } + } - // Temporary fix for missing alternate full snapshots for Gardener shoots - // with hibernation schedule set: change value from 24 ot 23.5 to - // accommodate for slight pod spin-up delays on shoot wake-up - const recentFullSnapshotPeriodInHours = 23.5 - initialDeltaSnapshotTaken = false - if ssr.PrevFullSnapshot != nil && !ssr.PrevFullSnapshot.IsFinal && time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() <= recentFullSnapshotPeriodInHours { + if !initialFullSnapshotTaken { ssrStopped, err := ssr.CollectEventsSincePrevSnapshot(ssrStopCh) if ssrStopped { b.logger.Info("Snapshotter stopped.") @@ -375,7 +391,6 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex b.logger.Warnf("Failed to take first delta snapshot: snapshotter failed with error: %v", err) continue } - initialDeltaSnapshotTaken = true if b.config.HealthConfig.SnapshotLeaseRenewalEnabled { leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration) defer cancel() @@ -391,28 +406,6 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex } } - if !initialDeltaSnapshotTaken { - // need to take a full snapshot here - var snapshot *brtypes.Snapshot - metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta}).Set(0) - metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindFull}).Set(1) - if snapshot, err = ssr.TakeFullSnapshotAndResetTimer(false); err != nil { - metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: err.Error()}).Inc() - b.logger.Errorf("Failed to take substitute first full snapshot: %v", err) - continue - } - if b.config.HealthConfig.SnapshotLeaseRenewalEnabled { - leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration) - defer cancel() - if err = heartbeat.FullSnapshotCaseLeaseUpdate(leaseUpdatectx, b.logger, snapshot, ssr.K8sClientset, b.config.HealthConfig.FullSnapshotLeaseName, b.config.HealthConfig.DeltaSnapshotLeaseName); err != nil { - b.logger.Warnf("Snapshot lease update failed : %v", err) - } - } - if b.backoffConfig.Start { - b.backoffConfig.ResetExponentialBackoff() - } - } - // Set snapshotter state to Active ssr.SetSnapshotterActive() @@ -423,7 +416,7 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex // Start snapshotter b.logger.Infof("Starting snapshotter...") - startWithFullSnapshot := ssr.PrevFullSnapshot == nil || ssr.PrevFullSnapshot.IsFinal || !(time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() <= recentFullSnapshotPeriodInHours) + startWithFullSnapshot := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotMaxTimeWindowInHours) if err := ssr.Run(ssrStopCh, startWithFullSnapshot); err != nil { if etcdErr, ok := err.(*errors.EtcdError); ok { metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: etcdErr.Error()}).Inc() diff --git a/pkg/snapshot/snapshotter/snapshotter.go b/pkg/snapshot/snapshotter/snapshotter.go index 5343a21b0..ccfb9dece 100644 --- a/pkg/snapshot/snapshotter/snapshotter.go +++ b/pkg/snapshot/snapshotter/snapshotter.go @@ -22,6 +22,8 @@ import ( "fmt" "io" "path" + "strconv" + "strings" "sync" "time" @@ -42,6 +44,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +const ( + min = iota // Minutes field + hour // Hours field + dayOfMonth // Day of month field + month // Month field + dayOfWeek // Day of week field + defaultFullSnapMaxTimeWindow = 24 // default full snapshot time window in hours +) + var ( emptyStruct struct{} snapstoreHash = make(map[string]interface{}) @@ -104,7 +115,7 @@ func NewSnapshotter(logger *logrus.Entry, config *brtypes.SnapshotterConfig, sto sdl, err := cron.ParseStandard(config.FullSnapshotSchedule) if err != nil { // Ideally this should be validated before. - return nil, fmt.Errorf("invalid schedule provied %s : %v", config.FullSnapshotSchedule, err) + return nil, fmt.Errorf("invalid full snapshot schedule provided %s : %v", config.FullSnapshotSchedule, err) } var prevSnapshot *brtypes.Snapshot @@ -473,12 +484,12 @@ func (ssr *Snapshotter) TakeDeltaSnapshot() (*brtypes.Snapshot, error) { defer rc.Close() if err := ssr.store.Save(*snap, rc); err != nil { - timeTaken := time.Now().Sub(startTime).Seconds() + timeTaken := time.Since(startTime).Seconds() metrics.SnapshotDurationSeconds.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta, metrics.LabelSucceeded: metrics.ValueSucceededFalse}).Observe(timeTaken) ssr.logger.Errorf("Error saving delta snapshots. %v", err) return nil, err } - timeTaken := time.Now().Sub(startTime).Seconds() + timeTaken := time.Since(startTime).Seconds() metrics.SnapshotDurationSeconds.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta, metrics.LabelSucceeded: metrics.ValueSucceededTrue}).Observe(timeTaken) logrus.Infof("Total time to save delta snapshot: %f seconds.", timeTaken) ssr.prevSnapshot = snap @@ -740,3 +751,57 @@ func (ssr *Snapshotter) checkSnapstoreSecretUpdate() bool { snapstoreHash[ssr.snapstoreConfig.Provider] = newSnapstoreSecretHash return true } + +// IsFullSnapshotRequiredAtStartup checks whether to take a full snapshot or not during the startup of backup-restore. +func (ssr *Snapshotter) IsFullSnapshotRequiredAtStartup(timeWindow float64) bool { + if ssr.PrevFullSnapshot == nil || ssr.PrevFullSnapshot.IsFinal || time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() > timeWindow { + return true + } + + if !ssr.WasScheduledFullSnapshotMissed(timeWindow) { + return false + } + return ssr.IsNextFullSnapshotBeyondTimeWindow(timeWindow) +} + +// WasScheduledFullSnapshotMissed determines whether the preceding full-snapshot was missed or not. +func (ssr *Snapshotter) WasScheduledFullSnapshotMissed(timeWindow float64) bool { + now := time.Now() + nextSnapSchedule := ssr.schedule.Next(now) + + if miscellaneous.GetPrevScheduledSnapTime(nextSnapSchedule, timeWindow) == ssr.PrevFullSnapshot.CreatedOn { + ssr.logger.Info("previous full snapshot was taken at scheduled time, skipping the full snapshot at startup") + return false + } + return true +} + +// IsNextFullSnapshotBeyondTimeWindow determines whether the next scheduled full snapshot will exceed the given time window or not. +func (ssr *Snapshotter) IsNextFullSnapshotBeyondTimeWindow(timeWindow float64) bool { + now := time.Now() + nextSnapSchedule := ssr.schedule.Next(now) + timeLeftToTakeNextSnap := nextSnapSchedule.Sub(now) + + return timeLeftToTakeNextSnap.Hours()+time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() > timeWindow +} + +// GetFullSnapshotMaxTimeWindow returns the maximum time period in hours for which backup-restore must take atleast one full snapshot. +func (ssr *Snapshotter) GetFullSnapshotMaxTimeWindow(fullSnapScheduleSpec string) float64 { + // Split on whitespace. + schedule := strings.Fields(fullSnapScheduleSpec) + if len(schedule) < 5 { + return defaultFullSnapMaxTimeWindow + } + + if schedule[dayOfWeek] != "*" { + return defaultFullSnapMaxTimeWindow * 7 + } + + if schedule[dayOfMonth] == "*" && schedule[dayOfWeek] == "*" && strings.Contains(schedule[hour], "/") { + if timeWindow, err := strconv.ParseFloat(schedule[hour][strings.Index(schedule[hour], "/")+1:], 64); err == nil { + return timeWindow + } + } + + return defaultFullSnapMaxTimeWindow +} diff --git a/pkg/snapshot/snapshotter/snapshotter_test.go b/pkg/snapshot/snapshotter/snapshotter_test.go index 5545d9f8b..2c3aa771d 100644 --- a/pkg/snapshot/snapshotter/snapshotter_test.go +++ b/pkg/snapshot/snapshotter/snapshotter_test.go @@ -572,6 +572,190 @@ var _ = Describe("Snapshotter", func() { }) }) }) + + Describe("Scenarios to take full-snapshot during startup of backup-restore", func() { + var ( + ssr *Snapshotter + currentMin int + currentHour int + fullSnapshotTimeWindow float64 + ) + BeforeEach(func() { + fullSnapshotTimeWindow = 24 + currentHour = time.Now().Hour() + currentMin = time.Now().Minute() + snapstoreConfig = &brtypes.SnapstoreConfig{Container: path.Join(outputDir, "default.bkp")} + store, err = snapstore.GetSnapstore(snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + }) + Context("No previous snapshot was taken", func() { + It("should return true", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", (currentMin+1)%60, (currentHour+2)%24), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // No previous snapshot was taken + ssr.PrevFullSnapshot = nil + isFullSnapMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapMissed).Should(BeTrue()) + }) + }) + Context("If previous snapshot was final full snapshot", func() { + It("should return true", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", (currentMin+1)%60, (currentHour+2)%24), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // If previous snapshot was final full snapshot + ssr.PrevFullSnapshot = &brtypes.Snapshot{ + IsFinal: true, + } + isFullSnapMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapMissed).Should(BeTrue()) + }) + }) + Context("Previous full snapshot was taken more than 24hrs before", func() { + It("should return true", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", (currentMin+1)%60, (currentHour+2)%24), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // Previous full snapshot was taken 2 days before + ssr.PrevFullSnapshot = &brtypes.Snapshot{ + CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day()-2, currentHour, currentMin, 0, 0, time.Local), + } + isFullSnapMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapMissed).Should(BeTrue()) + }) + }) + + Context("Previous full snapshot was taken exactly at scheduled snapshot time, no FullSnapshot was missed", func() { + It("should return false", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", (currentMin+1)%60, (currentHour+2)%24), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // Previous full snapshot was taken 1 days before at exactly at scheduled time + ssr.PrevFullSnapshot = &brtypes.Snapshot{ + CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day()-1, (currentHour+2)%24, (currentMin+1)%60, 0, 0, time.Local), + } + + isFullSnapMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapMissed).Should(BeFalse()) + }) + }) + + Context("Previous snapshot was taken within 24hrs and next schedule full-snapshot will be taken within 24hs of time window", func() { + It("should return false", func() { + scheduleHour := (currentHour + 4) % 24 + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", currentMin, scheduleHour), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // Previous full snapshot was taken 4hrs 10 mins before startup of backup-restore + ssr.PrevFullSnapshot = &brtypes.Snapshot{ + CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day(), (currentHour-4)%24, (currentMin-10)%60, 0, 0, time.Local), + } + isFullSnapCanBeMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapCanBeMissed).Should(BeFalse()) + }) + }) + + Context("Previous snapshot was taken within 24hrs and next schedule full-snapshot likely to cross 24hs of time window", func() { + It("should return true", func() { + scheduleHour := (currentHour + 8) % 24 + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", currentMin, scheduleHour), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // Previous full snapshot was taken 18hrs(<24hrs) before startup of backup-restore + ssr.PrevFullSnapshot = &brtypes.Snapshot{ + CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day(), (currentHour-18)%24, (currentMin)%60, 0, 0, time.Local), + } + isFullSnapCanBeMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapCanBeMissed).Should(BeTrue()) + }) + }) + }) + + Describe("Scenarios to get maximum time window for full snapshot", func() { + var ( + ssr *Snapshotter + currentMin int + currentHour int + fullSnapshotTimeWindow float64 + ) + BeforeEach(func() { + fullSnapshotTimeWindow = 24 + currentHour = time.Now().Hour() + currentMin = time.Now().Minute() + snapstoreConfig = &brtypes.SnapstoreConfig{Container: path.Join(outputDir, "default.bkp")} + store, err = snapstore.GetSnapstore(snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + }) + Context("Full snapshot schedule for once a day", func() { + It("should return 24hours of timeWindow", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", currentMin, currentHour), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + timeWindow := ssr.GetFullSnapshotMaxTimeWindow(snapshotterConfig.FullSnapshotSchedule) + Expect(timeWindow).Should(Equal(fullSnapshotTimeWindow)) + }) + }) + + Context("Full snapshot schedule for once in a week", func() { + It("should return 24*7 hours of timeWindow", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * %d", currentMin, currentHour, time.Thursday), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + timeWindow := ssr.GetFullSnapshotMaxTimeWindow(snapshotterConfig.FullSnapshotSchedule) + Expect(timeWindow).Should(Equal(fullSnapshotTimeWindow * 7)) + }) + }) + + Context("Full snapshot schedule for every 4 hours", func() { + It("should return 4 hours of timeWindow", func() { + // every 4 hour + scheduleHour := 4 + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d */%d * * *", 0, scheduleHour), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + timeWindow := ssr.GetFullSnapshotMaxTimeWindow(snapshotterConfig.FullSnapshotSchedule) + Expect(timeWindow).Should(Equal(float64(scheduleHour))) + }) + }) + }) + }) // prepareExpectedSnapshotsList prepares the expected snapshotlist based on directory structure