From fc0bbacca05554b6d266a23bc4ef5233c4b32d82 Mon Sep 17 00:00:00 2001 From: davidby-influx Date: Thu, 29 Oct 2020 15:01:48 -0700 Subject: [PATCH] fix(tsm1): "snapshot in progress" error during backup When an InfluxDB database is very busy writing new points, the backup process can fail because it cannot write a new snapshot. The error is: operation timed out with error: create snapshot: snapshot in progress. This happens because InfluxDB takes a snapshot from the cache almost "continuously" due to the high number of points ingested. The fix for this was https://github.com/influxdata/influxdb/pull/16627 but it was for OSS only, and was not in the code path for backups in clusters. This fix adds a skipCacheOk flag to tsdb.Engine.CreateSnapshot(). A value of true allows the backup to proceed even if a cache snapshot cannot be taken. This flag is set to true in tsm1.Engine.Backup(), the OSS backup code path and in tsdb.Shard.CreateSnapshot(), the cluster backup code path. This flag is set to false in tsm1.Engine.Export(). https://github.com/influxdata/plutonium/issues/3227 (cherry picked from commit 23be20bf1bde42cb11cce9494e3ce97055755e9b) (cherry picked from commit 0b1ee04f9fb060d945453907bbf823182a986b9c) --- tsdb/engine.go | 2 +- tsdb/engine/tsm1/engine.go | 34 ++++++++++++++-------------------- tsdb/shard.go | 2 +- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/tsdb/engine.go b/tsdb/engine.go index 4f2d0a3511a..eb14c18cda8 100644 --- a/tsdb/engine.go +++ b/tsdb/engine.go @@ -41,7 +41,7 @@ type Engine interface { LoadMetadataIndex(shardID uint64, index Index) error - CreateSnapshot() (string, error) + CreateSnapshot(skipCacheOk bool) (string, error) Backup(w io.Writer, basePath string, since time.Time) error Export(w io.Writer, basePath string, start time.Time, end time.Time) error Restore(r io.Reader, basePath string) error diff --git a/tsdb/engine/tsm1/engine.go b/tsdb/engine/tsm1/engine.go index 4cd83039247..b5042574569 100644 --- 
a/tsdb/engine/tsm1/engine.go +++ b/tsdb/engine/tsm1/engine.go @@ -909,26 +909,16 @@ func (e *Engine) Free() error { // of the files in the archive. It will force a snapshot of the WAL first // then perform the backup with a read lock against the file store. This means // that new TSM files will not be able to be created in this shard while the -// backup is running. For shards that are still acively getting writes, this -// could cause the WAL to backup, increasing memory usage and evenutally rejecting writes. +// backup is running. For shards that are still actively getting writes, this +// could cause the WAL to backup, increasing memory usage and eventually rejecting writes. func (e *Engine) Backup(w io.Writer, basePath string, since time.Time) error { var err error var path string - for i := 0; i < 3; i++ { - path, err = e.CreateSnapshot() - if err != nil { - switch err { - case ErrSnapshotInProgress: - backoff := time.Duration(math.Pow(32, float64(i))) * time.Millisecond - time.Sleep(backoff) - default: - return err - } - } - } - if err == ErrSnapshotInProgress { - e.logger.Warn("Snapshotter busy: Backup proceeding without snapshot contents.") + path, err = e.CreateSnapshot(true) + if err != nil { + return err } + // Remove the temporary snapshot dir defer func() { if err := os.RemoveAll(path); err != nil { @@ -995,7 +985,7 @@ func (e *Engine) timeStampFilterTarFile(start, end time.Time) func(f os.FileInfo } func (e *Engine) Export(w io.Writer, basePath string, start time.Time, end time.Time) error { - path, err := e.CreateSnapshot() + path, err := e.CreateSnapshot(false) if err != nil { return err } @@ -1918,9 +1908,13 @@ func (e *Engine) WriteSnapshot() (err error) { } // CreateSnapshot will create a temp directory that holds -// temporary hardlinks to the underylyng shard files. -func (e *Engine) CreateSnapshot() (string, error) { - if err := e.WriteSnapshot(); err != nil { +// temporary hardlinks to the underlying shard files. 
+// skipCacheOk controls whether it is permissible to fail writing out +// in-memory cache data when a previous snapshot is in progress +func (e *Engine) CreateSnapshot(skipCacheOk bool) (string, error) { + if err := e.WriteSnapshot(); (err == ErrSnapshotInProgress) && skipCacheOk { + e.logger.Warn("Snapshotter busy: Backup or export proceeding without cache snapshot contents.") + } else if err != nil { return "", err } diff --git a/tsdb/shard.go b/tsdb/shard.go index 75294f410e1..39f96ff1d0e 100644 --- a/tsdb/shard.go +++ b/tsdb/shard.go @@ -1122,7 +1122,7 @@ func (s *Shard) CreateSnapshot() (string, error) { if err != nil { return "", err } - return engine.CreateSnapshot() + return engine.CreateSnapshot(true) } // ForEachMeasurementName iterates over each measurement in the shard.