util/log: fix a bug in closeFileLocked() and strengthen the API

Prior to this patch, `closeFileLocked()` would inconditionally reset the redirection of internal stderr writes back to the original stderr stream (`OrigStderr`). This was incorrect, because of the existence of secondary loggers. To understand why, consider a setup where client code wants internal stderr writes to be redirected to a logger B, while a non-stderr A logger is active. Then the following sequence can occur: - A creates a file. - B creates a file, and redirects internal stderr writes to it. At this point, the behavior is as expected: stderr writes are redirected. - now the code causes A.closeFileLocked() to be called. This causes the call on A to cancel the effects of the redirects on logger B - that is, unwanted "action at a distance". To correct this, this patch only restores the stderr redirect if the logger was also owning the redirection to start with. Furthermore, the entire idea of "redirecting stderr writes to a file" would be a little confusing if two or more loggers had the boolean set to do it. In that case, two loggers could non-deterministically compete to capture stderr when rotating their files alternatively. In order to clarify the API and prevent client code from accidentally relying on this poorly-defined behavior, the patch adds a new method `takeOverInternalStderr()` which enforces the invariant that at most one logger may take over stderr. Finally, for context, be it known that none of the irregularities fixed by this patch are currently impacting the remainder of the CockroachDB code base: `closeFileLocked()` is only called upon `TestLogScope.Close()`, and the previous patches in this sequence have ensured that no stderr is being redirected in the context of a TestLogScope. This patch thus only exists for the benefit of future uses of the API, or readers of the code. Release note: None
cockroachdb · Aug 4, 2020 · 6aea626 · 6aea626
1 parent 68a93aa
commit 6aea626
Show file tree

Hide file tree

Showing 5 changed files with 206 additions and 37 deletions.
diff --git a/pkg/util/log/clog.go b/pkg/util/log/clog.go
@@ -94,20 +94,6 @@ type loggerT struct {
 	// stderr sink.
 	stderrThreshold Severity
 
-	// redirectInternalStderrWrites, when set, causes this logger to
-	// capture writes to system-wide file descriptor 2 (the standard
-	// error stream) and os.Stderr and redirect them to this logger's
-	// output file.
-	// Users of the logging package should ensure that at most one
-	// logger has this flag set to redirect the system-wide stderr.
-	//
-	// Note that this mechanism redirects file descriptor 2, and does
-	// not only assign a different *os.File reference to
-	// os.Stderr. This is because the Go runtime hardcodes stderr writes
-	// as writes to file descriptor 2 and disregards the value of
-	// os.Stderr entirely.
-	redirectInternalStderrWrites bool
-
 	// whether or not to include redaction markers.
 	// This is atomic because tests using TestLogScope might
 	// override this asynchronously with log calls.
@@ -132,6 +118,29 @@ type loggerT struct {
 
 		// syncWrites if true calls file.Flush and file.Sync on every log write.
 		syncWrites bool
+
+		// redirectInternalStderrWrites, when set, causes this logger to
+		// capture writes to system-wide file descriptor 2 (the standard
+		// error stream) and os.Stderr and redirect them to this logger's
+		// output file.
+		// This is managed by the takeOverInternalStderr() method.
+		//
+		// Note that this mechanism redirects file descriptor 2, and does
+		// not only assign a different *os.File reference to
+		// os.Stderr. This is because the Go runtime hardcodes stderr writes
+		// as writes to file descriptor 2 and disregards the value of
+		// os.Stderr entirely.
+		//
+		// There can be at most one logger with this boolean set. This
+		// constraint is enforced by takeOverInternalStderr().
+		redirectInternalStderrWrites bool
+
+		// currentlyOwnsInternalStderr determines whether a logger
+		// _currently_ has taken over fd 2. This may be false while
+		// redirectInternalStderrWrites above is true, when the logger has
+		// not yet opened its output file, or is in the process of
+		// switching over from one directory to the next.
+		currentlyOwnsInternalStderr bool
 	}
 }
 
@@ -160,7 +169,7 @@ func init() {
 	mainLog.fileThreshold = Severity_INFO
 	// Don't capture stderr output until
 	// SetupRedactionAndStderrRedirects() has been called.
-	mainLog.redirectInternalStderrWrites = false
+	mainLog.mu.redirectInternalStderrWrites = false
 }
 
 // FatalChan is closed when Fatal is called. This can be used to make
@@ -364,6 +373,15 @@ func (l *loggerT) outputToStderr(entry Entry, stacks []byte) error {
 	return err
 }
 
+// hasInternalStderrOwnership() returns true iff this logger is
+// currently owning writes to stderr via internal writes to os.Stderr
+// and fd 2.
+func (l *loggerT) hasInternalStderrOwnership() bool {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	return l.mu.redirectInternalStderrWrites
+}
+
 const fatalErrorPostamble = `
 
 ****************************************************************************

diff --git a/pkg/util/log/flags.go b/pkg/util/log/flags.go
@@ -95,21 +95,39 @@ func SetupRedactionAndStderrRedirects() (cleanupForTestingOnly func(), err error
 		secLogger := NewSecondaryLogger(ctx, &mainLog.logDir, "stderr",
 			true /* enableGC */, true /* forceSyncWrites */, false /* enableMsgCount */)
 
-		// This logger will capture direct stderr writes.
-		secLogger.logger.redirectInternalStderrWrites = true
 		// Stderr capture produces unsafe strings. This logger
 		// thus generally produces non-redactable entries.
 		secLogger.logger.redactableLogs.Set(false)
 
 		// Force a log entry. This does two things: it forces
-		// the creation of a file and the redirection of fd 2 / os.Stderr.
-		// It also introduces a timestamp marker.
+		// the creation of a file and introduces a timestamp marker
+		// for any writes to the file performed after this point.
 		secLogger.Logf(ctx, "stderr capture started")
+
+		// Now tell this logger to capture internal stderr writes.
+		if err := secLogger.logger.takeOverInternalStderr(); err != nil {
+			// Oof, it turns out we can't use this logger after all. Give up
+			// on it.
+			cancel()
+			secLogger.Close()
+			return nil, err
+		}
+
+		// Now inform the other functions using stderrLog that we
+		// have a new logger for it.
 		prevStderrLogger := stderrLog
 		stderrLog = &secLogger.logger
 
 		// The cleanup fn is for use in tests.
 		cleanup := func() {
+			// Relinquish the stderr redirect.
+			if err := secLogger.logger.relinquishInternalStderr(); err != nil {
+				// This should not fail. If it does, some caller messed up by
+				// switching over stderr redirection to a different logger
+				// without our involvement. That's invalid API usage.
+				panic(err)
+			}
+
 			// Restore the apparent stderr logger used by Shout() and tests.
 			stderrLog = prevStderrLogger
 

diff --git a/pkg/util/log/secondary_log.go b/pkg/util/log/secondary_log.go
@@ -72,10 +72,6 @@ func NewSecondaryLogger(
 			stderrThreshold: mainLog.stderrThreshold.get(),
 			logCounter:      EntryCounter{EnableMsgCount: enableMsgCount},
 			gcNotify:        make(chan struct{}, 1),
-			// Only one logger can have redirectInternalStderrWrites set to
-			// true; this is going to be either mainLog or stderrLog
-			// depending on configuration.
-			redirectInternalStderrWrites: false,
 		},
 		forceSyncWrites: forceSyncWrites,
 	}

diff --git a/pkg/util/log/stderr_redirect.go b/pkg/util/log/stderr_redirect.go
@@ -11,12 +11,20 @@
 package log
 
 import (
+	"fmt"
 	"os"
+	"runtime/debug"
 
 	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
+	"github.com/cockroachdb/errors"
 )
 
-// OrigStderr points to the original stderr stream.
+// OrigStderr points to the original stderr stream when the process
+// started.
+// Note that it is generally not sound to restore os.Stderr
+// and fd 2 from here as long as a stack of loggers (especially test
+// log scopes) are active, as the loggers keep track of what they are
+// redirecting themselves in a stack structure.
 var OrigStderr = func() *os.File {
 	fd, err := dupFD(os.Stderr.Fd())
 	if err != nil {
@@ -44,11 +52,116 @@ func hijackStderr(f *os.File) error {
 	return redirectStderr(f)
 }
 
-// restoreStderr cancels the effect of hijackStderr().
-func restoreStderr() error {
-	return redirectStderr(OrigStderr)
-}
-
 // osStderrMu ensures that concurrent redirects of stderr don't
 // overwrite each other.
 var osStderrMu syncutil.Mutex
+
+// takeOverInternalStderr tells the given logger that it is to take over
+// direct writes to fd 2 by e.g. the Go runtime, or direct writes to
+// os.Stderr.
+//
+// This also enforces that at most one logger can redirect stderr in
+// this way. It also returns an error if stderr has already been
+// taken over in this way. It also errors if the target logger has no
+// valid output directory and no output file has been created (or
+// could be created).
+func (l *loggerT) takeOverInternalStderr() error {
+	takeOverStderrMu.Lock()
+	defer takeOverStderrMu.Unlock()
+
+	if anyLoggerHasInternalStderrOwnership() {
+		return errors.AssertionFailedf(
+			"can't take over stderr; first takeover:\n%s",
+			takeOverStderrMu.previousStderrTakeover)
+	}
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	// Ensure there's a file to work with.
+	if err := l.ensureFileLocked(); err != nil {
+		return errors.NewAssertionErrorWithWrappedErrf(err, "can't take over stderr without a file")
+	}
+	// Ensure there's a _real_ file to work with.
+	sb, ok := l.mu.file.(*syncBuffer)
+	if !ok {
+		return errors.AssertionFailedf("can't take over stderr with a non-file writer")
+	}
+	// Take over stderr with this writer.
+	if err := hijackStderr(sb.file); err != nil {
+		return errors.Wrap(err, "unable to take over stderr")
+	}
+	// Mark the stderr as taken over.
+	l.mu.currentlyOwnsInternalStderr = true
+	// Request internal stderr redirection for future file rotations.
+	l.mu.redirectInternalStderrWrites = true
+
+	// Success: remember who called us, in case the next
+	// caller comes along with the wrong call sequence.
+	takeOverStderrMu.previousStderrTakeover = string(debug.Stack())
+	return nil
+}
+
+// relinquishInternalStderr relinquishes a takeover by
+// takeOverInternalStderr(). It returns an error if the
+// logger did not take over internal stderr writes already.
+func (l *loggerT) relinquishInternalStderr() error {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	if !l.mu.redirectInternalStderrWrites {
+		const basemsg = "can't relinquish stderr writes - this logger is not owner%s"
+		// Try to help the caller a bit.
+		takeOverStderrMu.Lock()
+		defer takeOverStderrMu.Unlock()
+		var extra string
+		if anyLoggerHasInternalStderrOwnership() {
+			extra = fmt.Sprintf("; previous take over:\n%s", takeOverStderrMu.previousStderrTakeover)
+		}
+		return errors.AssertionFailedf(basemsg, extra)
+	}
+
+	// If stderr actually redirected, restore it.
+	if l.mu.currentlyOwnsInternalStderr {
+		if err := hijackStderr(OrigStderr); err != nil {
+			return errors.Wrap(err, "unable to restore internal stderr")
+		}
+	}
+
+	// Remove the ownership.
+	l.mu.currentlyOwnsInternalStderr = false
+	l.mu.redirectInternalStderrWrites = false
+	return nil
+}
+
+// anyLoggerHasInternalStderrOwnership returns true iff any of the
+// loggers currently has redirectInternalStderrWrites set.
+//
+// Used by takeOverInternalStderr() to enforce its invariant.
+func anyLoggerHasInternalStderrOwnership() bool {
+	mainLog.mu.Lock()
+	mainLogHasOwnership := mainLog.mu.redirectInternalStderrWrites
+	mainLog.mu.Unlock()
+	if mainLogHasOwnership {
+		return true
+	}
+	secondaryLogRegistry.mu.Lock()
+	defer secondaryLogRegistry.mu.Unlock()
+	for _, secL := range secondaryLogRegistry.mu.loggers {
+		secL.logger.mu.Lock()
+		hasOwnership := secL.logger.mu.redirectInternalStderrWrites
+		secL.logger.mu.Unlock()
+		if hasOwnership {
+			return true
+		}
+	}
+	return false
+}
+
+var takeOverStderrMu struct {
+	syncutil.Mutex
+
+	// previousStderrTakeover is the stack trace of the previous call to
+	// takeOverStderrInternal(). This can be used to troubleshoot
+	// invalid call sequences.
+	previousStderrTakeover string
+}
diff --git a/pkg/util/log/sync_buffer.go b/pkg/util/log/sync_buffer.go
@@ -108,15 +108,39 @@ func (l *loggerT) createFileLocked() error {
 // rotation to new files, in which case a new file is immediately
 // opened. See rotateFileLocked() below.
 func (l *loggerT) closeFileLocked() error {
-	if l.mu.file != nil {
-		if sb, ok := l.mu.file.(*syncBuffer); ok {
-			if err := sb.file.Close(); err != nil {
-				return err
-			}
+	if l.mu.file == nil {
+		return nil
+	}
+
+	// First disconnect stderr, if it was connected. We do this before
+	// closing the file to ensure no direct stderr writes are lost.
+	if err := l.maybeRelinquishInternalStderrLocked(); err != nil {
+		return err
+	}
+
+	if sb, ok := l.mu.file.(*syncBuffer); ok {
+		if err := sb.file.Close(); err != nil {
+			return err
 		}
-		l.mu.file = nil
 	}
-	return restoreStderr()
+	l.mu.file = nil
+
+	return nil
+}
+
+// maybeRelinquishInternalStderrLocked restores the internal fd 2 and
+// os.Stderr if this logger had previously taken ownership of it.
+//
+// This is used by the TestLogScope only when the scope is closed.
+func (l *loggerT) maybeRelinquishInternalStderrLocked() error {
+	if !l.mu.currentlyOwnsInternalStderr {
+		return nil
+	}
+	if err := hijackStderr(OrigStderr); err != nil {
+		return err
+	}
+	l.mu.currentlyOwnsInternalStderr = false
+	return nil
 }
 
 // rotateFileLocked closes the syncBuffer's file and starts a new one.
@@ -175,7 +199,7 @@ func (sb *syncBuffer) rotateFileLocked(now time.Time) (err error) {
 
 	// Switch over internal stderr writes, if currently captured, to the
 	// new file.
-	if sb.logger.redirectInternalStderrWrites {
+	if sb.logger.mu.redirectInternalStderrWrites {
 		if err := hijackStderr(newFile); err != nil {
 			return err
 		}