Skip to content

Commit

Permalink
sqlstats: dont block flush due to sql activity job
Browse files Browse the repository at this point in the history
The sql stats flush worker signals the sql activity update job
on flush completion via an unbuffered channel. It currently
blocks on sending that signal so if that job is stuck
updating stats for some reason this also makes the flush
stuck. This commit ensures that the flush worker can continue
normally even if the sql activity update job is not ready to
receive its next signal. This ensures that the coordinator node
for the sql stats activity job can continue to collect sql stats
normally.

Epic: none
Fixes: #119751
  • Loading branch information
xinhaoz committed Mar 6, 2024
1 parent 593a556 commit 29df8ab
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 0 deletions.
38 changes: 38 additions & 0 deletions pkg/sql/sqlstats/persistedsqlstats/flush_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"math"
"regexp"
"strconv"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -1071,3 +1072,40 @@ func smallestStatsCountAcrossAllShards(

return numStmtStats
}

func TestSQLStatsFlushDoesntWaitForFlushSigReceiver(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

sqlStatsKnobs := sqlstats.CreateTestingKnobs()
var sqlStmtFlushCount, sqlTxnFlushCount atomic.Int32
sqlStatsKnobs.OnStmtStatsFlushFinished = func() {
sqlStmtFlushCount.Add(1)
}
sqlStatsKnobs.OnTxnStatsFlushFinished = func() {
sqlTxnFlushCount.Add(1)
}
tc := serverutils.StartCluster(t, 3 /* numNodes */, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
Knobs: base.TestingKnobs{
SQLStatsKnobs: sqlStatsKnobs,
},
},
})

ctx := context.Background()
defer tc.Stopper().Stop(ctx)

ss := tc.ApplicationLayer(0).SQLServer().(*sql.Server).GetSQLStatsProvider().(*persistedsqlstats.PersistedSQLStats)
flushDoneCh := make(chan struct{})
ss.SetFlushDoneSignalCh(flushDoneCh)

// It should not block on the flush signal receiver.
persistedsqlstats.SQLStatsFlushInterval.Override(ctx, &tc.Server(0).ClusterSettings().SV, 100*time.Millisecond)
testutils.SucceedsSoon(t, func() error {
if sqlStmtFlushCount.Load() < 5 || sqlTxnFlushCount.Load() < 5 {
return errors.New("flush count hasn't been reached yet")
}
return nil
})
}
5 changes: 5 additions & 0 deletions pkg/sql/sqlstats/persistedsqlstats/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,11 @@ func (s *PersistedSQLStats) startSQLStatsFlushLoop(ctx context.Context, stopper
return
case <-s.drain:
return
default:
// Don't block the flush loop if the sql activity update job is not
// ready to receive. We should at least continue to collect and flush
// stats for this node.
log.Warning(ctx, "sql-stats-worker: unable to signal flush completion")
}
}
}
Expand Down

0 comments on commit 29df8ab

Please sign in to comment.