Skip to content

Commit

Permalink
instancestorage: asynchronously pre-allocate instance IDs in sql_inst…
Browse files Browse the repository at this point in the history
…ances

Related to cockroachdb#85737.

Previously, when a SQL pod starts up, we will allocate an instance ID for it
synchronously. This can be a problem for multi-region setup because that would
now involve a read and write across all regions. This commit is part of the
work to make the system.sql_instances table REGIONAL BY ROW to reduce cold
start times. Here, we would asynchronously pre-allocate instance IDs, and the
startup process will then claim an ID for the SQL pod. Once the sql_instances
table is made REGIONAL BY ROW, we can update the logic to pre-allocate for
every region, and only perform a regional lookup when claiming an ID.

Epic: None

Release note (sql change): The `system.sql_instances` table now includes
pre-allocated ID entries, where all the fields except `id` will be NULL.
  • Loading branch information
jaylim-crl committed Nov 4, 2022
1 parent e1a8ed6 commit ed4742e
Show file tree
Hide file tree
Showing 12 changed files with 1,182 additions and 201 deletions.
2 changes: 2 additions & 0 deletions pkg/ccl/serverccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,10 @@ go_test(
"//pkg/server",
"//pkg/server/serverpb",
"//pkg/server/systemconfigwatcher/systemconfigwatchertest",
"//pkg/settings/cluster",
"//pkg/sql",
"//pkg/sql/distsql",
"//pkg/sql/sqlinstance/instancestorage",
"//pkg/sql/tests",
"//pkg/testutils",
"//pkg/testutils/serverutils",
Expand Down
45 changes: 45 additions & 0 deletions pkg/ccl/serverccl/server_sql_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,19 @@ import (
"io"
"strings"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl/licenseccl"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security"
"github.com/cockroachdb/cockroach/pkg/server/systemconfigwatcher/systemconfigwatchertest"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/distsql"
"github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
Expand Down Expand Up @@ -235,6 +239,47 @@ func TestTenantRowIDs(t *testing.T) {
require.Equal(t, numRows, rowCount)
}

// TestTenantInstanceIDReclaimLoop confirms that the sql_instances reclaim loop
// has been started.
func TestTenantInstanceIDReclaimLoop(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

settings := cluster.MakeTestingClusterSettings()
tc := serverutils.StartNewTestCluster(t, 3, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
Settings: settings,
// Don't use a default test tenant. We will explicitly create one.
DisableDefaultTestTenant: true,
},
})
defer tc.Stopper().Stop(ctx)

clusterSettings := tc.Server(0).ClusterSettings()
instancestorage.ReclaimLoopInterval.Override(ctx, &clusterSettings.SV, 250*time.Millisecond)
instancestorage.PreallocatedCount.Override(ctx, &clusterSettings.SV, 5)

_, db := serverutils.StartTenant(
t,
tc.Server(0),
base.TestTenantArgs{TenantID: serverutils.TestTenantID(), Settings: settings},
)
defer db.Close()
sqlDB := sqlutils.MakeSQLRunner(db)

var rowCount int64
testutils.SucceedsSoon(t, func() error {
sqlDB.QueryRow(t, `SELECT count(*) FROM system.sql_instances WHERE addr IS NULL`).Scan(&rowCount)
// We set PreallocatedCount to 5. When the tenant gets started, it drops
// to 4. Eventually this will be 5 if the reclaim loop runs.
if rowCount == 5 {
return nil
}
return fmt.Errorf("waiting for preallocated rows")
})
}

// TestNoInflightTracesVirtualTableOnTenant verifies that internal inflight traces table
// is correctly handled by tenants (which don't provide this functionality as of now).
func TestNoInflightTracesVirtualTableOnTenant(t *testing.T) {
Expand Down
13 changes: 10 additions & 3 deletions pkg/server/server_sql.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,14 +496,15 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) {
return nil, errors.AssertionFailedf("non-system codec used for SQL pod")
}

cfg.sqlInstanceStorage = instancestorage.NewStorage(cfg.db, codec, cfg.sqlLivenessProvider)
cfg.sqlInstanceStorage = instancestorage.NewStorage(
cfg.db, codec, cfg.sqlLivenessProvider, cfg.Settings)
cfg.sqlInstanceReader = instancestorage.NewReader(
cfg.sqlInstanceStorage,
cfg.sqlLivenessProvider.CachedReader(),
cfg.rangeFeedFactory,
codec, cfg.clock, cfg.stopper)

// In a multi-tenant environment, use the sqlInstanceProvider to resolve
// In a multi-tenant environment, use the sqlInstanceReader to resolve
// SQL pod addresses.
addressResolver := func(nodeID roachpb.NodeID) (net.Addr, error) {
info, err := cfg.sqlInstanceReader.GetInstance(cfg.rpcContext.MasterCtx, base.SQLInstanceID(nodeID))
Expand Down Expand Up @@ -1317,7 +1318,13 @@ func (s *SQLServer) preStart(
if err != nil {
return err
}
// Allocate our instance ID.
// Start instance ID reclaim loop.
if err := s.sqlInstanceStorage.RunInstanceIDReclaimLoop(
ctx, stopper, timeutil.DefaultTimeSource{}, session.Expiration,
); err != nil {
return err
}
// Acquire our instance ID.
instanceID, err := s.sqlInstanceStorage.CreateInstance(
ctx, session.ID(), session.Expiration(), s.cfg.AdvertiseAddr, s.distSQLServer.Locality)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ SELECT * FROM t

# Check sql instance locality in the secondary tenant.
query IT
SELECT id, locality FROM system.sql_instances
SELECT id, locality FROM system.sql_instances WHERE locality IS NOT NULL
----
1 {"Tiers": "region=test"}
2 {"Tiers": "region=test1"}
Expand Down
9 changes: 8 additions & 1 deletion pkg/sql/sqlinstance/instancestorage/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ go_library(
"//pkg/kv/kvclient/rangefeed",
"//pkg/multitenant",
"//pkg/roachpb",
"//pkg/settings",
"//pkg/settings/cluster",
"//pkg/sql/catalog",
"//pkg/sql/catalog/descpb",
"//pkg/sql/catalog/systemschema",
Expand All @@ -34,6 +36,8 @@ go_library(
"//pkg/util/log",
"//pkg/util/stop",
"//pkg/util/syncutil",
"//pkg/util/syncutil/singleflight",
"//pkg/util/timeutil",
"@com_github_cockroachdb_errors//:errors",
],
)
Expand All @@ -42,19 +46,22 @@ go_test(
name = "instancestorage_test",
srcs = [
"instancereader_test.go",
"instancestorage_internal_test.go",
"instancestorage_test.go",
"main_test.go",
],
args = ["-test.timeout=295s"],
embed = [":instancestorage"],
deps = [
":instancestorage",
"//pkg/base",
"//pkg/keys",
"//pkg/kv",
"//pkg/kv/kvclient/rangefeed",
"//pkg/roachpb",
"//pkg/security/securityassets",
"//pkg/security/securitytest",
"//pkg/server",
"//pkg/settings/cluster",
"//pkg/sql/catalog/descpb",
"//pkg/sql/catalog/systemschema",
"//pkg/sql/sqlinstance",
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/sqlinstance/instancestorage/instancereader.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,10 @@ func (r *Reader) getAllLiveInstances(ctx context.Context) ([]instancerow, error)
{
truncated := rows[:0]
for _, row := range rows {
// Skip instances which are preallocated.
if row.isAvailable() {
continue
}
isAlive, err := r.slReader.IsAlive(ctx, row.sessionID)
if err != nil {
return nil, err
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/sqlinstance/instancestorage/instancereader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestReader(t *testing.T) {
tDB.Exec(t, schema)
tableID := getTableID(t, tDB, dbName, "sql_instances")
slStorage := slstorage.NewFakeStorage()
storage := instancestorage.NewTestingStorage(s.DB(), keys.SystemSQLCodec, tableID, slStorage)
storage := instancestorage.NewTestingStorage(s.DB(), keys.SystemSQLCodec, tableID, slStorage, s.ClusterSettings())
reader := instancestorage.NewTestingReader(storage, slStorage, s.RangeFeedFactory().(*rangefeed.Factory), keys.SystemSQLCodec, tableID, s.Clock(), s.Stopper())
return storage, slStorage, s.Clock(), reader
}
Expand Down
Loading

0 comments on commit ed4742e

Please sign in to comment.