Merge #93268
93268: statusccl: temporarily serve /_status/nodes to tenants r=matthewtodd a=matthewtodd

Fixes #92065
Part of #89949

Our current UI architecture has many of the SQL Activity pages mapping node or SQL instance IDs to regions at page load time, using the response from this `/_status/nodes` endpoint to get the information they need.

This approach is problematic for ephemeral serverless tenants and will need a broader reimagining. We will probably end up storing locality information directly in many of the sqlstats tables, which will likely require something like sending instance localities along in the distsql tracing. An initial exploration in this direction happened in #92259.

In the meantime, as a stop-gap measure, we implement a reduced version of this `/_status/nodes` endpoint for tenants, keeping the SQL Activity pages somewhat working for multiregion tenants.
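For illustration only (not part of this change), here is a minimal Go sketch of the node-ID-to-region lookup described above, using just the fields the reduced endpoint populates (`Desc.NodeID` and `Desc.Locality`); the package and helper names are hypothetical:

```go
package uiregions // hypothetical example package

import (
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// buildRegionMap mirrors what the SQL Activity pages do at page load time:
// derive a node-ID -> region mapping from the /_status/nodes response.
func buildRegionMap(resp *serverpb.NodesResponse) map[roachpb.NodeID]string {
	regions := make(map[roachpb.NodeID]string)
	for _, node := range resp.Nodes {
		// The reduced tenant implementation only fills in NodeID and Locality.
		for _, tier := range node.Desc.Locality.Tiers {
			if tier.Key == "region" {
				regions[node.Desc.NodeID] = tier.Value
			}
		}
	}
	return regions
}
```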

Release note: None

Co-authored-by: Matthew Todd <[email protected]>
craig[bot] and matthewtodd committed Dec 9, 2022
2 parents e5de991 + 3750ea6 commit 4a641ac
Showing 6 changed files with 120 additions and 11 deletions.
51 changes: 50 additions & 1 deletion pkg/ccl/serverccl/statusccl/tenant_status_test.go
@@ -60,7 +60,21 @@ func TestTenantStatusAPI(t *testing.T) {
StoreDisableCoalesceAdjacent: true,
}

testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs)
tenantLocalities := []roachpb.Locality{
{Tiers: []roachpb.Tier{{Key: "region", Value: "gcp-us-west1"}}},
{Tiers: []roachpb.Tier{{Key: "region", Value: "gcp-us-central1"}}},
{Tiers: []roachpb.Tier{{Key: "region", Value: "gcp-us-east1"}}},
}

testHelper := serverccl.NewTestTenantHelperWithTenantArgs(
t,
3, /* tenantClusterSize */
knobs,
func(tenantIdx int, tenantArgs *base.TestTenantArgs) {
tenantArgs.Locality = tenantLocalities[tenantIdx]
},
)

defer testHelper.Cleanup(ctx, t)

t.Run("reset_sql_stats", func(t *testing.T) {
@@ -114,6 +128,10 @@ func TestTenantStatusAPI(t *testing.T) {
t.Run("tenant_logs", func(t *testing.T) {
testTenantLogs(ctx, t, testHelper)
})

t.Run("tenant_nodes", func(t *testing.T) {
testTenantNodes(ctx, t, testHelper, tenantLocalities)
})
}

func testTenantLogs(ctx context.Context, t *testing.T, helper serverccl.TenantTestHelper) {
@@ -1239,6 +1257,37 @@ func testTenantAuthOnStatements(
require.NoError(t, err)
}

func testTenantNodes(
ctx context.Context,
t *testing.T,
helper serverccl.TenantTestHelper,
tenantLocalities []roachpb.Locality,
) {
// TODO(todd): Get this test working over HTTP, #93267.
//
// When I first tried, I ran into errors about the SQL user
// `authentic_user_noadmin` already existing, but at first glance it seemed
// that the sync.Once mechanism used in its setup had already accounted
// for that scenario; that we're properly passing by reference all the way
// through; and that these tests aren't being run in parallel.
tenantA := helper.TestCluster().TenantStatusSrv(0)
resp, err := tenantA.Nodes(ctx, &serverpb.NodesRequest{})
require.NoError(t, err)

expected := map[roachpb.NodeID]roachpb.Locality{
1: tenantLocalities[0],
2: tenantLocalities[1],
3: tenantLocalities[2],
}

actual := make(map[roachpb.NodeID]roachpb.Locality)
for _, node := range resp.Nodes {
actual[node.Desc.NodeID] = node.Desc.Locality
}

require.Equal(t, expected, actual)
}

// assertStartKeyInRange compares the pretty printed startKey with the provided
// tenantPrefix key, ensuring that the startKey starts with the tenantPrefix.
func assertStartKeyInRange(t *testing.T, startKey string, tenantPrefix roachpb.Key) {
33 changes: 23 additions & 10 deletions pkg/ccl/serverccl/tenant_test_utils.go
@@ -93,17 +93,11 @@ type TestTenant interface
var _ TestTenant = &testTenant{}

func newTestTenant(
t *testing.T,
server serverutils.TestServerInterface,
tenantID roachpb.TenantID,
knobs base.TestingKnobs,
t *testing.T, server serverutils.TestServerInterface, args base.TestTenantArgs,
) TestTenant {
t.Helper()

tenantParams := tests.CreateTestTenantParams(tenantID)
tenantParams.TestingKnobs = knobs

tenant, tenantConn := serverutils.StartTenant(t, server, tenantParams)
tenant, tenantConn := serverutils.StartTenant(t, server, args)
sqlDB := sqlutils.MakeSQLRunner(tenantConn)
status := tenant.StatusServer().(serverpb.SQLStatusServer)
sqlStats := tenant.PGServer().(*pgwire.Server).SQLServer.
@@ -151,6 +145,20 @@ func NewTestTenantHelper(
) TenantTestHelper {
t.Helper()

return NewTestTenantHelperWithTenantArgs(t, tenantClusterSize, knobs, func(int, *base.TestTenantArgs) {})
}

// NewTestTenantHelperWithTenantArgs constructs a TenantTestHelper instance,
// offering the caller the opportunity to modify the arguments passed when
// starting each tenant.
func NewTestTenantHelperWithTenantArgs(
t *testing.T,
tenantClusterSize int,
knobs base.TestingKnobs,
customizeTenantArgs func(tenantIdx int, tenantArgs *base.TestTenantArgs),
) TenantTestHelper {
t.Helper()

params, _ := tests.CreateTestServerParams()
params.Knobs = knobs
// We're running tenant tests, no need for a default tenant.
@@ -168,6 +176,7 @@
tenantClusterSize,
security.EmbeddedTenantIDs()[0],
knobs,
customizeTenantArgs,
),
// Spin up a small tenant cluster under a different tenant ID to test
// tenant isolation.
@@ -177,6 +186,7 @@
1, /* tenantClusterSize */
security.EmbeddedTenantIDs()[1],
knobs,
func(int, *base.TestTenantArgs) {},
),
}
}
@@ -224,13 +234,16 @@ func newTenantClusterHelper(
tenantClusterSize int,
tenantID uint64,
knobs base.TestingKnobs,
customizeTenantArgs func(tenantIdx int, tenantArgs *base.TestTenantArgs),
) TenantClusterHelper {
t.Helper()

var cluster tenantCluster = make([]TestTenant, tenantClusterSize)
for i := 0; i < tenantClusterSize; i++ {
cluster[i] =
newTestTenant(t, server, roachpb.MustMakeTenantID(tenantID), knobs)
args := tests.CreateTestTenantParams(roachpb.MustMakeTenantID(tenantID))
args.TestingKnobs = knobs
customizeTenantArgs(i, &args)
cluster[i] = newTestTenant(t, server, args)
}

return cluster
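As a usage note, here is a condensed sketch of driving the new hook from a test (the test name, cluster size, and region values are illustrative; it mirrors the test change above):

```go
package serverccl_test // hypothetical example package

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/ccl/serverccl"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

func TestTenantLocalitiesExample(t *testing.T) {
	ctx := context.Background()
	regions := []string{"gcp-us-east1", "gcp-us-west1"}

	// Start a tenant cluster in which each SQL pod gets its own region,
	// tearing it down when the test ends.
	testHelper := serverccl.NewTestTenantHelperWithTenantArgs(
		t,
		len(regions), /* tenantClusterSize */
		base.TestingKnobs{},
		func(tenantIdx int, tenantArgs *base.TestTenantArgs) {
			tenantArgs.Locality = roachpb.Locality{
				Tiers: []roachpb.Tier{{Key: "region", Value: regions[tenantIdx]}},
			}
		},
	)
	defer testHelper.Cleanup(ctx, t)

	// ... exercise locality-aware status endpoints here ...
}
```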
1 change: 1 addition & 0 deletions pkg/server/serverpb/status.go
@@ -39,6 +39,7 @@ type SQLStatusServer interface {
UserSQLRoles(context.Context, *UserSQLRolesRequest) (*UserSQLRolesResponse, error)
TxnIDResolution(context.Context, *TxnIDResolutionRequest) (*TxnIDResolutionResponse, error)
TransactionContentionEvents(context.Context, *TransactionContentionEventsRequest) (*TransactionContentionEventsResponse, error)
Nodes(context.Context, *NodesRequest) (*NodesResponse, error)
NodesList(context.Context, *NodesListRequest) (*NodesListResponse, error)
ListExecutionInsights(context.Context, *ListExecutionInsightsRequest) (*ListExecutionInsightsResponse, error)
LogFilesList(context.Context, *LogFilesListRequest) (*LogFilesListResponse, error)
38 changes: 38 additions & 0 deletions pkg/server/status.go
@@ -1491,6 +1491,44 @@ func (s *statusServer) NodesList(
return s.serverIterator.nodesList(ctx)
}

// Nodes returns a limited subset of the full NodesResponse.
//
// Its existence is a stop-gap measure to support running much of our UI code
// on multiregion tenant clusters. Longer-term, we will need to reconsider
// the overall architecture (e.g. store node ids in sqlstats, map node id ->
// region at UI display time) because sql instances are far more ephemeral
// than cluster nodes, and this Nodes endpoint is only able to answer for
// currently-live sql instances. What I think we want is to store locality
// information directly in the sqlstats tables, sourced from some new tracing
// in the distsql flow.
func (s *statusServer) Nodes(
ctx context.Context, req *serverpb.NodesRequest,
) (*serverpb.NodesResponse, error) {
ctx = propagateGatewayMetadata(ctx)
ctx = s.AnnotateCtx(ctx)

if err := s.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
// NB: not using serverError() here since the priv checker
// already returns a proper gRPC error status.
return nil, err
}

instances, err := s.sqlServer.sqlInstanceReader.GetAllInstances(ctx)
if err != nil {
return nil, err
}
var resp serverpb.NodesResponse
for _, instance := range instances {
resp.Nodes = append(resp.Nodes, statuspb.NodeStatus{
Desc: roachpb.NodeDescriptor{
NodeID: roachpb.NodeID(instance.InstanceID),
Locality: instance.Locality,
},
})
}
return &resp, err
}

// Nodes returns all node statuses.
//
// Do not use this method inside the server code! Use
5 changes: 5 additions & 0 deletions pkg/ui/workspaces/cluster-ui/src/util/proto.spec.ts
@@ -66,5 +66,10 @@ describe("Proto utils", () => {
statusWithRolledMetrics.metrics,
);
});

it("does not explode when node fields are missing", () => {
const emptyNodeStatus: Partial<INodeStatus> = {};
assert.deepEqual(rollupStoreMetrics(emptyNodeStatus), {});
});
});
});
3 changes: 3 additions & 0 deletions pkg/ui/workspaces/cluster-ui/src/util/proto.ts
@@ -21,6 +21,9 @@ export type StatusMetrics = typeof nodeStatus.metrics;
* with metrics from `store_statuses.metrics` object.
*/
export function rollupStoreMetrics(ns: INodeStatus): StatusMetrics {
if (!ns.store_statuses) {
return ns.metrics || {};
}
return ns.store_statuses
.map(ss => ss.metrics)
.reduce((acc, i) => {
