Commit 4331b7a

core: enforce strict steps for clients reconnect
When a Nomad client that is running an allocation with `max_client_disconnect` set misses a heartbeat, the Nomad server will update its status to `disconnected`.

Upon reconnecting, the client makes three main RPC calls:

- `Node.UpdateStatus` is used to set the client status to `ready`.
- `Node.UpdateAlloc` is used to update the client-side information about allocations, such as their `ClientStatus`, task states, etc.
- `Node.Register` is used to upsert the entire node information, including its status.

These calls are made concurrently and also run in parallel with the scheduler. Depending on the order in which they run, the scheduler may end up with incomplete data when reconciling allocations.

For example, a client disconnects and its replacement allocation cannot be placed anywhere else, so there's a pending eval waiting for resources. When this client comes back, the order of events may be:

1. Client calls `Node.UpdateStatus` and is now `ready`.
2. Scheduler reconciles allocations and places the replacement alloc on the client. The client is now assigned two allocations: the original alloc that is still `unknown` and the replacement that is `pending`.
3. Client calls `Node.UpdateAlloc` and updates the original alloc to `running`.
4. Scheduler notices too many allocs and stops the replacement.

This creates unnecessary placements or, in a different order of events, may leave the job without any allocations running until the whole state is updated and reconciled.

To avoid problems like this, clients must update _all_ of their relevant information before they can be considered `ready` and available for scheduling. To achieve this goal, the RPC endpoints mentioned above have been modified to enforce strict steps for nodes reconnecting:

- `Node.Register` does not set the client status anymore.
- `Node.UpdateStatus` keeps the reconnecting client in the `initializing` status until it successfully calls `Node.UpdateAlloc`.

These changes are done server-side to avoid the need for additional coordination between clients and servers. Clients are kept oblivious of these changes and will keep making these calls as they normally would.

The verification of whether allocations have been updated is done by storing and comparing the Raft index of the last time the client missed a heartbeat and the last time it updated its allocations.
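As a rough illustration of the enforced flow, the sketch below condenses the status gate that `Node.UpdateStatus` now applies (compare with the nomad/node_endpoint.go diff further down). The standalone function form, the name `nextStatus`, and the `hasNonTerminalAllocs` parameter are illustrative assumptions; in the commit the same logic lives inline in the RPC handler:

package sketch

import "github.com/hashicorp/nomad/nomad/structs"

// nextStatus decides which status the server should actually store when a
// client reports `requested` through Node.UpdateStatus.
func nextStatus(node *structs.Node, requested string, hasNonTerminalAllocs bool) string {
    switch node.Status {
    case structs.NodeStatusDisconnected:
        // A reconnecting client never jumps straight to ready; it is held
        // at initializing until it has updated its allocations.
        if requested == structs.NodeStatusReady {
            return structs.NodeStatusInit
        }
    case structs.NodeStatusInit:
        // Allow ready only once the client has written an alloc update
        // after its last missed heartbeat, or if it has no allocs at all.
        allocsUpdated := node.LastAllocUpdateIndex > node.LastMissedHeartbeatIndex
        if requested == structs.NodeStatusReady && hasNonTerminalAllocs && !allocsUpdated {
            return structs.NodeStatusInit
        }
    }
    return requested
}

Here `hasNonTerminalAllocs` stands in for the `AllocsByNodeTerminal` state-snapshot lookup that the real handler performs.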
1 parent e451be7 commit 4331b7a

5 files changed: +314 -23 lines changed

nomad/node_endpoint.go (+40 -3)
@@ -157,11 +157,17 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp
         return err
     }

-    // Check if the SecretID has been tampered with
     if originalNode != nil {
+        // Check if the SecretID has been tampered with
         if args.Node.SecretID != originalNode.SecretID && originalNode.SecretID != "" {
             return fmt.Errorf("node secret ID does not match. Not registering node.")
         }
+
+        // Don't allow the Register method to update the node status. Only the
+        // UpdateStatus method should be able to do this.
+        if originalNode.Status != "" {
+            args.Node.Status = originalNode.Status
+        }
     }

     // We have a valid node connection, so add the mapping to cache the
@@ -486,6 +492,26 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct
     // Update the timestamp of when the node status was updated
     args.UpdatedAt = time.Now().Unix()

+    // Compute next status.
+    switch node.Status {
+    case structs.NodeStatusInit:
+        if args.Status == structs.NodeStatusReady {
+            allocs, err := snap.AllocsByNodeTerminal(ws, args.NodeID, false)
+            if err != nil {
+                return fmt.Errorf("failed to query node allocs: %v", err)
+            }
+
+            allocsUpdated := node.LastAllocUpdateIndex > node.LastMissedHeartbeatIndex
+            if len(allocs) > 0 && !allocsUpdated {
+                args.Status = structs.NodeStatusInit
+            }
+        }
+    case structs.NodeStatusDisconnected:
+        if args.Status == structs.NodeStatusReady {
+            args.Status = structs.NodeStatusInit
+        }
+    }
+
     // Commit this update via Raft
     var index uint64
     if node.Status != args.Status {
@@ -1179,8 +1205,8 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
     if node == nil {
         return fmt.Errorf("node %s not found", nodeID)
     }
-    if node.Status != structs.NodeStatusReady {
-        return fmt.Errorf("node %s is %s, not %s", nodeID, node.Status, structs.NodeStatusReady)
+    if node.UnresponsiveStatus() {
+        return fmt.Errorf("node %s is not allowed to update allocs while in status %s", nodeID, node.Status)
     }

     // Ensure that evals aren't set from client RPCs
@@ -1313,6 +1339,17 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
         return err
     }

+    // Update node alloc update index.
+    copyNode := node.Copy()
+    copyNode.LastAllocUpdateIndex = future.Index()
+
+    _, _, err = n.srv.raftApply(structs.NodeRegisterRequestType, &structs.NodeRegisterRequest{
+        Node: copyNode,
+    })
+    if err != nil {
+        return fmt.Errorf("node update failed: %v", err)
+    }
+
     // Setup the response
     reply.Index = future.Index()
     return nil

nomad/node_endpoint_test.go (+179 -17)
@@ -23,6 +23,7 @@ import (
     "github.com/hashicorp/nomad/testutil"
     vapi "github.com/hashicorp/vault/api"
     "github.com/kr/pretty"
+    "github.com/shoenig/test/must"
     "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"
 )
@@ -524,6 +525,171 @@ func TestClientEndpoint_UpdateStatus_Vault(t *testing.T) {
     }
 }

+func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) {
+    ci.Parallel(t)
+
+    // Setup server with tighter heartbeat so we don't have to wait so long
+    // for nodes to go down.
+    heartbeatTTL := time.Duration(500*testutil.TestMultiplier()) * time.Millisecond
+    s, cleanupS := TestServer(t, func(c *Config) {
+        c.MinHeartbeatTTL = heartbeatTTL
+        c.HeartbeatGrace = 2 * heartbeatTTL
+    })
+    codec := rpcClient(t, s)
+    defer cleanupS()
+    testutil.WaitForLeader(t, s.RPC)
+
+    // Register node.
+    node := mock.Node()
+    reg := &structs.NodeRegisterRequest{
+        Node:         node,
+        WriteRequest: structs.WriteRequest{Region: "global"},
+    }
+    var nodeUpdateResp structs.NodeUpdateResponse
+    err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &nodeUpdateResp)
+    must.NoError(t, err)
+
+    // Start heartbeat.
+    stopHeartbeat := make(chan interface{})
+    heartbeat := func() {
+        ticker := time.NewTicker(heartbeatTTL / 2)
+        for {
+            select {
+            case <-stopHeartbeat:
+                ticker.Stop()
+                return
+            case <-ticker.C:
+                hb := &structs.NodeUpdateStatusRequest{
+                    NodeID:       node.ID,
+                    Status:       structs.NodeStatusReady,
+                    WriteRequest: structs.WriteRequest{Region: "global"},
+                }
+                err := msgpackrpc.CallWithCodec(codec, "Node.UpdateStatus", hb, &nodeUpdateResp)
+                must.NoError(t, err)
+            }
+        }
+    }
+    go heartbeat()
+
+    // Wait for node to be ready.
+    testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusReady)
+
+    // Register job with max_client_disconnect.
+    job := mock.Job()
+    job.Constraints = []*structs.Constraint{}
+    job.TaskGroups[0].Count = 1
+    job.TaskGroups[0].MaxClientDisconnect = pointer.Of(time.Hour)
+    job.TaskGroups[0].Constraints = []*structs.Constraint{}
+    job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
+    job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
+        "run_for": "10m",
+    }
+
+    jobReq := &structs.JobRegisterRequest{
+        Job: job,
+        WriteRequest: structs.WriteRequest{
+            Region:    "global",
+            Namespace: job.Namespace,
+        },
+    }
+    var jobResp structs.JobRegisterResponse
+    err = msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq, &jobResp)
+    must.NoError(t, err)
+
+    // Wait for the alloc to be pending on the server.
+    testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+        structs.AllocClientStatusPending: 1,
+    })
+
+    // Get allocs that node should run.
+    allocsReq := &structs.NodeSpecificRequest{
+        NodeID: node.ID,
+        QueryOptions: structs.QueryOptions{
+            Region: "global",
+        },
+    }
+    var allocsResp structs.NodeAllocsResponse
+    err = msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", allocsReq, &allocsResp)
+    must.NoError(t, err)
+    must.Len(t, 1, allocsResp.Allocs)
+
+    // Tell server the alloc is running.
+    // Save the alloc so we can reuse the request later.
+    alloc := allocsResp.Allocs[0].Copy()
+    alloc.ClientStatus = structs.AllocClientStatusRunning
+
+    allocUpdateReq := &structs.AllocUpdateRequest{
+        Alloc: []*structs.Allocation{alloc},
+        WriteRequest: structs.WriteRequest{
+            Region: "global",
+        },
+    }
+    var resp structs.GenericResponse
+    err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", allocUpdateReq, &resp)
+    must.NoError(t, err)
+
+    // Wait for the alloc to be running on the server.
+    testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+        structs.AllocClientStatusRunning: 1,
+    })
+
+    // Stop heartbeat and wait for the client to be disconnected and the alloc
+    // to be unknown.
+    close(stopHeartbeat)
+    testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusDisconnected)
+    testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+        structs.AllocClientStatusUnknown: 1,
+    })
+
+    // There should be a pending eval for the alloc replacement.
+    state := s.fsm.State()
+    ws := memdb.NewWatchSet()
+    evals, err := state.EvalsByJob(ws, job.Namespace, job.ID)
+    must.NoError(t, err)
+    found := false
+    for _, eval := range evals {
+        if eval.Status == structs.EvalStatusPending {
+            found = true
+            break
+        }
+    }
+    must.True(t, found)
+
+    // Restart heartbeat to reconnect node.
+    stopHeartbeat = make(chan interface{})
+    go heartbeat()
+
+    // Wait for node to be initializing.
+    // It must remain initializing until it updates its allocs with the server
+    // so the scheduler has the necessary information to avoid unnecessary
+    // placements by the pending eval.
+    testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusInit)
+
+    // Get allocs that node should run.
+    // The node should only have one alloc assigned until it updates its allocs
+    // status with the server.
+    allocsReq = &structs.NodeSpecificRequest{
+        NodeID: node.ID,
+        QueryOptions: structs.QueryOptions{
+            Region: "global",
+        },
+    }
+    err = msgpackrpc.CallWithCodec(codec, "Node.GetAllocs", allocsReq, &allocsResp)
+    must.NoError(t, err)
+    must.Len(t, 1, allocsResp.Allocs)
+
+    // Tell server the alloc is running.
+    err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", allocUpdateReq, &resp)
+    must.NoError(t, err)
+
+    // Wait for the alloc to be running on the server.
+    testutil.WaitForJobAllocStatus(t, s.RPC, job, map[string]int{
+        structs.AllocClientStatusRunning: 1,
+    })
+
+    // Wait for the client to be ready.
+    testutil.WaitForClientStatus(t, s.RPC, node.ID, "global", structs.NodeStatusReady)
+}
+
 func TestClientEndpoint_UpdateStatus_HeartbeatRecovery(t *testing.T) {
     ci.Parallel(t)
     require := require.New(t)
@@ -639,29 +805,25 @@ func TestClientEndpoint_Register_GetEvals(t *testing.T) {
     }

     // Transition it to down and then ready
-    node.Status = structs.NodeStatusDown
-    reg = &structs.NodeRegisterRequest{
-        Node:         node,
+    req := &structs.NodeUpdateStatusRequest{
+        NodeID:       node.ID,
+        Status:       structs.NodeStatusDown,
         WriteRequest: structs.WriteRequest{Region: "global"},
     }
-
-    // Fetch the response
-    if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil {
+    if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateStatus", req, &resp); err != nil {
         t.Fatalf("err: %v", err)
     }

     if len(resp.EvalIDs) != 1 {
         t.Fatalf("expected one eval; got %#v", resp.EvalIDs)
     }

-    node.Status = structs.NodeStatusReady
-    reg = &structs.NodeRegisterRequest{
-        Node:         node,
+    req = &structs.NodeUpdateStatusRequest{
+        NodeID:       node.ID,
+        Status:       structs.NodeStatusReady,
         WriteRequest: structs.WriteRequest{Region: "global"},
     }
-
-    // Fetch the response
-    if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil {
+    if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateStatus", req, &resp); err != nil {
         t.Fatalf("err: %v", err)
     }

@@ -1369,12 +1531,12 @@ func TestClientEndpoint_Drain_Down(t *testing.T) {
     require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2))

     // Mark the node as down
-    node.Status = structs.NodeStatusDown
-    reg = &structs.NodeRegisterRequest{
-        Node:         node,
+    req := &structs.NodeUpdateStatusRequest{
+        NodeID:       node.ID,
+        Status:       structs.NodeStatusDown,
         WriteRequest: structs.WriteRequest{Region: "global"},
     }
-    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp))
+    require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateStatus", req, &resp))

     // Ensure that the allocation has transitioned to lost
     testutil.WaitForResult(func() (bool, error) {
@@ -2581,7 +2743,7 @@ func TestClientEndpoint_UpdateAlloc_NodeNotReady(t *testing.T) {
     }
     var allocUpdateResp structs.NodeAllocsResponse
     err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", allocUpdateReq, &allocUpdateResp)
-    require.ErrorContains(t, err, "not ready")
+    require.ErrorContains(t, err, "not allowed to update allocs")

     // Send request without an explicit node ID.
     updatedAlloc.NodeID = ""

nomad/state/state_store.go (+20 -0)
@@ -909,6 +909,11 @@ func upsertNodeTxn(txn *txn, index uint64, node *structs.Node) error {
         node.CreateIndex = exist.CreateIndex
         node.ModifyIndex = index

+        // Update last missed heartbeat if the node became unresponsive.
+        if !exist.UnresponsiveStatus() && node.UnresponsiveStatus() {
+            node.LastMissedHeartbeatIndex = index
+        }
+
         // Retain node events that have already been set on the node
         node.Events = exist.Events

@@ -923,6 +928,16 @@ func upsertNodeTxn(txn *txn, index uint64, node *structs.Node) error {
         node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility
         node.DrainStrategy = exist.DrainStrategy // Retain the drain strategy
         node.LastDrain = exist.LastDrain // Retain the drain metadata
+
+        // Retain the last index the node missed a heartbeat.
+        if node.LastMissedHeartbeatIndex < exist.LastMissedHeartbeatIndex {
+            node.LastMissedHeartbeatIndex = exist.LastMissedHeartbeatIndex
+        }
+
+        // Retain the last index the node updated its allocs.
+        if node.LastAllocUpdateIndex < exist.LastAllocUpdateIndex {
+            node.LastAllocUpdateIndex = exist.LastAllocUpdateIndex
+        }
     } else {
         // Because this is the first time the node is being registered, we should
         // also create a node registration event
@@ -1029,6 +1044,11 @@ func (s *StateStore) updateNodeStatusTxn(txn *txn, nodeID, status string, update
     copyNode.Status = status
     copyNode.ModifyIndex = txn.Index

+    // Update last missed heartbeat if the node became unresponsive.
+    if !existingNode.UnresponsiveStatus() && copyNode.UnresponsiveStatus() {
+        copyNode.LastMissedHeartbeatIndex = txn.Index
+    }
+
     // Insert the node
     if err := txn.Insert("nodes", copyNode); err != nil {
         return fmt.Errorf("node update failed: %v", err)

nomad/structs/structs.go (+19 -0)
@@ -2090,6 +2090,14 @@
     // LastDrain contains metadata about the most recent drain operation
     LastDrain *DrainMetadata

+    // LastMissedHeartbeatIndex stores the Raft index when the node
+    // last missed a heartbeat.
+    LastMissedHeartbeatIndex uint64
+
+    // LastAllocUpdateIndex stores the Raft index of the last time the node
+    // updated its allocation statuses.
+    LastAllocUpdateIndex uint64
+
     // Raft Indexes
     CreateIndex uint64
     ModifyIndex uint64
@@ -2184,6 +2192,17 @@
     return &nn
 }

+// UnresponsiveStatus returns true if the node is in a status where it is not
+// communicating with the server.
+func (n *Node) UnresponsiveStatus() bool {
+    switch n.Status {
+    case NodeStatusDown, NodeStatusDisconnected:
+        return true
+    default:
+        return false
+    }
+}
+
 // TerminalStatus returns if the current status is terminal and
 // will no longer transition.
 func (n *Node) TerminalStatus() bool {