loqrecovery: support mixed version recovery
This commit adds mixed-version support for the half-online loss of quorum
recovery service and CLI tools.
This change allows users to run loss of quorum recovery in partially
upgraded clusters by tracking the version that generated the collected data
and producing recovery plans with an identical version, so that the versions
can be verified at every step of recovery.

Release note: None
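For context, the compatibility rule the commit message describes can be summarized as: a plan is only applied when its version matches the active cluster version, with an operator-controlled escape hatch for the internal version component. Below is a minimal sketch of such a check, assuming versions compare component-wise; the helper name `checkPlanVersion` and the package are illustrative, not the actual code added by this commit.

```go
package loqrecoverysketch // hypothetical package, for illustration only

import (
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/errors"
)

// checkPlanVersion gates recovery plan application on version compatibility.
func checkPlanVersion(active, plan roachpb.Version, ignoreInternal bool) error {
	// Data collected and plans produced by one release should be applied by
	// the same release: the released components must match exactly.
	if active.Major != plan.Major || active.Minor != plan.Minor {
		return errors.Newf(
			"recovery plan version %s is incompatible with active cluster version %s",
			plan, active)
	}
	// Nodes in a cluster stuck mid-upgrade may disagree on the internal
	// component; such mismatches may only be bypassed with an explicit
	// opt-in (the --ignore-internal-version flag added by this commit).
	if active.Internal != plan.Internal && !ignoreInternal {
		return errors.Newf(
			"internal version mismatch between plan (%s) and active cluster version (%s)",
			plan, active)
	}
	return nil
}
```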
aliher1911 committed Mar 16, 2023
1 parent 01e14f6 commit e2ffffc
Showing 25 changed files with 1,343 additions and 127 deletions.
4 changes: 3 additions & 1 deletion docs/generated/http/full.md
@@ -7450,6 +7450,7 @@ Support status: [reserved](#support-status)
| range_descriptor | [cockroach.roachpb.RangeDescriptor](#cockroach.server.serverpb.RecoveryCollectReplicaInfoResponse-cockroach.roachpb.RangeDescriptor) | | | [reserved](#support-status) |
| replica_info | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.ReplicaInfo](#cockroach.server.serverpb.RecoveryCollectReplicaInfoResponse-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.ReplicaInfo) | | | [reserved](#support-status) |
| node_stream_restarted | [RecoveryCollectReplicaRestartNodeStream](#cockroach.server.serverpb.RecoveryCollectReplicaInfoResponse-cockroach.server.serverpb.RecoveryCollectReplicaRestartNodeStream) | | | [reserved](#support-status) |
| metadata | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.ClusterMetadata](#cockroach.server.serverpb.RecoveryCollectReplicaInfoResponse-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.ClusterMetadata) | | | [reserved](#support-status) |



@@ -7538,7 +7539,8 @@ Support status: [reserved](#support-status)
| ----- | ---- | ----- | ----------- | -------------- |
| plan | [cockroach.kv.kvserver.loqrecovery.loqrecoverypb.ReplicaUpdatePlan](#cockroach.server.serverpb.RecoveryStagePlanRequest-cockroach.kv.kvserver.loqrecovery.loqrecoverypb.ReplicaUpdatePlan) | | Plan is replica update plan to stage for application on next restart. Plan could be empty in that case existing plan is removed if present. | [reserved](#support-status) |
| all_nodes | [bool](#cockroach.server.serverpb.RecoveryStagePlanRequest-bool) | | If all nodes is true, then receiver should act as a coordinator and perform a fan-out to stage plan on all nodes of the cluster. | [reserved](#support-status) |
| force_plan | [bool](#cockroach.server.serverpb.RecoveryStagePlanRequest-bool) | | Force plan tells receiver to ignore any plan already staged on the node if it is present and replace it with new plan (including empty one). | [reserved](#support-status) |
| force_plan | [bool](#cockroach.server.serverpb.RecoveryStagePlanRequest-bool) | | ForcePlan tells receiver to ignore any plan already staged on the node if it is present and replace it with new plan (including empty one). | [reserved](#support-status) |
| force_local_internal_version | [bool](#cockroach.server.serverpb.RecoveryStagePlanRequest-bool) | | ForceLocalInternalVersion tells server to update internal component of plan version to the one of active cluster version. This option needs to be set if target cluster is stuck in recovery where only part of nodes were successfully migrated. | [reserved](#support-status) |



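Taken together, the two API changes above support the half-online flow: replica info collection now returns cluster metadata (including the version), and plan staging can be forced past an internal-version mismatch. A sketch of a client staging a plan with the new field follows; the request fields and the `RecoveryStagePlan` call are taken from this diff, while the surrounding function and package are illustrative. The real CLI additionally inspects the response for per-node staging errors (see `maybeWrapStagingError` further down).

```go
package loqrecoverysketch // hypothetical package, for illustration only

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// stagePlan stages a recovery plan on the whole cluster through one node.
func stagePlan(
	ctx context.Context,
	c serverpb.AdminClient,
	plan *loqrecoverypb.ReplicaUpdatePlan,
	ignoreInternalVersion bool,
) error {
	_, err := c.RecoveryStagePlan(ctx, &serverpb.RecoveryStagePlanRequest{
		// The plan to stage; per the docs above, an empty plan combined
		// with ForcePlan removes any previously staged plan.
		Plan: plan,
		// The receiving node acts as coordinator and fans the plan out to
		// all nodes of the cluster.
		AllNodes: true,
		// Escape hatch for clusters stuck mid-upgrade, where the internal
		// component of locally stored versions differs from node to node.
		ForceLocalInternalVersion: ignoreInternalVersion,
	})
	return err
}
```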
1 change: 1 addition & 0 deletions pkg/cli/BUILD.bazel
@@ -367,6 +367,7 @@ go_test(
"//pkg/kv/kvserver",
"//pkg/kv/kvserver/allocator/storepool",
"//pkg/kv/kvserver/liveness/livenesspb",
"//pkg/kv/kvserver/loqrecovery",
"//pkg/kv/kvserver/loqrecovery/loqrecoverypb",
"//pkg/kv/kvserver/stateloader",
"//pkg/roachpb",
10 changes: 10 additions & 0 deletions pkg/cli/cliflags/flags.go
@@ -1913,6 +1913,16 @@ p - prompt interactively for a confirmation
`,
}

RecoverIgnoreInternalVersion = FlagInfo{
Name: "ignore-internal-version",
Description: `
When set, staging and local store plan application commands will ignore internal
cluster version. This option must only be used to bypass version check if
cluster is stuck in the middle of upgrade and locally stored versions differ
from node to node and previous application or staging attempt failed.
`,
}

PrintKeyLength = FlagInfo{
Name: "print-key-max-length",
Description: `
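In practice this flag accompanies the recovery staging and application commands, e.g. `cockroach debug recover apply-plan --ignore-internal-version recovery-plan.json` (the plan file name here is illustrative), both in half-online mode and together with `--store` for offline application; the flag binding is shown in the debug.go hunk below.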
2 changes: 2 additions & 0 deletions pkg/cli/debug.go
@@ -1451,6 +1451,8 @@ func init() {
cliflags.ConfirmActions.Usage())
f.UintVar(&formatHelper.maxPrintedKeyLength, cliflags.PrintKeyLength.Name,
formatHelper.maxPrintedKeyLength, cliflags.PrintKeyLength.Usage())
f.BoolVar(&debugRecoverExecuteOpts.ignoreInternalVersion, cliflags.RecoverIgnoreInternalVersion.Name,
debugRecoverExecuteOpts.ignoreInternalVersion, cliflags.RecoverIgnoreInternalVersion.Usage())

f = debugMergeLogsCmd.Flags()
f.Var(flagutil.Time(&debugMergeLogsOpts.from), "from",
85 changes: 53 additions & 32 deletions pkg/cli/debug_recover_loss_of_quorum.go
@@ -21,14 +21,14 @@ import (
"strings"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/cockroachdb/cockroach/pkg/util/strutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -353,10 +353,9 @@ func runDebugDeadReplicaCollect(cmd *cobra.Command, args []string) error {
defer outFile.Close()
writer = outFile
}
jsonpb := protoutil.JSONPb{Indent: " "}
out, err := jsonpb.Marshal(&replicaInfo)
out, err := loqrecovery.MarshalReplicaInfo(replicaInfo)
if err != nil {
return errors.Wrap(err, "failed to marshal collected replica info")
return err
}
if _, err := writer.Write(out); err != nil {
return errors.Wrap(err, "failed to write collected replica info")
@@ -564,33 +563,41 @@ Discarded live replicas: %d
planFile = path.Base(debugRecoverPlanOpts.outputFileName)
}

jsonpb := protoutil.JSONPb{Indent: " "}
var out []byte
if out, err = jsonpb.Marshal(&plan); err != nil {
return errors.Wrap(err, "failed to marshal recovery plan")
if out, err = loqrecovery.MarshalPlan(plan); err != nil {
return err
}
if _, err = writer.Write(out); err != nil {
return errors.Wrap(err, "failed to write recovery plan")
}

// No args means we collected connection info from cluster and need to
// preserve flags for subsequent invocation.
remoteArgs := getCLIClusterFlags(len(args) == 0, cmd, func(flag string) bool {
_, filter := planSpecificFlags[flag]
return filter
})
v := clusterversion.ClusterVersion{
Version: plan.Version,
}
if v.IsActive(clusterversion.V23_1) {
// No args means we collected connection info from cluster and need to
// preserve flags for subsequent invocation.
remoteArgs := getCLIClusterFlags(len(args) == 0, cmd, func(flag string) bool {
_, filter := planSpecificFlags[flag]
return filter
})

_, _ = fmt.Fprintf(stderr, `Plan created.
_, _ = fmt.Fprintf(stderr, `Plan created.
To stage recovery application in half-online mode invoke:
cockroach debug recover apply-plan %s %s
Alternatively distribute plan to below nodes and invoke 'debug recover apply-plan --store=<store-dir> %s' on:
`, remoteArgs, planFile, planFile)
} else {
_, _ = fmt.Fprintf(stderr, `Plan created.
To complete recovery, distribute plan to below nodes and invoke 'debug recover apply-plan --store=<store-dir> %s' on:
`, planFile)
}
for _, node := range report.UpdatedNodes {
_, _ = fmt.Fprintf(stderr, "- node n%d, store(s) %s\n", node.NodeID, strutil.JoinIDs("s", node.StoreIDs))
_, _ = fmt.Fprintf(stderr, "- node n%d, store(s) %s\n", node.NodeID,
strutil.JoinIDs("s", node.StoreIDs))
}

return nil
}

@@ -602,9 +609,8 @@ func readReplicaInfoData(fileNames []string) (loqrecoverypb.ClusterReplicaInfo,
return loqrecoverypb.ClusterReplicaInfo{}, errors.Wrapf(err, "failed to read replica info file %q", filename)
}

var nodeReplicas loqrecoverypb.ClusterReplicaInfo
jsonpb := protoutil.JSONPb{}
if err = jsonpb.Unmarshal(data, &nodeReplicas); err != nil {
nodeReplicas, err := loqrecovery.UnmarshalReplicaInfo(data)
if err != nil {
return loqrecoverypb.ClusterReplicaInfo{}, errors.WithHint(errors.Wrapf(err,
"failed to unmarshal replica info from file %q", filename),
"Ensure that replica info file is generated with the same binary version and file is not corrupted.")
@@ -633,8 +639,9 @@ See debug recover command help for more details on how to use this command.
}

var debugRecoverExecuteOpts struct {
Stores base.StoreSpecList
confirmAction confirmActionFlag
Stores base.StoreSpecList
confirmAction confirmActionFlag
ignoreInternalVersion bool
}

// runDebugExecuteRecoverPlan is using the following pattern when performing command
@@ -655,20 +662,24 @@ func runDebugExecuteRecoverPlan(cmd *cobra.Command, args []string) error {
return errors.Wrapf(err, "failed to read plan file %q", planFile)
}

var nodeUpdates loqrecoverypb.ReplicaUpdatePlan
jsonpb := protoutil.JSONPb{Indent: " "}
if err = jsonpb.Unmarshal(data, &nodeUpdates); err != nil {
nodeUpdates, err := loqrecovery.UnmarshalPlan(data)
if err != nil {
return errors.Wrapf(err, "failed to unmarshal plan from file %q", planFile)
}

if len(debugRecoverExecuteOpts.Stores.Specs) == 0 {
return stageRecoveryOntoCluster(ctx, cmd, planFile, nodeUpdates)
return stageRecoveryOntoCluster(ctx, cmd, planFile, nodeUpdates,
debugRecoverExecuteOpts.ignoreInternalVersion)
}
return applyRecoveryToLocalStore(ctx, nodeUpdates)
return applyRecoveryToLocalStore(ctx, nodeUpdates, debugRecoverExecuteOpts.ignoreInternalVersion)
}

func stageRecoveryOntoCluster(
ctx context.Context, cmd *cobra.Command, planFile string, plan loqrecoverypb.ReplicaUpdatePlan,
ctx context.Context,
cmd *cobra.Command,
planFile string,
plan loqrecoverypb.ReplicaUpdatePlan,
ignoreInternalVersion bool,
) error {
c, finish, err := getAdminClient(ctx, serverCfg)
if err != nil {
@@ -747,7 +758,11 @@ func stageRecoveryOntoCluster(
return err
}
}
sr, err := c.RecoveryStagePlan(ctx, &serverpb.RecoveryStagePlanRequest{Plan: &plan, AllNodes: true})
sr, err := c.RecoveryStagePlan(ctx, &serverpb.RecoveryStagePlanRequest{
Plan: &plan,
AllNodes: true,
ForceLocalInternalVersion: ignoreInternalVersion,
})
if err := maybeWrapStagingError("failed to stage loss of quorum recovery plan on cluster",
sr, err); err != nil {
return err
@@ -787,19 +802,21 @@ func sortedKeys[T ~int | ~int32 | ~int64](set map[T]any) []T {
}

func applyRecoveryToLocalStore(
ctx context.Context, nodeUpdates loqrecoverypb.ReplicaUpdatePlan,
ctx context.Context, nodeUpdates loqrecoverypb.ReplicaUpdatePlan, ignoreInternalVersion bool,
) error {
stopper := stop.NewStopper()
defer stopper.Stop(ctx)

var localNodeID roachpb.NodeID
batches := make(map[roachpb.StoreID]storage.Batch)
for _, storeSpec := range debugRecoverExecuteOpts.Stores.Specs {
stores := make([]storage.Engine, len(debugRecoverExecuteOpts.Stores.Specs))
for i, storeSpec := range debugRecoverExecuteOpts.Stores.Specs {
store, err := OpenEngine(storeSpec.Path, stopper, storage.MustExist)
if err != nil {
return errors.Wrapf(err, "failed to open store at path %q. ensure that store path is "+
"correct and that it is not used by another process", storeSpec.Path)
}
stores[i] = store
batch := store.NewBatch()
defer store.Close()
defer batch.Close()
Expand All @@ -818,6 +835,10 @@ func applyRecoveryToLocalStore(
batches[storeIdent.StoreID] = batch
}

if err := loqrecovery.CheckEnginesVersion(ctx, stores, nodeUpdates, ignoreInternalVersion); err != nil {
return err
}

updateTime := timeutil.Now()
prepReport, err := loqrecovery.PrepareUpdateReplicas(
ctx, nodeUpdates, uuid.DefaultGenerator, updateTime, localNodeID, batches)
@@ -911,8 +932,8 @@ func runDebugVerify(cmd *cobra.Command, args []string) error {
if err != nil {
return errors.Wrapf(err, "failed to read plan file %q", planFile)
}
jsonpb := protoutil.JSONPb{Indent: " "}
if err = jsonpb.Unmarshal(data, &updatePlan); err != nil {
updatePlan, err = loqrecovery.UnmarshalPlan(data)
if err != nil {
return errors.Wrapf(err, "failed to unmarshal plan from file %q", planFile)
}
}
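Throughout this file, the inline jsonpb marshalling moves behind `loqrecovery.MarshalPlan`/`UnmarshalPlan` and `MarshalReplicaInfo`/`UnmarshalReplicaInfo`. Their implementations are not part of this excerpt; the following is a plausible minimal sketch of the plan helpers, assuming they keep the JSON encoding previously used inline here, with the version handling the commit message describes left out.

```go
package loqrecoverysketch // hypothetical package, for illustration only

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/errors"
)

// MarshalPlan encodes a recovery plan as indented JSON, as the inline
// jsonpb code it replaces did.
func MarshalPlan(plan loqrecoverypb.ReplicaUpdatePlan) ([]byte, error) {
	jsonpb := protoutil.JSONPb{Indent: " "}
	out, err := jsonpb.Marshal(&plan)
	return out, errors.Wrap(err, "failed to marshal recovery plan")
}

// UnmarshalPlan decodes a recovery plan from JSON. Per the commit message,
// the real helper is also the natural place to validate the embedded plan
// version against what this binary supports; that logic is omitted here.
func UnmarshalPlan(data []byte) (loqrecoverypb.ReplicaUpdatePlan, error) {
	var plan loqrecoverypb.ReplicaUpdatePlan
	jsonpb := protoutil.JSONPb{}
	if err := jsonpb.Unmarshal(data, &plan); err != nil {
		return loqrecoverypb.ReplicaUpdatePlan{}, err
	}
	return plan, nil
}
```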
