Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unsafe recover: multiple stage recovery plan generation #4704

Merged
merged 56 commits into from
May 18, 2022
Merged
Show file tree
Hide file tree
Changes from 43 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
66d2bf0
generate plan
Connor1996 Mar 3, 2022
6338999
change kvproto
Connor1996 Mar 22, 2022
b952ffe
refine
Connor1996 Mar 22, 2022
e907704
change test
Connor1996 Mar 24, 2022
69aeb6d
add whole process test and fix bug
Connor1996 Apr 20, 2022
9243e7e
fix select leader
Connor1996 Apr 20, 2022
a9b64d2
update kvproto
Connor1996 Apr 21, 2022
4c1d145
separate create empty region stage
Connor1996 Apr 21, 2022
90a1d82
update kvproto and add log
Connor1996 Apr 24, 2022
078f33c
refine show output and test
Connor1996 Apr 24, 2022
b751cde
add stage transition graph and failed stage
Connor1996 Apr 25, 2022
ae78fe1
address comment
Connor1996 Apr 25, 2022
5b4b5eb
add abort test
Connor1996 Apr 25, 2022
ce39e1b
address comment's case and fix bugs
Connor1996 Apr 28, 2022
6b20b04
Merge remote-tracking branch 'upstream/master' into unsafe-recover
Connor1996 Apr 28, 2022
f9ef172
add timeout
Connor1996 Apr 28, 2022
15aa4a7
add timeout and recovery step
Connor1996 May 2, 2022
3d893b3
force leader for commit merge first
Connor1996 May 3, 2022
dcdb2d5
consider the case in the midst of joint state
Connor1996 May 5, 2022
8cc5ee1
Merge remote-tracking branch 'upstream/master' into unsafe-recover
Connor1996 May 6, 2022
dfe0625
fix format
Connor1996 May 6, 2022
86f5f61
drop region cache
Connor1996 May 6, 2022
c8aaa49
fix static check
Connor1996 May 6, 2022
2eea260
remove blank line
Connor1996 May 6, 2022
0a7fae1
address comment
Connor1996 May 6, 2022
dd07ee4
use struct{} instead of interface{}
Connor1996 May 6, 2022
0900bed
fix client build
Connor1996 May 6, 2022
d398886
fix test
Connor1996 May 6, 2022
c101c59
refine output
Connor1996 May 6, 2022
19a83bc
json structural output
Connor1996 May 7, 2022
43b34fe
add paranoid check
Connor1996 May 7, 2022
62eace9
adjust output
Connor1996 May 7, 2022
7e1e1d6
remove history api and update transition graph
Connor1996 May 7, 2022
7434737
clean
Connor1996 May 7, 2022
bdb7fa7
add create empty region test
Connor1996 May 9, 2022
0673714
address comment
Connor1996 May 9, 2022
877e7ba
address comment
Connor1996 May 9, 2022
533a869
fix overlap check
Connor1996 May 9, 2022
1cae29e
rename
Connor1996 May 9, 2022
f4bcdd7
smaller timeout
Connor1996 May 10, 2022
d68c30b
update kvproto
Connor1996 May 10, 2022
c58101b
address comment
Connor1996 May 10, 2022
9e6c6a0
address comment
Connor1996 May 11, 2022
7de9f08
consider uninitialized peer
Connor1996 May 12, 2022
b625709
prettify output
Connor1996 May 12, 2022
18c83a3
Merge remote-tracking branch 'upstream/master' into unsafe-recover
Connor1996 May 12, 2022
9bea873
introduce exit force leader stage
Connor1996 May 15, 2022
2d4dd37
update scheduler running metrics
Connor1996 May 15, 2022
419a1a9
fix timeout
Connor1996 May 15, 2022
89203ce
Merge remote-tracking branch 'upstream/master' into unsafe-recover
Connor1996 May 15, 2022
d4a1496
remove unnecessary lock
Connor1996 May 16, 2022
efa9cdb
fix test
Connor1996 May 16, 2022
8335085
fix test
Connor1996 May 17, 2022
d4a0a86
add affected tables detail
Connor1996 May 17, 2022
4c286fe
Merge remote-tracking branch 'upstream/master' into unsafe-recover
Connor1996 May 17, 2022
0506fd6
Merge branch 'master' into unsafe-recover
ti-chi-bot May 18, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion client/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ require (
github.com/pingcap/check v0.0.0-20211026125417-57bd13f7b5f0
github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c
github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00
github.com/pingcap/kvproto v0.0.0-20220330070404-8c4cd3f93748
github.com/pingcap/kvproto v0.0.0-20220510035547-0e2f26c0a46a
github.com/pingcap/log v0.0.0-20211215031037-e024ba4eb0ee
github.com/prometheus/client_golang v1.11.0
go.uber.org/goleak v1.1.11
Expand Down
4 changes: 2 additions & 2 deletions client/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c h1:xpW9bvK+HuuTm
github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ueLEUZgtx2cIogM0v4Zj5uvvzhuuiu7Pn8HzMPg=
github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0=
github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew=
github.com/pingcap/kvproto v0.0.0-20220330070404-8c4cd3f93748 h1:i4MBe1zGq9/r3BH6rTRunizi4T59fpNk8hvBCrB5UAY=
github.com/pingcap/kvproto v0.0.0-20220330070404-8c4cd3f93748/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI=
github.com/pingcap/kvproto v0.0.0-20220510035547-0e2f26c0a46a h1:TxdHGOFeNa1q1mVv6TgReayf26iI4F8PQUm6RnZ/V/E=
github.com/pingcap/kvproto v0.0.0-20220510035547-0e2f26c0a46a/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI=
github.com/pingcap/log v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20211215031037-e024ba4eb0ee h1:VO2t6IBpfvW34TdtD/G10VvnGqjLic1jzOuHjUb5VqM=
github.com/pingcap/log v0.0.0-20211215031037-e024ba4eb0ee/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4=
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ require (
github.com/pingcap/errcode v0.3.0
github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c
github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce
github.com/pingcap/kvproto v0.0.0-20220330070404-8c4cd3f93748
github.com/pingcap/kvproto v0.0.0-20220510035547-0e2f26c0a46a
github.com/pingcap/log v0.0.0-20210906054005-afc726e70354
github.com/pingcap/sysutil v0.0.0-20211208032423-041a72e5860d
github.com/pingcap/tidb-dashboard v0.0.0-20220331105802-5ac69661755c
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -400,8 +400,8 @@ github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce h1:Y1kCxlCtlPTMt
github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce/go.mod h1:w4PEZ5y16LeofeeGwdgZB4ddv9bLyDuIX+ljstgKZyk=
github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w=
github.com/pingcap/kvproto v0.0.0-20200411081810-b85805c9476c/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI=
github.com/pingcap/kvproto v0.0.0-20220330070404-8c4cd3f93748 h1:i4MBe1zGq9/r3BH6rTRunizi4T59fpNk8hvBCrB5UAY=
github.com/pingcap/kvproto v0.0.0-20220330070404-8c4cd3f93748/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI=
github.com/pingcap/kvproto v0.0.0-20220510035547-0e2f26c0a46a h1:TxdHGOFeNa1q1mVv6TgReayf26iI4F8PQUm6RnZ/V/E=
github.com/pingcap/kvproto v0.0.0-20220510035547-0e2f26c0a46a/go.mod h1:OYtxs0786qojVTmkVeufx93xe+jUgm56GUYRIKnmaGI=
github.com/pingcap/log v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20200511115504-543df19646ad/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8=
github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM=
Expand Down
12 changes: 7 additions & 5 deletions server/api/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,11 +405,13 @@ func mustPutStore(c *C, svr *server.Server, id uint64, state metapb.StoreState,
},
})
c.Assert(err, IsNil)
_, err = s.StoreHeartbeat(context.Background(), &pdpb.StoreHeartbeatRequest{
Header: &pdpb.RequestHeader{ClusterId: svr.ClusterID()},
Stats: &pdpb.StoreStats{StoreId: id},
})
c.Assert(err, IsNil)
if state == metapb.StoreState_Up {
_, err = s.StoreHeartbeat(context.Background(), &pdpb.StoreHeartbeatRequest{
Header: &pdpb.RequestHeader{ClusterId: svr.ClusterID()},
Stats: &pdpb.StoreStats{StoreId: id},
})
c.Assert(err, IsNil)
}
}

func mustRegionHeartbeat(c *C, svr *server.Server, region *core.RegionInfo) {
Expand Down
2 changes: 0 additions & 2 deletions server/api/router.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,6 @@ func createRouter(prefix string, svr *server.Server) *mux.Router {
unsafeOperationHandler.RemoveFailedStores, setMethods("POST"))
registerFunc(clusterRouter, "/admin/unsafe/remove-failed-stores/show",
unsafeOperationHandler.GetFailedStoresRemovalStatus, setMethods("GET"))
registerFunc(clusterRouter, "/admin/unsafe/remove-failed-stores/history",
unsafeOperationHandler.GetFailedStoresRemovalHistory, setMethods("GET"))

// API to set or unset failpoints
failpoint.Inject("enableFailpointAPI", func() {
Expand Down
24 changes: 10 additions & 14 deletions server/api/unsafe_operation.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,17 @@ func (h *unsafeOperationHandler) RemoveFailedStores(w http.ResponseWriter, r *ht
h.rd.JSON(w, http.StatusBadRequest, "Store ids are invalid")
return
}
stores := make(map[uint64]string)
stores := make(map[uint64]struct{})
for _, store := range storeSlice {
stores[store] = ""
stores[store] = struct{}{}
}
if err := rc.GetUnsafeRecoveryController().RemoveFailedStores(stores); err != nil {
timeout := uint64(600)
rawTimeout, exists := input["timeout"].(float64)
if exists {
timeout = uint64(rawTimeout)
}

if err := rc.GetUnsafeRecoveryController().RemoveFailedStores(stores, timeout); err != nil {
h.rd.JSON(w, http.StatusInternalServerError, err.Error())
return
}
Expand All @@ -69,19 +75,9 @@ func (h *unsafeOperationHandler) RemoveFailedStores(w http.ResponseWriter, r *ht
// GetFailedStoresRemovalStatus replies with HTTP 200 and the JSON encoding of
// whatever the cluster's unsafe recovery controller returns from Show().
// NOTE(review): the "Success" lines below lack the leading "@" that the other
// annotations carry — confirm whether that is intentional.
// @Tags unsafe
// @Summary Show the current status of failed stores removal.
// @Produce json
// Success 200 {object} []string
// Success 200 {object} []StageOutput
// @Router /admin/unsafe/remove-failed-stores/show [GET]
func (h *unsafeOperationHandler) GetFailedStoresRemovalStatus(w http.ResponseWriter, r *http.Request) {
	// Resolve the controller from the cluster bound to this request first,
	// then render its current status.
	controller := getCluster(r).GetUnsafeRecoveryController()
	h.rd.JSON(w, http.StatusOK, controller.Show())
}

// GetFailedStoresRemovalHistory replies with HTTP 200 and the JSON encoding of
// whatever the cluster's unsafe recovery controller returns from History().
// @Tags unsafe
// @Summary Show the history of failed stores removal.
// @Produce json
// Success 200 {object} []string
// @Router /admin/unsafe/remove-failed-stores/history [GET]
func (h *unsafeOperationHandler) GetFailedStoresRemovalHistory(w http.ResponseWriter, r *http.Request) {
	// Resolve the controller from the cluster bound to this request first,
	// then render its recorded history.
	controller := getCluster(r).GetUnsafeRecoveryController()
	h.rd.JSON(w, http.StatusOK, controller.History())
}
17 changes: 13 additions & 4 deletions server/api/unsafe_operation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ import (
"fmt"

. "github.com/pingcap/check"
"github.com/pingcap/kvproto/pkg/metapb"
tu "github.com/tikv/pd/pkg/testutil"
"github.com/tikv/pd/server"
"github.com/tikv/pd/server/cluster"
)

var _ = Suite(&testUnsafeAPISuite{})
Expand All @@ -39,6 +41,7 @@ func (s *testUnsafeAPISuite) SetUpSuite(c *C) {
s.urlPrefix = fmt.Sprintf("%s%s/api/v1/admin/unsafe", addr, apiPrefix)

mustBootstrapCluster(c, s.svr)
mustPutStore(c, s.svr, 1, metapb.StoreState_Offline, metapb.NodeState_Removing, nil)
}

func (s *testUnsafeAPISuite) TearDownSuite(c *C) {
Expand All @@ -51,20 +54,26 @@ func (s *testUnsafeAPISuite) TestRemoveFailedStores(c *C) {
err := tu.CheckPostJSON(testDialClient, s.urlPrefix+"/remove-failed-stores", data, tu.StatusNotOK(c),
tu.StringEqual(c, "\"[PD:unsaferecovery:ErrUnsafeRecoveryInvalidInput]invalid input no store specified\"\n"))
c.Assert(err, IsNil)

input = map[string]interface{}{"stores": []string{"abc", "def"}}
data, _ = json.Marshal(input)
err = tu.CheckPostJSON(testDialClient, s.urlPrefix+"/remove-failed-stores", data, tu.StatusNotOK(c),
tu.StringEqual(c, "\"Store ids are invalid\"\n"))
c.Assert(err, IsNil)

input = map[string]interface{}{"stores": []uint64{1, 2}}
data, _ = json.Marshal(input)
err = tu.CheckPostJSON(testDialClient, s.urlPrefix+"/remove-failed-stores", data, tu.StatusNotOK(c),
tu.StringEqual(c, "\"[PD:unsaferecovery:ErrUnsafeRecoveryInvalidInput]invalid input store 2 doesn't exist\"\n"))
c.Assert(err, IsNil)

input = map[string]interface{}{"stores": []uint64{1}}
data, _ = json.Marshal(input)
err = tu.CheckPostJSON(testDialClient, s.urlPrefix+"/remove-failed-stores", data, tu.StatusOK(c))
c.Assert(err, IsNil)

// Test show
var output []string
var output []cluster.StageOutput
err = tu.ReadGetJSON(c, testDialClient, s.urlPrefix+"/remove-failed-stores/show", &output)
c.Assert(err, IsNil)
// Test history
err = tu.ReadGetJSON(c, testDialClient, s.urlPrefix+"/remove-failed-stores/history", &output)
c.Assert(err, IsNil)
}
5 changes: 5 additions & 0 deletions server/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,11 @@ func (c *RaftCluster) DropCacheRegion(id uint64) {
c.core.RemoveRegionIfExist(id)
}

// DropCacheAllRegion clears the cluster's region cache by delegating to the
// core's ResetRegionCache.
func (c *RaftCluster) DropCacheAllRegion() {
	c.core.ResetRegionCache()
}

// GetMetaStores gets stores from cluster.
func (c *RaftCluster) GetMetaStores() []*metapb.Store {
return c.core.GetMetaStores()
Expand Down
Loading