client: support add ready for resp and add backoff mechanism #6974

Closed
wants to merge 4 commits into from
2 changes: 1 addition & 1 deletion client/go.mod
@@ -10,6 +10,7 @@ require (
github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00
github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30
github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.11.1
github.com/stretchr/testify v1.8.2
go.uber.org/goleak v1.1.11
@@ -25,7 +26,6 @@ require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.26.0 // indirect
125 changes: 119 additions & 6 deletions client/pd_service_discovery.go
@@ -29,16 +29,19 @@ import (
"github.com/pingcap/log"
"github.com/tikv/pd/client/errs"
"github.com/tikv/pd/client/grpcutil"
"github.com/tikv/pd/client/retry"
"github.com/tikv/pd/client/tlsutil"
"go.uber.org/zap"
"google.golang.org/grpc"
"google.golang.org/grpc/connectivity"
)

const (
globalDCLocation = "global"
memberUpdateInterval = time.Minute
serviceModeUpdateInterval = 3 * time.Second
updateMemberTimeout = time.Second // Use a shorter timeout to recover faster from network isolation.
requestTimeout = 2 * time.Second
)

type serviceType int
@@ -61,7 +64,7 @@ type ServiceDiscovery interface {
GetKeyspaceID() uint32
// GetKeyspaceGroupID returns the ID of the keyspace group
GetKeyspaceGroupID() uint32
// DiscoverServiceURLs discovers the microservice with the specified type and returns the server urls.
// DiscoverMicroservice discovers the microservice with the specified type and returns the server urls.
DiscoverMicroservice(svcType serviceType) ([]string, error)
// GetServiceURLs returns the URLs of the servers providing the service
GetServiceURLs() []string
@@ -95,6 +98,8 @@ type ServiceDiscovery interface {
// in a quorum-based cluster or any primary/secondary in a primary/secondary configured cluster
// is changed.
AddServiceAddrsSwitchedCallback(callbacks ...func())
// GetBackoffer returns the backoffer.
GetBackoffer() *retry.Backoffer
}

type updateKeyspaceIDFunc func() error
@@ -153,6 +158,9 @@ type pdServiceDiscovery struct {
tlsCfg *tlsutil.TLSConfig
// Client option.
option *option

successReConnect chan struct{}
bo *retry.Backoffer
}

// newPDServiceDiscovery returns a new PD service discovery-based client.
@@ -166,6 +174,7 @@ func newPDServiceDiscovery(
) *pdServiceDiscovery {
pdsd := &pdServiceDiscovery{
checkMembershipCh: make(chan struct{}, 1),
successReConnect: make(chan struct{}, 1),
ctx: ctx,
cancel: cancel,
wg: wg,
@@ -174,6 +183,7 @@ func newPDServiceDiscovery(
keyspaceID: keyspaceID,
tlsCfg: tlsCfg,
option: option,
bo: retry.NewBackoffer(ctx, maxRetryTimes),
}
pdsd.urls.Store(urls)
return pdsd
@@ -207,7 +217,7 @@ func (c *pdServiceDiscovery) Init() error {
}

c.wg.Add(2)
go c.updateMemberLoop()
go c.reconnectMemberLoop()
go c.updateServiceModeLoop()

c.isInitialized = true
@@ -231,13 +241,17 @@ func (c *pdServiceDiscovery) initRetry(f func() error) error {
return errors.WithStack(err)
}

func (c *pdServiceDiscovery) updateMemberLoop() {
func (c *pdServiceDiscovery) reconnectMemberLoop() {
defer c.wg.Done()

ctx, cancel := context.WithCancel(c.ctx)
defer cancel()
ticker := time.NewTicker(memberUpdateInterval)
defer ticker.Stop()
failpoint.Inject("acceleratedMemberUpdateInterval", func() {
ticker.Stop()
ticker = time.NewTicker(time.Millisecond * 100)
})

for {
select {
@@ -246,15 +260,98 @@ func (c *pdServiceDiscovery) updateMemberLoop() {
case <-ticker.C:
case <-c.checkMembershipCh:
}

failpoint.Inject("skipUpdateMember", func() {
failpoint.Continue()
})

if err := c.updateMember(); err != nil {
log.Error("[pd] failed to update member", zap.Strings("urls", c.GetServiceURLs()), errs.ZapError(err))
log.Error("[pd] failed to update member", errs.ZapError(err))
} else {
c.SuccessReconnect()
}
}
}

func (c *pdServiceDiscovery) waitForReady() error {
ctx, cancel := context.WithCancel(c.ctx)
defer cancel()

if e1 := c.waitForLeaderReady(); e1 != nil {
log.Error("[pd.waitForReady] failed to wait for leader ready", errs.ZapError(e1))
return errors.WithStack(e1)
} else if e2 := c.loadMembers(); e2 != nil {
log.Error("[pd.waitForReady] failed to load members", errs.ZapError(e2))
} else {
return nil
}

deadline := time.Now().Add(requestTimeout)
failpoint.Inject("acceleratedRequestTimeout", func() {
deadline = time.Now().Add(500 * time.Millisecond)
})
for {
select {
case <-c.successReConnect:
return nil
case <-time.After(time.Until(deadline)):
log.Error("[pd.waitForReady] timeout")
return errors.New("wait for ready timeout")
case <-ctx.Done():
log.Info("[pd.waitForReady] exit")
return nil
}
}
}

// waitForLeaderReady waits for the leader to be ready.
func (c *pdServiceDiscovery) waitForLeaderReady() error {
ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
for {
old, ok := c.clientConns.Load(c.getLeaderAddr())
if !ok {
cancel()
return errors.New("no leader")
}
cc := old.(*grpc.ClientConn)

s := cc.GetState()
if s == connectivity.Ready {
cancel()
return nil
}
if !cc.WaitForStateChange(ctx, s) {
cancel()
// ctx got timeout or canceled.
return ctx.Err()
}
}
}

func (c *pdServiceDiscovery) loadMembers() error {
ctx, cancel := context.WithCancel(c.ctx)
defer cancel()

members, err := c.getMembers(ctx, c.getLeaderAddr(), updateMemberTimeout)
if err != nil {
log.Error("[pd.loadMembers] failed to load members ", zap.String("url", c.getLeaderAddr()), errs.ZapError(err))
return errors.WithStack(err)
} else if members.GetHeader() == nil || members.GetLeader() == nil || len(members.GetLeader().GetClientUrls()) == 0 {
err = errs.ErrClientGetLeader.FastGenByArgs("leader address doesn't exist")
log.Error("[pd.loadMembers] leader address doesn't exist", zap.String("url", c.getLeaderAddr()), errs.ZapError(err))
return errors.WithStack(err)
}

return nil
}

func (c *pdServiceDiscovery) SuccessReconnect() {
select {
case c.successReConnect <- struct{}{}:
default:
}
}

func (c *pdServiceDiscovery) updateServiceModeLoop() {
defer c.wg.Done()
failpoint.Inject("skipUpdateServiceMode", func() {
@@ -319,7 +416,7 @@ func (c *pdServiceDiscovery) GetKeyspaceGroupID() uint32 {
return defaultKeySpaceGroupID
}

// DiscoverServiceURLs discovers the microservice with the specified type and returns the server urls.
// DiscoverMicroservice discovers the microservice with the specified type and returns the server urls.
func (c *pdServiceDiscovery) DiscoverMicroservice(svcType serviceType) (urls []string, err error) {
switch svcType {
case apiService:
@@ -382,11 +479,23 @@ func (c *pdServiceDiscovery) GetBackupAddrs() []string {
func (c *pdServiceDiscovery) ScheduleCheckMemberChanged() {
select {
case c.checkMembershipCh <- struct{}{}:
if err := c.waitForReady(); err != nil {
// If backoff times count is greater than 10, reset it.
if c.bo.GetBackoffTimeCnt(retry.BoMemberUpdate.String()) >= 10 {
c.bo.Reset()
}
e := c.bo.Backoff(retry.BoMemberUpdate, err)
if e != nil {
log.Error("[pd] wait for ready backoff failed", errs.ZapError(e))
return
}
log.Error("[pd] wait for ready failed", errs.ZapError(err))
}
default:
}
}

// Immediately check if there is any membership change among the leader/followers in a
// CheckMemberChanged immediately checks if there is any membership change among the leader/followers in a
// quorum-based cluster or among the primary/secondaries in a primary/secondary configured cluster.
func (c *pdServiceDiscovery) CheckMemberChanged() error {
return c.updateMember()
@@ -669,3 +778,7 @@ func (c *pdServiceDiscovery) switchTSOAllocatorLeaders(allocatorMap map[string]*
func (c *pdServiceDiscovery) GetOrCreateGRPCConn(addr string) (*grpc.ClientConn, error) {
return grpcutil.GetOrCreateGRPCConn(c.ctx, &c.clientConns, addr, c.tlsCfg, c.option.gRPCDialOptions...)
}

func (c *pdServiceDiscovery) GetBackoffer() *retry.Backoffer {
return c.bo
}