Skip to content
This repository has been archived by the owner on Feb 22, 2024. It is now read-only.

Commit

Permalink
Add fullnode monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
agouin committed Jul 21, 2022
1 parent d54e6fe commit 8f0d977
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 91 deletions.
1 change: 1 addition & 0 deletions cmd/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ type Sentry struct {
type ValidatorMonitor struct {
Name string `yaml:"name"`
RPC string `yaml:"rpc"`
FullNode bool `yaml:"fullnode"`
Address string `yaml:"address"`
ChainID string `yaml:"chain-id"`
DiscordStatusMessageID *string `yaml:"discord-status-message-id"`
Expand Down
66 changes: 40 additions & 26 deletions cmd/discord.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,18 @@ func getColorForAlertLevel(alertLevel AlertLevel) int {

func getCurrentStatsEmbed(stats ValidatorStats, vm *ValidatorMonitor) discord.Embed {
var uptime string
if stats.SlashingPeriodUptime == 0 {
uptime = "N/A"
var title string
if vm.FullNode {
title = vm.Name
} else {
uptime = fmt.Sprintf("%.02f", stats.SlashingPeriodUptime)
}
if stats.SlashingPeriodUptime == 0 {
uptime = "N/A"
} else {
uptime = fmt.Sprintf("%.02f", stats.SlashingPeriodUptime)
}

title := fmt.Sprintf("%s (%s%% up)", vm.Name, uptime)
title = fmt.Sprintf("%s (%s%% up)", vm.Name, uptime)
}

var description string
sentryString := ""
Expand Down Expand Up @@ -115,32 +120,37 @@ func getCurrentStatsEmbed(stats ValidatorStats, vm *ValidatorMonitor) discord.Em
rpcStatusIcon = iconError
} else {
rpcStatusIcon = iconGood
var recentSignedBlocksIcon string
if stats.RecentMissedBlockAlertLevel >= alertLevelHigh {
recentSignedBlocksIcon = iconError
} else if stats.RecentMissedBlockAlertLevel == alertLevelWarning {
recentSignedBlocksIcon = iconWarning
} else {
recentSignedBlocksIcon = iconGood
if !vm.FullNode {
var recentSignedBlocksIcon string
if stats.RecentMissedBlockAlertLevel >= alertLevelHigh {
recentSignedBlocksIcon = iconError
} else if stats.RecentMissedBlockAlertLevel == alertLevelWarning {
recentSignedBlocksIcon = iconWarning
} else {
recentSignedBlocksIcon = iconGood
}
recentSignedBlocks = fmt.Sprintf("%s Latest Blocks Signed: **%d/%d**", recentSignedBlocksIcon, recentBlocksToCheck-stats.RecentMissedBlocks, recentBlocksToCheck)
}
recentSignedBlocks = fmt.Sprintf("%s Latest Blocks Signed: **%d/%d**", recentSignedBlocksIcon, recentBlocksToCheck-stats.RecentMissedBlocks, recentBlocksToCheck)

}
latestBlock = fmt.Sprintf("%s Height **%s** - **%s**", rpcStatusIcon, fmt.Sprint(stats.Height), formattedTime(stats.Timestamp))
}

if stats.Height == stats.LastSignedBlockHeight {
description = fmt.Sprintf("%s\n%s%s",
latestBlock, recentSignedBlocks, sentryString)
if vm.FullNode {
description = fmt.Sprintf("%s%s", latestBlock, sentryString)
} else {
var lastSignedBlock string
if stats.LastSignedBlockHeight == -1 {
lastSignedBlock = fmt.Sprintf("%s Last Signed **N/A**", iconError)
if stats.Height == stats.LastSignedBlockHeight {
description = fmt.Sprintf("%s\n%s%s",
latestBlock, recentSignedBlocks, sentryString)
} else {
lastSignedBlock = fmt.Sprintf("%s Last Signed **%s** - **%s**", iconError, fmt.Sprint(stats.LastSignedBlockHeight), formattedTime(stats.LastSignedBlockTimestamp))
var lastSignedBlock string
if stats.LastSignedBlockHeight == -1 {
lastSignedBlock = fmt.Sprintf("%s Last Signed **N/A**", iconError)
} else {
lastSignedBlock = fmt.Sprintf("%s Last Signed **%s** - **%s**", iconError, fmt.Sprint(stats.LastSignedBlockHeight), formattedTime(stats.LastSignedBlockTimestamp))
}
description = fmt.Sprintf("%s\n%s\n%s%s",
latestBlock, lastSignedBlock, recentSignedBlocks, sentryString)
}
description = fmt.Sprintf("%s\n%s\n%s%s",
latestBlock, lastSignedBlock, recentSignedBlocks, sentryString)
}

color := getColorForAlertLevel(stats.AlertLevel)
Expand Down Expand Up @@ -213,10 +223,14 @@ func (service *DiscordNotificationService) SendValidatorAlertNotification(
}

var embedTitle string
if stats.SlashingPeriodUptime > 0 {
embedTitle = fmt.Sprintf("%s (%.02f%% up)", vm.Name, stats.SlashingPeriodUptime)
if vm.FullNode {
embedTitle = vm.Name
} else {
embedTitle = fmt.Sprintf("%s (N/A%% up)", vm.Name)
if stats.SlashingPeriodUptime > 0 {
embedTitle = fmt.Sprintf("%s (%.02f%% up)", vm.Name, stats.SlashingPeriodUptime)
} else {
embedTitle = fmt.Sprintf("%s (N/A%% up)", vm.Name)
}
}

if len(alertNotification.Alerts) > 0 {
Expand Down
137 changes: 72 additions & 65 deletions cmd/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,32 @@ func monitorValidator(
errs = append(errs, newGenericRPCError(err.Error()))
return
}
_, hexAddress, err := bech32.DecodeAndConvert(vm.Address)
if err != nil {
errs = append(errs, newIgnorableError(err))
return
}

valInfo, err := getSigningInfo(client, vm.Address)
slashingPeriod := int64(10000)
if err != nil {
errs = append(errs, newGenericRPCError(err.Error()))
} else {
signingInfo := valInfo.ValSigningInfo
if signingInfo.Tombstoned {
errs = append(errs, newTombstonedError())
}
if signingInfo.JailedUntil.After(time.Now()) {
errs = append(errs, newJailedError(signingInfo.JailedUntil))
var hexAddress []byte
if !vm.FullNode {
_, hexAddress, err = bech32.DecodeAndConvert(vm.Address)
if err != nil {
errs = append(errs, newIgnorableError(err))
return
}
slashingInfo, err := getSlashingInfo(client)
valInfo, err := getSigningInfo(client, vm.Address)
if err != nil {
errs = append(errs, newGenericRPCError(err.Error()))
} else {
slashingPeriod = slashingInfo.Params.SignedBlocksWindow
stats.SlashingPeriodUptime = 100.0 - 100.0*(float64(signingInfo.MissedBlocksCounter)/float64(slashingPeriod))
signingInfo := valInfo.ValSigningInfo
if signingInfo.Tombstoned {
errs = append(errs, newTombstonedError())
}
if signingInfo.JailedUntil.After(time.Now()) {
errs = append(errs, newJailedError(signingInfo.JailedUntil))
}
slashingInfo, err := getSlashingInfo(client)
if err != nil {
errs = append(errs, newGenericRPCError(err.Error()))
} else {
slashingPeriod = slashingInfo.Params.SignedBlocksWindow
stats.SlashingPeriodUptime = 100.0 - 100.0*(float64(signingInfo.MissedBlocksCounter)/float64(slashingPeriod))
}
}
}
node, err := client.GetNode()
Expand All @@ -76,35 +78,37 @@ func monitorValidator(
stats.Height = status.SyncInfo.LatestBlockHeight
stats.Timestamp = status.SyncInfo.LatestBlockTime
stats.RecentMissedBlocks = 0
for i := stats.Height; i > stats.Height-recentBlocksToCheck && i > 0; i-- {
blockCtx, blockCtxCancel := context.WithTimeout(context.Background(), time.Duration(time.Second*RPCTimeoutSeconds))
block, err := node.Block(blockCtx, &i)
blockCtxCancel()
if err != nil {
// generic RPC error for this one so it will be included in the generic RPC error retry
errs = append(errs, newGenericRPCError(newBlockFetchError(i, vm.RPC).Error()))
continue
}
if i == 1 {
break
}
found := false
for _, voter := range block.Block.LastCommit.Signatures {
if reflect.DeepEqual(voter.ValidatorAddress, bytes.HexBytes(hexAddress)) {
if block.Block.Height > stats.LastSignedBlockHeight {
stats.LastSignedBlockHeight = block.Block.Height
stats.LastSignedBlockTimestamp = block.Block.Time
}
found = true
if !vm.FullNode {
for i := stats.Height; i > stats.Height-recentBlocksToCheck && i > 0; i-- {
blockCtx, blockCtxCancel := context.WithTimeout(context.Background(), time.Duration(time.Second*RPCTimeoutSeconds))
block, err := node.Block(blockCtx, &i)
blockCtxCancel()
if err != nil {
// generic RPC error for this one so it will be included in the generic RPC error retry
errs = append(errs, newGenericRPCError(newBlockFetchError(i, vm.RPC).Error()))
continue
}
if i == 1 {
break
}
}
if !found {
stats.RecentMissedBlocks++
found := false
for _, voter := range block.Block.LastCommit.Signatures {
if reflect.DeepEqual(voter.ValidatorAddress, bytes.HexBytes(hexAddress)) {
if block.Block.Height > stats.LastSignedBlockHeight {
stats.LastSignedBlockHeight = block.Block.Height
stats.LastSignedBlockTimestamp = block.Block.Time
}
found = true
break
}
}
if !found {
stats.RecentMissedBlocks++
}
}
}

if stats.RecentMissedBlocks > 0 {
if !vm.FullNode && stats.RecentMissedBlocks > 0 {
errs = append(errs, newMissedRecentBlocksError(stats.RecentMissedBlocks))
// Go back to find last signed block
if stats.LastSignedBlockHeight == -1 {
Expand Down Expand Up @@ -209,6 +213,7 @@ func runMonitor(
var sentryErrs []error

wg := sync.WaitGroup{}
fmt.Printf("Monitoring validator\n")
wg.Add(1)
go func() {
var rpcRetries int
Expand Down Expand Up @@ -271,7 +276,7 @@ func runMonitor(
errs = append(errs, sentryErrs...)
}

aggregatedErrs := stats.determineAggregatedErrorsAndAlertLevel()
aggregatedErrs := stats.determineAggregatedErrorsAndAlertLevel(vm.FullNode)
if len(aggregatedErrs) > 0 {
errs = append(errs, aggregatedErrs...)
}
Expand All @@ -297,7 +302,7 @@ func (stats *ValidatorStats) increaseAlertLevel(alertLevel AlertLevel) {
}

// determine alert level and any additional errors now that RPC and sentry checks are complete
func (stats *ValidatorStats) determineAggregatedErrorsAndAlertLevel() (errs []error) {
func (stats *ValidatorStats) determineAggregatedErrorsAndAlertLevel(fullnode bool) (errs []error) {
sentryErrorCount := 0
for _, sentryStat := range stats.SentryStats {
if sentryStat.SentryAlertType != sentryAlertTypeGRPCError {
Expand All @@ -323,36 +328,38 @@ func (stats *ValidatorStats) determineAggregatedErrorsAndAlertLevel() (errs []er
stats.increaseAlertLevel(alertLevelHigh)
}

if stats.Height == stats.LastSignedBlockHeight {
if stats.RecentMissedBlocks == 0 {
if stats.SlashingPeriodUptime > slashingPeriodUptimeWarningThreshold {
// no recent missed blocks and above warning threshold for slashing period uptime, all good
return
if !fullnode {
if stats.Height == stats.LastSignedBlockHeight {
if stats.RecentMissedBlocks == 0 {
if stats.SlashingPeriodUptime > slashingPeriodUptimeWarningThreshold {
// no recent missed blocks and above warning threshold for slashing period uptime, all good
return
} else {
// Warning for recovering from downtime. Not error because we are currently signing
stats.increaseAlertLevel(alertLevelWarning)
return
}
} else {
// Warning for recovering from downtime. Not error because we are currently signing
// Warning for missing recent blocks, but have signed current block
stats.increaseAlertLevel(alertLevelWarning)
return
}
} else {
// Warning for missing recent blocks, but have signed current block
stats.increaseAlertLevel(alertLevelWarning)
return
}
}

// past this, we have not signed the most recent block
// past this, we have not signed the most recent block

if stats.RecentMissedBlocks < recentBlocksToCheck {
// we have missed some, but not all, of the recent blocks to check
if stats.SlashingPeriodUptime > slashingPeriodUptimeErrorThreshold {
stats.increaseAlertLevel(alertLevelWarning)
if stats.RecentMissedBlocks < recentBlocksToCheck {
// we have missed some, but not all, of the recent blocks to check
if stats.SlashingPeriodUptime > slashingPeriodUptimeErrorThreshold {
stats.increaseAlertLevel(alertLevelWarning)
} else {
// we are below slashing period uptime error threshold
stats.increaseAlertLevel(alertLevelHigh)
}
} else {
// we are below slashing period uptime error threshold
// Error, missed all of the recent blocks to check
stats.increaseAlertLevel(alertLevelHigh)
}
} else {
// Error, missed all of the recent blocks to check
stats.increaseAlertLevel(alertLevelHigh)
}
return
}
Expand Down

0 comments on commit 8f0d977

Please sign in to comment.