-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix snapshot 500 error #2408
Fix snapshot 500 error #2408
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ import ( | |
"encoding/binary" | ||
"errors" | ||
"fmt" | ||
"os" | ||
"sync" | ||
"time" | ||
|
||
|
@@ -20,33 +21,79 @@ type ShardGroup struct { | |
} | ||
|
||
// newShardGroup returns a new initialized ShardGroup instance. | ||
func newShardGroup() *ShardGroup { return &ShardGroup{} } | ||
func newShardGroup(t time.Time, d time.Duration) *ShardGroup { | ||
sg := ShardGroup{} | ||
sg.StartTime = t.Truncate(d).UTC() | ||
sg.EndTime = sg.StartTime.Add(d).UTC() | ||
|
||
// close closes all shards. | ||
func (g *ShardGroup) close() { | ||
for _, sh := range g.Shards { | ||
_ = sh.close() | ||
return &sg | ||
} | ||
|
||
func (sg *ShardGroup) initialize(index uint64, shardN, replicaN int, db *database, rp *RetentionPolicy, nodes []*DataNode, meta *metastore) error { | ||
sg.Shards = make([]*Shard, shardN) | ||
|
||
// Persist to metastore if a shard was created. | ||
return meta.mustUpdate(index, func(tx *metatx) error { | ||
// Generate an ID for the group. | ||
sg.ID = tx.nextShardGroupID() | ||
|
||
// Generate an ID for each shard. | ||
for i := range sg.Shards { | ||
sg.Shards[i] = newShard(tx.nextShardID()) | ||
} | ||
|
||
// Assign data nodes to shards via round robin. | ||
// Start from a repeatably "random" place in the node list. | ||
nodeIndex := int(index % uint64(len(nodes))) | ||
for _, sh := range sg.Shards { | ||
for i := 0; i < replicaN; i++ { | ||
node := nodes[nodeIndex%len(nodes)] | ||
sh.DataNodeIDs = append(sh.DataNodeIDs, node.ID) | ||
nodeIndex++ | ||
} | ||
} | ||
|
||
// Retention policy has a new shard group, so update the policy. | ||
rp.shardGroups = append(rp.shardGroups, sg) | ||
|
||
return tx.saveDatabase(db) | ||
}) | ||
} | ||
|
||
func (sg *ShardGroup) close(id uint64) error { | ||
for _, shard := range sg.Shards { | ||
// Ignore shards not on this server. | ||
if !shard.HasDataNodeID(id) { | ||
continue | ||
} | ||
|
||
path := shard.store.Path() | ||
shard.close() | ||
if err := os.Remove(path); err != nil { | ||
return fmt.Errorf("shard id %d, path %s : %s", shard.ID, path, err) | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
// ShardBySeriesID returns the shard that a series is assigned to in the group. | ||
func (g *ShardGroup) ShardBySeriesID(seriesID uint64) *Shard { | ||
return g.Shards[int(seriesID)%len(g.Shards)] | ||
func (sg *ShardGroup) ShardBySeriesID(seriesID uint64) *Shard { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the letter to typically be the first letter of the variable I'm referencing. If the variable is made of two words, I usually use the abbreviation. Idiomatic might be a strong word for this, and wasn't really the reason I did it. The real reason is that sometimes I have shards, and like to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
return sg.Shards[int(seriesID)%len(sg.Shards)] | ||
} | ||
|
||
// Duration returns the duration between the shard group's start and end time. | ||
func (g *ShardGroup) Duration() time.Duration { return g.EndTime.Sub(g.StartTime) } | ||
func (sg *ShardGroup) Duration() time.Duration { return sg.EndTime.Sub(sg.StartTime) } | ||
|
||
// Contains return whether the shard group contains data for the time between min and max | ||
func (g *ShardGroup) Contains(min, max time.Time) bool { | ||
return timeBetweenInclusive(g.StartTime, min, max) || | ||
timeBetweenInclusive(g.EndTime, min, max) || | ||
(g.StartTime.Before(min) && g.EndTime.After(max)) | ||
func (sg *ShardGroup) Contains(min, max time.Time) bool { | ||
return timeBetweenInclusive(sg.StartTime, min, max) || | ||
timeBetweenInclusive(sg.EndTime, min, max) || | ||
(sg.StartTime.Before(min) && sg.EndTime.After(max)) | ||
} | ||
|
||
// dropSeries will delete all data with the seriesID | ||
func (g *ShardGroup) dropSeries(seriesIDs ...uint64) error { | ||
for _, s := range g.Shards { | ||
func (sg *ShardGroup) dropSeries(seriesIDs ...uint64) error { | ||
for _, s := range sg.Shards { | ||
err := s.dropSeries(seriesIDs...) | ||
if err != nil { | ||
return err | ||
|
@@ -79,7 +126,14 @@ type Shard struct { | |
} | ||
|
||
// newShard returns a new initialized Shard instance. | ||
func newShard() *Shard { return &Shard{} } | ||
func newShard(id uint64) *Shard { | ||
s := &Shard{} | ||
s.ID = id | ||
s.stats = NewStats(fmt.Sprintf("shard %d", id)) | ||
s.stats.Inc("initialize") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there much point to this stat? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you trying to track shard state via the stats? Definitely a valid thing to do, but I guess might think about other ways to do it, long-term. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This stat is a complete hack for debugging right now. If this does indeed fix the problem, we should remove it. It did already help as it showed me that a shard had been created, opened, and closed, and they tried to read from a snapshot. So, if this fixes the bug, then it has outlived it's purpose (which it served well for). |
||
|
||
return s | ||
} | ||
|
||
// open initializes and opens the shard's store. | ||
func (s *Shard) open(path string, conn MessagingConn) error { | ||
|
@@ -180,9 +234,7 @@ func (s *Shard) close() error { | |
if s.store != nil { | ||
_ = s.store.Close() | ||
} | ||
if s.stats != nil { | ||
s.stats.Inc("close") | ||
} | ||
s.stats.Inc("close") | ||
return nil | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was causing snapshotting to break.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This "lookup" map has caused me problems before. We should think about seeing if there is a way to solve the problems it is solving in another way...later, of course.