
Commit 3005f5a

feat(storage/dataflux): run worksteal listing parallel to sequential listing (#10966)
1 parent ebf3657 commit 3005f5a

12 files changed (+692 -3 shown here as +692 / -193 lines overall)
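The core change in fast_list.go below is a first-finisher-wins race between the two listing strategies. As a standalone, hedged sketch of that pattern (simplified names; the real NextBatch in this commit also records which method won and counts failures), assuming only the errgroup/context usage visible in the diff:

// Sketch of the racing pattern used by NextBatch in this commit: run two
// strategies under one errgroup, let the winner cancel the shared context,
// and treat context.Canceled from the loser as expected rather than fatal.
package main

import (
    "context"
    "errors"
    "fmt"

    "golang.org/x/sync/errgroup"
)

func raceListers(ctx context.Context, fast, slow func(context.Context) error) error {
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()
    g, ctx := errgroup.WithContext(ctx)

    run := func(f func(context.Context) error) func() error {
        return func() error {
            if err := f(ctx); err != nil {
                return err
            }
            cancel() // the winner stops the other strategy
            return nil
        }
    }
    g.Go(run(fast))
    g.Go(run(slow))

    // The loser is expected to fail with context.Canceled once the winner
    // cancels the context; only other errors are real failures.
    if err := g.Wait(); err != nil && !errors.Is(err, context.Canceled) {
        return fmt.Errorf("listing failed: %w", err)
    }
    return nil
}

func main() {
    fmt.Println(raceListers(context.Background(),
        func(ctx context.Context) error { return nil },                      // pretend worksteal finishes first
        func(ctx context.Context) error { <-ctx.Done(); return ctx.Err() }, // sequential gets canceled
    ))
}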

storage/dataflux/example_test.go

+2 -3

@@ -23,7 +23,7 @@ import (
 	"google.golang.org/api/iterator"
 )
 
-func ExampleNextBatch_batch() {
+func ExampleLister() {
 	ctx := context.Background()
 	// Pass in any client opts or set retry policy here.
 	client, err := storage.NewClient(ctx)
@@ -42,8 +42,7 @@ func ExampleNextBatch_batch() {
 		SkipDirectoryObjects: false,
 	}
 
-	// Create Lister with desired options, including number of workers,
-	// part size, per operation timeout, etc.
+	// Create Lister with fast-list input.
 	df := dataflux.NewLister(client, in)
 	defer df.Close()

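For context, the example that this hunk edits drains the Lister with NextBatch until iterator.Done. A minimal sketch of that usage, based only on the API surface visible in this commit (bucket name, batch size, and error handling are illustrative):

package main

import (
    "context"
    "log"

    "cloud.google.com/go/storage"
    "cloud.google.com/go/storage/dataflux"
    "google.golang.org/api/iterator"
)

func main() {
    ctx := context.Background()
    client, err := storage.NewClient(ctx)
    if err != nil {
        log.Fatal(err)
    }
    defer client.Close()

    in := &dataflux.ListerInput{
        BucketName:           "my-bucket", // hypothetical bucket
        BatchSize:            5000,        // minimum objects per NextBatch call
        SkipDirectoryObjects: false,
    }

    // Create Lister with fast-list input.
    df := dataflux.NewLister(client, in)
    defer df.Close()

    for {
        objects, err := df.NextBatch(ctx)
        if err != nil && err != iterator.Done {
            log.Fatal(err)
        }
        // The final batch is returned together with iterator.Done.
        log.Printf("listed %d objects", len(objects))
        if err == iterator.Done {
            break
        }
    }
}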
storage/dataflux/fast_list.go

+155 -82
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"runtime"
 	"strings"
+	"sync"
 
 	"cloud.google.com/go/storage"
 	"golang.org/x/sync/errgroup"
@@ -47,9 +48,11 @@ type ListerInput struct {
 	// Default value is 10x number of available CPU. Optional.
 	Parallelism int
 
-	// BatchSize is the number of objects to list. Default value returns
-	// all objects at once. The number of objects returned will be
-	// rounded up to a multiple of gcs page size. Optional.
+	// BatchSize is the minimum number of objects to list in each batch.
+	// The number of objects returned in a batch will be rounded up to
+	// include all the objects received in the last request to GCS.
+	// By default, the Lister returns all objects in one batch.
+	// Optional.
 	BatchSize int
 
 	// Query is the query to filter objects for listing. Default value is nil.
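To make the rounding rule above concrete, a hypothetical configuration (bucket name and size invented for illustration) that asks for batches of at least 10,000 objects; the batch that NextBatch returns may be somewhat larger, because it is padded out to the end of the last GCS list response rather than cut off mid-page:

in := &dataflux.ListerInput{
    BucketName: "my-bucket", // hypothetical bucket
    // Each NextBatch call returns at least 10,000 objects; the exact count is
    // rounded up to include everything from the last GCS list response.
    BatchSize: 10000,
}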
@@ -58,10 +61,40 @@ type ListerInput struct {
 	Query storage.Query
 
 	// SkipDirectoryObjects is to indicate whether to list directory objects.
-	// Default value is false. Optional.
+	// Note: Even if directory objects are excluded, they contribute to the
+	// [ListerInput.BatchSize] count. Default value is false. Optional.
 	SkipDirectoryObjects bool
 }
 
+// NewLister creates a new [Lister] that can be used to list objects in the given bucket.
+func NewLister(c *storage.Client, in *ListerInput) *Lister {
+	bucket := c.Bucket(in.BucketName)
+
+	// If parallelism is not given, set the default value to 10x the number of
+	// available CPUs.
+	if in.Parallelism == 0 {
+		in.Parallelism = runtime.NumCPU() * 10
+	}
+	// Initialize the range channel with the entire namespace of objects for the
+	// given prefix, start offset and end offset. By default the entire namespace
+	// is listed, so start and end will be empty.
+	rangeChannel := make(chan *listRange, in.Parallelism*2)
+	start, end := prefixAdjustedOffsets(in.Query.StartOffset, in.Query.EndOffset, in.Query.Prefix)
+	rangeChannel <- &listRange{startRange: start, endRange: end}
+
+	lister := &Lister{
+		method:               open,
+		parallelism:          in.Parallelism,
+		pageToken:            "",
+		bucket:               bucket,
+		batchSize:            in.BatchSize,
+		query:                in.Query,
+		skipDirectoryObjects: in.SkipDirectoryObjects,
+		ranges:               rangeChannel,
+	}
+	return lister
+}
+
 // Lister is used for interacting with Dataflux fast-listing. The caller should
 // initialize it with NewLister() instead of creating it directly.
 type Lister struct {
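The ranges channel seeded above is what drives the worksteal workers. The listRange type is defined elsewhere in the package; a small runnable sketch of the shape this hunk relies on (only the two fields that appear in the composite literal above, everything else is an assumption):

package main

import "fmt"

// listRange mirrors the two fields NewLister uses above; hypothetical
// reconstruction, the real type lives elsewhere in the dataflux package.
type listRange struct {
    startRange string // "" = beginning of the (prefix-adjusted) namespace
    endRange   string // "" = end of the namespace
}

func main() {
    parallelism := 4 // stand-in for in.Parallelism
    // Buffered channel seeded with one range covering the whole namespace,
    // as NewLister does; presumably workers later split this range and push
    // sub-ranges back (the worksteal part, implemented elsewhere).
    ranges := make(chan *listRange, parallelism*2)
    ranges <- &listRange{startRange: "", endRange: ""}
    fmt.Println(len(ranges), cap(ranges)) // 1 8
}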
@@ -92,116 +125,156 @@ type Lister struct {
 	skipDirectoryObjects bool
 }
 
-// NewLister creates a new dataflux Lister to list objects in the give bucket.
-func NewLister(c *storage.Client, in *ListerInput) *Lister {
-	bucket := c.Bucket(in.BucketName)
-
-	// If parallelism is not given, set default value to 10x the number of
-	// available CPU.
-	if in.Parallelism == 0 {
-		in.Parallelism = runtime.NumCPU() * 10
-	}
-	// Initialize range channel with entire namespace of object for given
-	// prefix, startoffset and endoffset. For the default range to list is
-	// entire namespace, start and end will be empty.
-	rangeChannel := make(chan *listRange, in.Parallelism*2)
-	start, end := updateStartEndOffset(in.Query.StartOffset, in.Query.EndOffset, in.Query.Prefix)
-	rangeChannel <- &listRange{startRange: start, endRange: end}
-
-	lister := &Lister{
-		method:               open,
-		parallelism:          in.Parallelism,
-		pageToken:            "",
-		bucket:               bucket,
-		batchSize:            in.BatchSize,
-		query:                in.Query,
-		skipDirectoryObjects: in.SkipDirectoryObjects,
-		ranges:               rangeChannel,
-	}
-	return lister
-}
-
-// NextBatch runs worksteal algorithm and sequential listing in parallel to quickly
-// return a list of objects in the bucket. For smaller dataset,
-// sequential listing is expected to be faster. For larger dataset,
+// NextBatch returns the next N objects in the bucket, where N is [ListerInput.BatchSize].
+// In case of failure, all processes are stopped and an error is returned immediately.
+// Create a new Lister to retry.
+// For the first batch, both worksteal listing and sequential listing run in parallel
+// to quickly list N objects in the bucket. For subsequent batches, only the method
+// that returned objects faster in the first batch is used.
+// For smaller datasets, sequential listing is expected to be faster. For larger datasets,
 // worksteal listing is expected to be faster.
+//
+// The worksteal algorithm lists objects in the GCS bucket using multiple parallel
+// workers; each worker in the list operation is able to steal work from its siblings
+// once it has finished all currently slated listing work.
 func (c *Lister) NextBatch(ctx context.Context) ([]*storage.ObjectAttrs, error) {
-	// countError tracks the number of failed listing methods.
-	countError := 0
-	var results []*storage.ObjectAttrs
-	ctx, cancel := context.WithCancel(ctx)
-	defer cancel()
-	// Errgroup takes care of running both methods in parallel. As soon as one of
-	// the method is complete, the running method also stops.
-	g, childCtx := errgroup.WithContext(ctx)
-
-	// To start listing method is Open and runs both worksteal and sequential listing
-	// in parallel. The method which completes first is used for all subsequent runs.
 
-	// TODO: Run worksteal listing when method is Open or WorkSteal.
+	var results []*storage.ObjectAttrs
 
-	// Run sequential listing when method is Open or Sequential.
-	if c.method != worksteal {
+	// For the first batch, the listing method is open and both worksteal and sequential
+	// listing run in parallel. The method that completes first is used for all subsequent
+	// NextBatch calls.
+	switch c.method {
+	case worksteal:
+		// Run the worksteal algorithm for listing.
+		objects, err := c.workstealListing(ctx)
+		if err != nil {
+			return nil, fmt.Errorf("worksteal listing: %w", err)
+		}
+		results = objects
+	case sequential:
+		// Run GCS sequential listing.
+		objects, token, err := c.sequentialListing(ctx)
+		if err != nil {
+			return nil, fmt.Errorf("sequential listing: %w", err)
+		}
+		results = objects
+		c.pageToken = token
+		c.ranges = nil
+	case open:
+		// countErr tracks the number of failed listing methods.
+		countErr := &countErr{counter: 0}
+
+		ctx, cancel := context.WithCancel(ctx)
+		defer cancel()
+		// The errgroup takes care of running both methods in parallel. As soon as one
+		// method completes, the other method also stops.
+		g, ctx := errgroup.WithContext(ctx)
+		wsCompletedfirst := false
+		seqCompletedfirst := false
+		var wsObjects []*storage.ObjectAttrs
+		var seqObjects []*storage.ObjectAttrs
+		var nextToken string
+		g.Go(func() error {
+			objects, err := c.workstealListing(ctx)
+			if err != nil {
+				countErr.increment()
+				return fmt.Errorf("worksteal listing: %w", err)
+			}
+			// Cancel the context when worksteal listing is complete.
+			cancel()
+			wsCompletedfirst = true
+			wsObjects = objects
 
+			return nil
+		})
 		g.Go(func() error {
-			objects, nextToken, err := c.sequentialListing(childCtx)
+			objects, token, err := c.sequentialListing(ctx)
 			if err != nil {
-				countError++
-				return fmt.Errorf("error in running sequential listing: %w", err)
+				countErr.increment()
+				return fmt.Errorf("sequential listing: %w", err)
 			}
-			// If sequential listing completes first, set method to sequential listing
-			// and ranges to nil. The nextToken will be used to continue sequential listing.
-			results = objects
-			c.pageToken = nextToken
-			c.method = sequential
 			// Close context when sequential listing is complete.
 			cancel()
+			seqCompletedfirst = true
+			seqObjects = objects
+			nextToken = token
+
 			return nil
 		})
-	}
-
-	// Close all functions if either sequential listing or worksteal listing is complete.
-	err := g.Wait()
-
-	// If the error is not context.Canceled, then return error instead of falling back
-	// to the other method. This is so that the error can be fixed and user can take
-	// advantage of fast-listing.
-	// As one of the listing method completes, it is expected to cancel context for the
-	// only then return error. other method. If both sequential and worksteal listing
-	// fail due to context canceled, return error.
-	if err != nil && (!errors.Is(err, context.Canceled) || countError > 1) {
-		return nil, fmt.Errorf("failed waiting for sequntial and work steal lister : %w", err)
+		// Close all functions if either sequential listing or worksteal listing is complete.
+		err := g.Wait()
+
+		// If the error is not context.Canceled, then return the error instead of falling back
+		// to the other method. This is so that the error can be fixed and the user can take
+		// advantage of fast-listing.
+		// As one of the listing methods completes, it is expected to cancel the context and
+		// cause a context canceled error in the other method. Since context canceled is
+		// expected, it is not treated as an error. If both sequential and worksteal listing
+		// fail due to context canceled, then return an error.
+		if err != nil && (!errors.Is(err, context.Canceled) || countErr.counter > 1) {
+			return nil, fmt.Errorf("dataflux: %w", err)
+		}
+		if wsCompletedfirst {
+			// If worksteal listing completes first, set method to worksteal listing and nextToken to "".
+			// The c.ranges channel will be used to continue worksteal listing.
+			results = wsObjects
+			c.pageToken = ""
+			c.method = worksteal
+		} else if seqCompletedfirst {
+			// If sequential listing completes first, set method to sequential listing
+			// and ranges to nil. The nextToken will be used to continue sequential listing.
+			results = seqObjects
+			c.pageToken = nextToken
+			c.method = sequential
+			c.ranges = nil
+		}
 	}
 
 	// If ranges for worksteal and pageToken for sequential listing is empty, then
 	// listing is complete.
-	if c.pageToken == "" {
+	if c.pageToken == "" && len(c.ranges) == 0 {
 		return results, iterator.Done
 	}
 	return results, nil
 }
 
-// Close closes the range channel of the Lister.
+// Close is used to close the Lister.
 func (c *Lister) Close() {
 	if c.ranges != nil {
 		close(c.ranges)
 	}
 }
 
-// updateStartEndOffset updates start and end offset based on prefix.
-// If a prefix is given, adjust start and end value such that it lists
-// objects with the given prefix. updateStartEndOffset assumes prefix will
-// be added to the object name while listing objects in worksteal algorithm.
+type countErr struct {
+	mu      sync.Mutex
+	counter int
+}
+
+func (cc *countErr) increment() {
+	cc.mu.Lock()
+	defer cc.mu.Unlock()
+	cc.counter++
+}
+
+// prefixAdjustedOffsets returns a start and end offset adjusted from the given offsets
+// based on the prefix, stripping the prefix. These offsets can be used by adding back
+// the prefix, so that the original offsets do not need to be checked.
+//
+// This means that if the given offsets are out of range of the prefix
+// (for example, offsets {start: "a", end: "b"} with prefix "c", which is lexicographically
+// outside of "a" to "b"), the returned offsets will ensure no strings fall in their range.
+//
+// Otherwise, if an offset is too permissive given the prefix, an empty string is returned
+// to indicate there is no offset and all objects starting from or ending at the prefix
+// should be listed.
 //
 // For example:
 // start = "abc", end = "prefix_a", prefix = "prefix",
 //
-// end will change to "_a", prefix will be added in worksteal algorithm.
-// "abc" is lexicographically smaller than "prefix". So start will be the first
-// object with the given prefix.
-//
-// Therefore start will change to ""(empty string) and end to "_a" .
-func updateStartEndOffset(start, end, prefix string) (string, string) {
+// "abc" is lexicographically smaller than "prefix", so the first object with the
+// given prefix should be listed; therefore the start offset becomes "" (empty string).
+// The end offset becomes "_a" because the prefix is stripped.
+// The new offsets are therefore {start: "", end: "_a"}.
+func prefixAdjustedOffsets(start, end, prefix string) (string, string) {
 	if prefix == "" {
 		return start, end
 	}
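The hunk is truncated here, so the body of prefixAdjustedOffsets is not shown. A rough, hypothetical sketch that matches the behavior described in the doc comment above (an illustration of the documented contract, not the package's actual implementation; edge cases such as an end offset exactly equal to the prefix are glossed over):

package main

import (
    "fmt"
    "strings"
)

// prefixAdjustedOffsetsSketch mirrors the documented behavior: offsets are
// rewritten relative to the prefix so that the prefix can be added back later.
// Hypothetical helper, not the dataflux implementation.
func prefixAdjustedOffsetsSketch(start, end, prefix string) (string, string) {
    if prefix == "" {
        return start, end
    }
    // Start offset: anything at or before the prefix admits every object under
    // the prefix, so it can be dropped; an offset inside the prefix range keeps
    // only its suffix; an offset past the prefix admits nothing.
    switch {
    case start <= prefix:
        start = ""
    case strings.HasPrefix(start, prefix):
        start = strings.TrimPrefix(start, prefix)
    default:
        return "a", "a" // empty range: no string is >= "a" and < "a"
    }
    // End offset: symmetric reasoning.
    switch {
    case end == "":
        // no end offset given, keep it unbounded
    case strings.HasPrefix(end, prefix):
        end = strings.TrimPrefix(end, prefix)
    case end > prefix:
        end = "" // every object under the prefix sorts below the end offset
    default:
        return "a", "a" // end sorts before the prefix: empty range
    }
    return start, end
}

func main() {
    // The example from the doc comment: {"abc", "prefix_a"} with prefix "prefix".
    fmt.Println(prefixAdjustedOffsetsSketch("abc", "prefix_a", "prefix")) // "" "_a"
    // Offsets lexicographically outside the prefix yield an empty range.
    fmt.Println(prefixAdjustedOffsetsSketch("a", "b", "c")) // "a" "a"
}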
