forked from googleapis/google-cloud-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrange_splitter.go
374 lines (318 loc) · 13.8 KB
/
range_splitter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dataflux
import (
"fmt"
"math/big"
"sort"
"sync"
)
// rangeSplitter specifies the a list and a map of sorted alphabets.
type rangeSplitter struct {
mu sync.Mutex
sortedAlphabet []rune
alphabetMap map[rune]int
}
// listRange specifies the start and end range for the range splitter.
type listRange struct {
startRange string
endRange string
}
// minimalIntRange specifies start and end range in base-10 form, along with the
// minimal string length for the split range strings.
type minimalIntRange struct {
startInteger *big.Int
endInteger *big.Int
minimalLength int
}
// generateSplitsOpts specifies the parameters needed to generate the split
// range strings.
type generateSplitsOpts struct {
minimalIntRange *minimalIntRange
numSplits int
startRange string
endRange string
}
// newRangeSplitter creates a new RangeSplitter with the given alphabets.
// RangeSplitter determines split points within a given range based on the given
// alphabets. Note that the alphabets are a set of characters guaranteed to be unique.
func newRangeSplitter(alphabet string) (*rangeSplitter, error) {
// Validate that we do not have empty alphabet passed in.
if len(alphabet) == 0 {
return nil, fmt.Errorf("no alphabet specified for the range splitter")
}
// Sort the alphabet lexicographically and store a mapping of each alphabet
// to its index. We need a mapping for efficient index lookup in later operations.
sortedAlphabet := []rune(alphabet)
sortAlphabet(sortedAlphabet)
alphabetMap := constructAlphabetMap(sortedAlphabet)
return &rangeSplitter{
alphabetMap: alphabetMap,
sortedAlphabet: sortedAlphabet,
}, nil
}
// splitRange divides the provided start and end range into approximately equal
// subranges, returning the split points. An empty slice is returned if suitable
// splits cannot be determined. Please note that this method provides a rough
// estimate of split points, without ensuring precise even partitioning of the range.
// Additionally, the number of found splits might be fewer than requested if the
// algorithm struggles to find sufficient split points. If the start range is empty
// the algorithm assumes it to be sequence of smallest possible character and empty
// end range as sequence of highest possible characters.
//
// For example, sorted alphabet = {"a","b","c","d"}
// Input: startRange= "d", endRange= "", numSplits=2
//
// This will be converted from base-N to base-10 integers.
// While calculating base-10 integer, "a" will be appended to startRange
// and "d" will be appended to endRange until the difference between integers is
// more than number of splits.
// startInteger for "da" = 12, endInteger for "dd" = 15
//
// The splits points will be 13 and 14 in base-10. This will be converted back to
// base-N value and returned as split points:
// {"db","dc"}
func (rs *rangeSplitter) splitRange(startRange, endRange string, numSplits int) ([]string, error) {
// Number of splits has to be at least one, otherwise it is not splittable.
if numSplits < 1 {
return nil, fmt.Errorf("number of splits should be at least 1, got %d", numSplits)
}
// End range (if specified) has to be lexicographically greater than the start range
// for the range to be valid.
if len(endRange) != 0 && startRange >= endRange {
return nil, fmt.Errorf("start range %q cannot be lexicographically greater than end range %q", startRange, endRange)
}
rs.addCharsToAlphabet([]rune(startRange))
rs.addCharsToAlphabet([]rune(endRange))
// Validate start range characters and convert into character array form.
startRangeCharArray, err := rs.convertRangeStringToArray(startRange)
if err != nil {
return nil, fmt.Errorf("unable to convert start range %q to array: %v", startRange, err)
}
// Validate end range characters and convert into character array form.
endRangeCharArray, err := rs.convertRangeStringToArray(endRange)
if err != nil {
return nil, fmt.Errorf("unable to convert end range %q to array: %v", endRange, err)
}
// Construct the final split ranges to be returned.
var splitPoints []string
// If the start and end string ranges are equal with padding, no splitting is
// necessary. In such cases, an empty array of split ranges is returned.
if rs.isRangeEqualWithPadding(startRangeCharArray, endRangeCharArray) {
return splitPoints, nil
}
// Convert the range strings from base-N to base-10 and employ a greedy approach
// to determine the smallest splittable integer range difference.
minimalIntRange, err := rs.convertStringRangeToMinimalIntRange(
startRangeCharArray, endRangeCharArray, numSplits)
if err != nil {
return nil, fmt.Errorf("range splitting with start range %q and end range %q: %v",
startRange, endRange, err)
}
// Generate the split points and return them.
splitPoints = rs.generateSplits(generateSplitsOpts{
startRange: startRange,
endRange: endRange,
numSplits: numSplits,
minimalIntRange: minimalIntRange,
})
return splitPoints, nil
}
// generateSplits generates the split points by translating the start and end
// range strings into base-10 integers, performing a split within the integer
// domain, and then converting the splits back into strings. In essence, this
// operation resembles a base-N to base-10 conversion, followed by a split in
// base 10, and finally another base-10 to base-N conversion. In this scenario,
// N represents the size of the alphabet, with the character's position in the
// alphabet indicating the digit's value.
func (rs *rangeSplitter) generateSplits(opts generateSplitsOpts) []string {
startInteger := opts.minimalIntRange.startInteger
endInteger := opts.minimalIntRange.endInteger
minimalLength := opts.minimalIntRange.minimalLength
rangeDifference := new(big.Int).Sub(endInteger, startInteger)
var splitPoints []string
// The number of intervals is one more than the number of split points.
rangeInterval := new(big.Int).SetInt64(int64(opts.numSplits + 1))
for i := 1; i <= opts.numSplits; i++ {
// Combine the range interval and index to determine the split point in base-10 form.
rangeDiffWithIdx := new(big.Int).Mul(rangeDifference, big.NewInt(int64(i)))
rangeInterval := new(big.Int).Div(rangeDiffWithIdx, rangeInterval)
splitPoint := new(big.Int).Add(rangeInterval, startInteger)
// Convert the split point back from base-10 to base-N.
splitString := rs.convertIntToString(splitPoint, minimalLength)
// Due to the approximate nature on how the minimal int range is derived, we need to perform
// another validation to check to ensure each split point falls in valid range.
isGreaterThanStart := len(splitString) > 0 && splitString > opts.startRange
isLessThanEnd := len(opts.endRange) == 0 || (len(splitString) > 0 && splitString < opts.endRange)
if isGreaterThanStart && isLessThanEnd {
splitPoints = append(splitPoints, splitString)
}
}
return splitPoints
}
// sortAlphabet sorts the alphabets string lexicographically.
func sortAlphabet(unsortedAlphabet []rune) {
sort.Slice(unsortedAlphabet, func(i, j int) bool {
return unsortedAlphabet[i] < unsortedAlphabet[j]
})
}
// constructAlphabetMap constructs a mapping from each character in the
// alphabets to its index in the alphabet array.
func constructAlphabetMap(alphabet []rune) map[rune]int {
alphabetMap := make(map[rune]int)
for i, char := range alphabet {
alphabetMap[char] = i
}
return alphabetMap
}
// addCharsToAlphabet adds the given chars to the known alphabet. Repeated chars are ignored
// as alphabet contains unique chars.
func (rs *rangeSplitter) addCharsToAlphabet(characters []rune) {
rs.mu.Lock() // Acquire the lock
defer rs.mu.Unlock() // Release the lock when the function exits
allAlphabet := rs.sortedAlphabet
newChars := false
for _, char := range characters {
if _, exists := rs.alphabetMap[char]; !exists {
allAlphabet = append(allAlphabet, char)
newChars = true
// Update the alphabet map so the new char is not repeated.
rs.alphabetMap[char] = 0
}
}
if newChars {
sortAlphabet(allAlphabet)
rs.sortedAlphabet = allAlphabet
rs.alphabetMap = constructAlphabetMap(rs.sortedAlphabet)
}
}
// isRangeEqualWithPadding checks if two range strings are identical. Equality
// encompasses any padding using the smallest alphabet character from the set.
func (rs *rangeSplitter) isRangeEqualWithPadding(startRange, endRange []rune) bool {
sortedAlphabet := rs.sortedAlphabet
// When the end range is unspecified, it's interpreted as a sequence of the
// highest possible characters. Consequently, they are not deemed equal.
// If start range has highest possible characters, then smaller characters
// are appended to start range to find split points.
if len(endRange) == 0 {
return false
}
// Get the longer length of the two range strings.
maxLength := max(len(startRange), len(endRange))
smallestChar := sortedAlphabet[0]
// Loop through the string range.
for i := 0; i < maxLength; i++ {
// In cases where a character is absent at a specific position (due to a length
// difference), the position is padded with the smallest character in the alphabet.
charStart := charAtOrDefault(startRange, i, smallestChar)
charEnd := charAtOrDefault(endRange, i, smallestChar)
// As soon as we find a difference, we conclude the two strings are different.
if charStart != charEnd {
return false
}
}
// Otherwise, we conclude the two strings are equal.
return true
}
// charAtOrDefault returns the character at the specified position, or the default
// character if the position is out of bounds.
func charAtOrDefault(charArray []rune, position int, defaultChar rune) rune {
if position < 0 || position >= len(charArray) {
return defaultChar
}
return (charArray)[position]
}
// convertStringRangeToMinimalIntRange gradually extends the start and end string
// range in base-10 representation, until the difference reaches a threshold
// suitable for splitting.
func (rs *rangeSplitter) convertStringRangeToMinimalIntRange(
startRange, endRange []rune, numSplits int) (*minimalIntRange, error) {
startInteger := big.NewInt(0)
endInteger := big.NewInt(0)
alphabetLength := len(rs.sortedAlphabet)
startChar := (rs.sortedAlphabet)[0]
endChar := (rs.sortedAlphabet)[alphabetLength-1]
endDefaultChar := startChar
if len(endRange) == 0 {
endDefaultChar = endChar
}
for i := 0; ; i++ {
// Convert each character of the start range string into a big integer
// based on the alphabet system.
startPosition, err := rs.charPosition(charAtOrDefault(startRange, i, startChar))
if err != nil {
return nil, err
}
startInteger.Mul(startInteger, big.NewInt(int64(alphabetLength)))
startInteger.Add(startInteger, big.NewInt(int64(startPosition)))
// Convert each character of the end range string into a big integer
// based on the alphabet system.
endPosition, err := rs.charPosition(charAtOrDefault(endRange, i, endDefaultChar))
if err != nil {
return nil, err
}
endInteger.Mul(endInteger, big.NewInt(int64(alphabetLength)))
endInteger.Add(endInteger, big.NewInt(int64(endPosition)))
// Calculate the difference between the start and end range in big integer representation.
difference := new(big.Int).Sub(endInteger, startInteger)
// If the difference is bigger than the number of split points, we are done.
// In particular, the minimal length is one greater than the index (due to zero indexing).
if difference.Cmp(big.NewInt(int64(numSplits))) > 0 {
return &minimalIntRange{
startInteger: startInteger,
endInteger: endInteger,
minimalLength: i + 1,
}, nil
}
}
}
// charPosition returns the index of the character in the alphabet set.
func (rs *rangeSplitter) charPosition(ch rune) (int, error) {
rs.mu.Lock() // Acquire the lock
defer rs.mu.Unlock() // Release the lock when the function exits
if idx, ok := rs.alphabetMap[ch]; ok {
return idx, nil
}
return -1, fmt.Errorf("character %c is not found in the alphabet map %v", ch, rs.alphabetMap)
}
// convertRangeStringToArray transforms the range string into a rune slice while
// verifying the presence of each character in the alphabets.
func (rs *rangeSplitter) convertRangeStringToArray(rangeString string) ([]rune, error) {
rs.mu.Lock() // Acquire the lock
defer rs.mu.Unlock() // Release the lock when the function exits
for _, char := range rangeString {
if _, exists := rs.alphabetMap[char]; !exists {
return nil, fmt.Errorf("character %c in range string %q is not found in the alphabet array", char, rangeString)
}
}
characterArray := []rune(rangeString)
return characterArray, nil
}
// convertIntToString converts the split point from base-10 to base-N.
func (rs *rangeSplitter) convertIntToString(splitPoint *big.Int, stringLength int) string {
remainder := new(big.Int)
var splitChar []rune
alphabetSize := big.NewInt(int64(len(rs.sortedAlphabet)))
// Iterate through the split point and convert alphabet by alphabet.
for i := 0; i < stringLength; i++ {
remainder.Mod(splitPoint, alphabetSize)
splitPoint.Div(splitPoint, alphabetSize)
splitChar = append(splitChar, (rs.sortedAlphabet)[(int)(remainder.Int64())])
}
// Reverse the converted alphabet order because we originally processed from right to left.
for i, j := 0, len(splitChar)-1; i < j; i, j = i+1, j-1 {
splitChar[i], splitChar[j] = splitChar[j], splitChar[i]
}
return string(splitChar)
}