-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfilter.zig
651 lines (576 loc) · 29.2 KB
/
filter.zig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
const fastfilter = @import("fastfilter");
const std = @import("std");
const Allocator = std.mem.Allocator;
const testing = std.testing;
/// Comptime options for a sinter filter. These are comptime parameters to `Filter(...)`,
/// so filters built with different options are distinct, incompatible types.
const Options = struct {
    /// The binary fuse filter bit size. Either 8, 16, or 32. A higher bit size like 16 could be
    /// useful if false positive matches have a very high penalty for your use case (bitsize of 8
    /// gives 4% false positives, 16 gives 0.0015%) and you're willing to pay more memory / indexing
    /// time. See https://github.com/hexops/fastfilter#benchmarks
    filter_bit_size: u16 = 8,

    /// The number of divisions in the mid layer.
    mid_layer_divisions: usize = 8,
};
/// A sinter filter. They are designed to represent many matching keys (integers, e.g. hashes of
/// words in a text file) which map to a smaller number of results (for example, the text files
/// themselves.) The filter is composed of a 3-layer tree of binary fuse filters, and optimized for
/// query time. Binary fuse filters are a much faster more modern variant of bloom filters by Daniel
/// Lemire and Thomas Mueller Graf, see https://github.com/hexops/fastfilter for details.
///
/// Querying results from a sinter filter is as easy as providing matching keys. Key equality is
/// exact only, you can acquire "fuzzy" matching of results by e.g. emitting a single key for every
/// string you might want to match. To query, you provide a set of keys that should logically AND/OR
/// intersect with results' keys. Due to statistical nature of fastfilters, it is possible to get
/// false-positive matches but never false-negatives.
///
/// A sinter filter is designed to be built, indexed, and queried within a single CPU core. You
/// should look at the indexing time, memory usage, query time, and based on those numbers decide
/// how much data on average to aim to pack into a single sinter filter so that you saturate about
/// half of a CPU core reasonably. A FilterGroup can then be used to operate on multiple sinter
/// filters in parallel across CPU cores. Multiple FilterGroups are typically distributed across
/// physical machines when desirable.
///
/// `zig run-benchmark-filter` shows how efficient sinter filters can be. On an original M1 Macbook
/// Pro, utilizing a single CPU:
///
/// ```
/// | # keys | # results | # keys per result | index time | OR-200 query time | AND-200 query time | writeFile time | readFile time |
/// |-----------|-----------|-------------------|------------|-------------------|--------------------|----------------|---------------|
/// | 100000000 | 200 | 500000 | 22.4s | 2825.0ns | 415755.0ns | 91.0ms | 667.1ms |
/// ```
///
/// That can be read as:
///
/// * We put 200 results into the filter, each with 500,000 unique matching keys.
/// * The filter contains 100,000,000 unique matching keys total.
/// * We can query "which results contain one of these 200 keys? (OR)" in 2825ns
/// * We can query "which results contain ALL 200 of these keys? (AND)" in 0.41ms (415755.0ns)
/// * Serialization and deserialization (including writing to disk) takes under a few hundred ms, 324M on disk file size.
///
/// How sinter filters are structured
///
/// The filter is represented in three layers (all perf measured on Ryzen 9 3900X w/ 100 million
/// keys):
///
/// - outer layer: the topmost fastfilter which is capable of determining if a given key is present
/// in any result within the entire filter. e.g. if a word is present in any of the 200 files
/// (assuming 200 files is about 100,000,000 words/keys.)
/// - Indexing: 2 GiB / 6.9s
/// - Filter size: 107 MiB
/// - Query speed: 167ns
/// - mid layer: A configurable number of fastfilters, which divide the outer layer into N sets
/// (typically 8.) e.g. while the outer layer says "this word is in one of these 200 files", the
/// mid layer says "it's in one of these 25 files"
/// - Indexing: 225 MiB / 572.3ms (per set)
/// - Filter size: 10 MiB (per set)
/// - Query speed: 33ns (per set)
/// - inner layer: the lowest level fastfilter which represents whether or not a given key is
/// present in a final result. e.g. the inner layer says "this word is in this file".
/// - Indexing: <22 MiB / <44.6ms
/// - Filter size: <1 MiB
/// - Query speed: <24ns
///
/// For example, assuming you have 200 files with 100,000,000 words/keys total, then performance
/// could be estimated on a single sinter filter (single CPU core) to be:
///
/// - Indexing peak mem: ~2 GiB
/// - Indexing time: 20.4s (6.9s outer, 4.6s mid, 8.9s inner)
/// - Query (best case): 224ns (167ns outer, 33ns mid, 24ns inner)
/// - Query (worst case): 1031ns (167ns outer, 33ns*8 mid, 24ns*25 inner)
///
pub fn Filter(comptime options: Options, comptime Result: type, comptime Iterator: type) type {
    return struct {
        /// The original estimated number of keys in this filter, as passed to init().
        /// Used by insert() to balance keys across mid layer divisions.
        total_keys_estimate: usize,

        /// Total number of keys within this filter (sum over all inserted results).
        keys: usize = 0,

        /// Filter over every key in the whole structure. null until .index() is invoked.
        outer_layer: ?BinaryFuseFilter = null,

        /// The mid layer divisions; each holds a subset of the inserted results.
        mid_layer: [options.mid_layer_divisions]MidLayer,

        /// One division of the mid layer: a filter covering all keys of its inner layers.
        pub const MidLayer = struct {
            /// null until .index() is invoked.
            filter: ?BinaryFuseFilter = null,

            /// Total number of keys within this layer.
            keys: usize = 0,

            /// The results (and their key iterators / filters) assigned to this division.
            inner_layers: std.MultiArrayList(InnerLayer),
        };

        /// One result and the filter over its keys.
        pub const InnerLayer = struct {
            /// Total number of keys within the inner layer.
            keys: usize = 0,

            /// null until .index() is invoked.
            filter: ?BinaryFuseFilter = null,

            /// Iterator over this result's keys; supplied by insert(), consumed by
            /// .index(). Restored as null by deserialize() (see deserialize docs).
            keys_iter: ?Iterator = null,

            /// The result value returned by queries when this layer matches.
            result: Result,
        };

        /// The unsigned fingerprint integer type, e.g. u8 when filter_bit_size == 8.
        pub const FilterType = @Type(.{ .Int = .{ .signedness = .unsigned, .bits = options.filter_bit_size } });
        pub const BinaryFuseFilter = fastfilter.BinaryFuse(FilterType);

        const Self = @This();
/// Initializes the filter with an approximate number of keys that the filter overall is
/// expected to contain (e.g. 100_000_000) and estimated number of keys per result (e.g. 500_000)
/// which will be used to balance mid layer divisions and keep them at generally equal amounts
/// of keys.
pub fn init(total_keys_estimate: usize) Self {
var mid_layer: [options.mid_layer_divisions]MidLayer = undefined;
comptime var division = 0;
inline while (division < mid_layer.len) : (division += 1) {
mid_layer[division] = .{
.inner_layers = std.MultiArrayList(InnerLayer){},
};
}
return Self{
.total_keys_estimate = total_keys_estimate,
.outer_layer = null,
.mid_layer = mid_layer,
};
}
/// Inserts the given result, computing a fastfilter to represent the result using the given
/// keys iterator.
///
/// For example, if using text files + trigrams, result could be the file name and keys
/// would be an iterator for hashes of the files trigrams. See fastfilter.SliceIterator
///
/// The iterator must remain alive at least until .index() is called.
pub fn insert(filter: *Self, allocator: Allocator, keys_iter: Iterator, result: Result) !void {
const keys_len = keys_iter.len();
const inner_layer = InnerLayer{
.keys = keys_len,
.keys_iter = keys_iter,
.result = result,
};
// Determine which division of mid_layer this inner_layer should be inserted into.
// If we don't find a division with free space below, we'll place it into an evenly
// distributed division based on number of keys.
var target_division: usize = keys_len % options.mid_layer_divisions;
const target_keys_per_division = filter.total_keys_estimate / options.mid_layer_divisions;
for (filter.mid_layer) |division, division_index| {
if (division.keys + inner_layer.keys >= target_keys_per_division) continue;
// Found a division we can place it into.
target_division = division_index;
break;
}
filter.keys += inner_layer.keys;
filter.mid_layer[target_division].keys += inner_layer.keys;
try filter.mid_layer[target_division].inner_layers.append(allocator, inner_layer);
}
/// Iterates every key in a filter.
const AllKeysIter = struct {
filter: *Self,
mid_layer_index: usize = 0,
inner_layer_index: usize = 0,
iter: ?Iterator = null,
pub inline fn next(iter: *@This()) ?u64 {
if (iter.iter == null) {
if (iter.filter.mid_layer[iter.mid_layer_index].inner_layers.len == 0) return null;
iter.iter = iter.filter.mid_layer[iter.mid_layer_index].inner_layers.get(0).keys_iter.?;
}
var final = iter.iter.?.next();
while (final == null) {
if (iter.inner_layer_index + 1 == iter.filter.mid_layer[iter.mid_layer_index].inner_layers.len) {
if (iter.mid_layer_index + 1 == iter.filter.mid_layer.len) {
iter.mid_layer_index = 0;
iter.inner_layer_index = 0;
iter.iter = null;
return null; // no further inner layers
}
// Next inner layer.
iter.mid_layer_index += 1;
iter.inner_layer_index = 0;
if (iter.filter.mid_layer[iter.mid_layer_index].inner_layers.len == 0) {
iter.mid_layer_index = 0;
iter.inner_layer_index = 0;
iter.iter = null;
return null;
}
iter.iter = iter.filter.mid_layer[iter.mid_layer_index].inner_layers.get(iter.inner_layer_index).keys_iter.?;
final = iter.iter.?.next();
} else {
iter.inner_layer_index += 1;
iter.iter = iter.filter.mid_layer[iter.mid_layer_index].inner_layers.get(iter.inner_layer_index).keys_iter.?;
final = iter.iter.?.next();
}
}
return final;
}
pub inline fn len(iter: @This()) usize {
return iter.filter.keys;
}
};
        /// Iterates every key in a single mid layer division, walking each of its inner
        /// layers' key iterators in order. Once exhausted, it resets its own state so it
        /// can be iterated again.
        const MidLayerIterator = struct {
            filter: *Self,
            /// Which division to iterate; fixed for the iterator's lifetime.
            mid_layer_index: usize,
            inner_layer_index: usize = 0,
            /// The current inner layer's key iterator; null before the first call and after reset.
            iter: ?Iterator = null,

            pub fn next(iter: *@This()) ?u64 {
                if (iter.iter == null) {
                    // An empty division yields no keys at all.
                    if (iter.filter.mid_layer[iter.mid_layer_index].inner_layers.len == 0) return null;
                    iter.iter = iter.filter.mid_layer[iter.mid_layer_index].inner_layers.get(0).keys_iter.?;
                }
                var final = iter.iter.?.next();
                // The current inner layer may be exhausted; advance through inner layers
                // until a key is produced or the division runs out.
                while (final == null) {
                    if (iter.inner_layer_index + 1 == iter.filter.mid_layer[iter.mid_layer_index].inner_layers.len) {
                        // Division exhausted: reset state so the iterator is reusable.
                        iter.inner_layer_index = 0;
                        iter.iter = null;
                        return null;
                    } else {
                        iter.inner_layer_index += 1;
                        iter.iter = iter.filter.mid_layer[iter.mid_layer_index].inner_layers.get(iter.inner_layer_index).keys_iter.?;
                        final = iter.iter.?.next();
                    }
                }
                return final;
            }

            /// Total number of keys within this division (as accumulated by insert()).
            pub inline fn len(iter: @This()) usize {
                return iter.filter.mid_layer[iter.mid_layer_index].keys;
            }
        };
/// Indexes the filter, populating all of the fastfilters using the key iterators of the
/// results. Must be performed once finished inserting results. Can be called again to
/// update the filter (although this performs a full rebuild.)
pub fn index(filter: *Self, allocator: Allocator) !void {
// Populate outer layer with all keys.
var all_keys_iter = AllKeysIter{ .filter = filter };
filter.outer_layer = try BinaryFuseFilter.init(allocator, filter.keys);
// try filter.outer_layer.?.populateIter(allocator, &all_keys_iter);
try populateIterUnique(allocator, &filter.outer_layer.?, &all_keys_iter);
// Populate each mid layer filter, with their division of keys.
for (filter.mid_layer) |*mid_layer, mid_layer_index| {
var mid_layer_iter = MidLayerIterator{ .filter = filter, .mid_layer_index = mid_layer_index };
mid_layer.filter = try BinaryFuseFilter.init(allocator, mid_layer.keys);
// try mid_layer.filter.?.populateIter(allocator, &mid_layer_iter);
try populateIterUnique(allocator, &mid_layer.filter.?, &mid_layer_iter);
}
// Populate each inner_layer filter.
for (filter.mid_layer) |*mid_layer| {
var i: usize = 0;
while (i < mid_layer.inner_layers.len) : (i += 1) {
var inner_layer = mid_layer.inner_layers.get(i);
inner_layer.filter = try BinaryFuseFilter.init(allocator, inner_layer.keys);
// try inner_layer.filter.?.populateIter(allocator, inner_layer.keys_iter.?);
try populateIterUnique(allocator, &inner_layer.filter.?, inner_layer.keys_iter.?);
mid_layer.inner_layers.set(i, inner_layer);
}
}
}
pub fn deinit(filter: *Self, allocator: Allocator) void {
if (filter.outer_layer) |*outer_layer| outer_layer.deinit(allocator);
for (filter.mid_layer) |*inner| {
if (inner.filter) |*inner_filter| inner_filter.deinit(allocator);
for (inner.inner_layers.items(.filter)) |inner_layer_data| {
if (inner_layer_data) |*inner_layer_filter| inner_layer_filter.deinit(allocator);
}
inner.inner_layers.deinit(allocator);
}
}
/// reports if the specified key is likely contained by the filter (within the set
/// false-positive rate.)
pub inline fn contains(filter: *const Self, key: u64) bool {
return filter.outer_layer.?.contain(key);
}
/// Queries for results from the filter, returning results whose keys likely match one of
/// the keys in `or_keys`.
///
/// Returns the number of results found.
pub inline fn queryLogicalOr(filter: *Self, allocator: Allocator, or_keys: []const u64, comptime ResultsDst: type, dst: ?ResultsDst) !usize {
var any = blk: {
for (or_keys) |key| {
if (filter.outer_layer.?.contain(key)) {
break :blk true;
}
}
break :blk false;
};
if (!any) return 0;
var results: usize = 0;
for (filter.mid_layer) |*inner| {
var mid_layer = inner.filter.?;
any = blk: {
for (or_keys) |key| {
if (mid_layer.contain(key)) {
break :blk true;
}
}
break :blk false;
};
if (!any) continue;
for (inner.inner_layers.items(.filter)) |inner_layer_filter, i| {
any = blk: {
for (or_keys) |key| {
if (inner_layer_filter.?.contain(key)) {
break :blk true;
}
}
break :blk false;
};
if (!any) continue;
results += 1;
if (dst) |d| try d.append(allocator, inner.inner_layers.get(i).result);
}
}
return results;
}
/// Queries for results from the filter, returning results whose keys likely match all of
/// the keys in `and_keys`.
///
/// Returns the number of results found.
pub inline fn queryLogicalAnd(filter: *Self, allocator: Allocator, and_keys: []const u64, comptime ResultsDst: type, dst: ?ResultsDst) !usize {
var all = blk: {
for (and_keys) |key| {
if (!filter.outer_layer.?.contain(key)) {
break :blk false;
}
}
break :blk true;
};
if (!all) return 0;
var results: usize = 0;
for (filter.mid_layer) |*inner| {
var mid_layer = inner.filter.?;
all = blk: {
for (and_keys) |key| {
if (!mid_layer.contain(key)) {
break :blk false;
}
}
break :blk true;
};
if (!all) continue;
for (inner.inner_layers.items(.filter)) |inner_layer_filter, i| {
all = blk: {
for (and_keys) |key| {
if (!inner_layer_filter.?.contain(key)) {
break :blk false;
}
}
break :blk true;
};
if (!all) continue;
results += 1;
if (dst) |d| try d.append(allocator, inner.inner_layers.get(i).result);
}
}
return results;
}
pub fn sizeInBytes(filter: *const Self) usize {
var size: usize = @sizeOf(Self);
if (filter.outer_layer) |outer_filter| size += outer_filter.sizeInBytes();
for (filter.mid_layer) |*inner| {
if (inner.filter) |inner_filter| size += inner_filter.sizeInBytes();
for (inner.inner_layers.items(.filter)) |inner_layer_filter| {
if (inner_layer_filter) |f| size += f.sizeInBytes();
size += @sizeOf(InnerLayer);
}
}
return size;
}
        /// Serializes the filter and writes it to `dest_path` within `dir`, buffered through
        /// a std.io.BufferedAtomicFile (the destination is only replaced once finish()
        /// succeeds).
        pub fn writeFile(
            filter: *const Self,
            allocator: Allocator,
            dir: std.fs.Dir,
            dest_path: []const u8,
        ) !void {
            const baf = try std.io.BufferedAtomicFile.create(allocator, dir, dest_path, .{});
            // destroy() also cleans up if finish() is never reached (e.g. serialize fails).
            defer baf.destroy();
            try filter.serialize(baf.writer());
            try baf.finish();
        }
        /// Serializes the filter to the stream in a little-endian binary format:
        ///
        /// - u16 format version (currently 1)
        /// - u64 total_keys_estimate
        /// - u16 filter_bit_size, u64 mid_layer_divisions (checked against `options` on read)
        /// - u64 total key count, then the outer layer filter (see serializeFilter)
        /// - per mid layer division: u64 key count, its filter, u32 inner layer count, then
        ///   per inner layer: u64 key count, its filter, and its result (u64, or
        ///   u32-length-prefixed bytes for []const u8).
        ///
        /// Inner layer key iterators are NOT serialized: a deserialized filter can be
        /// queried but not re-indexed.
        pub fn serialize(filter: *const Self, stream: anytype) !void {
            // Constants
            const version = 1;
            try stream.writeIntLittle(u16, version);
            try stream.writeIntLittle(u64, filter.total_keys_estimate);
            try stream.writeIntLittle(u16, options.filter_bit_size);
            try stream.writeIntLittle(u64, options.mid_layer_divisions);
            // Outer layer
            try stream.writeIntLittle(u64, filter.keys);
            try serializeFilter(stream, &filter.outer_layer.?);
            for (filter.mid_layer) |*mid_layer| {
                // Mid layer
                try stream.writeIntLittle(u64, mid_layer.keys);
                try serializeFilter(stream, &mid_layer.filter.?);
                try stream.writeIntLittle(u32, @intCast(u32, mid_layer.inner_layers.len));
                var i: usize = 0;
                while (i < mid_layer.inner_layers.len) : (i += 1) {
                    // Inner layer
                    var inner_layer = mid_layer.inner_layers.get(i);
                    try stream.writeIntLittle(u64, inner_layer.keys);
                    try serializeFilter(stream, &inner_layer.filter.?);
                    // TODO: generic result serialization
                    if (Result == u64) {
                        try stream.writeIntLittle(u64, inner_layer.result);
                    } else if (Result == []const u8) {
                        // u32-length-prefixed raw bytes.
                        try stream.writeIntLittle(u32, @intCast(u32, inner_layer.result.len));
                        try stream.writeAll(inner_layer.result);
                    } else unreachable;
                }
            }
        }
        /// Serializes a single binary fuse filter: u64 seed, four u32 segment fields, a u32
        /// fingerprint count, then the fingerprint array as raw bytes (FilterType-wide).
        fn serializeFilter(stream: anytype, filter: *const BinaryFuseFilter) !void {
            try stream.writeIntLittle(u64, filter.seed);
            try stream.writeIntLittle(u32, filter.segment_length);
            try stream.writeIntLittle(u32, filter.segment_length_mask);
            try stream.writeIntLittle(u32, filter.segment_count);
            try stream.writeIntLittle(u32, filter.segment_count_length);
            try stream.writeIntLittle(u32, @intCast(u32, filter.fingerprints.len));
            // NOTE(review): fingerprints are dumped in native byte order (the integer
            // fields above are explicitly little-endian) — assumes reader and writer share
            // endianness; confirm if files may cross architectures.
            const F = std.meta.Elem(@TypeOf(filter.fingerprints));
            const fingerprint_bytes: []const u8 = filter.fingerprints.ptr[0 .. filter.fingerprints.len * @sizeOf(F)];
            try stream.writeAll(fingerprint_bytes);
        }
pub fn readFile(
allocator: Allocator,
file_path: []const u8,
) !Self {
var file = try std.fs.cwd().openFile(file_path, .{ .mode = .read_only });
defer file.close();
var buf_stream = std.io.bufferedReader(file.reader());
return try deserialize(allocator, buf_stream.reader());
}
        /// Deserializes a filter written by serialize(). The comptime `options` of this
        /// Filter type must match those the stream was written with (asserted below).
        ///
        /// Inner layer key iterators are restored as null: the returned filter can be
        /// queried but calling .index() on it would hit a null keys_iter.
        ///
        /// For Result == []const u8, result strings are allocated with `allocator` and
        /// owned by the caller; deinit() does not free them.
        pub fn deserialize(allocator: Allocator, stream: anytype) !Self {
            // TODO: if reads here fail, filter allocations would leak.
            // Constants
            const version = try stream.readIntLittle(u16);
            std.debug.assert(version == 1);
            const total_keys_estimate = try stream.readIntLittle(u64);
            const filter_bit_size = try stream.readIntLittle(u16);
            const mid_layer_divisions = try stream.readIntLittle(u64);
            // Comptime configuration must match the serialized filter exactly.
            std.debug.assert(mid_layer_divisions == options.mid_layer_divisions);
            std.debug.assert(filter_bit_size == options.filter_bit_size);
            // Outer layer
            const keys = try stream.readIntLittle(u64);
            const outer_layer = try deserializeFilter(allocator, stream);
            var mid_layer: [options.mid_layer_divisions]MidLayer = undefined;
            var division: usize = 0;
            while (division < options.mid_layer_divisions) : (division += 1) {
                // Mid layer
                const mid_layer_keys = try stream.readIntLittle(u64);
                const mid_layer_filter = try deserializeFilter(allocator, stream);
                const inner_layers_len = try stream.readIntLittle(u32);
                var inner_layers = std.MultiArrayList(InnerLayer){};
                try inner_layers.resize(allocator, inner_layers_len);
                var i: usize = 0;
                while (i < inner_layers.len) : (i += 1) {
                    // Inner Layer
                    var inner_layer_keys = try stream.readIntLittle(u64);
                    var inner_layer_filter = try deserializeFilter(allocator, stream);
                    // TODO: generic result deserialization
                    var result = if (Result == u64) blk: {
                        break :blk try stream.readIntLittle(u64);
                    } else if (Result == []const u8) blk: {
                        // u32-length-prefixed raw bytes; must check for short reads since
                        // readAll only errors on stream failure, not EOF.
                        const data_len = try stream.readIntLittle(u32);
                        const data = try allocator.alloc(u8, data_len);
                        const read_bytes = try stream.readAll(data);
                        if (read_bytes < data.len) {
                            allocator.free(data);
                            return error.EndOfStream;
                        }
                        break :blk data;
                    } else unreachable;
                    inner_layers.set(i, InnerLayer{
                        .keys = inner_layer_keys,
                        .filter = inner_layer_filter,
                        // Iterators are not serialized; see serialize().
                        .keys_iter = null,
                        .result = result,
                    });
                }
                mid_layer[division] = MidLayer{
                    .filter = mid_layer_filter,
                    .keys = mid_layer_keys,
                    .inner_layers = inner_layers,
                };
            }
            return Self{
                .total_keys_estimate = total_keys_estimate,
                .keys = keys,
                .outer_layer = outer_layer,
                .mid_layer = mid_layer,
            };
        }
fn deserializeFilter(allocator: Allocator, stream: anytype) !BinaryFuseFilter {
const seed = try stream.readIntLittle(u64);
const segment_length = try stream.readIntLittle(u32);
const segment_length_mask = try stream.readIntLittle(u32);
const segment_count = try stream.readIntLittle(u32);
const segment_count_length = try stream.readIntLittle(u32);
const fingerprints_len = try stream.readIntLittle(u32);
const fingerprints = try allocator.alloc(FilterType, fingerprints_len);
const fingerprint_bytes: []u8 = fingerprints.ptr[0 .. fingerprints.len * @sizeOf(FilterType)];
const read_bytes = try stream.readAll(fingerprint_bytes);
if (read_bytes < fingerprint_bytes.len) {
allocator.free(fingerprints);
return error.EndOfStream;
}
return BinaryFuseFilter{
.seed = seed,
.segment_length = segment_length,
.segment_length_mask = segment_length_mask,
.segment_count = segment_count,
.segment_count_length = segment_count_length,
.fingerprints = fingerprints,
};
}
// This works around an issue in binary fuse filters which is hopefully fixed upstream soon:
// https://github.com/FastFilter/xorfilter/issues/30
//
// It works around the issue by collecting all keys, making them a unique set, and then populating
// the filter with that. This defeats the entire purpose of iterator-based binary fuse filters, of
// course (less memory usage.)
fn populateIterUnique(allocator: Allocator, filter: *BinaryFuseFilter, iter: anytype) !void {
const keys = try allocator.alloc(u64, iter.len());
defer allocator.free(keys);
var i: usize = 0;
while (iter.next()) |key| {
keys[i] = key;
i += 1;
}
const unique_keys = fastfilter.AutoUnique(u64, void)({}, keys);
try filter.populate(allocator, unique_keys);
}
};
}
// End-to-end test: insert three results with overlapping key sets, index, then exercise
// containment checks and both query modes.
test "filter" {
    const allocator = testing.allocator;
    const Iterator = fastfilter.SliceIterator(u64);
    const TestFilter = Filter(.{}, []const u8, *Iterator);
    const estimated_keys = 100;
    var filter = TestFilter.init(estimated_keys);
    defer filter.deinit(allocator);
    // Insert files. Iterators must outlive the .index() call below.
    var keys_iter = Iterator.init(&.{ 1, 2, 3, 4 });
    try filter.insert(allocator, &keys_iter, "1-2-3-4");
    var keys_iter_2 = Iterator.init(&.{ 3, 4, 5 });
    try filter.insert(allocator, &keys_iter_2, "3-4-5");
    var keys_iter_3 = Iterator.init(&.{ 6, 7, 8 });
    try filter.insert(allocator, &keys_iter_3, "6-7-8");
    // Index.
    try filter.index(allocator);
    // Super fast containment checks.
    try testing.expectEqual(true, filter.contains(2));
    try testing.expectEqual(true, filter.contains(4));
    // Fast queries.
    var results = std.ArrayListUnmanaged([]const u8){};
    defer results.deinit(allocator);
    // Query a single key (5): only "3-4-5" contains it.
    results.clearRetainingCapacity();
    _ = try filter.queryLogicalOr(allocator, &.{5}, *std.ArrayListUnmanaged([]const u8), &results);
    try testing.expectEqual(@as(usize, 1), results.items.len);
    try testing.expectEqualStrings("3-4-5", results.items[0]);
    // Query logical OR (2, 5): matches files containing either key.
    results.clearRetainingCapacity();
    _ = try filter.queryLogicalOr(allocator, &.{ 2, 5 }, *std.ArrayListUnmanaged([]const u8), &results);
    try testing.expectEqual(@as(usize, 2), results.items.len);
    try testing.expectEqualStrings("1-2-3-4", results.items[0]);
    try testing.expectEqualStrings("3-4-5", results.items[1]);
    // Query logical AND (2, 5): no single file contains both keys.
    results.clearRetainingCapacity();
    _ = try filter.queryLogicalAnd(allocator, &.{ 2, 5 }, *std.ArrayListUnmanaged([]const u8), &results);
    try testing.expectEqual(@as(usize, 0), results.items.len);
    // Query logical AND (3, 4): both "1-2-3-4" and "3-4-5" contain keys 3 and 4.
    results.clearRetainingCapacity();
    _ = try filter.queryLogicalAnd(allocator, &.{ 3, 4 }, *std.ArrayListUnmanaged([]const u8), &results);
    try testing.expectEqual(@as(usize, 2), results.items.len);
    try testing.expectEqualStrings("1-2-3-4", results.items[0]);
    try testing.expectEqualStrings("3-4-5", results.items[1]);
    // NOTE(review): pins the exact in-memory size — implementation-dependent; will need
    // updating whenever struct layout or fastfilter sizing changes.
    try testing.expectEqual(@as(usize, 1676), filter.sizeInBytes());
}
// TODO: serialization/deserialization tests