-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathBits.js
624 lines (519 loc) · 16.9 KB
/
Bits.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
/*
A Succinct Trie for Javascript
By Steve Hanov
Released to the public domain.
This file contains functions for creating a succinctly encoded trie structure
from a list of words. The trie is encoded to a succinct bit string using the
method of Jacobson (1989). The bitstring is then encoded using BASE-64.
The resulting trie does not have to be decoded to be used. This file also
contains functions for looking up a word in the BASE-64 encoded data, in
O(mlogn) time, where m is the number of letters in the target word, and n is
the number of nodes in the trie.
Objects for encoding:
TrieNode
Trie
BitWriter
Objects for decoding:
BitString
FrozenTrieNode
FrozenTrie
QUICK USAGE:
Suppose we let data be some output of the demo encoder:
var data = {
"nodeCount": 37,
"directory": "BMIg",
"trie": "v2qqqqqqqpIUn4A5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A"
};
var frozenTrie = new FrozenTrie( Data.trie, Data.directory, Data.nodeCount);
alert( frozenTrie.lookup( "hello" ) ); // outputs true
alert( frozenTrie.lookup( "kwijibo" ) ); // outputs false
*/
// Configure the bit writing and reading functions to work natively in BASE-64
// encoding. That way, we don't have to convert back and forth to bytes.
var BASE64 =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
/**
The width of each unit of the encoding, in bits. Here we use 6, for base-64
encoding.
*/
var W = 6;
/**
Returns the character unit that represents the given value. If this were
binary data, we would simply return id.
*/
function CHR(id)
{
return BASE64[id];
}
/**
Returns the decimal value of the given character unit.
*/
var ORD = {};
for ( var i = 0; i < BASE64.length; i++ ) {
ORD[ BASE64[i] ] = i;
}
/**
Fixed values for the L1 and L2 table sizes in the Rank Directory
*/
var L1 = 32*32;
var L2 = 32;
/**
The BitWriter will create a stream of bytes, letting you write a certain
number of bits at a time. This is part of the encoder, so it is not
optimized for memory or speed.
*/
function BitWriter()
{
this.init();
}
BitWriter.prototype =
{
init: function() {
this.bits = [];
},
/**
Write some data to the bit string. The number of bits must be 32 or
fewer.
*/
write: function( data, numBits ) {
for( var i = numBits - 1; i >= 0; i-- ) {
if ( data & ( 1 << i ) ) {
this.bits.push(1);
} else {
this.bits.push(0);
}
}
},
/**
Get the bitstring represented as a javascript string of bytes
*/
getData: function() {
var chars = [];
var b = 0;
var i = 0;
for ( var j = 0; j < this.bits.length; j++ ) {
b = ( b << 1 ) | this.bits[j];
i += 1;
if ( i === W ) {
chars.push( CHR(b) );
i = b = 0;
}
}
if ( i ) {
chars.push( CHR(b << ( W - i )) );
}
return chars.join("");
},
/**
Returns the bits as a human readable binary string for debugging
*/
getDebugString: function(group) {
var chars = [];
var i = 0;
for( var j = 0; j < this.bits.length; j++ ) {
chars.push( "" + this.bits[j] );
i++;
if ( i === group ) {
chars.push( ' ' );
i = 0;
}
}
return chars.join("");
}
};
/**
Given a string of data (eg, in BASE-64), the BitString class supports
reading or counting a number of bits from an arbitrary position in the
string.
*/
function BitString( str )
{
this.init( str );
}
BitString.MaskTop = [
0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00
];
BitString.BitsInByte = [
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2,
3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3,
3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3,
4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,
3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5,
6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4,
4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3,
4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6,
6, 7, 6, 7, 7, 8
];
BitString.prototype = {
init: function( str ) {
this.bytes = str;
this.length = this.bytes.length * W;
},
/**
Returns the internal string of bytes
*/
getData: function() {
return this.bytes;
},
/**
Returns a decimal number, consisting of a certain number, n, of bits
starting at a certain position, p.
*/
get: function( p, n ) {
// case 1: bits lie within the given byte
if ( ( p % W ) + n <= W ) {
return ( ORD[ this.bytes[ p / W | 0 ] ] & BitString.MaskTop[ p % W ] ) >>
( W - p % W - n );
// case 2: bits lie incompletely in the given byte
} else {
var result = ( ORD[ this.bytes[ p / W | 0 ] ] &
BitString.MaskTop[ p % W ] );
var l = W - p % W;
p += l;
n -= l;
while ( n >= W ) {
result = (result << W) | ORD[ this.bytes[ p / W | 0 ] ];
p += W;
n -= W;
}
if ( n > 0 ) {
result = (result << n) | ( ORD[ this.bytes[ p / W | 0 ] ] >>
( W - n ) );
}
return result;
}
},
/**
Counts the number of bits set to 1 starting at position p and
ending at position p + n
*/
count: function( p, n ) {
var count = 0;
while( n >= 8 ) {
count += BitString.BitsInByte[ this.get( p, 8 ) ];
p += 8;
n -= 8;
}
return count + BitString.BitsInByte[ this.get( p, n ) ];
},
/**
Returns the number of bits set to 1 up to and including position x.
This is the slow implementation used for testing.
*/
rank: function( x ) {
var rank = 0;
for( var i = 0; i <= x; i++ ) {
if ( this.get(i, 1) ) {
rank++;
}
}
return rank;
}
};
/**
The rank directory allows you to build an index to quickly compute the
rank() and select() functions. The index can itself be encoded as a binary
string.
*/
function RankDirectory( directoryData, bitData, numBits, l1Size, l2Size )
{
this.init(directoryData, bitData, numBits, l1Size, l2Size);
}
/**
Used to build a rank directory from the given input string.
@param data A javascript string containing the data, as readable using the
BitString object.
@param numBits The number of bits to index.
@param l1Size The number of bits that each entry in the Level 1 table
summarizes. This should be a multiple of l2Size.
@param l2Size The number of bits that each entry in the Level 2 table
summarizes.
*/
RankDirectory.Create = function( data, numBits, l1Size, l2Size ) {
var bits = new BitString( data );
var p = 0;
var i = 0;
var count1 = 0, count2 = 0;
var l1bits = Math.ceil( Math.log( numBits ) / Math.log(2) );
var l2bits = Math.ceil( Math.log( l1Size ) / Math.log(2) );
var directory = new BitWriter();
while( p + l2Size <= numBits ) {
count2 += bits.count( p, l2Size );
i += l2Size;
p += l2Size;
if ( i === l1Size ) {
count1 += count2;
directory.write( count1, l1bits );
count2 = 0;
i = 0;
} else {
directory.write( count2, l2bits );
}
}
return new RankDirectory( directory.getData(), data, numBits, l1Size, l2Size );
};
RankDirectory.prototype = {
init: function( directoryData, bitData, numBits, l1Size, l2Size ) {
this.directory = new BitString( directoryData );
this.data = new BitString( bitData );
this.l1Size = l1Size;
this.l2Size = l2Size;
this.l1Bits = Math.ceil( Math.log( numBits ) / Math.log( 2 ) );
this.l2Bits = Math.ceil( Math.log( l1Size ) / Math.log( 2 ) );
this.sectionBits = (l1Size / l2Size - 1) * this.l2Bits + this.l1Bits;
this.numBits = numBits;
},
/**
Returns the string representation of the directory.
*/
getData: function() {
return this.directory.getData();
},
/**
Returns the number of 1 or 0 bits (depending on the "which" parameter) to
to and including position x.
*/
rank: function( which, x ) {
if ( which === 0 ) {
return x - this.rank( 1, x ) + 1;
}
var rank = 0;
var o = x;
var sectionPos = 0;
if ( o >= this.l1Size ) {
sectionPos = ( o / this.l1Size | 0 ) * this.sectionBits;
rank = this.directory.get( sectionPos - this.l1Bits, this.l1Bits );
o = o % this.l1Size;
}
if ( o >= this.l2Size ) {
sectionPos += ( o / this.l2Size | 0 ) * this.l2Bits;
rank += this.directory.get( sectionPos - this.l2Bits, this.l2Bits );
}
rank += this.data.count( x - x % this.l2Size, x % this.l2Size + 1 );
return rank;
},
/**
Returns the position of the y'th 0 or 1 bit, depending on the "which"
parameter.
*/
select: function( which, y ) {
var high = this.numBits;
var low = -1;
var val = -1;
while ( high - low > 1 ) {
var probe = (high + low) / 2 | 0;
var r = this.rank( which, probe );
if ( r === y ) {
// We have to continue searching after we have found it,
// because we want the _first_ occurrence.
val = probe;
high = probe;
} else if ( r < y ) {
low = probe;
} else {
high = probe;
}
}
return val;
}
};
/**
A Trie node, for use in building the encoding trie. This is not needed for
the decoder.
*/
function TrieNode( letter )
{
this.letter = letter;
this.final = false;
this.children = [];
}
function Trie()
{
this.init();
}
Trie.prototype = {
init: function() {
this.previousWord = "";
this.root = new TrieNode(' ');
this.cache = [ this.root ];
this.nodeCount = 1;
},
/**
Returns the number of nodes in the trie
*/
getNodeCount: function() {
return this.nodeCount;
},
/**
Inserts a word into the trie. This function is fastest if the words are
inserted in alphabetical order.
*/
insert: function( word ) {
var commonPrefix = 0;
for( var i = 0; i < Math.min( word.length, this.previousWord.length );
i++ )
{
if ( word[i] !== this.previousWord[i] ) { break; }
commonPrefix += 1;
}
this.cache.length = commonPrefix + 1;
var node = this.cache[ this.cache.length - 1 ];
for( i = commonPrefix; i < word.length; i++ ) {
var next = new TrieNode( word[i] );
this.nodeCount++;
node.children.push( next );
this.cache.push( next );
node = next;
}
node.final = true;
this.previousWord = word;
},
/**
Apply a function to each node, traversing the trie in level order.
*/
apply: function( fn )
{
var level = [ this.root ];
while( level.length > 0 ) {
var node = level.shift();
for( var i = 0; i < node.children.length; i++ ) {
level.push( node.children[i] );
}
fn( node );
}
},
/**
Encode the trie and all of its nodes. Returns a string representing the
encoded data.
*/
encode: function()
{
// Write the unary encoding of the tree in level order.
var bits = new BitWriter();
bits.write( 0x02, 2 );
this.apply( function( node ) {
for( var i = 0; i < node.children.length; i++ ) {
bits.write( 1, 1 );
}
bits.write( 0, 1 );
});
// Write the data for each node, using 6 bits for node. 1 bit stores
// the "final" indicator. The other 5 bits store one of the 26 letters
// of the alphabet.
var a = ("a").charCodeAt(0);
this.apply( function( node ) {
var value = node.letter.charCodeAt(0) - a;
if ( node.final ) {
value |= 0x20;
}
bits.write( value, 6 );
});
return bits.getData();
}
};
/**
This class is used for traversing the succinctly encoded trie.
*/
function FrozenTrieNode( trie, index, letter, final, firstChild, childCount )
{
this.trie = trie;
this.index = index;
this.letter = letter;
this.final = final;
this.firstChild = firstChild;
this.childCount = childCount;
}
FrozenTrieNode.prototype = {
/**
Returns the number of children.
*/
getChildCount: function()
{
return this.childCount;
},
/**
Returns the FrozenTrieNode for the given child.
@param index The 0-based index of the child of this node. For example, if
the node has 5 children, and you wanted the 0th one, pass in 0.
*/
getChild: function(index)
{
return this.trie.getNodeByIndex( this.firstChild + index );
}
};
/**
The FrozenTrie is used for looking up words in the encoded trie.
@param data A string representing the encoded trie.
@param directoryData A string representing the RankDirectory. The global L1
and L2 constants are used to determine the L1Size and L2size.
@param nodeCount The number of nodes in the trie.
*/
function FrozenTrie( data, directoryData, nodeCount )
{
this.init( data, directoryData, nodeCount );
}
FrozenTrie.prototype = {
init: function( data, directoryData, nodeCount )
{
this.data = new BitString( data );
this.directory = new RankDirectory( directoryData, data,
nodeCount * 2 + 1, L1, L2 );
// The position of the first bit of the data in 0th node. In non-root
// nodes, this would contain 6-bit letters.
this.letterStart = nodeCount * 2 + 1;
},
/**
Retrieve the FrozenTrieNode of the trie, given its index in level-order.
This is a private function that you don't have to use.
*/
getNodeByIndex: function( index )
{
// retrieve the 6-bit letter.
var final = this.data.get( this.letterStart + index * 6, 1 ) === 1;
var letter = String.fromCharCode(
this.data.get( this.letterStart + index * 6 + 1, 5 ) +
'a'.charCodeAt(0));
var firstChild = this.directory.select( 0, index+1 ) - index;
// Since the nodes are in level order, this nodes children must go up
// until the next node's children start.
var childOfNextNode = this.directory.select( 0, index + 2 ) - index - 1;
return new FrozenTrieNode( this, index, letter, final, firstChild,
childOfNextNode - firstChild );
},
/**
Retrieve the root node. You can use this node to obtain all of the other
nodes in the trie.
*/
getRoot: function()
{
return this.getNodeByIndex( 0 );
},
/**
Look-up a word in the trie. Returns true if and only if the word exists
in the trie.
*/
lookup: function( word )
{
var node = this.getRoot();
for ( var i = 0; i < word.length; i++ ) {
var child;
var j = 0;
for ( ; j < node.getChildCount(); j++ ) {
child = node.getChild( j );
if ( child.letter === word[i] ) {
break;
}
}
if ( j === node.getChildCount() ) {
return false;
}
node = child;
}
return node.final;
}
};
if ( typeof exports !== "undefined" ) {
exports.FrozenTrie = FrozenTrie;
}