-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathbytecompress.rs
113 lines (101 loc) · 3.49 KB
/
bytecompress.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
use std::collections::hash_map::Entry;
use std::hash::Hash;
use crate::HashMap;
use crate::{
ast::{byteset_contains, Expr, ExprSet},
ExprRef,
};
pub struct ByteCompressor {
pub mapping: Vec<u8>,
pub alphabet_size: usize,
bytesets: Vec<Vec<u32>>,
}
const INVALID_MAPPING: u8 = 0xff;
impl ByteCompressor {
pub fn new() -> Self {
ByteCompressor {
mapping: Vec::new(),
alphabet_size: 0,
bytesets: Vec::new(),
}
}
fn add_single_byte(&mut self, b: u8) {
if self.mapping[b as usize] == INVALID_MAPPING {
self.mapping[b as usize] = self.alphabet_size as u8;
self.alphabet_size += 1;
}
}
pub fn compress(&mut self, exprset: ExprSet, rx_list: &[ExprRef]) -> (ExprSet, Vec<ExprRef>) {
self.mapping = vec![INVALID_MAPPING; exprset.alphabet_size];
let mut todo = rx_list.to_vec();
let mut visited = vec![false; exprset.len()];
while let Some(e) = todo.pop() {
if visited[e.as_usize()] {
continue;
}
visited[e.as_usize()] = true;
todo.extend_from_slice(exprset.get_args(e));
match exprset.get(e) {
Expr::Byte(b) => self.add_single_byte(b),
Expr::ByteConcat(_, bytes, _) => {
for b in bytes {
self.add_single_byte(*b);
}
}
Expr::ByteSet(bs) => self.bytesets.push(bs.to_vec()),
Expr::RemainderIs { scale, .. } => {
for b in exprset.digits {
self.add_single_byte(b);
}
// if scale==0 then it will only match integers
// and we don't need to distinguish the dot
if scale > 0 {
self.add_single_byte(exprset.digit_dot);
}
}
_ => {}
}
}
let num = self.bytesets.len();
if num <= 64 {
self.compress_bytesets(|_| 0u64, |v, idx| *v |= 1 << idx);
} else {
self.compress_bytesets(
|size| vec![0u32; size.div_ceil(32)],
|v, idx| v[idx / 32] |= 1 << (idx % 32),
);
}
let mut trg = exprset;
// this disables Or->Trie conversion; the input should be already optimized this way
trg.disable_optimizations();
(trg, rx_list.to_vec())
}
#[inline(always)]
fn compress_bytesets<T: Eq + Hash>(
&mut self,
alloc: impl Fn(usize) -> T,
set_true: impl Fn(&mut T, usize),
) {
let mut byte_mapping = HashMap::default();
for b in 0..self.mapping.len() {
if self.mapping[b] == INVALID_MAPPING {
let mut v = alloc(self.bytesets.len());
for (idx, bs) in self.bytesets.iter().enumerate() {
if byteset_contains(bs, b) {
set_true(&mut v, idx);
}
}
match byte_mapping.entry(v) {
Entry::Occupied(e) => {
self.mapping[b] = *e.get();
}
Entry::Vacant(e) => {
self.mapping[b] = self.alphabet_size as u8;
self.alphabet_size += 1;
e.insert(self.mapping[b]);
}
}
}
}
}
}