-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompressor.h
569 lines (495 loc) · 20.4 KB
/
compressor.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
// ==============================================================================
// PROJECT: zqloader
// FILE: compressor.h
// DESCRIPTION: Definition of class Compressor.
//
// Copyright (c) 2023 Daan Scherft [Oxidaan]
// This project uses the MIT license. See LICENSE.txt for details.
// ==============================================================================
#pragma once
// When set, compresses isolated pairs (2 sequential bytes of value 'value_for_pairs') into one byte 'code_for_pairs'.
// This saves around 5%, or a couple of hundred bytes on average, for a game.
// I hoped this would be more, because 16x16 bit sprites might contain such pairs often, but:
// that size improvement is largely, if not entirely, undone because the decompression is now slower;
// also zqloader.tap (which is loaded at normal speed!) becomes larger.
// When changing this, also change the same setting in file zqloader.z80asm!
// #define DO_COMRESS_PAIRS
#include <optional> // std::optional
#include <iostream> // cout
#include <algorithm> // std::sort
#include "byte_tools.h"
///
/// Compressor. Uses RLE compression:
/// In block to compress find:
/// the (byte) value that occurs most in runs of 3 or more -> 'value_for_most'
/// and the two least used byte values -> 'code_for_most' and 'code_for_multiples'.
/// Then compress:
/// 3 or more sequential of 'value_for_most' (typically zero) are compressed by prefixing
/// 'code_for_most', then number of repeats (#repeats cannot be an escape code)
/// (One or two sequential bytes of value 'value_for_most' are left untouched).
/// 4 or more sequential of any other value are compressed by prefixing
/// 'code_for_multiples'; then that byte value; then number of repeats (#repeats cannot be an escape code)
/// 'code_for_most' and 'code_for_multiples' themselves are stored 2x - but are assumed 'rare'.
/// Larger blocks (e.g. repeat more than max value for TData) are just created as two blocks.
///
template <class TDataBlock>
class Compressor
{
public:
using DataBlock = TDataBlock;
using TData = typename TDataBlock::value_type; // usually std::byte
using iterator = typename TDataBlock::iterator;
using const_iterator = typename TDataBlock::const_iterator;
/// Meta data of one RLE compressed block: the escape codes and the values they stand for.
/// It is handed out next to the compressed data (see Compress / CompressInline) and has to be
/// passed unchanged to DeCompress.
/// All members are value-initialized, so a default constructed RLE_Meta never holds
/// indeterminate values (eg when it is streamed before being filled).
struct RLE_Meta
{
    TData code_for_most{};          // the value that occurs least, will be used as escape code for 'most'
    TData code_for_multiples{};     // the value that occurs 2nd least, will be used as escape code for 'multiples'
    TData value_for_most{};         // the value that occurs most in the block (typically 0) ('most')
    TData value_for_pairs{};        // the value that occurs most as isolated pairs in the block ('pairs')
    TData code_for_pairs{};         // the value that occurs 3rd least, will be used as escape code for 'pairs'

    /// Stream the meta data as text (each value as a decimal number).
    /// The 'pairs' members are only shown when DO_COMRESS_PAIRS is defined.
    friend std::ostream& operator <<(std::ostream& p_stream, const RLE_Meta& p_header)
    {
        p_stream
            << "code_for_most = " << int(p_header.code_for_most)
            << " code_for_multiples = " << int(p_header.code_for_multiples)
            << " value_for_most = " << int(p_header.value_for_most)
#ifdef DO_COMRESS_PAIRS
            << " value_for_pairs = " << int(p_header.value_for_pairs)
            << " code_for_pairs = " << int(p_header.code_for_pairs)
#endif
        ;
        return p_stream;
    }
};
private:
using Hist = std::vector< std::pair<int, TData> >;
public:
/// RLE-compress the complete block in_buf and return the compressed bytes.
/// out_max_min receives the RLE meta data that was used; it is NOT stored in the returned data.
/// out_decompress_counter receives the counter returned by the iterator based Compress overload.
static DataBlock Compress(const DataBlock& in_buf, RLE_Meta& out_max_min, uint16_t &out_decompress_counter)
{
    const auto src_begin = in_buf.begin();
    const auto src_end = in_buf.end();
    out_max_min = DetermineCompressionRleValues(src_begin, src_end);

    DataBlock result;
    auto write_pos = result.begin();        // begin() == end() for the still empty result, so output is appended
    out_decompress_counter = Compress(src_begin, src_end, result, write_pos, out_max_min);
    return result;
}
/// Compress, RLE. Only succeeds when can be compressed inline, so at same memory block.
/// Return compressed data as optional.
/// Tries multiple meta data values for code_for_most etc to make sure it can be decompressed inline.
/// Return RLE meta data as output parameter - not written to compressed data.
static std::optional<DataBlock> CompressInline(const DataBlock& in_buf, RLE_Meta& out_max_min, uint16_t &out_decompress_counter, int p_max_tries)
{
for (int tr = 0; tr == 0 || tr < p_max_tries; tr++)
{
out_max_min = DetermineCompressionRleValues(in_buf.begin(), in_buf.end(), tr);
DataBlock compressed;
auto it = compressed.begin();
out_decompress_counter = Compress(in_buf.begin(), in_buf.end(), compressed, it, out_max_min);
if (p_max_tries == 0 || CanUseDecompressionInline(in_buf, compressed, out_max_min))
{
// std::cout << "Compression attempt #" << (tr + 1) << " succeeded..." << std::endl;
return compressed;
}
// std::cout << "Compression attempt #" << (tr + 1) << " Failed..." << std::endl;
}
std::cout << "Inline compression failed" << std::endl;
return {}; // failed...
}
/// Decompress a complete compressed block and return the decompressed data.
/// The RLE meta data used at compression has to be supplied by the caller.
/// Only used for testing plus CanUseDecompressionInline.
static DataBlock DeCompress(const DataBlock& p_compressed, const RLE_Meta& p_max_min)
{
    const_iterator first = p_compressed.begin();
    const_iterator last = p_compressed.end();
    return DeCompress(first, last, p_max_min);
}
/// Decompress the compressed data in range [p_begin, p_end) and return the decompressed data.
/// The RLE meta data used at compression has to be supplied by the caller.
/// Only used for testing plus CanUseDecompressionInline.
static DataBlock DeCompress(const_iterator p_begin, const_iterator p_end, const RLE_Meta& p_max_min)
{
    DataBlock decompressed;
    iterator append_pos = decompressed.begin();     // equals end() here, so the output gets appended
    DeCompress(p_begin, p_end, decompressed, append_pos, p_max_min);
    return decompressed;
}
private:
// Determine the RLE meta data (escape codes and the values they encode) for the block
// between the given iterators.
// p_try: rank (0 = least used) at which the search for escape codes starts in the histogram
// sorted on frequency. A caller can retry with the next rank to get another set of codes
// (see CompressInline). Out of range values are clamped.
// The returned escape codes are always three different values, none of them equal to
// value_for_most: a byte being both a literal 'most' value and an escape code can not be
// decoded unambiguously.
static RLE_Meta DetermineCompressionRleValues(const_iterator p_begin, const_iterator p_end, int p_try = 0)
{
    RLE_Meta retval;

    // Get value that occurs most in sequences of 3 or more -> value_for_most;
    // will be compressed as [code_for_most][#count].
    // Determined first, because the escape codes chosen below must differ from it.
    {
        auto hist = GetHistogram(p_begin, p_end, 3, true);
        SortHist(hist);
        retval.value_for_most = hist.back().second;
    }

    // Get value that occurs most as an isolated pair (exactly 2 sequential) -> value_for_pairs
    retval.value_for_pairs = GetSortedHistogram(p_begin, p_end, 2).back().second;

    // Determine escape codes. These are the values that occur least.
    {
        const auto hist = GetSortedHistogram(p_begin, p_end);
        // Three codes are needed and at most one entry (value_for_most) is skipped, so the
        // search must start no later than 4 entries before the end of the histogram.
        const int last_start = std::max(0, int(hist.size()) - 4);
        auto it = hist.begin() + std::min(std::max(p_try, 0), last_start);
        auto NextCode = [&]()
        {
            if (it->second == retval.value_for_most)
            {
                ++it;       // never use the 'most' value itself as escape code
            }
            return ( it++ )->second;
        };
        retval.code_for_most = NextCode();          // least used: codes the 'most' value
        retval.code_for_multiples = NextCode();     // next least used: codes runs of any other value
        retval.code_for_pairs = NextCode();         // next least used: codes isolated pairs
    }
    return retval;
}
// Is given value one of the escape codes of given RLE meta data?
// code_for_pairs only counts as escape code when DO_COMRESS_PAIRS is defined.
static bool IsEscapeCode(const RLE_Meta &rle, TData val)
{
    if (val == rle.code_for_most)
    {
        return true;
    }
    if (val == rle.code_for_multiples)
    {
        return true;
    }
#ifdef DO_COMRESS_PAIRS
    if (val == rle.code_for_pairs)
    {
        return true;
    }
#endif
    return false;
}
// Compress block between given iterators [p_begin, p_end) to given output iterator.
// RLE meta data given here as parameter (p_rle); it is only read, not written to the output.
// When out_it equals out_buf.end() at entry the output is appended to out_buf, else it is
// written in place starting at out_it (see Compressor::Write).
// Returns a counter that is incremented once for every literal byte written, once for every
// escape sequence ([code_for_most][#], [code_for_multiples][value][#], [code_for_pairs]) and
// once for every escape code that had to be stored doubled. So this is not the number of
// bytes written.
// NOTE(review): presumably this is the number of steps the decompressor has to take - confirm
// against zqloader.z80asm.
static uint16_t Compress(const_iterator p_begin, const_iterator p_end, DataBlock& out_buf, iterator& out_it, const RLE_Meta& p_rle)
{
const auto& value_for_most = p_rle.value_for_most; // alias (the value that occurs most in the block (typically 0))
const auto& code_for_most = p_rle.code_for_most;
const auto& code_for_multiples = p_rle.code_for_multiples;
#ifdef DO_COMRESS_PAIRS
const auto& value_for_pairs= p_rle.value_for_pairs;
const auto& code_for_pairs = p_rle.code_for_pairs;
#endif
uint16_t decompress_counter = 0;
const_iterator it;
// decided once at entry: append to out_buf, or overwrite starting at out_it
bool append_mode = ( out_it == out_buf.end());
// read next input value, advancing 'it'
auto Read = [&]()
{
return Compressor::Read(it);
};
// write one value to the output
auto Write = [&](TData p_byte)
{
Compressor::Write(out_buf, out_it, p_byte, append_mode);
};
int most_count = 0; // length of the pending (not yet written) run of 'value_for_most'
int multiple_count = 0; // length of the pending (not yet written) run of 'prev'
TData prev{}; // previous input value
// write-out (flush) the pending run of 'value_for_most', taking care of a count that equals an escape code,
// because a duplicate escape code stands for the single escape code value
// and a single 'most' value is just that value.
auto WriteMost = [&]()
{
#ifdef DO_COMRESS_PAIRS
if(value_for_most == value_for_pairs && most_count == 2)
{
Write(code_for_pairs);
decompress_counter++;
most_count = 0;
}
#endif
// keep writing single 'most' values as long as most_count equals an escape code
// or we have just one or two
while (( IsEscapeCode(p_rle, TData(most_count)) || most_count <= 2) && most_count > 0 )
{
--most_count;
Write(value_for_most);
decompress_counter++;
}
// what remains (3 or more) is written as [code_for_most][#count]
if (most_count > 0 )
{
Write(code_for_most);
decompress_counter++;
Write(TData(most_count));
most_count = 0;
}
};
// write-out a pending run of exactly two 'value_for_pairs' as the single byte 'code_for_pairs'.
// Does nothing unless DO_COMRESS_PAIRS is defined.
auto WritePairs = [&]()
{
#ifdef DO_COMRESS_PAIRS
if(prev == value_for_pairs && multiple_count==2)
{
Write(code_for_pairs); // write 2x value_for_pairs as code_for_pairs
decompress_counter++;
multiple_count = 0;
}
#endif
};
// write-out (flush) the pending run of 'prev'
auto WriteMultiples = [&]()
{
// keep writing single prev values as long as multiple_count equals an escape code
// or we have three or less
// when multiple_count==1 just write one (previous) value.
while ((IsEscapeCode(p_rle, TData(multiple_count)) || multiple_count <= 3) && multiple_count > 0 )
{
--multiple_count;
Write(prev);
decompress_counter++;
}
// what remains (4 or more) is written as [code_for_multiples][value][#count]
if (multiple_count > 0 )
{
Write(code_for_multiples);
decompress_counter++;
Write(prev); // Note prev can not be an escape code: escape codes never start a run (see loop below)
Write(TData(multiple_count));
multiple_count = 0;
}
};
for (it = p_begin; it < p_end; )
{
auto val = Read();
if (val == value_for_most) // compressed with [code_for_most] [#]
{
WritePairs();
WriteMultiples(); // so flush if present; making multiple_count 0
// GetMax is not declared in this file (presumably byte_tools.h); assumed to give the largest count that fits a TData
if (most_count == int(GetMax<TData>())) // flush when overflow
{
WriteMost(); // most_count now 0
}
++most_count;
}
else if(!IsEscapeCode(p_rle, val))
{
WriteMost(); // so flush if present
if(val == prev) // compressed with [code_for_multiples][val][#]
{
if (multiple_count == int(GetMax<TData>())) // flush when overflow
{
WriteMultiples(); // multiple_count now 0
}
++multiple_count;
}
else
{
WritePairs();
WriteMultiples(); // flush if present; making multiple_count 0, also writes all 'normal' values
multiple_count = 1; // val starts a new run
}
}
else // escape code, write twice
{
WriteMost();
WritePairs();
WriteMultiples();
Write(val);
Write(val);
decompress_counter++; // counted once although two bytes were written
}
prev = val;
}
WritePairs();
WriteMost(); // at end flush remaining when present
WriteMultiples();
return decompress_counter;
}
// Decompress block between given iterators [p_begin, p_end) to given output iterator.
// RLE meta data is given as parameters here.
// When out_it equals out_buf.end() at entry the output is appended to out_buf; else it is
// written in place starting at out_it and decompression stops as soon as out_buf is full.
// The input range may be located in out_buf itself (see CanUseDecompressionInline). The output
// can then overwrite input that was not read yet, so the input is not trusted to be well formed:
// an escape sequence that is cut off by the end of the input is dropped instead of being read
// beyond p_end.
static void DeCompress(const_iterator p_begin, const_iterator p_end, DataBlock& out_buf, iterator out_it, const RLE_Meta& p_max_min)
{
    const auto& most = p_max_min.value_for_most;     // alias
    const auto& code_for_most = p_max_min.code_for_most;
    const auto& code_for_multiples = p_max_min.code_for_multiples;
#ifdef DO_COMRESS_PAIRS
    const auto& code_for_pairs = p_max_min.code_for_pairs;
    const auto& value_for_pairs = p_max_min.value_for_pairs;
#endif
    const_iterator it;
    bool at_end = false;        // set once the output buffer is full
    const bool append_mode = ( out_it == out_buf.end());
    // is there still input left to read?
    auto HasInput = [&]()
    {
        return it < p_end;
    };
    auto Read = [&]()
    {
        return Compressor::Read(it);
    };
    // returns true when the value could not be written (output full)
    auto Write = [&](TData p_byte)
    {
        return Compressor::Write(out_buf, out_it, p_byte, append_mode);
    };
    // code_for_most was read: what follows is the same code again (a literal) or a repeat count
    auto WriteMost = [&]()
    {
        if (!HasInput())
        {
            return;     // input ends right after the escape code
        }
        const auto cnt = Read();
        if (cnt == code_for_most)     // 2nd seen?
        {
            at_end = Write(code_for_most);
            return;
        }
        for (int n = 0; n < int(cnt) && !at_end; n++)
        {
            at_end = Write(most);
        }
    };
    // code_for_multiples was read: what follows is the same code again (a literal)
    // or a value plus its repeat count
    auto WriteMultiples = [&]()
    {
        if (!HasInput())
        {
            return;     // input ends right after the escape code
        }
        const auto val = Read();
        if (val == code_for_multiples)     // 2nd seen?
        {
            at_end = Write(val);
            return;
        }
        if (!HasInput())
        {
            return;     // input ends before the repeat count
        }
        const auto cnt = Read();
        for (int n = 0; n < int(cnt) && !at_end; n++)
        {
            at_end = Write(val);
        }
    };
#ifdef DO_COMRESS_PAIRS
    // code_for_pairs was read: when followed by the same code again it is a literal,
    // else it stands for two times value_for_pairs
    auto WritePairs = [&]()
    {
        if (HasInput())
        {
            const auto val = Read();
            if (val == code_for_pairs)     // 2nd seen?
            {
                at_end = Write(val);
                return;
            }
            it--;       // not ours: un-read
        }
        Write(value_for_pairs);
        at_end = Write(value_for_pairs);    // the 2nd write tells whether the output got full
    };
#endif
    for (it = p_begin; HasInput() && !at_end; )
    {
        const auto b = Read();
        if (b == code_for_most)
        {
            WriteMost();
        }
#ifdef DO_COMRESS_PAIRS
        else if (b == code_for_pairs)
        {
            WritePairs();
        }
#endif
        else if (b == code_for_multiples)
        {
            WriteMultiples();
        }
        else
        {
            at_end = Write(b);      // plain literal
        }
    }
}
// Check if can use de-compress in same memory location as compressed data.
// Where compressed data is stored at end of block, being overwritten during decompression.
// By just trying.
static bool CanUseDecompressionInline(const DataBlock& p_orig_data, const DataBlock& p_compressed_data, const Compressor<DataBlock>::RLE_Meta& p_rle_meta)
{
DataBlock decompressed_data;
if(p_orig_data.size() > p_compressed_data.size())
{
auto sz = p_orig_data.size() - p_compressed_data.size();
decompressed_data.resize(sz);
// append compressed data
decompressed_data.insert(decompressed_data.end(), p_compressed_data.begin(), p_compressed_data.end());
DeCompress(decompressed_data.cbegin() + sz, decompressed_data.cend(), decompressed_data, decompressed_data.begin(), p_rle_meta);
return decompressed_data == p_orig_data;
}
return false;
}
// Return the value at given iterator, then move the iterator to the next position.
// Does not check for end of data.
static TData Read(const_iterator& p_it)
{
    return *p_it++;
}
// Write one data to the output buffer at given iterator location.
// when p_append_mode append.
static bool Write(std::vector<TData>& out_buf, iterator& out_it, TData p_byte, bool p_append_mode = true)
{
if (p_append_mode)
{
out_buf.push_back(p_byte); // append mode
out_it = out_buf.end();
return false;
}
else if (out_it != out_buf.end())
{
*out_it = p_byte;
out_it++;
return false;
}
return true;
}
// Make a histogram of the values between the given iterators.
// The result has one entry for each possible value of TData, ordered on value (so the index
// equals the value). For each entry: first = frequency, second = value.
// p_sequential == 0: count every occurrence of each value.
// p_sequential != 0: only count values that are part of a run (sequence of the same value) of
//     exactly p_sequential long (p_greater_than false) or
//     at least p_sequential long (p_greater_than true).
//   The frequency is the number of bytes in those runs.
// All runs are taken into account, including the one that ends the block.
// An empty range gives a histogram with all frequencies zero.
static Hist GetHistogram(const_iterator p_begin, const_iterator p_end, int p_sequential = 0, bool p_greater_than = false)
{
    // Make entries for each possible value, frequency zero
    const int value_count = 1 << ( 8 * sizeof( TData ));
    Hist hist{};
    hist.reserve(value_count);
    for (int n = 0; n < value_count; n++)
    {
        hist.push_back({ 0, TData(n) });
    }
    if (p_begin == p_end)
    {
        return hist;    // nothing to count
    }

    if (p_sequential == 0)     // do not check sequential same values
    {
        for (auto it = p_begin; it != p_end; ++it)
        {
            hist[int(*it)].first++;     // freq++
        }
        return hist;
    }

    // Add a finished run to the histogram when its length is asked for
    auto AddRun = [&](int p_value, int p_run_length)
    {
        const bool wanted = p_greater_than ? ( p_run_length >= p_sequential )
                                           : ( p_run_length == p_sequential );
        if (wanted)
        {
            hist[p_value].first += p_run_length;
        }
    };
    auto it = p_begin;
    int run_value = int(*it);
    int run_length = 1;
    for (++it; it != p_end; ++it)
    {
        const int value = int(*it);
        if (value == run_value)
        {
            ++run_length;
        }
        else
        {
            AddRun(run_value, run_length);
            run_value = value;
            run_length = 1;
        }
    }
    AddRun(run_value, run_length);      // the run that ends the block
    return hist;
}
// Sort given histogram in place: lowest frequency first.
// Entries having the same frequency are ordered on value, highest value first.
static void SortHist(Hist &p_hist)
{
    using Entry = typename Hist::value_type;
    std::sort(p_hist.begin(), p_hist.end(), [](const Entry& p_lhs, const Entry& p_rhs)
    {
        if (p_lhs.first != p_rhs.first)
        {
            return p_lhs.first < p_rhs.first;
        }
        return p_lhs.second > p_rhs.second;
    });
}
// Same as GetHistogram (parameters are passed on unchanged), but the result is sorted with
// SortHist before it is returned.
static Hist GetSortedHistogram(const_iterator p_begin, const_iterator p_end, int p_sequential = 0, bool p_greater_than = false)
{
    Hist sorted = GetHistogram(p_begin, p_end, p_sequential, p_greater_than);
    SortHist(sorted);
    return sorted;
}
// Write the RLE meta data to the output buffer, thus becoming part of the compressed data.
// Order: value_for_most, value_for_pairs, code_for_pairs, code_for_multiples, code_for_most;
// ReadRleValues reads them back in that same order.
// Note: Write is called with its default append mode, so the values are always appended to
// out_buf and out_it ends up at the end of out_buf.
static void WriteRleValues(DataBlock& out_buf, iterator& out_it, const RLE_Meta& p_max_min)
{
    const TData fields[] =
    {
        p_max_min.value_for_most,
        p_max_min.value_for_pairs,
        p_max_min.code_for_pairs,
        p_max_min.code_for_multiples,
        p_max_min.code_for_most
    };
    for (const TData field : fields)
    {
        Write(out_buf, out_it, field);
    }
}
// Read RLE meta data from given iterator position; the iterator is advanced past it.
// Values are read in the order WriteRleValues writes them:
// value_for_most, value_for_pairs, code_for_pairs, code_for_multiples, code_for_most.
static RLE_Meta ReadRleValues(const_iterator& p_it)
{
    const TData value_for_most = Read(p_it);
    const TData value_for_pairs = Read(p_it);
    const TData code_for_pairs = Read(p_it);
    const TData code_for_multiples = Read(p_it);
    const TData code_for_most = Read(p_it);

    RLE_Meta meta;
    meta.value_for_most = value_for_most;
    meta.value_for_pairs = value_for_pairs;
    meta.code_for_pairs = code_for_pairs;
    meta.code_for_multiples = code_for_multiples;
    meta.code_for_most = code_for_most;
    return meta;
}
}; // class Compressor