-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathmork.pl
executable file
·536 lines (433 loc) · 15.5 KB
/
mork.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
#!/usr/bin/perl -w
# Copyright © 2004 Jamie Zawinski <[email protected]>
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation. No representations are made about the suitability of this
# software for any purpose. It is provided "as is" without express or
# implied warranty.
#
# Created: 3-Mar-2004 by Jamie Zawinski, Anonymous, and Jacob Post.
#
##############################################################################
#
# This is a program that can read the Mozilla URL history file --
# normally $HOME/.mozilla/default/*.slt/history.dat -- and prints out
# a list of URLs and their time of last access. With no arguments,
# it prints lines like
#
# 1078333826 1 http://www.jwz.org/hacks/
#
# where the first number is a ctime (number of seconds since Jan 1 1970 GMT)
# and the second number is how many times this URL was visited. The URLs are
# printed most-recent-first.
#
# With --verbose, it prints all the information known about each URL,
# including time of first visit, last visit, document title, etc.
#
# With --html, it produces HTML output instead of plain text.
#
# With "--age 2H", it limits itself to URLs that were loaded within the
# last two hours. Likewise with "sec", "min", "day", "month", etc.
#
##############################################################################
#
# And Now, The Ugly Truth Laid Bare:
#
# In Netscape Navigator 1.0 through 4.0, the history.db file was just a
# Berkeley DBM file. You could trivially bind to it from Perl, and
# pull out the URLs and last-access time. In Mozilla, this has been
# replaced with a "Mork" database for which no tools exist.
#
# Let me make it clear that McCusker is a complete barking lunatic.
# This is just about the stupidest file format I've ever seen.
#
# http://www.mozilla.org/mailnews/arch/mork/primer.txt
# http://jwz.livejournal.com/312657.html
# http://www.jwz.org/doc/mailsum.html
# http://bugzilla.mozilla.org/show_bug.cgi?id=241438
#
# In brief, let's count its sins:
#
# - Two different numerical namespaces that overlap.
#
# - It can't decide what kind of character-quoting syntax to use:
# Backslash? Hex encoding with dollar-sign?
#
# - C++ line comments are allowed sometimes, but sometimes // is just
# a pair of characters in a URL.
#
# - It goes to all this serious compression effort (two different
# string-interning hash tables) and then writes out Unicode strings
# without using UTF-8: writes out the unpacked wchar_t characters!
#
# - Worse, it hex-encodes each wchar_t with a 3-byte encoding,
# meaning the file size will be 3x or 6x (depending on whether
# whchar_t is 2 bytes or 4 bytes.)
#
# - It masquerades as a "textual" file format when in fact it's just
# another binary-blob file, except that it represents all its magic
# numbers in ASCII. It's not human-readable, it's not hand-editable,
# so the only benefit there is to the fact that it uses short lines
# and doesn't use binary characters is that it makes the file bigger.
# Oh wait, my mistake, that isn't actually a benefit at all.
#
# Pure comedy.
#
##############################################################################
require 5;
use diagnostics;
use strict;
use POSIX qw(strftime);
my $progname = $0; $progname =~ s@.*/@@g;
my $version = q{ $Revision: 2.11 $ }; $version =~ s/^[^0-9]+([0-9.]+).*$/$1/;
my $verbose = 0;
my $show_all_p = 1;
my (%key_table, %val_table, %row_hash);
my ($total, $skipped) = (0, 0);
# Returns a list of hashes, the contents of the mork file.
#
sub mork_parse_file {
my ($file, $age) = @_;
local $/ = undef;
local *IN;
my $since = ($age ? time() - $age : 0);
##########################################################################
# Define the messy regexen up here
##########################################################################
my $top_level_comment = qr@//.*\n@;
my $key_table_re = qr/ < \s* < # "< <"
\( a=c \) > # "(a=c)>"
(?> ([^>]*) ) > \s* # Grab anything that's not ">"
/sx;
my $value_table_re = qr/ < ( .*?\) )> \s* /sx;
my $table_re = qr/ \{ -? # "{" or "{-"
[\da-f]+ : # hex, ":"
(?> .*?\{ ) # Eat up to a {...
((?> .*?\} ) # and then the closing }...
(?> .*?\} )) # Finally, grab the table section
\s* /six;
my $row_re = qr/ ( (?> \[ [^]]* \] # "["..."]"
\s*)+ ) # Perhaps repeated many times
/sx;
my $section_begin_re = qr/ \@\$\$\{ # "@$${"
([\dA-F]+) # hex
\{\@ \s* # "{@"
/six;
my $section_end_re = undef;
my $section = "top level";
##########################################################################
# Read in the file.
##########################################################################
open (IN, "<$file") || error ("$file: $!");
print STDERR "$progname: reading $file...\n" if ($verbose);
my $body = <IN>;
close IN;
$body =~ s/\\\)/\$29/gs; # close-paren is quoted with a backslash;
# convert to hex.
$body =~ s/\\\n//gs; # backslash at end of line is continuation.
##########################################################################
# Figure out what we're looking at, and parse it.
##########################################################################
print STDERR "$progname: $file: parsing...\n" if ($verbose);
pos($body) = 0;
my $length = length($body);
while( pos($body) < $length ) {
# Key table
if ( $body =~ m/\G$key_table_re/gc ) {
mork_parse_key_table($file, $section, $1);
# Values
} elsif ( $body =~ m/\G$value_table_re/gco ) {
mork_parse_value_table($file, $section, $1);
# Table
} elsif ( $body =~ m/\G$table_re/gco ) {
mork_parse_table($file, $section, $age, $since, $1);
# Rows (-> table)
} elsif ( $body =~ m/\G$row_re/gco ) {
mork_parse_table($file, $section, $age, $since, $1);
# Section begin
} elsif ( $body =~ m/\G$section_begin_re/gco ) {
$section = $1;
$section_end_re = qr/\@\$\$\}$section\}\@\s*/s;
# Section end
} elsif ( $section_end_re && $body =~ m/\G$section_end_re/gc ) {
$section_end_re = undef;
$section = "top level";
# Comment
} elsif ( $body =~ m/\G$top_level_comment/gco ) {
#no-op
} else {
# $body =~ m/\G (.{0,300}) /gcsx; print "<$1>\n";
error("$file: $section: Cannot parse");
}
}
if($section_end_re) {
error("$file: Unterminated section $section");
}
print STDERR "$progname: $file: sorting...\n" if ($verbose);
my @entries = sort { $b->{LastVisitDate} <=>
$a->{LastVisitDate} } values(%row_hash);
print STDERR "$progname: $file: done! ($total total, $skipped skipped)\n"
if ($verbose);
(%key_table, %val_table, %row_hash, $total, $skipped) = ();
return \@entries;
}
##########################################################################
# parse a row and column table
##########################################################################
sub mork_parse_table {
my($file, $section, $age, $since, $table_part) = (@_);
print STDERR "\n" if ($verbose > 3);
# Assumption: no relevant spaces in values in this section
$table_part =~ s/\s+//g;
# print $table_part; #exit(0);
#Grab each complete [...] block
while( $table_part =~ m/\G [^[]* \[ # find a "["
( [^]]+ ) \] # capture up to "]"
/gcx ) {
$_ = $1;
my %hash;
my ($id, @cells) = split (m/[()]+/s);
next unless scalar(@cells);
# Trim junk
$id =~ s/^-//;
$id =~ s/:.*//;
if($row_hash{$id}) {
%hash = ( %{$row_hash{$id}} );
} else {
%hash = ( 'ID' => $id,
'LastVisitDate' => 0 );
}
foreach (@cells) {
next unless $_;
my ($keyi, $which, $vali) =
m/^\^ ([-\dA-F]+)
([\^=])
(.*)
$/xi;
error ("$file: unparsable cell: $_\n") unless defined ($vali);
# If the key isn't in the key table, ignore it
#
my $key = $key_table{$keyi};
next unless defined($key);
my $val = ($which eq '='
? $vali
: $val_table{$vali});
if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') {
$val = int ($val / 1000000); # we don't need milliseconds, dude.
}
$hash{$key} = $val;
#print "$id: $key -> $val\n";
}
if ($age && ($hash{LastVisitDate} || $since) < $since) {
print STDERR "$progname: $file: skipping old: " .
"$hash{LastVisitDate} $hash{URL}\n"
if ($verbose > 3);
$skipped++;
next;
}
$total++;
$row_hash{$id} = \%hash;
}
}
##########################################################################
# parse a values table
##########################################################################
sub mork_parse_value_table {
my($file, $section, $val_part) = (@_);
return unless $val_part;
my @pairs = split (m/\(([^\)]+)\)/, $val_part);
$val_part = undef;
print STDERR "\n" if ($verbose > 3);
foreach (@pairs) {
next unless (m/[^\s]/s);
my ($key, $val) = m/([\dA-F]*)[\t\n ]*=[\t\n ]*(.*)/i;
if (! defined ($val)) {
print STDERR "$progname: $file: $section: unparsable val: $_\n";
next;
}
# Assume that URLs and LastVisited are never hexilated; so
# don't bother unhexilating if we won't be using Name, etc.
if($show_all_p && $val =~ m/\$/) {
# Approximate wchar_t -> ASCII and remove NULs
$val =~ s/\$00//g; # faster if we remove these first
$val =~ s/\$([\dA-F]{2})/chr(hex($1))/ge;
}
$val_table{$key} = $val;
print STDERR "$progname: $file: $section: val $key = \"$val\"\n"
if ($verbose > 3);
}
}
##########################################################################
# parse a key table
##########################################################################
sub mork_parse_key_table {
my ($file, $section, $key_table) = (@_);
print STDERR "\n" if ($verbose > 3);
$key_table =~ s@\s+//.*$@@gm;
my @pairs = split (m/\(([^\)]+)\)/s, $key_table);
$key_table = undef;
foreach (@pairs) {
next unless (m/[^\s]/s);
my ($key, $val) = m/([\dA-F]+)\s*=\s*(.*)/i;
error ("$file: $section: unparsable key: $_") unless defined ($val);
# If we're only emitting URLs and dates, don't even bother
# saving the other fields that we aren't interested in.
#
next if (!$show_all_p &&
$val ne 'URL' && $val ne 'LastVisitDate' &&
$val ne 'VisitCount');
$key_table{$key} = $val;
print STDERR "$progname: $file: $section: key $key = \"$val\"\n"
if ($verbose > 3);
}
}
sub html_quote {
my ($s) = @_;
$s =~ s/&/&/g;
$s =~ s/</</g;
$s =~ s/>/>/g;
$s =~ s/\"/"/g;
return $s;
}
sub html_wrap {
my ($s) = @_;
$s = html_quote ($s);
# while there are non-wrappable chunks of 30 characters,
# insert wrap points at certain punctuation characters every 10 characters.
while ($s =~ m/[^\s]{30}/s) {
last unless ($s =~ s@([^\s]{10})([/;,])([^/\s])@$1$2 $3@gs ||
$s =~ s@([^\s]{10})([-_\$\#?.]|&|%(2F|2C|26))@$1 $2@gs);
}
# if we still have non-wrappable chunks of 40 characters,
# insert wrap points every 30 characters no matter what.
while ($s =~ m/[^\s]{40}/s) {
last unless ($s =~ s@([^\s]{30})@$1 @gs);
}
return $s;
}
sub format_urls {
my ($results, $html_p) = @_;
print "<TABLE BORDER=0 CELLPADDING=" . ($show_all_p ? "4" : "0") .
" CELLSPACING=0>\n"
if ($html_p);
foreach my $hash (@$results) {
if ($show_all_p) {
#
# Print every field in the hash.
#
if ($html_p) {
print " <TR>\n";
print " <TD NOWRAP ALIGN=RIGHT VALIGN=TOP>$hash->{ID} </TD>\n";
print " <TD NOWRAP>\n";
print " <TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>\n";
}
my %key_sort_table = (
'ID' => ' 0 ',
'URL' => ' 1 ',
'Name' => ' 2 ',
'Hostname' => ' 3 ',
'FirstVisitDate' => ' 4 ',
'LastVisitDate' => ' 5 '
);
foreach my $key (sort { ($key_sort_table{$a} || $a) cmp
($key_sort_table{$b} || $b)
} (keys(%$hash))) {
my $val = $hash->{$key};
if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') {
$val = localtime ($val);
}
if ($html_p) {
next if ($key eq 'ID');
$key = html_quote ($key);
$val = ($key eq 'URL'
? "<A HREF=\"$val\">" . html_wrap ($val) . "</A>"
: html_wrap ($val));
print " <TR>\n";
print " <TD VALIGN=TOP NOWRAP ALIGN=RIGHT>$key: </TD>\n";
print " <TD VALIGN=TOP>$val</TD>\n";
print " </TR>\n";
} else {
print sprintf ("%14s = %s\n", $key, $val);
}
}
if ($html_p) {
print " </TABLE>\n";
print " </TD>\n";
print " </TR>\n";
}
print "\n";
} else {
#
# Print just the URLs and their last-load-times.
#
my $url = $hash->{'URL'};
my $date = $hash->{'LastVisitDate'} || 0;
my $count = $hash->{'VisitCount'} || 1;
next unless defined ($url);
if ($html_p) {
$date = strftime("%d %b %l:%M %p", localtime ($date));
my $u2 = html_wrap ($url);
print " <TR>";
print "<TD VALIGN=TOP ALIGN=RIGHT NOWRAP>";
print "($count) " if ($count > 1);
print "$date </TD>";
print "<TD VALIGN=TOP><A HREF=\"$url\">$u2</A></TD>";
print "</TR>\n";
} else {
print "$date\t$count\t$url\n";
}
}
}
print "</TABLE>\n" if ($html_p);
}
sub error {
($_) = @_;
print STDERR "$progname: $_\n";
exit 1;
}
sub usage {
print STDERR "usage: $progname [--verbose] [--html] [--age secs] " .
"mork-input-file\n" .
"\t'age' can be of the form '2h', '3d', etc.\n";
exit 1;
}
sub main {
my ($file, $age, $html_p);
while ($#ARGV >= 0) {
$_ = shift @ARGV;
if ($_ eq "--verbose") { $verbose++; }
elsif (m/^-v+$/) { $verbose += length($_)-1; }
elsif ($_ eq "--age") { $age = shift @ARGV; }
elsif ($_ eq "--html") { $html_p = 1; }
elsif (m/^-./) { usage; }
elsif (!defined($file)) { $file = $_; }
else { usage; }
}
usage() unless defined($file);
$show_all_p = ($verbose > 1);
if (!$age) {
} elsif ($age =~ m/^(\d+)\s*s(ec(onds?)?)?$/i) {
$age = $1 + 0;
} elsif ($age =~ m/^(\d+)\s*m(in(utes?)?)?$/i) {
$age = $1 * 60;
} elsif ($age =~ m/^(\d+)\s*h(ours?)?$/i) {
$age = $1 * 60 * 60;
} elsif ($age =~ m/^(\d+)\s*d(ays?)?$/i) {
$age = $1 * 60 * 60 * 24;
} elsif ($age =~ m/^(\d+)\s*w(eeks?)?$/i) {
$age = $1 * 60 * 60 * 24 * 7;
} elsif ($age =~ m/^(\d+)\s*m(on(ths?)?)?$/i) {
$age = $1 * 60 * 60 * 24 * 30;
} elsif ($age =~ m/^(\d+)\s*y(ears?)?$/i) {
$age = $1 * 60 * 60 * 24 * 365;
} else {
error ("unparsable: --age $age");
}
my $results = mork_parse_file ($file, $age);
format_urls ($results, $html_p);
}
main;
exit 0;