-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdg
161 lines (146 loc) · 6.49 KB
/
dg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/perl -w
use strict;
use warnings;
use English;
use Getopt::Std;
use File::Basename;
use File::Find;
my $kdb_dnm = "$ENV{HOME}/my/repos/pers";
my($scriptname, $scriptdirs, $scriptsuffix) = fileparse($0);
sub die_w_usage {
print STDERR <<"EOT";
$scriptname: diary-grep
usage: $scriptname [-h] [-a] [-c] [-d] [-f kdbfile] term [term [ ... ]]
-h this help
-a search all kdb files
-c entry search (entry=text betw date markers)
-d display date only
-f kdbfile search only kdbfile
EOT
die "abend\n"
}
our ($opt_a,$opt_c,$opt_d,$opt_f,$opt_h);
getopts('acdf:h'); # -a is boolean flag, -f takes an argument
# Sets $opt_* as a side effect.
$opt_h && die_w_usage;
scalar @ARGV > 0 || die_w_usage;
if( scalar @ARGV == 1 && $opt_d ) {
$opt_c = undef;
$opt_d = undef;
}
# stolen from my https://github.com/fwmechanic/file-collection-searcher/blob/master/search-files
sub gen_re_match_all_anyorder {
# https://www.perlmonks.org/?node_id=308753 (thread: https://www.perlmonks.org/?node_id=308744)
# to make a specialized version of \b that views "-" and "/" as "word characters" (sort of), you might use something like this:
# my $w = '\w/-';
my $w = shift or die "gen_re_match_all_anyorder: no args?";
my $b = "(?:(*negative_lookbehind:[$w])(*positive_lookahead:[$w])|(*positive_lookbehind:[$w])(*negative_lookahead:[$w]))";
# my @words = ($rec =~ /${b}[$w]+${b}/g);
# The following implements a brute-force solution to the performance problem
# caused by use of \b (or $b, the specialized version of \b) to implement
# whole-word search terms:
#
# The returned search regex consists of a sequence of look-ahead-assertions
# (LAA), each matching one user search term. Previously, a word search term
# would be wrapped in \b's before being inserted in the (one and only) LAA
# sequence in input (i.e. user-provided) order, however the presence of \b
# (caused by the presence of ANY word search term) caused a huge (100% == 2x)
# performance hit.
#
# This is resolved by adding _TWO_ LAA's into the returned LAA sequence for
# each input word search term:
# 1. a non-word-search (string) LAA is added to @strs for every search term.
# 2. a word-search (\b) LAA is added to @words for every word search term.
#
# The returned sequence of LAA's is the concatenation of @strs followed by
# @words. Thus all candidate strings must FIRST pass all @strs LAA's; these
# are very fast checks. Only those few candidate strings passing all @strs
# LAA's undergo checking against the (SLOW) @words LAA's. Assuming only a low
# percentage of candidate strings pass all @strs LAA's, the performance impact
# of the trailing (SLOW) @words LAA's is reduced to almost nothing (and
# testing shows this to be true).
#
# Given that our search is optimized by failing each candidate string as
# quickly as possible, a further optimization is to sort @strs LAA's by
# descending length: this causes the longest strings among the input search
# terms to be searched for first. The naive idea being that longer strings
# are less likey to be found than short strings.
#
my (@strs,@words);
for my $rawterm (@_) { # construct regex matching lines containing, in any order, ALL of @_
my $term = quotemeta( $rawterm ); # match term
push @strs, "(*positive_lookahead:.*$term)"; # https://stackoverflow.com/a/4389683 https://stackoverflow.com/questions/4389644/regex-to-match-string-containing-two-names-in-any-order
push @words, "(*positive_lookahead:.*$b$term$b)" if ($term =~ m=[A-Z]=) && ($term !~ m=[a-z]=); # all term alphas are caps (i.e. at least one uppercase-alpha and no lowercase-alphas)?: match word as defined by $b + $w
}
@strs = sort { length $b <=> length $a } @strs; # try to find longest strings first
my $rv = '^(?i)' . join('', @strs) . join('', @words) . '.*$'; # print "\npat=$rv\n";
return qr($rv);
}
sub all_files {
my @found;
my $wanted = sub {
return unless -f;
return unless $File::Find::name =~ m|\.kdb$|i;
push @found, $File::Find::name;
};
find( $wanted, $kdb_dnm );
return sort @found;
}
my @files;
if( $opt_a ) { @files = all_files(); }
elsif( $opt_f ) { @files = ( "$kdb_dnm/$opt_f" ); }
else { @files = ( "$kdb_dnm/daily_notes.kdb" ); }
my $rex = gen_re_match_all_anyorder( '\w_', @ARGV );
my $dtag = ''; # current diary-entry's date-tag (aka timestamp)
my @dtxt; # current diary-entry's text (lines)
my ($fnmCopy,%fnmsWhit);
my ($chkDtxtMatch,$chkLnHit); # methods
sub fnmPfx {
print "$fnmCopy\n" unless exists $fnmsWhit{$fnmCopy};
$fnmsWhit{$fnmCopy} = 1;
}
if( $opt_c ) {
$chkDtxtMatch = sub { # check if @dtxt matches
my $all = join ' ', @dtxt;
# print "ALL: " . $all, "\n";
if( $all =~ m|$rex|o ) {
$all =~ s/\W//g;
fnmPfx();
if( $opt_d ) { print $dtag, "\n"; }
else { print $dtag, ' ', join( "\n", @dtxt ), "\n"; }
}
}
}
else {
$chkLnHit = sub {
if( m|$rex|o ) {
s/^\s+|\s+$//g;
fnmPfx();
print "$dtag:$_\n"; # $fnm:$.:
}
}
}
for my $fnm ( @files ) {
open my $ifh, '<', $fnm or die "abend cannot open $fnm for reading: $!\n";
$fnmCopy = $fnm; # annoying Perl for variable scoping rule forces this to expose outside loop
# print "++++ $fnm\n" unless exists $fnmsWhit{$fnm};
while (<$ifh>) {
chomp;
# my ($dt,$dow) = m/^(\d{8}(?:_\d{4,6})?)(\s+(?:Sunday|Sun|Monday|Mon|Tuesday|Tues|Tue|Wednesday|Wed|Thursday|Thurs|Thu|Thur|Friday|Fri|Saturday|Sat))?/; # WORKS
my ($dt,$dow) = m/^(\d{8}(?:_\d{4,6})?)(\s+(?:[SMTWF][unonuesednesdhursriatur]{2,5}day|Sun|Mon|Tues|Tue|Wed|Thurs|Thur|Thu|Fri|Sat)\b)?/; # WORKS
if( $dt ) {
$chkDtxtMatch->() if $chkDtxtMatch; # we're about to replace @dtxt; before doing so, check its presumed full accumulation for a match
$dtag = substr $dt, 0, 8;
# print $dtag . "\n";
# print $dow . "#\n" if $dow;
$_ = substr $_, length($dt) + ($dow ? length($dow) : 0);
@dtxt = ( $_ ); # replace @dtxt
}
else {
push( @dtxt, $_ );
}
$chkLnHit->( $_ ) if $chkLnHit;
}
$chkDtxtMatch->() if $chkDtxtMatch;
close $ifh;
}