dg

#!/usr/bin/perl -w

use strict;
use warnings;
use English;
use Getopt::Std;
use File::Basename;
use File::Find;

my $kdb_dnm = "$ENV{HOME}/my/repos/pers";

my($scriptname, $scriptdirs, $scriptsuffix) = fileparse($0);

sub die_w_usage {
   print STDERR <<"EOT";
$scriptname: diary-grep
usage: $scriptname [-h] [-a] [-c] [-d] [-f kdbfile] term [term [ ... ]]
  -h   this help
  -a   search all kdb files
  -c   entry search (entry=text betw date markers)
  -d   display date only
  -f kdbfile  search only kdbfile
EOT
   die "abend\n"
   }

our ($opt_a,$opt_c,$opt_d,$opt_f,$opt_h);
getopts('acdf:h');  # -a is boolean flag, -f takes an argument
                    # Sets $opt_* as a side effect.

$opt_h           && die_w_usage;
scalar @ARGV > 0 || die_w_usage;

if( scalar @ARGV == 1 && $opt_d ) {
   $opt_c = undef;
   $opt_d = undef;
   }

# stolen from my https://github.com/fwmechanic/file-collection-searcher/blob/master/search-files

sub gen_re_match_all_anyorder {
   # https://www.perlmonks.org/?node_id=308753 (thread: https://www.perlmonks.org/?node_id=308744)
   # to make a specialized version of \b that views "-" and "/" as "word characters" (sort of), you might use something like this:
   # my $w = '\w/-';
   my $w = shift or die "gen_re_match_all_anyorder: no args?";
   my $b = "(?:(*negative_lookbehind:[$w])(*positive_lookahead:[$w])|(*positive_lookbehind:[$w])(*negative_lookahead:[$w]))";
   # my @words = ($rec =~ /${b}[$w]+${b}/g);

   # The following implements a brute-force solution to the performance problem
   # caused by use of \b (or $b, the specialized version of \b) to implement
   # whole-word search terms:
   #
   # The returned search regex consists of a sequence of look-ahead-assertions
   # (LAA), each matching one user search term.  Previously, a word search term
   # would be wrapped in \b's before being inserted in the (one and only) LAA
   # sequence in input (i.e. user-provided) order, however the presence of \b
   # (caused by the presence of ANY word search term) caused a huge (100% == 2x)
   # performance hit.
   #
   # This is resolved by adding _TWO_ LAA's into the returned LAA sequence for
   # each input word search term:
   # 1. a non-word-search (string) LAA is added to @strs for every search term.
   # 2. a word-search (\b) LAA is added to @words for every word search term.
   #
   # The returned sequence of LAA's is the concatenation of @strs followed by
   # @words.  Thus all candidate strings must FIRST pass all @strs LAA's; these
   # are very fast checks.  Only those few candidate strings passing all @strs
   # LAA's undergo checking against the (SLOW) @words LAA's.  Assuming only a low
   # percentage of candidate strings pass all @strs LAA's, the performance impact
   # of the trailing (SLOW) @words LAA's is reduced to almost nothing (and
   # testing shows this to be true).
   #
   # Given that our search is optimized by failing each candidate string as
   # quickly as possible, a further optimization is to sort @strs LAA's by
   # descending length: this causes the longest strings among the input search
   # terms to be searched for first.  The naive idea being that longer strings
   # are less likey to be found than short strings.
   #
   my (@strs,@words);
   for my $rawterm (@_) {  # construct regex matching lines containing, in any order, ALL of @_
      my $term = quotemeta( $rawterm );  #  match term
      push @strs,  "(*positive_lookahead:.*$term)"; # https://stackoverflow.com/a/4389683 https://stackoverflow.com/questions/4389644/regex-to-match-string-containing-two-names-in-any-order
      push @words, "(*positive_lookahead:.*$b$term$b)" if ($term =~ m=[A-Z]=) && ($term !~ m=[a-z]=); # all term alphas are caps (i.e. at least one uppercase-alpha and no lowercase-alphas)?: match word as defined by $b + $w
      }
   @strs = sort { length $b <=> length $a } @strs;  # try to find longest strings first
   my $rv = '^(?i)' . join('', @strs) . join('', @words) . '.*$'; # print "\npat=$rv\n";
   return qr($rv);
   }

sub all_files {
   my @found;
   my $wanted = sub {
      return unless -f;
      return unless $File::Find::name =~ m|\.kdb$|i;
      push @found, $File::Find::name;
      };
   find( $wanted, $kdb_dnm );
   return sort @found;
   }

my @files;

if( $opt_a ) {     @files = all_files();  }
elsif( $opt_f ) {  @files = ( "$kdb_dnm/$opt_f" ); }
else {             @files = ( "$kdb_dnm/daily_notes.kdb" ); }

my $rex = gen_re_match_all_anyorder( '\w_', @ARGV );

my $dtag = ''; # current diary-entry's date-tag (aka timestamp)
my @dtxt;      # current diary-entry's text (lines)
my ($fnmCopy,%fnmsWhit);
my ($chkDtxtMatch,$chkLnHit);  # methods
sub fnmPfx {
   print "$fnmCopy\n" unless exists $fnmsWhit{$fnmCopy};
   $fnmsWhit{$fnmCopy} = 1;
   }
if( $opt_c ) {
   $chkDtxtMatch = sub {  # check if @dtxt matches
      my $all = join ' ', @dtxt;
      # print "ALL: " . $all, "\n";
      if( $all =~ m|$rex|o ) {
         $all =~ s/\W//g;
         fnmPfx();
         if( $opt_d ) {  print $dtag,                           "\n"; }
         else {          print $dtag, ' ', join( "\n", @dtxt ), "\n"; }
         }
      }
   }
else {
   $chkLnHit = sub {
      if( m|$rex|o ) {
         s/^\s+|\s+$//g;
         fnmPfx();
         print "$dtag:$_\n";  # $fnm:$.:
         }
      }
   }
for my $fnm ( @files ) {
   open my $ifh, '<', $fnm or die "abend cannot open $fnm for reading: $!\n";
   $fnmCopy = $fnm;  # annoying Perl for variable scoping rule forces this to expose outside loop
   # print "++++ $fnm\n" unless exists $fnmsWhit{$fnm};
   while (<$ifh>) {
      chomp;
    # my ($dt,$dow) = m/^(\d{8}(?:_\d{4,6})?)(\s+(?:Sunday|Sun|Monday|Mon|Tuesday|Tues|Tue|Wednesday|Wed|Thursday|Thurs|Thu|Thur|Friday|Fri|Saturday|Sat))?/;  # WORKS
      my ($dt,$dow) = m/^(\d{8}(?:_\d{4,6})?)(\s+(?:[SMTWF][unonuesednesdhursriatur]{2,5}day|Sun|Mon|Tues|Tue|Wed|Thurs|Thur|Thu|Fri|Sat)\b)?/;                # WORKS
      if( $dt ) {
         $chkDtxtMatch->() if $chkDtxtMatch;  # we're about to replace @dtxt; before doing so, check its presumed full accumulation for a match
         $dtag = substr $dt, 0, 8;
         # print $dtag . "\n";
         # print $dow . "#\n" if $dow;
         $_ = substr $_, length($dt) + ($dow ? length($dow) : 0);
         @dtxt = ( $_ );  # replace @dtxt
         }
      else {
         push( @dtxt, $_ );
         }
      $chkLnHit->( $_ ) if $chkLnHit;
      }
   $chkDtxtMatch->() if $chkDtxtMatch;
   close $ifh;
   }