Skip to content

Commit

Permalink
glob matcher: match non-wildcard names as is, without needing regex.
Browse files Browse the repository at this point in the history
Simplify the regex by short-circuiting: non-wildcard names are matched directly, as-is.
Previously, regular expressions could get really large if someone added a
set of direct matches to an include/exclude pattern.
  • Loading branch information
hzeller committed Jun 1, 2024
1 parent 09e216f commit 2ad955c
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 53 deletions.
1 change: 1 addition & 0 deletions bant/util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ cc_library(
srcs = ["glob-match-builder.cc"],
hdrs = ["glob-match-builder.h"],
deps = [
"@abseil-cpp//absl/container:flat_hash_set",
"@abseil-cpp//absl/strings",
"@re2",
],
Expand Down
134 changes: 93 additions & 41 deletions bant/util/glob-match-builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,72 +17,124 @@

#include "bant/util/glob-match-builder.h"

#include <cstdlib>
#include <functional>
#include <memory>
#include <set>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_replace.h"
#include "re2/re2.h"

namespace bant {
void GlobMatchBuilder::AddIncludePattern(std::string_view pattern) {
AddPatternAsRegex(pattern, &include_pattern_);
namespace {
// Matches paths against a set of names to be compared verbatim and, for
// names that contained wildcards, against one combined regular expression.
class PathMatcher {
 public:
  // "regex" is the alternation of all wildcard patterns already converted
  // to regular expression syntax; it is empty if there are none.
  // "match_set" contains the names that are matched as-is.
  PathMatcher(std::string_view regex,
              absl::flat_hash_set<std::string> &&match_set)
      : has_regex_(!regex.empty()),
        pattern_re_(regex),
        verbatim_match_(std::move(match_set)) {}

  // Returns true if "s" is one of the verbatim names or is fully matched
  // by the regular expression.
  bool Match(std::string_view s) const {
    if (verbatim_match_.contains(s)) return true;  // Simple and fast.
    // An empty regex means "there are no wildcard patterns", not "match the
    // empty string". Without this guard, a full match of "" against the
    // empty expression succeeds and "" is accepted even though it is not in
    // the verbatim set.
    return has_regex_ && RE2::FullMatch(s, pattern_re_);
  }

 private:
  const bool has_regex_;  // False if constructed with an empty regex.
  RE2 pattern_re_;
  absl::flat_hash_set<std::string> verbatim_match_;
};

// Needs to be shared, as RE2 can't be copied and std::function also can't
// things std::move()'ed into it.
static std::shared_ptr<PathMatcher> MakeFilenameMatcher(
const std::set<std::string> &patterns) {
std::vector<std::string> re_or_patterns;
absl::flat_hash_set<std::string> verbatim_match;
for (const std::string &p : patterns) {
if (p.contains('*')) {
const std::string escape_special = RE2::QuoteMeta(p); // quote everything
re_or_patterns.emplace_back( // ... then unquote the pattern back
absl::StrReplaceAll(escape_special, {{R"(\*\*)", ".*"}, //
{R"(\*)", "[^/]*"}}));
} else {
verbatim_match.insert(p); // Simple and fast.
}
}
return std::make_shared<PathMatcher>(absl::StrJoin(re_or_patterns, "|"),
std::move(verbatim_match));
}
void GlobMatchBuilder::AddExcludePattern(std::string_view pattern) {
AddPatternAsRegex(pattern, &exclude_pattern_);

static std::shared_ptr<PathMatcher> MakeDirectoryMatcher(
const std::set<std::string> &patterns) {
std::set<std::string> re_or_patterns; // sorted maybe beneficial for RE2
absl::flat_hash_set<std::string> verbatim_match;
for (std::string_view p : patterns) {
const size_t last_slash = p.find_last_of('/');
if (last_slash == std::string_view::npos) {
verbatim_match.insert("");
continue;
}
p = p.substr(0, last_slash); // Only directories for patterns
// TODO: is it allowed to have patterns like '**.txt' with the '**' not
// in directory ? Because then we just snipped it off and it won't work...
if (p.contains('*')) {
// We need to convert file-patterns into directory patterns. Directories
// only go up to the last element and we need to match a prefix of
// directory elments. So foo/bar/baz needs to match foo(/bar(/baz)?)?
const std::string escape_special = RE2::QuoteMeta(p); // quote everything
std::string dir_pattern = // ... then unquote the pattern back
absl::StrReplaceAll(escape_special, {{R"(\*\*)", ".*"}, //
{R"(\*)", "[^/]*"}});
// Now, make this a prefix-match by grouping each part.
const int parens =
absl::StrReplaceAll({{R"(\/)", R"((\/)"}}, &dir_pattern);
for (int i = 0; i < parens; ++i) {
dir_pattern.append(")?");
}
re_or_patterns.insert(dir_pattern);
} else {
size_t pos = 0;
for (;;) {
const size_t next = p.find_first_of('/', pos);
if (next == std::string::npos) break;
verbatim_match.insert(std::string{p.substr(0, next)});
pos = next + 1;
}
verbatim_match.insert(std::string{p});
}
}
return std::make_shared<PathMatcher>(absl::StrJoin(re_or_patterns, "|"),
std::move(verbatim_match));
}
} // namespace

void GlobMatchBuilder::AddPatternAsRegex(std::string_view pattern,
std::vector<std::string> *receiver) {
const std::string escape_special = RE2::QuoteMeta(pattern);
receiver->emplace_back(
absl::StrReplaceAll(escape_special, {{R"(\*\*)", ".*"}, //
{R"(\*)", "[^/]*"}}));
// Public interface
// Remembers "pattern" as one of the globs a path has to match to be included.
void GlobMatchBuilder::AddIncludePattern(std::string_view pattern) {
  include_pattern_.emplace(pattern);
}
// Remembers "pattern" as one of the globs that exclude a path if it matches.
void GlobMatchBuilder::AddExcludePattern(std::string_view pattern) {
  exclude_pattern_.emplace(pattern);
}

std::function<bool(std::string_view)>
GlobMatchBuilder::BuildFileMatchPredicate() {
auto include_re = std::make_shared<RE2>(absl::StrJoin(include_pattern_, "|"));
auto exclude_re = std::make_shared<RE2>(absl::StrJoin(exclude_pattern_, "|"));
auto include = MakeFilenameMatcher(include_pattern_);
auto exclude = MakeFilenameMatcher(exclude_pattern_);
return [=](std::string_view s) {
if (!RE2::FullMatch(s, *include_re)) {
return false;
}
return !RE2::FullMatch(s, *exclude_re);
if (!include->Match(s)) return false;
return !exclude->Match(s);
};
}

std::function<bool(std::string_view)>
GlobMatchBuilder::BuildDirectoryMatchPredicate() {
std::set<std::string> unique_patterns;

// We need to convert file-patterns into directory patterns. Directories
// only go up to the last element and we need to match a prefix of complete
// directory elements. So foo/bar/baz needs to match foo(/bar(/baz)?)?
// TODO: is it easily possible to derive a negative pattern ?
for (const std::string_view file_pattern : include_pattern_) {
std::string dir_pattern;
if (file_pattern.ends_with(".*")) {
dir_pattern = file_pattern;
} else {
auto last_slash_pos = file_pattern.rfind(R"(\/)");
if (last_slash_pos != std::string_view::npos) {
dir_pattern = file_pattern.substr(0, last_slash_pos);
}
}
const int parens = absl::StrReplaceAll({{R"(\/)", R"((\/)"}}, &dir_pattern);
for (int i = 0; i < parens; ++i) {
dir_pattern.append(")?");
}
unique_patterns.insert(dir_pattern);
}

auto include_re = std::make_shared<RE2>(absl::StrJoin(unique_patterns, "|"));
return [=](std::string_view s) { return RE2::FullMatch(s, *include_re); };
auto dir_matcher = MakeDirectoryMatcher(include_pattern_);
return [=](std::string_view s) { return dir_matcher->Match(s); };
}

} // namespace bant
9 changes: 3 additions & 6 deletions bant/util/glob-match-builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
#define BANT_GLOB_MATCH_BUILDER_H

#include <functional>
#include <set>
#include <string>
#include <string_view>
#include <vector>

namespace bant {
// A builder taking glob-patterns and building predicates used in filesystem
Expand All @@ -39,11 +39,8 @@ class GlobMatchBuilder {
std::function<bool(std::string_view)> BuildFileMatchPredicate();

private:
static void AddPatternAsRegex(std::string_view pattern,
std::vector<std::string> *receiver);

std::vector<std::string> include_pattern_;
std::vector<std::string> exclude_pattern_;
std::set<std::string> include_pattern_;
std::set<std::string> exclude_pattern_;
};
} // namespace bant

Expand Down
41 changes: 35 additions & 6 deletions bant/util/glob-match-builder_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include "gtest/gtest.h"

namespace bant {
TEST(GlobMatcchBuilderTest, NoDirectorySimpleFileGlob) {
TEST(GlobMatchBuilderTest, NoDirectorySimpleFileGlob) {
GlobMatchBuilder glob_builder;
glob_builder.AddIncludePattern("foo.txt");
glob_builder.AddIncludePattern("b*r.txt");
Expand All @@ -39,7 +39,7 @@ TEST(GlobMatcchBuilderTest, NoDirectorySimpleFileGlob) {
EXPECT_FALSE(dir_is_matching("anythingelse"));
}

TEST(GlobMatcchBuilderTest, ExactlyOneDir) {
TEST(GlobMatchBuilderTest, ExactlyOneDir) {
GlobMatchBuilder glob_builder;
glob_builder.AddIncludePattern("*/foo.txt");
glob_builder.AddIncludePattern("*/b*r.txt");
Expand All @@ -56,7 +56,7 @@ TEST(GlobMatcchBuilderTest, ExactlyOneDir) {
EXPECT_FALSE(dir_is_matching("foo/bar"));
}

TEST(GlobMatcchBuilderTest, MultiDir) {
TEST(GlobMatchBuilderTest, MultiDir) {
GlobMatchBuilder glob_builder;
glob_builder.AddIncludePattern("**/foo.txt");
glob_builder.AddIncludePattern("**/b*r.txt");
Expand All @@ -76,10 +76,13 @@ TEST(GlobMatcchBuilderTest, MultiDir) {
EXPECT_TRUE(dir_is_matching("foo/bar/baz"));
}

TEST(GlobMatcchBuilderTest, MultiDirWithPrefix) {
TEST(GlobMatchBuilderTest, MultiDirWithPrefix) {
GlobMatchBuilder glob_builder;
glob_builder.AddIncludePattern("a/**/foo.txt");
glob_builder.AddIncludePattern("b/**/b*r.txt");
glob_builder.AddIncludePattern("e/**/d/ddd.txt"); // multi dir
glob_builder.AddIncludePattern("e/*/g/ggg.txt"); // one dir
glob_builder.AddIncludePattern("f/g/h/b*r.txt");

auto file_is_matching = glob_builder.BuildFileMatchPredicate();
EXPECT_FALSE(file_is_matching("foo.txt"));
Expand All @@ -94,6 +97,19 @@ TEST(GlobMatcchBuilderTest, MultiDirWithPrefix) {
EXPECT_FALSE(file_is_matching("a/b/c/d/bar.txt"));
EXPECT_TRUE(file_is_matching("b/c/d/bar.txt"));

// Last match dir needs to be d/
EXPECT_FALSE(file_is_matching("e/x/y/z/ddd.txt"));
EXPECT_TRUE(file_is_matching("e/x/y/z/d/ddd.txt"));

// g/ only allows one in-between dir
EXPECT_TRUE(file_is_matching("e/x/g/ggg.txt"));
EXPECT_FALSE(file_is_matching("e/x/y/g/ggg.txt"));
EXPECT_FALSE(file_is_matching("e/g/ggg.txt"));

// Explicit dir prefix match
EXPECT_TRUE(file_is_matching("f/g/h/bar.txt"));
EXPECT_FALSE(file_is_matching("f/g/j/bar.txt"));

auto dir_is_matching = glob_builder.BuildDirectoryMatchPredicate();
EXPECT_FALSE(dir_is_matching("")); // We need to have at least one prefix
EXPECT_TRUE(dir_is_matching("a"));
Expand All @@ -104,18 +120,31 @@ TEST(GlobMatcchBuilderTest, MultiDirWithPrefix) {
EXPECT_TRUE(dir_is_matching("b/c"));
EXPECT_TRUE(dir_is_matching("b/c/d"));

EXPECT_FALSE(dir_is_matching("c"));
EXPECT_TRUE(dir_is_matching("f"));
EXPECT_TRUE(dir_is_matching("f/g"));
EXPECT_TRUE(dir_is_matching("f/g/h"));
EXPECT_FALSE(dir_is_matching("f/g/h/i"));

EXPECT_TRUE(dir_is_matching("e"));
EXPECT_TRUE(dir_is_matching("e/x"));
EXPECT_TRUE(dir_is_matching("e/x/y"));
EXPECT_TRUE(dir_is_matching("e/x/y/z"));
EXPECT_TRUE(dir_is_matching("e/x/y/z/d"));

EXPECT_FALSE(dir_is_matching("c")); // no prefix like that
}

TEST(GlobMatcchBuilderTest, ExcludeFiles) {
TEST(GlobMatchBuilderTest, ExcludeFiles) {
GlobMatchBuilder glob_builder;
glob_builder.AddIncludePattern("*.txt");
glob_builder.AddExcludePattern("*_internal*.txt");
glob_builder.AddExcludePattern("explicit-exclude.txt");

auto file_is_matching = glob_builder.BuildFileMatchPredicate();
EXPECT_TRUE(file_is_matching("foo.txt"));
EXPECT_TRUE(file_is_matching("bar.txt"));
EXPECT_TRUE(file_is_matching("foo_test.txt"));
EXPECT_FALSE(file_is_matching("explicit-exclude.txt"));

EXPECT_TRUE(file_is_matching("foo_intern.txt"));
EXPECT_FALSE(file_is_matching("foo_internal.txt"));
Expand Down

0 comments on commit 2ad955c

Please sign in to comment.