-
Notifications
You must be signed in to change notification settings - Fork 388
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
perf: walk workspace directories in parallel #1893
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,10 +23,12 @@ import ( | |
"os" | ||
"path" | ||
"path/filepath" | ||
"sort" | ||
"strings" | ||
|
||
"github.com/bazelbuild/bazel-gazelle/config" | ||
"github.com/bazelbuild/bazel-gazelle/rule" | ||
"golang.org/x/sync/errgroup" | ||
) | ||
|
||
// Mode determines which directories Walk visits and which directories | ||
|
@@ -122,25 +124,26 @@ func Walk(c *config.Config, cexts []config.Configurer, dirs []string, mode Mode, | |
log.Printf("error loading .bazelignore: %v", err) | ||
} | ||
|
||
visit(c, cexts, isBazelIgnored, knownDirectives, updateRels, wf, c.RepoRoot, "", false) | ||
} | ||
|
||
func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnoredFunc, knownDirectives map[string]bool, updateRels *UpdateFilter, wf WalkFunc, dir, rel string, updateParent bool) { | ||
if isBazelIgnored(rel) { | ||
return | ||
trie, err := buildTrie(c, isBazelIgnored) | ||
if err != nil { | ||
log.Fatalf("error walking the file system: %v\n", err) | ||
} | ||
|
||
visit(c, cexts, knownDirectives, updateRels, trie, wf, c.RepoRoot, "", false) | ||
} | ||
|
||
func visit(c *config.Config, cexts []config.Configurer, knownDirectives map[string]bool, updateRels *UpdateFilter, trie *pathTrie, wf WalkFunc, dir, rel string, updateParent bool) { | ||
haveError := false | ||
|
||
// TODO: OPT: ReadDir stats all the files, which is slow. We just care about | ||
// names and modes, so we should use something like | ||
// golang.org/x/tools/internal/fastwalk to speed this up. | ||
ents, err := os.ReadDir(dir) | ||
if err != nil { | ||
log.Print(err) | ||
return | ||
ents := make([]fs.DirEntry, 0, len(trie.children)) | ||
jbedard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for _, node := range trie.children { | ||
ents = append(ents, *node.entry) | ||
} | ||
|
||
sort.Slice(ents, func(i, j int) bool { | ||
return ents[i].Name() < ents[j].Name() | ||
}) | ||
|
||
f, err := loadBuildFile(c, rel, dir, ents) | ||
if err != nil { | ||
log.Print(err) | ||
|
@@ -162,7 +165,7 @@ func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnored | |
var subdirs, regularFiles []string | ||
for _, ent := range ents { | ||
base := ent.Name() | ||
if isBazelIgnored(path.Join(rel, base)) || wc.isExcluded(rel, base) { | ||
if wc.isExcluded(rel, base) { | ||
continue | ||
} | ||
ent := resolveFileInfo(wc, dir, rel, ent) | ||
|
@@ -179,7 +182,7 @@ func visit(c *config.Config, cexts []config.Configurer, isBazelIgnored isIgnored | |
shouldUpdate := updateRels.shouldUpdate(rel, updateParent) | ||
for _, sub := range subdirs { | ||
if subRel := path.Join(rel, sub); updateRels.shouldVisit(subRel, shouldUpdate) { | ||
visit(c, cexts, isBazelIgnored, knownDirectives, updateRels, wf, filepath.Join(dir, sub), subRel, shouldUpdate) | ||
visit(c, cexts, knownDirectives, updateRels, trie.children[sub], wf, path.Join(dir, sub), subRel, shouldUpdate) | ||
} | ||
} | ||
|
||
|
@@ -356,3 +359,62 @@ func resolveFileInfo(wc *walkConfig, dir, rel string, ent fs.DirEntry) fs.DirEnt | |
} | ||
return fs.FileInfoToDirEntry(fi) | ||
} | ||
|
||
type pathTrie struct { | ||
children map[string]*pathTrie | ||
entry *fs.DirEntry | ||
} | ||
|
||
// Basic factory method to ensure the entry is properly copied | ||
func newTrie(entry fs.DirEntry) *pathTrie { | ||
return &pathTrie{entry: &entry} | ||
} | ||
|
||
func buildTrie(c *config.Config, isIgnored isIgnoredFunc) (*pathTrie, error) { | ||
trie := &pathTrie{ | ||
children: map[string]*pathTrie{}, | ||
} | ||
|
||
// A channel to limit the number of concurrent goroutines | ||
limitCh := make(chan struct{}, 100) | ||
fmeum marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 100 feels pretty hardcoded. Can we check the number of CPUs or max processes we can use? https://pkg.go.dev/runtime#NumCPU There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The operations are likely I/O bound, so a limit based on the number of CPUs would likely be far too low. Eventually we may want to make this configurable, but any reasonably high number should give a speedup. In the end, the Go runtime would know best how to schedule I/O-blocked goroutines. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Today it's all I/O so I figured hardcoding something is fine and number of CPUs isn't really relevant. However that might change soon if we start putting stuff like BUILD-parsing into these goroutines. |
||
|
||
// An error group to handle error propagation | ||
eg := errgroup.Group{} | ||
eg.Go(func() error { | ||
return walkDir(c.RepoRoot, "", &eg, limitCh, isIgnored, trie) | ||
}) | ||
|
||
return trie, eg.Wait() | ||
} | ||
|
||
// walkDir recursively and concurrently descends into the 'rel' directory and builds a trie | ||
func walkDir(root, rel string, eg *errgroup.Group, limitCh chan struct{}, isIgnored isIgnoredFunc, trie *pathTrie) error { | ||
limitCh <- struct{}{} | ||
defer (func() { <-limitCh })() | ||
|
||
entries, err := os.ReadDir(filepath.Join(root, rel)) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, entry := range entries { | ||
entryName := entry.Name() | ||
entryPath := path.Join(rel, entryName) | ||
|
||
// Ignore .git, empty names and ignored paths | ||
jbedard marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if entryName == "" || entryName == ".git" || isIgnored(entryPath) { | ||
continue | ||
} | ||
|
||
entryTrie := newTrie(entry) | ||
trie.children[entry.Name()] = entryTrie | ||
|
||
if entry.IsDir() { | ||
entryTrie.children = map[string]*pathTrie{} | ||
eg.Go(func() error { | ||
return walkDir(root, entryPath, eg, limitCh, isIgnored, entryTrie) | ||
}) | ||
} | ||
} | ||
return nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we make parallel walk configurable? It would be good to not enable by default, and allow users to opt in/out
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I didn't see this and then this was merged. We can probably do this fairly easily.
Lets just make sure we do this (disable it by default) instead of reverting if any issues come up now that there has been a release.