Skip to content

Commit

Permalink
contenthash: improve the correctness of needsScan
Browse files Browse the repository at this point in the history
Commit f724d6f ("contenthash: implement proper Linux symlink
semantics for needsScan") fixed issues with needScan's handling of
symlinks, but the logic used to figure out if a parent path is in the
cache was incorrect in a couple of ways:

 1. The optimisation in getFollowSymlinksCallback to avoid looking up /
    in the cache lead to the callback not being called when we go
    through /. The upshot is that needsScan(/non-existent-path) would
    always return true because we didn't check if / has been scanned.
 2. Similarly, the logic of skipping a cache lookup if the path is the
    same as the current path meant that we skipped the first / lookup
    (because we start with currentPath=/).
 3. Because needsScan would only store the _last_ good path, cases with
    symlink jumps to non-existent paths within directories already
    scanned would result in a re-scan that isn't necessary. Fix this by
    saving a set of prefix paths we have seen. Note that in combination
    with (1) and (2), if / has been scanned then needsScan will always
    return false now (because the / prefix is always checked against).

Fixes: f724d6f ("contenthash: implement proper Linux symlink semantics for needsScan")
Signed-off-by: Aleksa Sarai <[email protected]>
  • Loading branch information
cyphar committed Jun 19, 2024
1 parent 01d7739 commit 918c9f9
Showing 1 changed file with 56 additions and 28 deletions.
84 changes: 56 additions & 28 deletions cache/contenthash/checksum.go
Original file line number Diff line number Diff line change
Expand Up @@ -928,31 +928,68 @@ func (cc *cacheContext) checksum(ctx context.Context, root *iradix.Node[*CacheRe
return cr2, true, nil
}

// pathSet is a set of path prefixes that can be used to see if a given path is
// lexically a child of any path in the set. All paths provided to this set
// MUST be absolute and use / as the separator.
type pathSet struct {
// prefixes contains paths of the form "/a/b/".
prefixes []string
}

// add a path to the set. This is a no-op if includes(path) == true.
func (s *pathSet) add(p string) {
// Make sure the path is absolute and is clean. Callers are expected to
// ensure this, but just make sure.
p = path.Join("/", p)
if !s.includes(p) {
// Add a trailing slash to so that includes correctly detects /a/b as
// being a parent of /a/b/c but not /a/bc.
if p != "/" {
p += "/"
}
s.prefixes = append(s.prefixes, p)
}
}

// includes returns true iff there is a path in the pathSet which is a lexical
// parent of the given path. The provided path MUST be an absolute path and
// MUST NOT contain any ".." components, as they will be path.Clean'd.
func (s pathSet) includes(p string) bool {
// Make sure the path is absolute and is clean. Callers are expected to
// ensure this, but just make sure.
p = path.Join("/", p)
// Add a slash so that the prefix /a/b/ will match the path /a/b.
if p != "/" {
p += "/"
}
for _, prefix := range s.prefixes {
if strings.HasPrefix(p, prefix) {
return true
}
}
return false
}

// needsScan returns false if path is in the tree or a parent path is in tree
// and subpath is missing.
func (cc *cacheContext) needsScan(root *iradix.Node[*CacheRecord], path string, followTrailing bool) (bool, error) {
var (
lastGoodPath string
goodPaths pathSet
hasParentInTree bool
)
k := convertPathToKey(path)
_, cr, err := getFollowLinksCallback(root, k, followTrailing, func(subpath string, cr *CacheRecord) error {
// If we found a path that exists in the cache, add it to the set of
// known-scanned paths. Otherwise, verify whether the not-found subpath
// is inside a known-scanned path (we might have hit a "..", taking us
// out of the scanned paths, or we might hit a non-existent path inside
// a scanned path). getFollowLinksCallback iterates left-to-right, so
// we will always hit ancestors first.
if cr != nil {
// If the path is not a symlink, then for now we have a parent in
// the tree. Otherwise, we reset hasParentInTree because we
// might've jumped to a part of the tree that hasn't been scanned.
hasParentInTree = (cr.Type != CacheRecordTypeSymlink)
if hasParentInTree {
lastGoodPath = subpath
}
} else if hasParentInTree {
// If the current component was something like ".." and the subpath
// couldn't be found, then we need to invalidate hasParentInTree.
// In practice this means that our subpath needs to be prefixed by
// the last good path. We add a trailing slash to make sure the
// prefix is a proper lexical prefix (as opposed to /a/b being seen
// as a prefix of /a/bc).
hasParentInTree = strings.HasPrefix(subpath+"/", lastGoodPath+"/")
hasParentInTree = true
goodPaths.add(subpath)
} else {
hasParentInTree = goodPaths.includes(subpath)
}
return nil
})
Expand Down Expand Up @@ -1005,11 +1042,7 @@ func (cc *cacheContext) scanPath(ctx context.Context, m *mount, p string, follow
if err != nil {
return err
}
p := path.Join("/", filepath.ToSlash(rel))
if p == "/" {
p = ""
}
k := convertPathToKey(p)
k := convertPathToKey(keyPath(rel))
if _, ok := n.Get(k); !ok {
cr := &CacheRecord{
Type: CacheRecordTypeFile,
Expand Down Expand Up @@ -1073,7 +1106,7 @@ func getFollowLinksCallback(root *iradix.Node[*CacheRecord], k []byte, followTra
}

var (
currentPath = "/"
currentPath = ""
remainingPath = convertKeyToPath(k)
linksWalked int
cr *CacheRecord
Expand All @@ -1094,17 +1127,12 @@ func getFollowLinksCallback(root *iradix.Node[*CacheRecord], k []byte, followTra
// our current path contains no symlinks, we can just apply it
// leixically.
nextPath := path.Join("/", currentPath, part)
if nextPath == "/" {
// If we hit the root, just treat it as a directory.
currentPath = "/"
continue
}
if nextPath == currentPath {
// noop component
continue
}

cr, ok = root.Get(convertPathToKey(nextPath))
cr, ok = root.Get(convertPathToKey(keyPath(nextPath)))
if cb != nil {
if err := cb(nextPath, cr); err != nil {
return nil, nil, err
Expand Down

0 comments on commit 918c9f9

Please sign in to comment.