report each hardlink as its own file when scanning images

image scans previously collapsed a set of hardlinks onto a single file, so only
one path per inode showed up in results. dir scans report every hardlink path,
which made image vs dir SBOMs of the same filesystem diverge (and produce
different SPDX `packageVerificationCode` values for packages that own hardlinked
files).

now both image resolvers (squash and all-layers) surface each hardlink at its
own path as a regular file bound to the target's content, matching dir scans.

user-facing impact:
- SBOMs for images containing hardlinks will list more `file` entries
- SPDX `packageVerificationCode` values change for affected packages, now
  matching the equivalent `dir:` scan
- adds `file.NewVirtualLocationFromImage` to the public API

fixes #5019

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
Alex Goodman 2026-07-02 16:56:42 -04:00
parent 656a4d46d7
commit 26ddb2e23b
No known key found for this signature in database
5 changed files with 258 additions and 38 deletions

View File

@ -152,6 +152,31 @@ func NewLocationFromImage(accessPath string, ref file.Reference, img *image.Imag
}
}
// NewVirtualLocationFromImage builds a Location that is reported at realPath while its contents and metadata come
// from ref. It exists so a hardlink can be presented, at the hardlink's own path, as the type of file it points to;
// this keeps image results in parity with directory results, where a hardlink is indistinguishable from a regular
// file. The resulting RealPath (the hardlink's path) deliberately differs from ref.RealPath (the target's path), so
// callers must not assume the two are equal for locations created here.
func NewVirtualLocationFromImage(realPath, accessPath string, ref file.Reference, img *image.Image) Location {
	// The FileSystemID is the digest of the layer that holds the target ref. A hardlink and its target are always
	// materialized in the same layer tar (a tar hardlink entry can only reference a file within the same archive),
	// so the target's layer digest is also the layer of the hardlink path.
	coordinates := Coordinates{
		RealPath:     realPath,
		FileSystemID: img.FileCatalog.Layer(ref).Metadata.Digest,
	}

	data := LocationData{
		Coordinates: coordinates,
		AccessPath:  accessPath,
		ref:         ref,
	}

	return Location{
		LocationData: data,
		LocationMetadata: LocationMetadata{
			// start with an empty (non-nil) annotation map so callers can write to it directly
			Annotations: make(map[string]string),
		},
	}
}
// NewLocationFromDirectory creates a new Location representing the given path (extracted from the Reference) relative to the given directory.
func NewLocationFromDirectory(responsePath string, fd string, ref file.Reference) Location {
return Location{

View File

@ -53,8 +53,20 @@ func (r *ContainerImageAllLayers) HasPath(path string) bool {
return false
}
func (r *ContainerImageAllLayers) fileByRef(ref stereoscopeFile.Reference, uniqueFileIDs stereoscopeFile.ReferenceSet, layerIdx int) ([]stereoscopeFile.Reference, error) {
uniqueFiles := make([]stereoscopeFile.Reference, 0)
func (r *ContainerImageAllLayers) locationsByRef(ref stereoscopeFile.Reference, accessPath string, uniqueFileIDs stereoscopeFile.ReferenceSet, layerPos int) ([]file.Location, error) {
uniqueLocations := make([]file.Location, 0)
// if the access path is itself a hardlink, surface it as the underlying type it points to, at its own path
// (bound to the target's content), so that image results are in parity with directory results (which cannot
// tell a hardlink from a regular file). the path-based lookup is required because the search that produced ref
// already followed the basename link, collapsing the hardlink onto its target.
if ownRef, targetRef, ok := r.hardLinkAtPath(accessPath, r.layers[layerPos]); ok {
if !uniqueFileIDs.Contains(ownRef) {
uniqueFileIDs.Add(ownRef)
uniqueLocations = append(uniqueLocations, file.NewVirtualLocationFromImage(string(ownRef.RealPath), accessPath, targetRef, r.img))
}
return uniqueLocations, nil
}
// since there is potentially considerable work for each symlink/hardlink that needs to be resolved, let's check to see if this is a symlink/hardlink first
entry, err := r.img.FileCatalog.Get(ref)
@ -65,22 +77,41 @@ func (r *ContainerImageAllLayers) fileByRef(ref stereoscopeFile.Reference, uniqu
if entry.Type == stereoscopeFile.TypeHardLink || entry.Type == stereoscopeFile.TypeSymLink {
// a link may resolve in this layer or higher, assuming a squashed tree is used to search
// we should search all possible resolutions within the valid source
for _, subLayerIdx := range r.layers[layerIdx:] {
for _, subLayerIdx := range r.layers[layerPos:] {
resolvedRef, err := r.img.ResolveLinkByLayerSquash(ref, subLayerIdx)
if err != nil {
return nil, fmt.Errorf("failed to resolve link from layer (layer=%d ref=%+v): %w", subLayerIdx, ref, err)
}
if resolvedRef.HasReference() && !uniqueFileIDs.Contains(*resolvedRef.Reference) {
uniqueFileIDs.Add(*resolvedRef.Reference)
uniqueFiles = append(uniqueFiles, *resolvedRef.Reference)
uniqueLocations = append(uniqueLocations, file.NewLocationFromImage(accessPath, *resolvedRef.Reference, r.img))
}
}
} else if !uniqueFileIDs.Contains(ref) {
uniqueFileIDs.Add(ref)
uniqueFiles = append(uniqueFiles, ref)
uniqueLocations = append(uniqueLocations, file.NewLocationFromImage(accessPath, ref, r.img))
}
return uniqueFiles, nil
return uniqueLocations, nil
}
// hardLinkAtPath looks up path in the given layer and, when the entry found there is a hardlink, returns the
// hardlink's own reference followed by the reference of its resolved target, with ok=true. When the lookup fails,
// the path does not exist, the result carries no reference, or the entry is not a hardlink, two zero-value
// references and ok=false are returned. The lookup is intended not to follow the basename link, so the hardlink's
// own path is preserved. Note that this costs an extra tree walk per matched ref (path x layer); if it shows up in
// profiles, fold the hardlink check into the existing search resolution.
func (r *ContainerImageAllLayers) hardLinkAtPath(path string, layerIdx int) (stereoscopeFile.Reference, stereoscopeFile.Reference, bool) {
	var none stereoscopeFile.Reference

	// Query the squashed-to-layer view rather than only this layer's additions: a hardlink introduced in a lower
	// layer must still be detected when the path is searched at a higher layer, otherwise it would collapse onto
	// its target there.
	found, res, err := r.img.Layers[layerIdx].SquashedTree.File(stereoscopeFile.Path(path))
	if err == nil && found && res.HasReference() {
		linkRef := *res.Reference
		if targetRef, isHardLink := r.resolveHardLinkTarget(linkRef, layerIdx); isHardLink {
			return linkRef, targetRef, true
		}
	}

	return none, none, false
}
// FilesByPath returns all file.References that match the given paths from any layer in the image.
@ -112,14 +143,13 @@ func (r *ContainerImageAllLayers) FilesByPath(paths ...string) ([]file.Location,
}
}
results, err := r.fileByRef(*ref.Reference, uniqueFileIDs, idx)
locations, err := r.locationsByRef(*ref.Reference, path, uniqueFileIDs, idx)
if err != nil {
return nil, err
}
for _, result := range results {
l := file.NewLocationFromImage(path, result, r.img)
r.annotateLocation(&l)
uniqueLocations = append(uniqueLocations, l)
for i := range locations {
r.annotateLocation(&locations[i])
uniqueLocations = append(uniqueLocations, locations[i])
}
}
}
@ -158,14 +188,13 @@ func (r *ContainerImageAllLayers) FilesByGlob(patterns ...string) ([]file.Locati
}
}
refResults, err := r.fileByRef(*result.Reference, uniqueFileIDs, idx)
locations, err := r.locationsByRef(*result.Reference, string(result.RequestPath), uniqueFileIDs, idx)
if err != nil {
return nil, err
}
for _, refResult := range refResults {
l := file.NewLocationFromImage(string(result.RequestPath), refResult, r.img)
r.annotateLocation(&l)
uniqueLocations = append(uniqueLocations, l)
for i := range locations {
r.annotateLocation(&locations[i])
uniqueLocations = append(uniqueLocations, locations[i])
}
}
}
@ -233,14 +262,13 @@ func (r *ContainerImageAllLayers) FilesByMIMEType(types ...string) ([]file.Locat
continue
}
refResults, err := r.fileByRef(*ref.Reference, uniqueFileIDs, idx)
locations, err := r.locationsByRef(*ref.Reference, string(ref.RequestPath), uniqueFileIDs, idx)
if err != nil {
return nil, err
}
for _, refResult := range refResults {
l := file.NewLocationFromImage(string(ref.RequestPath), refResult, r.img)
r.annotateLocation(&l)
uniqueLocations = append(uniqueLocations, l)
for i := range locations {
r.annotateLocation(&locations[i])
uniqueLocations = append(uniqueLocations, locations[i])
}
}
}
@ -256,6 +284,11 @@ func (r *ContainerImageAllLayers) AllLocations(ctx context.Context) <-chan file.
tree := r.img.Layers[layerIdx].Tree
for _, ref := range tree.AllFiles(stereoscopeFile.AllTypes()...) {
l := file.NewLocationFromImage(string(ref.RealPath), ref, r.img)
// surface a hardlink as the underlying type it points to (at its own path) so image results match
// directory results, which cannot distinguish a hardlink from a regular file.
if targetRef, ok := r.resolveHardLinkTarget(ref, layerIdx); ok {
l = file.NewVirtualLocationFromImage(string(ref.RealPath), string(ref.RealPath), targetRef, r.img)
}
r.annotateLocation(&l)
select {
case <-ctx.Done():
@ -273,6 +306,21 @@ func (r *ContainerImageAllLayers) FileMetadataByLocation(location file.Location)
return fileMetadataByLocation(r.img, location)
}
// resolveHardLinkTarget reports the reference of the file a hardlink points to, resolved relative to the given
// layer. It returns (target, true) only when ref is cataloged as a hardlink and the link resolves to a reference.
// In every other case (catalog lookup error, non-hardlink entry, resolution error, or no resolved reference) it
// returns the input ref unchanged with false. Non-hardlinks are never resolved here, so symlinks keep their existing
// resolution semantics.
func (r *ContainerImageAllLayers) resolveHardLinkTarget(ref stereoscopeFile.Reference, layerIdx int) (stereoscopeFile.Reference, bool) {
	entry, err := r.img.FileCatalog.Get(ref)
	if err != nil {
		return ref, false
	}
	if entry.Type != stereoscopeFile.TypeHardLink {
		return ref, false
	}

	target, err := r.img.ResolveLinkByLayerSquash(ref, layerIdx)
	if err == nil && target.HasReference() {
		return *target.Reference, true
	}
	return ref, false
}
func (r *ContainerImageAllLayers) annotateLocation(l *file.Location) {
if !r.markVisibility || l == nil {
return
@ -282,26 +330,24 @@ func (r *ContainerImageAllLayers) annotateLocation(l *file.Location) {
annotation := file.VisibleAnnotation
// if we find a location for a path that matches the query (e.g. **/node_modules) but is not present in the squashed tree, skip it
ref, err := r.img.SquashedSearchContext.SearchByPath(l.RealPath, filetree.DoNotFollowDeadBasenameLinks)
if err != nil || !ref.HasReference() {
annotation = file.HiddenAnnotation
} else if ref.ID() != givenRef.ID() {
// we may have the path in the squashed tree, but this must not be in the same layer
if !r.pathResolvesToRef(l.RealPath, givenRef) {
annotation = file.HiddenAnnotation
}
// not only should the real path to the file exist, but the way we took to get there should also exist
// (e.g. if we are looking for /etc/passwd, but the real path is /etc/passwd -> /etc/passwd-1, then we should
// make certain that /etc/passwd-1 exists)
if annotation == file.VisibleAnnotation && l.AccessPath != "" {
ref, err := r.img.SquashedSearchContext.SearchByPath(l.AccessPath, filetree.DoNotFollowDeadBasenameLinks)
if err != nil || !ref.HasReference() {
annotation = file.HiddenAnnotation
} else if ref.ID() != givenRef.ID() {
// we may have the path in the squashed tree, but this must not be in the same layer
annotation = file.HiddenAnnotation
}
if annotation == file.VisibleAnnotation && l.AccessPath != "" && !r.pathResolvesToRef(l.AccessPath, givenRef) {
annotation = file.HiddenAnnotation
}
l.Annotations[file.VisibleAnnotationKey] = annotation
}
// pathResolvesToRef tells whether searching the squashed tree for path yields the same reference (by ID) as target.
// It returns false when the search errors or produces no reference. Because SearchByPath follows basename links, a
// hardlink surfaced at its own path (whose reference is its target) still resolves to the target here and is
// therefore considered visible.
func (r *ContainerImageAllLayers) pathResolvesToRef(path string, target stereoscopeFile.Reference) bool {
	found, err := r.img.SquashedSearchContext.SearchByPath(path, filetree.DoNotFollowDeadBasenameLinks)
	if err != nil || !found.HasReference() {
		return false
	}
	return found.ID() == target.ID()
}

View File

@ -40,6 +40,17 @@ func (r *ContainerImageSquash) FilesByPath(paths ...string) ([]file.Location, er
uniqueLocations := make([]file.Location, 0)
for _, path := range paths {
// if the requested path is itself a hardlink, surface it at its own path (bound to its target's content)
// rather than collapsing it onto the target's path, so that image results are in parity with directory
// results (which cannot tell a hardlink from a regular file).
if ownRef, targetRef, ok := r.hardLinkAtPath(path); ok {
if !uniqueFileIDs.Contains(ownRef) {
uniqueFileIDs.Add(ownRef)
uniqueLocations = append(uniqueLocations, file.NewVirtualLocationFromImage(string(ownRef.RealPath), path, targetRef, r.img))
}
continue
}
ref, err := r.img.SquashedSearchContext.SearchByPath(path, filetree.FollowBasenameLinks)
if err != nil {
return nil, err
@ -82,7 +93,7 @@ func (r *ContainerImageSquash) FilesByPath(paths ...string) ([]file.Location, er
//
//nolint:gocognit
func (r *ContainerImageSquash) FilesByGlob(patterns ...string) ([]file.Location, error) {
uniqueFileIDs := stereoscopeFile.NewFileReferenceSet()
uniqueCoordinates := file.NewCoordinateSet()
uniqueLocations := make([]file.Location, 0)
for _, pattern := range patterns {
@ -116,10 +127,14 @@ func (r *ContainerImageSquash) FilesByGlob(patterns ...string) ([]file.Location,
return nil, fmt.Errorf("failed to find files by path (result=%+v): %w", result, err)
}
for _, resolvedLocation := range resolvedLocations {
if uniqueFileIDs.Contains(resolvedLocation.Reference()) {
// dedup on the surfaced coordinate rather than the underlying reference: distinct hardlinks share a
// single target reference but each has its own real path, so a reference-based dedup would collapse
// them back onto one entry (the exact behavior this parity fix removes). symlink resolutions keep
// their target's real path, so they still collapse as before.
if uniqueCoordinates.Contains(resolvedLocation.Coordinates) {
continue
}
uniqueFileIDs.Add(resolvedLocation.Reference())
uniqueCoordinates.Add(resolvedLocation.Coordinates)
uniqueLocations = append(uniqueLocations, resolvedLocation)
}
}
@ -179,10 +194,16 @@ func (r *ContainerImageSquash) AllLocations(ctx context.Context) <-chan file.Loc
go func() {
defer close(results)
for _, ref := range r.img.SquashedTree().AllFiles(stereoscopeFile.AllTypes()...) {
loc := file.NewLocationFromImage(string(ref.RealPath), ref, r.img)
// surface a hardlink as the underlying type it points to (at its own path) so image results match
// directory results, which cannot distinguish a hardlink from a regular file.
if targetRef, ok := r.resolveHardLinkTarget(ref); ok {
loc = file.NewVirtualLocationFromImage(string(ref.RealPath), string(ref.RealPath), targetRef, r.img)
}
select {
case <-ctx.Done():
return
case results <- file.NewLocationFromImage(string(ref.RealPath), ref, r.img):
case results <- loc:
continue
}
}
@ -190,6 +211,37 @@ func (r *ContainerImageSquash) AllLocations(ctx context.Context) <-chan file.Loc
return results
}
// hardLinkAtPath looks up path in the image's squashed tree and, when the entry found there is a hardlink, returns
// the hardlink's own reference followed by the reference of its resolved target, with ok=true. When the lookup
// fails, the path does not exist, the result carries no reference, or the entry is not a hardlink, two zero-value
// references and ok=false are returned. The lookup is intended not to follow the basename link, so the hardlink's
// own path is preserved. Note that this costs an extra tree walk per FilesByPath path; if it shows up in profiles,
// fold the hardlink check into the existing SearchByPath resolution.
func (r *ContainerImageSquash) hardLinkAtPath(path string) (stereoscopeFile.Reference, stereoscopeFile.Reference, bool) {
	var none stereoscopeFile.Reference

	found, res, err := r.img.SquashedTree().File(stereoscopeFile.Path(path))
	if err == nil && found && res.HasReference() {
		linkRef := *res.Reference
		if targetRef, isHardLink := r.resolveHardLinkTarget(linkRef); isHardLink {
			return linkRef, targetRef, true
		}
	}

	return none, none, false
}
// resolveHardLinkTarget reports the reference of the file a hardlink points to, resolved against the image squash.
// It returns (target, true) only when ref is cataloged as a hardlink and the link resolves to a reference. In every
// other case (catalog lookup error, non-hardlink entry, resolution error, or no resolved reference) it returns the
// input ref unchanged with false. Non-hardlinks are never resolved here, so symlinks keep their existing resolution
// semantics.
func (r *ContainerImageSquash) resolveHardLinkTarget(ref stereoscopeFile.Reference) (stereoscopeFile.Reference, bool) {
	entry, err := r.img.FileCatalog.Get(ref)
	if err != nil {
		return ref, false
	}
	if entry.Type != stereoscopeFile.TypeHardLink {
		return ref, false
	}

	target, err := r.img.ResolveLinkByImageSquash(ref)
	if err == nil && target.HasReference() {
		return *target.Reference, true
	}
	return ref, false
}
func (r *ContainerImageSquash) FilesByMIMEType(types ...string) ([]file.Location, error) {
refs, err := r.img.SquashedSearchContext.SearchByMIMEType(types...)
if err != nil {

View File

@ -14,6 +14,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
stereoscopeFile "github.com/anchore/stereoscope/pkg/file"
"github.com/anchore/stereoscope/pkg/imagetest"
"github.com/anchore/syft/syft/file"
)
@ -574,3 +575,87 @@ func TestSquashResolver_AllLocations(t *testing.T) {
assert.ElementsMatchf(t, expected, pathsList, "expected all paths to be indexed, but found different paths: \n%s", cmp.Diff(expected, paths.List()))
}
// TestImageResolvers_Hardlinks verifies that a hardlink is surfaced at its own path as the underlying type it points
// to (a regular file with the target's content and metadata), so that image results are in parity with directory
// results (which cannot distinguish a hardlink from a regular file). this must hold for both the squashed and
// all-layers resolvers, and is checked through FilesByPath, FilesByGlob, and AllLocations.
func TestImageResolvers_Hardlinks(t *testing.T) {
// the test expects this fixture to provide /file.txt, /hardlink-a, and /hardlink-b, all sharing the same content
img := imagetest.GetFixtureImage(t, "docker-archive", "image-hardlinks")
// build both image resolvers over the same image so every subtest below runs against each of them
resolvers := map[string]file.Resolver{}
squash, err := NewFromContainerImageSquash(img)
require.NoError(t, err)
resolvers["squashed"] = squash
allLayers, err := NewFromContainerImageAllLayers(img)
require.NoError(t, err)
resolvers["all-layers"] = allLayers
// the content every checked path is expected to return when read through the resolver
const wantContents = "hardlinked contents\n"
// asserts a location surfaces the hardlink at its own path as a regular file with the target's content, and is
// not marked hidden (all-layers annotates visibility; squash leaves the annotation unset, which must not read as
// hidden either).
assertHardlink := func(t *testing.T, resolver file.Resolver, wantPath string, loc file.Location) {
t.Helper()
assert.Equal(t, wantPath, loc.RealPath, "expected the hardlink's own path, not the target's")
assert.NotEqual(t, file.HiddenAnnotation, loc.Annotations[file.VisibleAnnotationKey], "path=%s should not be hidden", wantPath)
// metadata must describe a regular file, not a hardlink entry
meta, err := resolver.FileMetadataByLocation(loc)
require.NoError(t, err)
assert.Equal(t, stereoscopeFile.TypeRegular, meta.Type, "path=%s", wantPath)
// contents read through the location must match the shared content
reader, err := resolver.FileContentsByLocation(loc)
require.NoError(t, err)
actual, err := io.ReadAll(reader)
require.NoError(t, err)
assert.Equal(t, wantContents, string(actual), "path=%s", wantPath)
}
for name, resolver := range resolvers {
t.Run(name, func(t *testing.T) {
t.Run("FilesByPath surfaces a hardlink at its own path with the target's content", func(t *testing.T) {
// /file.txt is checked alongside the two hardlink paths: each request must yield exactly one location
// reported at the requested path.
for _, path := range []string{"/hardlink-a", "/hardlink-b", "/file.txt"} {
locs, err := resolver.FilesByPath(path)
require.NoError(t, err)
require.Len(t, locs, 1, "path=%s", path)
assertHardlink(t, resolver, path, locs[0])
}
})
t.Run("FilesByGlob surfaces every hardlink in a matched set, not just one", func(t *testing.T) {
// the whole point of #5019: a glob that matches multiple hardlinks to the same target must return
// all of them (each at its own path), not collapse them onto a single entry.
locs, err := resolver.FilesByGlob("**/hardlink-*")
require.NoError(t, err)
// index results by real path, failing if any real path is emitted more than once
byPath := map[string]file.Location{}
for _, loc := range locs {
_, dup := byPath[loc.RealPath]
assert.Falsef(t, dup, "path %s emitted more than once", loc.RealPath)
byPath[loc.RealPath] = loc
}
for _, path := range []string{"/hardlink-a", "/hardlink-b"} {
loc, ok := byPath[path]
require.Truef(t, ok, "expected glob to surface %s", path)
assertHardlink(t, resolver, path, loc)
}
})
t.Run("AllLocations includes every hardlink path as a regular file exactly once", func(t *testing.T) {
// count how many times each real path is emitted and keep the last location seen for each path
counts := map[string]int{}
locsByPath := map[string]file.Location{}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// the channel is drained fully; other paths in the image are counted too but only the three below are asserted
for loc := range resolver.AllLocations(ctx) {
counts[loc.RealPath]++
locsByPath[loc.RealPath] = loc
}
for _, path := range []string{"/file.txt", "/hardlink-a", "/hardlink-b"} {
require.Equalf(t, 1, counts[path], "expected %s to be reported exactly once by AllLocations", path)
assertHardlink(t, resolver, path, locsByPath[path])
}
})
})
}
}

View File

@ -0,0 +1,12 @@
# LAYER 0:
FROM busybox:1.34.0@sha256:e8e5cca392e3cf056fcdb3093e7ac2bf83fcf28b3bcf5818fe8ae71cf360c231
# LAYER 1: a regular file with two hardlinks to it, all created in the same layer so the layer tar contains one
# regular file entry and two hardlink (tar TypeLink) entries pointing at it.
RUN echo "hardlinked contents" > /file.txt && ln /file.txt /hardlink-a && ln /file.txt /hardlink-b
# squash representation (of the files added here; busybox base also contributes hardlinked applets under /bin)
# .
# ├── file.txt
# ├── hardlink-a (hardlink -> file.txt)
# └── hardlink-b (hardlink -> file.txt)