diff --git a/syft/file/location.go b/syft/file/location.go index ad12c8f49..f5e1aa51c 100644 --- a/syft/file/location.go +++ b/syft/file/location.go @@ -152,6 +152,31 @@ func NewLocationFromImage(accessPath string, ref file.Reference, img *image.Imag } } +// NewVirtualLocationFromImage creates a new Location surfaced at realPath but whose contents and metadata are +// described by the given reference. This is used to present a hardlink as the underlying type it points to (at the +// hardlink's own path) so that image results are in parity with directory results, which cannot distinguish a +// hardlink from a regular file. Note that RealPath (the hardlink's own path) intentionally differs from +// ref.RealPath (the target's path); callers must not assume the two agree for these locations. +func NewVirtualLocationFromImage(realPath, accessPath string, ref file.Reference, img *image.Image) Location { + // the FileSystemID comes from the target ref's layer. a hardlink and its target are always materialized in the + // same layer tar (a tar hardlink entry can only reference a file within the same archive), so the target's layer + // digest is also the hardlink path's layer. + layer := img.FileCatalog.Layer(ref) + return Location{ + LocationData: LocationData{ + Coordinates: Coordinates{ + RealPath: realPath, + FileSystemID: layer.Metadata.Digest, + }, + AccessPath: accessPath, + ref: ref, + }, + LocationMetadata: LocationMetadata{ + Annotations: map[string]string{}, + }, + } +} + // NewLocationFromDirectory creates a new Location representing the given path (extracted from the Reference) relative to the given directory. 
func NewLocationFromDirectory(responsePath string, fd string, ref file.Reference) Location { return Location{ diff --git a/syft/internal/fileresolver/container_image_all_layers.go b/syft/internal/fileresolver/container_image_all_layers.go index ad7e9e0d3..20e8136bc 100644 --- a/syft/internal/fileresolver/container_image_all_layers.go +++ b/syft/internal/fileresolver/container_image_all_layers.go @@ -53,8 +53,20 @@ func (r *ContainerImageAllLayers) HasPath(path string) bool { return false } -func (r *ContainerImageAllLayers) fileByRef(ref stereoscopeFile.Reference, uniqueFileIDs stereoscopeFile.ReferenceSet, layerIdx int) ([]stereoscopeFile.Reference, error) { - uniqueFiles := make([]stereoscopeFile.Reference, 0) +func (r *ContainerImageAllLayers) locationsByRef(ref stereoscopeFile.Reference, accessPath string, uniqueFileIDs stereoscopeFile.ReferenceSet, layerPos int) ([]file.Location, error) { + uniqueLocations := make([]file.Location, 0) + + // if the access path is itself a hardlink, surface it as the underlying type it points to, at its own path + // (bound to the target's content), so that image results are in parity with directory results (which cannot + // tell a hardlink from a regular file). the path-based lookup is required because the search that produced ref + // already followed the basename link, collapsing the hardlink onto its target. 
+ if ownRef, targetRef, ok := r.hardLinkAtPath(accessPath, r.layers[layerPos]); ok { + if !uniqueFileIDs.Contains(ownRef) { + uniqueFileIDs.Add(ownRef) + uniqueLocations = append(uniqueLocations, file.NewVirtualLocationFromImage(string(ownRef.RealPath), accessPath, targetRef, r.img)) + } + return uniqueLocations, nil + } // since there is potentially considerable work for each symlink/hardlink that needs to be resolved, let's check to see if this is a symlink/hardlink first entry, err := r.img.FileCatalog.Get(ref) @@ -65,22 +77,41 @@ func (r *ContainerImageAllLayers) fileByRef(ref stereoscopeFile.Reference, uniqu if entry.Type == stereoscopeFile.TypeHardLink || entry.Type == stereoscopeFile.TypeSymLink { // a link may resolve in this layer or higher, assuming a squashed tree is used to search // we should search all possible resolutions within the valid source - for _, subLayerIdx := range r.layers[layerIdx:] { + for _, subLayerIdx := range r.layers[layerPos:] { resolvedRef, err := r.img.ResolveLinkByLayerSquash(ref, subLayerIdx) if err != nil { return nil, fmt.Errorf("failed to resolve link from layer (layer=%d ref=%+v): %w", subLayerIdx, ref, err) } if resolvedRef.HasReference() && !uniqueFileIDs.Contains(*resolvedRef.Reference) { uniqueFileIDs.Add(*resolvedRef.Reference) - uniqueFiles = append(uniqueFiles, *resolvedRef.Reference) + uniqueLocations = append(uniqueLocations, file.NewLocationFromImage(accessPath, *resolvedRef.Reference, r.img)) } } } else if !uniqueFileIDs.Contains(ref) { uniqueFileIDs.Add(ref) - uniqueFiles = append(uniqueFiles, ref) + uniqueLocations = append(uniqueLocations, file.NewLocationFromImage(accessPath, ref, r.img)) } - return uniqueFiles, nil + return uniqueLocations, nil +} + +// hardLinkAtPath returns the hardlink's own reference and its resolved target reference when the basename of path is a +// hardlink in the tree squashed up to the given layer. ok is false when path does not exist there, is not a hardlink, or the lookup/target resolution fails.
The lookup does +// not follow the basename link so that the hardlink's own path is preserved. This adds a tree walk per +// matched ref (path x layer); if it shows up in profiles, fold the hardlink check into the existing search resolution. +func (r *ContainerImageAllLayers) hardLinkAtPath(path string, layerIdx int) (stereoscopeFile.Reference, stereoscopeFile.Reference, bool) { + var own stereoscopeFile.Reference + // use the squashed-to-layer view (not just this layer's additions) so a hardlink added in a lower layer is still + // detected when the path is searched at a higher layer; otherwise it would collapse onto its target there. + exists, resolution, err := r.img.Layers[layerIdx].SquashedTree.File(stereoscopeFile.Path(path)) + if err != nil || !exists || !resolution.HasReference() { + return own, own, false + } + target, ok := r.resolveHardLinkTarget(*resolution.Reference, layerIdx) + if !ok { + return own, own, false + } + return *resolution.Reference, target, true } // FilesByPath returns all file.References that match the given paths from any layer in the image. 
@@ -112,14 +143,13 @@ func (r *ContainerImageAllLayers) FilesByPath(paths ...string) ([]file.Location, } } - results, err := r.fileByRef(*ref.Reference, uniqueFileIDs, idx) + locations, err := r.locationsByRef(*ref.Reference, path, uniqueFileIDs, idx) if err != nil { return nil, err } - for _, result := range results { - l := file.NewLocationFromImage(path, result, r.img) - r.annotateLocation(&l) - uniqueLocations = append(uniqueLocations, l) + for i := range locations { + r.annotateLocation(&locations[i]) + uniqueLocations = append(uniqueLocations, locations[i]) } } } @@ -158,14 +188,13 @@ func (r *ContainerImageAllLayers) FilesByGlob(patterns ...string) ([]file.Locati } } - refResults, err := r.fileByRef(*result.Reference, uniqueFileIDs, idx) + locations, err := r.locationsByRef(*result.Reference, string(result.RequestPath), uniqueFileIDs, idx) if err != nil { return nil, err } - for _, refResult := range refResults { - l := file.NewLocationFromImage(string(result.RequestPath), refResult, r.img) - r.annotateLocation(&l) - uniqueLocations = append(uniqueLocations, l) + for i := range locations { + r.annotateLocation(&locations[i]) + uniqueLocations = append(uniqueLocations, locations[i]) } } } @@ -233,14 +262,13 @@ func (r *ContainerImageAllLayers) FilesByMIMEType(types ...string) ([]file.Locat continue } - refResults, err := r.fileByRef(*ref.Reference, uniqueFileIDs, idx) + locations, err := r.locationsByRef(*ref.Reference, string(ref.RequestPath), uniqueFileIDs, idx) if err != nil { return nil, err } - for _, refResult := range refResults { - l := file.NewLocationFromImage(string(ref.RequestPath), refResult, r.img) - r.annotateLocation(&l) - uniqueLocations = append(uniqueLocations, l) + for i := range locations { + r.annotateLocation(&locations[i]) + uniqueLocations = append(uniqueLocations, locations[i]) } } } @@ -256,6 +284,11 @@ func (r *ContainerImageAllLayers) AllLocations(ctx context.Context) <-chan file. 
tree := r.img.Layers[layerIdx].Tree for _, ref := range tree.AllFiles(stereoscopeFile.AllTypes()...) { l := file.NewLocationFromImage(string(ref.RealPath), ref, r.img) + // surface a hardlink as the underlying type it points to (at its own path) so image results match + // directory results, which cannot distinguish a hardlink from a regular file. + if targetRef, ok := r.resolveHardLinkTarget(ref, layerIdx); ok { + l = file.NewVirtualLocationFromImage(string(ref.RealPath), string(ref.RealPath), targetRef, r.img) + } r.annotateLocation(&l) select { case <-ctx.Done(): @@ -273,6 +306,21 @@ func (r *ContainerImageAllLayers) FileMetadataByLocation(location file.Location) return fileMetadataByLocation(r.img, location) } +// resolveHardLinkTarget returns the reference of a hardlink's underlying target (resolved relative to the given layer) +// when ref is a hardlink; ok is false otherwise. No resolution is performed for non-hardlinks (symlinks keep their +// existing resolution semantics). +func (r *ContainerImageAllLayers) resolveHardLinkTarget(ref stereoscopeFile.Reference, layerIdx int) (stereoscopeFile.Reference, bool) { + metadata, err := r.img.FileCatalog.Get(ref) + if err != nil || metadata.Type != stereoscopeFile.TypeHardLink { + return ref, false + } + resolved, err := r.img.ResolveLinkByLayerSquash(ref, layerIdx) + if err != nil || !resolved.HasReference() { + return ref, false + } + return *resolved.Reference, true +} + func (r *ContainerImageAllLayers) annotateLocation(l *file.Location) { if !r.markVisibility || l == nil { return @@ -282,26 +330,24 @@ func (r *ContainerImageAllLayers) annotateLocation(l *file.Location) { annotation := file.VisibleAnnotation // if we find a location for a path that matches the query (e.g. 
**/node_modules) but is not present in the squashed tree, skip it - ref, err := r.img.SquashedSearchContext.SearchByPath(l.RealPath, filetree.DoNotFollowDeadBasenameLinks) - if err != nil || !ref.HasReference() { - annotation = file.HiddenAnnotation - } else if ref.ID() != givenRef.ID() { - // we may have the path in the squashed tree, but this must not be in the same layer + if !r.pathResolvesToRef(l.RealPath, givenRef) { annotation = file.HiddenAnnotation } // not only should the real path to the file exist, but the way we took to get there should also exist // (e.g. if we are looking for /etc/passwd, but the real path is /etc/passwd -> /etc/passwd-1, then we should // make certain that /etc/passwd-1 exists) - if annotation == file.VisibleAnnotation && l.AccessPath != "" { - ref, err := r.img.SquashedSearchContext.SearchByPath(l.AccessPath, filetree.DoNotFollowDeadBasenameLinks) - if err != nil || !ref.HasReference() { - annotation = file.HiddenAnnotation - } else if ref.ID() != givenRef.ID() { - // we may have the path in the squashed tree, but this must not be in the same layer - annotation = file.HiddenAnnotation - } + if annotation == file.VisibleAnnotation && l.AccessPath != "" && !r.pathResolvesToRef(l.AccessPath, givenRef) { + annotation = file.HiddenAnnotation } l.Annotations[file.VisibleAnnotationKey] = annotation } + +// pathResolvesToRef reports whether the given path in the squashed tree resolves to the given reference. +// SearchByPath always follows basename links, so a hardlink surfaced at its own path (whose reference is its +// target) still resolves to the target here and is correctly considered visible. 
+func (r *ContainerImageAllLayers) pathResolvesToRef(path string, target stereoscopeFile.Reference) bool { + ref, err := r.img.SquashedSearchContext.SearchByPath(path, filetree.DoNotFollowDeadBasenameLinks) + return err == nil && ref.HasReference() && ref.ID() == target.ID() +} diff --git a/syft/internal/fileresolver/container_image_squash.go b/syft/internal/fileresolver/container_image_squash.go index efcd2e8e4..ef7122af6 100644 --- a/syft/internal/fileresolver/container_image_squash.go +++ b/syft/internal/fileresolver/container_image_squash.go @@ -40,6 +40,17 @@ func (r *ContainerImageSquash) FilesByPath(paths ...string) ([]file.Location, er uniqueLocations := make([]file.Location, 0) for _, path := range paths { + // if the requested path is itself a hardlink, surface it at its own path (bound to its target's content) + // rather than collapsing it onto the target's path, so that image results are in parity with directory + // results (which cannot tell a hardlink from a regular file). 
+ if ownRef, targetRef, ok := r.hardLinkAtPath(path); ok { + if !uniqueFileIDs.Contains(ownRef) { + uniqueFileIDs.Add(ownRef) + uniqueLocations = append(uniqueLocations, file.NewVirtualLocationFromImage(string(ownRef.RealPath), path, targetRef, r.img)) + } + continue + } + ref, err := r.img.SquashedSearchContext.SearchByPath(path, filetree.FollowBasenameLinks) if err != nil { return nil, err @@ -82,7 +93,7 @@ func (r *ContainerImageSquash) FilesByPath(paths ...string) ([]file.Location, er // //nolint:gocognit func (r *ContainerImageSquash) FilesByGlob(patterns ...string) ([]file.Location, error) { - uniqueFileIDs := stereoscopeFile.NewFileReferenceSet() + uniqueCoordinates := file.NewCoordinateSet() uniqueLocations := make([]file.Location, 0) for _, pattern := range patterns { @@ -116,10 +127,14 @@ func (r *ContainerImageSquash) FilesByGlob(patterns ...string) ([]file.Location, return nil, fmt.Errorf("failed to find files by path (result=%+v): %w", result, err) } for _, resolvedLocation := range resolvedLocations { - if uniqueFileIDs.Contains(resolvedLocation.Reference()) { + // dedup on the surfaced coordinate rather than the underlying reference: distinct hardlinks share a + // single target reference but each has its own real path, so a reference-based dedup would collapse + // them back onto one entry (the exact behavior this parity fix removes). symlink resolutions keep + // their target's real path, so they still collapse as before. + if uniqueCoordinates.Contains(resolvedLocation.Coordinates) { continue } - uniqueFileIDs.Add(resolvedLocation.Reference()) + uniqueCoordinates.Add(resolvedLocation.Coordinates) uniqueLocations = append(uniqueLocations, resolvedLocation) } } @@ -179,10 +194,16 @@ func (r *ContainerImageSquash) AllLocations(ctx context.Context) <-chan file.Loc go func() { defer close(results) for _, ref := range r.img.SquashedTree().AllFiles(stereoscopeFile.AllTypes()...) 
{ + loc := file.NewLocationFromImage(string(ref.RealPath), ref, r.img) + // surface a hardlink as the underlying type it points to (at its own path) so image results match + // directory results, which cannot distinguish a hardlink from a regular file. + if targetRef, ok := r.resolveHardLinkTarget(ref); ok { + loc = file.NewVirtualLocationFromImage(string(ref.RealPath), string(ref.RealPath), targetRef, r.img) + } select { case <-ctx.Done(): return - case results <- file.NewLocationFromImage(string(ref.RealPath), ref, r.img): + case results <- loc: continue } } @@ -190,6 +211,37 @@ func (r *ContainerImageSquash) AllLocations(ctx context.Context) <-chan file.Loc return results } +// hardLinkAtPath returns the hardlink's own reference and its resolved target reference when the basename of path is a +// hardlink. ok is false when path does not exist or is not a hardlink. The lookup does not follow the basename link so +// that the hardlink's own path is preserved. This adds a tree walk per FilesByPath path; if it shows up in +// profiles, fold the hardlink check into the existing SearchByPath resolution. +func (r *ContainerImageSquash) hardLinkAtPath(path string) (stereoscopeFile.Reference, stereoscopeFile.Reference, bool) { + var own stereoscopeFile.Reference + exists, resolution, err := r.img.SquashedTree().File(stereoscopeFile.Path(path)) + if err != nil || !exists || !resolution.HasReference() { + return own, own, false + } + target, ok := r.resolveHardLinkTarget(*resolution.Reference) + if !ok { + return own, own, false + } + return *resolution.Reference, target, true +} + +// resolveHardLinkTarget returns the reference of a hardlink's underlying target when ref is a hardlink; ok is false +// otherwise. No resolution is performed for non-hardlinks (symlinks keep their existing resolution semantics). 
+func (r *ContainerImageSquash) resolveHardLinkTarget(ref stereoscopeFile.Reference) (stereoscopeFile.Reference, bool) { + metadata, err := r.img.FileCatalog.Get(ref) + if err != nil || metadata.Type != stereoscopeFile.TypeHardLink { + return ref, false + } + resolved, err := r.img.ResolveLinkByImageSquash(ref) + if err != nil || !resolved.HasReference() { + return ref, false + } + return *resolved.Reference, true +} + func (r *ContainerImageSquash) FilesByMIMEType(types ...string) ([]file.Location, error) { refs, err := r.img.SquashedSearchContext.SearchByMIMEType(types...) if err != nil { diff --git a/syft/internal/fileresolver/container_image_squash_test.go b/syft/internal/fileresolver/container_image_squash_test.go index 4c1df13a6..82b0b87fb 100644 --- a/syft/internal/fileresolver/container_image_squash_test.go +++ b/syft/internal/fileresolver/container_image_squash_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + stereoscopeFile "github.com/anchore/stereoscope/pkg/file" "github.com/anchore/stereoscope/pkg/imagetest" "github.com/anchore/syft/syft/file" ) @@ -574,3 +575,87 @@ func TestSquashResolver_AllLocations(t *testing.T) { assert.ElementsMatchf(t, expected, pathsList, "expected all paths to be indexed, but found different paths: \n%s", cmp.Diff(expected, paths.List())) } + +// a hardlink should be surfaced at its own path as the underlying type it points to (a regular file with the target's +// content and metadata), so that image results are in parity with directory results (which cannot distinguish a +// hardlink from a regular file). this must hold for both the squashed and all-layers resolvers. 
+func TestImageResolvers_Hardlinks(t *testing.T) { + img := imagetest.GetFixtureImage(t, "docker-archive", "image-hardlinks") + + resolvers := map[string]file.Resolver{} + squash, err := NewFromContainerImageSquash(img) + require.NoError(t, err) + resolvers["squashed"] = squash + + allLayers, err := NewFromContainerImageAllLayers(img) + require.NoError(t, err) + resolvers["all-layers"] = allLayers + + const wantContents = "hardlinked contents\n" + + // asserts a location surfaces the hardlink at its own path as a regular file with the target's content, and is + // not marked hidden (all-layers annotates visibility; squash leaves the annotation unset, which must not read as + // hidden either). + assertHardlink := func(t *testing.T, resolver file.Resolver, wantPath string, loc file.Location) { + t.Helper() + assert.Equal(t, wantPath, loc.RealPath, "expected the hardlink's own path, not the target's") + assert.NotEqual(t, file.HiddenAnnotation, loc.Annotations[file.VisibleAnnotationKey], "path=%s should not be hidden", wantPath) + + meta, err := resolver.FileMetadataByLocation(loc) + require.NoError(t, err) + assert.Equal(t, stereoscopeFile.TypeRegular, meta.Type, "path=%s", wantPath) + + reader, err := resolver.FileContentsByLocation(loc) + require.NoError(t, err) + actual, err := io.ReadAll(reader) + require.NoError(t, err) + assert.Equal(t, wantContents, string(actual), "path=%s", wantPath) + } + + for name, resolver := range resolvers { + t.Run(name, func(t *testing.T) { + t.Run("FilesByPath surfaces a hardlink at its own path with the target's content", func(t *testing.T) { + for _, path := range []string{"/hardlink-a", "/hardlink-b", "/file.txt"} { + locs, err := resolver.FilesByPath(path) + require.NoError(t, err) + require.Len(t, locs, 1, "path=%s", path) + assertHardlink(t, resolver, path, locs[0]) + } + }) + + t.Run("FilesByGlob surfaces every hardlink in a matched set, not just one", func(t *testing.T) { + // the whole point of #5019: a glob that matches 
multiple hardlinks to the same target must return + // all of them (each at its own path), not collapse them onto a single entry. + locs, err := resolver.FilesByGlob("**/hardlink-*") + require.NoError(t, err) + + byPath := map[string]file.Location{} + for _, loc := range locs { + _, dup := byPath[loc.RealPath] + assert.Falsef(t, dup, "path %s emitted more than once", loc.RealPath) + byPath[loc.RealPath] = loc + } + for _, path := range []string{"/hardlink-a", "/hardlink-b"} { + loc, ok := byPath[path] + require.Truef(t, ok, "expected glob to surface %s", path) + assertHardlink(t, resolver, path, loc) + } + }) + + t.Run("AllLocations includes every hardlink path as a regular file exactly once", func(t *testing.T) { + counts := map[string]int{} + locsByPath := map[string]file.Location{} + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + for loc := range resolver.AllLocations(ctx) { + counts[loc.RealPath]++ + locsByPath[loc.RealPath] = loc + } + for _, path := range []string{"/file.txt", "/hardlink-a", "/hardlink-b"} { + require.Equalf(t, 1, counts[path], "expected %s to be reported exactly once by AllLocations", path) + assertHardlink(t, resolver, path, locsByPath[path]) + } + }) + }) + } +} diff --git a/syft/internal/fileresolver/testdata/image-hardlinks/Dockerfile b/syft/internal/fileresolver/testdata/image-hardlinks/Dockerfile new file mode 100644 index 000000000..b733fb777 --- /dev/null +++ b/syft/internal/fileresolver/testdata/image-hardlinks/Dockerfile @@ -0,0 +1,12 @@ +# LAYER 0: +FROM busybox:1.34.0@sha256:e8e5cca392e3cf056fcdb3093e7ac2bf83fcf28b3bcf5818fe8ae71cf360c231 + +# LAYER 1: a regular file with two hardlinks to it, all created in the same layer so the layer tar contains one +# regular file entry and two hardlink (tar TypeLink) entries pointing at it. 
+RUN echo "hardlinked contents" > /file.txt && ln /file.txt /hardlink-a && ln /file.txt /hardlink-b + +# squash representation (of the files added here; busybox base also contributes hardlinked applets under /bin) +# . +# ├── file.txt +# ├── hardlink-a (hardlink -> file.txt) +# └── hardlink-b (hardlink -> file.txt)