From 89842bd2f645d82562b5027b1ba19c2a57aab14e Mon Sep 17 00:00:00 2001 From: Kudryavcev Nikolay <35200428+Rupikz@users.noreply.github.com> Date: Fri, 14 Nov 2025 02:04:43 +0300 Subject: [PATCH] chore: migrate syft to use mholt/archives instead of anchore fork (#4029) --------- Signed-off-by: Kudryavcev Nikolay Signed-off-by: Christopher Phillips Signed-off-by: Alex Goodman --- README.md | 4 +- go.mod | 9 +- go.sum | 6 - internal/file/tar_file_traversal.go | 41 +- internal/file/zip_file_manifest.go | 17 +- internal/file/zip_file_manifest_test.go | 7 +- internal/file/zip_file_traversal.go | 114 ++-- internal/file/zip_file_traversal_test.go | 528 +++++++++++++++++- internal/file/zip_read_closer.go | 229 -------- internal/file/zip_read_closer_test.go | 50 -- internal/task/unknowns_tasks.go | 8 +- syft/format/github/internal/model/model.go | 8 +- syft/pkg/cataloger/java/archive_parser.go | 29 +- .../pkg/cataloger/java/archive_parser_test.go | 8 +- .../java/tar_wrapped_archive_parser.go | 2 +- .../java/zip_wrapped_archive_parser.go | 2 +- syft/source/filesource/file_source.go | 73 ++- 17 files changed, 720 insertions(+), 415 deletions(-) delete mode 100644 internal/file/zip_read_closer.go delete mode 100644 internal/file/zip_read_closer_test.go diff --git a/README.md b/README.md index 52b210cb5..febec6934 100644 --- a/README.md +++ b/README.md @@ -106,8 +106,8 @@ syft -o Where the `formats` available are: - `syft-json`: Use this to get as much information out of Syft as possible! - `syft-text`: A row-oriented, human-and-machine-friendly output. -- `cyclonedx-xml`: A XML report conforming to the [CycloneDX 1.6 specification](https://cyclonedx.org/specification/overview/). -- `cyclonedx-xml@1.5`: A XML report conforming to the [CycloneDX 1.5 specification](https://cyclonedx.org/specification/overview/). +- `cyclonedx-xml`: An XML report conforming to the [CycloneDX 1.6 specification](https://cyclonedx.org/specification/overview/). 
+- `cyclonedx-xml@1.5`: An XML report conforming to the [CycloneDX 1.5 specification](https://cyclonedx.org/specification/overview/). - `cyclonedx-json`: A JSON report conforming to the [CycloneDX 1.6 specification](https://cyclonedx.org/specification/overview/). - `cyclonedx-json@1.5`: A JSON report conforming to the [CycloneDX 1.5 specification](https://cyclonedx.org/specification/overview/). - `spdx-tag-value`: A tag-value formatted report conforming to the [SPDX 2.3 specification](https://spdx.github.io/spdx-spec/v2.3/). diff --git a/go.mod b/go.mod index 3eb181ee9..4ed0739c8 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,6 @@ require ( github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d github.com/acobaugh/osrelease v0.1.0 github.com/adrg/xdg v0.5.3 - github.com/anchore/archiver/v3 v3.5.3-0.20241210171143-5b1d8d1c7c51 github.com/anchore/bubbly v0.0.0-20231115134915-def0aba654a9 github.com/anchore/clio v0.0.0-20250319180342-2cfe4b0cb716 github.com/anchore/fangs v0.0.0-20250319222917-446a1e748ec2 @@ -168,7 +167,6 @@ require ( github.com/goccy/go-yaml v1.18.0 github.com/gogo/protobuf v1.3.2 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect - github.com/golang/snappy v0.0.4 // indirect github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e // indirect github.com/google/s2a-go v0.1.8 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect @@ -209,10 +207,6 @@ require ( github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/ncruces/go-strftime v0.1.9 // indirect - github.com/nwaples/rardecode v1.1.3 // indirect - github.com/nwaples/rardecode/v2 v2.2.0 // indirect - github.com/olekukonko/errors v1.1.0 // indirect - github.com/olekukonko/ll v0.1.2 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/opencontainers/runtime-spec v1.1.0 // indirect github.com/opencontainers/selinux v1.13.0 // indirect @@ -319,7 
+313,10 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/nwaples/rardecode/v2 v2.2.0 // indirect github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 // indirect + github.com/olekukonko/errors v1.1.0 // indirect + github.com/olekukonko/ll v0.1.2 // indirect github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect gonum.org/v1/gonum v0.15.1 // indirect ) diff --git a/go.sum b/go.sum index 550b43819..05c2da5cd 100644 --- a/go.sum +++ b/go.sum @@ -110,8 +110,6 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuy github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= -github.com/anchore/archiver/v3 v3.5.3-0.20241210171143-5b1d8d1c7c51 h1:yhk+P8lF3ZiROjmaVRao9WGTRo4b/wYjoKEiAHWrKwc= -github.com/anchore/archiver/v3 v3.5.3-0.20241210171143-5b1d8d1c7c51/go.mod h1:nwuGSd7aZp0rtYt79YggCGafz1RYsclE7pi3fhLwvuw= github.com/anchore/bubbly v0.0.0-20231115134915-def0aba654a9 h1:p0ZIe0htYOX284Y4axJaGBvXHU0VCCzLN5Wf5XbKStU= github.com/anchore/bubbly v0.0.0-20231115134915-def0aba654a9/go.mod h1:3ZsFB9tzW3vl4gEiUeuSOMDnwroWxIxJelOOHUp8dSw= github.com/anchore/clio v0.0.0-20250319180342-2cfe4b0cb716 h1:2sIdYJlQESEnyk3Y0WD2vXWW5eD2iMz9Ev8fj1Z8LNA= @@ -479,8 +477,6 @@ github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.3/go.mod 
h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= -github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= -github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -755,8 +751,6 @@ github.com/nix-community/go-nix v0.0.0-20250101154619-4bdde671e0a1 h1:kpt9ZfKcm+ github.com/nix-community/go-nix v0.0.0-20250101154619-4bdde671e0a1/go.mod h1:qgCw4bBKZX8qMgGeEZzGFVT3notl42dBjNqO2jut0M0= github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 h1:NHrXEjTNQY7P0Zfx1aMrNhpgxHmow66XQtm0aQLY0AE= github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249/go.mod h1:mpRZBD8SJ55OIICQ3iWH0Yz3cjzA61JdqMLoWXeB2+8= -github.com/nwaples/rardecode v1.1.3 h1:cWCaZwfM5H7nAD6PyEdcVnczzV8i/JtotnyW/dD9lEc= -github.com/nwaples/rardecode v1.1.3/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= github.com/nwaples/rardecode/v2 v2.2.0 h1:4ufPGHiNe1rYJxYfehALLjup4Ls3ck42CWwjKiOqu0A= github.com/nwaples/rardecode/v2 v2.2.0/go.mod h1:7uz379lSxPe6j9nvzxUZ+n7mnJNgjsRNb6IbvGVHRmw= github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 h1:zrbMGy9YXpIeTnGj4EljqMiZsIcE09mmF8XsD5AYOJc= diff --git a/internal/file/tar_file_traversal.go b/internal/file/tar_file_traversal.go index 7d211168a..c3511a1cc 100644 --- a/internal/file/tar_file_traversal.go +++ b/internal/file/tar_file_traversal.go @@ -1,17 +1,40 @@ package file import ( + "context" "fmt" "os" "path/filepath" "github.com/bmatcuk/doublestar/v4" + "github.com/mholt/archives" - "github.com/anchore/archiver/v3" + "github.com/anchore/syft/internal" ) +// TraverseFilesInTar enumerates all paths stored within a tar archive using the visitor pattern. 
+func TraverseFilesInTar(ctx context.Context, archivePath string, visitor archives.FileHandler) error { + tarReader, err := os.Open(archivePath) + if err != nil { + return fmt.Errorf("unable to open tar archive (%s): %w", archivePath, err) + } + defer internal.CloseAndLogError(tarReader, archivePath) + + format, _, err := archives.Identify(ctx, archivePath, nil) + if err != nil { + return fmt.Errorf("failed to identify tar compression format: %w", err) + } + + extractor, ok := format.(archives.Extractor) + if !ok { + return fmt.Errorf("file format does not support extraction: %s", archivePath) + } + + return extractor.Extract(ctx, tarReader, visitor) +} + // ExtractGlobsFromTarToUniqueTempFile extracts paths matching the given globs within the given archive to a temporary directory, returning file openers for each file extracted. -func ExtractGlobsFromTarToUniqueTempFile(archivePath, dir string, globs ...string) (map[string]Opener, error) { +func ExtractGlobsFromTarToUniqueTempFile(ctx context.Context, archivePath, dir string, globs ...string) (map[string]Opener, error) { results := make(map[string]Opener) // don't allow for full traversal, only select traversal from given paths @@ -19,9 +42,7 @@ func ExtractGlobsFromTarToUniqueTempFile(archivePath, dir string, globs ...strin return results, nil } - visitor := func(file archiver.File) error { - defer file.Close() - + visitor := func(_ context.Context, file archives.FileInfo) error { // ignore directories if file.IsDir() { return nil @@ -43,7 +64,13 @@ func ExtractGlobsFromTarToUniqueTempFile(archivePath, dir string, globs ...strin // provides a ReadCloser. It is up to the caller to handle closing the file explicitly. 
defer tempFile.Close() - if err := safeCopy(tempFile, file.ReadCloser); err != nil { + packedFile, err := file.Open() + if err != nil { + return fmt.Errorf("unable to read file=%q from tar=%q: %w", file.NameInArchive, archivePath, err) + } + defer internal.CloseAndLogError(packedFile, archivePath) + + if err := safeCopy(tempFile, packedFile); err != nil { return fmt.Errorf("unable to copy source=%q for tar=%q: %w", file.Name(), archivePath, err) } @@ -52,7 +79,7 @@ func ExtractGlobsFromTarToUniqueTempFile(archivePath, dir string, globs ...strin return nil } - return results, archiver.Walk(archivePath, visitor) + return results, TraverseFilesInTar(ctx, archivePath, visitor) } func matchesAnyGlob(name string, globs ...string) bool { diff --git a/internal/file/zip_file_manifest.go b/internal/file/zip_file_manifest.go index 346e661c6..8dcb0d2f2 100644 --- a/internal/file/zip_file_manifest.go +++ b/internal/file/zip_file_manifest.go @@ -1,10 +1,12 @@ package file import ( + "context" "os" "sort" "strings" + "github.com/mholt/archives" "github.com/scylladb/go-set/strset" "github.com/anchore/syft/internal/log" @@ -14,22 +16,25 @@ import ( type ZipFileManifest map[string]os.FileInfo // NewZipFileManifest creates and returns a new ZipFileManifest populated with path and metadata from the given zip archive path. 
-func NewZipFileManifest(archivePath string) (ZipFileManifest, error) { - zipReader, err := OpenZip(archivePath) +func NewZipFileManifest(ctx context.Context, archivePath string) (ZipFileManifest, error) { + zipReader, err := os.Open(archivePath) manifest := make(ZipFileManifest) if err != nil { log.Debugf("unable to open zip archive (%s): %v", archivePath, err) return manifest, err } defer func() { - err = zipReader.Close() - if err != nil { + if err = zipReader.Close(); err != nil { log.Debugf("unable to close zip archive (%s): %+v", archivePath, err) } }() - for _, file := range zipReader.File { - manifest.Add(file.Name, file.FileInfo()) + err = archives.Zip{}.Extract(ctx, zipReader, func(_ context.Context, file archives.FileInfo) error { + manifest.Add(file.NameInArchive, file.FileInfo) + return nil + }) + if err != nil { + return manifest, err } return manifest, nil } diff --git a/internal/file/zip_file_manifest_test.go b/internal/file/zip_file_manifest_test.go index 75d445228..9ebe42224 100644 --- a/internal/file/zip_file_manifest_test.go +++ b/internal/file/zip_file_manifest_test.go @@ -4,6 +4,7 @@ package file import ( + "context" "encoding/json" "os" "path" @@ -24,7 +25,7 @@ func TestNewZipFileManifest(t *testing.T) { archiveFilePath := setupZipFileTest(t, sourceDirPath, false) - actual, err := NewZipFileManifest(archiveFilePath) + actual, err := NewZipFileManifest(context.Background(), archiveFilePath) if err != nil { t.Fatalf("unable to extract from unzip archive: %+v", err) } @@ -59,7 +60,7 @@ func TestNewZip64FileManifest(t *testing.T) { sourceDirPath := path.Join(cwd, "test-fixtures", "zip-source") archiveFilePath := setupZipFileTest(t, sourceDirPath, true) - actual, err := NewZipFileManifest(archiveFilePath) + actual, err := NewZipFileManifest(context.Background(), archiveFilePath) if err != nil { t.Fatalf("unable to extract from unzip archive: %+v", err) } @@ -99,7 +100,7 @@ func TestZipFileManifest_GlobMatch(t *testing.T) { archiveFilePath := 
setupZipFileTest(t, sourceDirPath, false) - z, err := NewZipFileManifest(archiveFilePath) + z, err := NewZipFileManifest(context.Background(), archiveFilePath) if err != nil { t.Fatalf("unable to extract from unzip archive: %+v", err) } diff --git a/internal/file/zip_file_traversal.go b/internal/file/zip_file_traversal.go index 1b712eff5..5fc26a220 100644 --- a/internal/file/zip_file_traversal.go +++ b/internal/file/zip_file_traversal.go @@ -1,13 +1,15 @@ package file import ( - "archive/zip" "bytes" + "context" "fmt" "os" "path/filepath" "strings" + "github.com/mholt/archives" + "github.com/anchore/syft/internal/log" ) @@ -25,7 +27,7 @@ type errZipSlipDetected struct { } func (e *errZipSlipDetected) Error() string { - return fmt.Sprintf("paths are not allowed to resolve outside of the root prefix (%q). Destination: %q", e.Prefix, e.JoinArgs) + return fmt.Sprintf("path traversal detected: paths are not allowed to resolve outside of the root prefix (%q). Destination: %q", e.Prefix, e.JoinArgs) } type zipTraversalRequest map[string]struct{} @@ -39,38 +41,34 @@ func newZipTraverseRequest(paths ...string) zipTraversalRequest { } // TraverseFilesInZip enumerates all paths stored within a zip archive using the visitor pattern. -func TraverseFilesInZip(archivePath string, visitor func(*zip.File) error, paths ...string) error { +func TraverseFilesInZip(ctx context.Context, archivePath string, visitor archives.FileHandler, paths ...string) error { request := newZipTraverseRequest(paths...) 
- zipReader, err := OpenZip(archivePath) + zipReader, err := os.Open(archivePath) if err != nil { return fmt.Errorf("unable to open zip archive (%s): %w", archivePath, err) } defer func() { - err = zipReader.Close() - if err != nil { + if err := zipReader.Close(); err != nil { log.Errorf("unable to close zip archive (%s): %+v", archivePath, err) } }() - for _, file := range zipReader.File { + return archives.Zip{}.Extract(ctx, zipReader, func(ctx context.Context, file archives.FileInfo) error { // if no paths are given then assume that all files should be traversed if len(paths) > 0 { - if _, ok := request[file.Name]; !ok { + if _, ok := request[file.NameInArchive]; !ok { // this file path is not of interest - continue + return nil } } - if err = visitor(file); err != nil { - return err - } - } - return nil + return visitor(ctx, file) + }) } // ExtractFromZipToUniqueTempFile extracts select paths for the given archive to a temporary directory, returning file openers for each file extracted. 
-func ExtractFromZipToUniqueTempFile(archivePath, dir string, paths ...string) (map[string]Opener, error) { +func ExtractFromZipToUniqueTempFile(ctx context.Context, archivePath, dir string, paths ...string) (map[string]Opener, error) { results := make(map[string]Opener) // don't allow for full traversal, only select traversal from given paths @@ -78,9 +76,8 @@ func ExtractFromZipToUniqueTempFile(archivePath, dir string, paths ...string) (m return results, nil } - visitor := func(file *zip.File) error { - tempfilePrefix := filepath.Base(filepath.Clean(file.Name)) + "-" - + visitor := func(_ context.Context, file archives.FileInfo) error { + tempfilePrefix := filepath.Base(filepath.Clean(file.NameInArchive)) + "-" tempFile, err := os.CreateTemp(dir, tempfilePrefix) if err != nil { return fmt.Errorf("unable to create temp file: %w", err) @@ -92,33 +89,32 @@ func ExtractFromZipToUniqueTempFile(archivePath, dir string, paths ...string) (m zippedFile, err := file.Open() if err != nil { - return fmt.Errorf("unable to read file=%q from zip=%q: %w", file.Name, archivePath, err) + return fmt.Errorf("unable to read file=%q from zip=%q: %w", file.NameInArchive, archivePath, err) } defer func() { - err := zippedFile.Close() - if err != nil { - log.Errorf("unable to close source file=%q from zip=%q: %+v", file.Name, archivePath, err) + if err := zippedFile.Close(); err != nil { + log.Errorf("unable to close source file=%q from zip=%q: %+v", file.NameInArchive, archivePath, err) } }() - if file.FileInfo().IsDir() { - return fmt.Errorf("unable to extract directories, only files: %s", file.Name) + if file.IsDir() { + return fmt.Errorf("unable to extract directories, only files: %s", file.NameInArchive) } if err := safeCopy(tempFile, zippedFile); err != nil { - return fmt.Errorf("unable to copy source=%q for zip=%q: %w", file.Name, archivePath, err) + return fmt.Errorf("unable to copy source=%q for zip=%q: %w", file.NameInArchive, archivePath, err) } - results[file.Name] = 
Opener{path: tempFile.Name()} + results[file.NameInArchive] = Opener{path: tempFile.Name()} return nil } - return results, TraverseFilesInZip(archivePath, visitor, paths...) + return results, TraverseFilesInZip(ctx, archivePath, visitor, paths...) } // ContentsFromZip extracts select paths for the given archive and returns a set of string contents for each path. -func ContentsFromZip(archivePath string, paths ...string) (map[string]string, error) { +func ContentsFromZip(ctx context.Context, archivePath string, paths ...string) (map[string]string, error) { results := make(map[string]string) // don't allow for full traversal, only select traversal from given paths @@ -126,37 +122,38 @@ func ContentsFromZip(archivePath string, paths ...string) (map[string]string, er return results, nil } - visitor := func(file *zip.File) error { + visitor := func(_ context.Context, file archives.FileInfo) error { zippedFile, err := file.Open() if err != nil { - return fmt.Errorf("unable to read file=%q from zip=%q: %w", file.Name, archivePath, err) + return fmt.Errorf("unable to read file=%q from zip=%q: %w", file.NameInArchive, archivePath, err) } + defer func() { + if err := zippedFile.Close(); err != nil { + log.Errorf("unable to close source file=%q from zip=%q: %+v", file.NameInArchive, archivePath, err) + } + }() - if file.FileInfo().IsDir() { - return fmt.Errorf("unable to extract directories, only files: %s", file.Name) + if file.IsDir() { + return fmt.Errorf("unable to extract directories, only files: %s", file.NameInArchive) } var buffer bytes.Buffer if err := safeCopy(&buffer, zippedFile); err != nil { - return fmt.Errorf("unable to copy source=%q for zip=%q: %w", file.Name, archivePath, err) + return fmt.Errorf("unable to copy source=%q for zip=%q: %w", file.NameInArchive, archivePath, err) } - results[file.Name] = buffer.String() + results[file.NameInArchive] = buffer.String() - err = zippedFile.Close() - if err != nil { - return fmt.Errorf("unable to close source file=%q 
from zip=%q: %w", file.Name, archivePath, err) - } return nil } - return results, TraverseFilesInZip(archivePath, visitor, paths...) + return results, TraverseFilesInZip(ctx, archivePath, visitor, paths...) } // UnzipToDir extracts a zip archive to a target directory. -func UnzipToDir(archivePath, targetDir string) error { - visitor := func(file *zip.File) error { - joinedPath, err := safeJoin(targetDir, file.Name) +func UnzipToDir(ctx context.Context, archivePath, targetDir string) error { + visitor := func(_ context.Context, file archives.FileInfo) error { + joinedPath, err := SafeJoin(targetDir, file.NameInArchive) if err != nil { return err } @@ -164,11 +161,11 @@ func UnzipToDir(archivePath, targetDir string) error { return extractSingleFile(file, joinedPath, archivePath) } - return TraverseFilesInZip(archivePath, visitor) + return TraverseFilesInZip(ctx, archivePath, visitor) } -// safeJoin ensures that any destinations do not resolve to a path above the prefix path. -func safeJoin(prefix string, dest ...string) (string, error) { +// SafeJoin ensures that any destinations do not resolve to a path above the prefix path. +func SafeJoin(prefix string, dest ...string) (string, error) { joinResult := filepath.Join(append([]string{prefix}, dest...)...) 
cleanJoinResult := filepath.Clean(joinResult) if !strings.HasPrefix(cleanJoinResult, filepath.Clean(prefix)) { @@ -181,13 +178,18 @@ func safeJoin(prefix string, dest ...string) (string, error) { return joinResult, nil } -func extractSingleFile(file *zip.File, expandedFilePath, archivePath string) error { +func extractSingleFile(file archives.FileInfo, expandedFilePath, archivePath string) error { zippedFile, err := file.Open() if err != nil { - return fmt.Errorf("unable to read file=%q from zip=%q: %w", file.Name, archivePath, err) + return fmt.Errorf("unable to read file=%q from zip=%q: %w", file.NameInArchive, archivePath, err) } + defer func() { + if err := zippedFile.Close(); err != nil { + log.Errorf("unable to close source file=%q from zip=%q: %+v", file.NameInArchive, archivePath, err) + } + }() - if file.FileInfo().IsDir() { + if file.IsDir() { err = os.MkdirAll(expandedFilePath, file.Mode()) if err != nil { return fmt.Errorf("unable to create dir=%q from zip=%q: %w", expandedFilePath, archivePath, err) @@ -202,20 +204,16 @@ func extractSingleFile(file *zip.File, expandedFilePath, archivePath string) err if err != nil { return fmt.Errorf("unable to create dest file=%q from zip=%q: %w", expandedFilePath, archivePath, err) } + defer func() { + if err := outputFile.Close(); err != nil { + log.Errorf("unable to close dest file=%q from zip=%q: %+v", outputFile.Name(), archivePath, err) + } + }() if err := safeCopy(outputFile, zippedFile); err != nil { - return fmt.Errorf("unable to copy source=%q to dest=%q for zip=%q: %w", file.Name, outputFile.Name(), archivePath, err) - } - - err = outputFile.Close() - if err != nil { - return fmt.Errorf("unable to close dest file=%q from zip=%q: %w", outputFile.Name(), archivePath, err) + return fmt.Errorf("unable to copy source=%q to dest=%q for zip=%q: %w", file.NameInArchive, outputFile.Name(), archivePath, err) } } - err = zippedFile.Close() - if err != nil { - return fmt.Errorf("unable to close source file=%q from 
zip=%q: %w", file.Name, archivePath, err) - } return nil } diff --git a/internal/file/zip_file_traversal_test.go b/internal/file/zip_file_traversal_test.go index d5a81d273..812f5e450 100644 --- a/internal/file/zip_file_traversal_test.go +++ b/internal/file/zip_file_traversal_test.go @@ -4,6 +4,8 @@ package file import ( + "archive/zip" + "context" "crypto/sha256" "encoding/json" "errors" @@ -17,6 +19,7 @@ import ( "github.com/go-test/deep" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func equal(r1, r2 io.Reader) (bool, error) { @@ -55,7 +58,7 @@ func TestUnzipToDir(t *testing.T) { expectedPaths := len(expectedZipArchiveEntries) observedPaths := 0 - err = UnzipToDir(archiveFilePath, unzipDestinationDir) + err = UnzipToDir(context.Background(), archiveFilePath, unzipDestinationDir) if err != nil { t.Fatalf("unable to unzip archive: %+v", err) } @@ -145,7 +148,7 @@ func TestContentsFromZip(t *testing.T) { paths = append(paths, p) } - actual, err := ContentsFromZip(archivePath, paths...) + actual, err := ContentsFromZip(context.Background(), archivePath, paths...) if err != nil { t.Fatalf("unable to extract from unzip archive: %+v", err) } @@ -307,9 +310,528 @@ func TestSafeJoin(t *testing.T) { for _, test := range tests { t.Run(fmt.Sprintf("%+v:%+v", test.prefix, test.args), func(t *testing.T) { - actual, err := safeJoin(test.prefix, test.args...) + actual, err := SafeJoin(test.prefix, test.args...) test.errAssertion(t, err) assert.Equal(t, test.expected, actual) }) } } + +// TestSymlinkProtection demonstrates that SafeJoin protects against symlink-based +// directory traversal attacks by validating that archive entry paths cannot escape +// the extraction directory. 
+func TestSafeJoin_SymlinkProtection(t *testing.T) { + tests := []struct { + name string + archivePath string // Path as it would appear in the archive + expectError bool + description string + }{ + { + name: "path traversal via ../", + archivePath: "../../../outside/file.txt", + expectError: true, + description: "Archive entry with ../ trying to escape extraction dir", + }, + { + name: "absolute path symlink target", + archivePath: "../../../sensitive.txt", + expectError: true, + description: "Simulates symlink pointing outside via relative path", + }, + { + name: "safe relative path within extraction dir", + archivePath: "subdir/safe.txt", + expectError: false, + description: "Normal file path that stays within extraction directory", + }, + { + name: "safe path with internal ../", + archivePath: "dir1/../dir2/file.txt", + expectError: false, + description: "Path with ../ that still resolves within extraction dir", + }, + { + name: "deeply nested traversal", + archivePath: "../../../../../../tmp/evil.txt", + expectError: true, + description: "Multiple levels of ../ trying to escape", + }, + { + name: "single parent directory escape", + archivePath: "../", + expectError: true, + description: "Simple one-level escape attempt", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create temp directories to simulate extraction scenario + tmpDir := t.TempDir() + extractDir := filepath.Join(tmpDir, "extract") + outsideDir := filepath.Join(tmpDir, "outside") + + require.NoError(t, os.MkdirAll(extractDir, 0755)) + require.NoError(t, os.MkdirAll(outsideDir, 0755)) + + // Create a file outside extraction dir that an attacker might target + outsideFile := filepath.Join(outsideDir, "sensitive.txt") + require.NoError(t, os.WriteFile(outsideFile, []byte("sensitive data"), 0644)) + + // Test SafeJoin - this is what happens when processing archive entries + result, err := SafeJoin(extractDir, tt.archivePath) + + if tt.expectError { + // Should block 
malicious paths + require.Error(t, err, "Expected SafeJoin to reject malicious path") + var zipSlipErr *errZipSlipDetected + assert.ErrorAs(t, err, &zipSlipErr, "Error should be errZipSlipDetected type") + assert.Empty(t, result, "Result should be empty for blocked paths") + } else { + // Should allow safe paths + require.NoError(t, err, "Expected SafeJoin to allow safe path") + assert.NotEmpty(t, result, "Result should not be empty for safe paths") + assert.True(t, strings.HasPrefix(filepath.Clean(result), filepath.Clean(extractDir)), + "Safe path should resolve within extraction directory") + } + }) + } +} + +// TestUnzipToDir_SymlinkAttacks tests UnzipToDir function with malicious ZIP archives +// containing symlink entries that attempt path traversal attacks. +// +// EXPECTED BEHAVIOR: UnzipToDir should either: +// 1. Detect and reject symlinks explicitly with a security error, OR +// 2. Extract them safely (library converts symlinks to regular files) +func TestUnzipToDir_SymlinkAttacks(t *testing.T) { + tests := []struct { + name string + symlinkName string + fileName string + errContains string + }{ + { + name: "direct symlink to outside directory", + symlinkName: "evil_link", + fileName: "evil_link/payload.txt", + errContains: "not a directory", // attempt to write through symlink leaf (which is not a directory) + }, + { + name: "directory symlink attack", + symlinkName: "safe_dir/link", + fileName: "safe_dir/link/payload.txt", + errContains: "not a directory", // attempt to write through symlink (which is not a directory) + }, + { + name: "symlink without payload file", + symlinkName: "standalone_link", + fileName: "", // no payload file + errContains: "", // no error expected, symlink without payload is safe + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tempDir := t.TempDir() + + // create outside target directory + outsideDir := filepath.Join(tempDir, "outside_target") + require.NoError(t, os.MkdirAll(outsideDir, 0755)) + 
+ // create extraction directory + extractDir := filepath.Join(tempDir, "extract") + require.NoError(t, os.MkdirAll(extractDir, 0755)) + + maliciousZip := createMaliciousZipWithSymlink(t, tempDir, tt.symlinkName, outsideDir, tt.fileName) + + err := UnzipToDir(context.Background(), maliciousZip, extractDir) + + // check error expectations + if tt.errContains != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tt.errContains) + } else { + require.NoError(t, err) + } + + analyzeExtractionDirectory(t, extractDir) + + // check if payload file escaped extraction directory + if tt.fileName != "" { + maliciousFile := filepath.Join(outsideDir, filepath.Base(tt.fileName)) + checkFileOutsideExtraction(t, maliciousFile) + } + + // check if symlink was created pointing outside + symlinkPath := filepath.Join(extractDir, tt.symlinkName) + checkSymlinkCreation(t, symlinkPath, extractDir, outsideDir) + }) + } +} + +// TestContentsFromZip_SymlinkAttacks tests the ContentsFromZip function with malicious +// ZIP archives containing symlink entries. +// +// EXPECTED BEHAVIOR: ContentsFromZip should either: +// 1. Reject symlinks explicitly, OR +// 2. Return empty content for symlinks (library behavior) +// +// Although ContentsFromZip doesn't write to disk, if symlinks are followed it could read sensitive +// files from outside the archive. 
+func TestContentsFromZip_SymlinkAttacks(t *testing.T) { + tests := []struct { + name string + symlinkName string + symlinkTarget string + requestPath string + errContains string + }{ + { + name: "request symlink entry directly", + symlinkName: "evil_link", + symlinkTarget: "/etc/hosts", // attempt to read sensitive file + requestPath: "evil_link", + errContains: "", // no error expected - library returns symlink metadata + }, + { + name: "symlink in nested directory", + symlinkName: "nested/link", + symlinkTarget: "/etc/hosts", + requestPath: "nested/link", + errContains: "", // no error expected - library returns symlink metadata + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tempDir := t.TempDir() + + // create malicious ZIP with symlink entry (no payload file needed) + maliciousZip := createMaliciousZipWithSymlink(t, tempDir, tt.symlinkName, tt.symlinkTarget, "") + + contents, err := ContentsFromZip(context.Background(), maliciousZip, tt.requestPath) + + // check error expectations + if tt.errContains != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tt.errContains) + return + } + require.NoError(t, err) + + // verify symlink handling - library should return symlink target as content (metadata) + content, found := contents[tt.requestPath] + require.True(t, found, "symlink entry should be found in results") + + // verify symlink was NOT followed (content should be target path or empty) + if content != "" && content != tt.symlinkTarget { + // content is not empty and not the symlink target - check if actual file was read + if _, statErr := os.Stat(tt.symlinkTarget); statErr == nil { + targetContent, readErr := os.ReadFile(tt.symlinkTarget) + if readErr == nil && string(targetContent) == content { + t.Errorf("critical issue!... 
symlink was FOLLOWED and external file content was read!") + t.Logf(" symlink: %s → %s", tt.requestPath, tt.symlinkTarget) + t.Logf(" content length: %d bytes", len(content)) + } + } + } + }) + } +} + +// TestExtractFromZipToUniqueTempFile_SymlinkAttacks tests the ExtractFromZipToUniqueTempFile +// function with malicious ZIP archives containing symlink entries. +// +// EXPECTED BEHAVIOR: ExtractFromZipToUniqueTempFile should either: +// 1. Reject symlinks explicitly, OR +// 2. Extract them safely (library converts to empty files, filepath.Base sanitizes names) +// +// This function uses filepath.Base() on the archive entry name for temp file prefix and +// os.CreateTemp() which creates files in the specified directory, so it should be protected. +func TestExtractFromZipToUniqueTempFile_SymlinkAttacks(t *testing.T) { + tests := []struct { + name string + symlinkName string + symlinkTarget string + requestPath string + errContains string + }{ + { + name: "extract symlink entry to temp file", + symlinkName: "evil_link", + symlinkTarget: "/etc/passwd", + requestPath: "evil_link", + errContains: "", // no error expected - library extracts symlink metadata + }, + { + name: "extract nested symlink", + symlinkName: "nested/dir/link", + symlinkTarget: "/tmp/outside", + requestPath: "nested/dir/link", + errContains: "", // no error expected + }, + { + name: "extract path traversal symlink name", + symlinkName: "../../escape", + symlinkTarget: "/tmp/outside", + requestPath: "../../escape", + errContains: "", // no error expected - filepath.Base sanitizes name + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tempDir := t.TempDir() + + maliciousZip := createMaliciousZipWithSymlink(t, tempDir, tt.symlinkName, tt.symlinkTarget, "") + + // create temp directory for extraction + extractTempDir := filepath.Join(tempDir, "temp_extract") + require.NoError(t, os.MkdirAll(extractTempDir, 0755)) + + openers, err := 
ExtractFromZipToUniqueTempFile(context.Background(), maliciousZip, extractTempDir, tt.requestPath) + + // check error expectations + if tt.errContains != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tt.errContains) + return + } + require.NoError(t, err) + + // verify symlink was extracted + opener, found := openers[tt.requestPath] + require.True(t, found, "symlink entry should be extracted") + + // verify temp file is within temp directory + tempFilePath := opener.path + cleanTempDir := filepath.Clean(extractTempDir) + cleanTempFile := filepath.Clean(tempFilePath) + require.True(t, strings.HasPrefix(cleanTempFile, cleanTempDir), + "temp file must be within temp directory: %s not in %s", cleanTempFile, cleanTempDir) + + // verify symlink was NOT followed (content should be target path or empty) + f, openErr := opener.Open() + require.NoError(t, openErr) + defer f.Close() + + content, readErr := io.ReadAll(f) + require.NoError(t, readErr) + + // check if symlink was followed (content matches actual file) + if len(content) > 0 && string(content) != tt.symlinkTarget { + if _, statErr := os.Stat(tt.symlinkTarget); statErr == nil { + targetContent, readErr := os.ReadFile(tt.symlinkTarget) + if readErr == nil && string(targetContent) == string(content) { + t.Errorf("critical issue!... symlink was FOLLOWED and external file content was copied!") + t.Logf(" symlink: %s → %s", tt.requestPath, tt.symlinkTarget) + t.Logf(" content length: %d bytes", len(content)) + } + } + } + }) + } +} + +// forensicFindings contains the results of analyzing an extraction directory +type forensicFindings struct { + symlinksFound []forensicSymlink + regularFiles []string + directories []string + symlinkVulnerabilities []string +} + +type forensicSymlink struct { + path string + target string + escapesExtraction bool + resolvedPath string +} + +// analyzeExtractionDirectory walks the extraction directory and detects symlinks that point +// outside the extraction directory. 
It is silent unless vulnerabilities are found. +func analyzeExtractionDirectory(t *testing.T, extractDir string) forensicFindings { + t.Helper() + + findings := forensicFindings{} + + filepath.Walk(extractDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + // only log if there's an error walking the directory + t.Logf("Error walking %s: %v", path, err) + return nil + } + + relPath := strings.TrimPrefix(path, extractDir+"/") + if relPath == "" { + relPath = "." + } + + // use Lstat to detect symlinks without following them + linfo, lerr := os.Lstat(path) + if lerr == nil && linfo.Mode()&os.ModeSymlink != 0 { + target, _ := os.Readlink(path) + + // resolve to see where it actually points + var resolvedPath string + var escapesExtraction bool + + if filepath.IsAbs(target) { + // absolute symlink + resolvedPath = target + cleanExtractDir := filepath.Clean(extractDir) + escapesExtraction = !strings.HasPrefix(filepath.Clean(target), cleanExtractDir) + + if escapesExtraction { + t.Errorf("critical issue!... absolute symlink created: %s → %s", relPath, target) + t.Logf(" this symlink points outside the extraction directory") + findings.symlinkVulnerabilities = append(findings.symlinkVulnerabilities, + fmt.Sprintf("absolute symlink: %s → %s", relPath, target)) + } + } else { + // relative symlink - resolve it + resolvedPath = filepath.Join(filepath.Dir(path), target) + cleanResolved := filepath.Clean(resolvedPath) + cleanExtractDir := filepath.Clean(extractDir) + + escapesExtraction = !strings.HasPrefix(cleanResolved, cleanExtractDir) + + if escapesExtraction { + t.Errorf("critical issue!... 
symlink escapes extraction dir: %s → %s", relPath, target) + t.Logf(" symlink resolves to: %s (outside extraction directory)", cleanResolved) + findings.symlinkVulnerabilities = append(findings.symlinkVulnerabilities, + fmt.Sprintf("relative symlink escape: %s → %s (resolves to %s)", relPath, target, cleanResolved)) + } + } + + findings.symlinksFound = append(findings.symlinksFound, forensicSymlink{ + path: relPath, + target: target, + escapesExtraction: escapesExtraction, + resolvedPath: resolvedPath, + }) + } else { + // regular file or directory - collect silently + if info.IsDir() { + findings.directories = append(findings.directories, relPath) + } else { + findings.regularFiles = append(findings.regularFiles, relPath) + } + } + return nil + }) + + return findings +} + +// checkFileOutsideExtraction checks if a file was written outside the extraction directory. +// Returns true if the file exists (vulnerability), false otherwise. Silent on success. +func checkFileOutsideExtraction(t *testing.T, filePath string) bool { + t.Helper() + + if stat, err := os.Stat(filePath); err == nil { + content, _ := os.ReadFile(filePath) + t.Errorf("critical issue!... file written OUTSIDE extraction directory!") + t.Logf(" location: %s", filePath) + t.Logf(" size: %d bytes", stat.Size()) + t.Logf(" content: %s", string(content)) + t.Logf(" ...this means an attacker can write files to arbitrary locations on the filesystem") + return true + } + // no file found outside extraction directory... + return false +} + +// checkSymlinkCreation verifies if a symlink was created at the expected path and reports +// whether it points outside the extraction directory. Silent unless a symlink is found. 
+func checkSymlinkCreation(t *testing.T, symlinkPath, extractDir, expectedTarget string) bool { + t.Helper() + + if linfo, err := os.Lstat(symlinkPath); err == nil { + if linfo.Mode()&os.ModeSymlink != 0 { + target, _ := os.Readlink(symlinkPath) + + if expectedTarget != "" && target == expectedTarget { + t.Errorf("critical issue!... symlink pointing outside extraction dir was created!") + t.Logf(" Symlink: %s → %s", symlinkPath, target) + return true + } + + // Check if it escapes even if target doesn't match expected + if filepath.IsAbs(target) { + cleanExtractDir := filepath.Clean(extractDir) + if !strings.HasPrefix(filepath.Clean(target), cleanExtractDir) { + t.Errorf("critical issue!... absolute symlink escapes extraction dir!") + t.Logf(" symlink: %s → %s", symlinkPath, target) + return true + } + } + } + // if it exists but is not a symlink, that's good (attack was thwarted)... + } + + return false +} + +// createMaliciousZipWithSymlink creates a ZIP archive containing a symlink entry pointing to an arbitrary target, +// followed by a file entry that attempts to write through that symlink. +// returns the path to the created ZIP archive. +func createMaliciousZipWithSymlink(t *testing.T, tempDir, symlinkName, symlinkTarget, fileName string) string { + t.Helper() + + maliciousZip := filepath.Join(tempDir, "malicious.zip") + zipFile, err := os.Create(maliciousZip) + require.NoError(t, err) + defer zipFile.Close() + + zw := zip.NewWriter(zipFile) + + // create parent directories if the symlink is nested + if dir := filepath.Dir(symlinkName); dir != "." 
{ + dirHeader := &zip.FileHeader{ + Name: dir + "/", + Method: zip.Store, + } + dirHeader.SetMode(os.ModeDir | 0755) + _, err = zw.CreateHeader(dirHeader) + require.NoError(t, err) + } + + // create symlink entry pointing outside extraction directory + // note: ZIP format stores symlinks as regular files with the target path as content + symlinkHeader := &zip.FileHeader{ + Name: symlinkName, + Method: zip.Store, + } + symlinkHeader.SetMode(os.ModeSymlink | 0755) + + symlinkWriter, err := zw.CreateHeader(symlinkHeader) + require.NoError(t, err) + + // write the symlink target as the file content (this is how ZIP stores symlinks) + _, err = symlinkWriter.Write([]byte(symlinkTarget)) + require.NoError(t, err) + + // create file entry that will be written through the symlink + if fileName != "" { + payloadContent := []byte("MALICIOUS PAYLOAD - This should NOT be written outside extraction dir!") + payloadHeader := &zip.FileHeader{ + Name: fileName, + Method: zip.Deflate, + } + payloadHeader.SetMode(0644) + + payloadWriter, err := zw.CreateHeader(payloadHeader) + require.NoError(t, err) + + _, err = payloadWriter.Write(payloadContent) + require.NoError(t, err) + } + + require.NoError(t, zw.Close()) + require.NoError(t, zipFile.Close()) + + return maliciousZip +} diff --git a/internal/file/zip_read_closer.go b/internal/file/zip_read_closer.go deleted file mode 100644 index fd45f52a1..000000000 --- a/internal/file/zip_read_closer.go +++ /dev/null @@ -1,229 +0,0 @@ -package file - -import ( - "archive/zip" - "encoding/binary" - "errors" - "fmt" - "io" - "math" - "os" - - "github.com/anchore/syft/internal/log" -) - -// directoryEndLen, readByf, directoryEnd, and findSignatureInBlock were copied from the golang stdlib, specifically: -// - https://github.com/golang/go/blob/go1.16.4/src/archive/zip/struct.go -// - https://github.com/golang/go/blob/go1.16.4/src/archive/zip/reader.go -// findArchiveStartOffset is derived from the same stdlib utils, specifically the 
readDirectoryEnd function. - -const ( - directoryEndLen = 22 - directory64LocLen = 20 - directory64EndLen = 56 - directory64LocSignature = 0x07064b50 - directory64EndSignature = 0x06064b50 -) - -// ZipReadCloser is a drop-in replacement for zip.ReadCloser (from zip.OpenReader) that additionally considers zips -// that have bytes prefixed to the front of the archive (common with self-extracting jars). -type ZipReadCloser struct { - *zip.Reader - io.Closer -} - -// OpenZip provides a ZipReadCloser for the given filepath. -func OpenZip(filepath string) (*ZipReadCloser, error) { - f, err := os.Open(filepath) - if err != nil { - return nil, err - } - fi, err := f.Stat() - if err != nil { - f.Close() - return nil, err - } - - // some archives may have bytes prepended to the front of the archive, such as with self executing JARs. We first - // need to find the start of the archive and keep track of this offset. - offset, err := findArchiveStartOffset(f, fi.Size()) - if err != nil { - log.Debugf("cannot find beginning of zip archive=%q : %v", filepath, err) - return nil, err - } - - if _, err := f.Seek(0, io.SeekStart); err != nil { - return nil, fmt.Errorf("unable to seek to beginning of archive: %w", err) - } - - if offset > math.MaxInt64 { - return nil, fmt.Errorf("archive start offset too large: %v", offset) - } - offset64 := int64(offset) - - size := fi.Size() - offset64 - - r, err := zip.NewReader(io.NewSectionReader(f, offset64, size), size) - if err != nil { - log.Debugf("unable to open ZipReadCloser @ %q: %v", filepath, err) - return nil, err - } - - return &ZipReadCloser{ - Reader: r, - Closer: f, - }, nil -} - -type readBuf []byte - -func (b *readBuf) uint16() uint16 { - v := binary.LittleEndian.Uint16(*b) - *b = (*b)[2:] - return v -} - -func (b *readBuf) uint32() uint32 { - v := binary.LittleEndian.Uint32(*b) - *b = (*b)[4:] - return v -} - -func (b *readBuf) uint64() uint64 { - v := binary.LittleEndian.Uint64(*b) - *b = (*b)[8:] - return v -} - -type 
directoryEnd struct { - diskNbr uint32 // unused - dirDiskNbr uint32 // unused - dirRecordsThisDisk uint64 // unused - directoryRecords uint64 - directorySize uint64 - directoryOffset uint64 // relative to file -} - -// note: this is derived from readDirectoryEnd within the archive/zip package -func findArchiveStartOffset(r io.ReaderAt, size int64) (startOfArchive uint64, err error) { - // look for directoryEndSignature in the last 1k, then in the last 65k - var buf []byte - var directoryEndOffset int64 - for i, bLen := range []int64{1024, 65 * 1024} { - if bLen > size { - bLen = size - } - buf = make([]byte, int(bLen)) - if _, err := r.ReadAt(buf, size-bLen); err != nil && !errors.Is(err, io.EOF) { - return 0, err - } - if p := findSignatureInBlock(buf); p >= 0 { - buf = buf[p:] - directoryEndOffset = size - bLen + int64(p) - break - } - if i == 1 || bLen == size { - return 0, zip.ErrFormat - } - } - - if buf == nil { - // we were unable to find the directoryEndSignature block - return 0, zip.ErrFormat - } - - // read header into struct - b := readBuf(buf[4:]) // skip signature - d := &directoryEnd{ - diskNbr: uint32(b.uint16()), - dirDiskNbr: uint32(b.uint16()), - dirRecordsThisDisk: uint64(b.uint16()), - directoryRecords: uint64(b.uint16()), - directorySize: uint64(b.uint32()), - directoryOffset: uint64(b.uint32()), - } - // Calculate where the zip data actually begins - - // These values mean that the file can be a zip64 file - if d.directoryRecords == 0xffff || d.directorySize == 0xffff || d.directoryOffset == 0xffffffff { - p, err := findDirectory64End(r, directoryEndOffset) - if err == nil && p >= 0 { - directoryEndOffset = p - err = readDirectory64End(r, p, d) - } - if err != nil { - return 0, err - } - } - startOfArchive = uint64(directoryEndOffset) - d.directorySize - d.directoryOffset - - // Make sure directoryOffset points to somewhere in our file. 
- if d.directoryOffset >= uint64(size) { - return 0, zip.ErrFormat - } - return startOfArchive, nil -} - -// findDirectory64End tries to read the zip64 locator just before the -// directory end and returns the offset of the zip64 directory end if -// found. -func findDirectory64End(r io.ReaderAt, directoryEndOffset int64) (int64, error) { - locOffset := directoryEndOffset - directory64LocLen - if locOffset < 0 { - return -1, nil // no need to look for a header outside the file - } - buf := make([]byte, directory64LocLen) - if _, err := r.ReadAt(buf, locOffset); err != nil { - return -1, err - } - b := readBuf(buf) - if sig := b.uint32(); sig != directory64LocSignature { - return -1, nil - } - if b.uint32() != 0 { // number of the disk with the start of the zip64 end of central directory - return -1, nil // the file is not a valid zip64-file - } - p := b.uint64() // relative offset of the zip64 end of central directory record - if b.uint32() != 1 { // total number of disks - return -1, nil // the file is not a valid zip64-file - } - return int64(p), nil -} - -// readDirectory64End reads the zip64 directory end and updates the -// directory end with the zip64 directory end values. 
-func readDirectory64End(r io.ReaderAt, offset int64, d *directoryEnd) (err error) { - buf := make([]byte, directory64EndLen) - if _, err := r.ReadAt(buf, offset); err != nil { - return err - } - - b := readBuf(buf) - if sig := b.uint32(); sig != directory64EndSignature { - return errors.New("could not read directory64End") - } - - b = b[12:] // skip dir size, version and version needed (uint64 + 2x uint16) - d.diskNbr = b.uint32() // number of this disk - d.dirDiskNbr = b.uint32() // number of the disk with the start of the central directory - d.dirRecordsThisDisk = b.uint64() // total number of entries in the central directory on this disk - d.directoryRecords = b.uint64() // total number of entries in the central directory - d.directorySize = b.uint64() // size of the central directory - d.directoryOffset = b.uint64() // offset of start of central directory with respect to the starting disk number - - return nil -} - -func findSignatureInBlock(b []byte) int { - for i := len(b) - directoryEndLen; i >= 0; i-- { - // defined from directoryEndSignature - if b[i] == 'P' && b[i+1] == 'K' && b[i+2] == 0x05 && b[i+3] == 0x06 { - // n is length of comment - n := int(b[i+directoryEndLen-2]) | int(b[i+directoryEndLen-1])<<8 - if n+directoryEndLen+i <= len(b) { - return i - } - } - } - return -1 -} diff --git a/internal/file/zip_read_closer_test.go b/internal/file/zip_read_closer_test.go deleted file mode 100644 index 349bfcc9b..000000000 --- a/internal/file/zip_read_closer_test.go +++ /dev/null @@ -1,50 +0,0 @@ -//go:build !windows -// +build !windows - -package file - -import ( - "os" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestFindArchiveStartOffset(t *testing.T) { - tests := []struct { - name string - archivePrep func(tb testing.TB) string - expected uint64 - }{ - { - name: "standard, non-nested zip", - archivePrep: prepZipSourceFixture, - expected: 0, - }, - { - name: "zip with prepended bytes", - archivePrep: 
prependZipSourceFixtureWithString(t, "junk at the beginning of the file..."), - expected: 36, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - archivePath := test.archivePrep(t) - f, err := os.Open(archivePath) - if err != nil { - t.Fatalf("could not open archive %q: %+v", archivePath, err) - } - fi, err := os.Stat(f.Name()) - if err != nil { - t.Fatalf("unable to stat archive: %+v", err) - } - - actual, err := findArchiveStartOffset(f, fi.Size()) - if err != nil { - t.Fatalf("unable to find offset: %+v", err) - } - assert.Equal(t, test.expected, actual) - }) - } -} diff --git a/internal/task/unknowns_tasks.go b/internal/task/unknowns_tasks.go index 0b8959bd0..2f63ce28e 100644 --- a/internal/task/unknowns_tasks.go +++ b/internal/task/unknowns_tasks.go @@ -4,7 +4,8 @@ import ( "context" "strings" - "github.com/anchore/archiver/v3" + "github.com/mholt/archives" + "github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/sbomsync" "github.com/anchore/syft/syft/cataloging" @@ -57,9 +58,10 @@ func (c unknownsLabelerTask) finalize(resolver file.Resolver, s *sbom.SBOM) { } if c.IncludeUnexpandedArchives { + ctx := context.Background() for coords := range s.Artifacts.FileMetadata { - unarchiver, notArchiveErr := archiver.ByExtension(coords.RealPath) - if unarchiver != nil && notArchiveErr == nil && !hasPackageReference(coords) { + format, _, notArchiveErr := archives.Identify(ctx, coords.RealPath, nil) + if format != nil && notArchiveErr == nil && !hasPackageReference(coords) { s.Artifacts.Unknowns[coords] = append(s.Artifacts.Unknowns[coords], "archive not cataloged") } } diff --git a/syft/format/github/internal/model/model.go b/syft/format/github/internal/model/model.go index 69d2b9876..b2aa0d23a 100644 --- a/syft/format/github/internal/model/model.go +++ b/syft/format/github/internal/model/model.go @@ -1,11 +1,13 @@ package model import ( + "context" "fmt" "strings" "time" - "github.com/anchore/archiver/v3" + 
"github.com/mholt/archives" + "github.com/anchore/packageurl-go" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/pkg" @@ -153,8 +155,8 @@ func trimRelative(s string) string { // isArchive returns true if the path appears to be an archive func isArchive(path string) bool { - _, err := archiver.ByExtension(path) - return err == nil + format, _, err := archives.Identify(context.Background(), path, nil) + return err == nil && format != nil } func toDependencies(s *sbom.SBOM, p pkg.Package) (out []string) { diff --git a/syft/pkg/cataloger/java/archive_parser.go b/syft/pkg/cataloger/java/archive_parser.go index 27d4f164f..d44ec5d3a 100644 --- a/syft/pkg/cataloger/java/archive_parser.go +++ b/syft/pkg/cataloger/java/archive_parser.go @@ -80,7 +80,7 @@ func (gap genericArchiveParserAdapter) parseJavaArchive(ctx context.Context, _ f // processJavaArchive processes an archive for java contents, returning all Java libraries and nested archives func (gap genericArchiveParserAdapter) processJavaArchive(ctx context.Context, reader file.LocationReadCloser, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { - parser, cleanupFn, err := newJavaArchiveParser(reader, true, gap.cfg) + parser, cleanupFn, err := newJavaArchiveParser(ctx, reader, true, gap.cfg) // note: even on error, we should always run cleanup functions defer cleanupFn() if err != nil { @@ -99,7 +99,7 @@ func uniquePkgKey(groupID string, p *pkg.Package) string { // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover // and parse nested archives or ignore them. 
-func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) { +func newJavaArchiveParser(ctx context.Context, reader file.LocationReadCloser, detectNested bool, cfg ArchiveCatalogerConfig) (*archiveParser, func(), error) { // fetch the last element of the virtual path virtualElements := strings.Split(reader.Path(), ":") currentFilepath := virtualElements[len(virtualElements)-1] @@ -109,7 +109,7 @@ func newJavaArchiveParser(reader file.LocationReadCloser, detectNested bool, cfg return nil, cleanupFn, fmt.Errorf("unable to process java archive: %w", err) } - fileManifest, err := intFile.NewZipFileManifest(archivePath) + fileManifest, err := intFile.NewZipFileManifest(ctx, archivePath) if err != nil { return nil, cleanupFn, fmt.Errorf("unable to read files from java archive: %w", err) } @@ -226,7 +226,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, } // fetch the manifest file - contents, err := intFile.ContentsFromZip(j.archivePath, manifestMatches...) + contents, err := intFile.ContentsFromZip(ctx, j.archivePath, manifestMatches...) 
if err != nil { return nil, fmt.Errorf("unable to extract java manifests (%s): %w", j.location, err) } @@ -387,8 +387,9 @@ type parsedPomProject struct { // discoverMainPackageFromPomInfo attempts to resolve maven groupId, artifactId, version and other info from found pom information func (j *archiveParser) discoverMainPackageFromPomInfo(ctx context.Context) (group, name, version string, parsedPom *parsedPomProject) { - properties, _ := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) - projects, _ := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) + // Find the pom.properties/pom.xml if the names seem like a plausible match + properties, _ := pomPropertiesByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) + projects, _ := pomProjectByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) artifactsMap := j.buildArtifactsMap(properties) pomProperties, parsedPom := j.findBestPomMatch(properties, projects, artifactsMap) @@ -519,13 +520,13 @@ func (j *archiveParser) discoverPkgsFromAllMavenFiles(ctx context.Context, paren var pkgs []pkg.Package // pom.properties - properties, err := pomPropertiesByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) + properties, err := pomPropertiesByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomPropertiesGlob)) if err != nil { return nil, err } // pom.xml - projects, err := pomProjectByParentPath(j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) + projects, err := pomProjectByParentPath(ctx, j.archivePath, j.location, j.fileManifest.GlobMatch(false, pomXMLGlob)) if err != nil { return nil, err } @@ -575,7 +576,7 @@ func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) ([]pkg. 
} if len(licenseMatches) > 0 { - contents, err := intFile.ContentsFromZip(j.archivePath, licenseMatches...) + contents, err := intFile.ContentsFromZip(ctx, j.archivePath, licenseMatches...) if err != nil { return nil, fmt.Errorf("unable to extract java license (%s): %w", j.location, err) } @@ -616,7 +617,7 @@ func (j *archiveParser) discoverPkgsFromNestedArchives(ctx context.Context, pare // associating each discovered package to the given parent package. func discoverPkgsFromZip(ctx context.Context, location file.Location, archivePath, contentPath string, fileManifest intFile.ZipFileManifest, parentPkg *pkg.Package, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) { // search and parse pom.properties files & fetch the contents - openers, err := intFile.ExtractFromZipToUniqueTempFile(archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...) + openers, err := intFile.ExtractFromZipToUniqueTempFile(ctx, archivePath, contentPath, fileManifest.GlobMatch(false, archiveFormatGlobs...)...) if err != nil { return nil, nil, fmt.Errorf("unable to extract files from zip: %w", err) } @@ -680,8 +681,8 @@ func discoverPkgsFromOpener(ctx context.Context, location file.Location, pathWit return nestedPkgs, nestedRelationships, nil } -func pomPropertiesByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]pkg.JavaPomProperties, error) { - contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) +func pomPropertiesByParentPath(ctx context.Context, archivePath string, location file.Location, extractPaths []string) (map[string]pkg.JavaPomProperties, error) { + contentsOfMavenPropertiesFiles, err := intFile.ContentsFromZip(ctx, archivePath, extractPaths...) 
if err != nil { return nil, fmt.Errorf("unable to extract maven files: %w", err) } @@ -709,8 +710,8 @@ func pomPropertiesByParentPath(archivePath string, location file.Location, extra return propertiesByParentPath, nil } -func pomProjectByParentPath(archivePath string, location file.Location, extractPaths []string) (map[string]*parsedPomProject, error) { - contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(archivePath, extractPaths...) +func pomProjectByParentPath(ctx context.Context, archivePath string, location file.Location, extractPaths []string) (map[string]*parsedPomProject, error) { + contentsOfMavenProjectFiles, err := intFile.ContentsFromZip(ctx, archivePath, extractPaths...) if err != nil { return nil, fmt.Errorf("unable to extract maven files: %w", err) } diff --git a/syft/pkg/cataloger/java/archive_parser_test.go b/syft/pkg/cataloger/java/archive_parser_test.go index 76187e5fd..c5f559d65 100644 --- a/syft/pkg/cataloger/java/archive_parser_test.go +++ b/syft/pkg/cataloger/java/archive_parser_test.go @@ -72,7 +72,7 @@ func TestSearchMavenForLicenses(t *testing.T) { require.NoError(t, err) // setup parser - ap, cleanupFn, err := newJavaArchiveParser( + ap, cleanupFn, err := newJavaArchiveParser(context.Background(), file.LocationReadCloser{ Location: file.NewLocation(fixture.Name()), ReadCloser: fixture, @@ -372,7 +372,7 @@ func TestParseJar(t *testing.T) { UseNetwork: false, UseMavenLocalRepository: false, } - parser, cleanupFn, err := newJavaArchiveParser( + parser, cleanupFn, err := newJavaArchiveParser(context.Background(), file.LocationReadCloser{ Location: file.NewLocation(fixture.Name()), ReadCloser: fixture, @@ -1499,7 +1499,7 @@ func Test_deterministicMatchingPomProperties(t *testing.T) { fixture, err := os.Open(fixturePath) require.NoError(t, err) - parser, cleanupFn, err := newJavaArchiveParser( + parser, cleanupFn, err := newJavaArchiveParser(context.Background(), file.LocationReadCloser{ Location: file.NewLocation(fixture.Name()), 
ReadCloser: fixture, @@ -1636,7 +1636,7 @@ func Test_jarPomPropertyResolutionDoesNotPanic(t *testing.T) { ctx := context.TODO() // setup parser - ap, cleanupFn, err := newJavaArchiveParser( + ap, cleanupFn, err := newJavaArchiveParser(context.Background(), file.LocationReadCloser{ Location: file.NewLocation(fixture.Name()), ReadCloser: fixture, diff --git a/syft/pkg/cataloger/java/tar_wrapped_archive_parser.go b/syft/pkg/cataloger/java/tar_wrapped_archive_parser.go index 5af4f0b3f..4c4edc595 100644 --- a/syft/pkg/cataloger/java/tar_wrapped_archive_parser.go +++ b/syft/pkg/cataloger/java/tar_wrapped_archive_parser.go @@ -70,7 +70,7 @@ func (gtp genericTarWrappedJavaArchiveParser) parseTarWrappedJavaArchive(ctx con } func discoverPkgsFromTar(ctx context.Context, location file.Location, archivePath, contentPath string, cfg ArchiveCatalogerConfig) ([]pkg.Package, []artifact.Relationship, error) { - openers, err := intFile.ExtractGlobsFromTarToUniqueTempFile(archivePath, contentPath, archiveFormatGlobs...) + openers, err := intFile.ExtractGlobsFromTarToUniqueTempFile(ctx, archivePath, contentPath, archiveFormatGlobs...) if err != nil { return nil, nil, fmt.Errorf("unable to extract files from tar: %w", err) } diff --git a/syft/pkg/cataloger/java/zip_wrapped_archive_parser.go b/syft/pkg/cataloger/java/zip_wrapped_archive_parser.go index 3dd1d2524..e515f4f90 100644 --- a/syft/pkg/cataloger/java/zip_wrapped_archive_parser.go +++ b/syft/pkg/cataloger/java/zip_wrapped_archive_parser.go @@ -41,7 +41,7 @@ func (gzp genericZipWrappedJavaArchiveParser) parseZipWrappedJavaArchive(ctx con // functions support zips with shell scripts prepended to the file. Specifically, the helpers use the central // header at the end of the file to determine where the beginning of the zip payload is (unlike the standard lib // or archiver). 
- fileManifest, err := intFile.NewZipFileManifest(archivePath) + fileManifest, err := intFile.NewZipFileManifest(ctx, archivePath) if err != nil { return nil, nil, fmt.Errorf("unable to read files from java archive: %w", err) } diff --git a/syft/source/filesource/file_source.go b/syft/source/filesource/file_source.go index da2be0e19..0517be04a 100644 --- a/syft/source/filesource/file_source.go +++ b/syft/source/filesource/file_source.go @@ -4,13 +4,15 @@ import ( "context" "crypto" "fmt" + "io" "os" "path" + "path/filepath" "sync" + "github.com/mholt/archives" "github.com/opencontainers/go-digest" - "github.com/anchore/archiver/v3" stereoFile "github.com/anchore/stereoscope/pkg/file" intFile "github.com/anchore/syft/internal/file" "github.com/anchore/syft/internal/log" @@ -208,18 +210,8 @@ func fileAnalysisPath(path string, skipExtractArchive bool) (string, func() erro // if the given file is an archive (as indicated by the file extension and not MIME type) then unarchive it and // use the contents as the source. Note: this does NOT recursively unarchive contents, only the given path is // unarchived. 
- envelopedUnarchiver, err := archiver.ByExtension(path) - if unarchiver, ok := envelopedUnarchiver.(archiver.Unarchiver); err == nil && ok { - // when tar/zip files are extracted, if there are multiple entries at the same - // location, the last entry wins - // NOTE: this currently does not display any messages if an overwrite happens - switch v := unarchiver.(type) { - case *archiver.Tar: - v.OverwriteExisting = true - case *archiver.Zip: - v.OverwriteExisting = true - } - + envelopedUnarchiver, _, err := archives.Identify(context.Background(), path, nil) + if unarchiver, ok := envelopedUnarchiver.(archives.Extractor); err == nil && ok { analysisPath, cleanupFn, err = unarchiveToTmp(path, unarchiver) if err != nil { return "", nil, fmt.Errorf("unable to unarchive source file: %w", err) @@ -246,15 +238,58 @@ func digestOfFileContents(path string) string { return di.String() } -func unarchiveToTmp(path string, unarchiver archiver.Unarchiver) (string, func() error, error) { +func unarchiveToTmp(path string, unarchiver archives.Extractor) (string, func() error, error) { + var cleanupFn = func() error { return nil } + archive, err := os.Open(path) + if err != nil { + return "", cleanupFn, fmt.Errorf("unable to open archive: %v", err) + } + defer archive.Close() + tempDir, err := os.MkdirTemp("", "syft-archive-contents-") if err != nil { - return "", func() error { return nil }, fmt.Errorf("unable to create tempdir for archive processing: %w", err) + return "", cleanupFn, fmt.Errorf("unable to create tempdir for archive processing: %w", err) } - cleanupFn := func() error { + visitor := func(_ context.Context, file archives.FileInfo) error { + // Protect against symlink attacks by ensuring path doesn't escape tempDir + destPath, err := intFile.SafeJoin(tempDir, file.NameInArchive) + if err != nil { + return err + } + + if file.IsDir() { + return os.MkdirAll(destPath, file.Mode()) + } + + if err = os.MkdirAll(filepath.Dir(destPath), os.ModeDir|0755); err != nil { + 
return fmt.Errorf("failed to create parent directory: %w", err) + } + + rc, err := file.Open() + if err != nil { + return fmt.Errorf("failed to open file in archive: %w", err) + } + defer rc.Close() + + destFile, err := os.Create(destPath) + if err != nil { + return fmt.Errorf("failed to create file in destination: %w", err) + } + defer destFile.Close() + + if err := destFile.Chmod(file.Mode()); err != nil { + return fmt.Errorf("failed to change mode of destination file: %w", err) + } + + if _, err := io.Copy(destFile, rc); err != nil { + return fmt.Errorf("failed to copy file contents: %w", err) + } + + return nil + } + + return tempDir, func() error { return os.RemoveAll(tempDir) - } - - return tempDir, cleanupFn, unarchiver.Unarchive(path, tempDir) + }, unarchiver.Extract(context.Background(), archive, visitor) }