From 9e3150b7ee5a7d1ae0a15d6d12c0f3ab410be447 Mon Sep 17 00:00:00 2001 From: Keith Zantow Date: Mon, 8 Dec 2025 15:58:13 -0500 Subject: [PATCH] fix: java archives excluded due to incorrect license glob results (#4449) Signed-off-by: Keith Zantow --- internal/file/glob_match.go | 46 ------------------- internal/file/glob_match_test.go | 39 ---------------- .../zip-source/b-file/in-subdir.txt | 1 + internal/file/zip_file_helpers_test.go | 14 ++---- internal/file/zip_file_manifest.go | 16 ++++++- internal/file/zip_file_manifest_test.go | 26 ++++++----- syft/pkg/cataloger/java/archive_parser.go | 25 ++++------ .../cataloger/java/parse_pom_properties.go | 2 +- syft/pkg/cataloger/java/parse_pom_xml.go | 2 +- .../python/parse_wheel_egg_metadata.go | 4 +- 10 files changed, 47 insertions(+), 128 deletions(-) delete mode 100644 internal/file/glob_match.go delete mode 100644 internal/file/glob_match_test.go create mode 100644 internal/file/test-fixtures/zip-source/b-file/in-subdir.txt diff --git a/internal/file/glob_match.go b/internal/file/glob_match.go deleted file mode 100644 index 6befe32e4..000000000 --- a/internal/file/glob_match.go +++ /dev/null @@ -1,46 +0,0 @@ -package file - -// GlobMatch evaluates the given glob pattern against the given "name" string, indicating if there is a match or not. -// Source: https://research.swtch.com/glob.go -func GlobMatch(pattern, name string) bool { - px := 0 - nx := 0 - nextPx := 0 - nextNx := 0 - for px < len(pattern) || nx < len(name) { - if px < len(pattern) { - c := pattern[px] - switch c { - default: // ordinary character - if nx < len(name) && name[nx] == c { - px++ - nx++ - continue - } - case '?': // single-character wildcard - if nx < len(name) { - px++ - nx++ - continue - } - case '*': // zero-or-more-character wildcard - // Try to match at nx. - // If that doesn't work out, - // restart at nx+1 next. - nextPx = px - nextNx = nx + 1 - px++ - continue - } - } - // Mismatch. Maybe restart. - if 0 < nextNx && nextNx <= len(name) { - px = nextPx - nx = nextNx - continue - } - return false - } - // Matched all of pattern to all of name. Success. - return true -} diff --git a/internal/file/glob_match_test.go b/internal/file/glob_match_test.go deleted file mode 100644 index aab2803dc..000000000 --- a/internal/file/glob_match_test.go +++ /dev/null @@ -1,39 +0,0 @@ -package file - -import ( - "strings" - "testing" -) - -func TestGlobMatch(t *testing.T) { - var tests = []struct { - pattern string - data string - ok bool - }{ - {"", "", true}, - {"x", "", false}, - {"", "x", false}, - {"abc", "abc", true}, - {"*", "abc", true}, - {"*c", "abc", true}, - {"*b", "abc", false}, - {"a*", "abc", true}, - {"b*", "abc", false}, - {"a*", "a", true}, - {"*a", "a", true}, - {"a*b*c*d*e*", "axbxcxdxe", true}, - {"a*b*c*d*e*", "axbxcxdxexxx", true}, - {"a*b?c*x", "abxbbxdbxebxczzx", true}, - {"a*b?c*x", "abxbbxdbxebxczzy", false}, - {"a*a*a*a*b", strings.Repeat("a", 100), false}, - {"*x", "xxx", true}, - {"/home/place/**", "/home/place/a/thing", true}, - } - - for _, test := range tests { - if GlobMatch(test.pattern, test.data) != test.ok { - t.Errorf("failed glob='%s' data='%s'", test.pattern, test.data) - } - } -} diff --git a/internal/file/test-fixtures/zip-source/b-file/in-subdir.txt b/internal/file/test-fixtures/zip-source/b-file/in-subdir.txt new file mode 100644 index 000000000..63e98b6f6 --- /dev/null +++ b/internal/file/test-fixtures/zip-source/b-file/in-subdir.txt @@ -0,0 +1 @@ +this file is in a subdirectory \ No newline at end of file diff --git a/internal/file/zip_file_helpers_test.go b/internal/file/zip_file_helpers_test.go index a94304cca..22f92e97b 100644 --- a/internal/file/zip_file_helpers_test.go +++ b/internal/file/zip_file_helpers_test.go @@ -7,14 +7,14 @@ import ( "path/filepath" "syscall" "testing" - - "github.com/stretchr/testify/assert" ) var expectedZipArchiveEntries = []string{ - "some-dir" + string(os.PathSeparator), - filepath.Join("some-dir", "a-file.txt"), + "some-dir/", + "some-dir/a-file.txt", "b-file.txt", + "b-file/", + "b-file/in-subdir.txt", "nested.zip", } @@ -59,12 +59,6 @@ func createZipArchive(t testing.TB, sourceDirPath, destinationArchivePath string } -func assertNoError(t testing.TB, fn func() error) func() { - return func() { - assert.NoError(t, fn()) - } -} - // setupZipFileTest encapsulates common test setup work for zip file tests. It returns a cleanup function, // which should be called (typically deferred) by the caller, the path of the created zip archive, and an error, // which should trigger a fatal test failure in the consuming test. The returned cleanup function will never be nil diff --git a/internal/file/zip_file_manifest.go b/internal/file/zip_file_manifest.go index 8dcb0d2f2..34a2cb66d 100644 --- a/internal/file/zip_file_manifest.go +++ b/internal/file/zip_file_manifest.go @@ -6,6 +6,7 @@ import ( "sort" "strings" + "github.com/bmatcuk/doublestar/v4" "github.com/mholt/archives" "github.com/scylladb/go-set/strset" @@ -44,12 +45,17 @@ func (z ZipFileManifest) Add(entry string, info os.FileInfo) { z[entry] = info } -// GlobMatch returns the path keys that match the given value(s). +// GlobMatch returns the path keys to files (not directories) that match the given value(s). func (z ZipFileManifest) GlobMatch(caseInsensitive bool, patterns ...string) []string { uniqueMatches := strset.New() for _, pattern := range patterns { for entry := range z { + fileInfo := z[entry] + if fileInfo != nil && fileInfo.IsDir() { + continue + } + // We want to match globs as if entries begin with a leading slash (akin to an absolute path) // so that glob logic is consistent inside and outside of ZIP archives normalizedEntry := normalizeZipEntryName(caseInsensitive, entry) @@ -57,7 +63,13 @@ func (z ZipFileManifest) GlobMatch(caseInsensitive bool, patterns ...string) []s if caseInsensitive { pattern = strings.ToLower(pattern) } - if GlobMatch(pattern, normalizedEntry) { + + matches, err := doublestar.Match(pattern, normalizedEntry) + if err != nil { + log.Debugf("error with match pattern '%s', including by default: %v", pattern, err) + matches = true + } + if matches { uniqueMatches.Add(entry) } } diff --git a/internal/file/zip_file_manifest_test.go b/internal/file/zip_file_manifest_test.go index 9ebe42224..339048a1d 100644 --- a/internal/file/zip_file_manifest_test.go +++ b/internal/file/zip_file_manifest_test.go @@ -9,6 +9,8 @@ import ( "os" "path" "testing" + + "github.com/stretchr/testify/require" ) func TestNewZipFileManifest(t *testing.T) { @@ -107,23 +109,27 @@ func TestZipFileManifest_GlobMatch(t *testing.T) { cases := []struct { glob string - expected string + expected []string }{ { "/b*", - "b-file.txt", + []string{"b-file.txt"}, }, { - "*/a-file.txt", - "some-dir/a-file.txt", + "/b*/**", + []string{"b-file.txt", "b-file/in-subdir.txt"}, }, { - "*/A-file.txt", - "some-dir/a-file.txt", + "**/a-file.txt", + []string{"some-dir/a-file.txt"}, + }, + { + "**/A-file.txt", + []string{"some-dir/a-file.txt"}, }, { "**/*.zip", - "nested.zip", + []string{"nested.zip"}, }, } @@ -133,11 +139,7 @@ func TestZipFileManifest_GlobMatch(t *testing.T) { results := z.GlobMatch(true, glob) - if len(results) == 1 && results[0] == tc.expected { - return - } - - t.Errorf("unexpected results for glob '%s': %+v", glob, results) + require.ElementsMatch(t, tc.expected, results) }) } } diff --git a/syft/pkg/cataloger/java/archive_parser.go b/syft/pkg/cataloger/java/archive_parser.go index d44ec5d3a..7cefa9f47 100644 --- a/syft/pkg/cataloger/java/archive_parser.go +++ b/syft/pkg/cataloger/java/archive_parser.go @@ -184,7 +184,7 @@ func (j *archiveParser) parse(ctx context.Context, parentPkg *pkg.Package) ([]pk relationships = append(relationships, nestedRelationships...) } else { // .jar and .war files are present in archives, are others? or generally just consider them top-level? - nestedArchives := j.fileManifest.GlobMatch(true, "*.jar", "*.war") + nestedArchives := j.fileManifest.GlobMatch(true, "**/*.jar", "**/*.war") if len(nestedArchives) > 0 { slices.Sort(nestedArchives) errs = unknown.Appendf(errs, j.location, "nested archives not cataloged: %v", strings.Join(nestedArchives, ", ")) @@ -252,10 +252,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, return nil, err } - name, version, lics, parsedPom, err := j.discoverNameVersionLicense(ctx, manifest) - if err != nil { - return nil, err - } + name, version, lics, parsedPom := j.discoverNameVersionLicense(ctx, manifest) var pkgPomProject *pkg.JavaPomProject if parsedPom != nil { pkgPomProject = newPomProject(ctx, j.maven, parsedPom.path, parsedPom.project) @@ -280,7 +277,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package, }, nil } -func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest *pkg.JavaManifest) (string, string, []pkg.License, *parsedPomProject, error) { +func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest *pkg.JavaManifest) (string, string, []pkg.License, *parsedPomProject) { // we use j.location because we want to associate the license declaration with where we discovered the contents in the manifest // TODO: when we support locations of paths within archives we should start passing the specific manifest location object instead of the top jar lics := pkg.NewLicensesFromLocationWithContext(ctx, j.location, selectLicenses(manifest)...) @@ -300,10 +297,7 @@ func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest } if len(lics) == 0 { - fileLicenses, err := j.getLicenseFromFileInArchive(ctx) - if err != nil { - return "", "", nil, parsedPom, err - } + fileLicenses := j.getLicenseFromFileInArchive(ctx) if fileLicenses != nil { lics = append(lics, fileLicenses...) } @@ -317,7 +311,7 @@ func (j *archiveParser) discoverNameVersionLicense(ctx context.Context, manifest lics = j.findLicenseFromJavaMetadata(ctx, groupID, artifactID, version, parsedPom, manifest) } - return artifactID, version, lics, parsedPom, nil + return artifactID, version, lics, parsedPom } // findLicenseFromJavaMetadata attempts to find license information from all available maven metadata properties and pom info @@ -562,7 +556,7 @@ func getDigestsFromArchive(ctx context.Context, archivePath string) ([]file.Dige return digests, nil } -func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) ([]pkg.License, error) { +func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) []pkg.License { // prefer identified licenses, fall back to unknown var identified []pkg.License var unidentified []pkg.License @@ -578,7 +572,8 @@ func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) ([]pkg. if len(licenseMatches) > 0 { contents, err := intFile.ContentsFromZip(ctx, j.archivePath, licenseMatches...) if err != nil { - return nil, fmt.Errorf("unable to extract java license (%s): %w", j.location, err) + log.Debugf("unable to extract java license (%s): %w", j.location, err) + continue } for _, licenseMatch := range licenseMatches { @@ -602,10 +597,10 @@ func (j *archiveParser) getLicenseFromFileInArchive(ctx context.Context) ([]pkg. } if len(identified) == 0 { - return unidentified, nil + return unidentified } - return identified, nil + return identified } func (j *archiveParser) discoverPkgsFromNestedArchives(ctx context.Context, parentPkg *pkg.Package) ([]pkg.Package, []artifact.Relationship, error) { diff --git a/syft/pkg/cataloger/java/parse_pom_properties.go b/syft/pkg/cataloger/java/parse_pom_properties.go index 084e10807..8ef3676be 100644 --- a/syft/pkg/cataloger/java/parse_pom_properties.go +++ b/syft/pkg/cataloger/java/parse_pom_properties.go @@ -11,7 +11,7 @@ import ( "github.com/anchore/syft/syft/pkg" ) -const pomPropertiesGlob = "*pom.properties" +const pomPropertiesGlob = "**/*pom.properties" func parsePomProperties(path string, reader io.Reader) (*pkg.JavaPomProperties, error) { var props pkg.JavaPomProperties diff --git a/syft/pkg/cataloger/java/parse_pom_xml.go b/syft/pkg/cataloger/java/parse_pom_xml.go index 24fdcde74..572dfed8e 100644 --- a/syft/pkg/cataloger/java/parse_pom_xml.go +++ b/syft/pkg/cataloger/java/parse_pom_xml.go @@ -16,7 +16,7 @@ import ( ) const ( - pomXMLGlob = "*pom.xml" + pomXMLGlob = "**/*pom.xml" pomCatalogerName = "java-pom-cataloger" ) diff --git a/syft/pkg/cataloger/python/parse_wheel_egg_metadata.go b/syft/pkg/cataloger/python/parse_wheel_egg_metadata.go index 5d11d3e52..0cb060a23 100644 --- a/syft/pkg/cataloger/python/parse_wheel_egg_metadata.go +++ b/syft/pkg/cataloger/python/parse_wheel_egg_metadata.go @@ -6,9 +6,9 @@ import ( "path/filepath" "strings" + "github.com/bmatcuk/doublestar/v4" "github.com/go-viper/mapstructure/v2" - intFile "github.com/anchore/syft/internal/file" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" @@ -141,7 +141,7 @@ func getFieldType(key, in string) any { // of egg metadata (as opposed to a directory that contains more metadata // files). func isEggRegularFile(path string) bool { - return intFile.GlobMatch(eggInfoGlob, path) + return doublestar.MatchUnvalidated(eggInfoGlob, path) } // determineSitePackagesRootPath returns the path of the site packages root,