diff --git a/internal/file/archive_aliases.go b/internal/file/archive_aliases.go new file mode 100644 index 000000000..63021fa3b --- /dev/null +++ b/internal/file/archive_aliases.go @@ -0,0 +1,46 @@ +package file + +import ( + "context" + "io" + "path/filepath" + "strings" + + "github.com/mholt/archives" +) + +// compoundExtensionAliases maps shorthand archive extensions to their full forms. +// The mholt/archives library doesn't recognize these aliases natively. +// +// See: https://github.com/anchore/syft/issues/4416 +// Reference: https://github.com/mholt/archives?tab=readme-ov-file#supported-compression-formats +var compoundExtensionAliases = map[string]string{ + ".tgz": ".tar.gz", + ".tbz2": ".tar.bz2", + ".txz": ".tar.xz", + ".tlz": ".tar.lz", + ".tzst": ".tar.zst", +} + +// IdentifyArchive is a wrapper around archives.Identify that handles compound extension +// aliases (like .tgz -> .tar.gz) transparently. It rewrites an aliased extension in path to +// its full form and then makes a single archives.Identify call with ctx and r unchanged. +// +// It takes the same arguments as archives.Identify and returns that call's results as-is, +// so the compound alias handling logic lives in one place. +func IdentifyArchive(ctx context.Context, path string, r io.Reader) (archives.Format, io.Reader, error) { + // Expand any shorthand extension so the library is given the full compound form + normalizedPath := handleCompoundArchiveAliases(path) + return archives.Identify(ctx, normalizedPath, r) +} + +// handleCompoundArchiveAliases returns path with a trailing shorthand extension from +// compoundExtensionAliases (like .tgz) replaced by its full form (like .tar.gz). Matching is +// case-sensitive and uses only filepath.Ext(path); any other path is returned unchanged. 
+func handleCompoundArchiveAliases(path string) string { + ext := filepath.Ext(path) + if newExt, ok := compoundExtensionAliases[ext]; ok { + return strings.TrimSuffix(path, ext) + newExt + } + return path +} diff --git a/internal/file/archive_aliases_test.go b/internal/file/archive_aliases_test.go new file mode 100644 index 000000000..9d6176652 --- /dev/null +++ b/internal/file/archive_aliases_test.go @@ -0,0 +1,73 @@ +package file + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestHandleCompoundArchiveAliases(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "tgz to tar.gz", + input: "/path/to/archive.tgz", + expected: "/path/to/archive.tar.gz", + }, + { + name: "tbz2 to tar.bz2", + input: "/path/to/archive.tbz2", + expected: "/path/to/archive.tar.bz2", + }, + { + name: "txz to tar.xz", + input: "/path/to/archive.txz", + expected: "/path/to/archive.tar.xz", + }, + { + name: "tlz to tar.lz", + input: "/path/to/archive.tlz", + expected: "/path/to/archive.tar.lz", + }, + { + name: "tzst to tar.zst", + input: "/path/to/archive.tzst", + expected: "/path/to/archive.tar.zst", + }, + { + name: "standard tar.gz unchanged", + input: "/path/to/archive.tar.gz", + expected: "/path/to/archive.tar.gz", + }, + { + name: "zip unchanged", + input: "/path/to/archive.zip", + expected: "/path/to/archive.zip", + }, + { + name: "no extension unchanged", + input: "/path/to/archive", + expected: "/path/to/archive", + }, + { + name: "case sensitive - TGZ not matched", + input: "/path/to/archive.TGZ", + expected: "/path/to/archive.TGZ", + }, + { + name: "just filename with tgz", + input: "archive.tgz", + expected: "archive.tar.gz", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := handleCompoundArchiveAliases(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/internal/file/tar_file_traversal.go b/internal/file/tar_file_traversal.go index 
c3511a1cc..d3c9656a7 100644 --- a/internal/file/tar_file_traversal.go +++ b/internal/file/tar_file_traversal.go @@ -20,7 +20,7 @@ func TraverseFilesInTar(ctx context.Context, archivePath string, visitor archive } defer internal.CloseAndLogError(tarReader, archivePath) - format, _, err := archives.Identify(ctx, archivePath, nil) + format, _, err := IdentifyArchive(ctx, archivePath, tarReader) if err != nil { return fmt.Errorf("failed to identify tar compression format: %w", err) } diff --git a/internal/packagemetadata/names.go b/internal/packagemetadata/names.go index d7a1bfcf9..44e686190 100644 --- a/internal/packagemetadata/names.go +++ b/internal/packagemetadata/names.go @@ -99,7 +99,8 @@ var jsonTypes = makeJSONTypes( jsonNames(pkg.PEBinary{}, "pe-binary"), jsonNames(pkg.PhpComposerLockEntry{}, "php-composer-lock-entry", "PhpComposerJsonMetadata"), jsonNamesWithoutLookup(pkg.PhpComposerInstalledEntry{}, "php-composer-installed-entry", "PhpComposerJsonMetadata"), // the legacy value is split into two types, where the other is preferred - jsonNames(pkg.PhpPeclEntry{}, "php-pecl-entry", "PhpPeclMetadata"), //nolint:staticcheck + //nolint:staticcheck + jsonNames(pkg.PhpPeclEntry{}, "php-pecl-entry", "PhpPeclMetadata"), jsonNames(pkg.PhpPearEntry{}, "php-pear-entry"), jsonNames(pkg.PortageEntry{}, "portage-db-entry", "PortageMetadata"), jsonNames(pkg.PythonPackage{}, "python-package", "PythonPackageMetadata"), diff --git a/internal/task/unknowns_tasks.go b/internal/task/unknowns_tasks.go index 2f63ce28e..8761a06cd 100644 --- a/internal/task/unknowns_tasks.go +++ b/internal/task/unknowns_tasks.go @@ -4,8 +4,7 @@ import ( "context" "strings" - "github.com/mholt/archives" - + intFile "github.com/anchore/syft/internal/file" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/sbomsync" "github.com/anchore/syft/syft/cataloging" @@ -60,7 +59,7 @@ func (c unknownsLabelerTask) finalize(resolver file.Resolver, s *sbom.SBOM) { if c.IncludeUnexpandedArchives 
{ ctx := context.Background() for coords := range s.Artifacts.FileMetadata { - format, _, notArchiveErr := archives.Identify(ctx, coords.RealPath, nil) + format, _, notArchiveErr := intFile.IdentifyArchive(ctx, coords.RealPath, nil) if format != nil && notArchiveErr == nil && !hasPackageReference(coords) { s.Artifacts.Unknowns[coords] = append(s.Artifacts.Unknowns[coords], "archive not cataloged") } diff --git a/syft/format/github/internal/model/model.go b/syft/format/github/internal/model/model.go index b2aa0d23a..0f50dc21d 100644 --- a/syft/format/github/internal/model/model.go +++ b/syft/format/github/internal/model/model.go @@ -6,9 +6,8 @@ import ( "strings" "time" - "github.com/mholt/archives" - "github.com/anchore/packageurl-go" + "github.com/anchore/syft/internal/file" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/sbom" @@ -155,7 +154,7 @@ func trimRelative(s string) string { // isArchive returns true if the path appears to be an archive func isArchive(path string) bool { - format, _, err := archives.Identify(context.Background(), path, nil) + format, _, err := file.IdentifyArchive(context.Background(), path, nil) return err == nil && format != nil } diff --git a/syft/pkg/cataloger/java/tar_wrapped_archive_parser_test.go b/syft/pkg/cataloger/java/tar_wrapped_archive_parser_test.go index 089d19eb1..c0f3eef3c 100644 --- a/syft/pkg/cataloger/java/tar_wrapped_archive_parser_test.go +++ b/syft/pkg/cataloger/java/tar_wrapped_archive_parser_test.go @@ -32,6 +32,13 @@ func Test_parseTarWrappedJavaArchive(t *testing.T) { "joda-time", }, }, + { + fixture: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.tgz", + expected: []string{ + "example-java-app-maven", + "joda-time", + }, + }, } for _, test := range tests { t.Run(path.Base(test.fixture), func(t *testing.T) { diff --git a/syft/pkg/cataloger/java/test-fixtures/java-builds/Makefile b/syft/pkg/cataloger/java/test-fixtures/java-builds/Makefile 
index b3aae020a..1ef3fab78 100644 --- a/syft/pkg/cataloger/java/test-fixtures/java-builds/Makefile +++ b/syft/pkg/cataloger/java/test-fixtures/java-builds/Makefile @@ -16,7 +16,7 @@ fingerprint: $(FINGERPRINT_FILE) jars: $(PKGSDIR)/example-java-app-maven-0.1.0.jar $(PKGSDIR)/example-java-app-gradle-0.1.0.jar $(PKGSDIR)/example-jenkins-plugin.hpi $(PKGSDIR)/spring-boot-0.0.1-SNAPSHOT.jar -archives: $(PKGSDIR)/example-java-app-maven-0.1.0.zip $(PKGSDIR)/example-java-app-maven-0.1.0.tar $(PKGSDIR)/example-java-app-maven-0.1.0.tar.gz +archives: $(PKGSDIR)/example-java-app-maven-0.1.0.zip $(PKGSDIR)/example-java-app-maven-0.1.0.tar $(PKGSDIR)/example-java-app-maven-0.1.0.tar.gz $(PKGSDIR)/example-java-app-maven-0.1.0.tgz native-image: $(PKGSDIR)/example-java-app $(PKGSDIR)/gcc-amd64-darwin-exec-debug @@ -31,6 +31,9 @@ $(PKGSDIR)/example-java-app-maven-0.1.0.tar: $(PKGSDIR)/example-java-app-maven-0 $(PKGSDIR)/example-java-app-maven-0.1.0.tar.gz: $(PKGSDIR)/example-java-app-maven-0.1.0.jar tar -czvf $(PKGSDIR)/example-java-app-maven-0.1.0.tar.gz $(PKGSDIR)/example-java-app-maven-0.1.0.jar +$(PKGSDIR)/example-java-app-maven-0.1.0.tgz: $(PKGSDIR)/example-java-app-maven-0.1.0.tar.gz + tar -czf $(PKGSDIR)/example-java-app-maven-0.1.0.tgz $(PKGSDIR)/example-java-app-maven-0.1.0.jar + # Nested jar... $(PKGSDIR)/spring-boot-0.0.1-SNAPSHOT.jar: diff --git a/syft/source/filesource/file_source.go b/syft/source/filesource/file_source.go index 0517be04a..3e3ca71ba 100644 --- a/syft/source/filesource/file_source.go +++ b/syft/source/filesource/file_source.go @@ -207,10 +207,7 @@ func fileAnalysisPath(path string, skipExtractArchive bool) (string, func() erro return analysisPath, cleanupFn, nil } - // if the given file is an archive (as indicated by the file extension and not MIME type) then unarchive it and - // use the contents as the source. Note: this does NOT recursively unarchive contents, only the given path is - // unarchived. 
- envelopedUnarchiver, _, err := archives.Identify(context.Background(), path, nil) + envelopedUnarchiver, _, err := intFile.IdentifyArchive(context.Background(), path, nil) if unarchiver, ok := envelopedUnarchiver.(archives.Extractor); err == nil && ok { analysisPath, cleanupFn, err = unarchiveToTmp(path, unarchiver) if err != nil {