From cff5a05681af9ea70d6c64006842a70ceb953baa Mon Sep 17 00:00:00 2001 From: David Dashti <47575784+Dashtid@users.noreply.github.com> Date: Mon, 15 Jun 2026 22:15:32 +0200 Subject: [PATCH] fix(dpkg): extract License field for opkg/ipkg entries (#4963) * fix(dpkg): extract License field for opkg/ipkg entries opkg and ipkg use the dpkg cataloger but declare the package License inline in the status DB (unlike Debian dpkg, where licenses live in copyright files). The cataloger silently dropped the License field at mapstructure decode time, so all opkg-managed packages reported empty licenses. This adds the field to the intermediate decode struct and the public DpkgDBEntry, and populates licenses in newDpkgPackage using the alpine cataloger's pattern: try license.ParseExpression first to keep valid SPDX expressions whole, fall back to whitespace splitting for space-separated lists. Standard Debian dpkg status files never carry a License field per Debian policy, so the new path is a no-op for them; the existing copyright-file lookup in addLicenses is unaffected. Closes #4940 Signed-off-by: David Dashti <47575784+Dashtid@users.noreply.github.com> * remove license from dpkg metadata struct Signed-off-by: Alex Goodman * restore format snapshot files Signed-off-by: Alex Goodman * add additional tests Signed-off-by: Alex Goodman --------- Signed-off-by: David Dashti <47575784+Dashtid@users.noreply.github.com> Signed-off-by: Alex Goodman Co-authored-by: Alex Goodman --- syft/pkg/cataloger/debian/package.go | 35 +++++-- syft/pkg/cataloger/debian/package_test.go | 98 +++++++++++++++++++ .../pkg/cataloger/debian/parse_deb_archive.go | 2 +- syft/pkg/cataloger/debian/parse_dpkg_db.go | 71 ++++++++------ .../cataloger/debian/parse_dpkg_db_test.go | 54 +++++++++- .../debian/testdata/var/lib/opkg/status | 30 ++++++ 6 files changed, 246 insertions(+), 44 deletions(-) create mode 100644 syft/pkg/cataloger/debian/testdata/var/lib/opkg/status diff --git a/syft/pkg/cataloger/debian/package.go b/syft/pkg/cataloger/debian/package.go index 88102da6e..364963516 100644 --- a/syft/pkg/cataloger/debian/package.go +++ b/syft/pkg/cataloger/debian/package.go @@ -13,6 +13,7 @@ import ( "github.com/anchore/syft/internal" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/license" "github.com/anchore/syft/syft/linux" "github.com/anchore/syft/syft/pkg" ) @@ -23,21 +24,26 @@ const ( docsPath = "/usr/share/doc" ) -func newDpkgPackage(ctx context.Context, d pkg.DpkgDBEntry, dbLocation file.Location, resolver file.Resolver, release *linux.Release, evidence ...file.Location) pkg.Package { - // TODO: separate pr to license refactor, but explore extracting dpkg-specific license parsing into a separate function - var licenses []pkg.License +func newDpkgPackage(ctx context.Context, d dpkgExtractedMetadata, dbLocation file.Location, resolver file.Resolver, release *linux.Release, evidence ...file.Location) pkg.Package { + // the License field is empty for standard Debian dpkg entries (licenses live in copyright files), + // but opkg/ipkg derivatives carry it inline in the status DB — extract it here so those packages + // report licenses without requiring per-package copyright lookups. The license is not persisted on + // the final entry, so convert the raw metadata into the entry just-in-time here. + licenses := extractDeclaredLicenses(ctx, d.License, dbLocation) + + entry := d.toDpkgEntry() locations := file.NewLocationSet(dbLocation) locations.Add(evidence...) p := pkg.Package{ - Name: d.Package, - Version: d.Version, + Name: entry.Package, + Version: entry.Version, Licenses: pkg.NewLicenseSet(licenses...), Locations: locations, - PURL: packageURL(d, release), + PURL: packageURL(entry, release), Type: pkg.DebPkg, - Metadata: d, + Metadata: entry, } if resolver != nil { @@ -55,6 +61,21 @@ func newDpkgPackage(ctx context.Context, d pkg.DpkgDBEntry, dbLocation file.Loca return p } +// extractDeclaredLicenses converts a License field from the status DB into a license set. Returns nil +// for empty input so standard dpkg entries (which never declare License inline) incur no allocation. +// Mirrors the alpine cataloger's approach: keep the value whole if it parses as a valid SPDX expression, +// otherwise split on whitespace to handle space-separated lists. +func extractDeclaredLicenses(ctx context.Context, raw string, dbLocation file.Location) []pkg.License { + if raw == "" { + return nil + } + licenseStrings := []string{raw} + if _, err := license.ParseExpression(raw); err != nil { + licenseStrings = strings.Fields(raw) + } + return pkg.NewLicensesFromLocationWithContext(ctx, dbLocation, licenseStrings...) +} + func newDebArchivePackage(ctx context.Context, location file.Location, metadata pkg.DpkgArchiveEntry, licenseStrings []string) pkg.Package { p := pkg.Package{ Name: metadata.Package, diff --git a/syft/pkg/cataloger/debian/package_test.go b/syft/pkg/cataloger/debian/package_test.go index a52659739..227ebc9d5 100644 --- a/syft/pkg/cataloger/debian/package_test.go +++ b/syft/pkg/cataloger/debian/package_test.go @@ -1,10 +1,14 @@ package debian import ( + "context" + "sort" "testing" "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/require" + "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/linux" "github.com/anchore/syft/syft/pkg" ) @@ -111,3 +115,97 @@ func Test_packageURL(t *testing.T) { }) } } + +func Test_extractDeclaredLicenses(t *testing.T) { + ctx := context.Background() + dbLocation := file.NewLocation("/var/lib/opkg/status") + + tests := []struct { + name string + raw string + expected []string + }{ + { + name: "empty input returns nil", + raw: "", + expected: nil, + }, + { + name: "single SPDX identifier kept whole", + raw: "MIT", + expected: []string{"MIT"}, + }, + { + name: "valid SPDX expression kept whole", + raw: "Apache-2.0 OR MIT", + expected: []string{"Apache-2.0 OR MIT"}, + }, + { + name: "non-expression space-separated list is split", + raw: "GPL-2.0 BSD-3-Clause", + expected: []string{"GPL-2.0", "BSD-3-Clause"}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got := extractDeclaredLicenses(ctx, test.raw, dbLocation) + var gotValues []string + for _, l := range got { + gotValues = append(gotValues, l.Value) + } + // NewLicensesFromLocationWithContext does not guarantee output order + sort.Strings(gotValues) + want := append([]string(nil), test.expected...) + sort.Strings(want) + if diff := cmp.Diff(want, gotValues); diff != "" { + t.Errorf("unexpected licenses (-want +got):\n%s", diff) + } + }) + } +} + +func Test_newDpkgPackage_declaredLicense(t *testing.T) { + // the License field is not persisted on pkg.DpkgDBEntry, so this guards that the inline opkg/ipkg license + // declared on the raw metadata still flows into the built package's license set + tests := []struct { + name string + metadata dpkgExtractedMetadata + expected []string + }{ + { + name: "no declared license", + metadata: dpkgExtractedMetadata{Package: "apt", Version: "1.8.2"}, + expected: nil, + }, + { + name: "inline license flows to package", + metadata: dpkgExtractedMetadata{Package: "dropbear", Version: "2024.85-r0", License: "MIT"}, + expected: []string{"MIT"}, + }, + { + name: "space-separated licenses split into the set", + metadata: dpkgExtractedMetadata{Package: "busybox", Version: "1.36.1", License: "GPL-2.0 BSD-3-Clause"}, + expected: []string{"BSD-3-Clause", "GPL-2.0"}, + }, + { + name: "valid SPDX expression kept whole", + metadata: dpkgExtractedMetadata{Package: "curl", Version: "8.5.0", License: "Apache-2.0 OR MIT"}, + expected: []string{"Apache-2.0 OR MIT"}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + p := newDpkgPackage(context.Background(), test.metadata, file.NewLocation("/var/lib/opkg/status"), nil, nil) + + var got []string + for _, l := range p.Licenses.ToSlice() { + got = append(got, l.Value) + } + // the license set does not guarantee output order + sort.Strings(got) + require.Equal(t, test.expected, got) + }) + } +} diff --git a/syft/pkg/cataloger/debian/parse_deb_archive.go b/syft/pkg/cataloger/debian/parse_deb_archive.go index 0f977c05e..b3b5f923f 100644 --- a/syft/pkg/cataloger/debian/parse_deb_archive.go +++ b/syft/pkg/cataloger/debian/parse_deb_archive.go @@ -131,7 +131,7 @@ func processControlTar(dcReader io.ReadCloser) (*pkg.DpkgArchiveEntry, error) { if len(entries) == 0 { return nil, fmt.Errorf("no package entries found in control file") } - entry := pkg.DpkgArchiveEntry(entries[0]) + entry := pkg.DpkgArchiveEntry(entries[0].toDpkgEntry()) metadata = &entry case "md5sums": // parseDpkgMD5Info already streams via bufio.Scanner diff --git a/syft/pkg/cataloger/debian/parse_dpkg_db.go b/syft/pkg/cataloger/debian/parse_dpkg_db.go index 1a2e72e50..be9f04448 100644 --- a/syft/pkg/cataloger/debian/parse_dpkg_db.go +++ b/syft/pkg/cataloger/debian/parse_dpkg_db.go @@ -42,7 +42,7 @@ func parseDpkgDB(ctx context.Context, resolver file.Resolver, env *generic.Envir dbLoc := reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation) var pkgs []pkg.Package - _ = sync.CollectSlice(&ctx, cataloging.ExecutorFile, sync.ToSeq(metadata), func(m pkg.DpkgDBEntry) (pkg.Package, error) { + _ = sync.CollectSlice(&ctx, cataloging.ExecutorFile, sync.ToSeq(metadata), func(m dpkgExtractedMetadata) (pkg.Package, error) { return newDpkgPackage(ctx, m, dbLoc, resolver, env.LinuxRelease, findDpkgInfoFiles(m.Package, resolver, reader.Location)...), nil }, &pkgs) @@ -77,10 +77,12 @@ func findDpkgInfoFiles(name string, resolver file.Resolver, dbLocation file.Loca return locations } -// parseDpkgStatus is a parser function for Debian DB status contents, returning all Debian packages listed. -func parseDpkgStatus(reader io.Reader) ([]pkg.DpkgDBEntry, error) { +// parseDpkgStatus is a parser function for Debian DB status contents, returning the raw metadata for all Debian +// packages listed. Conversion to pkg.DpkgDBEntry is deferred to the package-building stage so that fields which +// are not part of the final entry (e.g. License) remain available to the caller. +func parseDpkgStatus(reader io.Reader) ([]dpkgExtractedMetadata, error) { buffedReader := bufio.NewReader(reader) - var metadata []pkg.DpkgDBEntry + var metadata []dpkgExtractedMetadata continueProcessing := true for continueProcessing { @@ -117,10 +119,41 @@ type dpkgExtractedMetadata struct { Depends string `mapstructure:"Depends"` PreDepends string `mapstructure:"PreDepends"` // note: original doc is Pre-Depends Status string `mapstructure:"Status"` + License string `mapstructure:"License"` + Conffiles string `mapstructure:"Conffiles"` +} + +func (d *dpkgExtractedMetadata) toDpkgEntry() pkg.DpkgDBEntry { + entry := pkg.DpkgDBEntry{ + Package: d.Package, + Source: d.Source, + Version: d.Version, + SourceVersion: d.SourceVersion, + Architecture: d.Architecture, + Maintainer: d.Maintainer, + InstalledSize: d.InstalledSize, + Description: d.Description, + Provides: splitPkgList(d.Provides), + Depends: splitPkgList(d.Depends), + PreDepends: splitPkgList(d.PreDepends), + // note: licenses and conffiles are handled separately + } + + // there may be an optional conffiles section that we should persist as files + if d.Conffiles != "" { + entry.Files = parseDpkgConffileInfo(strings.NewReader(d.Conffiles)) + } + + if entry.Files == nil { + // ensure the default value for a collection is never nil since this may be shown as JSON + entry.Files = make([]pkg.DpkgFileRecord, 0) + } + + return entry } // parseDpkgStatusEntry returns an individual Dpkg entry, or returns errEndOfPackages if there are no more packages to parse from the reader. -func parseDpkgStatusEntry(reader *bufio.Reader) (*pkg.DpkgDBEntry, error) { +func parseDpkgStatusEntry(reader *bufio.Reader) (*dpkgExtractedMetadata, error) { var retErr error dpkgFields, err := extractAllFields(reader) if err != nil { @@ -154,33 +187,7 @@ func parseDpkgStatusEntry(reader *bufio.Reader) (*pkg.DpkgDBEntry, error) { return nil, retErr } - entry := pkg.DpkgDBEntry{ - Package: raw.Package, - Source: raw.Source, - Version: raw.Version, - SourceVersion: raw.SourceVersion, - Architecture: raw.Architecture, - Maintainer: raw.Maintainer, - InstalledSize: raw.InstalledSize, - Description: raw.Description, - Provides: splitPkgList(raw.Provides), - Depends: splitPkgList(raw.Depends), - PreDepends: splitPkgList(raw.PreDepends), - } - - // there may be an optional conffiles section that we should persist as files - if conffilesSection, exists := dpkgFields["Conffiles"]; exists && conffilesSection != nil { - if sectionStr, ok := conffilesSection.(string); ok { - entry.Files = parseDpkgConffileInfo(strings.NewReader(sectionStr)) - } - } - - if entry.Files == nil { - // ensure the default value for a collection is never nil since this may be shown as JSON - entry.Files = make([]pkg.DpkgFileRecord, 0) - } - - return &entry, retErr + return &raw, retErr } func splitPkgList(pkgList string) (ret []string) { diff --git a/syft/pkg/cataloger/debian/parse_dpkg_db_test.go b/syft/pkg/cataloger/debian/parse_dpkg_db_test.go index 1cca3d9f6..e423c0a08 100644 --- a/syft/pkg/cataloger/debian/parse_dpkg_db_test.go +++ b/syft/pkg/cataloger/debian/parse_dpkg_db_test.go @@ -20,9 +20,12 @@ import ( func Test_parseDpkgStatus(t *testing.T) { tests := []struct { - name string - expected []pkg.DpkgDBEntry - fixturePath string + name string + expected []pkg.DpkgDBEntry + // expectedLicenses is the raw License value parsed for each entry (parallel to expected). License is not + // persisted on pkg.DpkgDBEntry, so it is asserted separately from the raw extracted metadata. + expectedLicenses []string + fixturePath string }{ { name: "single package", @@ -237,6 +240,37 @@ func Test_parseDpkgStatus(t *testing.T) { }, }, }, + { + name: "opkg status with license field", + fixturePath: "testdata/var/lib/opkg/status", + expected: []pkg.DpkgDBEntry{ + { + Package: "dropbear", + Version: "2024.85-r0", + Architecture: "x86_64", + Description: "Small SSH server and client.", + Depends: []string{"libc", "zlib"}, + Files: []pkg.DpkgFileRecord{}, + }, + { + Package: "busybox", + Version: "1.36.1-r3", + Architecture: "x86_64", + Description: "Single executable providing many common UNIX utilities.", + Depends: []string{"libc"}, + Files: []pkg.DpkgFileRecord{}, + }, + { + Package: "kernel-modules", + Version: "6.6.0-r0", + Architecture: "x86_64", + Description: "Loadable kernel modules with mixed licensing.", + Depends: []string{"kmod"}, + Files: []pkg.DpkgFileRecord{}, + }, + }, + expectedLicenses: []string{"MIT", "GPL-2.0-only", "GPL-2.0 BSD-3-Clause"}, + }, { name: "deinstall status packages are ignored", fixturePath: "testdata/var/lib/dpkg/status.d/deinstall", @@ -278,12 +312,24 @@ func Test_parseDpkgStatus(t *testing.T) { reader := bufio.NewReader(f) - entries, err := parseDpkgStatus(reader) + raw, err := parseDpkgStatus(reader) require.NoError(t, err) + // convert the raw metadata into the final entries just-in-time, mirroring the package-building stage + var entries []pkg.DpkgDBEntry + var licenses []string + for _, r := range raw { + entries = append(entries, r.toDpkgEntry()) + licenses = append(licenses, r.License) + } + if diff := cmp.Diff(test.expected, entries); diff != "" { t.Errorf("unexpected entry (-want +got):\n%s", diff) } + + if test.expectedLicenses != nil { + require.Equal(t, test.expectedLicenses, licenses) + } }) } } diff --git a/syft/pkg/cataloger/debian/testdata/var/lib/opkg/status b/syft/pkg/cataloger/debian/testdata/var/lib/opkg/status new file mode 100644 index 000000000..ed4b15038 --- /dev/null +++ b/syft/pkg/cataloger/debian/testdata/var/lib/opkg/status @@ -0,0 +1,30 @@ +Package: dropbear +Version: 2024.85-r0 +Depends: libc, zlib +Status: install user installed +Architecture: x86_64 +Installed-Time: 1714492800 +License: MIT +Section: net +Description: Small SSH server and client. + +Package: busybox +Version: 1.36.1-r3 +Depends: libc +Status: install user installed +Architecture: x86_64 +Installed-Time: 1714492800 +License: GPL-2.0-only +Section: base +Description: Single executable providing many common UNIX utilities. + +Package: kernel-modules +Version: 6.6.0-r0 +Depends: kmod +Status: install user installed +Architecture: x86_64 +Installed-Time: 1714492800 +License: GPL-2.0 BSD-3-Clause +Section: kernel +Description: Loadable kernel modules with mixed licensing. +