syft/internal/capabilities/generate/discover_metadata.go
Alex Goodman b5e85c3ea5
chore: migrate fixtures to testdata (#4651)
* migrate fixtures to testdata

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* fix: correct broken symlinks after testdata migration

The migration from test-fixtures to testdata broke several symlinks:
- elf-test-fixtures symlinks pointed to old test-fixtures paths
- elf-test-fixtures needed to be renamed to elf-testdata
- image-pkg-coverage symlink pointed to test-fixtures instead of testdata

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* fix: handle missing classifiers/bin directory in Makefile

The clean-fingerprint target was failing when classifiers/bin doesn't
exist (e.g., on fresh clone without downloaded binaries).

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* fix: add gitignore negation for jar/zip fixtures in test/cli

The jar and zip files in test/cli/testdata/image-unknowns were being
gitignored by the root .gitignore patterns. This caused them to be
untracked and not included when building docker images in CI, resulting
in Test_Unknowns failures since the test expects errors from corrupt
archive files that weren't present.

Add a .gitignore in test/cli/testdata to negate the exclusions for
these specific test fixture files.

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* switch fixture cache to v2

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* test: update expected versions for rebuilt fixtures

Update test expectations for packages that have been updated in
upstream repositories when docker images are rebuilt:
- glibc: 2.42-r4 → 2.43-r1 (wolfi)
- php: 8.2.29 → 8.2.30 (ubuntu/apache)

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* upgrade go

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* fix: add go-shlex dependency for testdata manager tool

The manager tool in syft/pkg/cataloger/binary/testdata/ imports
go-shlex, but since it's in a testdata directory, Go doesn't track
its dependencies. This caused CI failures when go.mod didn't
explicitly list the dependency.

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* refactor: move binary classifier manager to internal/

Move the manager tool from testdata/manager to internal/manager so
that Go properly tracks its dependencies. Code in testdata directories
is ignored by Go for dependency tracking, which caused CI failures
when go.mod didn't explicitly list transitive dependencies.

This is a cleaner solution than manually adding dependencies to go.mod
for code that happens to live in testdata.

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* fix: add gitignore negations for test fixtures blocked by root patterns

Multiple test fixtures were being blocked by root-level gitignore patterns
like bin/, *.jar, *.tar, and *.exe. This adds targeted .gitignore files with
negation patterns to allow these specific test fixtures to be tracked:

- syft/linux/testdata/os/busybox/bin/busybox (blocked by bin/)
- syft/pkg/cataloger/java/testdata/corrupt/example.{jar,tar} (blocked by *.jar, *.tar)
- syft/pkg/cataloger/binary/testdata/classifiers/snippets/go-version-hint/**/bin/go (blocked by bin/)
- syft/pkg/cataloger/bitnami/testdata/no-rel/.../bin/redis-server (blocked by bin/)

Also updates the bitnami test expectation to include the newly required
.gitignore files in the test fixture.

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* test: update glibc version expectation (2.43-r1 -> 2.43-r2)

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* add capability drift check as unit step

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* don't clear test observations before drift detection

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* bump stereoscope commit to main

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
2026-03-06 19:42:04 +00:00

324 lines
11 KiB
Go

// this file discovers metadata and package types by reading test-observations.json files generated by pkgtest helpers during test execution.
package main
import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sort"

	"github.com/anchore/syft/internal/capabilities/internal"
	"github.com/anchore/syft/internal/capabilities/pkgtestobservation"
	"github.com/anchore/syft/internal/packagemetadata"
)
// TestObservationIndex indexes all test observations for efficient lookup and application.
// Parser-level observations are indexed by package name (from the test file) plus parser
// function name, while cataloger-level observations are indexed by cataloger name.
// The zero value is not usable; construct instances with newTestObservationIndex.
type TestObservationIndex struct {
	// parsersByPackage holds parser-level observations: package name -> parser function name -> types
	parsersByPackage map[string]map[string]*TypeObservation
	// catalogers holds cataloger-level observations: cataloger name -> types
	catalogers map[string]*TypeObservation
}
// TypeObservation combines the metadata types and package types observed for a single
// parser or cataloger during test execution.
type TypeObservation struct {
	// MetadataTypes are the Go metadata struct names observed (e.g. from test output).
	MetadataTypes []string
	// PackageTypes are the package type names observed.
	PackageTypes []string
	// JSONSchemaTypes are the UpperCamelCase JSON schema names derived from MetadataTypes
	// (kept in sync via convertToJSONSchemaTypes whenever MetadataTypes changes).
	JSONSchemaTypes []string
}
// newTestObservationIndex constructs an empty index with both lookup maps initialized,
// ready to accept parser-level and cataloger-level observations.
func newTestObservationIndex() *TestObservationIndex {
	idx := &TestObservationIndex{}
	idx.parsersByPackage = map[string]map[string]*TypeObservation{}
	idx.catalogers = map[string]*TypeObservation{}
	return idx
}
// getParserObservations retrieves parser-level observations by package name and parser
// function name, returning nil when no observation has been recorded for that pair.
func (idx *TestObservationIndex) getParserObservations(packageName, parserFunction string) *TypeObservation {
	parsers, ok := idx.parsersByPackage[packageName]
	if !ok {
		return nil
	}
	return parsers[parserFunction]
}
// getCatalogerObservations retrieves cataloger-level observations by cataloger name,
// returning nil when the cataloger has no recorded observation.
func (idx *TestObservationIndex) getCatalogerObservations(catalogerName string) *TypeObservation {
	obs, ok := idx.catalogers[catalogerName]
	if !ok {
		return nil
	}
	return obs
}
// setParserObservations stores parser-level observations under the given package name
// and parser function name, lazily allocating the inner map on first use.
func (idx *TestObservationIndex) setParserObservations(packageName, parserFunction string, obs *TypeObservation) {
	parsers, ok := idx.parsersByPackage[packageName]
	if !ok {
		parsers = make(map[string]*TypeObservation)
		idx.parsersByPackage[packageName] = parsers
	}
	parsers[parserFunction] = obs
}
// setCatalogerObservations stores cataloger-level observations under the given cataloger
// name, replacing any previously stored observation for that cataloger.
func (idx *TestObservationIndex) setCatalogerObservations(catalogerName string, obs *TypeObservation) {
idx.catalogers[catalogerName] = obs
}
// extractCustomCatalogerData extracts cataloger-level metadata and package types for
// custom catalogers, returning two maps keyed by cataloger name. Catalogers with no
// recorded types of a given kind are omitted from the corresponding map.
func (idx *TestObservationIndex) extractCustomCatalogerData() (map[string][]string, map[string][]string) {
	metadataByCataloger := make(map[string][]string)
	packageTypesByCataloger := make(map[string][]string)
	for name, obs := range idx.catalogers {
		if len(obs.MetadataTypes) != 0 {
			metadataByCataloger[name] = obs.MetadataTypes
		}
		if len(obs.PackageTypes) != 0 {
			packageTypesByCataloger[name] = obs.PackageTypes
		}
	}
	return metadataByCataloger, packageTypesByCataloger
}
// discoverMetadataTypes searches for test-observations.json files under the repo's test
// data directories and merges metadata type information into the discovered catalogers.
// It returns maps of custom cataloger metadata types and package types (both keyed by
// cataloger name); finding no observations at all is not an error and yields nil maps.
func discoverMetadataTypes(repoRoot string, discovered map[string]DiscoveredCataloger) (map[string][]string, map[string][]string, error) {
	testDataDirs, err := internal.FindTestDataDirs(repoRoot)
	if err != nil {
		return nil, nil, err
	}

	// create index to aggregate all observations
	index := newTestObservationIndex()

	// read all test-observations files and merge into index
	for _, dir := range testDataDirs {
		observationsFile := filepath.Join(dir, "test-observations.json")
		if observations, err := internal.ReadTestObservations(observationsFile); err == nil {
			mergeTestObservations(observations, index)
		} else if !errors.Is(err, os.ErrNotExist) {
			// errors.Is (unlike os.IsNotExist) also recognizes wrapped not-exist errors,
			// so a wrapped "file does not exist" is silently skipped rather than warned about
			fmt.Printf(" Warning: failed to read %s: %v\n", observationsFile, err)
		}
	}

	// check if any observations were found; no metadata found is not an error
	if len(index.parsersByPackage) == 0 && len(index.catalogers) == 0 {
		return nil, nil, nil
	}

	// apply observations to discovered catalogers
	applyTypesToCatalogers(discovered, index)

	// extract custom cataloger data for return
	customMetadata, customPackageTypes := index.extractCustomCatalogerData()
	return customMetadata, customPackageTypes, nil
}
// mergeAndDeduplicateStrings combines two string slices into a single sorted slice with
// duplicates removed. The result is always non-nil (possibly empty).
func mergeAndDeduplicateStrings(existing, additional []string) []string {
	seen := make(map[string]bool, len(existing)+len(additional))
	for _, values := range [][]string{existing, additional} {
		for _, v := range values {
			seen[v] = true
		}
	}
	merged := make([]string, 0, len(seen))
	for v := range seen {
		merged = append(merged, v)
	}
	sort.Strings(merged)
	return merged
}
// convertToJSONSchemaTypes converts Go metadata struct names to UpperCamelCase JSON
// schema names, skipping names that have no JSON mapping. Returns nil for empty input.
func convertToJSONSchemaTypes(metadataTypes []string) []string {
	if len(metadataTypes) == 0 {
		return nil
	}
	converted := make([]string, 0, len(metadataTypes))
	for _, goTypeName := range metadataTypes {
		jsonName := packagemetadata.JSONNameFromString(goTypeName)
		if jsonName == "" {
			// no JSON mapping exists for this type; drop it
			continue
		}
		converted = append(converted, packagemetadata.ToUpperCamelCase(jsonName))
	}
	return converted
}
// mergeTestObservations merges metadata and package type data from a single
// test-observations.json file into the observation index. Parser-level observations are
// keyed by the test file's package name plus parser function; cataloger-level
// observations are keyed by cataloger name. Entries with no types at all are skipped.
func mergeTestObservations(observations *pkgtestobservation.Test, index *TestObservationIndex) {
	pkg := observations.Package

	// merge parser-level observations
	for parserName, parserObs := range observations.Parsers {
		existing := index.getParserObservations(pkg, parserName)
		if merged := mergedTypeObservation(existing, parserObs.MetadataTypes, parserObs.PackageTypes); merged != nil {
			index.setParserObservations(pkg, parserName, merged)
		}
	}

	// merge cataloger-level observations
	for catalogerName, catalogerObs := range observations.Catalogers {
		existing := index.getCatalogerObservations(catalogerName)
		if merged := mergedTypeObservation(existing, catalogerObs.MetadataTypes, catalogerObs.PackageTypes); merged != nil {
			index.setCatalogerObservations(catalogerName, merged)
		}
	}
}

// mergedTypeObservation merges the given metadata and package types into an existing
// observation (allocating one if existing is nil) and regenerates the derived JSON schema
// types. It returns nil when there is nothing to merge, so callers can skip storing.
func mergedTypeObservation(existing *TypeObservation, metadataTypes, packageTypes []string) *TypeObservation {
	if len(metadataTypes) == 0 && len(packageTypes) == 0 {
		return nil
	}
	if existing == nil {
		existing = &TypeObservation{}
	}
	existing.MetadataTypes = mergeAndDeduplicateStrings(existing.MetadataTypes, metadataTypes)
	existing.PackageTypes = mergeAndDeduplicateStrings(existing.PackageTypes, packageTypes)
	// JSON schema types are always derived from the merged metadata types
	existing.JSONSchemaTypes = convertToJSONSchemaTypes(existing.MetadataTypes)
	return existing
}
// applyParserObservations applies parser-level observations to a cataloger's parsers by
// matching the cataloger's package name with each parser's function name. Returns true
// when at least one parser received data.
func applyParserObservations(cataloger *DiscoveredCataloger, index *TestObservationIndex) bool {
	applied := false
	for i := range cataloger.Parsers {
		obs := index.getParserObservations(cataloger.PackageName, cataloger.Parsers[i].ParserFunction)
		if obs == nil {
			continue
		}
		if len(obs.MetadataTypes) > 0 {
			cataloger.Parsers[i].MetadataTypes = obs.MetadataTypes
			cataloger.Parsers[i].JSONSchemaTypes = obs.JSONSchemaTypes
			applied = true
		}
		if len(obs.PackageTypes) > 0 {
			cataloger.Parsers[i].PackageTypes = obs.PackageTypes
			applied = true
		}
	}
	return applied
}
// applySingleParserCatalogerObservations applies cataloger-level observations to a
// single-parser cataloger by merging them with any existing parser-level observations.
// JSON schema types are re-derived from the merged metadata types. Returns true when
// any data was applied.
func applySingleParserCatalogerObservations(cataloger *DiscoveredCataloger, catalogerObs *TypeObservation) bool {
	parser := &cataloger.Parsers[0]
	applied := false
	if len(catalogerObs.MetadataTypes) > 0 {
		parser.MetadataTypes = mergeAndDeduplicateStrings(parser.MetadataTypes, catalogerObs.MetadataTypes)
		parser.JSONSchemaTypes = convertToJSONSchemaTypes(parser.MetadataTypes)
		applied = true
	}
	if len(catalogerObs.PackageTypes) > 0 {
		parser.PackageTypes = mergeAndDeduplicateStrings(parser.PackageTypes, catalogerObs.PackageTypes)
		applied = true
	}
	return applied
}
// applyMultiParserCatalogerObservations applies cataloger-level observations to a
// multi-parser cataloger. When no parser has its own data, the cataloger-level types are
// fanned out to every parser (with a warning suggesting parser-level tests); otherwise
// they are applied only to parsers that lack parser-level data. Returns true when any
// parser received data.
func applyMultiParserCatalogerObservations(cataloger *DiscoveredCataloger, catalogerObs *TypeObservation) bool {
	applied := false

	// count how many parsers have no observations at all
	missing := 0
	for i := range cataloger.Parsers {
		if len(cataloger.Parsers[i].MetadataTypes) == 0 && len(cataloger.Parsers[i].PackageTypes) == 0 {
			missing++
		}
	}

	switch {
	case missing == len(cataloger.Parsers):
		// no parser has its own data: apply cataloger-level types to all of them and warn
		fmt.Printf(" Warning: cataloger %q has %d parsers but only cataloger-level observations. Consider adding parser-level tests for better granularity.\n",
			cataloger.Name, len(cataloger.Parsers))
		for i := range cataloger.Parsers {
			if len(catalogerObs.MetadataTypes) > 0 {
				cataloger.Parsers[i].MetadataTypes = catalogerObs.MetadataTypes
				cataloger.Parsers[i].JSONSchemaTypes = catalogerObs.JSONSchemaTypes
				applied = true
			}
			if len(catalogerObs.PackageTypes) > 0 {
				cataloger.Parsers[i].PackageTypes = catalogerObs.PackageTypes
				applied = true
			}
		}
	case missing > 0:
		// fill in only the parsers that lack parser-level data, field by field
		for i := range cataloger.Parsers {
			if len(cataloger.Parsers[i].MetadataTypes) == 0 && len(catalogerObs.MetadataTypes) > 0 {
				cataloger.Parsers[i].MetadataTypes = catalogerObs.MetadataTypes
				cataloger.Parsers[i].JSONSchemaTypes = catalogerObs.JSONSchemaTypes
				applied = true
			}
			if len(cataloger.Parsers[i].PackageTypes) == 0 && len(catalogerObs.PackageTypes) > 0 {
				cataloger.Parsers[i].PackageTypes = catalogerObs.PackageTypes
				applied = true
			}
		}
	}
	return applied
}
// applyTypesToCatalogers applies the aggregated metadata and package type data to the
// discovered catalogers, updating each cataloger's parser entries. Parser-level
// observations are applied first; cataloger-level observations are then merged in (for
// single-parser catalogers) or used to fill gaps (for multi-parser catalogers). Entries
// are written back to the map only when something changed.
func applyTypesToCatalogers(discovered map[string]DiscoveredCataloger, index *TestObservationIndex) {
	for name, entry := range discovered {
		// first, apply parser-level observations
		updated := applyParserObservations(&entry, index)

		// then layer in cataloger-level observations, if any exist
		if obs := index.getCatalogerObservations(name); obs != nil && len(entry.Parsers) > 0 {
			if len(entry.Parsers) == 1 {
				// single parser: merge cataloger-level with parser-level observations
				updated = applySingleParserCatalogerObservations(&entry, obs) || updated
			} else {
				// multiple parsers: only apply to parsers without parser-level data
				updated = applyMultiParserCatalogerObservations(&entry, obs) || updated
			}
		}

		if updated {
			discovered[name] = entry
		}
	}
}