From 538430d65d2ccb6d86424130244639cc0a418833 Mon Sep 17 00:00:00 2001
From: Alex Goodman <wagoodman@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:19:42 -0400
Subject: [PATCH] describe cataloger capabilities via test observations (#4318)

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
---
 .../capabilities/pkgtestobservation/model.go  |  46 ++
 syft/pkg/cataloger/.gitignore                 |   2 +
 .../internal/pkgtest/metadata_tracker.go      | 514 ++++++++++++++++++
 .../internal/pkgtest/observing_resolver.go    | 191 ++++---
 .../internal/pkgtest/test_generic_parser.go   | 176 ++++++
 .../javascript/parse_yarn_lock_test.go        |   6 +-
 syft/pkg/type.go                              |   4 +
 7 files changed, 854 insertions(+), 85 deletions(-)
 create mode 100644 internal/capabilities/pkgtestobservation/model.go
 create mode 100644 syft/pkg/cataloger/.gitignore
 create mode 100644 syft/pkg/cataloger/internal/pkgtest/metadata_tracker.go

diff --git a/internal/capabilities/pkgtestobservation/model.go b/internal/capabilities/pkgtestobservation/model.go
new file mode 100644
index 000000000..77fbcb409
--- /dev/null
+++ b/internal/capabilities/pkgtestobservation/model.go
@@ -0,0 +1,46 @@
+package pkgtestobservation
+
+import "time"
+
+// Observations is the set of capability signals aggregated from the packages and relationships seen during testing
+type Observations struct {
+	License       bool         `json:"license"`        // set once any observed package has a non-empty license set
+	Relationships Relationship `json:"relationships"`  // dependency-of relationships
+	FileListing   Count        `json:"file_listing"`   // owned files reported through package metadata
+	FileDigests   Count        `json:"file_digests"`   // packages with at least one file record carrying a digest
+	IntegrityHash Count        `json:"integrity_hash"` // packages with a populated integrity-hash style field
+}
+
+// Relationship tracks whether dependency relationships were observed and how many were counted
+type Relationship struct {
+	Found bool `json:"found"` // true once a non-zero number of dependency relationships was seen
+	Count int  `json:"count"` // number of dependency relationships counted
+}
+
+// Count tracks whether a capability was observed, together with a running tally of occurrences
+type Count struct {
+	Found bool `json:"found"` // true once the capability has been seen at least once
+	Count int  `json:"count"` // running tally; the unit (files vs. packages) depends on the capability
+}
+
+// Test is the root structure serialized to test-observations.json
+type Test struct {
+	Package    string                `json:"package"`    // package name the observations were recorded for
+	UpdatedAt  time.Time             `json:"updated_at"` // stamped (UTC) when the results file is written
+	Catalogers map[string]*Cataloger `json:"catalogers"` // keyed by cataloger name
+	Parsers    map[string]*Parser    `json:"parsers"`    // keyed by parser function name
+}
+
+// Parser captures everything observed for a single parser function
+type Parser struct {
+	MetadataTypes []string     `json:"metadata_types"` // sorted, de-duplicated metadata type names (e.g. "pkg.ApkDBEntry")
+	PackageTypes  []string     `json:"package_types"`  // sorted, de-duplicated package type strings
+	Observations  Observations `json:"observations"`   // capability signals aggregated across recorded results
+}
+
+// Cataloger captures everything observed for a single cataloger
+type Cataloger struct {
+	MetadataTypes []string     `json:"metadata_types"` // sorted, de-duplicated metadata type names (e.g. "pkg.ApkDBEntry")
+	PackageTypes  []string     `json:"package_types"`  // sorted, de-duplicated package type strings
+	Observations  Observations `json:"observations"`   // capability signals aggregated across recorded results
+}
diff --git a/syft/pkg/cataloger/.gitignore b/syft/pkg/cataloger/.gitignore
new file mode 100644
index 000000000..752bd7a6c
--- /dev/null
+++ b/syft/pkg/cataloger/.gitignore
@@ -0,0 +1,2 @@
+# these are generated by pkgtest helpers, no need to check them in
+**/test-fixtures/test-observations.json
\ No newline at end of file
diff --git a/syft/pkg/cataloger/internal/pkgtest/metadata_tracker.go b/syft/pkg/cataloger/internal/pkgtest/metadata_tracker.go
new file mode 100644
index 000000000..378a025c8
--- /dev/null
+++ b/syft/pkg/cataloger/internal/pkgtest/metadata_tracker.go
@@ -0,0 +1,514 @@
+// Package pkgtest provides test helpers for cataloger and parser testing,
+// including automatic observation tracking for capability documentation.
+package pkgtest
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"reflect"
+	"sort"
+	"sync"
+	"time"
+
+	"github.com/anchore/syft/internal/capabilities/pkgtestobservation"
+	"github.com/anchore/syft/syft/artifact"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+var (
+	globalTracker     *MetadataTracker // process-wide tracker; only assigned inside getTracker
+	globalTrackerOnce sync.Once        // ensures globalTracker is constructed a single time
+
+	// commonPackageIntegrityFields lists the struct field names that hasIntegrityHash probes (via reflection) for a non-empty string value.
+	// TODO: this is a best-effort list and may need to be expanded as new package types are added. Don't depend on this list to catch everything - it's only for test validation.
+	commonPackageIntegrityFields = []string{
+		"Integrity", "Checksum", "H1Digest",
+		"OutputHash", "PkgHash", "ContentHash",
+		"PkgHashExt", "Hash", "IntegrityHash",
+	}
+)
+
+// MetadataTracker collects metadata type, package type, and capability observations during test execution; every method takes mu before touching the fields
+type MetadataTracker struct {
+	mu                    sync.Mutex
+	parserData            map[string]map[string]map[string]bool // package -> parser -> metadata types (set)
+	catalogerData         map[string]map[string]bool            // cataloger -> metadata types (set)
+	parserPackageTypes    map[string]map[string]map[string]bool // package -> parser -> package types (set)
+	catalogerPackageTypes map[string]map[string]bool            // cataloger -> package types (set)
+
+	// unified observations for the current test package; this is the only field WriteResults serializes
+	observations *pkgtestobservation.Test
+}
+
+// getTracker returns the singleton metadata tracker, building it (with empty maps) on the first call only
+func getTracker() *MetadataTracker {
+	globalTrackerOnce.Do(func() {
+		globalTracker = &MetadataTracker{
+			parserData:            make(map[string]map[string]map[string]bool),
+			catalogerData:         make(map[string]map[string]bool),
+			parserPackageTypes:    make(map[string]map[string]map[string]bool),
+			catalogerPackageTypes: make(map[string]map[string]bool),
+		}
+	})
+	return globalTracker
+}
+
+// RecordParser records that parserFunction (within packageName) produced the given metadata type; calls with any empty argument are ignored
+func (t *MetadataTracker) RecordParser(packageName, parserFunction, metadataType string) {
+	if packageName == "" || parserFunction == "" || metadataType == "" {
+		return
+	}
+
+	// filter out non-metadata types (empty names were already rejected above)
+	if metadataType == "pkg.Package" {
+		return
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if t.parserData[packageName] == nil {
+		t.parserData[packageName] = make(map[string]map[string]bool)
+	}
+
+	if t.parserData[packageName][parserFunction] == nil {
+		t.parserData[packageName][parserFunction] = make(map[string]bool)
+	}
+
+	t.parserData[packageName][parserFunction][metadataType] = true
+}
+
+// RecordCataloger records that the named cataloger produced the given metadata type; calls with any empty argument are ignored
+func (t *MetadataTracker) RecordCataloger(catalogerName, metadataType string) {
+	if catalogerName == "" || metadataType == "" {
+		return
+	}
+
+	// filter out non-metadata types (empty names were already rejected above)
+	if metadataType == "pkg.Package" {
+		return
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if t.catalogerData[catalogerName] == nil {
+		t.catalogerData[catalogerName] = make(map[string]bool)
+	}
+
+	t.catalogerData[catalogerName][metadataType] = true
+}
+
+// RecordParserPackageType records that parserFunction (within packageName) produced the given package type; calls with any empty argument are ignored
+func (t *MetadataTracker) RecordParserPackageType(packageName, parserFunction, pkgType string) {
+	if packageName == "" || parserFunction == "" || pkgType == "" {
+		return
+	}
+
+	// filter out unknown types (empty types were already rejected above)
+	if pkgType == pkg.UnknownPkg.String() {
+		return
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if t.parserPackageTypes[packageName] == nil {
+		t.parserPackageTypes[packageName] = make(map[string]map[string]bool)
+	}
+
+	if t.parserPackageTypes[packageName][parserFunction] == nil {
+		t.parserPackageTypes[packageName][parserFunction] = make(map[string]bool)
+	}
+
+	t.parserPackageTypes[packageName][parserFunction][pkgType] = true
+}
+
+// RecordCatalogerPackageType records that the named cataloger produced the given package type; calls with any empty argument are ignored
+func (t *MetadataTracker) RecordCatalogerPackageType(catalogerName, pkgType string) {
+	if catalogerName == "" || pkgType == "" {
+		return
+	}
+
+	// filter out unknown types (empty types were already rejected above)
+	if pkgType == pkg.UnknownPkg.String() {
+		return
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if t.catalogerPackageTypes[catalogerName] == nil {
+		t.catalogerPackageTypes[catalogerName] = make(map[string]bool)
+	}
+
+	t.catalogerPackageTypes[catalogerName][pkgType] = true
+}
+
+// RecordParserPackageMetadata records the metadata type (only when p.Metadata is set) and the package type of p for a parser
+func (t *MetadataTracker) RecordParserPackageMetadata(packageName, parserFunction string, p pkg.Package) {
+	if p.Metadata != nil {
+		metadataType := getMetadataTypeName(p.Metadata)
+		if metadataType != "" {
+			t.RecordParser(packageName, parserFunction, metadataType)
+		}
+	}
+
+	// the package type is recorded regardless of metadata; empty and unknown types are dropped by the callee
+	t.RecordParserPackageType(packageName, parserFunction, string(p.Type))
+}
+
+// RecordCatalogerPackageMetadata records the metadata type (only when p.Metadata is set) and the package type of p for a cataloger
+func (t *MetadataTracker) RecordCatalogerPackageMetadata(catalogerName string, p pkg.Package) {
+	if p.Metadata != nil {
+		metadataType := getMetadataTypeName(p.Metadata)
+		if metadataType != "" {
+			t.RecordCataloger(catalogerName, metadataType)
+		}
+	}
+
+	// the package type is recorded regardless of metadata; empty and unknown types are dropped by the callee
+	t.RecordCatalogerPackageType(catalogerName, string(p.Type))
+}
+
+// aggregateObservations folds the given packages and relationships into the caller-owned metadata type list, package type list, and observations.
+// all three targets are updated in place and every counter accumulates across calls; this is used by both parser and cataloger observation recording.
+func aggregateObservations(
+	metadataTypes *[]string,
+	packageTypes *[]string,
+	obs *pkgtestobservation.Observations,
+	pkgs []pkg.Package,
+	relationships []artifact.Relationship,
+) {
+	// aggregate observations from packages
+	for _, p := range pkgs {
+		// metadata types (kept unique)
+		if p.Metadata != nil {
+			metadataType := getMetadataTypeName(p.Metadata)
+			if metadataType != "" && !contains(*metadataTypes, metadataType) {
+				*metadataTypes = append(*metadataTypes, metadataType)
+			}
+		}
+
+		// package types (kept unique; empty and unknown types are skipped)
+		pkgType := string(p.Type)
+		if pkgType != "" && pkgType != pkg.UnknownPkg.String() && !contains(*packageTypes, pkgType) {
+			*packageTypes = append(*packageTypes, pkgType)
+		}
+
+		// license observation
+		if !p.Licenses.Empty() {
+			obs.License = true
+		}
+
+		// file listing observation: counts owned files, not packages
+		if fileOwner, ok := p.Metadata.(pkg.FileOwner); ok {
+			files := fileOwner.OwnedFiles()
+			if len(files) > 0 {
+				obs.FileListing.Found = true
+				obs.FileListing.Count += len(files)
+			}
+		}
+
+		// file digests observation: counts packages
+		if hasFileDigests(p.Metadata) {
+			obs.FileDigests.Found = true
+			obs.FileDigests.Count++
+		}
+
+		// integrity hash observation: counts packages
+		if hasIntegrityHash(p.Metadata) {
+			obs.IntegrityHash.Found = true
+			obs.IntegrityHash.Count++
+		}
+	}
+
+	// relationship observations: accumulate like the other counters so the total does not depend on which call ran last
+	depCount := countDependencyRelationships(relationships)
+	if depCount > 0 {
+		obs.Relationships.Found = true
+		obs.Relationships.Count += depCount
+	}
+
+	// sort arrays for consistency
+	sort.Strings(*metadataTypes)
+	sort.Strings(*packageTypes)
+}
+
+// ensureObservationsInitialized ensures t.observations is allocated and carries a package name.
+// must be called with t.mu locked.
+func (t *MetadataTracker) ensureObservationsInitialized(packageName string) {
+	if t.observations == nil {
+		t.observations = &pkgtestobservation.Test{
+			Package:    packageName,
+			Catalogers: make(map[string]*pkgtestobservation.Cataloger),
+			Parsers:    make(map[string]*pkgtestobservation.Parser),
+		}
+		return
+	}
+
+	// only fill in a missing name: an already-set package name is kept, so a later, different name is ignored
+	if t.observations.Package == "" {
+		t.observations.Package = packageName
+	}
+}
+
+// getOrCreateParser returns the parser observation stored under parserFunction, inserting an empty one first when absent.
+// must be called with t.mu locked and with t.observations already allocated (it is dereferenced unconditionally).
+func (t *MetadataTracker) getOrCreateParser(parserFunction string) *pkgtestobservation.Parser {
+	if t.observations.Parsers[parserFunction] == nil {
+		t.observations.Parsers[parserFunction] = &pkgtestobservation.Parser{
+			MetadataTypes: []string{}, // non-nil: encodes as [] rather than null in JSON
+			PackageTypes:  []string{}, // non-nil: encodes as [] rather than null in JSON
+			Observations:  pkgtestobservation.Observations{},
+		}
+	}
+	return t.observations.Parsers[parserFunction]
+}
+
+// getOrCreateCataloger returns the cataloger observation stored under catalogerName, inserting an empty one first when absent.
+// must be called with t.mu locked and with t.observations already allocated (it is dereferenced unconditionally).
+func (t *MetadataTracker) getOrCreateCataloger(catalogerName string) *pkgtestobservation.Cataloger {
+	if t.observations.Catalogers[catalogerName] == nil {
+		t.observations.Catalogers[catalogerName] = &pkgtestobservation.Cataloger{
+			MetadataTypes: []string{}, // non-nil: encodes as [] rather than null in JSON
+			PackageTypes:  []string{}, // non-nil: encodes as [] rather than null in JSON
+			Observations:  pkgtestobservation.Observations{},
+		}
+	}
+	return t.observations.Catalogers[catalogerName]
+}
+
+// RecordParserObservations folds pkgs and relationships into the observations kept for parserFunction; calls with an empty packageName or parserFunction are ignored.
+func (t *MetadataTracker) RecordParserObservations(
+	packageName, parserFunction string,
+	pkgs []pkg.Package,
+	relationships []artifact.Relationship,
+) {
+	if packageName == "" || parserFunction == "" {
+		return
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	t.ensureObservationsInitialized(packageName)
+	parser := t.getOrCreateParser(parserFunction)
+	aggregateObservations(&parser.MetadataTypes, &parser.PackageTypes, &parser.Observations, pkgs, relationships)
+}
+
+// RecordCatalogerObservations folds pkgs and relationships into the observations kept for catalogerName; calls with an empty packageName or catalogerName are ignored.
+func (t *MetadataTracker) RecordCatalogerObservations(
+	packageName, catalogerName string,
+	pkgs []pkg.Package,
+	relationships []artifact.Relationship,
+) {
+	if packageName == "" || catalogerName == "" {
+		return
+	}
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	t.ensureObservationsInitialized(packageName)
+	cataloger := t.getOrCreateCataloger(catalogerName)
+	aggregateObservations(&cataloger.MetadataTypes, &cataloger.PackageTypes, &cataloger.Observations, pkgs, relationships)
+}
+
+// ===== Metadata Type and Capability Detection =====
+// These functions use reflection to inspect package metadata and detect capabilities.
+// They are best-effort and may not catch all cases.
+
+// getMetadataTypeName returns the short, package-qualified type name of metadata (e.g., "pkg.ApkDBEntry").
+// only the last segment of the import path is kept (so the result is not fully qualified); nil metadata yields "".
+func getMetadataTypeName(metadata interface{}) string {
+	if metadata == nil {
+		return ""
+	}
+
+	t := reflect.TypeOf(metadata)
+	if t == nil {
+		return ""
+	}
+
+	// unwrap a single level of pointer so that *T and T report the same name
+	if t.Kind() == reflect.Ptr {
+		t = t.Elem()
+	}
+
+	// types with a package path: last path segment + type name (e.g., "pkg.ApkDBEntry")
+	if t.PkgPath() != "" {
+		// extract just "pkg" from "github.com/anchore/syft/syft/pkg"
+		pkgPath := lastPathSegment(t.PkgPath())
+		return pkgPath + "." + t.Name()
+	}
+
+	return t.Name()
+}
+
+// lastPathSegment returns the text after the final '/' in path, or path unchanged when it contains no '/'.
+// for example: "github.com/anchore/syft/syft/pkg" -> "pkg"
+func lastPathSegment(path string) string {
+	for i := len(path) - 1; i >= 0; i-- { // scan backwards so the first '/' found is the last one in the string
+		if path[i] == '/' {
+			return path[i+1:]
+		}
+	}
+	return path
+}
+
+// hasIntegrityHash reports whether metadata is a struct (or pointer to one) with a non-empty string field named in commonPackageIntegrityFields.
+// note: this uses a best-effort approach checking common field names.
+// DO NOT depend on these values in auto-generated capabilities definitions - use for test validation only.
+func hasIntegrityHash(metadata interface{}) bool {
+	v := dereferenceToStruct(metadata)
+	if !v.IsValid() || v.Kind() != reflect.Struct {
+		return false
+	}
+
+	for _, fieldName := range commonPackageIntegrityFields {
+		if hasPopulatedStringField(v, fieldName) {
+			return true
+		}
+	}
+	return false
+}
+
+// hasFileDigests reports whether metadata is a struct (or pointer to one) with a slice field named "Files" in which some element has a populated "Digest" field.
+// note: uses a best-effort approach for detection.
+// DO NOT depend on these values in auto-generated capabilities definitions - use for test validation only.
+func hasFileDigests(metadata interface{}) bool {
+	v := dereferenceToStruct(metadata)
+	if !v.IsValid() || v.Kind() != reflect.Struct {
+		return false
+	}
+
+	filesField := v.FieldByName("Files")
+	if !filesField.IsValid() || filesField.Kind() != reflect.Slice {
+		return false
+	}
+
+	// a single file record with a populated Digest field is enough
+	for i := 0; i < filesField.Len(); i++ {
+		if hasPopulatedDigest(filesField.Index(i)) {
+			return true
+		}
+	}
+	return false
+}
+
+// dereferenceToStruct unwraps at most one level of pointer and returns the resulting reflect.Value.
+// returns an invalid value for nil input or a nil pointer; non-struct values are returned as-is, so callers must check Kind() themselves.
+func dereferenceToStruct(v interface{}) reflect.Value {
+	if v == nil {
+		return reflect.Value{}
+	}
+
+	val := reflect.ValueOf(v)
+	if val.Kind() == reflect.Ptr {
+		if val.IsNil() {
+			return reflect.Value{}
+		}
+		val = val.Elem()
+	}
+	return val
+}
+
+// hasPopulatedStringField reports whether v has a string-kind field called fieldName whose value is non-empty; the callers in this file verify v is a struct before calling.
+func hasPopulatedStringField(v reflect.Value, fieldName string) bool {
+	field := v.FieldByName(fieldName)
+	return field.IsValid() && field.Kind() == reflect.String && field.String() != ""
+}
+
+// hasPopulatedDigest reports whether a file record (struct or pointer to struct) has a "Digest" field that is a non-nil pointer, a non-empty string, or a non-zero struct.
+func hasPopulatedDigest(fileRecord reflect.Value) bool {
+	fileRecord = dereferenceToStruct(fileRecord.Interface())
+	if !fileRecord.IsValid() || fileRecord.Kind() != reflect.Struct {
+		return false
+	}
+
+	digestField := fileRecord.FieldByName("Digest")
+	if !digestField.IsValid() {
+		return false
+	}
+
+	// "populated" depends on the field kind; any kind not listed below is treated as not populated
+	switch digestField.Kind() {
+	case reflect.Ptr:
+		return !digestField.IsNil()
+	case reflect.String:
+		return digestField.String() != ""
+	case reflect.Struct:
+		return !digestField.IsZero()
+	}
+	return false
+}
+
+// ===== Utility Functions =====
+
+// countDependencyRelationships returns how many of the given relationships have type artifact.DependencyOfRelationship.
+func countDependencyRelationships(relationships []artifact.Relationship) int {
+	count := 0
+	for _, rel := range relationships {
+		if rel.Type == artifact.DependencyOfRelationship {
+			count++
+		}
+	}
+	return count
+}
+
+// contains reports whether item is an element of slice (linear scan, exact string match).
+func contains(slice []string, item string) bool {
+	for _, s := range slice {
+		if s == item {
+			return true
+		}
+	}
+	return false
+}
+
+// ===== Result Writing =====
+
+// WriteResults writes the collected observations to test-fixtures/test-observations.json (a path relative to the working directory), or does nothing when no observations were recorded.
+func (t *MetadataTracker) WriteResults() error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if t.observations == nil {
+		// nothing was recorded, so no file is created
+		return nil
+	}
+
+	// create output directory (no-op when it already exists)
+	outDir := "test-fixtures"
+	if err := os.MkdirAll(outDir, 0755); err != nil {
+		return err
+	}
+
+	// stamp the write time, then write unified test-observations.json
+	t.observations.UpdatedAt = time.Now().UTC()
+
+	filename := filepath.Join(outDir, "test-observations.json")
+	return writeJSONFile(filename, t.observations)
+}
+
+// writeJSONFile creates (or truncates) path and writes data to it as two-space-indented JSON; only create and encode errors are returned.
+func writeJSONFile(path string, data interface{}) error {
+	file, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	defer file.Close() // NOTE(review): the error from Close is discarded here
+
+	encoder := json.NewEncoder(file)
+	encoder.SetIndent("", "  ")
+	return encoder.Encode(data)
+}
+
+// WriteResultsIfEnabled writes the singleton tracker's results; no enable/disable check is made here (WriteResults only skips when nothing was recorded).
+// this is typically called via t.Cleanup() in tests.
+func WriteResultsIfEnabled() error {
+	tracker := getTracker()
+	return tracker.WriteResults()
+}
diff --git a/syft/pkg/cataloger/internal/pkgtest/observing_resolver.go b/syft/pkg/cataloger/internal/pkgtest/observing_resolver.go
index a05d0823b..49fc9c51f 100644
--- a/syft/pkg/cataloger/internal/pkgtest/observing_resolver.go
+++ b/syft/pkg/cataloger/internal/pkgtest/observing_resolver.go
@@ -1,3 +1,5 @@
+// Package pkgtest provides test helpers for cataloger and parser testing,
+// including resolver decorators that track file access patterns.
 package pkgtest
 
 import (
@@ -13,29 +15,36 @@ import (
 
 var _ file.Resolver = (*ObservingResolver)(nil)
 
+// ObservingResolver wraps a file.Resolver to observe and track all file access patterns.
+// it records what paths were queried, what was returned, and what file contents were read.
+// this is useful for validating that catalogers use appropriate glob patterns and don't over-read files.
 type ObservingResolver struct {
 	decorated          file.Resolver
-	pathQueries        map[string][]string
-	pathResponses      []file.Location
-	contentQueries     []file.Location
-	emptyPathResponses map[string][]string
+	pathQueries        map[string][]string // method name -> list of query patterns
+	pathResponses      []file.Location     // all locations successfully returned
+	contentQueries     []file.Location     // all locations whose content was read
+	emptyPathResponses map[string][]string // method name -> paths that returned empty results
 }
 
+// NewObservingResolver creates a new ObservingResolver that wraps the given resolver.
 func NewObservingResolver(resolver file.Resolver) *ObservingResolver {
 	return &ObservingResolver{
 		decorated:          resolver,
-		pathResponses:      make([]file.Location, 0),
-		emptyPathResponses: make(map[string][]string),
 		pathQueries:        make(map[string][]string),
+		pathResponses:      make([]file.Location, 0),
+		contentQueries:     make([]file.Location, 0),
+		emptyPathResponses: make(map[string][]string),
 	}
 }
 
-// testing helpers...
+// ===== Test Assertion Helpers =====
+// these methods are used by tests to validate expected file access patterns.
 
+// ObservedPathQuery checks if a specific path pattern was queried.
 func (r *ObservingResolver) ObservedPathQuery(input string) bool {
-	for _, vs := range r.pathQueries {
-		for _, v := range vs {
-			if v == input {
+	for _, queries := range r.pathQueries {
+		for _, query := range queries {
+			if query == input {
 				return true
 			}
 		}
@@ -43,6 +52,7 @@ func (r *ObservingResolver) ObservedPathQuery(input string) bool {
 	return false
 }
 
+// ObservedPathResponses checks if a specific path was returned in any response.
 func (r *ObservingResolver) ObservedPathResponses(path string) bool {
 	for _, loc := range r.pathResponses {
 		if loc.RealPath == path {
@@ -52,6 +62,7 @@ func (r *ObservingResolver) ObservedPathResponses(path string) bool {
 	return false
 }
 
+// ObservedContentQueries checks if a specific file's content was read.
 func (r *ObservingResolver) ObservedContentQueries(path string) bool {
 	for _, loc := range r.contentQueries {
 		if loc.RealPath == path {
@@ -61,6 +72,7 @@ func (r *ObservingResolver) ObservedContentQueries(path string) bool {
 	return false
 }
 
+// AllContentQueries returns a deduplicated list of all file paths whose content was read.
 func (r *ObservingResolver) AllContentQueries() []string {
 	observed := strset.New()
 	for _, loc := range r.contentQueries {
@@ -69,155 +81,166 @@ func (r *ObservingResolver) AllContentQueries() []string {
 	return observed.List()
 }
 
+// AllPathQueries returns all path query patterns grouped by method name.
 func (r *ObservingResolver) AllPathQueries() map[string][]string {
 	return r.pathQueries
 }
 
+// PruneUnfulfilledPathResponses removes specified paths from the unfulfilled requests tracking.
+// ignore maps method names to paths that should be ignored for that method.
+// ignorePaths lists paths that should be ignored for all methods.
 func (r *ObservingResolver) PruneUnfulfilledPathResponses(ignore map[string][]string, ignorePaths ...string) {
-	if ignore == nil {
-		return
-	}
-	// remove any paths that were ignored for specific calls
-	for k, v := range ignore {
-		results := r.emptyPathResponses[k]
-		for _, ig := range v {
-			for i, result := range results {
-				if result == ig {
-					results = append(results[:i], results[i+1:]...)
-					break
-				}
-			}
-		}
-		if len(results) > 0 {
-			r.emptyPathResponses[k] = results
-		} else {
-			delete(r.emptyPathResponses, k)
+	// remove paths ignored for specific methods
+	for methodName, pathsToIgnore := range ignore {
+		r.emptyPathResponses[methodName] = removeStrings(r.emptyPathResponses[methodName], pathsToIgnore)
+		if len(r.emptyPathResponses[methodName]) == 0 {
+			delete(r.emptyPathResponses, methodName)
 		}
 	}
 
-	// remove any paths that were ignored for all calls
-	for _, ig := range ignorePaths {
-		for k, v := range r.emptyPathResponses {
-			for i, result := range v {
-				if result == ig {
-					v = append(v[:i], v[i+1:]...)
-					break
-				}
-			}
-			if len(v) > 0 {
-				r.emptyPathResponses[k] = v
-			} else {
-				delete(r.emptyPathResponses, k)
+	// remove paths ignored for all methods
+	if len(ignorePaths) > 0 {
+		for methodName := range r.emptyPathResponses {
+			r.emptyPathResponses[methodName] = removeStrings(r.emptyPathResponses[methodName], ignorePaths)
+			if len(r.emptyPathResponses[methodName]) == 0 {
+				delete(r.emptyPathResponses, methodName)
 			}
 		}
 	}
 }
 
+// HasUnfulfilledPathRequests returns true if there are any paths that were queried but returned empty.
 func (r *ObservingResolver) HasUnfulfilledPathRequests() bool {
 	return len(r.emptyPathResponses) > 0
 }
 
+// PrettyUnfulfilledPathRequests returns a formatted string of all unfulfilled path requests.
 func (r *ObservingResolver) PrettyUnfulfilledPathRequests() string {
-	var res string
-	var keys []string
+	if len(r.emptyPathResponses) == 0 {
+		return ""
+	}
 
+	var keys []string
 	for k := range r.emptyPathResponses {
 		keys = append(keys, k)
 	}
-
 	sort.Strings(keys)
 
+	var result string
 	for _, k := range keys {
-		res += fmt.Sprintf("   %s: %+v\n", k, r.emptyPathResponses[k])
+		result += fmt.Sprintf("   %s: %+v\n", k, r.emptyPathResponses[k])
 	}
-	return res
+	return result
 }
 
-// For the file path resolver...
+// removeStrings removes all occurrences of toRemove from slice.
+func removeStrings(slice []string, toRemove []string) []string {
+	if len(toRemove) == 0 {
+		return slice
+	}
 
-func (r *ObservingResolver) addPathQuery(name string, input ...string) {
-	r.pathQueries[name] = append(r.pathQueries[name], input...)
+	// create a set for O(1) lookup
+	removeSet := make(map[string]bool)
+	for _, s := range toRemove {
+		removeSet[s] = true
+	}
+
+	// filter the slice
+	result := make([]string, 0, len(slice))
+	for _, s := range slice {
+		if !removeSet[s] {
+			result = append(result, s)
+		}
+	}
+	return result
 }
 
-func (r *ObservingResolver) addPathResponse(locs ...file.Location) {
+// ===== Internal Tracking Helpers =====
+
+// recordQuery records a path query for a given method.
+func (r *ObservingResolver) recordQuery(methodName string, queries ...string) {
+	r.pathQueries[methodName] = append(r.pathQueries[methodName], queries...)
+}
+
+// recordResponses records successful path responses and tracks any unfulfilled queries.
+func (r *ObservingResolver) recordResponses(methodName string, locs []file.Location, queriedPaths ...string) {
 	r.pathResponses = append(r.pathResponses, locs...)
-}
 
-func (r *ObservingResolver) addEmptyPathResponse(name string, locs []file.Location, paths ...string) {
-	if len(locs) == 0 {
-		results := r.emptyPathResponses[name]
-		results = append(results, paths...)
-		r.emptyPathResponses[name] = results
+	// track paths that returned no results
+	if len(locs) == 0 && len(queriedPaths) > 0 {
+		r.emptyPathResponses[methodName] = append(r.emptyPathResponses[methodName], queriedPaths...)
 	}
 }
 
+// ===== file.Resolver Implementation =====
+// these methods delegate to the wrapped resolver while recording observations.
+
+// FilesByPath returns files matching the given paths.
 func (r *ObservingResolver) FilesByPath(paths ...string) ([]file.Location, error) {
-	name := "FilesByPath"
-	r.addPathQuery(name, paths...)
+	const methodName = "FilesByPath"
+	r.recordQuery(methodName, paths...)
 
 	locs, err := r.decorated.FilesByPath(paths...)
+	r.recordResponses(methodName, locs, paths...)
 
-	r.addPathResponse(locs...)
-	r.addEmptyPathResponse(name, locs, paths...)
 	return locs, err
 }
 
+// FilesByGlob returns files matching the given glob patterns.
 func (r *ObservingResolver) FilesByGlob(patterns ...string) ([]file.Location, error) {
-	name := "FilesByGlob"
-	r.addPathQuery(name, patterns...)
+	const methodName = "FilesByGlob"
+	r.recordQuery(methodName, patterns...)
 
 	locs, err := r.decorated.FilesByGlob(patterns...)
+	r.recordResponses(methodName, locs, patterns...)
 
-	r.addPathResponse(locs...)
-	r.addEmptyPathResponse(name, locs, patterns...)
 	return locs, err
 }
 
+// FilesByMIMEType returns files matching the given MIME types.
 func (r *ObservingResolver) FilesByMIMEType(types ...string) ([]file.Location, error) {
-	name := "FilesByMIMEType"
-	r.addPathQuery(name, types...)
+	const methodName = "FilesByMIMEType"
+	r.recordQuery(methodName, types...)
 
 	locs, err := r.decorated.FilesByMIMEType(types...)
+	r.recordResponses(methodName, locs, types...)
 
-	r.addPathResponse(locs...)
-	r.addEmptyPathResponse(name, locs, types...)
 	return locs, err
 }
 
-func (r *ObservingResolver) RelativeFileByPath(l file.Location, path string) *file.Location {
-	name := "RelativeFileByPath"
-	r.addPathQuery(name, path)
+// RelativeFileByPath returns a file at a path relative to the given location.
+func (r *ObservingResolver) RelativeFileByPath(location file.Location, path string) *file.Location {
+	const methodName = "RelativeFileByPath"
+	r.recordQuery(methodName, path)
 
-	loc := r.decorated.RelativeFileByPath(l, path)
+	loc := r.decorated.RelativeFileByPath(location, path)
 
 	if loc != nil {
-		r.addPathResponse(*loc)
+		r.pathResponses = append(r.pathResponses, *loc)
 	} else {
-		results := r.emptyPathResponses[name]
-		results = append(results, path)
-		r.emptyPathResponses[name] = results
+		r.emptyPathResponses[methodName] = append(r.emptyPathResponses[methodName], path)
 	}
+
 	return loc
 }
 
-// For the content resolver methods...
-
+// FileContentsByLocation returns a reader for the contents of the file at the given location.
 func (r *ObservingResolver) FileContentsByLocation(location file.Location) (io.ReadCloser, error) {
 	r.contentQueries = append(r.contentQueries, location)
-	reader, err := r.decorated.FileContentsByLocation(location)
-	return reader, err
+	return r.decorated.FileContentsByLocation(location)
 }
 
-// For the remaining resolver methods...
-
+// AllLocations returns all file locations known to the resolver.
 func (r *ObservingResolver) AllLocations(ctx context.Context) <-chan file.Location {
 	return r.decorated.AllLocations(ctx)
 }
 
-func (r *ObservingResolver) HasPath(s string) bool {
-	return r.decorated.HasPath(s)
+// HasPath returns true if the resolver knows about the given path.
+func (r *ObservingResolver) HasPath(path string) bool {
+	return r.decorated.HasPath(path)
 }
 
+// FileMetadataByLocation returns metadata for the file at the given location.
 func (r *ObservingResolver) FileMetadataByLocation(location file.Location) (file.Metadata, error) {
 	return r.decorated.FileMetadataByLocation(location)
 }
diff --git a/syft/pkg/cataloger/internal/pkgtest/test_generic_parser.go b/syft/pkg/cataloger/internal/pkgtest/test_generic_parser.go
index 493d44355..08e789fbb 100644
--- a/syft/pkg/cataloger/internal/pkgtest/test_generic_parser.go
+++ b/syft/pkg/cataloger/internal/pkgtest/test_generic_parser.go
@@ -6,6 +6,8 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"reflect"
+	"runtime"
 	"sort"
 	"strings"
 	"sync"
@@ -56,6 +58,7 @@ type CatalogTester struct {
 	packageStringer                func(pkg.Package) string
 	customAssertions               []func(t *testing.T, pkgs []pkg.Package, relationships []artifact.Relationship)
 	context                        context.Context
+	skipTestObservations           bool
 }
 
 func Context() context.Context {
@@ -260,13 +263,23 @@ func (p *CatalogTester) IgnoreUnfulfilledPathResponses(paths ...string) *Catalog
 	return p
 }
 
+func (p *CatalogTester) WithoutTestObserver() *CatalogTester {
+	p.skipTestObservations = true // NOTE(review): presumably read by the observation-tracking helpers to skip recording; the consumer is not visible in this hunk
+	return p
+}
+
 func (p *CatalogTester) TestParser(t *testing.T, parser generic.Parser) {
 	t.Helper()
 	pkgs, relationships, err := parser(p.context, p.resolver, p.env, p.reader)
+
 	// only test for errors if explicitly requested
 	if p.wantErr != nil {
 		p.wantErr(t, err)
 	}
+
+	// track metadata types for cataloger discovery
+	p.trackParserMetadata(t, parser, pkgs, relationships)
+
 	p.assertPkgs(t, pkgs, relationships)
 }
 
@@ -292,6 +305,9 @@ func (p *CatalogTester) TestCataloger(t *testing.T, cataloger pkg.Cataloger) {
 		p.wantErr(t, err)
 	}
 
+	// track metadata types for cataloger discovery
+	p.trackCatalogerMetadata(t, cataloger, pkgs, relationships)
+
 	if p.assertResultExpectations {
 		p.assertPkgs(t, pkgs, relationships)
 	}
@@ -458,3 +474,163 @@ func stringPackage(p pkg.Package) string {
 
 	return fmt.Sprintf("%s @ %s (%s)", p.Name, p.Version, loc)
 }
+
+// getFunctionName returns the unqualified name of the function value fn, e.g.
+// "github.com/anchore/syft/syft/pkg/cataloger/python.parseRequirementsTxt" -> "parseRequirementsTxt".
+// The "-fm" suffix that appears on method values is removed, e.g.
+// "parsePackageLock-fm" -> "parsePackageLock".
+// An empty string is returned when fn is not a non-nil function or no symbol is found for it.
+func getFunctionName(fn interface{}) string {
+	// reflect.Value.Pointer panics for kinds such as Int or Invalid (the zero Value produced by a
+	// nil interface), so reject anything that is not a function up front
+	v := reflect.ValueOf(fn)
+	if v.Kind() != reflect.Func || v.IsNil() {
+		return ""
+	}
+
+	funcForPC := runtime.FuncForPC(v.Pointer())
+	if funcForPC == nil {
+		return ""
+	}
+
+	// keep only what follows the final "."; strings.LastIndex returns -1 when there is no ".",
+	// in which case the slice below keeps the whole name
+	name := funcForPC.Name()
+	name = name[strings.LastIndex(name, ".")+1:]
+
+	// method values are reported with a trailing "-fm"
+	return strings.TrimSuffix(name, "-fm")
+}
+
+// getCatalogerName returns cataloger.Name(); the *testing.T argument is currently unused
+func getCatalogerName(_ *testing.T, cataloger pkg.Cataloger) string {
+	// the name comes straight from the cataloger (nothing is derived from the test context)
+	return cataloger.Name()
+}
+
+// getPackagePath returns the part of fn's fully qualified function name that lies between
+// "/cataloger/" and the first following ".", e.g.
+// "github.com/anchore/syft/syft/pkg/cataloger/python.parseRequirementsTxt" -> "python".
+// An empty string is returned when fn is not a non-nil function, when no symbol is found for it,
+// or when the qualified name has no "/cataloger/" element.
+func getPackagePath(fn interface{}) string {
+	// reflect.Value.Pointer panics for kinds such as Int or Invalid (the zero Value produced by a
+	// nil interface), so reject anything that is not a function up front
+	v := reflect.ValueOf(fn)
+	if v.Kind() != reflect.Func || v.IsNil() {
+		return ""
+	}
+
+	funcForPC := runtime.FuncForPC(v.Pointer())
+	if funcForPC == nil {
+		return ""
+	}
+
+	// look only at the text between the first "/cataloger/" and the next one (if any)
+	_, remaining, found := strings.Cut(funcForPC.Name(), "/cataloger/")
+	if !found {
+		return ""
+	}
+	remaining, _, _ = strings.Cut(remaining, "/cataloger/")
+
+	// the package name is everything before the first "."
+	packageName, _, _ := strings.Cut(remaining, ".")
+	return packageName
+}
+
+// getPackagePathFromCataloger returns the first directory after "/cataloger/" in the source path
+// of the nearest stack frame that is not under cataloger/internal. The cataloger value itself is
+// not consulted (generic catalogers are declared in the generic package, not in rust, python, etc.).
+func getPackagePathFromCataloger(_ pkg.Cataloger) string {
+	const marker = "/cataloger/"
+
+	// inspect at most the 10 innermost stack frames, starting with this function
+	for depth := 0; depth < 10; depth++ {
+		_, sourcePath, _, ok := runtime.Caller(depth)
+		if !ok {
+			return ""
+		}
+
+		// e.g. "/Users/.../syft/pkg/cataloger/rust/cataloger_test.go" -> "rust"
+		if !strings.Contains(sourcePath, marker) {
+			continue
+		}
+
+		// take the first directory that follows the first marker in the path
+		afterMarker := strings.Split(sourcePath, marker)[1]
+		firstDir := strings.Split(afterMarker, "/")[0]
+
+		// frames under cataloger/internal (such as this helper) are skipped
+		if firstDir != "internal" {
+			return firstDir
+		}
+	}
+
+	return ""
+}
+
+// trackParserMetadata feeds the packages and relationships produced by parser into the shared
+// tracker, keyed by the parser's function name and the package name taken from its qualified name.
+// Nothing is recorded when observations are disabled or when either the parser name or the
+// package name is empty.
+func (p *CatalogTester) trackParserMetadata(t *testing.T, parser generic.Parser, pkgs []pkg.Package, relationships []artifact.Relationship) {
+	// tests opt out via WithoutTestObserver
+	if p.skipTestObservations {
+		return
+	}
+
+	// both names are derived from the parser's fully qualified function name
+	parserName, packageName := getFunctionName(parser), getPackagePath(parser)
+	if parserName == "" || packageName == "" {
+		return
+	}
+
+	tracker := getTracker()
+
+	// old tracking (still used by metadata discovery)
+	for _, observed := range pkgs {
+		tracker.RecordParserPackageMetadata(packageName, parserName, observed)
+	}
+
+	// new unified observations with capability tracking
+	tracker.RecordParserObservations(packageName, parserName, pkgs, relationships)
+
+	// ensure results are written when tests complete; the value returned by
+	// WriteResultsIfEnabled is discarded
+	t.Cleanup(func() {
+		_ = WriteResultsIfEnabled()
+	})
+}
+
+// trackCatalogerMetadata feeds the packages and relationships produced by cataloger into the
+// shared tracker, keyed by the cataloger's name and the cataloger package found on the call stack.
+// Nothing is recorded when observations are disabled or when either the cataloger name or the
+// package name is empty.
+func (p *CatalogTester) trackCatalogerMetadata(t *testing.T, cataloger pkg.Cataloger, pkgs []pkg.Package, relationships []artifact.Relationship) {
+	// tests opt out via WithoutTestObserver
+	if p.skipTestObservations {
+		return
+	}
+
+	// the package is inferred from the call stack, not from the cataloger's type
+	catalogerName, packageName := getCatalogerName(t, cataloger), getPackagePathFromCataloger(cataloger)
+	if catalogerName == "" || packageName == "" {
+		return
+	}
+
+	tracker := getTracker()
+
+	// old tracking (still used by metadata discovery)
+	for _, observed := range pkgs {
+		tracker.RecordCatalogerPackageMetadata(catalogerName, observed)
+	}
+
+	// new unified observations with capability tracking
+	tracker.RecordCatalogerObservations(packageName, catalogerName, pkgs, relationships)
+
+	// ensure results are written when tests complete; the value returned by
+	// WriteResultsIfEnabled is discarded
+	t.Cleanup(func() {
+		_ = WriteResultsIfEnabled()
+	})
+}
diff --git a/syft/pkg/cataloger/javascript/parse_yarn_lock_test.go b/syft/pkg/cataloger/javascript/parse_yarn_lock_test.go
index f6719db4a..0cf6b9dba 100644
--- a/syft/pkg/cataloger/javascript/parse_yarn_lock_test.go
+++ b/syft/pkg/cataloger/javascript/parse_yarn_lock_test.go
@@ -284,7 +284,11 @@ func TestSearchYarnForLicenses(t *testing.T) {
 			}
 			tc.config.NPMBaseURL = url
 			adapter := newGenericYarnLockAdapter(tc.config)
-			pkgtest.TestFileParser(t, fixture, adapter.parseYarnLock, tc.expectedPackages, nil)
+			pkgtest.NewCatalogTester().
+				FromFile(t, fixture).
+				Expects(tc.expectedPackages, nil).
+				WithoutTestObserver(). // this is an online test, thus not the default configuration
+				TestParser(t, adapter.parseYarnLock)
 		})
 	}
 }
diff --git a/syft/pkg/type.go b/syft/pkg/type.go
index 7dea22586..6ac815f0e 100644
--- a/syft/pkg/type.go
+++ b/syft/pkg/type.go
@@ -7,6 +7,10 @@ import (
 // Type represents a Package Type for or within a language ecosystem (there may be multiple package types within a language ecosystem)
 type Type string
 
+func (t Type) String() string { // String returns the Type's underlying string value (signature matches fmt.Stringer)
+	return string(t)
+}
+
 const (
 	// the full set of supported packages
 	UnknownPkg              Type = "UnknownPackage"