review: remove and refactor implementation for easier review

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
2026-07-05 02:28:25 +02:00 · 2026-06-01 22:34:37 -04:00 · 2026-06-01 22:34:37 -04:00 · 4eaf583526
commit 4eaf583526
parent 4352ac4691
1 changed files with 92 additions and 174 deletions
--- a/syft/pkg/cataloger/ai/processor.go
+++ b/syft/pkg/cataloger/ai/processor.go
@ -22,7 +22,7 @@ import (
 	"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
 )
-// ociGroupKey is the sentinel grouping key for every safetensors package that
+// ociGroupKey is the grouping key for every safetensors package that
 // originated from an OCI model artifact. The ContainerImageModel resolver gives
 // each layer the virtual RealPath "/" regardless of layer media type, so all
 // safetensors packages from a single OCI scan collapse into one group.
@ -78,31 +78,17 @@ func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err er
 	return namedPkgs, rels, err
 }
-// safeTensorsMergeProcessor owns naming, license resolution, etc
+// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation
-//  1. groups all nameless packages by parent directory (or a single sentinel
+// - groups all nameless packages
-//     for OCI artifacts, since the ContainerImageModel resolver puts every
+// - merge the per-shard metadata
-//     layer at virtual path "/");
+// - picks a name (see pickSafeTensorsName)
 //  2. merges the per-shard metadata (tensor count, dominant dtype, total size,
 //     UserMetadata, rollup MetadataHash) into one package per group;
 //  3. enriches the merged package by consulting the resolver ONCE per group —
 //     sibling config.json + README.md for dir scans, the model-file companion
 //     layers + license layer for OCI — and attaches those locations as
 //     supporting evidence;
 //  4. picks a name via a two-rung precedence chain (see pickSafeTensorsName):
 //     config.json _name_or_path first (both sources), then the OCI image-ref
 //     last segment (OCI only). Drops the group when neither rung produces a
 //     name. There is no opaque fallback (no Architecture-Parameters synthetic,
 //     no parent-dir, no MetadataHash-as-name) — an unnameable model is
 //     intentionally absent from the SBOM rather than recorded under a
 //     misleading label.
 func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
 	if err != nil || len(pkgs) == 0 {
 		return pkgs, rels, err
 	}
-	// Defensively split off non-safetensors packages — the cataloger only emits
+	// split off non-safetensors packages
-	// SafeTensorsModelInfo today, but this keeps the processor robust if other
+	// this keeps the processor robust if other types ever flow through
 	// types ever flow through.
 	var stPkgs, other []pkg.Package
 	for _, p := range pkgs {
 		if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
@ -127,11 +113,17 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
 	out := other
 	for _, key := range keys {
 		merged := mergeSafeTensorsGroup(groups[key])
-		nameOrPath, fallbackName := enrichSafeTensorsGroup(ctx, resolver, key, &merged)
+
-		name := pickSafeTensorsName(nameOrPath, fallbackName)
+		// Resolve model identity (name candidates) before enrich
 		id := resolveSafeTensorsIdentity(resolver, key, &merged)
 		name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
 		if name == "" {
-			continue // drop groups with no name source and no usable fallback
+			log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
 				merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
 			continue
 		}
 		enrichSafeTensorsGroup(ctx, resolver, key, &merged, id)
 		merged.Name = name
 		merged.SetID()
 		out = append(out, merged)
@ -140,8 +132,7 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
 }
 // groupSafeTensorsPackages buckets packages by the parent directory of their
-// primary-evidence location, or the OCI sentinel when the location lives at
+// primary-evidence location
 // the ContainerImageModel resolver's virtual "/" path.
 func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
 	out := make(map[string][]pkg.Package)
 	for _, p := range pkgs {
@ -178,19 +169,7 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location {
 	return nil
 }
-// mergeSafeTensorsGroup folds a group's per-member metadata into a single
+// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
 // package. Members are classified into two buckets to avoid double-counting:
 //
 //   - "aggregate" members have producer-declared totals (TensorCount, TotalSize,
 //     ShardCount, Parameters) but no MetadataHash — these are the Docker AI
 //     config blob and the sharded-index file.
 //   - "shard" members have a content-derived MetadataHash and per-shard counts —
 //     these are the individual .safetensors header parsers, both dir-scan and
 //     OCI weight-layer.
 //
 // Aggregate values are the source of truth for the merged totals when present;
 // shards contribute Quantization, UserMetadata, the rollup MetadataHash, and
 // (for multi-shard models) the Parts breakdown.
 func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
 	locSet := unionLocations(members)
 	aggregates, shards := bucketSafeTensorsMembers(members)
@ -199,6 +178,11 @@ func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
 	mergeAggregatesInto(&merged, aggregates)
 	shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
 	// Keep merged UserMetadata globally key-sorted so the SBOM is stable
 	sort.Slice(merged.UserMetadata, func(i, j int) bool {
 		return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
 	})
 	if merged.TensorCount == 0 {
 		merged.TensorCount = shardTensorTotal
 	}
@ -228,10 +212,6 @@ func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
 	}
 }
 // mergeAggregatesInto folds aggregate-declared totals (config blob or sharded
 // index) into merged. First non-empty wins, so the order aggregates are passed
 // in determines tie-breaking — in practice there is one config blob and one
 // index per group, never two of the same kind.
 func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
 	for _, a := range aggregates {
 		if merged.TensorCount == 0 {
@ -252,8 +232,7 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 // mergeShardsInto folds the per-shard header metadata into merged, returning
 // the summed shard TensorCount and the list of non-empty per-shard hashes for
 // the rollup. Shards carry only the content-derived fields (Quantization,
-// Parameters, UserMetadata); producer-declared fields like Architecture come
+// Parameters, UserMetadata);
 // from the resolver-backed enrichment that runs afterwards.
 func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
 	seenKV := map[string]bool{}
 	for _, s := range shards {
@ -309,11 +288,6 @@ func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.S
 	return aggregates, shards
 }
 // rollupHash returns a stable hash across the sorted set of per-member
 // content-derived hashes. For a single member it returns that hash unchanged,
 // so a single-file dir scan and an OCI scan with one safetensors layer surface
 // the same value. For multi-shard models the rollup is the xxhash of the
 // sorted hashes joined with "|".
 func rollupHash(hashes []string) string {
 	if len(hashes) == 0 {
 		return ""
@ -326,43 +300,59 @@ func rollupHash(hashes []string) string {
 	return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
 }
-// enrichSafeTensorsGroup reads the resolver once for the group to populate the
+type safeTensorsIdentity struct {
-// merged metadata's Architecture / TorchDtype / TransformersVersion, set the
+	nameOrPath    string
-// licenses on the merged package, and attach the location of every consulted
+	fallbackName  string
-// supporting file as SupportingEvidence. Returns two name candidates for the
+	readmeLicense string
-// merge processor: nameOrPath (raw _name_or_path from a config.json) and a
+	supporting    []file.Location
-// fallbackName used when no _name_or_path is available — the last path segment
+}
-// of the OCI image reference for OCI groups, or the parent directory's base
+
-// name for directory-scan groups.
+// resolveSafeTensorsIdentity reads the resolver for the group's naming signals
-func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath, fallbackName string) {
+// (config.json _name_or_path, README base_model, OCI image ref / dir name)
 func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity {
 	md := merged.Metadata.(pkg.SafeTensorsModelInfo)
-	var (
+	var id safeTensorsIdentity
 		lics       []pkg.License
 		supporting []file.Location
 	)
 	if groupKey == ociGroupKey {
-		nameOrPath, fallbackName, lics, supporting = enrichSafeTensorsOCI(ctx, resolver, &md)
+		id = resolveSafeTensorsOCIIdentity(resolver, &md)
 	} else {
-		nameOrPath, lics, supporting = enrichSafeTensorsDir(ctx, resolver, groupKey, &md)
+		id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md)
 		fallbackName = safeTensorsDirName(groupKey)
 	}
 	merged.Metadata = md
 	return id
 }
 func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) {
 	var lics []pkg.License
 	supporting := id.supporting
 	switch {
 	case id.readmeLicense != "":
 		lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense)
 	case groupKey == ociGroupKey:
 		if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok {
 			licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
 			if err != nil {
 				log.Debugf("failed to list docker AI license layers: %v", err)
 			}
 			if len(licLocs) > 0 {
 				lics = identifyLicenseLayers(ctx, resolver, licLocs)
 				supporting = append(supporting, licLocs...)
 			}
 		}
 	}
 	if len(lics) > 0 {
 		merged.Licenses = pkg.NewLicenseSet(lics...)
 	}
 	for _, loc := range supporting {
 		merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
 	}
 	return nameOrPath, fallbackName
 }
 // safeTensorsDirName returns the directory-scan naming fallback: the base name
 // of the group's parent directory (the group key is already that directory).
 // For "/models/tiny-llama" this returns "tiny-llama". Degenerate roots that
 // carry no meaningful model name ("/", ".", "") return "", so the group is
 // dropped rather than labeled with a filesystem artifact.
 func safeTensorsDirName(groupKey string) string {
 	base := path.Base(groupKey)
 	switch base {
@ -372,41 +362,32 @@ func safeTensorsDirName(groupKey string) string {
 	return base
 }
-// enrichSafeTensorsDir handles the directory-scan case: look for a config.json
+// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
-// beside the model files (walking up parent directories to the scanned source
+// config.json beside the model files (walking up parent directories to the
-// root if no sibling exists) and a sibling README.md.
+// scanned source root if no sibling exists) and a sibling README.md
-func enrichSafeTensorsDir(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) {
+func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
 	id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
 	if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
 		applyHFConfig(md, cfg)
-		nameOrPath = cfg.NameOrPath
+		id.nameOrPath = cfg.NameOrPath
-		supporting = append(supporting, *loc)
+		id.supporting = append(id.supporting, *loc)
 	}
 	if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
-		if fm.License != "" {
+		id.readmeLicense = fm.License
-			lics = pkg.NewLicensesFromValuesWithContext(ctx, fm.License)
+		if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
 			id.nameOrPath = fm.BaseModel[0]
 		}
-		if nameOrPath == "" && len(fm.BaseModel) > 0 {
+		id.supporting = append(id.supporting, *loc)
 			nameOrPath = fm.BaseModel[0]
 	}
-		supporting = append(supporting, *loc)
+	return id
 	}
 	return nameOrPath, lics, supporting
 }
-// enrichSafeTensorsOCI handles the OCI-artifact case: walk the
+func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
 // vnd.docker.ai.model.file layers (READMEs and HF config.json all ride that
 // media type — we sniff content to tell them apart), then fall back to the
 // vnd.docker.ai.license layer through the shared license scanner. It also
 // pulls the user-supplied image reference off the resolver (when the resolver
 // implements file.OCIArtifactResolver) and returns its last path segment as a
 // naming candidate — repacked artifacts like Docker AI vllm images frequently
 // strip name fields out of every embedded config, so the image ref is often
 // the only remaining identifier for the model.
 func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) (nameOrPath, imageRefName string, lics []pkg.License, supporting []file.Location) {
 	ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
 	if !ok {
-		return "", "", nil, nil
+		return safeTensorsIdentity{}
 	}
 	modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
@ -417,6 +398,7 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
 	// Collect config / readme candidates separately so the layer-iteration order
 	// returned by the resolver doesn't decide the precedence.
 	var configName, readmeName, readmeLicense string
 	var supporting []file.Location
 	for _, loc := range modelFileLocs {
 		if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
 			supporting = append(supporting, loc)
@ -424,37 +406,19 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
 	}
 	// Precedence: config.json _name_or_path > README base_model.
-	if configName != "" {
+	nameOrPath := configName
-		nameOrPath = configName
+	if nameOrPath == "" {
 	} else {
 		nameOrPath = readmeName
 	}
-	// README license takes precedence; fall back to the license layer. For each
+	return safeTensorsIdentity{
-	// license layer we first try a cheap YAML-frontmatter spdx-id read; layers
+		nameOrPath:    nameOrPath,
-	// without frontmatter fall through to the shared license scanner.
+		fallbackName:  ociImageRefBasename(resolver),
-	switch {
+		readmeLicense: readmeLicense,
-	case readmeLicense != "":
+		supporting:    supporting,
 		lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
 	default:
 		licLocs, lErr := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
 		if lErr != nil {
 			log.Debugf("failed to list docker AI license layers: %v", lErr)
 	}
 		if len(licLocs) > 0 {
 			lics = identifyLicenseLayers(ctx, resolver, licLocs)
 			supporting = append(supporting, licLocs...)
 		}
 	}
 	imageRefName = ociImageRefBasename(resolver)
 	return nameOrPath, imageRefName, lics, supporting
 }
 // ociImageRefBasename returns the last path segment of the repository portion
 // of the OCI image reference exposed by the resolver, or "" when the resolver
 // does not implement OCIArtifactResolver or the reference fails to parse. For
 // "docker.io/ai/smollm2-vllm:360M" this returns "smollm2-vllm".
 func ociImageRefBasename(resolver file.Resolver) string {
 	info, ok := resolver.(file.OCIArtifactResolver)
 	if !ok {
@ -473,11 +437,7 @@ func ociImageRefBasename(resolver file.Resolver) string {
 }
 // identifyLicenseLayers turns Docker AI license-layer locations into
-// pkg.License values. It first attempts a cheap, exact SPDX-id read from the
+// pkg.License values.
 // layer's YAML frontmatter (the choosealicense.com shape Docker Model Runner
 // publishes for its AI artifacts); layers without frontmatter fall through to
 // the shared license scanner. Each returned license is tagged with the layer
 // location it came from so the SBOM cites its source.
 func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
 	var out []pkg.License
 	var scanFallback []file.Location
@ -496,9 +456,7 @@ func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []f
 }
 // readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
-// blob and returns the spdx-id declared in its YAML frontmatter, if any. The
+// blob and returns the spdx-id declared in its YAML frontmatter
 // 64 KiB cap is well above any real choosealicense.com frontmatter block while
 // still bounding memory if the layer turns out to be huge.
 func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
 	rc, err := resolver.FileContentsByLocation(loc)
 	if err != nil {
@ -514,10 +472,7 @@ func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location)
 }
 // classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
-// classifies it as README frontmatter or HF config.json based on its leading
+// classifies it as README frontmatter or HF config.json based on its leading bytes.
 // bytes. Side-effects: applies HF config fields onto md, accumulates name and
 // license candidates via the out-params. Returns true when the layer was
 // successfully classified (and should be recorded as supporting evidence).
 func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
 	rc, err := resolver.FileContentsByLocation(loc)
 	if err != nil {
@ -557,10 +512,6 @@ func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pk
 	return false
 }
 // applyHFConfig folds the subset of HF config.json fields we surface in our
 // metadata onto md. Fields already populated on md are left alone — earlier
 // content-derived values (Quantization, TensorCount, etc., from header bytes)
 // always win over producer-declared ones in case of conflict.
 func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
 	if md.Architecture == "" && len(cfg.Architectures) > 0 {
 		md.Architecture = cfg.Architectures[0]
@ -574,20 +525,9 @@ func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
 }
 // pickSafeTensorsName implements the documented naming precedence chain:
-//
+//   - config.json _name_or_path  (path.Base, so "org/Model" → "Model";
 //  1. config.json _name_or_path  (path.Base, so "org/Model" → "Model";
 //     applies to both dir-scan and OCI groups)
-//  2. fallback name — the group's source-specific positional identifier:
+//   - fallback name — the group's source-specific positional identifier
 //     the OCI image-ref repository basename for OCI groups (e.g.
 //     "docker.io/ai/smollm2-vllm:360M" → "smollm2-vllm"), or the parent
 //     directory base name for directory-scan groups (e.g.
 //     "/models/tiny-llama/*.safetensors" → "tiny-llama")
 //
 // Returns "" to signal the merge processor should drop the group. There is
 // intentionally no Architecture-Parameters synthetic and no opaque hash label:
 // when neither a producer-declared name nor a positional fallback is available
 // the model is recorded as absent rather than under a label the SBOM consumer
 // would not recognize.
 func pickSafeTensorsName(nameOrPath, fallbackName string) string {
 	if nameOrPath != "" {
 		return path.Base(nameOrPath)
@ -595,12 +535,6 @@ func pickSafeTensorsName(nameOrPath, fallbackName string) string {
 	return fallbackName
 }
 // --- Enrichment helpers ---------------------------------------------------
 //
 // The parsers decode only the safetensors-specific format; every resolver-backed
 // read (config.json, README, license layers) is centralized here in the merge
 // processor, along with the types those reads decode into.
 // hfConfig is a minimal projection of Hugging Face config.json fields.
 type hfConfig struct {
 	Architectures       []string `json:"architectures"`
@ -615,14 +549,7 @@ type readmeFrontmatter struct {
 	BaseModel []string `yaml:"base_model"`
 }
-// findDirHFConfig looks for a config.json beside the model files, walking up
+// findDirHFConfig looks for a config.json beside the model files
 // parent directories until it reaches the scanned source root. The walk needs
 // no explicit depth bound: the resolver only resolves paths within the scanned
 // source, so an ancestor above the scan root simply yields no config, and
 // path.Dir converges on a fixed point ("/" or ".") that terminates the loop.
 // The first config.json found wins, so the closest one — a sibling, then the
 // nearest ancestor — supplies both the producer-declared name and the HF fields
 // applied to the model.
 func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
 	for {
 		if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
@ -678,9 +605,7 @@ func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location,
 }
 // extractFrontmatterBlock returns the YAML bytes between the first and second
-// "---" delimiters of a file (stripping a leading BOM and any leading
+// "---" delimiters of a file
 // whitespace), or nil when no closed frontmatter block exists. Shared by every
 // YAML-frontmatter parser the cataloger needs.
 func extractFrontmatterBlock(buf []byte) []byte {
 	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
 	if !bytes.HasPrefix(trimmed, []byte("---")) {
@ -698,8 +623,7 @@ func extractFrontmatterBlock(buf []byte) []byte {
 }
 // parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
-// and returns the license and base_model fields. base_model is decoded via
+// and returns the license and base_model fields.
 // yaml.Node so a scalar value ("org/model") doesn't fail the whole block.
 func parseFrontmatter(buf []byte) *readmeFrontmatter {
 	block := extractFrontmatterBlock(buf)
 	if block == nil {
@ -727,17 +651,11 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter {
 	return &fm
 }
 // licenseFrontmatter holds the fields we lift from a choosealicense.com-style
 // YAML frontmatter block at the top of a license file (the LICENSE blobs Docker
 // Model Runner publishes for AI artifacts use this shape).
 type licenseFrontmatter struct {
 	SPDXID string `yaml:"spdx-id"`
 }
-// parseLicenseFrontmatter returns the producer-declared SPDX identifier from a
+// parseLicenseFrontmatter returns the producer-declared SPDX identifier
 // choosealicense.com-style YAML frontmatter block, or "" if the buffer has no
 // frontmatter or no spdx-id field — caller should fall back to a full license
 // scan in that case.
 func parseLicenseFrontmatter(buf []byte) string {
 	block := extractFrontmatterBlock(buf)
 	if block == nil {