pr: first pass refactor

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
2026-07-05 02:28:25 +02:00 · 2026-06-05 02:29:46 -04:00 · 2026-06-05 02:29:46 -04:00 · fe392a490b
commit fe392a490b
parent dd179eb8a7
7 changed files with 692 additions and 612 deletions
--- a/syft/pkg/cataloger/ai/gguf_processor.go
+++ b/syft/pkg/cataloger/ai/gguf_processor.go
@ -0,0 +1,56 @@
+package ai
+
+import (
+	"github.com/anchore/syft/syft/artifact"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+// ggufMergeProcessor collapses the GGUF packages produced by a scan into the
+// named package(s) that represent the AI model. Scanning an OCI image may yield
+// one package per layer; the headers of nameless packages are attached to the
+// single named package through its Parts field. Nameless packages are never
+// part of the returned slice.
+func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
+	// failures and empty input pass straight through, untouched
+	if err != nil || len(pkgs) == 0 {
+		return pkgs, rels, err
+	}
+
+	var (
+		named []pkg.Package
+		parts []pkg.GGUFFileHeader
+	)
+	for _, candidate := range pkgs {
+		if candidate.Name != "" {
+			named = append(named, candidate)
+			continue
+		}
+		header, isGGUF := candidate.Metadata.(pkg.GGUFFileHeader)
+		if !isGGUF {
+			continue
+		}
+		// a nameless header must not carry its own kv hash
+		header.MetadataKeyValuesHash = ""
+		parts = append(parts, header)
+	}
+
+	switch {
+	case len(named) == 0:
+		// without a named package there is nothing to report
+		return nil, rels, err
+	case len(named) == 1 && len(parts) > 0:
+		// Only a lone named package can adopt the nameless headers. With several
+		// named packages we cannot tell which header belongs to which package,
+		// because the order the headers arrive in is not guaranteed to match the
+		// layer order of the original oci image; they are returned unmerged.
+		if header, isGGUF := named[0].Metadata.(pkg.GGUFFileHeader); isGGUF {
+			header.Parts = parts
+			named[0].Metadata = header
+		}
+	}
+
+	return named, rels, err
+}
--- a/syft/pkg/cataloger/ai/huggingface.go
+++ b/syft/pkg/cataloger/ai/huggingface.go
@ -0,0 +1,93 @@
+package ai
+
+import (
+	"bytes"
+
+	"gopkg.in/yaml.v3"
+
+	"github.com/anchore/syft/internal/log"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+// hfConfig is a minimal projection of Hugging Face config.json fields; any
+// other key in the document is ignored when decoding.
+type hfConfig struct {
+	// Architectures mirrors the "architectures" array; applyHFConfig reads its first entry.
+	Architectures []string `json:"architectures"`
+	// NameOrPath mirrors the "_name_or_path" value, used as a model name candidate.
+	NameOrPath    string   `json:"_name_or_path"`
+}
+
+// applyHFConfig fills md.Architecture from the first architecture listed in
+// cfg. An architecture already present on md is left untouched, and a cfg
+// without architectures changes nothing.
+func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
+	if md.Architecture != "" || len(cfg.Architectures) == 0 {
+		return
+	}
+	md.Architecture = cfg.Architectures[0]
+}
+
+// readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
+// parseFrontmatter fills it from an intermediate struct so that base_model may
+// be written either as a single scalar or as a sequence.
+type readmeFrontmatter struct {
+	// License is the frontmatter "license" value, verbatim.
+	License   string   `yaml:"license"`
+	// BaseModel lists the frontmatter "base_model" entries (a scalar becomes a one-element list).
+	BaseModel []string `yaml:"base_model"`
+}
+
+// licenseFrontmatter is the frontmatter projection decoded by
+// parseLicenseFrontmatter: only the "spdx-id" key is read.
+type licenseFrontmatter struct {
+	SPDXID string `yaml:"spdx-id"`
+}
+
+// extractFrontmatterBlock returns the bytes enclosed by the opening "---" line
+// and the next line that starts with "---". A UTF-8 BOM and leading whitespace
+// before the opening delimiter are skipped. It returns nil when buf does not
+// start with a delimiter or when no closing delimiter follows.
+func extractFrontmatterBlock(buf []byte) []byte {
+	opening := []byte("---")
+	body := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
+	if !bytes.HasPrefix(body, opening) {
+		return nil
+	}
+	body = body[len(opening):]
+
+	// drop whatever trails the opening delimiter on its own line
+	if nl := bytes.IndexByte(body, '\n'); nl != -1 {
+		body = body[nl+1:]
+	}
+
+	end := bytes.Index(body, []byte("\n---"))
+	if end < 0 {
+		return nil
+	}
+	return body[:end]
+}
+
+// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
+// and returns the license and base_model fields.
+//
+// It returns nil when buf carries no frontmatter block or when the block is
+// not valid YAML. base_model may be a single scalar or a sequence; any other
+// node kind leaves BaseModel empty.
+func parseFrontmatter(buf []byte) *readmeFrontmatter {
+	block := extractFrontmatterBlock(buf)
+	if block == nil {
+		return nil
+	}
+
+	// base_model is captured as a raw node so that both spellings can be handled
+	var raw struct {
+		License   string    `yaml:"license"`
+		BaseModel yaml.Node `yaml:"base_model"`
+	}
+	if err := yaml.Unmarshal(block, &raw); err != nil {
+		log.Debugf("failed to parse README frontmatter: %v", err)
+		return nil
+	}
+
+	fm := readmeFrontmatter{License: raw.License}
+	switch raw.BaseModel.Kind {
+	case yaml.ScalarNode:
+		if raw.BaseModel.Value != "" {
+			fm.BaseModel = []string{raw.BaseModel.Value}
+		}
+	case yaml.SequenceNode:
+		// best effort: keep whatever was decoded, but do not hide the failure
+		if err := raw.BaseModel.Decode(&fm.BaseModel); err != nil {
+			log.Debugf("failed to decode README frontmatter base_model: %v", err)
+		}
+	}
+	return &fm
+}
+
+// parseLicenseFrontmatter returns the producer-declared SPDX identifier found
+// under the "spdx-id" key of buf's YAML frontmatter. The result is empty when
+// there is no frontmatter block or the block cannot be parsed.
+func parseLicenseFrontmatter(buf []byte) string {
+	yamlBlock := extractFrontmatterBlock(buf)
+	if yamlBlock == nil {
+		return ""
+	}
+
+	declared := licenseFrontmatter{}
+	err := yaml.Unmarshal(yamlBlock, &declared)
+	if err == nil {
+		return declared.SPDXID
+	}
+	log.Debugf("failed to parse license frontmatter: %v", err)
+	return ""
+}
--- a/syft/pkg/cataloger/ai/identity_dir.go
+++ b/syft/pkg/cataloger/ai/identity_dir.go
@ -0,0 +1,93 @@
+package ai
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"path"
+
+	"github.com/anchore/syft/internal"
+	"github.com/anchore/syft/internal/log"
+	"github.com/anchore/syft/syft/file"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+// resolveSafeTensorsDirIdentity covers the directory-scan case. It searches for
+// a config.json starting in dir and moving up through the parent directories,
+// and for a README.md located directly in dir. The returned identity carries
+// the group's name candidates, resolved licenses, and the locations that
+// supported them; md is enriched in place from the config.json.
+func resolveSafeTensorsDirIdentity(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
+	var id safeTensorsIdentity
+	id.fallbackName = safeTensorsDirName(dir)
+
+	cfgLoc, cfg := findDirHFConfig(resolver, dir)
+	if cfg != nil {
+		applyHFConfig(md, cfg)
+		id.nameOrPath = cfg.NameOrPath
+		id.supporting = append(id.supporting, *cfgLoc)
+	}
+
+	readmeLoc, card := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md"))
+	if card == nil {
+		return id
+	}
+	if card.License != "" {
+		id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, card.License)
+	}
+	// the README base_model only names the model when config.json did not
+	if id.nameOrPath == "" && len(card.BaseModel) > 0 {
+		id.nameOrPath = card.BaseModel[0]
+	}
+	id.supporting = append(id.supporting, *readmeLoc)
+	return id
+}
+
+// findDirHFConfig looks for a decodable config.json in dir and, failing that,
+// in each parent directory in turn. The search ends with (nil, nil) once
+// path.Dir stops producing a new parent.
+func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
+	current := dir
+	for {
+		loc, cfg := readDirHFConfig(resolver, path.Join(current, "config.json"))
+		if cfg != nil {
+			return loc, cfg
+		}
+		next := path.Dir(current)
+		if next == current {
+			// reached the source root
+			break
+		}
+		current = next
+	}
+	return nil, nil
+}
+
+// readDirHFConfig resolves path p, reads the first matching location, and
+// decodes it as a Hugging Face config.json. It returns that location together
+// with the decoded config, or (nil, nil) when the file is absent, unreadable,
+// or not decodable.
+func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
+	locations, err := resolver.FilesByPath(p)
+	if err != nil || len(locations) == 0 {
+		return nil, nil
+	}
+	rc, err := resolver.FileContentsByLocation(locations[0])
+	if err != nil {
+		return nil, nil
+	}
+	defer internal.CloseAndLogError(rc, p)
+
+	// Bound the read, as the README and OCI layer readers do: the decoder buffers
+	// a whole JSON value, so an oversized config.json must not be pulled into
+	// memory. 4 MiB matches the limit applied to OCI config layers.
+	var cfg hfConfig
+	if err := json.NewDecoder(io.LimitReader(rc, 4*1024*1024)).Decode(&cfg); err != nil {
+		log.Debugf("failed to decode %s: %v", p, err)
+		return nil, nil
+	}
+	return &locations[0], &cfg
+}
+
+// readDirReadmeFrontmatter resolves path p, reads at most 1 MiB of the first
+// matching location, and parses its YAML frontmatter. It returns that location
+// with the parsed fields, or (nil, nil) when the file is absent, unreadable, or
+// carries no parsable frontmatter.
+func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
+	matches, err := resolver.FilesByPath(p)
+	if err != nil || len(matches) == 0 {
+		return nil, nil
+	}
+	readme := &matches[0]
+
+	reader, err := resolver.FileContentsByLocation(*readme)
+	if err != nil {
+		return nil, nil
+	}
+	defer internal.CloseAndLogError(reader, p)
+
+	const maxReadmeBytes = 1024 * 1024
+	content, err := io.ReadAll(io.LimitReader(reader, maxReadmeBytes))
+	if err != nil {
+		return nil, nil
+	}
+
+	if fm := parseFrontmatter(content); fm != nil {
+		return readme, fm
+	}
+	return nil, nil
+}
--- a/syft/pkg/cataloger/ai/identity_oci.go
+++ b/syft/pkg/cataloger/ai/identity_oci.go
@ -0,0 +1,177 @@
+package ai
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"path"
+
+	gcrname "github.com/google/go-containerregistry/pkg/name"
+
+	"github.com/anchore/syft/internal"
+	"github.com/anchore/syft/internal/log"
+	"github.com/anchore/syft/syft/file"
+	"github.com/anchore/syft/syft/pkg"
+	"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
+)
+
+// resolveSafeTensorsOCIIdentity covers the OCI-artifact case, where the naming
+// and license signals arrive as sibling layers: model.file companions holding
+// config.json / README content, plus dedicated license layers. The returned
+// identity carries the group's name candidates, resolved licenses, and the
+// supporting locations; md is enriched in place from any config.json layer. A
+// resolver that cannot list layers by media type yields an empty identity.
+func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
+	byMediaType, isOCI := resolver.(file.OCIMediaTypeResolver)
+	if !isOCI {
+		return safeTensorsIdentity{}
+	}
+
+	layers, err := byMediaType.FilesByMediaType(dockerAIModelFileMediaType)
+	if err != nil {
+		log.Debugf("failed to list docker AI model-file layers: %v", err)
+	}
+
+	// config and readme candidates are gathered separately so that precedence
+	// never depends on the order in which the resolver returns the layers
+	var (
+		fromConfig, fromReadme, cardLicense string
+		evidence                            []file.Location
+	)
+	for _, layer := range layers {
+		if !classifyOCIModelFileLayer(resolver, layer, md, &fromConfig, &fromReadme, &cardLicense) {
+			continue
+		}
+		evidence = append(evidence, layer)
+	}
+
+	id := safeTensorsIdentity{
+		nameOrPath:   fromConfig,
+		fallbackName: ociImageRefBasename(resolver),
+		supporting:   evidence,
+	}
+	// name precedence: config.json _name_or_path > README base_model
+	if id.nameOrPath == "" {
+		id.nameOrPath = fromReadme
+	}
+
+	// license precedence: a README model-card license beats the dedicated
+	// license layers, mirroring the dir-scan path where the README frontmatter
+	// is the license source
+	if cardLicense != "" {
+		id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, cardLicense)
+		return id
+	}
+
+	licenseLayers, err := byMediaType.FilesByMediaType(dockerAILicenseMediaType)
+	if err != nil {
+		log.Debugf("failed to list docker AI license layers: %v", err)
+	}
+	if len(licenseLayers) > 0 {
+		id.licenses = identifyLicenseLayers(ctx, resolver, licenseLayers)
+		id.supporting = append(id.supporting, licenseLayers...)
+	}
+	return id
+}
+
+// ociImageReferencer is the minimal capability ociImageRefBasename needs: a
+// resolver that can surface the OCI image reference it was built from. It is
+// kept local to this package (rather than exported from the file package) so
+// that the type assertion stays next to its only consumer.
+type ociImageReferencer interface {
+	// ImageReference returns the image reference string; ociImageRefBasename
+	// treats an empty result as "no reference available".
+	ImageReference() string
+}
+
+// ociImageRefBasename returns the last path element of the repository named by
+// the resolver's OCI image reference. The result is empty when the resolver
+// exposes no reference or when the reference cannot be parsed.
+func ociImageRefBasename(resolver file.Resolver) string {
+	// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
+	// (Ideally we should NOT be type asserting on the file resolver directly.)
+	var ref string
+	if referencer, ok := resolver.(ociImageReferencer); ok {
+		ref = referencer.ImageReference()
+	}
+	if ref == "" {
+		return ""
+	}
+
+	parsed, err := gcrname.ParseReference(ref)
+	if err != nil {
+		log.Debugf("failed to parse OCI ref %q: %v", ref, err)
+		return ""
+	}
+	repository := parsed.Context().RepositoryStr()
+	return path.Base(repository)
+}
+
+// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
+// classifies it as README frontmatter or HF config.json based on its leading
+// bytes (after skipping a UTF-8 BOM and leading whitespace).
+//
+// A frontmatter layer fills *license and *readmeName; a config.json layer
+// enriches md and fills *configName. Each output is written only while it is
+// still empty, so the first layer that provides a value wins. The result is
+// true when the layer was recognized and parsed, and false when it could not
+// be read, matched neither shape, or failed to parse.
+func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
+	rc, err := resolver.FileContentsByLocation(loc)
+	if err != nil {
+		return false
+	}
+	defer internal.CloseAndLogError(rc, loc.RealPath)
+
+	buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
+	if err != nil {
+		return false
+	}
+	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
+	switch {
+	case bytes.HasPrefix(trimmed, []byte("---")):
+		// parseFrontmatter skips the BOM / leading whitespace on its own
+		fm := parseFrontmatter(buf)
+		if fm == nil {
+			return false
+		}
+		if *license == "" {
+			*license = fm.License
+		}
+		if *readmeName == "" && len(fm.BaseModel) > 0 {
+			*readmeName = fm.BaseModel[0]
+		}
+		return true
+	case bytes.HasPrefix(trimmed, []byte("{")):
+		// decode the trimmed bytes: encoding/json rejects a leading BOM, so
+		// decoding buf would drop a BOM-prefixed config.json that was just
+		// recognized above
+		var cfg hfConfig
+		if err := json.Unmarshal(trimmed, &cfg); err != nil {
+			return false
+		}
+		applyHFConfig(md, &cfg)
+		if *configName == "" && cfg.NameOrPath != "" {
+			*configName = cfg.NameOrPath
+		}
+		return true
+	}
+	return false
+}
+
+// identifyLicenseLayers turns Docker AI license-layer locations into
+// pkg.License values. A layer that declares an spdx-id in its frontmatter
+// becomes a license built from that identifier; all remaining layers are
+// handed together to licenses.FindAtLocations, whose results are appended last.
+func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
+	var (
+		found     []pkg.License
+		undeclared []file.Location
+	)
+	for idx := range locs {
+		// a fresh copy per iteration, so the pointer handed out below is never shared
+		layer := locs[idx]
+		spdx := readLicenseSPDXIDFromFrontmatter(resolver, layer)
+		if spdx == "" {
+			undeclared = append(undeclared, layer)
+			continue
+		}
+		found = append(found, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &layer))
+	}
+
+	if len(undeclared) == 0 {
+		return found
+	}
+	return append(found, licenses.FindAtLocations(ctx, resolver, undeclared...)...)
+}
+
+// readLicenseSPDXIDFromFrontmatter reads at most 64 KiB of a license-layer
+// blob and returns the spdx-id declared in its YAML frontmatter. The result is
+// empty when the blob cannot be read or declares no identifier.
+func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
+	reader, err := resolver.FileContentsByLocation(loc)
+	if err != nil {
+		return ""
+	}
+	defer internal.CloseAndLogError(reader, loc.RealPath)
+
+	const maxLicensePrefix = 64 * 1024
+	prefix, err := io.ReadAll(io.LimitReader(reader, maxLicensePrefix))
+	if err == nil {
+		return parseLicenseFrontmatter(prefix)
+	}
+	return ""
+}
--- a/syft/pkg/cataloger/ai/merge.go
+++ b/syft/pkg/cataloger/ai/merge.go
@ -0,0 +1,140 @@
+package ai
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+
+	"github.com/cespare/xxhash/v2"
+
+	"github.com/anchore/syft/syft/file"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+// mergeSafeTensorsGroup folds the metadata of every group member into a single
+// unnamed model package whose locations are the union of the members'
+// locations. Aggregate members are applied first, then shard members; tensor
+// and shard counts fall back to values derived from the shards when no
+// aggregate supplied them.
+func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
+	aggregates, shards := bucketSafeTensorsMembers(members)
+
+	info := pkg.SafeTensorsModelInfo{Format: "safetensors"}
+	mergeAggregatesInto(&info, aggregates)
+	tensorSum, shardHashes := mergeShardsInto(&info, shards)
+
+	// a global key order keeps the merged UserMetadata stable in the SBOM
+	sort.Slice(info.UserMetadata, func(a, b int) bool {
+		return info.UserMetadata[a].Key < info.UserMetadata[b].Key
+	})
+
+	if info.TensorCount == 0 {
+		info.TensorCount = tensorSum
+	}
+	switch {
+	case info.ShardCount != 0:
+		// an aggregate already declared the shard count
+	case len(shards) > 0:
+		info.ShardCount = len(shards)
+	default:
+		info.ShardCount = 1
+	}
+	info.MetadataHash = rollupHash(shardHashes)
+
+	// Parts are only worth carrying for multi-shard models; with a single shard
+	// the outer view already exposes every per-shard field.
+	if len(shards) > 1 {
+		parts := make([]pkg.SafeTensorsModelInfo, len(shards))
+		copy(parts, shards)
+		sort.Slice(parts, func(a, b int) bool {
+			return parts[a].MetadataHash < parts[b].MetadataHash
+		})
+		info.Parts = parts
+	}
+
+	return pkg.Package{
+		Locations: unionLocations(members),
+		Type:      pkg.ModelPkg,
+		Metadata:  info,
+	}
+}
+
+// mergeAggregatesInto copies aggregate-level fields into merged. Each field is
+// taken from the first aggregate that provides a non-zero / non-empty value;
+// values already present on merged are never overwritten.
+func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
+	for i := range aggregates {
+		agg := &aggregates[i]
+		if merged.TensorCount == 0 {
+			merged.TensorCount = agg.TensorCount
+		}
+		if merged.ShardCount == 0 {
+			merged.ShardCount = agg.ShardCount
+		}
+		firstNonEmpty(&merged.Parameters, agg.Parameters)
+		firstNonEmpty(&merged.TotalSize, agg.TotalSize)
+		firstNonEmpty(&merged.Quantization, agg.Quantization)
+	}
+}
+
+// mergeShardsInto folds the per-shard header metadata into merged. It fills
+// Quantization and Parameters from the first shard that provides them (unless
+// merged already has a value) and appends each UserMetadata key once, keeping
+// the entry of the first shard that carries it. It returns the sum of the
+// shards' TensorCount and the non-empty per-shard hashes for the rollup.
+func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
+	claimed := make(map[string]struct{})
+	for i := range shards {
+		shard := &shards[i]
+		shardTensorTotal += shard.TensorCount
+		firstNonEmpty(&merged.Quantization, shard.Quantization)
+		firstNonEmpty(&merged.Parameters, shard.Parameters)
+
+		for _, entry := range shard.UserMetadata {
+			if _, dup := claimed[entry.Key]; dup {
+				continue
+			}
+			claimed[entry.Key] = struct{}{}
+			merged.UserMetadata = append(merged.UserMetadata, entry)
+		}
+
+		if shard.MetadataHash == "" {
+			continue
+		}
+		hashes = append(hashes, shard.MetadataHash)
+	}
+	return shardTensorTotal, hashes
+}
+
+// firstNonEmpty stores v in *dst only while *dst is still empty, so the first
+// value offered is the one that sticks.
+func firstNonEmpty(dst *string, v string) {
+	if *dst != "" {
+		return
+	}
+	*dst = v
+}
+
+// unionLocations builds one location set holding every location of every
+// member.
+func unionLocations(members []pkg.Package) file.LocationSet {
+	union := file.NewLocationSet()
+	for i := range members {
+		locs := members[i].Locations.ToSlice()
+		for j := range locs {
+			union.Add(locs[j])
+		}
+	}
+	return union
+}
+
+// bucketSafeTensorsMembers splits the group members by flavor: aggregates have
+// no MetadataHash (Docker AI config blob or sharded index), while shards carry
+// a content-derived MetadataHash from a header parser. Members whose metadata
+// is not SafeTensorsModelInfo are left out of both buckets.
+func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
+	for _, member := range members {
+		info, isSafeTensors := member.Metadata.(pkg.SafeTensorsModelInfo)
+		switch {
+		case !isSafeTensors:
+			// not ours to merge
+		case info.MetadataHash == "":
+			aggregates = append(aggregates, info)
+		default:
+			shards = append(shards, info)
+		}
+	}
+	return aggregates, shards
+}
+
+// rollupHash condenses per-shard hashes into a single value: empty for no
+// hashes, the hash itself for exactly one, and otherwise the 16-hex-digit
+// xxhash of the sorted hashes joined with "|". Sorting a copy makes the result
+// independent of input order and leaves the caller's slice untouched.
+func rollupHash(hashes []string) string {
+	switch len(hashes) {
+	case 0:
+		return ""
+	case 1:
+		return hashes[0]
+	}
+
+	ordered := make([]string, len(hashes))
+	copy(ordered, hashes)
+	sort.Strings(ordered)
+
+	digest := xxhash.Sum64String(strings.Join(ordered, "|"))
+	return fmt.Sprintf("%016x", digest)
+}
--- a/syft/pkg/cataloger/ai/naming.go
+++ b/syft/pkg/cataloger/ai/naming.go
@ -0,0 +1,25 @@
+package ai
+
+import "path"
+
+// pickSafeTensorsName implements the documented naming precedence chain:
+//   - nameOrPath, typically config.json _name_or_path (path.Base, so
+//     "org/Model" → "Model"; applies to both dir-scan and OCI groups)
+//   - fallback name — the group's source-specific positional identifier
+//
+// A nameOrPath whose base is not a usable name (".", ".." or "/", as produced
+// by values such as "./" or "/") is skipped in favor of the fallback, so a
+// path placeholder never becomes the package name.
+func pickSafeTensorsName(nameOrPath, fallbackName string) string {
+	if nameOrPath == "" {
+		return fallbackName
+	}
+	switch base := path.Base(nameOrPath); base {
+	case ".", "..", "/":
+		return fallbackName
+	default:
+		return base
+	}
+}
+
+// safeTensorsDirName is the directory-scan naming fallback: the last element of
+// the group key, which is already the directory holding the model files. Keys
+// that reduce to "/" or "." yield an empty name.
+func safeTensorsDirName(groupKey string) string {
+	if name := path.Base(groupKey); name != "/" && name != "." && name != "" {
+		return name
+	}
+	return ""
+}
--- a/syft/pkg/cataloger/ai/processor.go
+++ b/syft/pkg/cataloger/ai/processor.go
@ -1,159 +1,122 @@
 package ai

 import (
-	"bytes"
 	"context"
-	"encoding/json"
-	"fmt"
-	"io"
 	"path"
 	"sort"
-	"strings"

-	"github.com/cespare/xxhash/v2"
-	gcrname "github.com/google/go-containerregistry/pkg/name"
-	"gopkg.in/yaml.v3"
-
-	"github.com/anchore/syft/internal"
 	"github.com/anchore/syft/internal/log"
 	"github.com/anchore/syft/syft/artifact"
 	"github.com/anchore/syft/syft/file"
 	"github.com/anchore/syft/syft/pkg"
-	"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
 )

-// ociGroupKey is the grouping key for every safetensors package that
-// originated from an OCI model artifact. The ContainerImageModel resolver gives
-// each layer the virtual RealPath "/" regardless of layer media type, so all
-// safetensors packages from a single OCI scan collapse into one group.
-const ociGroupKey = "@oci@"
-
-// ggufMergeProcessor consolidates multiple GGUF packages into a single package
-// representing the AI model. When scanning OCI images with multiple layers,
-// each layer may produce a separate package. This processor finds the package
-// with a name and merges metadata from nameless packages into its GGUFFileParts field.
-// Only packages with a non-empty name are returned in the final result.
-func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
-	if err != nil {
-		return pkgs, rels, err
-	}
-	if len(pkgs) == 0 {
-		return pkgs, rels, err
-	}
-
-	// Separate packages with names from those without
-	var namedPkgs []pkg.Package
-	var namelessHeaders []pkg.GGUFFileHeader
-
-	for _, p := range pkgs {
-		if p.Name != "" {
-			namedPkgs = append(namedPkgs, p)
-		} else {
-			if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok {
-				// We do not want a kv hash for nameless headers
-				header.MetadataKeyValuesHash = ""
-				namelessHeaders = append(namelessHeaders, header)
-			}
-		}
-	}
-
-	// If there are no named packages, return nothing
-	if len(namedPkgs) == 0 {
-		return nil, rels, err
-	}
-
-	// merge nameless headers into a single named package;
-	// if there are multiple named packages, return them without trying to merge headers.
-	// we cannot determine which nameless headers belong to which package
-	// this is because the order we receive the gguf headers in is not guaranteed
-	// to match the layer order in the original oci image
-	if len(namedPkgs) == 1 && len(namelessHeaders) > 0 {
-		winner := &namedPkgs[0]
-		if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok {
-			header.Parts = namelessHeaders
-			winner.Metadata = header
-		}
-	}
-
-	return namedPkgs, rels, err
-}
-
-// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation
-// - groups all nameless packages
-// - merge the per-shard metadata
-// - picks a name (see pickSafeTensorsName)
+// safeTensorsMergeProcessor owns naming, license resolution, and final package
+// assembly. SafeTensors packages reach it nameless from the parsers; it groups
+// them per model, merges the per-shard metadata, resolves a name + licenses, and
+// drops any model it cannot name.
+//
+// There are exactly two sources, each handled by its own path:
+//   - an OCI model artifact, where the source presents every layer at the
+//     virtual path "/" and the whole scan is a single model (mergeOCIModel)
+//   - a filesystem scan, where models are grouped by the directory their files
+//     live in (mergeDirModels)
 func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
 	if err != nil || len(pkgs) == 0 {
 		return pkgs, rels, err
 	}

-	// split off non-safetensors packages
-	// this keeps the processor robust if other types ever flow through
-	var stPkgs, other []pkg.Package
-	for _, p := range pkgs {
-		if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
-			stPkgs = append(stPkgs, p)
-			continue
-		}
-		other = append(other, p)
-	}
+	// keep the processor robust if non-safetensors packages ever flow through
+	stPkgs, other := partitionSafeTensorsPackages(pkgs)
 	if len(stPkgs) == 0 {
 		return pkgs, rels, err
 	}

-	groups := groupSafeTensorsPackages(stPkgs)
-
-	// Deterministic iteration order so the SBOM doesn't depend on map order.
-	keys := make([]string, 0, len(groups))
-	for k := range groups {
-		keys = append(keys, k)
+	if fromOCIArtifact(stPkgs) {
+		return append(other, mergeOCIModel(ctx, resolver, stPkgs)...), rels, nil
 	}
-	sort.Strings(keys)
-
-	out := other
-	for _, key := range keys {
-		merged := mergeSafeTensorsGroup(groups[key])
-
-		// Resolve model identity (name candidates) before enrich
-		id := resolveSafeTensorsIdentity(resolver, key, &merged)
-		name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
-		if name == "" {
-			log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
-				merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
-			continue
-		}
-
-		enrichSafeTensorsGroup(ctx, resolver, key, &merged, id)
-		merged.Name = name
-		merged.SetID()
-		out = append(out, merged)
-	}
-	return out, rels, nil
+	return append(other, mergeDirModels(ctx, resolver, stPkgs)...), rels, nil
 }

-// groupSafeTensorsPackages buckets packages by the parent directory of their
-// primary-evidence location
-func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
-	out := make(map[string][]pkg.Package)
+// partitionSafeTensorsPackages separates safetensors packages from anything else
+// flowing through the processor.
+func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg.Package) {
 	for _, p := range pkgs {
-		key := safeTensorsGroupKey(p)
-		if key == "" {
+		if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
+			safeTensors = append(safeTensors, p)
 			continue
 		}
-		out[key] = append(out[key], p)
+		other = append(other, p)
+	}
+	return safeTensors, other
+}
+
+// fromOCIArtifact reports whether the packages came from an OCI model artifact.
+// That source presents every layer at the virtual path "/", whereas a filesystem
+// scan always carries a real file path. A single scan is one source, so the
+// first package is representative of the rest. An empty slice is reported as
+// not coming from an OCI artifact.
+func fromOCIArtifact(pkgs []pkg.Package) bool {
+	if len(pkgs) == 0 {
+		// nothing to inspect; avoid indexing an empty slice
+		return false
+	}
+	loc := primaryEvidenceLocation(pkgs[0])
+	return loc != nil && loc.RealPath == "/"
+}
+
+// mergeOCIModel handles an OCI artifact as one model: all layers are merged
+// into a single package whose identity is resolved from the artifact's
+// config.json / README layers or its image reference. The result is nil when
+// assembleSafeTensorsPackage declines to produce a package.
+func mergeOCIModel(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
+	model := mergeSafeTensorsGroup(pkgs)
+
+	info := model.Metadata.(pkg.SafeTensorsModelInfo)
+	identity := resolveSafeTensorsOCIIdentity(ctx, resolver, &info)
+	// identity resolution may have enriched the architecture; store it back before assembly
+	model.Metadata = info
+
+	assembled, ok := assembleSafeTensorsPackage(model, identity)
+	if !ok {
+		return nil
+	}
+	return []pkg.Package{assembled}
+}
+
+// mergeDirModels groups filesystem-scanned files by their parent directory and
+// emits one model per directory, named from a sibling config.json/README or the
+// directory itself.
+func mergeDirModels(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
+	groups := groupByParentDir(pkgs)
+
+	// deterministic iteration order so the SBOM doesn't depend on map order
+	dirs := make([]string, 0, len(groups))
+	for dir := range groups {
+		dirs = append(dirs, dir)
+	}
+	sort.Strings(dirs)
+
+	var out []pkg.Package
+	for _, dir := range dirs {
+		merged := mergeSafeTensorsGroup(groups[dir])
+
+		md := merged.Metadata.(pkg.SafeTensorsModelInfo)
+		id := resolveSafeTensorsDirIdentity(ctx, resolver, dir, &md)
+		merged.Metadata = md // write architecture enrichment back before assembly
+
+		if p, ok := assembleSafeTensorsPackage(merged, id); ok {
+			out = append(out, p)
+		}
 	}
 	return out
 }

-func safeTensorsGroupKey(p pkg.Package) string {
-	loc := primaryEvidenceLocation(p)
-	if loc == nil {
-		return ""
+// groupByParentDir buckets filesystem-scanned packages by the directory their
+// primary-evidence file lives in (the shards of one model share a directory).
+func groupByParentDir(pkgs []pkg.Package) map[string][]pkg.Package {
+	out := make(map[string][]pkg.Package)
+	for _, p := range pkgs {
+		loc := primaryEvidenceLocation(p)
+		if loc == nil {
+			continue
+		}
+		dir := path.Dir(loc.RealPath)
+		out[dir] = append(out[dir], p)
 	}
-	if loc.RealPath == "/" {
-		return ociGroupKey
-	}
-	return path.Dir(loc.RealPath)
+	return out
 }

 func primaryEvidenceLocation(p pkg.Package) *file.Location {
@ -169,501 +132,34 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location {
 	return nil
 }

-// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
-func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
-	locSet := unionLocations(members)
-	aggregates, shards := bucketSafeTensorsMembers(members)
-
-	merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
-	mergeAggregatesInto(&merged, aggregates)
-	shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
-
-	// Keep merged UserMetadata globally key-sorted so the SBOM is stable
-	sort.Slice(merged.UserMetadata, func(i, j int) bool {
-		return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
-	})
-
-	if merged.TensorCount == 0 {
-		merged.TensorCount = shardTensorTotal
-	}
-	if merged.ShardCount == 0 {
-		if len(shards) > 0 {
-			merged.ShardCount = len(shards)
-		} else {
-			merged.ShardCount = 1
-		}
-	}
-	merged.MetadataHash = rollupHash(hashes)
-
-	// Parts only carry value for multi-shard models; for a single shard the
-	// outer view already exposes every per-shard field.
-	if len(shards) > 1 {
-		parts := append([]pkg.SafeTensorsModelInfo(nil), shards...)
-		sort.Slice(parts, func(i, j int) bool {
-			return parts[i].MetadataHash < parts[j].MetadataHash
-		})
-		merged.Parts = parts
-	}
-
-	return pkg.Package{
-		Locations: locSet,
-		Type:      pkg.ModelPkg,
-		Metadata:  merged,
-	}
-}
-
-func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
-	for _, a := range aggregates {
-		if merged.TensorCount == 0 {
-			merged.TensorCount = a.TensorCount
-		}
-		if merged.ShardCount == 0 {
-			merged.ShardCount = a.ShardCount
-		}
-		firstNonEmpty(&merged.Parameters, a.Parameters)
-		firstNonEmpty(&merged.TotalSize, a.TotalSize)
-		firstNonEmpty(&merged.Quantization, a.Quantization)
-	}
-}
-
-// mergeShardsInto folds the per-shard header metadata into merged, returning
-// the summed shard TensorCount and the list of non-empty per-shard hashes for
-// the rollup. Shards carry only the content-derived fields (Quantization,
-// Parameters, UserMetadata);
-func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
-	seenKV := map[string]bool{}
-	for _, s := range shards {
-		shardTensorTotal += s.TensorCount
-		firstNonEmpty(&merged.Quantization, s.Quantization)
-		firstNonEmpty(&merged.Parameters, s.Parameters)
-		for _, kv := range s.UserMetadata {
-			if seenKV[kv.Key] {
-				continue
-			}
-			seenKV[kv.Key] = true
-			merged.UserMetadata = append(merged.UserMetadata, kv)
-		}
-		if s.MetadataHash != "" {
-			hashes = append(hashes, s.MetadataHash)
-		}
-	}
-	return shardTensorTotal, hashes
-}
-
-func firstNonEmpty(dst *string, v string) {
-	if *dst == "" {
-		*dst = v
-	}
-}
-
-// unionLocations gathers every location from every member into a single set.
-func unionLocations(members []pkg.Package) file.LocationSet {
-	out := file.NewLocationSet()
-	for _, m := range members {
-		for _, l := range m.Locations.ToSlice() {
-			out.Add(l)
-		}
-	}
-	return out
-}
-
-// bucketSafeTensorsMembers splits group members into aggregate-flavored entries
-// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored
-// entries (carry a content-derived MetadataHash from a header parser).
-func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
-	for _, m := range members {
-		md, ok := m.Metadata.(pkg.SafeTensorsModelInfo)
-		if !ok {
-			continue
-		}
-		if md.MetadataHash != "" {
-			shards = append(shards, md)
-			continue
-		}
-		aggregates = append(aggregates, md)
-	}
-	return aggregates, shards
-}
-
-func rollupHash(hashes []string) string {
-	if len(hashes) == 0 {
-		return ""
-	}
-	if len(hashes) == 1 {
-		return hashes[0]
-	}
-	sorted := append([]string(nil), hashes...)
-	sort.Strings(sorted)
-	return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
-}
-
+// safeTensorsIdentity is the fully-resolved naming/license result for a model.
+// Each source resolver (dir or OCI) populates it so assembly stays source-agnostic.
 type safeTensorsIdentity struct {
-	nameOrPath    string
-	fallbackName  string
-	readmeLicense string
-	supporting    []file.Location
+	nameOrPath   string          // name candidate passed first to pickSafeTensorsName
+	fallbackName string          // name candidate passed second to pickSafeTensorsName
+	licenses     []pkg.License   // attached as the package's license set when non-empty
+	supporting   []file.Location // locations added to the package as supporting evidence
 }

-// resolveSafeTensorsIdentity reads the resolver for the group's naming signals
-// (config.json _name_or_path, README base_model, OCI image ref / dir name)
-func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity {
-	md := merged.Metadata.(pkg.SafeTensorsModelInfo)
-
-	var id safeTensorsIdentity
-	if groupKey == ociGroupKey {
-		id = resolveSafeTensorsOCIIdentity(resolver, &md)
-	} else {
-		id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md)
+// assembleSafeTensorsPackage finalizes a merged model from its resolved identity:
+// it picks the name, attaches licenses and supporting evidence, and sets the ID.
+// If no name can be picked, it logs at debug level and returns (pkg.Package{}, false).
+func assembleSafeTensorsPackage(merged pkg.Package, id safeTensorsIdentity) (pkg.Package, bool) {
+	name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
+	if name == "" {
+		log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
+			merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
+		return pkg.Package{}, false
 	}

-	merged.Metadata = md
-	return id
-}
-
-func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) {
-	var lics []pkg.License
-	supporting := id.supporting
-
-	switch {
-	case id.readmeLicense != "":
-		lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense)
-	case groupKey == ociGroupKey:
-		if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok {
-			licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
-			if err != nil {
-				log.Debugf("failed to list docker AI license layers: %v", err)
-			}
-			if len(licLocs) > 0 {
-				lics = identifyLicenseLayers(ctx, resolver, licLocs)
-				supporting = append(supporting, licLocs...)
-			}
-		}
+	if len(id.licenses) > 0 {
+		merged.Licenses = pkg.NewLicenseSet(id.licenses...)
 	}
-
-	if len(lics) > 0 {
-		merged.Licenses = pkg.NewLicenseSet(lics...)
-	}
-	for _, loc := range supporting {
+	for _, loc := range id.supporting {
 		merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
 	}
-}
-
-// safeTensorsDirName returns the directory-scan naming fallback: the base name
-// of the group's parent directory (the group key is already that directory).
-func safeTensorsDirName(groupKey string) string {
-	base := path.Base(groupKey)
-	switch base {
-	case "/", ".", "":
-		return ""
-	}
-	return base
-}
-
-// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
-// config.json beside the model files (walking up parent directories to the
-// scanned source root if no sibling exists) and a sibling README.md
-func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
-	id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
-
-	if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
-		applyHFConfig(md, cfg)
-		id.nameOrPath = cfg.NameOrPath
-		id.supporting = append(id.supporting, *loc)
-	}
-
-	if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
-		id.readmeLicense = fm.License
-		if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
-			id.nameOrPath = fm.BaseModel[0]
-		}
-		id.supporting = append(id.supporting, *loc)
-	}
-	return id
-}
-
-func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
-	ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
-	if !ok {
-		return safeTensorsIdentity{}
-	}
-
-	modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
-	if err != nil {
-		log.Debugf("failed to list docker AI model-file layers: %v", err)
-	}
-
-	// Collect config / readme candidates separately so the layer-iteration order
-	// returned by the resolver doesn't decide the precedence.
-	var configName, readmeName, readmeLicense string
-	var supporting []file.Location
-	for _, loc := range modelFileLocs {
-		if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
-			supporting = append(supporting, loc)
-		}
-	}
-
-	// Precedence: config.json _name_or_path > README base_model.
-	nameOrPath := configName
-	if nameOrPath == "" {
-		nameOrPath = readmeName
-	}
-
-	return safeTensorsIdentity{
-		nameOrPath:    nameOrPath,
-		fallbackName:  ociImageRefBasename(resolver),
-		readmeLicense: readmeLicense,
-		supporting:    supporting,
-	}
-}
-
-// ociImageReferencer is the minimal capability ociImageRefBasename needs: a
-// resolver that can surface the OCI image reference it was built from. It is
-// kept local to this package (rather than exported from the file package) so the
-// assertion stays with its only consumer.
-type ociImageReferencer interface {
-	ImageReference() string
-}
-
-func ociImageRefBasename(resolver file.Resolver) string {
-	// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
-	// (Ideally we should be NOT be type asserting on the file resolver directly).
-	info, ok := resolver.(ociImageReferencer)
-	if !ok {
-		return ""
-	}
-	ref := info.ImageReference()
-	if ref == "" {
-		return ""
-	}
-	parsed, err := gcrname.ParseReference(ref)
-	if err != nil {
-		log.Debugf("failed to parse OCI ref %q: %v", ref, err)
-		return ""
-	}
-	return path.Base(parsed.Context().RepositoryStr())
-}
-
-// identifyLicenseLayers turns Docker AI license-layer locations into
-// pkg.License values.
-func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
-	var out []pkg.License
-	var scanFallback []file.Location
-	for i := range locs {
-		loc := locs[i]
-		if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" {
-			out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
-			continue
-		}
-		scanFallback = append(scanFallback, loc)
-	}
-	if len(scanFallback) > 0 {
-		out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...)
-	}
-	return out
-}
-
-// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
-// blob and returns the spdx-id declared in its YAML frontmatter
-func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
-	rc, err := resolver.FileContentsByLocation(loc)
-	if err != nil {
-		return ""
-	}
-	defer internal.CloseAndLogError(rc, loc.RealPath)
-
-	buf, err := io.ReadAll(io.LimitReader(rc, 64*1024))
-	if err != nil {
-		return ""
-	}
-	return parseLicenseFrontmatter(buf)
-}
-
-// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
-// classifies it as README frontmatter or HF config.json based on its leading bytes.
-func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
-	rc, err := resolver.FileContentsByLocation(loc)
-	if err != nil {
-		return false
-	}
-	defer internal.CloseAndLogError(rc, loc.RealPath)
-
-	buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
-	if err != nil {
-		return false
-	}
-	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
-	switch {
-	case bytes.HasPrefix(trimmed, []byte("---")):
-		fm := parseFrontmatter(buf)
-		if fm == nil {
-			return false
-		}
-		if *license == "" {
-			*license = fm.License
-		}
-		if *readmeName == "" && len(fm.BaseModel) > 0 {
-			*readmeName = fm.BaseModel[0]
-		}
-		return true
-	case bytes.HasPrefix(trimmed, []byte("{")):
-		var cfg hfConfig
-		if err := json.Unmarshal(buf, &cfg); err != nil {
-			return false
-		}
-		applyHFConfig(md, &cfg)
-		if *configName == "" && cfg.NameOrPath != "" {
-			*configName = cfg.NameOrPath
-		}
-		return true
-	}
-	return false
-}
-
-func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
-	if md.Architecture == "" && len(cfg.Architectures) > 0 {
-		md.Architecture = cfg.Architectures[0]
-	}
-}
-
-// pickSafeTensorsName implements the documented naming precedence chain:
-//   - config.json _name_or_path  (path.Base, so "org/Model" → "Model";
-//     applies to both dir-scan and OCI groups)
-//   - fallback name — the group's source-specific positional identifier
-func pickSafeTensorsName(nameOrPath, fallbackName string) string {
-	if nameOrPath != "" {
-		return path.Base(nameOrPath)
-	}
-	return fallbackName
-}
-
-// hfConfig is a minimal projection of Hugging Face config.json fields.
-type hfConfig struct {
-	Architectures []string `json:"architectures"`
-	NameOrPath    string   `json:"_name_or_path"`
-}
-
-// readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
-type readmeFrontmatter struct {
-	License   string   `yaml:"license"`
-	BaseModel []string `yaml:"base_model"`
-}
-
-// findDirHFConfig looks for a config.json beside the model files
-func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
-	for {
-		if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
-			return loc, cfg
-		}
-		parent := path.Dir(dir)
-		if parent == dir {
-			return nil, nil // reached the source root
-		}
-		dir = parent
-	}
-}
-
-func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
-	locations, err := resolver.FilesByPath(p)
-	if err != nil || len(locations) == 0 {
-		return nil, nil
-	}
-	rc, err := resolver.FileContentsByLocation(locations[0])
-	if err != nil {
-		return nil, nil
-	}
-	defer internal.CloseAndLogError(rc, p)
-
-	var cfg hfConfig
-	if err := json.NewDecoder(rc).Decode(&cfg); err != nil {
-		log.Debugf("failed to decode %s: %v", p, err)
-		return nil, nil
-	}
-	return &locations[0], &cfg
-}
-
-func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
-	locations, err := resolver.FilesByPath(p)
-	if err != nil || len(locations) == 0 {
-		return nil, nil
-	}
-	rc, err := resolver.FileContentsByLocation(locations[0])
-	if err != nil {
-		return nil, nil
-	}
-	defer internal.CloseAndLogError(rc, p)
-
-	buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024))
-	if err != nil {
-		return nil, nil
-	}
-	fm := parseFrontmatter(buf)
-	if fm == nil {
-		return nil, nil
-	}
-	return &locations[0], fm
-}
-
-// extractFrontmatterBlock returns the YAML bytes between the first and second
-// "---" delimiters of a file
-func extractFrontmatterBlock(buf []byte) []byte {
-	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
-	if !bytes.HasPrefix(trimmed, []byte("---")) {
-		return nil
-	}
-	rest := trimmed[3:]
-	if i := bytes.IndexByte(rest, '\n'); i >= 0 {
-		rest = rest[i+1:]
-	}
-	block, _, found := bytes.Cut(rest, []byte("\n---"))
-	if !found {
-		return nil
-	}
-	return block
-}
-
-// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
-// and returns the license and base_model fields.
-func parseFrontmatter(buf []byte) *readmeFrontmatter {
-	block := extractFrontmatterBlock(buf)
-	if block == nil {
-		return nil
-	}
-
-	var raw struct {
-		License   string    `yaml:"license"`
-		BaseModel yaml.Node `yaml:"base_model"`
-	}
-	if err := yaml.Unmarshal(block, &raw); err != nil {
-		log.Debugf("failed to parse README frontmatter: %v", err)
-		return nil
-	}
-
-	fm := readmeFrontmatter{License: raw.License}
-	switch raw.BaseModel.Kind {
-	case yaml.ScalarNode:
-		if raw.BaseModel.Value != "" {
-			fm.BaseModel = []string{raw.BaseModel.Value}
-		}
-	case yaml.SequenceNode:
-		_ = raw.BaseModel.Decode(&fm.BaseModel)
-	}
-	return &fm
-}
-
-type licenseFrontmatter struct {
-	SPDXID string `yaml:"spdx-id"`
-}
-
-// parseLicenseFrontmatter returns the producer-declared SPDX identifier
-func parseLicenseFrontmatter(buf []byte) string {
-	block := extractFrontmatterBlock(buf)
-	if block == nil {
-		return ""
-	}
-	var fm licenseFrontmatter
-	if err := yaml.Unmarshal(block, &fm); err != nil {
-		log.Debugf("failed to parse license frontmatter: %v", err)
-		return ""
-	}
-	return fm.SPDXID
+
+	merged.Name = name
+	merged.SetID()
+	return merged, true
 }