pr: first pass refactor

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2026-06-05 02:29:46 -04:00
parent dd179eb8a7
commit fe392a490b
No known key found for this signature in database
7 changed files with 692 additions and 612 deletions

View File

@ -0,0 +1,56 @@
package ai
import (
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/pkg"
)
// ggufMergeProcessor collapses the GGUF packages produced by one scan down to
// the package(s) that identify a model. Scanning an OCI image with several
// layers can yield one package per layer; only packages carrying a name are
// returned. When exactly one named package exists, the headers of the nameless
// packages are attached to its metadata as Parts (with their key/value hash
// cleared).
//
// An incoming error or an empty package list is passed through untouched. If no
// package has a name, no packages are returned.
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
	if err != nil || len(pkgs) == 0 {
		return pkgs, rels, err
	}

	var (
		named []pkg.Package
		parts []pkg.GGUFFileHeader
	)
	for _, candidate := range pkgs {
		if candidate.Name != "" {
			named = append(named, candidate)
			continue
		}
		part, isGGUF := candidate.Metadata.(pkg.GGUFFileHeader)
		if !isGGUF {
			continue
		}
		// nameless headers are recorded as parts only, so they carry no kv hash
		part.MetadataKeyValuesHash = ""
		parts = append(parts, part)
	}

	switch {
	case len(named) == 0:
		// nothing identifies a model: return no packages
		return nil, rels, err
	case len(named) > 1 || len(parts) == 0:
		// With several named packages we cannot tell which nameless headers
		// belong to which package: the order the headers arrive in is not
		// guaranteed to match the layer order of the original OCI image.
		return named, rels, err
	}

	if root, isGGUF := named[0].Metadata.(pkg.GGUFFileHeader); isGGUF {
		root.Parts = parts
		named[0].Metadata = root
	}
	return named, rels, err
}

View File

@ -0,0 +1,93 @@
package ai
import (
"bytes"
"gopkg.in/yaml.v3"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/pkg"
)
// hfConfig is a minimal projection of Hugging Face config.json fields: only
// the keys declared here are decoded.
type hfConfig struct {
// Architectures is the "architectures" list; applyHFConfig uses its first entry.
Architectures []string `json:"architectures"`
// NameOrPath is the "_name_or_path" value, used by callers as a naming candidate.
NameOrPath string `json:"_name_or_path"`
}
// applyHFConfig fills md.Architecture from the first entry of cfg.Architectures.
// An architecture already present on md is kept, and a config without any
// architectures leaves md untouched.
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
	if md.Architecture != "" || len(cfg.Architectures) == 0 {
		return
	}
	md.Architecture = cfg.Architectures[0]
}
// readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
type readmeFrontmatter struct {
// License is the frontmatter "license" value.
License string `yaml:"license"`
// BaseModel lists the frontmatter "base_model" entries; parseFrontmatter
// normalizes a single scalar value into a one-element slice.
BaseModel []string `yaml:"base_model"`
}
// licenseFrontmatter is the YAML frontmatter projection used by
// parseLicenseFrontmatter; only the "spdx-id" key is read.
type licenseFrontmatter struct {
SPDXID string `yaml:"spdx-id"`
}
// extractFrontmatterBlock returns the YAML bytes between the opening and the
// closing "---" delimiter lines of a file.
//
// A leading UTF-8 BOM and leading whitespace are skipped. A delimiter is a line
// consisting of exactly "---" (trailing spaces, tabs and a carriage return are
// tolerated), so lines such as "-----" or "---title" are not mistaken for one.
// The returned block excludes both delimiter lines and the newline preceding
// the closing delimiter.
//
// It returns nil when the input does not start with a delimiter line or no
// closing delimiter exists, and an empty non-nil slice when the frontmatter is
// present but empty ("---\n---").
func extractFrontmatterBlock(buf []byte) []byte {
	isDelimiter := func(line []byte) bool {
		return string(bytes.TrimRight(line, " \t\r")) == "---"
	}

	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	opening, rest, found := bytes.Cut(trimmed, []byte("\n"))
	if !found || !isDelimiter(opening) {
		return nil
	}

	// walk the remaining content line by line looking for the closing delimiter
	for start := 0; start < len(rest); {
		line := rest[start:]
		next := len(rest)
		if i := bytes.IndexByte(line, '\n'); i >= 0 {
			line = line[:i]
			next = start + i + 1
		}
		if isDelimiter(line) {
			if start == 0 {
				// frontmatter present but empty
				return []byte{}
			}
			// rest[start-1] is the newline terminating the last block line
			return rest[:start-1]
		}
		start = next
	}
	return nil
}
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
// and returns the license and base_model fields.
//
// base_model may be written either as a single scalar or as a sequence; both
// are normalized to a slice. A null scalar ("base_model:", "null", "~") is
// treated as absent rather than as a model literally named "null".
//
// It returns nil when buf has no frontmatter block or the block is not valid
// YAML for the expected shape.
func parseFrontmatter(buf []byte) *readmeFrontmatter {
	block := extractFrontmatterBlock(buf)
	if block == nil {
		return nil
	}
	var raw struct {
		License   string    `yaml:"license"`
		BaseModel yaml.Node `yaml:"base_model"`
	}
	if err := yaml.Unmarshal(block, &raw); err != nil {
		log.Debugf("failed to parse README frontmatter: %v", err)
		return nil
	}
	fm := readmeFrontmatter{License: raw.License}
	switch raw.BaseModel.Kind {
	case yaml.ScalarNode:
		// a raw node keeps the source text of a null ("null", "~"), so filter on the tag
		if raw.BaseModel.Value != "" && raw.BaseModel.ShortTag() != "!!null" {
			fm.BaseModel = []string{raw.BaseModel.Value}
		}
	case yaml.SequenceNode:
		// best-effort: keep whatever entries decoded, but do not hide the failure
		if err := raw.BaseModel.Decode(&fm.BaseModel); err != nil {
			log.Debugf("failed to decode README frontmatter base_model: %v", err)
		}
	}
	return &fm
}
// parseLicenseFrontmatter returns the "spdx-id" value declared in the YAML
// frontmatter of buf. It yields the empty string when there is no frontmatter
// block or the block cannot be decoded.
func parseLicenseFrontmatter(buf []byte) string {
	block := extractFrontmatterBlock(buf)
	if block == nil {
		return ""
	}

	header := licenseFrontmatter{}
	err := yaml.Unmarshal(block, &header)
	if err == nil {
		return header.SPDXID
	}
	log.Debugf("failed to parse license frontmatter: %v", err)
	return ""
}

View File

@ -0,0 +1,93 @@
package ai
import (
"context"
"encoding/json"
"io"
"path"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
)
// resolveSafeTensorsDirIdentity resolves naming and license signals for a
// directory-scanned model. It consults the config.json found by
// findDirHFConfig (sibling first, then parent directories) and the README.md
// that sits in dir itself.
//
// The config's _name_or_path takes precedence as the name candidate; the first
// README base_model entry is used only when the config supplied none. A README
// license, when present, becomes the identity's licenses. Every file that
// contributed is recorded as supporting evidence, and md may receive an
// architecture from the config.
func resolveSafeTensorsDirIdentity(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
	identity := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}

	cfgLoc, cfg := findDirHFConfig(resolver, dir)
	if cfg != nil {
		applyHFConfig(md, cfg)
		identity.nameOrPath = cfg.NameOrPath
		identity.supporting = append(identity.supporting, *cfgLoc)
	}

	readmeLoc, card := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md"))
	if card == nil {
		return identity
	}
	if card.License != "" {
		identity.licenses = pkg.NewLicensesFromValuesWithContext(ctx, card.License)
	}
	if identity.nameOrPath == "" && len(card.BaseModel) > 0 {
		identity.nameOrPath = card.BaseModel[0]
	}
	identity.supporting = append(identity.supporting, *readmeLoc)
	return identity
}
// findDirHFConfig looks for a readable config.json in dir and, failing that,
// in each successive parent directory. The search stops at the first config
// that decodes, or when path.Dir no longer yields a new directory (the root),
// in which case it returns nil, nil.
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
	for current := dir; ; {
		loc, cfg := readDirHFConfig(resolver, path.Join(current, "config.json"))
		if cfg != nil {
			return loc, cfg
		}
		up := path.Dir(current)
		if up == current {
			// reached the source root
			return nil, nil
		}
		current = up
	}
}
// readDirHFConfig reads and decodes the Hugging Face config.json at path p.
// It returns the location of the file together with the decoded config, or
// nil, nil when the path does not resolve, cannot be opened, or does not
// decode as JSON.
//
// At most 4 MiB of the file is handed to the decoder: the JSON decoder buffers
// a whole value before unmarshalling, so an oversized or hostile file would
// otherwise be held in memory in full. A file larger than the cap fails to
// decode and is treated as absent.
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
	locations, err := resolver.FilesByPath(p)
	if err != nil || len(locations) == 0 {
		return nil, nil
	}
	rc, err := resolver.FileContentsByLocation(locations[0])
	if err != nil {
		return nil, nil
	}
	defer internal.CloseAndLogError(rc, p)

	const maxConfigBytes = 4 * 1024 * 1024
	var cfg hfConfig
	if err := json.NewDecoder(io.LimitReader(rc, maxConfigBytes)).Decode(&cfg); err != nil {
		log.Debugf("failed to decode %s: %v", p, err)
		return nil, nil
	}
	return &locations[0], &cfg
}
// readDirReadmeFrontmatter reads up to 1 MiB of the file at path p and parses
// its YAML frontmatter. It returns the file's location and the parsed
// frontmatter, or nil, nil when the path does not resolve, cannot be read, or
// carries no parseable frontmatter.
func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
	matches, err := resolver.FilesByPath(p)
	if err != nil || len(matches) == 0 {
		return nil, nil
	}
	readme := &matches[0]

	reader, err := resolver.FileContentsByLocation(*readme)
	if err != nil {
		return nil, nil
	}
	defer internal.CloseAndLogError(reader, p)

	content, err := io.ReadAll(io.LimitReader(reader, 1<<20))
	if err != nil {
		return nil, nil
	}
	if card := parseFrontmatter(content); card != nil {
		return readme, card
	}
	return nil, nil
}

View File

@ -0,0 +1,177 @@
package ai
import (
"bytes"
"context"
"encoding/json"
"io"
"path"
gcrname "github.com/google/go-containerregistry/pkg/name"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
)
// resolveSafeTensorsOCIIdentity resolves naming and license signals for a model
// scanned from an OCI artifact, where those signals arrive as sibling layers:
// model-file companions (config.json / README) and dedicated license layers.
//
// Name precedence is config.json _name_or_path, then README base_model; the
// image reference basename is the fallback. License precedence is the README
// model-card license, then the dedicated license layers. Layers that
// contributed are recorded as supporting evidence. A resolver that cannot list
// files by media type yields an empty identity.
func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
	mediaTypes, isOCI := resolver.(file.OCIMediaTypeResolver)
	if !isOCI {
		return safeTensorsIdentity{}
	}

	companions, err := mediaTypes.FilesByMediaType(dockerAIModelFileMediaType)
	if err != nil {
		log.Debugf("failed to list docker AI model-file layers: %v", err)
	}

	// Gather config and README candidates in separate slots so that the order
	// in which the resolver hands back layers never decides the precedence.
	var (
		fromConfig, fromReadme, cardLicense string
		evidence                            []file.Location
	)
	for _, layer := range companions {
		if !classifyOCIModelFileLayer(resolver, layer, md, &fromConfig, &fromReadme, &cardLicense) {
			continue
		}
		evidence = append(evidence, layer)
	}

	id := safeTensorsIdentity{
		nameOrPath:   fromConfig,
		fallbackName: ociImageRefBasename(resolver),
		supporting:   evidence,
	}
	if id.nameOrPath == "" {
		id.nameOrPath = fromReadme
	}

	// A README model-card license wins over dedicated license layers, mirroring
	// the dir-scan path where README frontmatter is the license source.
	if cardLicense != "" {
		id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, cardLicense)
		return id
	}

	licenseLayers, err := mediaTypes.FilesByMediaType(dockerAILicenseMediaType)
	if err != nil {
		log.Debugf("failed to list docker AI license layers: %v", err)
	}
	if len(licenseLayers) > 0 {
		id.licenses = identifyLicenseLayers(ctx, resolver, licenseLayers)
		id.supporting = append(id.supporting, licenseLayers...)
	}
	return id
}
// ociImageReferencer is the minimal capability ociImageRefBasename needs: a
// resolver that can surface the OCI image reference it was built from. It is
// kept local to this package (rather than exported from the file package) so the
// assertion stays with its only consumer.
type ociImageReferencer interface {
// ImageReference returns the image reference string; ociImageRefBasename
// treats an empty result as "no reference available".
ImageReference() string
}
// ociImageRefBasename returns the last path element of the repository named by
// the resolver's OCI image reference. It returns the empty string when the
// resolver does not expose a reference, the reference is empty, or it cannot
// be parsed.
func ociImageRefBasename(resolver file.Resolver) string {
	// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
	// (Ideally we should NOT be type asserting on the file resolver directly.)
	referencer, ok := resolver.(ociImageReferencer)
	if !ok {
		return ""
	}

	raw := referencer.ImageReference()
	if raw == "" {
		return ""
	}

	ref, err := gcrname.ParseReference(raw)
	if err != nil {
		log.Debugf("failed to parse OCI ref %q: %v", raw, err)
		return ""
	}

	repository := ref.Context().RepositoryStr()
	return path.Base(repository)
}
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
// classifies it as README frontmatter or HF config.json based on its leading
// bytes (after skipping a UTF-8 BOM and leading whitespace).
//
//   - Content starting with "---" is parsed as README frontmatter; its license
//     and first base_model entry are stored in *license / *readmeName when
//     those are still empty.
//   - Content starting with "{" is decoded as an HF config; its architecture is
//     applied to md and its _name_or_path stored in *configName when that is
//     still empty.
//
// It reports true when the layer was recognized and parsed, false when it
// could not be read, matched neither form, or failed to parse.
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
	rc, err := resolver.FileContentsByLocation(loc)
	if err != nil {
		return false
	}
	defer internal.CloseAndLogError(rc, loc.RealPath)

	buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
	if err != nil {
		return false
	}

	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	switch {
	case bytes.HasPrefix(trimmed, []byte("---")):
		fm := parseFrontmatter(buf)
		if fm == nil {
			return false
		}
		if *license == "" {
			*license = fm.License
		}
		if *readmeName == "" && len(fm.BaseModel) > 0 {
			*readmeName = fm.BaseModel[0]
		}
		return true
	case bytes.HasPrefix(trimmed, []byte("{")):
		var cfg hfConfig
		// decode the trimmed bytes: encoding/json rejects a leading UTF-8 BOM,
		// so decoding buf would fail for the very input detected above
		if err := json.Unmarshal(trimmed, &cfg); err != nil {
			return false
		}
		applyHFConfig(md, &cfg)
		if *configName == "" && cfg.NameOrPath != "" {
			*configName = cfg.NameOrPath
		}
		return true
	}
	return false
}
// identifyLicenseLayers turns Docker AI license-layer locations into
// pkg.License values. A layer that declares an spdx-id in its frontmatter
// yields a license built from that identifier; the remaining layers are handed
// to licenses.FindAtLocations in one batch, and those results are appended
// after the declared ones.
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
	var (
		found      []pkg.License
		unresolved []file.Location
	)
	for _, candidate := range locs {
		// per-iteration copy: its address is handed to the license constructor
		layer := candidate
		spdxID := readLicenseSPDXIDFromFrontmatter(resolver, layer)
		if spdxID == "" {
			unresolved = append(unresolved, layer)
			continue
		}
		found = append(found, pkg.NewLicenseFromFieldsWithContext(ctx, spdxID, "", &layer))
	}
	if len(unresolved) == 0 {
		return found
	}
	return append(found, licenses.FindAtLocations(ctx, resolver, unresolved...)...)
}
// readLicenseSPDXIDFromFrontmatter reads at most 64 KiB of a license-layer blob
// and returns the spdx-id declared in its YAML frontmatter. It returns the
// empty string when the blob cannot be read or declares no spdx-id.
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
	reader, err := resolver.FileContentsByLocation(loc)
	if err != nil {
		return ""
	}
	defer internal.CloseAndLogError(reader, loc.RealPath)

	prefix, err := io.ReadAll(io.LimitReader(reader, 64<<10))
	if err != nil {
		return ""
	}
	return parseLicenseFrontmatter(prefix)
}

View File

@ -0,0 +1,140 @@
package ai
import (
"fmt"
"sort"
"strings"
"github.com/cespare/xxhash/v2"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
)
// mergeSafeTensorsGroup folds the metadata of every member of a group into one
// model package. Aggregate entries are applied first, shard entries second;
// TensorCount and ShardCount fall back to shard-derived values only when no
// aggregate supplied them. The resulting package carries the union of all
// member locations and has no name yet.
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
	aggregates, shards := bucketSafeTensorsMembers(members)

	info := pkg.SafeTensorsModelInfo{Format: "safetensors"}
	mergeAggregatesInto(&info, aggregates)
	tensorSum, shardHashes := mergeShardsInto(&info, shards)

	// a globally key-sorted UserMetadata keeps the SBOM stable
	sort.Slice(info.UserMetadata, func(a, b int) bool {
		return info.UserMetadata[a].Key < info.UserMetadata[b].Key
	})

	if info.TensorCount == 0 {
		info.TensorCount = tensorSum
	}
	if info.ShardCount == 0 {
		info.ShardCount = 1
		if n := len(shards); n > 0 {
			info.ShardCount = n
		}
	}
	info.MetadataHash = rollupHash(shardHashes)

	// Parts are only worth emitting for multi-shard models: with a single shard
	// the outer view already exposes every per-shard field.
	if len(shards) > 1 {
		parts := make([]pkg.SafeTensorsModelInfo, len(shards))
		copy(parts, shards)
		sort.Slice(parts, func(a, b int) bool {
			return parts[a].MetadataHash < parts[b].MetadataHash
		})
		info.Parts = parts
	}

	return pkg.Package{
		Locations: unionLocations(members),
		Type:      pkg.ModelPkg,
		Metadata:  info,
	}
}
// mergeAggregatesInto copies aggregate-level fields into merged on a
// first-value-wins basis: a field already set on merged (non-zero count,
// non-empty string) is never overwritten by a later aggregate.
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
	for i := range aggregates {
		agg := &aggregates[i]
		if merged.TensorCount == 0 {
			merged.TensorCount = agg.TensorCount
		}
		if merged.ShardCount == 0 {
			merged.ShardCount = agg.ShardCount
		}
		firstNonEmpty(&merged.Parameters, agg.Parameters)
		firstNonEmpty(&merged.TotalSize, agg.TotalSize)
		firstNonEmpty(&merged.Quantization, agg.Quantization)
	}
}
// mergeShardsInto folds per-shard header metadata into merged. Quantization and
// Parameters are taken from the first shard that has them (unless merged
// already does); UserMetadata entries are appended once per key, the first
// occurrence winning. It returns the sum of the shards' TensorCount and the
// non-empty per-shard hashes, in shard order, for the rollup.
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
	seenKeys := make(map[string]struct{})
	for i := range shards {
		shard := &shards[i]
		shardTensorTotal += shard.TensorCount
		firstNonEmpty(&merged.Quantization, shard.Quantization)
		firstNonEmpty(&merged.Parameters, shard.Parameters)

		for _, entry := range shard.UserMetadata {
			if _, dup := seenKeys[entry.Key]; dup {
				continue
			}
			seenKeys[entry.Key] = struct{}{}
			merged.UserMetadata = append(merged.UserMetadata, entry)
		}

		if shard.MetadataHash == "" {
			continue
		}
		hashes = append(hashes, shard.MetadataHash)
	}
	return shardTensorTotal, hashes
}
// firstNonEmpty stores v in *dst only when *dst is still the empty string, so
// the first non-empty value offered to a destination is the one that sticks.
func firstNonEmpty(dst *string, v string) {
	if *dst != "" {
		return
	}
	*dst = v
}
// unionLocations builds one location set holding every location of every
// member package.
func unionLocations(members []pkg.Package) file.LocationSet {
	union := file.NewLocationSet()
	for i := range members {
		locations := members[i].Locations.ToSlice()
		for j := range locations {
			union.Add(locations[j])
		}
	}
	return union
}
// bucketSafeTensorsMembers partitions the group's SafeTensors metadata by
// whether it carries a MetadataHash: entries without one are aggregates
// (Docker AI config blob or sharded index), entries with one are shards (their
// hash is content-derived, produced by a header parser). Members whose
// metadata is not SafeTensorsModelInfo are ignored.
func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
	for _, member := range members {
		info, isSafeTensors := member.Metadata.(pkg.SafeTensorsModelInfo)
		switch {
		case !isSafeTensors:
			// not ours to merge
		case info.MetadataHash == "":
			aggregates = append(aggregates, info)
		default:
			shards = append(shards, info)
		}
	}
	return aggregates, shards
}
// rollupHash condenses per-shard hashes into a single value: no hashes give the
// empty string, a single hash is returned as-is, and several hashes are sorted,
// joined with "|" and hashed with xxhash into 16 lowercase hex digits. Sorting
// a copy makes the result independent of input order and leaves the caller's
// slice untouched.
func rollupHash(hashes []string) string {
	switch len(hashes) {
	case 0:
		return ""
	case 1:
		return hashes[0]
	}

	ordered := make([]string, len(hashes))
	copy(ordered, hashes)
	sort.Strings(ordered)

	digest := xxhash.Sum64String(strings.Join(ordered, "|"))
	return fmt.Sprintf("%016x", digest)
}

View File

@ -0,0 +1,25 @@
package ai
import "path"
// pickSafeTensorsName implements the documented naming precedence chain:
//   - nameOrPath, reduced to its last path element (so "org/Model" → "Model");
//     applies to both dir-scan and OCI groups
//   - fallbackName — the group's source-specific positional identifier
//
// nameOrPath frequently holds a local checkpoint path rather than a model id.
// When its last element carries no name at all ("./", ".", "/", ".."), it is
// ignored and fallbackName is used instead of emitting a package called "."
// or "/".
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath == "" {
		return fallbackName
	}
	switch base := path.Base(nameOrPath); base {
	case "/", ".", "..":
		// path-only value: it names a location, not a model
		return fallbackName
	default:
		return base
	}
}
// safeTensorsDirName returns the directory-scan naming fallback: the last
// element of groupKey, which is already the directory holding the group's
// files. A key whose last element is "/", "." or empty has no usable name and
// yields the empty string.
func safeTensorsDirName(groupKey string) string {
	name := path.Base(groupKey)
	if name == "/" || name == "." || name == "" {
		return ""
	}
	return name
}

View File

@ -1,159 +1,122 @@
package ai
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"path"
"sort"
"strings"
"github.com/cespare/xxhash/v2"
gcrname "github.com/google/go-containerregistry/pkg/name"
"gopkg.in/yaml.v3"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
)
// ociGroupKey is the grouping key for every safetensors package that
// originated from an OCI model artifact. The ContainerImageModel resolver gives
// each layer the virtual RealPath "/" regardless of layer media type, so all
// safetensors packages from a single OCI scan collapse into one group.
const ociGroupKey = "@oci@"
// ggufMergeProcessor consolidates multiple GGUF packages into a single package
// representing the AI model. When scanning OCI images with multiple layers,
// each layer may produce a separate package. This processor finds the package
// with a name and merges metadata from nameless packages into its GGUFFileParts field.
// Only packages with a non-empty name are returned in the final result.
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
if err != nil {
return pkgs, rels, err
}
if len(pkgs) == 0 {
return pkgs, rels, err
}
// Separate packages with names from those without
var namedPkgs []pkg.Package
var namelessHeaders []pkg.GGUFFileHeader
for _, p := range pkgs {
if p.Name != "" {
namedPkgs = append(namedPkgs, p)
} else {
if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok {
// We do not want a kv hash for nameless headers
header.MetadataKeyValuesHash = ""
namelessHeaders = append(namelessHeaders, header)
}
}
}
// If there are no named packages, return nothing
if len(namedPkgs) == 0 {
return nil, rels, err
}
// merge nameless headers into a single named package;
// if there are multiple named packages, return them without trying to merge headers.
// we cannot determine which nameless headers belong to which package
// this is because the order we receive the gguf headers in is not guaranteed
// to match the layer order in the original oci image
if len(namedPkgs) == 1 && len(namelessHeaders) > 0 {
winner := &namedPkgs[0]
if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok {
header.Parts = namelessHeaders
winner.Metadata = header
}
}
return namedPkgs, rels, err
}
// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation
// - groups all nameless packages
// - merge the per-shard metadata
// - picks a name (see pickSafeTensorsName)
// safeTensorsMergeProcessor owns naming, license resolution, and final package
// assembly. SafeTensors packages reach it nameless from the parsers; it groups
// them per model, merges the per-shard metadata, resolves a name + licenses, and
// drops any model it cannot name.
//
// There are exactly two sources, each handled by its own path:
// - an OCI model artifact, where the source presents every layer at the
// virtual path "/" and the whole scan is a single model (mergeOCIModel)
// - a filesystem scan, where models are grouped by the directory their files
// live in (mergeDirModels)
func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
if err != nil || len(pkgs) == 0 {
return pkgs, rels, err
}
// split off non-safetensors packages
// this keeps the processor robust if other types ever flow through
var stPkgs, other []pkg.Package
for _, p := range pkgs {
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
stPkgs = append(stPkgs, p)
continue
}
other = append(other, p)
}
// keep the processor robust if non-safetensors packages ever flow through
stPkgs, other := partitionSafeTensorsPackages(pkgs)
if len(stPkgs) == 0 {
return pkgs, rels, err
}
groups := groupSafeTensorsPackages(stPkgs)
// Deterministic iteration order so the SBOM doesn't depend on map order.
keys := make([]string, 0, len(groups))
for k := range groups {
keys = append(keys, k)
if fromOCIArtifact(stPkgs) {
return append(other, mergeOCIModel(ctx, resolver, stPkgs)...), rels, nil
}
sort.Strings(keys)
out := other
for _, key := range keys {
merged := mergeSafeTensorsGroup(groups[key])
// Resolve model identity (name candidates) before enrich
id := resolveSafeTensorsIdentity(resolver, key, &merged)
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
if name == "" {
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
continue
}
enrichSafeTensorsGroup(ctx, resolver, key, &merged, id)
merged.Name = name
merged.SetID()
out = append(out, merged)
}
return out, rels, nil
return append(other, mergeDirModels(ctx, resolver, stPkgs)...), rels, nil
}
// groupSafeTensorsPackages buckets packages by the parent directory of their
// primary-evidence location
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
out := make(map[string][]pkg.Package)
// partitionSafeTensorsPackages separates safetensors packages from anything else
// flowing through the processor.
func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg.Package) {
for _, p := range pkgs {
key := safeTensorsGroupKey(p)
if key == "" {
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
safeTensors = append(safeTensors, p)
continue
}
out[key] = append(out[key], p)
other = append(other, p)
}
return safeTensors, other
}
// fromOCIArtifact reports whether the packages came from an OCI model artifact.
// That source presents every layer at the virtual path "/", whereas a filesystem
// scan always carries a real file path. A single scan is one source, so the
// first package is representative of the rest.
//
// An empty slice has no source and reports false.
func fromOCIArtifact(pkgs []pkg.Package) bool {
	if len(pkgs) == 0 {
		return false
	}
	loc := primaryEvidenceLocation(pkgs[0])
	return loc != nil && loc.RealPath == "/"
}
// mergeOCIModel treats the whole OCI artifact as one model: all packages are
// merged into a single package whose identity comes from
// resolveSafeTensorsOCIIdentity. It returns a one-element slice, or nil when
// the model could not be assembled (no name source).
func mergeOCIModel(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
	merged := mergeSafeTensorsGroup(pkgs)

	info := merged.Metadata.(pkg.SafeTensorsModelInfo)
	identity := resolveSafeTensorsOCIIdentity(ctx, resolver, &info)
	// identity resolution may have enriched the architecture; store it back
	// before the package is assembled
	merged.Metadata = info

	model, named := assembleSafeTensorsPackage(merged, identity)
	if !named {
		return nil
	}
	return []pkg.Package{model}
}
// mergeDirModels buckets filesystem-scanned packages by parent directory and
// emits one model per directory, with identity resolved by
// resolveSafeTensorsDirIdentity. Directories are processed in sorted order so
// the output does not depend on map iteration; directories whose model cannot
// be assembled are omitted.
func mergeDirModels(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
	byDir := groupByParentDir(pkgs)

	ordered := make([]string, 0, len(byDir))
	for key := range byDir {
		ordered = append(ordered, key)
	}
	sort.Strings(ordered)

	var models []pkg.Package
	for _, key := range ordered {
		merged := mergeSafeTensorsGroup(byDir[key])

		info := merged.Metadata.(pkg.SafeTensorsModelInfo)
		identity := resolveSafeTensorsDirIdentity(ctx, resolver, key, &info)
		// identity resolution may have enriched the architecture; store it
		// back before the package is assembled
		merged.Metadata = info

		model, named := assembleSafeTensorsPackage(merged, identity)
		if !named {
			continue
		}
		models = append(models, model)
	}
	return models
}
func safeTensorsGroupKey(p pkg.Package) string {
loc := primaryEvidenceLocation(p)
if loc == nil {
return ""
// groupByParentDir buckets filesystem-scanned packages by the directory their
// primary-evidence file lives in (the shards of one model share a directory).
func groupByParentDir(pkgs []pkg.Package) map[string][]pkg.Package {
out := make(map[string][]pkg.Package)
for _, p := range pkgs {
loc := primaryEvidenceLocation(p)
if loc == nil {
continue
}
dir := path.Dir(loc.RealPath)
out[dir] = append(out[dir], p)
}
if loc.RealPath == "/" {
return ociGroupKey
}
return path.Dir(loc.RealPath)
return out
}
func primaryEvidenceLocation(p pkg.Package) *file.Location {
@ -169,501 +132,34 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location {
return nil
}
// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
locSet := unionLocations(members)
aggregates, shards := bucketSafeTensorsMembers(members)
merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
mergeAggregatesInto(&merged, aggregates)
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
// Keep merged UserMetadata globally key-sorted so the SBOM is stable
sort.Slice(merged.UserMetadata, func(i, j int) bool {
return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
})
if merged.TensorCount == 0 {
merged.TensorCount = shardTensorTotal
}
if merged.ShardCount == 0 {
if len(shards) > 0 {
merged.ShardCount = len(shards)
} else {
merged.ShardCount = 1
}
}
merged.MetadataHash = rollupHash(hashes)
// Parts only carry value for multi-shard models; for a single shard the
// outer view already exposes every per-shard field.
if len(shards) > 1 {
parts := append([]pkg.SafeTensorsModelInfo(nil), shards...)
sort.Slice(parts, func(i, j int) bool {
return parts[i].MetadataHash < parts[j].MetadataHash
})
merged.Parts = parts
}
return pkg.Package{
Locations: locSet,
Type: pkg.ModelPkg,
Metadata: merged,
}
}
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
for _, a := range aggregates {
if merged.TensorCount == 0 {
merged.TensorCount = a.TensorCount
}
if merged.ShardCount == 0 {
merged.ShardCount = a.ShardCount
}
firstNonEmpty(&merged.Parameters, a.Parameters)
firstNonEmpty(&merged.TotalSize, a.TotalSize)
firstNonEmpty(&merged.Quantization, a.Quantization)
}
}
// mergeShardsInto folds the per-shard header metadata into merged, returning
// the summed shard TensorCount and the list of non-empty per-shard hashes for
// the rollup. Shards carry only the content-derived fields (Quantization,
// Parameters, UserMetadata);
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
seenKV := map[string]bool{}
for _, s := range shards {
shardTensorTotal += s.TensorCount
firstNonEmpty(&merged.Quantization, s.Quantization)
firstNonEmpty(&merged.Parameters, s.Parameters)
for _, kv := range s.UserMetadata {
if seenKV[kv.Key] {
continue
}
seenKV[kv.Key] = true
merged.UserMetadata = append(merged.UserMetadata, kv)
}
if s.MetadataHash != "" {
hashes = append(hashes, s.MetadataHash)
}
}
return shardTensorTotal, hashes
}
func firstNonEmpty(dst *string, v string) {
if *dst == "" {
*dst = v
}
}
// unionLocations gathers every location from every member into a single set.
func unionLocations(members []pkg.Package) file.LocationSet {
out := file.NewLocationSet()
for _, m := range members {
for _, l := range m.Locations.ToSlice() {
out.Add(l)
}
}
return out
}
// bucketSafeTensorsMembers splits group members into aggregate-flavored entries
// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored
// entries (carry a content-derived MetadataHash from a header parser).
func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
for _, m := range members {
md, ok := m.Metadata.(pkg.SafeTensorsModelInfo)
if !ok {
continue
}
if md.MetadataHash != "" {
shards = append(shards, md)
continue
}
aggregates = append(aggregates, md)
}
return aggregates, shards
}
func rollupHash(hashes []string) string {
if len(hashes) == 0 {
return ""
}
if len(hashes) == 1 {
return hashes[0]
}
sorted := append([]string(nil), hashes...)
sort.Strings(sorted)
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
}
// safeTensorsIdentity is the fully-resolved naming/license result for a model.
// Each source resolver (dir or OCI) populates it so assembly stays source-agnostic.
type safeTensorsIdentity struct {
nameOrPath string
fallbackName string
readmeLicense string
supporting []file.Location
nameOrPath string
fallbackName string
licenses []pkg.License
supporting []file.Location
}
// resolveSafeTensorsIdentity reads the resolver for the group's naming signals
// (config.json _name_or_path, README base_model, OCI image ref / dir name)
func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity {
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
var id safeTensorsIdentity
if groupKey == ociGroupKey {
id = resolveSafeTensorsOCIIdentity(resolver, &md)
} else {
id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md)
// assembleSafeTensorsPackage finalizes a merged model from its resolved identity:
// it picks the name, attaches licenses and supporting evidence, and sets the ID.
// A model with no name source is dropped (ok=false).
func assembleSafeTensorsPackage(merged pkg.Package, id safeTensorsIdentity) (pkg.Package, bool) {
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
if name == "" {
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
return pkg.Package{}, false
}
merged.Metadata = md
return id
}
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) {
var lics []pkg.License
supporting := id.supporting
switch {
case id.readmeLicense != "":
lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense)
case groupKey == ociGroupKey:
if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok {
licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
if err != nil {
log.Debugf("failed to list docker AI license layers: %v", err)
}
if len(licLocs) > 0 {
lics = identifyLicenseLayers(ctx, resolver, licLocs)
supporting = append(supporting, licLocs...)
}
}
if len(id.licenses) > 0 {
merged.Licenses = pkg.NewLicenseSet(id.licenses...)
}
if len(lics) > 0 {
merged.Licenses = pkg.NewLicenseSet(lics...)
}
for _, loc := range supporting {
for _, loc := range id.supporting {
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
}
}
// safeTensorsDirName returns the directory-scan naming fallback: the base name
// of the group's parent directory (the group key is already that directory).
func safeTensorsDirName(groupKey string) string {
base := path.Base(groupKey)
switch base {
case "/", ".", "":
return ""
}
return base
}
// resolveSafeTensorsDirIdentity builds the identity of a model found by a
// directory scan. It consults two optional files:
//   - a config.json located via findDirHFConfig, which is applied to md and
//     supplies the name-or-path value
//   - a README.md directly inside dir, whose frontmatter supplies the license
//     and, only when config.json gave no name, the first base_model entry
//
// Each file that was read successfully is recorded as a supporting location.
// The fallback name is always derived from dir itself.
func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
	var (
		nameOrPath    string
		readmeLicense string
		supporting    []file.Location
	)

	cfgLoc, cfg := findDirHFConfig(resolver, dir)
	if cfg != nil {
		applyHFConfig(md, cfg)
		nameOrPath = cfg.NameOrPath
		supporting = append(supporting, *cfgLoc)
	}

	readmeLoc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md"))
	if fm != nil {
		readmeLicense = fm.License
		if nameOrPath == "" && len(fm.BaseModel) > 0 {
			nameOrPath = fm.BaseModel[0]
		}
		supporting = append(supporting, *readmeLoc)
	}

	return safeTensorsIdentity{
		nameOrPath:    nameOrPath,
		fallbackName:  safeTensorsDirName(dir),
		readmeLicense: readmeLicense,
		supporting:    supporting,
	}
}
// resolveSafeTensorsOCIIdentity builds the identity of a model stored in an OCI
// artifact. It needs a resolver that can list files by media type; without one
// an empty identity is returned. Every Docker AI model-file layer is classified
// as config.json or README frontmatter, and the layers that were recognized are
// recorded as supporting locations. The fallback name is the basename of the
// image reference.
func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
	byMediaType, ok := resolver.(file.OCIMediaTypeResolver)
	if !ok {
		return safeTensorsIdentity{}
	}

	layers, err := byMediaType.FilesByMediaType(dockerAIModelFileMediaType)
	if err != nil {
		log.Debugf("failed to list docker AI model-file layers: %v", err)
	}

	// The config-derived and README-derived names are gathered independently so
	// that precedence is decided below, not by the order the resolver happens
	// to return the layers in.
	var (
		fromConfig, fromReadme, license string
		evidence                        []file.Location
	)
	for i := range layers {
		if !classifyOCIModelFileLayer(resolver, layers[i], md, &fromConfig, &fromReadme, &license) {
			continue
		}
		evidence = append(evidence, layers[i])
	}

	id := safeTensorsIdentity{
		nameOrPath:    fromConfig,
		fallbackName:  ociImageRefBasename(resolver),
		readmeLicense: license,
		supporting:    evidence,
	}
	// config.json _name_or_path wins; README base_model is only the backup.
	if id.nameOrPath == "" {
		id.nameOrPath = fromReadme
	}
	return id
}
// ociImageReferencer describes the single capability ociImageRefBasename relies
// on: reporting the OCI image reference a resolver was built from. The
// interface is declared here, next to its only consumer, instead of being
// exported from the file package.
type ociImageReferencer interface {
	// ImageReference returns the image reference string; ociImageRefBasename
	// treats an empty result as "no reference available".
	ImageReference() string
}
// ociImageRefBasename returns the last path element of the repository named by
// the resolver's OCI image reference. It returns "" when the resolver cannot
// report a reference, the reference is empty, or the reference fails to parse.
func ociImageRefBasename(resolver file.Resolver) string {
	// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
	// (Ideally we should NOT be type asserting on the file resolver directly).
	var raw string
	if referencer, ok := resolver.(ociImageReferencer); ok {
		raw = referencer.ImageReference()
	}
	if raw == "" {
		return ""
	}

	ref, err := gcrname.ParseReference(raw)
	if err != nil {
		log.Debugf("failed to parse OCI ref %q: %v", raw, err)
		return ""
	}

	repo := ref.Context().RepositoryStr()
	return path.Base(repo)
}
// identifyLicenseLayers converts Docker AI license-layer locations into
// pkg.License values. A layer whose frontmatter declares an spdx-id yields a
// license built from that identifier; all remaining layers are handed to
// licenses.FindAtLocations in one call and its results are appended after the
// declared ones.
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
	var (
		found      []pkg.License
		undeclared []file.Location
	)

	for _, candidate := range locs {
		// Per-iteration copy: its address is passed to the license constructor.
		loc := candidate

		spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc)
		if spdx == "" {
			undeclared = append(undeclared, loc)
			continue
		}
		found = append(found, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
	}

	if len(undeclared) == 0 {
		return found
	}
	return append(found, licenses.FindAtLocations(ctx, resolver, undeclared...)...)
}
// readLicenseSPDXIDFromFrontmatter reads at most the first 64 KiB of the
// license-layer blob at loc and returns the spdx-id declared in its YAML
// frontmatter. It returns "" when the blob cannot be opened or read, or when
// no spdx-id is found.
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
	const maxPrefixBytes = 64 * 1024

	contents, err := resolver.FileContentsByLocation(loc)
	if err != nil {
		return ""
	}
	defer internal.CloseAndLogError(contents, loc.RealPath)

	prefix, err := io.ReadAll(io.LimitReader(contents, maxPrefixBytes))
	if err != nil {
		return ""
	}
	return parseLicenseFrontmatter(prefix)
}
// classifyOCIModelFileLayer reads up to 4 MiB of a model-file layer and
// classifies it by its leading bytes (after skipping a UTF-8 BOM and leading
// whitespace):
//   - "---" → README frontmatter: fills *license and *readmeName (first
//     base_model entry) if they are still empty
//   - "{"   → Hugging Face config.json: applies it to md and fills *configName
//     from _name_or_path if it is still empty
//
// The out-parameters are only written while empty, so the first layer to
// provide a value wins. The return value reports whether the layer was
// recognized and parsed, i.e. whether it counts as supporting evidence. Read
// and parse failures are treated as "not recognized".
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
	rc, err := resolver.FileContentsByLocation(loc)
	if err != nil {
		return false
	}
	defer internal.CloseAndLogError(rc, loc.RealPath)

	buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
	if err != nil {
		return false
	}

	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	switch {
	case bytes.HasPrefix(trimmed, []byte("---")):
		fm := parseFrontmatter(trimmed)
		if fm == nil {
			return false
		}
		if *license == "" {
			*license = fm.License
		}
		if *readmeName == "" && len(fm.BaseModel) > 0 {
			*readmeName = fm.BaseModel[0]
		}
		return true
	case bytes.HasPrefix(trimmed, []byte("{")):
		var cfg hfConfig
		// Decode the trimmed bytes, not buf: encoding/json rejects a leading
		// byte-order mark, so a BOM-prefixed config.json would be detected
		// above and then always fail to parse.
		if err := json.Unmarshal(trimmed, &cfg); err != nil {
			return false
		}
		applyHFConfig(md, &cfg)
		if *configName == "" && cfg.NameOrPath != "" {
			*configName = cfg.NameOrPath
		}
		return true
	}
	return false
}
// applyHFConfig copies the first architecture listed in cfg into md, but only
// when md has no architecture yet and cfg lists at least one.
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
	if md.Architecture != "" || len(cfg.Architectures) == 0 {
		return
	}
	md.Architecture = cfg.Architectures[0]
}
// pickSafeTensorsName implements the documented naming precedence chain:
//   - nameOrPath — config.json _name_or_path, or the README base_model when
//     config.json gave none; reduced with path.Base so "org/Model" → "Model"
//     (applies to both dir-scan and OCI groups)
//   - fallbackName — the group's source-specific positional identifier
//
// A nameOrPath whose base is not a usable name ("/", "." or "..", e.g. a
// config.json saved with _name_or_path "./") is ignored in favor of the
// fallback, mirroring how safeTensorsDirName rejects such values. The result
// may be empty when neither source yields a name.
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath != "" {
		switch base := path.Base(nameOrPath); base {
		case "/", ".", "..":
			// Purely positional path: carries no model name, use the fallback.
		default:
			return base
		}
	}
	return fallbackName
}
// hfConfig holds the only two Hugging Face config.json fields this package
// reads; every other key in the document is ignored when decoding.
type hfConfig struct {
	// Architectures is the "architectures" list; only its first entry is used.
	Architectures []string `json:"architectures"`
	// NameOrPath is the "_name_or_path" value, used as a naming source.
	NameOrPath string `json:"_name_or_path"`
}
// readmeFrontmatter is the subset of model-card YAML frontmatter this package
// extracts.
type readmeFrontmatter struct {
	// License is the frontmatter "license" value.
	License string `yaml:"license"`
	// BaseModel lists the frontmatter "base_model" entries; callers use the
	// first entry as a naming source.
	BaseModel []string `yaml:"base_model"`
}
// findDirHFConfig searches for a decodable config.json, starting in dir and
// then moving to each parent directory in turn. The walk ends when path.Dir no
// longer changes the directory (the root of the path), in which case nil, nil
// is returned. On success it returns the file's location and decoded contents.
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
	current := dir
	for {
		loc, cfg := readDirHFConfig(resolver, path.Join(current, "config.json"))
		if cfg != nil {
			return loc, cfg
		}

		next := path.Dir(current)
		if next == current {
			// Nothing above this directory: reached the source root.
			return nil, nil
		}
		current = next
	}
}
// readDirHFConfig resolves path p, opens the first matching location and
// decodes it as a Hugging Face config.json. It returns that location and the
// decoded config, or nil, nil when the path does not resolve, cannot be
// opened, or does not decode (the decode failure is logged at debug level).
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
	matches, err := resolver.FilesByPath(p)
	if err != nil || len(matches) == 0 {
		return nil, nil
	}
	first := &matches[0]

	contents, err := resolver.FileContentsByLocation(*first)
	if err != nil {
		return nil, nil
	}
	defer internal.CloseAndLogError(contents, p)

	cfg := new(hfConfig)
	if decodeErr := json.NewDecoder(contents).Decode(cfg); decodeErr != nil {
		log.Debugf("failed to decode %s: %v", p, decodeErr)
		return nil, nil
	}
	return first, cfg
}
// readDirReadmeFrontmatter resolves path p, reads at most the first 1 MiB of
// the first matching location and parses its YAML frontmatter. It returns that
// location and the parsed frontmatter, or nil, nil when the path does not
// resolve, cannot be read, or contains no parseable frontmatter.
func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
	const maxReadmeBytes = 1024 * 1024

	matches, err := resolver.FilesByPath(p)
	if err != nil || len(matches) == 0 {
		return nil, nil
	}
	first := &matches[0]

	contents, err := resolver.FileContentsByLocation(*first)
	if err != nil {
		return nil, nil
	}
	defer internal.CloseAndLogError(contents, p)

	data, err := io.ReadAll(io.LimitReader(contents, maxReadmeBytes))
	if err != nil {
		return nil, nil
	}

	if fm := parseFrontmatter(data); fm != nil {
		return first, fm
	}
	return nil, nil
}
// extractFrontmatterBlock returns the bytes enclosed by a document's opening
// and closing "---" fences. A leading UTF-8 BOM and leading whitespace are
// skipped first; the remainder of the opening fence line is discarded; the
// block ends just before the first subsequent line that starts with "---".
// It returns nil when the input does not open with a fence or no closing
// fence follows.
func extractFrontmatterBlock(buf []byte) []byte {
	const fence = "---"

	doc := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	if !bytes.HasPrefix(doc, []byte(fence)) {
		return nil
	}

	// Drop the opening fence together with anything else on its line.
	body := doc[len(fence):]
	if nl := bytes.IndexByte(body, '\n'); nl >= 0 {
		body = body[nl+1:]
	}

	end := bytes.Index(body, []byte("\n"+fence))
	if end < 0 {
		return nil
	}
	return body[:end]
}
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
// and returns the license and base_model fields. It returns nil when buf has
// no frontmatter block or the block is not valid YAML.
//
// base_model is accepted either as a single string or as a list of strings; a
// missing, empty or explicitly null value leaves BaseModel empty.
func parseFrontmatter(buf []byte) *readmeFrontmatter {
	block := extractFrontmatterBlock(buf)
	if block == nil {
		return nil
	}

	// base_model is captured as a raw node because its shape (scalar vs
	// sequence) is only known after parsing.
	var raw struct {
		License   string    `yaml:"license"`
		BaseModel yaml.Node `yaml:"base_model"`
	}
	if err := yaml.Unmarshal(block, &raw); err != nil {
		log.Debugf("failed to parse README frontmatter: %v", err)
		return nil
	}

	fm := readmeFrontmatter{License: raw.License}
	switch raw.BaseModel.Kind {
	case yaml.ScalarNode:
		// An explicit null (`base_model: null` or `~`) is still a scalar node
		// whose Value holds the literal text, so it must be excluded by tag or
		// it would be reported as a model named "null".
		if raw.BaseModel.ShortTag() != "!!null" && raw.BaseModel.Value != "" {
			fm.BaseModel = []string{raw.BaseModel.Value}
		}
	case yaml.SequenceNode:
		// Best effort: keep whatever was decoded, but do not hide the failure.
		if err := raw.BaseModel.Decode(&fm.BaseModel); err != nil {
			log.Debugf("failed to decode README frontmatter base_model list: %v", err)
		}
	}
	return &fm
}
// licenseFrontmatter is the subset of a license file's YAML frontmatter that
// parseLicenseFrontmatter decodes.
type licenseFrontmatter struct {
	// SPDXID is the frontmatter "spdx-id" value.
	SPDXID string `yaml:"spdx-id"`
}
// parseLicenseFrontmatter returns the producer-declared SPDX identifier (the
// "spdx-id" key) from the YAML frontmatter of buf. It returns "" when buf has
// no frontmatter block, the block is not valid YAML (logged at debug level),
// or the key is absent.
func parseLicenseFrontmatter(buf []byte) string {
	block := extractFrontmatterBlock(buf)
	if block == nil {
		return ""
	}

	var fm licenseFrontmatter
	if err := yaml.Unmarshal(block, &fm); err != nil {
		log.Debugf("failed to parse license frontmatter: %v", err)
		return ""
	}
	return fm.SPDXID
}