From fe392a490b953db7d866bc53a185cd6a61d3a9b7 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Fri, 5 Jun 2026 02:29:46 -0400 Subject: [PATCH] pr: first pass refactor Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- syft/pkg/cataloger/ai/gguf_processor.go | 56 ++ syft/pkg/cataloger/ai/huggingface.go | 93 +++ syft/pkg/cataloger/ai/identity_dir.go | 93 +++ syft/pkg/cataloger/ai/identity_oci.go | 177 ++++++ syft/pkg/cataloger/ai/merge.go | 140 +++++ syft/pkg/cataloger/ai/naming.go | 25 + syft/pkg/cataloger/ai/processor.go | 720 ++++-------------------- 7 files changed, 692 insertions(+), 612 deletions(-) create mode 100644 syft/pkg/cataloger/ai/gguf_processor.go create mode 100644 syft/pkg/cataloger/ai/huggingface.go create mode 100644 syft/pkg/cataloger/ai/identity_dir.go create mode 100644 syft/pkg/cataloger/ai/identity_oci.go create mode 100644 syft/pkg/cataloger/ai/merge.go create mode 100644 syft/pkg/cataloger/ai/naming.go diff --git a/syft/pkg/cataloger/ai/gguf_processor.go b/syft/pkg/cataloger/ai/gguf_processor.go new file mode 100644 index 000000000..fe31e3f8b --- /dev/null +++ b/syft/pkg/cataloger/ai/gguf_processor.go @@ -0,0 +1,56 @@ +package ai + +import ( + "github.com/anchore/syft/syft/artifact" + "github.com/anchore/syft/syft/pkg" +) + +// ggufMergeProcessor consolidates multiple GGUF packages into a single package +// representing the AI model. When scanning OCI images with multiple layers, +// each layer may produce a separate package. This processor finds the package +// with a name and merges metadata from nameless packages into its GGUFFileParts field. +// Only packages with a non-empty name are returned in the final result. 
+func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) { + if err != nil { + return pkgs, rels, err + } + if len(pkgs) == 0 { + return pkgs, rels, err + } + + // Separate packages with names from those without + var namedPkgs []pkg.Package + var namelessHeaders []pkg.GGUFFileHeader + + for _, p := range pkgs { + if p.Name != "" { + namedPkgs = append(namedPkgs, p) + } else { + if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok { + // We do not want a kv hash for nameless headers + header.MetadataKeyValuesHash = "" + namelessHeaders = append(namelessHeaders, header) + } + } + } + + // If there are no named packages, return nothing + if len(namedPkgs) == 0 { + return nil, rels, err + } + + // merge nameless headers into a single named package; + // if there are multiple named packages, return them without trying to merge headers. + // we cannot determine which nameless headers belong to which package + // this is because the order we receive the gguf headers in is not guaranteed + // to match the layer order in the original oci image + if len(namedPkgs) == 1 && len(namelessHeaders) > 0 { + winner := &namedPkgs[0] + if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok { + header.Parts = namelessHeaders + winner.Metadata = header + } + } + + return namedPkgs, rels, err +} diff --git a/syft/pkg/cataloger/ai/huggingface.go b/syft/pkg/cataloger/ai/huggingface.go new file mode 100644 index 000000000..6c333835b --- /dev/null +++ b/syft/pkg/cataloger/ai/huggingface.go @@ -0,0 +1,93 @@ +package ai + +import ( + "bytes" + + "gopkg.in/yaml.v3" + + "github.com/anchore/syft/internal/log" + "github.com/anchore/syft/syft/pkg" +) + +// hfConfig is a minimal projection of Hugging Face config.json fields. 
+type hfConfig struct { + Architectures []string `json:"architectures"` + NameOrPath string `json:"_name_or_path"` +} + +func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) { + if md.Architecture == "" && len(cfg.Architectures) > 0 { + md.Architecture = cfg.Architectures[0] + } +} + +// readmeFrontmatter holds the subset of YAML frontmatter fields we extract. +type readmeFrontmatter struct { + License string `yaml:"license"` + BaseModel []string `yaml:"base_model"` +} + +type licenseFrontmatter struct { + SPDXID string `yaml:"spdx-id"` +} + +// extractFrontmatterBlock returns the YAML bytes between the first and second +// "---" delimiters of a file +func extractFrontmatterBlock(buf []byte) []byte { + trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") + if !bytes.HasPrefix(trimmed, []byte("---")) { + return nil + } + rest := trimmed[3:] + if i := bytes.IndexByte(rest, '\n'); i >= 0 { + rest = rest[i+1:] + } + block, _, found := bytes.Cut(rest, []byte("\n---")) + if !found { + return nil + } + return block +} + +// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block +// and returns the license and base_model fields. 
+func parseFrontmatter(buf []byte) *readmeFrontmatter { + block := extractFrontmatterBlock(buf) + if block == nil { + return nil + } + + var raw struct { + License string `yaml:"license"` + BaseModel yaml.Node `yaml:"base_model"` + } + if err := yaml.Unmarshal(block, &raw); err != nil { + log.Debugf("failed to parse README frontmatter: %v", err) + return nil + } + + fm := readmeFrontmatter{License: raw.License} + switch raw.BaseModel.Kind { + case yaml.ScalarNode: + if raw.BaseModel.Value != "" { + fm.BaseModel = []string{raw.BaseModel.Value} + } + case yaml.SequenceNode: + _ = raw.BaseModel.Decode(&fm.BaseModel) + } + return &fm +} + +// parseLicenseFrontmatter returns the producer-declared SPDX identifier +func parseLicenseFrontmatter(buf []byte) string { + block := extractFrontmatterBlock(buf) + if block == nil { + return "" + } + var fm licenseFrontmatter + if err := yaml.Unmarshal(block, &fm); err != nil { + log.Debugf("failed to parse license frontmatter: %v", err) + return "" + } + return fm.SPDXID +} diff --git a/syft/pkg/cataloger/ai/identity_dir.go b/syft/pkg/cataloger/ai/identity_dir.go new file mode 100644 index 000000000..09635f18a --- /dev/null +++ b/syft/pkg/cataloger/ai/identity_dir.go @@ -0,0 +1,93 @@ +package ai + +import ( + "context" + "encoding/json" + "io" + "path" + + "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/log" + "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/pkg" +) + +// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a +// config.json beside the model files (walking up parent directories to the +// scanned source root if no sibling exists) and a sibling README.md. It returns +// the group's name candidates, resolved licenses, and supporting evidence. 
+func resolveSafeTensorsDirIdentity(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity { + id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)} + + if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil { + applyHFConfig(md, cfg) + id.nameOrPath = cfg.NameOrPath + id.supporting = append(id.supporting, *loc) + } + + if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil { + if fm.License != "" { + id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, fm.License) + } + if id.nameOrPath == "" && len(fm.BaseModel) > 0 { + id.nameOrPath = fm.BaseModel[0] + } + id.supporting = append(id.supporting, *loc) + } + return id +} + +// findDirHFConfig looks for a config.json beside the model files +func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) { + for { + if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil { + return loc, cfg + } + parent := path.Dir(dir) + if parent == dir { + return nil, nil // reached the source root + } + dir = parent + } +} + +func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) { + locations, err := resolver.FilesByPath(p) + if err != nil || len(locations) == 0 { + return nil, nil + } + rc, err := resolver.FileContentsByLocation(locations[0]) + if err != nil { + return nil, nil + } + defer internal.CloseAndLogError(rc, p) + + var cfg hfConfig + if err := json.NewDecoder(rc).Decode(&cfg); err != nil { + log.Debugf("failed to decode %s: %v", p, err) + return nil, nil + } + return &locations[0], &cfg +} + +func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) { + locations, err := resolver.FilesByPath(p) + if err != nil || len(locations) == 0 { + return nil, nil + } + rc, err := resolver.FileContentsByLocation(locations[0]) + if err != nil { + return nil, nil + } + defer internal.CloseAndLogError(rc, p) + + 
buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024)) + if err != nil { + return nil, nil + } + fm := parseFrontmatter(buf) + if fm == nil { + return nil, nil + } + return &locations[0], fm +} diff --git a/syft/pkg/cataloger/ai/identity_oci.go b/syft/pkg/cataloger/ai/identity_oci.go new file mode 100644 index 000000000..63c43e469 --- /dev/null +++ b/syft/pkg/cataloger/ai/identity_oci.go @@ -0,0 +1,177 @@ +package ai + +import ( + "bytes" + "context" + "encoding/json" + "io" + "path" + + gcrname "github.com/google/go-containerregistry/pkg/name" + + "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/log" + "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/pkg" + "github.com/anchore/syft/syft/pkg/cataloger/internal/licenses" +) + +// resolveSafeTensorsOCIIdentity handles the OCI-artifact case: the model's +// naming and license signals arrive as sibling layers (model.file companions +// carrying config.json / README, and dedicated license layers). It returns the +// group's name candidates, resolved licenses, and supporting evidence. +func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity { + ociResolver, ok := resolver.(file.OCIMediaTypeResolver) + if !ok { + return safeTensorsIdentity{} + } + + modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType) + if err != nil { + log.Debugf("failed to list docker AI model-file layers: %v", err) + } + + // Collect config / readme candidates separately so the layer-iteration order + // returned by the resolver doesn't decide the precedence. + var configName, readmeName, readmeLicense string + var supporting []file.Location + for _, loc := range modelFileLocs { + if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) { + supporting = append(supporting, loc) + } + } + + // Precedence: config.json _name_or_path > README base_model. 
+ nameOrPath := configName + if nameOrPath == "" { + nameOrPath = readmeName + } + + id := safeTensorsIdentity{ + nameOrPath: nameOrPath, + fallbackName: ociImageRefBasename(resolver), + supporting: supporting, + } + + // License precedence: a README model-card license wins over dedicated + // license layers (mirrors the dir-scan path, where README frontmatter is the + // license source). + switch { + case readmeLicense != "": + id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense) + default: + licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType) + if err != nil { + log.Debugf("failed to list docker AI license layers: %v", err) + } + if len(licLocs) > 0 { + id.licenses = identifyLicenseLayers(ctx, resolver, licLocs) + id.supporting = append(id.supporting, licLocs...) + } + } + + return id +} + +// ociImageReferencer is the minimal capability ociImageRefBasename needs: a +// resolver that can surface the OCI image reference it was built from. It is +// kept local to this package (rather than exported from the file package) so the +// assertion stays with its only consumer. +type ociImageReferencer interface { + ImageReference() string +} + +func ociImageRefBasename(resolver file.Resolver) string { + // TODO: we don't think this approach is generalizable quite yet, but we really do need this information. + // (Ideally we should NOT be type asserting on the file resolver directly). + info, ok := resolver.(ociImageReferencer) + if !ok { + return "" + } + ref := info.ImageReference() + if ref == "" { + return "" + } + parsed, err := gcrname.ParseReference(ref) + if err != nil { + log.Debugf("failed to parse OCI ref %q: %v", ref, err) + return "" + } + return path.Base(parsed.Context().RepositoryStr()) +} + +// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and +// classifies it as README frontmatter or HF config.json based on its leading bytes. 
+func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool { + rc, err := resolver.FileContentsByLocation(loc) + if err != nil { + return false + } + defer internal.CloseAndLogError(rc, loc.RealPath) + + buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024)) + if err != nil { + return false + } + trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") + switch { + case bytes.HasPrefix(trimmed, []byte("---")): + fm := parseFrontmatter(buf) + if fm == nil { + return false + } + if *license == "" { + *license = fm.License + } + if *readmeName == "" && len(fm.BaseModel) > 0 { + *readmeName = fm.BaseModel[0] + } + return true + case bytes.HasPrefix(trimmed, []byte("{")): + var cfg hfConfig + if err := json.Unmarshal(buf, &cfg); err != nil { + return false + } + applyHFConfig(md, &cfg) + if *configName == "" && cfg.NameOrPath != "" { + *configName = cfg.NameOrPath + } + return true + } + return false +} + +// identifyLicenseLayers turns Docker AI license-layer locations into +// pkg.License values. +func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License { + var out []pkg.License + var scanFallback []file.Location + for i := range locs { + loc := locs[i] + if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" { + out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc)) + continue + } + scanFallback = append(scanFallback, loc) + } + if len(scanFallback) > 0 { + out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...) 
+ } + return out +} + +// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer +// blob and returns the spdx-id declared in its YAML frontmatter +func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string { + rc, err := resolver.FileContentsByLocation(loc) + if err != nil { + return "" + } + defer internal.CloseAndLogError(rc, loc.RealPath) + + buf, err := io.ReadAll(io.LimitReader(rc, 64*1024)) + if err != nil { + return "" + } + return parseLicenseFrontmatter(buf) +} diff --git a/syft/pkg/cataloger/ai/merge.go b/syft/pkg/cataloger/ai/merge.go new file mode 100644 index 000000000..8c0b19167 --- /dev/null +++ b/syft/pkg/cataloger/ai/merge.go @@ -0,0 +1,140 @@ +package ai + +import ( + "fmt" + "sort" + "strings" + + "github.com/cespare/xxhash/v2" + + "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/pkg" +) + +// mergeSafeTensorsGroup folds a group's per-member metadata into a single package. +func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package { + locSet := unionLocations(members) + aggregates, shards := bucketSafeTensorsMembers(members) + + merged := pkg.SafeTensorsModelInfo{Format: "safetensors"} + mergeAggregatesInto(&merged, aggregates) + shardTensorTotal, hashes := mergeShardsInto(&merged, shards) + + // Keep merged UserMetadata globally key-sorted so the SBOM is stable + sort.Slice(merged.UserMetadata, func(i, j int) bool { + return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key + }) + + if merged.TensorCount == 0 { + merged.TensorCount = shardTensorTotal + } + if merged.ShardCount == 0 { + if len(shards) > 0 { + merged.ShardCount = len(shards) + } else { + merged.ShardCount = 1 + } + } + merged.MetadataHash = rollupHash(hashes) + + // Parts only carry value for multi-shard models; for a single shard the + // outer view already exposes every per-shard field. + if len(shards) > 1 { + parts := append([]pkg.SafeTensorsModelInfo(nil), shards...) 
+ sort.Slice(parts, func(i, j int) bool { + return parts[i].MetadataHash < parts[j].MetadataHash + }) + merged.Parts = parts + } + + return pkg.Package{ + Locations: locSet, + Type: pkg.ModelPkg, + Metadata: merged, + } +} + +func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) { + for _, a := range aggregates { + if merged.TensorCount == 0 { + merged.TensorCount = a.TensorCount + } + if merged.ShardCount == 0 { + merged.ShardCount = a.ShardCount + } + firstNonEmpty(&merged.Parameters, a.Parameters) + firstNonEmpty(&merged.TotalSize, a.TotalSize) + firstNonEmpty(&merged.Quantization, a.Quantization) + } +} + +// mergeShardsInto folds the per-shard header metadata into merged, returning +// the summed shard TensorCount and the list of non-empty per-shard hashes for +// the rollup. Shards carry only the content-derived fields (Quantization, +// Parameters, UserMetadata). +func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) { + seenKV := map[string]bool{} + for _, s := range shards { + shardTensorTotal += s.TensorCount + firstNonEmpty(&merged.Quantization, s.Quantization) + firstNonEmpty(&merged.Parameters, s.Parameters) + for _, kv := range s.UserMetadata { + if seenKV[kv.Key] { + continue + } + seenKV[kv.Key] = true + merged.UserMetadata = append(merged.UserMetadata, kv) + } + if s.MetadataHash != "" { + hashes = append(hashes, s.MetadataHash) + } + } + return shardTensorTotal, hashes +} + +func firstNonEmpty(dst *string, v string) { + if *dst == "" { + *dst = v + } +} + +// unionLocations gathers every location from every member into a single set. 
+func unionLocations(members []pkg.Package) file.LocationSet { + out := file.NewLocationSet() + for _, m := range members { + for _, l := range m.Locations.ToSlice() { + out.Add(l) + } + } + return out +} + +// bucketSafeTensorsMembers splits group members into aggregate-flavored entries +// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored +// entries (carry a content-derived MetadataHash from a header parser). +func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) { + for _, m := range members { + md, ok := m.Metadata.(pkg.SafeTensorsModelInfo) + if !ok { + continue + } + if md.MetadataHash != "" { + shards = append(shards, md) + continue + } + aggregates = append(aggregates, md) + } + return aggregates, shards +} + +func rollupHash(hashes []string) string { + if len(hashes) == 0 { + return "" + } + if len(hashes) == 1 { + return hashes[0] + } + sorted := append([]string(nil), hashes...) + sort.Strings(sorted) + return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|"))) +} diff --git a/syft/pkg/cataloger/ai/naming.go b/syft/pkg/cataloger/ai/naming.go new file mode 100644 index 000000000..b9e766003 --- /dev/null +++ b/syft/pkg/cataloger/ai/naming.go @@ -0,0 +1,25 @@ +package ai + +import "path" + +// pickSafeTensorsName implements the documented naming precedence chain: +// - config.json _name_or_path (path.Base, so "org/Model" → "Model"; +// applies to both dir-scan and OCI groups) +// - fallback name — the group's source-specific positional identifier +func pickSafeTensorsName(nameOrPath, fallbackName string) string { + if nameOrPath != "" { + return path.Base(nameOrPath) + } + return fallbackName +} + +// safeTensorsDirName returns the directory-scan naming fallback: the base name +// of the group's parent directory (the group key is already that directory). 
+func safeTensorsDirName(groupKey string) string { + base := path.Base(groupKey) + switch base { + case "/", ".", "": + return "" + } + return base +} diff --git a/syft/pkg/cataloger/ai/processor.go b/syft/pkg/cataloger/ai/processor.go index 1b3172def..7cc5018e2 100644 --- a/syft/pkg/cataloger/ai/processor.go +++ b/syft/pkg/cataloger/ai/processor.go @@ -1,159 +1,122 @@ package ai import ( - "bytes" "context" - "encoding/json" - "fmt" - "io" "path" "sort" - "strings" - "github.com/cespare/xxhash/v2" - gcrname "github.com/google/go-containerregistry/pkg/name" - "gopkg.in/yaml.v3" - - "github.com/anchore/syft/internal" "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" - "github.com/anchore/syft/syft/pkg/cataloger/internal/licenses" ) -// ociGroupKey is the grouping key for every safetensors package that -// originated from an OCI model artifact. The ContainerImageModel resolver gives -// each layer the virtual RealPath "/" regardless of layer media type, so all -// safetensors packages from a single OCI scan collapse into one group. -const ociGroupKey = "@oci@" - -// ggufMergeProcessor consolidates multiple GGUF packages into a single package -// representing the AI model. When scanning OCI images with multiple layers, -// each layer may produce a separate package. This processor finds the package -// with a name and merges metadata from nameless packages into its GGUFFileParts field. -// Only packages with a non-empty name are returned in the final result. 
-func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) { - if err != nil { - return pkgs, rels, err - } - if len(pkgs) == 0 { - return pkgs, rels, err - } - - // Separate packages with names from those without - var namedPkgs []pkg.Package - var namelessHeaders []pkg.GGUFFileHeader - - for _, p := range pkgs { - if p.Name != "" { - namedPkgs = append(namedPkgs, p) - } else { - if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok { - // We do not want a kv hash for nameless headers - header.MetadataKeyValuesHash = "" - namelessHeaders = append(namelessHeaders, header) - } - } - } - - // If there are no named packages, return nothing - if len(namedPkgs) == 0 { - return nil, rels, err - } - - // merge nameless headers into a single named package; - // if there are multiple named packages, return them without trying to merge headers. - // we cannot determine which nameless headers belong to which package - // this is because the order we receive the gguf headers in is not guaranteed - // to match the layer order in the original oci image - if len(namedPkgs) == 1 && len(namelessHeaders) > 0 { - winner := &namedPkgs[0] - if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok { - header.Parts = namelessHeaders - winner.Metadata = header - } - } - - return namedPkgs, rels, err -} - -// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation -// - groups all nameless packages -// - merge the per-shard metadata -// - picks a name (see pickSafeTensorsName) +// safeTensorsMergeProcessor owns naming, license resolution, and final package +// assembly. SafeTensors packages reach it nameless from the parsers; it groups +// them per model, merges the per-shard metadata, resolves a name + licenses, and +// drops any model it cannot name. 
+// +// There are exactly two sources, each handled by its own path: +// - an OCI model artifact, where the source presents every layer at the +// virtual path "/" and the whole scan is a single model (mergeOCIModel) +// - a filesystem scan, where models are grouped by the directory their files +// live in (mergeDirModels) func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) { if err != nil || len(pkgs) == 0 { return pkgs, rels, err } - // split off non-safetensors packages - // this keeps the processor robust if other types ever flow through - var stPkgs, other []pkg.Package - for _, p := range pkgs { - if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok { - stPkgs = append(stPkgs, p) - continue - } - other = append(other, p) - } + // keep the processor robust if non-safetensors packages ever flow through + stPkgs, other := partitionSafeTensorsPackages(pkgs) if len(stPkgs) == 0 { return pkgs, rels, err } - groups := groupSafeTensorsPackages(stPkgs) - - // Deterministic iteration order so the SBOM doesn't depend on map order. 
- keys := make([]string, 0, len(groups)) - for k := range groups { - keys = append(keys, k) + if fromOCIArtifact(stPkgs) { + return append(other, mergeOCIModel(ctx, resolver, stPkgs)...), rels, nil } - sort.Strings(keys) - - out := other - for _, key := range keys { - merged := mergeSafeTensorsGroup(groups[key]) - - // Resolve model identity (name candidates) before enrich - id := resolveSafeTensorsIdentity(resolver, key, &merged) - name := pickSafeTensorsName(id.nameOrPath, id.fallbackName) - if name == "" { - log.Debugf("dropped safetensors model package (metadata hash %q): no name source", - merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash) - continue - } - - enrichSafeTensorsGroup(ctx, resolver, key, &merged, id) - merged.Name = name - merged.SetID() - out = append(out, merged) - } - return out, rels, nil + return append(other, mergeDirModels(ctx, resolver, stPkgs)...), rels, nil } -// groupSafeTensorsPackages buckets packages by the parent directory of their -// primary-evidence location -func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package { - out := make(map[string][]pkg.Package) +// partitionSafeTensorsPackages separates safetensors packages from anything else +// flowing through the processor. +func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg.Package) { for _, p := range pkgs { - key := safeTensorsGroupKey(p) - if key == "" { + if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok { + safeTensors = append(safeTensors, p) continue } - out[key] = append(out[key], p) + other = append(other, p) + } + return safeTensors, other +} + +// fromOCIArtifact reports whether the packages came from an OCI model artifact. +// That source presents every layer at the virtual path "/", whereas a filesystem +// scan always carries a real file path. A single scan is one source, so the +// first package is representative of the rest. 
+func fromOCIArtifact(pkgs []pkg.Package) bool { + loc := primaryEvidenceLocation(pkgs[0]) + return loc != nil && loc.RealPath == "/" +} + +// mergeOCIModel treats the whole OCI artifact as a single model: every layer +// merges into one package, named from the artifact's config.json/README or its +// image reference. +func mergeOCIModel(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package { + merged := mergeSafeTensorsGroup(pkgs) + + md := merged.Metadata.(pkg.SafeTensorsModelInfo) + id := resolveSafeTensorsOCIIdentity(ctx, resolver, &md) + merged.Metadata = md // write architecture enrichment back before assembly + + if p, ok := assembleSafeTensorsPackage(merged, id); ok { + return []pkg.Package{p} + } + return nil +} + +// mergeDirModels groups filesystem-scanned files by their parent directory and +// emits one model per directory, named from a sibling config.json/README or the +// directory itself. +func mergeDirModels(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package { + groups := groupByParentDir(pkgs) + + // deterministic iteration order so the SBOM doesn't depend on map order + dirs := make([]string, 0, len(groups)) + for dir := range groups { + dirs = append(dirs, dir) + } + sort.Strings(dirs) + + var out []pkg.Package + for _, dir := range dirs { + merged := mergeSafeTensorsGroup(groups[dir]) + + md := merged.Metadata.(pkg.SafeTensorsModelInfo) + id := resolveSafeTensorsDirIdentity(ctx, resolver, dir, &md) + merged.Metadata = md // write architecture enrichment back before assembly + + if p, ok := assembleSafeTensorsPackage(merged, id); ok { + out = append(out, p) + } } return out } -func safeTensorsGroupKey(p pkg.Package) string { - loc := primaryEvidenceLocation(p) - if loc == nil { - return "" +// groupByParentDir buckets filesystem-scanned packages by the directory their +// primary-evidence file lives in (the shards of one model share a directory). 
+func groupByParentDir(pkgs []pkg.Package) map[string][]pkg.Package { + out := make(map[string][]pkg.Package) + for _, p := range pkgs { + loc := primaryEvidenceLocation(p) + if loc == nil { + continue + } + dir := path.Dir(loc.RealPath) + out[dir] = append(out[dir], p) } - if loc.RealPath == "/" { - return ociGroupKey - } - return path.Dir(loc.RealPath) + return out } func primaryEvidenceLocation(p pkg.Package) *file.Location { @@ -169,501 +132,34 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location { return nil } -// mergeSafeTensorsGroup folds a group's per-member metadata into a single package. -func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package { - locSet := unionLocations(members) - aggregates, shards := bucketSafeTensorsMembers(members) - - merged := pkg.SafeTensorsModelInfo{Format: "safetensors"} - mergeAggregatesInto(&merged, aggregates) - shardTensorTotal, hashes := mergeShardsInto(&merged, shards) - - // Keep merged UserMetadata globally key-sorted so the SBOM is stable - sort.Slice(merged.UserMetadata, func(i, j int) bool { - return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key - }) - - if merged.TensorCount == 0 { - merged.TensorCount = shardTensorTotal - } - if merged.ShardCount == 0 { - if len(shards) > 0 { - merged.ShardCount = len(shards) - } else { - merged.ShardCount = 1 - } - } - merged.MetadataHash = rollupHash(hashes) - - // Parts only carry value for multi-shard models; for a single shard the - // outer view already exposes every per-shard field. - if len(shards) > 1 { - parts := append([]pkg.SafeTensorsModelInfo(nil), shards...) 
- sort.Slice(parts, func(i, j int) bool { - return parts[i].MetadataHash < parts[j].MetadataHash - }) - merged.Parts = parts - } - - return pkg.Package{ - Locations: locSet, - Type: pkg.ModelPkg, - Metadata: merged, - } -} - -func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) { - for _, a := range aggregates { - if merged.TensorCount == 0 { - merged.TensorCount = a.TensorCount - } - if merged.ShardCount == 0 { - merged.ShardCount = a.ShardCount - } - firstNonEmpty(&merged.Parameters, a.Parameters) - firstNonEmpty(&merged.TotalSize, a.TotalSize) - firstNonEmpty(&merged.Quantization, a.Quantization) - } -} - -// mergeShardsInto folds the per-shard header metadata into merged, returning -// the summed shard TensorCount and the list of non-empty per-shard hashes for -// the rollup. Shards carry only the content-derived fields (Quantization, -// Parameters, UserMetadata); -func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) { - seenKV := map[string]bool{} - for _, s := range shards { - shardTensorTotal += s.TensorCount - firstNonEmpty(&merged.Quantization, s.Quantization) - firstNonEmpty(&merged.Parameters, s.Parameters) - for _, kv := range s.UserMetadata { - if seenKV[kv.Key] { - continue - } - seenKV[kv.Key] = true - merged.UserMetadata = append(merged.UserMetadata, kv) - } - if s.MetadataHash != "" { - hashes = append(hashes, s.MetadataHash) - } - } - return shardTensorTotal, hashes -} - -func firstNonEmpty(dst *string, v string) { - if *dst == "" { - *dst = v - } -} - -// unionLocations gathers every location from every member into a single set. 
-func unionLocations(members []pkg.Package) file.LocationSet { - out := file.NewLocationSet() - for _, m := range members { - for _, l := range m.Locations.ToSlice() { - out.Add(l) - } - } - return out -} - -// bucketSafeTensorsMembers splits group members into aggregate-flavored entries -// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored -// entries (carry a content-derived MetadataHash from a header parser). -func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) { - for _, m := range members { - md, ok := m.Metadata.(pkg.SafeTensorsModelInfo) - if !ok { - continue - } - if md.MetadataHash != "" { - shards = append(shards, md) - continue - } - aggregates = append(aggregates, md) - } - return aggregates, shards -} - -func rollupHash(hashes []string) string { - if len(hashes) == 0 { - return "" - } - if len(hashes) == 1 { - return hashes[0] - } - sorted := append([]string(nil), hashes...) - sort.Strings(sorted) - return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|"))) -} - +// safeTensorsIdentity is the fully-resolved naming/license result for a model. +// Each source resolver (dir or OCI) populates it so assembly stays source-agnostic. 
type safeTensorsIdentity struct { - nameOrPath string - fallbackName string - readmeLicense string - supporting []file.Location + nameOrPath string + fallbackName string + licenses []pkg.License + supporting []file.Location } -// resolveSafeTensorsIdentity reads the resolver for the group's naming signals -// (config.json _name_or_path, README base_model, OCI image ref / dir name) -func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity { - md := merged.Metadata.(pkg.SafeTensorsModelInfo) - - var id safeTensorsIdentity - if groupKey == ociGroupKey { - id = resolveSafeTensorsOCIIdentity(resolver, &md) - } else { - id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md) +// assembleSafeTensorsPackage finalizes a merged model from its resolved identity: +// it picks the name, attaches licenses and supporting evidence, and sets the ID. +// A model with no name source is dropped (ok=false). +func assembleSafeTensorsPackage(merged pkg.Package, id safeTensorsIdentity) (pkg.Package, bool) { + name := pickSafeTensorsName(id.nameOrPath, id.fallbackName) + if name == "" { + log.Debugf("dropped safetensors model package (metadata hash %q): no name source", + merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash) + return pkg.Package{}, false } - merged.Metadata = md - return id -} - -func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) { - var lics []pkg.License - supporting := id.supporting - - switch { - case id.readmeLicense != "": - lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense) - case groupKey == ociGroupKey: - if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok { - licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType) - if err != nil { - log.Debugf("failed to list docker AI license layers: %v", err) - } - if len(licLocs) > 0 { - lics = identifyLicenseLayers(ctx, resolver, licLocs) 
- supporting = append(supporting, licLocs...) - } - } + if len(id.licenses) > 0 { + merged.Licenses = pkg.NewLicenseSet(id.licenses...) } - - if len(lics) > 0 { - merged.Licenses = pkg.NewLicenseSet(lics...) - } - for _, loc := range supporting { + for _, loc := range id.supporting { merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) } -} - -// safeTensorsDirName returns the directory-scan naming fallback: the base name -// of the group's parent directory (the group key is already that directory). -func safeTensorsDirName(groupKey string) string { - base := path.Base(groupKey) - switch base { - case "/", ".", "": - return "" - } - return base -} - -// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a -// config.json beside the model files (walking up parent directories to the -// scanned source root if no sibling exists) and a sibling README.md -func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity { - id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)} - - if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil { - applyHFConfig(md, cfg) - id.nameOrPath = cfg.NameOrPath - id.supporting = append(id.supporting, *loc) - } - - if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil { - id.readmeLicense = fm.License - if id.nameOrPath == "" && len(fm.BaseModel) > 0 { - id.nameOrPath = fm.BaseModel[0] - } - id.supporting = append(id.supporting, *loc) - } - return id -} - -func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity { - ociResolver, ok := resolver.(file.OCIMediaTypeResolver) - if !ok { - return safeTensorsIdentity{} - } - - modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType) - if err != nil { - log.Debugf("failed to list docker AI model-file layers: %v", err) - } - - // Collect config / readme 
candidates separately so the layer-iteration order - // returned by the resolver doesn't decide the precedence. - var configName, readmeName, readmeLicense string - var supporting []file.Location - for _, loc := range modelFileLocs { - if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) { - supporting = append(supporting, loc) - } - } - - // Precedence: config.json _name_or_path > README base_model. - nameOrPath := configName - if nameOrPath == "" { - nameOrPath = readmeName - } - - return safeTensorsIdentity{ - nameOrPath: nameOrPath, - fallbackName: ociImageRefBasename(resolver), - readmeLicense: readmeLicense, - supporting: supporting, - } -} - -// ociImageReferencer is the minimal capability ociImageRefBasename needs: a -// resolver that can surface the OCI image reference it was built from. It is -// kept local to this package (rather than exported from the file package) so the -// assertion stays with its only consumer. -type ociImageReferencer interface { - ImageReference() string -} - -func ociImageRefBasename(resolver file.Resolver) string { - // TODO: we don't think this approach is generalizable quite yet, but we really do need this information. - // (Ideally we should be NOT be type asserting on the file resolver directly). - info, ok := resolver.(ociImageReferencer) - if !ok { - return "" - } - ref := info.ImageReference() - if ref == "" { - return "" - } - parsed, err := gcrname.ParseReference(ref) - if err != nil { - log.Debugf("failed to parse OCI ref %q: %v", ref, err) - return "" - } - return path.Base(parsed.Context().RepositoryStr()) -} - -// identifyLicenseLayers turns Docker AI license-layer locations into -// pkg.License values. 
-func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License { - var out []pkg.License - var scanFallback []file.Location - for i := range locs { - loc := locs[i] - if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" { - out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc)) - continue - } - scanFallback = append(scanFallback, loc) - } - if len(scanFallback) > 0 { - out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...) - } - return out -} - -// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer -// blob and returns the spdx-id declared in its YAML frontmatter -func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string { - rc, err := resolver.FileContentsByLocation(loc) - if err != nil { - return "" - } - defer internal.CloseAndLogError(rc, loc.RealPath) - - buf, err := io.ReadAll(io.LimitReader(rc, 64*1024)) - if err != nil { - return "" - } - return parseLicenseFrontmatter(buf) -} - -// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and -// classifies it as README frontmatter or HF config.json based on its leading bytes. 
-func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool { - rc, err := resolver.FileContentsByLocation(loc) - if err != nil { - return false - } - defer internal.CloseAndLogError(rc, loc.RealPath) - - buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024)) - if err != nil { - return false - } - trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") - switch { - case bytes.HasPrefix(trimmed, []byte("---")): - fm := parseFrontmatter(buf) - if fm == nil { - return false - } - if *license == "" { - *license = fm.License - } - if *readmeName == "" && len(fm.BaseModel) > 0 { - *readmeName = fm.BaseModel[0] - } - return true - case bytes.HasPrefix(trimmed, []byte("{")): - var cfg hfConfig - if err := json.Unmarshal(buf, &cfg); err != nil { - return false - } - applyHFConfig(md, &cfg) - if *configName == "" && cfg.NameOrPath != "" { - *configName = cfg.NameOrPath - } - return true - } - return false -} - -func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) { - if md.Architecture == "" && len(cfg.Architectures) > 0 { - md.Architecture = cfg.Architectures[0] - } -} - -// pickSafeTensorsName implements the documented naming precedence chain: -// - config.json _name_or_path (path.Base, so "org/Model" → "Model"; -// applies to both dir-scan and OCI groups) -// - fallback name — the group's source-specific positional identifier -func pickSafeTensorsName(nameOrPath, fallbackName string) string { - if nameOrPath != "" { - return path.Base(nameOrPath) - } - return fallbackName -} - -// hfConfig is a minimal projection of Hugging Face config.json fields. -type hfConfig struct { - Architectures []string `json:"architectures"` - NameOrPath string `json:"_name_or_path"` -} - -// readmeFrontmatter holds the subset of YAML frontmatter fields we extract. 
-type readmeFrontmatter struct { - License string `yaml:"license"` - BaseModel []string `yaml:"base_model"` -} - -// findDirHFConfig looks for a config.json beside the model files -func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) { - for { - if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil { - return loc, cfg - } - parent := path.Dir(dir) - if parent == dir { - return nil, nil // reached the source root - } - dir = parent - } -} - -func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) { - locations, err := resolver.FilesByPath(p) - if err != nil || len(locations) == 0 { - return nil, nil - } - rc, err := resolver.FileContentsByLocation(locations[0]) - if err != nil { - return nil, nil - } - defer internal.CloseAndLogError(rc, p) - - var cfg hfConfig - if err := json.NewDecoder(rc).Decode(&cfg); err != nil { - log.Debugf("failed to decode %s: %v", p, err) - return nil, nil - } - return &locations[0], &cfg -} - -func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) { - locations, err := resolver.FilesByPath(p) - if err != nil || len(locations) == 0 { - return nil, nil - } - rc, err := resolver.FileContentsByLocation(locations[0]) - if err != nil { - return nil, nil - } - defer internal.CloseAndLogError(rc, p) - - buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024)) - if err != nil { - return nil, nil - } - fm := parseFrontmatter(buf) - if fm == nil { - return nil, nil - } - return &locations[0], fm -} - -// extractFrontmatterBlock returns the YAML bytes between the first and second -// "---" delimiters of a file -func extractFrontmatterBlock(buf []byte) []byte { - trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") - if !bytes.HasPrefix(trimmed, []byte("---")) { - return nil - } - rest := trimmed[3:] - if i := bytes.IndexByte(rest, '\n'); i >= 0 { - rest = rest[i+1:] - } - block, _, found := bytes.Cut(rest, 
[]byte("\n---")) - if !found { - return nil - } - return block -} - -// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block -// and returns the license and base_model fields. -func parseFrontmatter(buf []byte) *readmeFrontmatter { - block := extractFrontmatterBlock(buf) - if block == nil { - return nil - } - - var raw struct { - License string `yaml:"license"` - BaseModel yaml.Node `yaml:"base_model"` - } - if err := yaml.Unmarshal(block, &raw); err != nil { - log.Debugf("failed to parse README frontmatter: %v", err) - return nil - } - - fm := readmeFrontmatter{License: raw.License} - switch raw.BaseModel.Kind { - case yaml.ScalarNode: - if raw.BaseModel.Value != "" { - fm.BaseModel = []string{raw.BaseModel.Value} - } - case yaml.SequenceNode: - _ = raw.BaseModel.Decode(&fm.BaseModel) - } - return &fm -} - -type licenseFrontmatter struct { - SPDXID string `yaml:"spdx-id"` -} - -// parseLicenseFrontmatter returns the producer-declared SPDX identifier -func parseLicenseFrontmatter(buf []byte) string { - block := extractFrontmatterBlock(buf) - if block == nil { - return "" - } - var fm licenseFrontmatter - if err := yaml.Unmarshal(block, &fm); err != nil { - log.Debugf("failed to parse license frontmatter: %v", err) - return "" - } - return fm.SPDXID + + merged.Name = name + merged.SetID() + return merged, true }