mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
pr: first pass refactor
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
dd179eb8a7
commit
fe392a490b
56
syft/pkg/cataloger/ai/gguf_processor.go
Normal file
56
syft/pkg/cataloger/ai/gguf_processor.go
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/anchore/syft/syft/artifact"
|
||||||
|
"github.com/anchore/syft/syft/pkg"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ggufMergeProcessor consolidates multiple GGUF packages into a single package
|
||||||
|
// representing the AI model. When scanning OCI images with multiple layers,
|
||||||
|
// each layer may produce a separate package. This processor finds the package
|
||||||
|
// with a name and merges metadata from nameless packages into its GGUFFileParts field.
|
||||||
|
// Only packages with a non-empty name are returned in the final result.
|
||||||
|
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
||||||
|
if err != nil {
|
||||||
|
return pkgs, rels, err
|
||||||
|
}
|
||||||
|
if len(pkgs) == 0 {
|
||||||
|
return pkgs, rels, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Separate packages with names from those without
|
||||||
|
var namedPkgs []pkg.Package
|
||||||
|
var namelessHeaders []pkg.GGUFFileHeader
|
||||||
|
|
||||||
|
for _, p := range pkgs {
|
||||||
|
if p.Name != "" {
|
||||||
|
namedPkgs = append(namedPkgs, p)
|
||||||
|
} else {
|
||||||
|
if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok {
|
||||||
|
// We do not want a kv hash for nameless headers
|
||||||
|
header.MetadataKeyValuesHash = ""
|
||||||
|
namelessHeaders = append(namelessHeaders, header)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there are no named packages, return nothing
|
||||||
|
if len(namedPkgs) == 0 {
|
||||||
|
return nil, rels, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// merge nameless headers into a single named package;
|
||||||
|
// if there are multiple named packages, return them without trying to merge headers.
|
||||||
|
// we cannot determine which nameless headers belong to which package
|
||||||
|
// this is because the order we receive the gguf headers in is not guaranteed
|
||||||
|
// to match the layer order in the original oci image
|
||||||
|
if len(namedPkgs) == 1 && len(namelessHeaders) > 0 {
|
||||||
|
winner := &namedPkgs[0]
|
||||||
|
if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok {
|
||||||
|
header.Parts = namelessHeaders
|
||||||
|
winner.Metadata = header
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return namedPkgs, rels, err
|
||||||
|
}
|
||||||
93
syft/pkg/cataloger/ai/huggingface.go
Normal file
93
syft/pkg/cataloger/ai/huggingface.go
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
|
||||||
|
"github.com/anchore/syft/internal/log"
|
||||||
|
"github.com/anchore/syft/syft/pkg"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hfConfig is a minimal projection of Hugging Face config.json fields: only the
// keys this package reads are decoded, everything else in the document is ignored.
|
||||||
|
type hfConfig struct {
|
||||||
|
// Architectures is decoded from "architectures"; applyHFConfig uses its first entry.
Architectures []string `json:"architectures"`
|
||||||
|
// NameOrPath is decoded from "_name_or_path" and is used as a model name candidate.
NameOrPath string `json:"_name_or_path"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
||||||
|
if md.Architecture == "" && len(cfg.Architectures) > 0 {
|
||||||
|
md.Architecture = cfg.Architectures[0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// readmeFrontmatter holds the subset of YAML frontmatter fields we extract
// from a model card; it is the result type of parseFrontmatter.
|
||||||
|
type readmeFrontmatter struct {
|
||||||
|
// License is the frontmatter "license" value, empty when absent.
License string `yaml:"license"`
|
||||||
|
// BaseModel lists the "base_model" entries; parseFrontmatter accepts either a
// single scalar or a sequence for that key.
BaseModel []string `yaml:"base_model"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// licenseFrontmatter is the decode target used by parseLicenseFrontmatter: the
// only frontmatter key read is "spdx-id".
type licenseFrontmatter struct {
|
||||||
|
// SPDXID is the declared "spdx-id" value, empty when the key is absent.
SPDXID string `yaml:"spdx-id"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractFrontmatterBlock returns the YAML bytes between the first and second
// "---" delimiters of a file.
//
// A leading UTF-8 byte order mark and leading whitespace are skipped before the
// opening delimiter is looked for. The rest of the opening delimiter line is
// discarded, and the block ends just before the first following line that
// starts with "---". The result is a sub-slice of buf, not a copy.
//
// It returns nil when buf does not start with a delimiter or when no closing
// delimiter line follows.
func extractFrontmatterBlock(buf []byte) []byte {
	fence := []byte("---")

	start := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	if !bytes.HasPrefix(start, fence) {
		return nil
	}

	// drop whatever remains of the opening delimiter line, if it is terminated
	body := start[len(fence):]
	if _, afterLine, terminated := bytes.Cut(body, []byte("\n")); terminated {
		body = afterLine
	}

	end := bytes.Index(body, []byte("\n---"))
	if end < 0 {
		return nil
	}
	return body[:end]
}
|
||||||
|
|
||||||
|
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
||||||
|
// and returns the license and base_model fields.
|
||||||
|
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
||||||
|
block := extractFrontmatterBlock(buf)
|
||||||
|
if block == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var raw struct {
|
||||||
|
License string `yaml:"license"`
|
||||||
|
BaseModel yaml.Node `yaml:"base_model"`
|
||||||
|
}
|
||||||
|
if err := yaml.Unmarshal(block, &raw); err != nil {
|
||||||
|
log.Debugf("failed to parse README frontmatter: %v", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
fm := readmeFrontmatter{License: raw.License}
|
||||||
|
switch raw.BaseModel.Kind {
|
||||||
|
case yaml.ScalarNode:
|
||||||
|
if raw.BaseModel.Value != "" {
|
||||||
|
fm.BaseModel = []string{raw.BaseModel.Value}
|
||||||
|
}
|
||||||
|
case yaml.SequenceNode:
|
||||||
|
_ = raw.BaseModel.Decode(&fm.BaseModel)
|
||||||
|
}
|
||||||
|
return &fm
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseLicenseFrontmatter returns the producer-declared SPDX identifier
|
||||||
|
func parseLicenseFrontmatter(buf []byte) string {
|
||||||
|
block := extractFrontmatterBlock(buf)
|
||||||
|
if block == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var fm licenseFrontmatter
|
||||||
|
if err := yaml.Unmarshal(block, &fm); err != nil {
|
||||||
|
log.Debugf("failed to parse license frontmatter: %v", err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fm.SPDXID
|
||||||
|
}
|
||||||
93
syft/pkg/cataloger/ai/identity_dir.go
Normal file
93
syft/pkg/cataloger/ai/identity_dir.go
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"path"
|
||||||
|
|
||||||
|
"github.com/anchore/syft/internal"
|
||||||
|
"github.com/anchore/syft/internal/log"
|
||||||
|
"github.com/anchore/syft/syft/file"
|
||||||
|
"github.com/anchore/syft/syft/pkg"
|
||||||
|
)
|
||||||
|
|
||||||
|
// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
|
||||||
|
// config.json beside the model files (walking up parent directories to the
|
||||||
|
// scanned source root if no sibling exists) and a sibling README.md. It returns
|
||||||
|
// the group's name candidates, resolved licenses, and supporting evidence.
|
||||||
|
func resolveSafeTensorsDirIdentity(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||||
|
id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
|
||||||
|
|
||||||
|
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
||||||
|
applyHFConfig(md, cfg)
|
||||||
|
id.nameOrPath = cfg.NameOrPath
|
||||||
|
id.supporting = append(id.supporting, *loc)
|
||||||
|
}
|
||||||
|
|
||||||
|
if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
|
||||||
|
if fm.License != "" {
|
||||||
|
id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, fm.License)
|
||||||
|
}
|
||||||
|
if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
|
||||||
|
id.nameOrPath = fm.BaseModel[0]
|
||||||
|
}
|
||||||
|
id.supporting = append(id.supporting, *loc)
|
||||||
|
}
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
// findDirHFConfig looks for a config.json beside the model files
|
||||||
|
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
||||||
|
for {
|
||||||
|
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
||||||
|
return loc, cfg
|
||||||
|
}
|
||||||
|
parent := path.Dir(dir)
|
||||||
|
if parent == dir {
|
||||||
|
return nil, nil // reached the source root
|
||||||
|
}
|
||||||
|
dir = parent
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
|
||||||
|
locations, err := resolver.FilesByPath(p)
|
||||||
|
if err != nil || len(locations) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
rc, err := resolver.FileContentsByLocation(locations[0])
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
defer internal.CloseAndLogError(rc, p)
|
||||||
|
|
||||||
|
var cfg hfConfig
|
||||||
|
if err := json.NewDecoder(rc).Decode(&cfg); err != nil {
|
||||||
|
log.Debugf("failed to decode %s: %v", p, err)
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return &locations[0], &cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
|
||||||
|
locations, err := resolver.FilesByPath(p)
|
||||||
|
if err != nil || len(locations) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
rc, err := resolver.FileContentsByLocation(locations[0])
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
defer internal.CloseAndLogError(rc, p)
|
||||||
|
|
||||||
|
buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024))
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
fm := parseFrontmatter(buf)
|
||||||
|
if fm == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return &locations[0], fm
|
||||||
|
}
|
||||||
177
syft/pkg/cataloger/ai/identity_oci.go
Normal file
177
syft/pkg/cataloger/ai/identity_oci.go
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"path"
|
||||||
|
|
||||||
|
gcrname "github.com/google/go-containerregistry/pkg/name"
|
||||||
|
|
||||||
|
"github.com/anchore/syft/internal"
|
||||||
|
"github.com/anchore/syft/internal/log"
|
||||||
|
"github.com/anchore/syft/syft/file"
|
||||||
|
"github.com/anchore/syft/syft/pkg"
|
||||||
|
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
|
||||||
|
)
|
||||||
|
|
||||||
|
// resolveSafeTensorsOCIIdentity handles the OCI-artifact case: the model's
|
||||||
|
// naming and license signals arrive as sibling layers (model.file companions
|
||||||
|
// carrying config.json / README, and dedicated license layers). It returns the
|
||||||
|
// group's name candidates, resolved licenses, and supporting evidence.
|
||||||
|
func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||||
|
ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
|
||||||
|
if !ok {
|
||||||
|
return safeTensorsIdentity{}
|
||||||
|
}
|
||||||
|
|
||||||
|
modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
|
||||||
|
if err != nil {
|
||||||
|
log.Debugf("failed to list docker AI model-file layers: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect config / readme candidates separately so the layer-iteration order
|
||||||
|
// returned by the resolver doesn't decide the precedence.
|
||||||
|
var configName, readmeName, readmeLicense string
|
||||||
|
var supporting []file.Location
|
||||||
|
for _, loc := range modelFileLocs {
|
||||||
|
if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
|
||||||
|
supporting = append(supporting, loc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Precedence: config.json _name_or_path > README base_model.
|
||||||
|
nameOrPath := configName
|
||||||
|
if nameOrPath == "" {
|
||||||
|
nameOrPath = readmeName
|
||||||
|
}
|
||||||
|
|
||||||
|
id := safeTensorsIdentity{
|
||||||
|
nameOrPath: nameOrPath,
|
||||||
|
fallbackName: ociImageRefBasename(resolver),
|
||||||
|
supporting: supporting,
|
||||||
|
}
|
||||||
|
|
||||||
|
// License precedence: a README model-card license wins over dedicated
|
||||||
|
// license layers (mirrors the dir-scan path, where README frontmatter is the
|
||||||
|
// license source).
|
||||||
|
switch {
|
||||||
|
case readmeLicense != "":
|
||||||
|
id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
|
||||||
|
default:
|
||||||
|
licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
|
||||||
|
if err != nil {
|
||||||
|
log.Debugf("failed to list docker AI license layers: %v", err)
|
||||||
|
}
|
||||||
|
if len(licLocs) > 0 {
|
||||||
|
id.licenses = identifyLicenseLayers(ctx, resolver, licLocs)
|
||||||
|
id.supporting = append(id.supporting, licLocs...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
// ociImageReferencer is the minimal capability ociImageRefBasename needs: a
|
||||||
|
// resolver that can surface the OCI image reference it was built from. It is
|
||||||
|
// kept local to this package (rather than exported from the file package) so the
|
||||||
|
// assertion stays with its only consumer.
|
||||||
|
type ociImageReferencer interface {
|
||||||
|
// ImageReference returns the image reference as a string; ociImageRefBasename
// treats an empty result as "no reference available".
ImageReference() string
|
||||||
|
}
|
||||||
|
|
||||||
|
func ociImageRefBasename(resolver file.Resolver) string {
|
||||||
|
// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
|
||||||
|
// (Ideally we should be NOT be type asserting on the file resolver directly).
|
||||||
|
info, ok := resolver.(ociImageReferencer)
|
||||||
|
if !ok {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
ref := info.ImageReference()
|
||||||
|
if ref == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
parsed, err := gcrname.ParseReference(ref)
|
||||||
|
if err != nil {
|
||||||
|
log.Debugf("failed to parse OCI ref %q: %v", ref, err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return path.Base(parsed.Context().RepositoryStr())
|
||||||
|
}
|
||||||
|
|
||||||
|
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
||||||
|
// classifies it as README frontmatter or HF config.json based on its leading bytes.
|
||||||
|
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
|
||||||
|
rc, err := resolver.FileContentsByLocation(loc)
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||||
|
|
||||||
|
buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
|
||||||
|
if err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
||||||
|
switch {
|
||||||
|
case bytes.HasPrefix(trimmed, []byte("---")):
|
||||||
|
fm := parseFrontmatter(buf)
|
||||||
|
if fm == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if *license == "" {
|
||||||
|
*license = fm.License
|
||||||
|
}
|
||||||
|
if *readmeName == "" && len(fm.BaseModel) > 0 {
|
||||||
|
*readmeName = fm.BaseModel[0]
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
case bytes.HasPrefix(trimmed, []byte("{")):
|
||||||
|
var cfg hfConfig
|
||||||
|
if err := json.Unmarshal(buf, &cfg); err != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
applyHFConfig(md, &cfg)
|
||||||
|
if *configName == "" && cfg.NameOrPath != "" {
|
||||||
|
*configName = cfg.NameOrPath
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// identifyLicenseLayers turns Docker AI license-layer locations into
|
||||||
|
// pkg.License values.
|
||||||
|
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
||||||
|
var out []pkg.License
|
||||||
|
var scanFallback []file.Location
|
||||||
|
for i := range locs {
|
||||||
|
loc := locs[i]
|
||||||
|
if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" {
|
||||||
|
out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
scanFallback = append(scanFallback, loc)
|
||||||
|
}
|
||||||
|
if len(scanFallback) > 0 {
|
||||||
|
out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
||||||
|
// blob and returns the spdx-id declared in its YAML frontmatter
|
||||||
|
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
||||||
|
rc, err := resolver.FileContentsByLocation(loc)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||||
|
|
||||||
|
buf, err := io.ReadAll(io.LimitReader(rc, 64*1024))
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return parseLicenseFrontmatter(buf)
|
||||||
|
}
|
||||||
140
syft/pkg/cataloger/ai/merge.go
Normal file
140
syft/pkg/cataloger/ai/merge.go
Normal file
@ -0,0 +1,140 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/cespare/xxhash/v2"
|
||||||
|
|
||||||
|
"github.com/anchore/syft/syft/file"
|
||||||
|
"github.com/anchore/syft/syft/pkg"
|
||||||
|
)
|
||||||
|
|
||||||
|
// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
|
||||||
|
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
||||||
|
locSet := unionLocations(members)
|
||||||
|
aggregates, shards := bucketSafeTensorsMembers(members)
|
||||||
|
|
||||||
|
merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
|
||||||
|
mergeAggregatesInto(&merged, aggregates)
|
||||||
|
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
|
||||||
|
|
||||||
|
// Keep merged UserMetadata globally key-sorted so the SBOM is stable
|
||||||
|
sort.Slice(merged.UserMetadata, func(i, j int) bool {
|
||||||
|
return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
|
||||||
|
})
|
||||||
|
|
||||||
|
if merged.TensorCount == 0 {
|
||||||
|
merged.TensorCount = shardTensorTotal
|
||||||
|
}
|
||||||
|
if merged.ShardCount == 0 {
|
||||||
|
if len(shards) > 0 {
|
||||||
|
merged.ShardCount = len(shards)
|
||||||
|
} else {
|
||||||
|
merged.ShardCount = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
merged.MetadataHash = rollupHash(hashes)
|
||||||
|
|
||||||
|
// Parts only carry value for multi-shard models; for a single shard the
|
||||||
|
// outer view already exposes every per-shard field.
|
||||||
|
if len(shards) > 1 {
|
||||||
|
parts := append([]pkg.SafeTensorsModelInfo(nil), shards...)
|
||||||
|
sort.Slice(parts, func(i, j int) bool {
|
||||||
|
return parts[i].MetadataHash < parts[j].MetadataHash
|
||||||
|
})
|
||||||
|
merged.Parts = parts
|
||||||
|
}
|
||||||
|
|
||||||
|
return pkg.Package{
|
||||||
|
Locations: locSet,
|
||||||
|
Type: pkg.ModelPkg,
|
||||||
|
Metadata: merged,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
|
||||||
|
for _, a := range aggregates {
|
||||||
|
if merged.TensorCount == 0 {
|
||||||
|
merged.TensorCount = a.TensorCount
|
||||||
|
}
|
||||||
|
if merged.ShardCount == 0 {
|
||||||
|
merged.ShardCount = a.ShardCount
|
||||||
|
}
|
||||||
|
firstNonEmpty(&merged.Parameters, a.Parameters)
|
||||||
|
firstNonEmpty(&merged.TotalSize, a.TotalSize)
|
||||||
|
firstNonEmpty(&merged.Quantization, a.Quantization)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
||||||
|
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
||||||
|
// the rollup. Shards carry only the content-derived fields (Quantization,
|
||||||
|
// Parameters, UserMetadata);
|
||||||
|
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
||||||
|
seenKV := map[string]bool{}
|
||||||
|
for _, s := range shards {
|
||||||
|
shardTensorTotal += s.TensorCount
|
||||||
|
firstNonEmpty(&merged.Quantization, s.Quantization)
|
||||||
|
firstNonEmpty(&merged.Parameters, s.Parameters)
|
||||||
|
for _, kv := range s.UserMetadata {
|
||||||
|
if seenKV[kv.Key] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seenKV[kv.Key] = true
|
||||||
|
merged.UserMetadata = append(merged.UserMetadata, kv)
|
||||||
|
}
|
||||||
|
if s.MetadataHash != "" {
|
||||||
|
hashes = append(hashes, s.MetadataHash)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return shardTensorTotal, hashes
|
||||||
|
}
|
||||||
|
|
||||||
|
// firstNonEmpty stores v in *dst only while *dst is still the empty string, so
// that across repeated calls the first non-empty value sticks.
func firstNonEmpty(dst *string, v string) {
	if *dst != "" {
		return
	}
	*dst = v
}
|
||||||
|
|
||||||
|
// unionLocations gathers every location from every member into a single set.
|
||||||
|
func unionLocations(members []pkg.Package) file.LocationSet {
|
||||||
|
out := file.NewLocationSet()
|
||||||
|
for _, m := range members {
|
||||||
|
for _, l := range m.Locations.ToSlice() {
|
||||||
|
out.Add(l)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// bucketSafeTensorsMembers splits group members into aggregate-flavored entries
|
||||||
|
// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored
|
||||||
|
// entries (carry a content-derived MetadataHash from a header parser).
|
||||||
|
func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
|
||||||
|
for _, m := range members {
|
||||||
|
md, ok := m.Metadata.(pkg.SafeTensorsModelInfo)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if md.MetadataHash != "" {
|
||||||
|
shards = append(shards, md)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
aggregates = append(aggregates, md)
|
||||||
|
}
|
||||||
|
return aggregates, shards
|
||||||
|
}
|
||||||
|
|
||||||
|
func rollupHash(hashes []string) string {
|
||||||
|
if len(hashes) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if len(hashes) == 1 {
|
||||||
|
return hashes[0]
|
||||||
|
}
|
||||||
|
sorted := append([]string(nil), hashes...)
|
||||||
|
sort.Strings(sorted)
|
||||||
|
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
|
||||||
|
}
|
||||||
25
syft/pkg/cataloger/ai/naming.go
Normal file
25
syft/pkg/cataloger/ai/naming.go
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
package ai
|
||||||
|
|
||||||
|
import "path"
|
||||||
|
|
||||||
|
// pickSafeTensorsName implements the documented naming precedence chain:
//   - nameOrPath — the identity candidate resolved for the group (config.json
//     _name_or_path, else README base_model), reduced with path.Base so that
//     "org/Model" → "Model"; applies to both dir-scan and OCI groups
//   - fallback name — the group's source-specific positional identifier
//
// A nameOrPath that carries no usable base name ("./", ".", "/", "..", as
// written by tools that record a relative checkpoint directory) is skipped in
// favor of the fallback instead of producing a package literally named "." or
// "/".
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath != "" {
		switch base := path.Base(nameOrPath); base {
		case "/", ".", "..":
			// a directory placeholder, not a name: use the fallback
		default:
			return base
		}
	}
	return fallbackName
}
|
||||||
|
|
||||||
|
// safeTensorsDirName returns the directory-scan naming fallback: the base name
// of the group's parent directory (the group key is already that directory).
// Keys whose base is the root or the current directory ("/", ".", and the
// empty key) carry no name and yield "".
func safeTensorsDirName(groupKey string) string {
	if name := path.Base(groupKey); name != "/" && name != "." && name != "" {
		return name
	}
	return ""
}
|
||||||
@ -1,159 +1,122 @@
|
|||||||
package ai
|
package ai
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"io"
|
|
||||||
"path"
|
"path"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/cespare/xxhash/v2"
|
|
||||||
gcrname "github.com/google/go-containerregistry/pkg/name"
|
|
||||||
"gopkg.in/yaml.v3"
|
|
||||||
|
|
||||||
"github.com/anchore/syft/internal"
|
|
||||||
"github.com/anchore/syft/internal/log"
|
"github.com/anchore/syft/internal/log"
|
||||||
"github.com/anchore/syft/syft/artifact"
|
"github.com/anchore/syft/syft/artifact"
|
||||||
"github.com/anchore/syft/syft/file"
|
"github.com/anchore/syft/syft/file"
|
||||||
"github.com/anchore/syft/syft/pkg"
|
"github.com/anchore/syft/syft/pkg"
|
||||||
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// ociGroupKey is the grouping key for every safetensors package that
|
// safeTensorsMergeProcessor owns naming, license resolution, and final package
|
||||||
// originated from an OCI model artifact. The ContainerImageModel resolver gives
|
// assembly. SafeTensors packages reach it nameless from the parsers; it groups
|
||||||
// each layer the virtual RealPath "/" regardless of layer media type, so all
|
// them per model, merges the per-shard metadata, resolves a name + licenses, and
|
||||||
// safetensors packages from a single OCI scan collapse into one group.
|
// drops any model it cannot name.
|
||||||
const ociGroupKey = "@oci@"
|
//
|
||||||
|
// There are exactly two sources, each handled by its own path:
|
||||||
// ggufMergeProcessor consolidates multiple GGUF packages into a single package
|
// - an OCI model artifact, where the source presents every layer at the
|
||||||
// representing the AI model. When scanning OCI images with multiple layers,
|
// virtual path "/" and the whole scan is a single model (mergeOCIModel)
|
||||||
// each layer may produce a separate package. This processor finds the package
|
// - a filesystem scan, where models are grouped by the directory their files
|
||||||
// with a name and merges metadata from nameless packages into its GGUFFileParts field.
|
// live in (mergeDirModels)
|
||||||
// Only packages with a non-empty name are returned in the final result.
|
|
||||||
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
|
||||||
if err != nil {
|
|
||||||
return pkgs, rels, err
|
|
||||||
}
|
|
||||||
if len(pkgs) == 0 {
|
|
||||||
return pkgs, rels, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Separate packages with names from those without
|
|
||||||
var namedPkgs []pkg.Package
|
|
||||||
var namelessHeaders []pkg.GGUFFileHeader
|
|
||||||
|
|
||||||
for _, p := range pkgs {
|
|
||||||
if p.Name != "" {
|
|
||||||
namedPkgs = append(namedPkgs, p)
|
|
||||||
} else {
|
|
||||||
if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok {
|
|
||||||
// We do not want a kv hash for nameless headers
|
|
||||||
header.MetadataKeyValuesHash = ""
|
|
||||||
namelessHeaders = append(namelessHeaders, header)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If there are no named packages, return nothing
|
|
||||||
if len(namedPkgs) == 0 {
|
|
||||||
return nil, rels, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// merge nameless headers into a single named package;
|
|
||||||
// if there are multiple named packages, return them without trying to merge headers.
|
|
||||||
// we cannot determine which nameless headers belong to which package
|
|
||||||
// this is because the order we receive the gguf headers in is not guaranteed
|
|
||||||
// to match the layer order in the original oci image
|
|
||||||
if len(namedPkgs) == 1 && len(namelessHeaders) > 0 {
|
|
||||||
winner := &namedPkgs[0]
|
|
||||||
if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok {
|
|
||||||
header.Parts = namelessHeaders
|
|
||||||
winner.Metadata = header
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return namedPkgs, rels, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation
|
|
||||||
// - groups all nameless packages
|
|
||||||
// - merge the per-shard metadata
|
|
||||||
// - picks a name (see pickSafeTensorsName)
|
|
||||||
func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
||||||
if err != nil || len(pkgs) == 0 {
|
if err != nil || len(pkgs) == 0 {
|
||||||
return pkgs, rels, err
|
return pkgs, rels, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// split off non-safetensors packages
|
// keep the processor robust if non-safetensors packages ever flow through
|
||||||
// this keeps the processor robust if other types ever flow through
|
stPkgs, other := partitionSafeTensorsPackages(pkgs)
|
||||||
var stPkgs, other []pkg.Package
|
|
||||||
for _, p := range pkgs {
|
|
||||||
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
|
|
||||||
stPkgs = append(stPkgs, p)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
other = append(other, p)
|
|
||||||
}
|
|
||||||
if len(stPkgs) == 0 {
|
if len(stPkgs) == 0 {
|
||||||
return pkgs, rels, err
|
return pkgs, rels, err
|
||||||
}
|
}
|
||||||
|
|
||||||
groups := groupSafeTensorsPackages(stPkgs)
|
if fromOCIArtifact(stPkgs) {
|
||||||
|
return append(other, mergeOCIModel(ctx, resolver, stPkgs)...), rels, nil
|
||||||
// Deterministic iteration order so the SBOM doesn't depend on map order.
|
|
||||||
keys := make([]string, 0, len(groups))
|
|
||||||
for k := range groups {
|
|
||||||
keys = append(keys, k)
|
|
||||||
}
|
}
|
||||||
sort.Strings(keys)
|
return append(other, mergeDirModels(ctx, resolver, stPkgs)...), rels, nil
|
||||||
|
|
||||||
out := other
|
|
||||||
for _, key := range keys {
|
|
||||||
merged := mergeSafeTensorsGroup(groups[key])
|
|
||||||
|
|
||||||
// Resolve model identity (name candidates) before enrich
|
|
||||||
id := resolveSafeTensorsIdentity(resolver, key, &merged)
|
|
||||||
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
|
|
||||||
if name == "" {
|
|
||||||
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
|
|
||||||
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
enrichSafeTensorsGroup(ctx, resolver, key, &merged, id)
|
|
||||||
merged.Name = name
|
|
||||||
merged.SetID()
|
|
||||||
out = append(out, merged)
|
|
||||||
}
|
|
||||||
return out, rels, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// groupSafeTensorsPackages buckets packages by the parent directory of their
|
// partitionSafeTensorsPackages separates safetensors packages from anything else
|
||||||
// primary-evidence location
|
// flowing through the processor.
|
||||||
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
|
func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg.Package) {
|
||||||
out := make(map[string][]pkg.Package)
|
|
||||||
for _, p := range pkgs {
|
for _, p := range pkgs {
|
||||||
key := safeTensorsGroupKey(p)
|
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
|
||||||
if key == "" {
|
safeTensors = append(safeTensors, p)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
out[key] = append(out[key], p)
|
other = append(other, p)
|
||||||
|
}
|
||||||
|
return safeTensors, other
|
||||||
|
}
|
||||||
|
|
||||||
|
// fromOCIArtifact reports whether the packages came from an OCI model artifact.
|
||||||
|
// That source presents every layer at the virtual path "/", whereas a filesystem
|
||||||
|
// scan always carries a real file path. A single scan is one source, so the
|
||||||
|
// first package is representative of the rest.
|
||||||
|
func fromOCIArtifact(pkgs []pkg.Package) bool {
|
||||||
|
loc := primaryEvidenceLocation(pkgs[0])
|
||||||
|
return loc != nil && loc.RealPath == "/"
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeOCIModel treats the whole OCI artifact as a single model: every layer
|
||||||
|
// merges into one package, named from the artifact's config.json/README or its
|
||||||
|
// image reference.
|
||||||
|
func mergeOCIModel(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
|
||||||
|
merged := mergeSafeTensorsGroup(pkgs)
|
||||||
|
|
||||||
|
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||||
|
id := resolveSafeTensorsOCIIdentity(ctx, resolver, &md)
|
||||||
|
merged.Metadata = md // write architecture enrichment back before assembly
|
||||||
|
|
||||||
|
if p, ok := assembleSafeTensorsPackage(merged, id); ok {
|
||||||
|
return []pkg.Package{p}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeDirModels groups filesystem-scanned files by their parent directory and
|
||||||
|
// emits one model per directory, named from a sibling config.json/README or the
|
||||||
|
// directory itself.
|
||||||
|
func mergeDirModels(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
|
||||||
|
groups := groupByParentDir(pkgs)
|
||||||
|
|
||||||
|
// deterministic iteration order so the SBOM doesn't depend on map order
|
||||||
|
dirs := make([]string, 0, len(groups))
|
||||||
|
for dir := range groups {
|
||||||
|
dirs = append(dirs, dir)
|
||||||
|
}
|
||||||
|
sort.Strings(dirs)
|
||||||
|
|
||||||
|
var out []pkg.Package
|
||||||
|
for _, dir := range dirs {
|
||||||
|
merged := mergeSafeTensorsGroup(groups[dir])
|
||||||
|
|
||||||
|
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||||
|
id := resolveSafeTensorsDirIdentity(ctx, resolver, dir, &md)
|
||||||
|
merged.Metadata = md // write architecture enrichment back before assembly
|
||||||
|
|
||||||
|
if p, ok := assembleSafeTensorsPackage(merged, id); ok {
|
||||||
|
out = append(out, p)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
func safeTensorsGroupKey(p pkg.Package) string {
|
// groupByParentDir buckets filesystem-scanned packages by the directory their
|
||||||
loc := primaryEvidenceLocation(p)
|
// primary-evidence file lives in (the shards of one model share a directory).
|
||||||
if loc == nil {
|
func groupByParentDir(pkgs []pkg.Package) map[string][]pkg.Package {
|
||||||
return ""
|
out := make(map[string][]pkg.Package)
|
||||||
|
for _, p := range pkgs {
|
||||||
|
loc := primaryEvidenceLocation(p)
|
||||||
|
if loc == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
dir := path.Dir(loc.RealPath)
|
||||||
|
out[dir] = append(out[dir], p)
|
||||||
}
|
}
|
||||||
if loc.RealPath == "/" {
|
return out
|
||||||
return ociGroupKey
|
|
||||||
}
|
|
||||||
return path.Dir(loc.RealPath)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func primaryEvidenceLocation(p pkg.Package) *file.Location {
|
func primaryEvidenceLocation(p pkg.Package) *file.Location {
|
||||||
@ -169,501 +132,34 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
|
// safeTensorsIdentity is the fully-resolved naming/license result for a model.
|
||||||
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
// Each source resolver (dir or OCI) populates it so assembly stays source-agnostic.
|
||||||
locSet := unionLocations(members)
|
|
||||||
aggregates, shards := bucketSafeTensorsMembers(members)
|
|
||||||
|
|
||||||
merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
|
|
||||||
mergeAggregatesInto(&merged, aggregates)
|
|
||||||
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
|
|
||||||
|
|
||||||
// Keep merged UserMetadata globally key-sorted so the SBOM is stable
|
|
||||||
sort.Slice(merged.UserMetadata, func(i, j int) bool {
|
|
||||||
return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
|
|
||||||
})
|
|
||||||
|
|
||||||
if merged.TensorCount == 0 {
|
|
||||||
merged.TensorCount = shardTensorTotal
|
|
||||||
}
|
|
||||||
if merged.ShardCount == 0 {
|
|
||||||
if len(shards) > 0 {
|
|
||||||
merged.ShardCount = len(shards)
|
|
||||||
} else {
|
|
||||||
merged.ShardCount = 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
merged.MetadataHash = rollupHash(hashes)
|
|
||||||
|
|
||||||
// Parts only carry value for multi-shard models; for a single shard the
|
|
||||||
// outer view already exposes every per-shard field.
|
|
||||||
if len(shards) > 1 {
|
|
||||||
parts := append([]pkg.SafeTensorsModelInfo(nil), shards...)
|
|
||||||
sort.Slice(parts, func(i, j int) bool {
|
|
||||||
return parts[i].MetadataHash < parts[j].MetadataHash
|
|
||||||
})
|
|
||||||
merged.Parts = parts
|
|
||||||
}
|
|
||||||
|
|
||||||
return pkg.Package{
|
|
||||||
Locations: locSet,
|
|
||||||
Type: pkg.ModelPkg,
|
|
||||||
Metadata: merged,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
|
|
||||||
for _, a := range aggregates {
|
|
||||||
if merged.TensorCount == 0 {
|
|
||||||
merged.TensorCount = a.TensorCount
|
|
||||||
}
|
|
||||||
if merged.ShardCount == 0 {
|
|
||||||
merged.ShardCount = a.ShardCount
|
|
||||||
}
|
|
||||||
firstNonEmpty(&merged.Parameters, a.Parameters)
|
|
||||||
firstNonEmpty(&merged.TotalSize, a.TotalSize)
|
|
||||||
firstNonEmpty(&merged.Quantization, a.Quantization)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
|
||||||
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
|
||||||
// the rollup. Shards carry only the content-derived fields (Quantization,
|
|
||||||
// Parameters, UserMetadata);
|
|
||||||
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
|
||||||
seenKV := map[string]bool{}
|
|
||||||
for _, s := range shards {
|
|
||||||
shardTensorTotal += s.TensorCount
|
|
||||||
firstNonEmpty(&merged.Quantization, s.Quantization)
|
|
||||||
firstNonEmpty(&merged.Parameters, s.Parameters)
|
|
||||||
for _, kv := range s.UserMetadata {
|
|
||||||
if seenKV[kv.Key] {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
seenKV[kv.Key] = true
|
|
||||||
merged.UserMetadata = append(merged.UserMetadata, kv)
|
|
||||||
}
|
|
||||||
if s.MetadataHash != "" {
|
|
||||||
hashes = append(hashes, s.MetadataHash)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return shardTensorTotal, hashes
|
|
||||||
}
|
|
||||||
|
|
||||||
// firstNonEmpty stores v in *dst only when *dst is still empty, so the first
// value written wins and later calls leave it untouched.
func firstNonEmpty(dst *string, v string) {
	if *dst != "" {
		return
	}
	*dst = v
}
|
|
||||||
|
|
||||||
// unionLocations gathers every location from every member into a single set.
|
|
||||||
func unionLocations(members []pkg.Package) file.LocationSet {
|
|
||||||
out := file.NewLocationSet()
|
|
||||||
for _, m := range members {
|
|
||||||
for _, l := range m.Locations.ToSlice() {
|
|
||||||
out.Add(l)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// bucketSafeTensorsMembers splits group members into aggregate-flavored entries
|
|
||||||
// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored
|
|
||||||
// entries (carry a content-derived MetadataHash from a header parser).
|
|
||||||
func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
|
|
||||||
for _, m := range members {
|
|
||||||
md, ok := m.Metadata.(pkg.SafeTensorsModelInfo)
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if md.MetadataHash != "" {
|
|
||||||
shards = append(shards, md)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
aggregates = append(aggregates, md)
|
|
||||||
}
|
|
||||||
return aggregates, shards
|
|
||||||
}
|
|
||||||
|
|
||||||
func rollupHash(hashes []string) string {
|
|
||||||
if len(hashes) == 0 {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
if len(hashes) == 1 {
|
|
||||||
return hashes[0]
|
|
||||||
}
|
|
||||||
sorted := append([]string(nil), hashes...)
|
|
||||||
sort.Strings(sorted)
|
|
||||||
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
|
|
||||||
}
|
|
||||||
|
|
||||||
type safeTensorsIdentity struct {
|
type safeTensorsIdentity struct {
|
||||||
nameOrPath string
|
nameOrPath string
|
||||||
fallbackName string
|
fallbackName string
|
||||||
readmeLicense string
|
licenses []pkg.License
|
||||||
supporting []file.Location
|
supporting []file.Location
|
||||||
}
|
}
|
||||||
|
|
||||||
// resolveSafeTensorsIdentity reads the resolver for the group's naming signals
|
// assembleSafeTensorsPackage finalizes a merged model from its resolved identity:
|
||||||
// (config.json _name_or_path, README base_model, OCI image ref / dir name)
|
// it picks the name, attaches licenses and supporting evidence, and sets the ID.
|
||||||
func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity {
|
// A model with no name source is dropped (ok=false).
|
||||||
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
func assembleSafeTensorsPackage(merged pkg.Package, id safeTensorsIdentity) (pkg.Package, bool) {
|
||||||
|
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
|
||||||
var id safeTensorsIdentity
|
if name == "" {
|
||||||
if groupKey == ociGroupKey {
|
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
|
||||||
id = resolveSafeTensorsOCIIdentity(resolver, &md)
|
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
|
||||||
} else {
|
return pkg.Package{}, false
|
||||||
id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
merged.Metadata = md
|
if len(id.licenses) > 0 {
|
||||||
return id
|
merged.Licenses = pkg.NewLicenseSet(id.licenses...)
|
||||||
}
|
|
||||||
|
|
||||||
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) {
|
|
||||||
var lics []pkg.License
|
|
||||||
supporting := id.supporting
|
|
||||||
|
|
||||||
switch {
|
|
||||||
case id.readmeLicense != "":
|
|
||||||
lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense)
|
|
||||||
case groupKey == ociGroupKey:
|
|
||||||
if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok {
|
|
||||||
licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
|
|
||||||
if err != nil {
|
|
||||||
log.Debugf("failed to list docker AI license layers: %v", err)
|
|
||||||
}
|
|
||||||
if len(licLocs) > 0 {
|
|
||||||
lics = identifyLicenseLayers(ctx, resolver, licLocs)
|
|
||||||
supporting = append(supporting, licLocs...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
for _, loc := range id.supporting {
|
||||||
if len(lics) > 0 {
|
|
||||||
merged.Licenses = pkg.NewLicenseSet(lics...)
|
|
||||||
}
|
|
||||||
for _, loc := range supporting {
|
|
||||||
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
|
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
merged.Name = name
|
||||||
// safeTensorsDirName returns the directory-scan naming fallback: the base name
|
merged.SetID()
|
||||||
// of the group's parent directory (the group key is already that directory).
|
return merged, true
|
||||||
func safeTensorsDirName(groupKey string) string {
	// path.Base yields "/" for an all-slash path and "." for an empty one;
	// neither is a usable model name, so report "no name" instead.
	if name := path.Base(groupKey); name != "/" && name != "." && name != "" {
		return name
	}
	return ""
}
|
|
||||||
|
|
||||||
// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
|
|
||||||
// config.json beside the model files (walking up parent directories to the
|
|
||||||
// scanned source root if no sibling exists) and a sibling README.md
|
|
||||||
func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
|
||||||
id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
|
|
||||||
|
|
||||||
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
|
||||||
applyHFConfig(md, cfg)
|
|
||||||
id.nameOrPath = cfg.NameOrPath
|
|
||||||
id.supporting = append(id.supporting, *loc)
|
|
||||||
}
|
|
||||||
|
|
||||||
if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
|
|
||||||
id.readmeLicense = fm.License
|
|
||||||
if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
|
|
||||||
id.nameOrPath = fm.BaseModel[0]
|
|
||||||
}
|
|
||||||
id.supporting = append(id.supporting, *loc)
|
|
||||||
}
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
|
||||||
func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
|
||||||
ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
|
|
||||||
if !ok {
|
|
||||||
return safeTensorsIdentity{}
|
|
||||||
}
|
|
||||||
|
|
||||||
modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
|
|
||||||
if err != nil {
|
|
||||||
log.Debugf("failed to list docker AI model-file layers: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect config / readme candidates separately so the layer-iteration order
|
|
||||||
// returned by the resolver doesn't decide the precedence.
|
|
||||||
var configName, readmeName, readmeLicense string
|
|
||||||
var supporting []file.Location
|
|
||||||
for _, loc := range modelFileLocs {
|
|
||||||
if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
|
|
||||||
supporting = append(supporting, loc)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Precedence: config.json _name_or_path > README base_model.
|
|
||||||
nameOrPath := configName
|
|
||||||
if nameOrPath == "" {
|
|
||||||
nameOrPath = readmeName
|
|
||||||
}
|
|
||||||
|
|
||||||
return safeTensorsIdentity{
|
|
||||||
nameOrPath: nameOrPath,
|
|
||||||
fallbackName: ociImageRefBasename(resolver),
|
|
||||||
readmeLicense: readmeLicense,
|
|
||||||
supporting: supporting,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ociImageReferencer is the minimal capability ociImageRefBasename needs: a
// resolver that can surface the OCI image reference it was built from. It is
// kept local to this package (rather than exported from the file package) so the
// assertion stays with its only consumer.
type ociImageReferencer interface {
	// ImageReference returns the image reference string; ociImageRefBasename
	// treats an empty result as "no reference available".
	ImageReference() string
}
|
|
||||||
|
|
||||||
func ociImageRefBasename(resolver file.Resolver) string {
|
|
||||||
// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
|
|
||||||
// (Ideally we should be NOT be type asserting on the file resolver directly).
|
|
||||||
info, ok := resolver.(ociImageReferencer)
|
|
||||||
if !ok {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
ref := info.ImageReference()
|
|
||||||
if ref == "" {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
parsed, err := gcrname.ParseReference(ref)
|
|
||||||
if err != nil {
|
|
||||||
log.Debugf("failed to parse OCI ref %q: %v", ref, err)
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return path.Base(parsed.Context().RepositoryStr())
|
|
||||||
}
|
|
||||||
|
|
||||||
// identifyLicenseLayers turns Docker AI license-layer locations into
|
|
||||||
// pkg.License values.
|
|
||||||
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
|
||||||
var out []pkg.License
|
|
||||||
var scanFallback []file.Location
|
|
||||||
for i := range locs {
|
|
||||||
loc := locs[i]
|
|
||||||
if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" {
|
|
||||||
out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
scanFallback = append(scanFallback, loc)
|
|
||||||
}
|
|
||||||
if len(scanFallback) > 0 {
|
|
||||||
out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...)
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
|
||||||
// blob and returns the spdx-id declared in its YAML frontmatter
|
|
||||||
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
|
||||||
rc, err := resolver.FileContentsByLocation(loc)
|
|
||||||
if err != nil {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
defer internal.CloseAndLogError(rc, loc.RealPath)
|
|
||||||
|
|
||||||
buf, err := io.ReadAll(io.LimitReader(rc, 64*1024))
|
|
||||||
if err != nil {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return parseLicenseFrontmatter(buf)
|
|
||||||
}
|
|
||||||
|
|
||||||
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
|
||||||
// classifies it as README frontmatter or HF config.json based on its leading bytes.
|
|
||||||
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
|
|
||||||
rc, err := resolver.FileContentsByLocation(loc)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
defer internal.CloseAndLogError(rc, loc.RealPath)
|
|
||||||
|
|
||||||
buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
|
||||||
switch {
|
|
||||||
case bytes.HasPrefix(trimmed, []byte("---")):
|
|
||||||
fm := parseFrontmatter(buf)
|
|
||||||
if fm == nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
if *license == "" {
|
|
||||||
*license = fm.License
|
|
||||||
}
|
|
||||||
if *readmeName == "" && len(fm.BaseModel) > 0 {
|
|
||||||
*readmeName = fm.BaseModel[0]
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
case bytes.HasPrefix(trimmed, []byte("{")):
|
|
||||||
var cfg hfConfig
|
|
||||||
if err := json.Unmarshal(buf, &cfg); err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
applyHFConfig(md, &cfg)
|
|
||||||
if *configName == "" && cfg.NameOrPath != "" {
|
|
||||||
*configName = cfg.NameOrPath
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
|
||||||
if md.Architecture == "" && len(cfg.Architectures) > 0 {
|
|
||||||
md.Architecture = cfg.Architectures[0]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// pickSafeTensorsName implements the documented naming precedence chain:
//   - config.json _name_or_path (path.Base, so "org/Model" → "Model";
//     applies to both dir-scan and OCI groups)
//   - fallback name — the group's source-specific positional identifier
//
// A nameOrPath whose base is not a real name (".", "..", or "/", e.g. from a
// "_name_or_path" of "./" or "/") is ignored, so the fallback name is used
// rather than emitting a package literally named "." or "/".
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath != "" {
		switch base := path.Base(nameOrPath); base {
		case ".", "..", "/":
			// purely positional path; carries no model name
		default:
			return base
		}
	}
	return fallbackName
}
|
|
||||||
|
|
||||||
// hfConfig is a minimal projection of Hugging Face config.json fields.
type hfConfig struct {
	// Architectures is the "architectures" list; applyHFConfig uses only its
	// first entry.
	Architectures []string `json:"architectures"`
	// NameOrPath is the "_name_or_path" value, used as the preferred source of
	// the model name.
	NameOrPath string `json:"_name_or_path"`
}
|
|
||||||
|
|
||||||
// readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
type readmeFrontmatter struct {
	// License is the frontmatter "license" value.
	License string `yaml:"license"`
	// BaseModel is the frontmatter "base_model" value as a list;
	// parseFrontmatter wraps a single scalar value into a one-element slice.
	BaseModel []string `yaml:"base_model"`
}
|
|
||||||
|
|
||||||
// findDirHFConfig looks for a config.json beside the model files
|
|
||||||
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
|
||||||
for {
|
|
||||||
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
|
||||||
return loc, cfg
|
|
||||||
}
|
|
||||||
parent := path.Dir(dir)
|
|
||||||
if parent == dir {
|
|
||||||
return nil, nil // reached the source root
|
|
||||||
}
|
|
||||||
dir = parent
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
|
|
||||||
locations, err := resolver.FilesByPath(p)
|
|
||||||
if err != nil || len(locations) == 0 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
rc, err := resolver.FileContentsByLocation(locations[0])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
defer internal.CloseAndLogError(rc, p)
|
|
||||||
|
|
||||||
var cfg hfConfig
|
|
||||||
if err := json.NewDecoder(rc).Decode(&cfg); err != nil {
|
|
||||||
log.Debugf("failed to decode %s: %v", p, err)
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
return &locations[0], &cfg
|
|
||||||
}
|
|
||||||
|
|
||||||
func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
|
|
||||||
locations, err := resolver.FilesByPath(p)
|
|
||||||
if err != nil || len(locations) == 0 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
rc, err := resolver.FileContentsByLocation(locations[0])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
defer internal.CloseAndLogError(rc, p)
|
|
||||||
|
|
||||||
buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024))
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
fm := parseFrontmatter(buf)
|
|
||||||
if fm == nil {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
return &locations[0], fm
|
|
||||||
}
|
|
||||||
|
|
||||||
// extractFrontmatterBlock returns the YAML bytes enclosed by a document's
// leading frontmatter delimiters. After skipping a UTF-8 BOM and leading
// whitespace, the content must start with "---"; the rest of that opening line
// is discarded, and the block runs up to (not including) the next line that
// begins with "---". It returns nil when there is no opening delimiter or no
// closing one.
func extractFrontmatterBlock(buf []byte) []byte {
	content := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	if !bytes.HasPrefix(content, []byte("---")) {
		return nil
	}

	// Drop the remainder of the opening delimiter line. Without a newline
	// there can be no closing "\n---" either, so there is no block.
	_, body, hasNewline := bytes.Cut(content[3:], []byte("\n"))
	if !hasNewline {
		return nil
	}

	end := bytes.Index(body, []byte("\n---"))
	if end < 0 {
		return nil
	}
	return body[:end]
}
|
|
||||||
|
|
||||||
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
|
||||||
// and returns the license and base_model fields.
|
|
||||||
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
|
||||||
block := extractFrontmatterBlock(buf)
|
|
||||||
if block == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var raw struct {
|
|
||||||
License string `yaml:"license"`
|
|
||||||
BaseModel yaml.Node `yaml:"base_model"`
|
|
||||||
}
|
|
||||||
if err := yaml.Unmarshal(block, &raw); err != nil {
|
|
||||||
log.Debugf("failed to parse README frontmatter: %v", err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
fm := readmeFrontmatter{License: raw.License}
|
|
||||||
switch raw.BaseModel.Kind {
|
|
||||||
case yaml.ScalarNode:
|
|
||||||
if raw.BaseModel.Value != "" {
|
|
||||||
fm.BaseModel = []string{raw.BaseModel.Value}
|
|
||||||
}
|
|
||||||
case yaml.SequenceNode:
|
|
||||||
_ = raw.BaseModel.Decode(&fm.BaseModel)
|
|
||||||
}
|
|
||||||
return &fm
|
|
||||||
}
|
|
||||||
|
|
||||||
// licenseFrontmatter is the subset of a license file's YAML frontmatter that
// parseLicenseFrontmatter reads.
type licenseFrontmatter struct {
	// SPDXID is the frontmatter "spdx-id" value.
	SPDXID string `yaml:"spdx-id"`
}
|
|
||||||
|
|
||||||
// parseLicenseFrontmatter returns the producer-declared SPDX identifier
|
|
||||||
func parseLicenseFrontmatter(buf []byte) string {
|
|
||||||
block := extractFrontmatterBlock(buf)
|
|
||||||
if block == nil {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
var fm licenseFrontmatter
|
|
||||||
if err := yaml.Unmarshal(block, &fm); err != nil {
|
|
||||||
log.Debugf("failed to parse license frontmatter: %v", err)
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return fm.SPDXID
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user