mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
pr: first pass refactor
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
dd179eb8a7
commit
fe392a490b
56
syft/pkg/cataloger/ai/gguf_processor.go
Normal file
56
syft/pkg/cataloger/ai/gguf_processor.go
Normal file
@ -0,0 +1,56 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"github.com/anchore/syft/syft/artifact"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
)
|
||||
|
||||
// ggufMergeProcessor consolidates multiple GGUF packages into a single package
|
||||
// representing the AI model. When scanning OCI images with multiple layers,
|
||||
// each layer may produce a separate package. This processor finds the package
|
||||
// with a name and merges metadata from nameless packages into its GGUFFileParts field.
|
||||
// Only packages with a non-empty name are returned in the final result.
|
||||
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
if err != nil {
|
||||
return pkgs, rels, err
|
||||
}
|
||||
if len(pkgs) == 0 {
|
||||
return pkgs, rels, err
|
||||
}
|
||||
|
||||
// Separate packages with names from those without
|
||||
var namedPkgs []pkg.Package
|
||||
var namelessHeaders []pkg.GGUFFileHeader
|
||||
|
||||
for _, p := range pkgs {
|
||||
if p.Name != "" {
|
||||
namedPkgs = append(namedPkgs, p)
|
||||
} else {
|
||||
if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok {
|
||||
// We do not want a kv hash for nameless headers
|
||||
header.MetadataKeyValuesHash = ""
|
||||
namelessHeaders = append(namelessHeaders, header)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are no named packages, return nothing
|
||||
if len(namedPkgs) == 0 {
|
||||
return nil, rels, err
|
||||
}
|
||||
|
||||
// merge nameless headers into a single named package;
|
||||
// if there are multiple named packages, return them without trying to merge headers.
|
||||
// we cannot determine which nameless headers belong to which package
|
||||
// this is because the order we receive the gguf headers in is not guaranteed
|
||||
// to match the layer order in the original oci image
|
||||
if len(namedPkgs) == 1 && len(namelessHeaders) > 0 {
|
||||
winner := &namedPkgs[0]
|
||||
if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok {
|
||||
header.Parts = namelessHeaders
|
||||
winner.Metadata = header
|
||||
}
|
||||
}
|
||||
|
||||
return namedPkgs, rels, err
|
||||
}
|
||||
93
syft/pkg/cataloger/ai/huggingface.go
Normal file
93
syft/pkg/cataloger/ai/huggingface.go
Normal file
@ -0,0 +1,93 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
)
|
||||
|
||||
// hfConfig is a minimal projection of Hugging Face config.json fields.
type hfConfig struct {
	// Architectures is read from the "architectures" key; applyHFConfig uses
	// only the first entry.
	Architectures []string `json:"architectures"`
	// NameOrPath is read from the "_name_or_path" key and serves as a model
	// name candidate.
	NameOrPath string `json:"_name_or_path"`
}
|
||||
|
||||
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
||||
if md.Architecture == "" && len(cfg.Architectures) > 0 {
|
||||
md.Architecture = cfg.Architectures[0]
|
||||
}
|
||||
}
|
||||
|
||||
// readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
type readmeFrontmatter struct {
	// License is the raw value of the frontmatter "license" key.
	License string `yaml:"license"`
	// BaseModel lists the "base_model" entries; parseFrontmatter normalizes a
	// single scalar value into a one-element slice.
	BaseModel []string `yaml:"base_model"`
}
|
||||
|
||||
// licenseFrontmatter is the frontmatter shape decoded by
// parseLicenseFrontmatter; only the SPDX identifier is extracted.
type licenseFrontmatter struct {
	// SPDXID is read from the "spdx-id" key.
	SPDXID string `yaml:"spdx-id"`
}
|
||||
|
||||
// extractFrontmatterBlock returns the bytes between the opening "---" line and
// the next line that starts with "---". A leading UTF-8 BOM and leading
// whitespace are skipped before the opening delimiter is looked for, and the
// remainder of the opening line is discarded. It returns nil when buf does not
// start with a delimiter or when no closing delimiter follows.
func extractFrontmatterBlock(buf []byte) []byte {
	delim := []byte("---")

	start := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	if !bytes.HasPrefix(start, delim) {
		return nil
	}

	// drop the rest of the opening delimiter line; without a newline there can
	// be no closing delimiter either
	_, body, hasNewline := bytes.Cut(bytes.TrimPrefix(start, delim), []byte("\n"))
	if !hasNewline {
		return nil
	}

	end := bytes.Index(body, []byte("\n---"))
	if end < 0 {
		return nil
	}
	return body[:end]
}
|
||||
|
||||
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
||||
// and returns the license and base_model fields.
|
||||
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
||||
block := extractFrontmatterBlock(buf)
|
||||
if block == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var raw struct {
|
||||
License string `yaml:"license"`
|
||||
BaseModel yaml.Node `yaml:"base_model"`
|
||||
}
|
||||
if err := yaml.Unmarshal(block, &raw); err != nil {
|
||||
log.Debugf("failed to parse README frontmatter: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
fm := readmeFrontmatter{License: raw.License}
|
||||
switch raw.BaseModel.Kind {
|
||||
case yaml.ScalarNode:
|
||||
if raw.BaseModel.Value != "" {
|
||||
fm.BaseModel = []string{raw.BaseModel.Value}
|
||||
}
|
||||
case yaml.SequenceNode:
|
||||
_ = raw.BaseModel.Decode(&fm.BaseModel)
|
||||
}
|
||||
return &fm
|
||||
}
|
||||
|
||||
// parseLicenseFrontmatter returns the producer-declared SPDX identifier
|
||||
func parseLicenseFrontmatter(buf []byte) string {
|
||||
block := extractFrontmatterBlock(buf)
|
||||
if block == nil {
|
||||
return ""
|
||||
}
|
||||
var fm licenseFrontmatter
|
||||
if err := yaml.Unmarshal(block, &fm); err != nil {
|
||||
log.Debugf("failed to parse license frontmatter: %v", err)
|
||||
return ""
|
||||
}
|
||||
return fm.SPDXID
|
||||
}
|
||||
93
syft/pkg/cataloger/ai/identity_dir.go
Normal file
93
syft/pkg/cataloger/ai/identity_dir.go
Normal file
@ -0,0 +1,93 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"path"
|
||||
|
||||
"github.com/anchore/syft/internal"
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
)
|
||||
|
||||
// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
|
||||
// config.json beside the model files (walking up parent directories to the
|
||||
// scanned source root if no sibling exists) and a sibling README.md. It returns
|
||||
// the group's name candidates, resolved licenses, and supporting evidence.
|
||||
func resolveSafeTensorsDirIdentity(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||
id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
|
||||
|
||||
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
||||
applyHFConfig(md, cfg)
|
||||
id.nameOrPath = cfg.NameOrPath
|
||||
id.supporting = append(id.supporting, *loc)
|
||||
}
|
||||
|
||||
if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
|
||||
if fm.License != "" {
|
||||
id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, fm.License)
|
||||
}
|
||||
if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
|
||||
id.nameOrPath = fm.BaseModel[0]
|
||||
}
|
||||
id.supporting = append(id.supporting, *loc)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
// findDirHFConfig looks for a config.json beside the model files
|
||||
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
||||
for {
|
||||
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
||||
return loc, cfg
|
||||
}
|
||||
parent := path.Dir(dir)
|
||||
if parent == dir {
|
||||
return nil, nil // reached the source root
|
||||
}
|
||||
dir = parent
|
||||
}
|
||||
}
|
||||
|
||||
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
|
||||
locations, err := resolver.FilesByPath(p)
|
||||
if err != nil || len(locations) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
rc, err := resolver.FileContentsByLocation(locations[0])
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, p)
|
||||
|
||||
var cfg hfConfig
|
||||
if err := json.NewDecoder(rc).Decode(&cfg); err != nil {
|
||||
log.Debugf("failed to decode %s: %v", p, err)
|
||||
return nil, nil
|
||||
}
|
||||
return &locations[0], &cfg
|
||||
}
|
||||
|
||||
func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
|
||||
locations, err := resolver.FilesByPath(p)
|
||||
if err != nil || len(locations) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
rc, err := resolver.FileContentsByLocation(locations[0])
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, p)
|
||||
|
||||
buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024))
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
fm := parseFrontmatter(buf)
|
||||
if fm == nil {
|
||||
return nil, nil
|
||||
}
|
||||
return &locations[0], fm
|
||||
}
|
||||
177
syft/pkg/cataloger/ai/identity_oci.go
Normal file
177
syft/pkg/cataloger/ai/identity_oci.go
Normal file
@ -0,0 +1,177 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"path"
|
||||
|
||||
gcrname "github.com/google/go-containerregistry/pkg/name"
|
||||
|
||||
"github.com/anchore/syft/internal"
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
|
||||
)
|
||||
|
||||
// resolveSafeTensorsOCIIdentity handles the OCI-artifact case: the model's
|
||||
// naming and license signals arrive as sibling layers (model.file companions
|
||||
// carrying config.json / README, and dedicated license layers). It returns the
|
||||
// group's name candidates, resolved licenses, and supporting evidence.
|
||||
func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||
ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
|
||||
if !ok {
|
||||
return safeTensorsIdentity{}
|
||||
}
|
||||
|
||||
modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
|
||||
if err != nil {
|
||||
log.Debugf("failed to list docker AI model-file layers: %v", err)
|
||||
}
|
||||
|
||||
// Collect config / readme candidates separately so the layer-iteration order
|
||||
// returned by the resolver doesn't decide the precedence.
|
||||
var configName, readmeName, readmeLicense string
|
||||
var supporting []file.Location
|
||||
for _, loc := range modelFileLocs {
|
||||
if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
|
||||
supporting = append(supporting, loc)
|
||||
}
|
||||
}
|
||||
|
||||
// Precedence: config.json _name_or_path > README base_model.
|
||||
nameOrPath := configName
|
||||
if nameOrPath == "" {
|
||||
nameOrPath = readmeName
|
||||
}
|
||||
|
||||
id := safeTensorsIdentity{
|
||||
nameOrPath: nameOrPath,
|
||||
fallbackName: ociImageRefBasename(resolver),
|
||||
supporting: supporting,
|
||||
}
|
||||
|
||||
// License precedence: a README model-card license wins over dedicated
|
||||
// license layers (mirrors the dir-scan path, where README frontmatter is the
|
||||
// license source).
|
||||
switch {
|
||||
case readmeLicense != "":
|
||||
id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
|
||||
default:
|
||||
licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
|
||||
if err != nil {
|
||||
log.Debugf("failed to list docker AI license layers: %v", err)
|
||||
}
|
||||
if len(licLocs) > 0 {
|
||||
id.licenses = identifyLicenseLayers(ctx, resolver, licLocs)
|
||||
id.supporting = append(id.supporting, licLocs...)
|
||||
}
|
||||
}
|
||||
|
||||
return id
|
||||
}
|
||||
|
||||
// ociImageReferencer is the minimal capability ociImageRefBasename needs: a
// resolver that can surface the OCI image reference it was built from. It is
// kept local to this package (rather than exported from the file package) so the
// assertion stays with its only consumer.
type ociImageReferencer interface {
	// ImageReference returns the image reference string; ociImageRefBasename
	// treats an empty result as "no reference available".
	ImageReference() string
}
|
||||
|
||||
func ociImageRefBasename(resolver file.Resolver) string {
|
||||
// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
|
||||
// (Ideally we should be NOT be type asserting on the file resolver directly).
|
||||
info, ok := resolver.(ociImageReferencer)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
ref := info.ImageReference()
|
||||
if ref == "" {
|
||||
return ""
|
||||
}
|
||||
parsed, err := gcrname.ParseReference(ref)
|
||||
if err != nil {
|
||||
log.Debugf("failed to parse OCI ref %q: %v", ref, err)
|
||||
return ""
|
||||
}
|
||||
return path.Base(parsed.Context().RepositoryStr())
|
||||
}
|
||||
|
||||
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
||||
// classifies it as README frontmatter or HF config.json based on its leading bytes.
|
||||
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
|
||||
rc, err := resolver.FileContentsByLocation(loc)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||
|
||||
buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
||||
switch {
|
||||
case bytes.HasPrefix(trimmed, []byte("---")):
|
||||
fm := parseFrontmatter(buf)
|
||||
if fm == nil {
|
||||
return false
|
||||
}
|
||||
if *license == "" {
|
||||
*license = fm.License
|
||||
}
|
||||
if *readmeName == "" && len(fm.BaseModel) > 0 {
|
||||
*readmeName = fm.BaseModel[0]
|
||||
}
|
||||
return true
|
||||
case bytes.HasPrefix(trimmed, []byte("{")):
|
||||
var cfg hfConfig
|
||||
if err := json.Unmarshal(buf, &cfg); err != nil {
|
||||
return false
|
||||
}
|
||||
applyHFConfig(md, &cfg)
|
||||
if *configName == "" && cfg.NameOrPath != "" {
|
||||
*configName = cfg.NameOrPath
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// identifyLicenseLayers turns Docker AI license-layer locations into
|
||||
// pkg.License values.
|
||||
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
||||
var out []pkg.License
|
||||
var scanFallback []file.Location
|
||||
for i := range locs {
|
||||
loc := locs[i]
|
||||
if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" {
|
||||
out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
|
||||
continue
|
||||
}
|
||||
scanFallback = append(scanFallback, loc)
|
||||
}
|
||||
if len(scanFallback) > 0 {
|
||||
out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
||||
// blob and returns the spdx-id declared in its YAML frontmatter
|
||||
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
||||
rc, err := resolver.FileContentsByLocation(loc)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||
|
||||
buf, err := io.ReadAll(io.LimitReader(rc, 64*1024))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return parseLicenseFrontmatter(buf)
|
||||
}
|
||||
140
syft/pkg/cataloger/ai/merge.go
Normal file
140
syft/pkg/cataloger/ai/merge.go
Normal file
@ -0,0 +1,140 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/cespare/xxhash/v2"
|
||||
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
)
|
||||
|
||||
// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
|
||||
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
||||
locSet := unionLocations(members)
|
||||
aggregates, shards := bucketSafeTensorsMembers(members)
|
||||
|
||||
merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
|
||||
mergeAggregatesInto(&merged, aggregates)
|
||||
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
|
||||
|
||||
// Keep merged UserMetadata globally key-sorted so the SBOM is stable
|
||||
sort.Slice(merged.UserMetadata, func(i, j int) bool {
|
||||
return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
|
||||
})
|
||||
|
||||
if merged.TensorCount == 0 {
|
||||
merged.TensorCount = shardTensorTotal
|
||||
}
|
||||
if merged.ShardCount == 0 {
|
||||
if len(shards) > 0 {
|
||||
merged.ShardCount = len(shards)
|
||||
} else {
|
||||
merged.ShardCount = 1
|
||||
}
|
||||
}
|
||||
merged.MetadataHash = rollupHash(hashes)
|
||||
|
||||
// Parts only carry value for multi-shard models; for a single shard the
|
||||
// outer view already exposes every per-shard field.
|
||||
if len(shards) > 1 {
|
||||
parts := append([]pkg.SafeTensorsModelInfo(nil), shards...)
|
||||
sort.Slice(parts, func(i, j int) bool {
|
||||
return parts[i].MetadataHash < parts[j].MetadataHash
|
||||
})
|
||||
merged.Parts = parts
|
||||
}
|
||||
|
||||
return pkg.Package{
|
||||
Locations: locSet,
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: merged,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
|
||||
for _, a := range aggregates {
|
||||
if merged.TensorCount == 0 {
|
||||
merged.TensorCount = a.TensorCount
|
||||
}
|
||||
if merged.ShardCount == 0 {
|
||||
merged.ShardCount = a.ShardCount
|
||||
}
|
||||
firstNonEmpty(&merged.Parameters, a.Parameters)
|
||||
firstNonEmpty(&merged.TotalSize, a.TotalSize)
|
||||
firstNonEmpty(&merged.Quantization, a.Quantization)
|
||||
}
|
||||
}
|
||||
|
||||
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
||||
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
||||
// the rollup. Shards carry only the content-derived fields (Quantization,
|
||||
// Parameters, UserMetadata);
|
||||
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
||||
seenKV := map[string]bool{}
|
||||
for _, s := range shards {
|
||||
shardTensorTotal += s.TensorCount
|
||||
firstNonEmpty(&merged.Quantization, s.Quantization)
|
||||
firstNonEmpty(&merged.Parameters, s.Parameters)
|
||||
for _, kv := range s.UserMetadata {
|
||||
if seenKV[kv.Key] {
|
||||
continue
|
||||
}
|
||||
seenKV[kv.Key] = true
|
||||
merged.UserMetadata = append(merged.UserMetadata, kv)
|
||||
}
|
||||
if s.MetadataHash != "" {
|
||||
hashes = append(hashes, s.MetadataHash)
|
||||
}
|
||||
}
|
||||
return shardTensorTotal, hashes
|
||||
}
|
||||
|
||||
// firstNonEmpty assigns v to *dst only while *dst is still empty, so repeated
// calls keep the first non-empty value offered.
func firstNonEmpty(dst *string, v string) {
	if *dst != "" {
		return
	}
	*dst = v
}
|
||||
|
||||
// unionLocations gathers every location from every member into a single set.
|
||||
func unionLocations(members []pkg.Package) file.LocationSet {
|
||||
out := file.NewLocationSet()
|
||||
for _, m := range members {
|
||||
for _, l := range m.Locations.ToSlice() {
|
||||
out.Add(l)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// bucketSafeTensorsMembers splits group members into aggregate-flavored entries
|
||||
// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored
|
||||
// entries (carry a content-derived MetadataHash from a header parser).
|
||||
func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
|
||||
for _, m := range members {
|
||||
md, ok := m.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if md.MetadataHash != "" {
|
||||
shards = append(shards, md)
|
||||
continue
|
||||
}
|
||||
aggregates = append(aggregates, md)
|
||||
}
|
||||
return aggregates, shards
|
||||
}
|
||||
|
||||
func rollupHash(hashes []string) string {
|
||||
if len(hashes) == 0 {
|
||||
return ""
|
||||
}
|
||||
if len(hashes) == 1 {
|
||||
return hashes[0]
|
||||
}
|
||||
sorted := append([]string(nil), hashes...)
|
||||
sort.Strings(sorted)
|
||||
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
|
||||
}
|
||||
25
syft/pkg/cataloger/ai/naming.go
Normal file
25
syft/pkg/cataloger/ai/naming.go
Normal file
@ -0,0 +1,25 @@
|
||||
package ai
|
||||
|
||||
import "path"
|
||||
|
||||
// pickSafeTensorsName implements the documented naming precedence chain:
//   - config.json _name_or_path (path.Base, so "org/Model" → "Model";
//     applies to both dir-scan and OCI groups)
//   - fallback name — the group's source-specific positional identifier
//
// A nameOrPath whose base is not a usable name — ".", ".." or "/", as produced
// by values such as "./" — is treated as absent, so the fallback is returned
// instead of a package literally named ".".
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath == "" {
		return fallbackName
	}
	switch base := path.Base(nameOrPath); base {
	case ".", "..", "/":
		// path-only value: it carries no model name
		return fallbackName
	default:
		return base
	}
}
|
||||
|
||||
// safeTensorsDirName returns the directory-scan naming fallback: the base name
// of the group's directory (the group key is already that directory). When the
// key has no meaningful base — it is the root, the current directory, or
// empty — the result is "".
func safeTensorsDirName(groupKey string) string {
	name := path.Base(groupKey)
	if name == "/" || name == "." || name == "" {
		return ""
	}
	return name
}
|
||||
@ -1,159 +1,122 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"path"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/cespare/xxhash/v2"
|
||||
gcrname "github.com/google/go-containerregistry/pkg/name"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
"github.com/anchore/syft/internal"
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/syft/artifact"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
|
||||
)
|
||||
|
||||
// ociGroupKey is the grouping key for every safetensors package that
|
||||
// originated from an OCI model artifact. The ContainerImageModel resolver gives
|
||||
// each layer the virtual RealPath "/" regardless of layer media type, so all
|
||||
// safetensors packages from a single OCI scan collapse into one group.
|
||||
const ociGroupKey = "@oci@"
|
||||
|
||||
// ggufMergeProcessor consolidates multiple GGUF packages into a single package
|
||||
// representing the AI model. When scanning OCI images with multiple layers,
|
||||
// each layer may produce a separate package. This processor finds the package
|
||||
// with a name and merges metadata from nameless packages into its GGUFFileParts field.
|
||||
// Only packages with a non-empty name are returned in the final result.
|
||||
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
if err != nil {
|
||||
return pkgs, rels, err
|
||||
}
|
||||
if len(pkgs) == 0 {
|
||||
return pkgs, rels, err
|
||||
}
|
||||
|
||||
// Separate packages with names from those without
|
||||
var namedPkgs []pkg.Package
|
||||
var namelessHeaders []pkg.GGUFFileHeader
|
||||
|
||||
for _, p := range pkgs {
|
||||
if p.Name != "" {
|
||||
namedPkgs = append(namedPkgs, p)
|
||||
} else {
|
||||
if header, ok := p.Metadata.(pkg.GGUFFileHeader); ok {
|
||||
// We do not want a kv hash for nameless headers
|
||||
header.MetadataKeyValuesHash = ""
|
||||
namelessHeaders = append(namelessHeaders, header)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If there are no named packages, return nothing
|
||||
if len(namedPkgs) == 0 {
|
||||
return nil, rels, err
|
||||
}
|
||||
|
||||
// merge nameless headers into a single named package;
|
||||
// if there are multiple named packages, return them without trying to merge headers.
|
||||
// we cannot determine which nameless headers belong to which package
|
||||
// this is because the order we receive the gguf headers in is not guaranteed
|
||||
// to match the layer order in the original oci image
|
||||
if len(namedPkgs) == 1 && len(namelessHeaders) > 0 {
|
||||
winner := &namedPkgs[0]
|
||||
if header, ok := winner.Metadata.(pkg.GGUFFileHeader); ok {
|
||||
header.Parts = namelessHeaders
|
||||
winner.Metadata = header
|
||||
}
|
||||
}
|
||||
|
||||
return namedPkgs, rels, err
|
||||
}
|
||||
|
||||
// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation
|
||||
// - groups all nameless packages
|
||||
// - merge the per-shard metadata
|
||||
// - picks a name (see pickSafeTensorsName)
|
||||
// safeTensorsMergeProcessor owns naming, license resolution, and final package
|
||||
// assembly. SafeTensors packages reach it nameless from the parsers; it groups
|
||||
// them per model, merges the per-shard metadata, resolves a name + licenses, and
|
||||
// drops any model it cannot name.
|
||||
//
|
||||
// There are exactly two sources, each handled by its own path:
|
||||
// - an OCI model artifact, where the source presents every layer at the
|
||||
// virtual path "/" and the whole scan is a single model (mergeOCIModel)
|
||||
// - a filesystem scan, where models are grouped by the directory their files
|
||||
// live in (mergeDirModels)
|
||||
func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
if err != nil || len(pkgs) == 0 {
|
||||
return pkgs, rels, err
|
||||
}
|
||||
|
||||
// split off non-safetensors packages
|
||||
// this keeps the processor robust if other types ever flow through
|
||||
var stPkgs, other []pkg.Package
|
||||
for _, p := range pkgs {
|
||||
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
|
||||
stPkgs = append(stPkgs, p)
|
||||
continue
|
||||
}
|
||||
other = append(other, p)
|
||||
}
|
||||
// keep the processor robust if non-safetensors packages ever flow through
|
||||
stPkgs, other := partitionSafeTensorsPackages(pkgs)
|
||||
if len(stPkgs) == 0 {
|
||||
return pkgs, rels, err
|
||||
}
|
||||
|
||||
groups := groupSafeTensorsPackages(stPkgs)
|
||||
|
||||
// Deterministic iteration order so the SBOM doesn't depend on map order.
|
||||
keys := make([]string, 0, len(groups))
|
||||
for k := range groups {
|
||||
keys = append(keys, k)
|
||||
if fromOCIArtifact(stPkgs) {
|
||||
return append(other, mergeOCIModel(ctx, resolver, stPkgs)...), rels, nil
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
out := other
|
||||
for _, key := range keys {
|
||||
merged := mergeSafeTensorsGroup(groups[key])
|
||||
|
||||
// Resolve model identity (name candidates) before enrich
|
||||
id := resolveSafeTensorsIdentity(resolver, key, &merged)
|
||||
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
|
||||
if name == "" {
|
||||
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
|
||||
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
|
||||
continue
|
||||
}
|
||||
|
||||
enrichSafeTensorsGroup(ctx, resolver, key, &merged, id)
|
||||
merged.Name = name
|
||||
merged.SetID()
|
||||
out = append(out, merged)
|
||||
}
|
||||
return out, rels, nil
|
||||
return append(other, mergeDirModels(ctx, resolver, stPkgs)...), rels, nil
|
||||
}
|
||||
|
||||
// groupSafeTensorsPackages buckets packages by the parent directory of their
|
||||
// primary-evidence location
|
||||
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
|
||||
out := make(map[string][]pkg.Package)
|
||||
// partitionSafeTensorsPackages separates safetensors packages from anything else
|
||||
// flowing through the processor.
|
||||
func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg.Package) {
|
||||
for _, p := range pkgs {
|
||||
key := safeTensorsGroupKey(p)
|
||||
if key == "" {
|
||||
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
|
||||
safeTensors = append(safeTensors, p)
|
||||
continue
|
||||
}
|
||||
out[key] = append(out[key], p)
|
||||
other = append(other, p)
|
||||
}
|
||||
return safeTensors, other
|
||||
}
|
||||
|
||||
// fromOCIArtifact reports whether the packages came from an OCI model artifact.
|
||||
// That source presents every layer at the virtual path "/", whereas a filesystem
|
||||
// scan always carries a real file path. A single scan is one source, so the
|
||||
// first package is representative of the rest.
|
||||
func fromOCIArtifact(pkgs []pkg.Package) bool {
|
||||
loc := primaryEvidenceLocation(pkgs[0])
|
||||
return loc != nil && loc.RealPath == "/"
|
||||
}
|
||||
|
||||
// mergeOCIModel treats the whole OCI artifact as a single model: every layer
|
||||
// merges into one package, named from the artifact's config.json/README or its
|
||||
// image reference.
|
||||
func mergeOCIModel(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
|
||||
merged := mergeSafeTensorsGroup(pkgs)
|
||||
|
||||
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
id := resolveSafeTensorsOCIIdentity(ctx, resolver, &md)
|
||||
merged.Metadata = md // write architecture enrichment back before assembly
|
||||
|
||||
if p, ok := assembleSafeTensorsPackage(merged, id); ok {
|
||||
return []pkg.Package{p}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mergeDirModels groups filesystem-scanned files by their parent directory and
|
||||
// emits one model per directory, named from a sibling config.json/README or the
|
||||
// directory itself.
|
||||
func mergeDirModels(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package) []pkg.Package {
|
||||
groups := groupByParentDir(pkgs)
|
||||
|
||||
// deterministic iteration order so the SBOM doesn't depend on map order
|
||||
dirs := make([]string, 0, len(groups))
|
||||
for dir := range groups {
|
||||
dirs = append(dirs, dir)
|
||||
}
|
||||
sort.Strings(dirs)
|
||||
|
||||
var out []pkg.Package
|
||||
for _, dir := range dirs {
|
||||
merged := mergeSafeTensorsGroup(groups[dir])
|
||||
|
||||
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
id := resolveSafeTensorsDirIdentity(ctx, resolver, dir, &md)
|
||||
merged.Metadata = md // write architecture enrichment back before assembly
|
||||
|
||||
if p, ok := assembleSafeTensorsPackage(merged, id); ok {
|
||||
out = append(out, p)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func safeTensorsGroupKey(p pkg.Package) string {
|
||||
loc := primaryEvidenceLocation(p)
|
||||
if loc == nil {
|
||||
return ""
|
||||
// groupByParentDir buckets filesystem-scanned packages by the directory their
|
||||
// primary-evidence file lives in (the shards of one model share a directory).
|
||||
func groupByParentDir(pkgs []pkg.Package) map[string][]pkg.Package {
|
||||
out := make(map[string][]pkg.Package)
|
||||
for _, p := range pkgs {
|
||||
loc := primaryEvidenceLocation(p)
|
||||
if loc == nil {
|
||||
continue
|
||||
}
|
||||
dir := path.Dir(loc.RealPath)
|
||||
out[dir] = append(out[dir], p)
|
||||
}
|
||||
if loc.RealPath == "/" {
|
||||
return ociGroupKey
|
||||
}
|
||||
return path.Dir(loc.RealPath)
|
||||
return out
|
||||
}
|
||||
|
||||
func primaryEvidenceLocation(p pkg.Package) *file.Location {
|
||||
@ -169,501 +132,34 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location {
|
||||
return nil
|
||||
}
|
||||
|
||||
// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
|
||||
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
||||
locSet := unionLocations(members)
|
||||
aggregates, shards := bucketSafeTensorsMembers(members)
|
||||
|
||||
merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
|
||||
mergeAggregatesInto(&merged, aggregates)
|
||||
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
|
||||
|
||||
// Keep merged UserMetadata globally key-sorted so the SBOM is stable
|
||||
sort.Slice(merged.UserMetadata, func(i, j int) bool {
|
||||
return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
|
||||
})
|
||||
|
||||
if merged.TensorCount == 0 {
|
||||
merged.TensorCount = shardTensorTotal
|
||||
}
|
||||
if merged.ShardCount == 0 {
|
||||
if len(shards) > 0 {
|
||||
merged.ShardCount = len(shards)
|
||||
} else {
|
||||
merged.ShardCount = 1
|
||||
}
|
||||
}
|
||||
merged.MetadataHash = rollupHash(hashes)
|
||||
|
||||
// Parts only carry value for multi-shard models; for a single shard the
|
||||
// outer view already exposes every per-shard field.
|
||||
if len(shards) > 1 {
|
||||
parts := append([]pkg.SafeTensorsModelInfo(nil), shards...)
|
||||
sort.Slice(parts, func(i, j int) bool {
|
||||
return parts[i].MetadataHash < parts[j].MetadataHash
|
||||
})
|
||||
merged.Parts = parts
|
||||
}
|
||||
|
||||
return pkg.Package{
|
||||
Locations: locSet,
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: merged,
|
||||
}
|
||||
}
|
||||
|
||||
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
|
||||
for _, a := range aggregates {
|
||||
if merged.TensorCount == 0 {
|
||||
merged.TensorCount = a.TensorCount
|
||||
}
|
||||
if merged.ShardCount == 0 {
|
||||
merged.ShardCount = a.ShardCount
|
||||
}
|
||||
firstNonEmpty(&merged.Parameters, a.Parameters)
|
||||
firstNonEmpty(&merged.TotalSize, a.TotalSize)
|
||||
firstNonEmpty(&merged.Quantization, a.Quantization)
|
||||
}
|
||||
}
|
||||
|
||||
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
||||
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
||||
// the rollup. Shards carry only the content-derived fields (Quantization,
|
||||
// Parameters, UserMetadata);
|
||||
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
||||
seenKV := map[string]bool{}
|
||||
for _, s := range shards {
|
||||
shardTensorTotal += s.TensorCount
|
||||
firstNonEmpty(&merged.Quantization, s.Quantization)
|
||||
firstNonEmpty(&merged.Parameters, s.Parameters)
|
||||
for _, kv := range s.UserMetadata {
|
||||
if seenKV[kv.Key] {
|
||||
continue
|
||||
}
|
||||
seenKV[kv.Key] = true
|
||||
merged.UserMetadata = append(merged.UserMetadata, kv)
|
||||
}
|
||||
if s.MetadataHash != "" {
|
||||
hashes = append(hashes, s.MetadataHash)
|
||||
}
|
||||
}
|
||||
return shardTensorTotal, hashes
|
||||
}
|
||||
|
||||
// firstNonEmpty stores v in *dst only when *dst is currently the empty
// string; an already-populated destination is left untouched.
func firstNonEmpty(dst *string, v string) {
	if *dst != "" {
		return
	}
	*dst = v
}
|
||||
|
||||
// unionLocations gathers every location from every member into a single set.
|
||||
func unionLocations(members []pkg.Package) file.LocationSet {
|
||||
out := file.NewLocationSet()
|
||||
for _, m := range members {
|
||||
for _, l := range m.Locations.ToSlice() {
|
||||
out.Add(l)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// bucketSafeTensorsMembers splits group members into aggregate-flavored entries
|
||||
// (no MetadataHash — Docker AI config blob or sharded index) and shard-flavored
|
||||
// entries (carry a content-derived MetadataHash from a header parser).
|
||||
func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
|
||||
for _, m := range members {
|
||||
md, ok := m.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if md.MetadataHash != "" {
|
||||
shards = append(shards, md)
|
||||
continue
|
||||
}
|
||||
aggregates = append(aggregates, md)
|
||||
}
|
||||
return aggregates, shards
|
||||
}
|
||||
|
||||
func rollupHash(hashes []string) string {
|
||||
if len(hashes) == 0 {
|
||||
return ""
|
||||
}
|
||||
if len(hashes) == 1 {
|
||||
return hashes[0]
|
||||
}
|
||||
sorted := append([]string(nil), hashes...)
|
||||
sort.Strings(sorted)
|
||||
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
|
||||
}
|
||||
|
||||
// safeTensorsIdentity is the fully-resolved naming/license result for a model.
|
||||
// Each source resolver (dir or OCI) populates it so assembly stays source-agnostic.
|
||||
type safeTensorsIdentity struct {
|
||||
nameOrPath string
|
||||
fallbackName string
|
||||
readmeLicense string
|
||||
supporting []file.Location
|
||||
nameOrPath string
|
||||
fallbackName string
|
||||
licenses []pkg.License
|
||||
supporting []file.Location
|
||||
}
|
||||
|
||||
// resolveSafeTensorsIdentity reads the resolver for the group's naming signals
|
||||
// (config.json _name_or_path, README base_model, OCI image ref / dir name)
|
||||
func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity {
|
||||
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
|
||||
var id safeTensorsIdentity
|
||||
if groupKey == ociGroupKey {
|
||||
id = resolveSafeTensorsOCIIdentity(resolver, &md)
|
||||
} else {
|
||||
id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md)
|
||||
// assembleSafeTensorsPackage finalizes a merged model from its resolved identity:
|
||||
// it picks the name, attaches licenses and supporting evidence, and sets the ID.
|
||||
// A model with no name source is dropped (ok=false).
|
||||
func assembleSafeTensorsPackage(merged pkg.Package, id safeTensorsIdentity) (pkg.Package, bool) {
|
||||
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
|
||||
if name == "" {
|
||||
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
|
||||
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
|
||||
return pkg.Package{}, false
|
||||
}
|
||||
|
||||
merged.Metadata = md
|
||||
return id
|
||||
}
|
||||
|
||||
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) {
|
||||
var lics []pkg.License
|
||||
supporting := id.supporting
|
||||
|
||||
switch {
|
||||
case id.readmeLicense != "":
|
||||
lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense)
|
||||
case groupKey == ociGroupKey:
|
||||
if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok {
|
||||
licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
|
||||
if err != nil {
|
||||
log.Debugf("failed to list docker AI license layers: %v", err)
|
||||
}
|
||||
if len(licLocs) > 0 {
|
||||
lics = identifyLicenseLayers(ctx, resolver, licLocs)
|
||||
supporting = append(supporting, licLocs...)
|
||||
}
|
||||
}
|
||||
if len(id.licenses) > 0 {
|
||||
merged.Licenses = pkg.NewLicenseSet(id.licenses...)
|
||||
}
|
||||
|
||||
if len(lics) > 0 {
|
||||
merged.Licenses = pkg.NewLicenseSet(lics...)
|
||||
}
|
||||
for _, loc := range supporting {
|
||||
for _, loc := range id.supporting {
|
||||
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
|
||||
}
|
||||
}
|
||||
|
||||
// safeTensorsDirName derives the directory-scan fallback name from the group
// key, which is already the parent directory of the model files: it is the
// last path element, or "" when that element is not a usable name
// ("/", "." or empty).
func safeTensorsDirName(groupKey string) string {
	name := path.Base(groupKey)
	if name == "/" || name == "." || name == "" {
		return ""
	}
	return name
}
|
||||
|
||||
// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
|
||||
// config.json beside the model files (walking up parent directories to the
|
||||
// scanned source root if no sibling exists) and a sibling README.md
|
||||
func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||
id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
|
||||
|
||||
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
||||
applyHFConfig(md, cfg)
|
||||
id.nameOrPath = cfg.NameOrPath
|
||||
id.supporting = append(id.supporting, *loc)
|
||||
}
|
||||
|
||||
if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
|
||||
id.readmeLicense = fm.License
|
||||
if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
|
||||
id.nameOrPath = fm.BaseModel[0]
|
||||
}
|
||||
id.supporting = append(id.supporting, *loc)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||
ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
|
||||
if !ok {
|
||||
return safeTensorsIdentity{}
|
||||
}
|
||||
|
||||
modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
|
||||
if err != nil {
|
||||
log.Debugf("failed to list docker AI model-file layers: %v", err)
|
||||
}
|
||||
|
||||
// Collect config / readme candidates separately so the layer-iteration order
|
||||
// returned by the resolver doesn't decide the precedence.
|
||||
var configName, readmeName, readmeLicense string
|
||||
var supporting []file.Location
|
||||
for _, loc := range modelFileLocs {
|
||||
if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
|
||||
supporting = append(supporting, loc)
|
||||
}
|
||||
}
|
||||
|
||||
// Precedence: config.json _name_or_path > README base_model.
|
||||
nameOrPath := configName
|
||||
if nameOrPath == "" {
|
||||
nameOrPath = readmeName
|
||||
}
|
||||
|
||||
return safeTensorsIdentity{
|
||||
nameOrPath: nameOrPath,
|
||||
fallbackName: ociImageRefBasename(resolver),
|
||||
readmeLicense: readmeLicense,
|
||||
supporting: supporting,
|
||||
}
|
||||
}
|
||||
|
||||
// ociImageReferencer describes the one capability ociImageRefBasename relies
// on: a resolver able to report the OCI image reference it was built from.
// The interface is declared here, next to its only consumer, instead of being
// exported from the file package.
type ociImageReferencer interface {
	// ImageReference returns the OCI image reference string.
	ImageReference() string
}
|
||||
|
||||
func ociImageRefBasename(resolver file.Resolver) string {
|
||||
// TODO: we don't think this approach is generalizable quite yet, but we really do need this information.
|
||||
// (Ideally we should be NOT be type asserting on the file resolver directly).
|
||||
info, ok := resolver.(ociImageReferencer)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
ref := info.ImageReference()
|
||||
if ref == "" {
|
||||
return ""
|
||||
}
|
||||
parsed, err := gcrname.ParseReference(ref)
|
||||
if err != nil {
|
||||
log.Debugf("failed to parse OCI ref %q: %v", ref, err)
|
||||
return ""
|
||||
}
|
||||
return path.Base(parsed.Context().RepositoryStr())
|
||||
}
|
||||
|
||||
// identifyLicenseLayers turns Docker AI license-layer locations into
|
||||
// pkg.License values.
|
||||
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
||||
var out []pkg.License
|
||||
var scanFallback []file.Location
|
||||
for i := range locs {
|
||||
loc := locs[i]
|
||||
if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" {
|
||||
out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
|
||||
continue
|
||||
}
|
||||
scanFallback = append(scanFallback, loc)
|
||||
}
|
||||
if len(scanFallback) > 0 {
|
||||
out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
||||
// blob and returns the spdx-id declared in its YAML frontmatter
|
||||
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
||||
rc, err := resolver.FileContentsByLocation(loc)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||
|
||||
buf, err := io.ReadAll(io.LimitReader(rc, 64*1024))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return parseLicenseFrontmatter(buf)
|
||||
}
|
||||
|
||||
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
||||
// classifies it as README frontmatter or HF config.json based on its leading bytes.
|
||||
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
|
||||
rc, err := resolver.FileContentsByLocation(loc)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||
|
||||
buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
||||
switch {
|
||||
case bytes.HasPrefix(trimmed, []byte("---")):
|
||||
fm := parseFrontmatter(buf)
|
||||
if fm == nil {
|
||||
return false
|
||||
}
|
||||
if *license == "" {
|
||||
*license = fm.License
|
||||
}
|
||||
if *readmeName == "" && len(fm.BaseModel) > 0 {
|
||||
*readmeName = fm.BaseModel[0]
|
||||
}
|
||||
return true
|
||||
case bytes.HasPrefix(trimmed, []byte("{")):
|
||||
var cfg hfConfig
|
||||
if err := json.Unmarshal(buf, &cfg); err != nil {
|
||||
return false
|
||||
}
|
||||
applyHFConfig(md, &cfg)
|
||||
if *configName == "" && cfg.NameOrPath != "" {
|
||||
*configName = cfg.NameOrPath
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
||||
if md.Architecture == "" && len(cfg.Architectures) > 0 {
|
||||
md.Architecture = cfg.Architectures[0]
|
||||
}
|
||||
}
|
||||
|
||||
// pickSafeTensorsName implements the documented naming precedence chain:
//   - config.json _name_or_path (path.Base, so "org/Model" → "Model";
//     applies to both dir-scan and OCI groups)
//   - fallback name — the group's source-specific positional identifier
//
// _name_or_path may be a local path such as "./", "../" or "/", whose base
// ("." / ".." / "/") is not a usable model name; in that case the fallback
// name is used instead, mirroring the guard in safeTensorsDirName.
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath != "" {
		switch base := path.Base(nameOrPath); base {
		case "/", ".", "..":
			// positional path with no name component; fall through to the fallback
		default:
			return base
		}
	}
	return fallbackName
}
|
||||
|
||||
// hfConfig captures only the Hugging Face config.json fields this cataloger
// reads; all other keys in the document are ignored when decoding.
type hfConfig struct {
	// Architectures is the "architectures" list; its first entry is used as
	// the model architecture.
	Architectures []string `json:"architectures"`
	// NameOrPath is the "_name_or_path" value, used as a naming signal.
	NameOrPath string `json:"_name_or_path"`
}
|
||||
|
||||
// readmeFrontmatter is the subset of a README's YAML frontmatter that this
// cataloger extracts.
type readmeFrontmatter struct {
	// License is the raw "license" value.
	License string `yaml:"license"`
	// BaseModel is the "base_model" value normalized to a list.
	BaseModel []string `yaml:"base_model"`
}
|
||||
|
||||
// findDirHFConfig looks for a config.json beside the model files
|
||||
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
||||
for {
|
||||
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
||||
return loc, cfg
|
||||
}
|
||||
parent := path.Dir(dir)
|
||||
if parent == dir {
|
||||
return nil, nil // reached the source root
|
||||
}
|
||||
dir = parent
|
||||
}
|
||||
}
|
||||
|
||||
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
|
||||
locations, err := resolver.FilesByPath(p)
|
||||
if err != nil || len(locations) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
rc, err := resolver.FileContentsByLocation(locations[0])
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, p)
|
||||
|
||||
var cfg hfConfig
|
||||
if err := json.NewDecoder(rc).Decode(&cfg); err != nil {
|
||||
log.Debugf("failed to decode %s: %v", p, err)
|
||||
return nil, nil
|
||||
}
|
||||
return &locations[0], &cfg
|
||||
}
|
||||
|
||||
func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
|
||||
locations, err := resolver.FilesByPath(p)
|
||||
if err != nil || len(locations) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
rc, err := resolver.FileContentsByLocation(locations[0])
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
defer internal.CloseAndLogError(rc, p)
|
||||
|
||||
buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024))
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
fm := parseFrontmatter(buf)
|
||||
if fm == nil {
|
||||
return nil, nil
|
||||
}
|
||||
return &locations[0], fm
|
||||
}
|
||||
|
||||
// extractFrontmatterBlock returns the bytes between the opening "---" line
// and the next line that starts with "---". A leading UTF-8 BOM and leading
// whitespace are skipped first, and the remainder of the opening line is
// discarded. It returns nil when the input does not start with "---" or when
// no closing delimiter follows.
func extractFrontmatterBlock(buf []byte) []byte {
	fence := []byte("---")

	start := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
	if !bytes.HasPrefix(start, fence) {
		return nil
	}

	// Drop whatever remains of the opening delimiter line.
	body := start[len(fence):]
	if _, after, ok := bytes.Cut(body, []byte("\n")); ok {
		body = after
	}

	end := bytes.Index(body, []byte("\n---"))
	if end < 0 {
		return nil
	}
	return body[:end]
}
|
||||
|
||||
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
||||
// and returns the license and base_model fields.
|
||||
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
||||
block := extractFrontmatterBlock(buf)
|
||||
if block == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var raw struct {
|
||||
License string `yaml:"license"`
|
||||
BaseModel yaml.Node `yaml:"base_model"`
|
||||
}
|
||||
if err := yaml.Unmarshal(block, &raw); err != nil {
|
||||
log.Debugf("failed to parse README frontmatter: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
fm := readmeFrontmatter{License: raw.License}
|
||||
switch raw.BaseModel.Kind {
|
||||
case yaml.ScalarNode:
|
||||
if raw.BaseModel.Value != "" {
|
||||
fm.BaseModel = []string{raw.BaseModel.Value}
|
||||
}
|
||||
case yaml.SequenceNode:
|
||||
_ = raw.BaseModel.Decode(&fm.BaseModel)
|
||||
}
|
||||
return &fm
|
||||
}
|
||||
|
||||
// licenseFrontmatter is the subset of a license layer's YAML frontmatter
// that this cataloger reads.
type licenseFrontmatter struct {
	// SPDXID is the "spdx-id" value declared in the frontmatter.
	SPDXID string `yaml:"spdx-id"`
}
|
||||
|
||||
// parseLicenseFrontmatter returns the producer-declared SPDX identifier
|
||||
func parseLicenseFrontmatter(buf []byte) string {
|
||||
block := extractFrontmatterBlock(buf)
|
||||
if block == nil {
|
||||
return ""
|
||||
}
|
||||
var fm licenseFrontmatter
|
||||
if err := yaml.Unmarshal(block, &fm); err != nil {
|
||||
log.Debugf("failed to parse license frontmatter: %v", err)
|
||||
return ""
|
||||
}
|
||||
return fm.SPDXID
|
||||
|
||||
merged.Name = name
|
||||
merged.SetID()
|
||||
return merged, true
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user