mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
review: remove and refactor implementation for easier review
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
4352ac4691
commit
4eaf583526
@ -22,7 +22,7 @@ import (
|
|||||||
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
|
"github.com/anchore/syft/syft/pkg/cataloger/internal/licenses"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ociGroupKey is the sentinel grouping key for every safetensors package that
|
// ociGroupKey is the grouping key for every safetensors package that
|
||||||
// originated from an OCI model artifact. The ContainerImageModel resolver gives
|
// originated from an OCI model artifact. The ContainerImageModel resolver gives
|
||||||
// each layer the virtual RealPath "/" regardless of layer media type, so all
|
// each layer the virtual RealPath "/" regardless of layer media type, so all
|
||||||
// safetensors packages from a single OCI scan collapse into one group.
|
// safetensors packages from a single OCI scan collapse into one group.
|
||||||
@ -78,31 +78,17 @@ func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err er
|
|||||||
return namedPkgs, rels, err
|
return namedPkgs, rels, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// safeTensorsMergeProcessor owns naming, license resolution, etc
|
// safeTensorsMergeProcessor owns naming, license resolution, and tensor package creation
|
||||||
// 1. groups all nameless packages by parent directory (or a single sentinel
|
// - groups all nameless packages
|
||||||
// for OCI artifacts, since the ContainerImageModel resolver puts every
|
// - merge the per-shard metadata
|
||||||
// layer at virtual path "/");
|
// - picks a name (see pickSafeTensorsName)
|
||||||
// 2. merges the per-shard metadata (tensor count, dominant dtype, total size,
|
|
||||||
// UserMetadata, rollup MetadataHash) into one package per group;
|
|
||||||
// 3. enriches the merged package by consulting the resolver ONCE per group —
|
|
||||||
// sibling config.json + README.md for dir scans, the model-file companion
|
|
||||||
// layers + license layer for OCI — and attaches those locations as
|
|
||||||
// supporting evidence;
|
|
||||||
// 4. picks a name via a two-rung precedence chain (see pickSafeTensorsName):
|
|
||||||
// config.json _name_or_path first (both sources), then the OCI image-ref
|
|
||||||
// last segment (OCI only). Drops the group when neither rung produces a
|
|
||||||
// name. There is no opaque fallback (no Architecture-Parameters synthetic,
|
|
||||||
// no parent-dir, no MetadataHash-as-name) — an unnameable model is
|
|
||||||
// intentionally absent from the SBOM rather than recorded under a
|
|
||||||
// misleading label.
|
|
||||||
func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
|
||||||
if err != nil || len(pkgs) == 0 {
|
if err != nil || len(pkgs) == 0 {
|
||||||
return pkgs, rels, err
|
return pkgs, rels, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Defensively split off non-safetensors packages — the cataloger only emits
|
// split off non-safetensors packages
|
||||||
// SafeTensorsModelInfo today, but this keeps the processor robust if other
|
// this keeps the processor robust if other types ever flow through
|
||||||
// types ever flow through.
|
|
||||||
var stPkgs, other []pkg.Package
|
var stPkgs, other []pkg.Package
|
||||||
for _, p := range pkgs {
|
for _, p := range pkgs {
|
||||||
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
|
if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
|
||||||
@ -127,11 +113,17 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
|
|||||||
out := other
|
out := other
|
||||||
for _, key := range keys {
|
for _, key := range keys {
|
||||||
merged := mergeSafeTensorsGroup(groups[key])
|
merged := mergeSafeTensorsGroup(groups[key])
|
||||||
nameOrPath, fallbackName := enrichSafeTensorsGroup(ctx, resolver, key, &merged)
|
|
||||||
name := pickSafeTensorsName(nameOrPath, fallbackName)
|
// Resolve model identity (name candidates) before enrich
|
||||||
|
id := resolveSafeTensorsIdentity(resolver, key, &merged)
|
||||||
|
name := pickSafeTensorsName(id.nameOrPath, id.fallbackName)
|
||||||
if name == "" {
|
if name == "" {
|
||||||
continue // drop groups with no name source and no usable fallback
|
log.Debugf("dropped safetensors model package (metadata hash %q): no name source",
|
||||||
|
merged.Metadata.(pkg.SafeTensorsModelInfo).MetadataHash)
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enrichSafeTensorsGroup(ctx, resolver, key, &merged, id)
|
||||||
merged.Name = name
|
merged.Name = name
|
||||||
merged.SetID()
|
merged.SetID()
|
||||||
out = append(out, merged)
|
out = append(out, merged)
|
||||||
@ -140,8 +132,7 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
|
|||||||
}
|
}
|
||||||
|
|
||||||
// groupSafeTensorsPackages buckets packages by the parent directory of their
|
// groupSafeTensorsPackages buckets packages by the parent directory of their
|
||||||
// primary-evidence location, or the OCI sentinel when the location lives at
|
// primary-evidence location
|
||||||
// the ContainerImageModel resolver's virtual "/" path.
|
|
||||||
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
|
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
|
||||||
out := make(map[string][]pkg.Package)
|
out := make(map[string][]pkg.Package)
|
||||||
for _, p := range pkgs {
|
for _, p := range pkgs {
|
||||||
@ -178,19 +169,7 @@ func primaryEvidenceLocation(p pkg.Package) *file.Location {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// mergeSafeTensorsGroup folds a group's per-member metadata into a single
|
// mergeSafeTensorsGroup folds a group's per-member metadata into a single package.
|
||||||
// package. Members are classified into two buckets to avoid double-counting:
|
|
||||||
//
|
|
||||||
// - "aggregate" members have producer-declared totals (TensorCount, TotalSize,
|
|
||||||
// ShardCount, Parameters) but no MetadataHash — these are the Docker AI
|
|
||||||
// config blob and the sharded-index file.
|
|
||||||
// - "shard" members have a content-derived MetadataHash and per-shard counts —
|
|
||||||
// these are the individual .safetensors header parsers, both dir-scan and
|
|
||||||
// OCI weight-layer.
|
|
||||||
//
|
|
||||||
// Aggregate values are the source of truth for the merged totals when present;
|
|
||||||
// shards contribute Quantization, UserMetadata, the rollup MetadataHash, and
|
|
||||||
// (for multi-shard models) the Parts breakdown.
|
|
||||||
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
||||||
locSet := unionLocations(members)
|
locSet := unionLocations(members)
|
||||||
aggregates, shards := bucketSafeTensorsMembers(members)
|
aggregates, shards := bucketSafeTensorsMembers(members)
|
||||||
@ -199,6 +178,11 @@ func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
|||||||
mergeAggregatesInto(&merged, aggregates)
|
mergeAggregatesInto(&merged, aggregates)
|
||||||
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
|
shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
|
||||||
|
|
||||||
|
// Keep merged UserMetadata globally key-sorted so the SBOM is stable
|
||||||
|
sort.Slice(merged.UserMetadata, func(i, j int) bool {
|
||||||
|
return merged.UserMetadata[i].Key < merged.UserMetadata[j].Key
|
||||||
|
})
|
||||||
|
|
||||||
if merged.TensorCount == 0 {
|
if merged.TensorCount == 0 {
|
||||||
merged.TensorCount = shardTensorTotal
|
merged.TensorCount = shardTensorTotal
|
||||||
}
|
}
|
||||||
@ -228,10 +212,6 @@ func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// mergeAggregatesInto folds aggregate-declared totals (config blob or sharded
|
|
||||||
// index) into merged. First non-empty wins, so the order aggregates are passed
|
|
||||||
// in determines tie-breaking — in practice there is one config blob and one
|
|
||||||
// index per group, never two of the same kind.
|
|
||||||
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
|
func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
|
||||||
for _, a := range aggregates {
|
for _, a := range aggregates {
|
||||||
if merged.TensorCount == 0 {
|
if merged.TensorCount == 0 {
|
||||||
@ -252,8 +232,7 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
|
|||||||
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
||||||
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
||||||
// the rollup. Shards carry only the content-derived fields (Quantization,
|
// the rollup. Shards carry only the content-derived fields (Quantization,
|
||||||
// Parameters, UserMetadata); producer-declared fields like Architecture come
|
// Parameters, UserMetadata);
|
||||||
// from the resolver-backed enrichment that runs afterwards.
|
|
||||||
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
||||||
seenKV := map[string]bool{}
|
seenKV := map[string]bool{}
|
||||||
for _, s := range shards {
|
for _, s := range shards {
|
||||||
@ -309,11 +288,6 @@ func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.S
|
|||||||
return aggregates, shards
|
return aggregates, shards
|
||||||
}
|
}
|
||||||
|
|
||||||
// rollupHash returns a stable hash across the sorted set of per-member
|
|
||||||
// content-derived hashes. For a single member it returns that hash unchanged,
|
|
||||||
// so a single-file dir scan and an OCI scan with one safetensors layer surface
|
|
||||||
// the same value. For multi-shard models the rollup is the xxhash of the
|
|
||||||
// sorted hashes joined with "|".
|
|
||||||
func rollupHash(hashes []string) string {
|
func rollupHash(hashes []string) string {
|
||||||
if len(hashes) == 0 {
|
if len(hashes) == 0 {
|
||||||
return ""
|
return ""
|
||||||
@ -326,43 +300,59 @@ func rollupHash(hashes []string) string {
|
|||||||
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
|
return fmt.Sprintf("%016x", xxhash.Sum64String(strings.Join(sorted, "|")))
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichSafeTensorsGroup reads the resolver once for the group to populate the
|
type safeTensorsIdentity struct {
|
||||||
// merged metadata's Architecture / TorchDtype / TransformersVersion, set the
|
nameOrPath string
|
||||||
// licenses on the merged package, and attach the location of every consulted
|
fallbackName string
|
||||||
// supporting file as SupportingEvidence. Returns two name candidates for the
|
readmeLicense string
|
||||||
// merge processor: nameOrPath (raw _name_or_path from a config.json) and a
|
supporting []file.Location
|
||||||
// fallbackName used when no _name_or_path is available — the last path segment
|
}
|
||||||
// of the OCI image reference for OCI groups, or the parent directory's base
|
|
||||||
// name for directory-scan groups.
|
// resolveSafeTensorsIdentity reads the resolver for the group's naming signals
|
||||||
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath, fallbackName string) {
|
// (config.json _name_or_path, README base_model, OCI image ref / dir name)
|
||||||
|
func resolveSafeTensorsIdentity(resolver file.Resolver, groupKey string, merged *pkg.Package) safeTensorsIdentity {
|
||||||
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||||
|
|
||||||
var (
|
var id safeTensorsIdentity
|
||||||
lics []pkg.License
|
|
||||||
supporting []file.Location
|
|
||||||
)
|
|
||||||
if groupKey == ociGroupKey {
|
if groupKey == ociGroupKey {
|
||||||
nameOrPath, fallbackName, lics, supporting = enrichSafeTensorsOCI(ctx, resolver, &md)
|
id = resolveSafeTensorsOCIIdentity(resolver, &md)
|
||||||
} else {
|
} else {
|
||||||
nameOrPath, lics, supporting = enrichSafeTensorsDir(ctx, resolver, groupKey, &md)
|
id = resolveSafeTensorsDirIdentity(resolver, groupKey, &md)
|
||||||
fallbackName = safeTensorsDirName(groupKey)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
merged.Metadata = md
|
merged.Metadata = md
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package, id safeTensorsIdentity) {
|
||||||
|
var lics []pkg.License
|
||||||
|
supporting := id.supporting
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case id.readmeLicense != "":
|
||||||
|
lics = pkg.NewLicensesFromValuesWithContext(ctx, id.readmeLicense)
|
||||||
|
case groupKey == ociGroupKey:
|
||||||
|
if ociResolver, ok := resolver.(file.OCIMediaTypeResolver); ok {
|
||||||
|
licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
|
||||||
|
if err != nil {
|
||||||
|
log.Debugf("failed to list docker AI license layers: %v", err)
|
||||||
|
}
|
||||||
|
if len(licLocs) > 0 {
|
||||||
|
lics = identifyLicenseLayers(ctx, resolver, licLocs)
|
||||||
|
supporting = append(supporting, licLocs...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if len(lics) > 0 {
|
if len(lics) > 0 {
|
||||||
merged.Licenses = pkg.NewLicenseSet(lics...)
|
merged.Licenses = pkg.NewLicenseSet(lics...)
|
||||||
}
|
}
|
||||||
for _, loc := range supporting {
|
for _, loc := range supporting {
|
||||||
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
|
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
|
||||||
}
|
}
|
||||||
return nameOrPath, fallbackName
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// safeTensorsDirName returns the directory-scan naming fallback: the base name
|
// safeTensorsDirName returns the directory-scan naming fallback: the base name
|
||||||
// of the group's parent directory (the group key is already that directory).
|
// of the group's parent directory (the group key is already that directory).
|
||||||
// For "/models/tiny-llama" this returns "tiny-llama". Degenerate roots that
|
|
||||||
// carry no meaningful model name ("/", ".", "") return "", so the group is
|
|
||||||
// dropped rather than labeled with a filesystem artifact.
|
|
||||||
func safeTensorsDirName(groupKey string) string {
|
func safeTensorsDirName(groupKey string) string {
|
||||||
base := path.Base(groupKey)
|
base := path.Base(groupKey)
|
||||||
switch base {
|
switch base {
|
||||||
@ -372,41 +362,32 @@ func safeTensorsDirName(groupKey string) string {
|
|||||||
return base
|
return base
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichSafeTensorsDir handles the directory-scan case: look for a config.json
|
// resolveSafeTensorsDirIdentity handles the directory-scan case: look for a
|
||||||
// beside the model files (walking up parent directories to the scanned source
|
// config.json beside the model files (walking up parent directories to the
|
||||||
// root if no sibling exists) and a sibling README.md.
|
// scanned source root if no sibling exists) and a sibling README.md
|
||||||
func enrichSafeTensorsDir(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) {
|
func resolveSafeTensorsDirIdentity(resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||||
|
id := safeTensorsIdentity{fallbackName: safeTensorsDirName(dir)}
|
||||||
|
|
||||||
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
||||||
applyHFConfig(md, cfg)
|
applyHFConfig(md, cfg)
|
||||||
nameOrPath = cfg.NameOrPath
|
id.nameOrPath = cfg.NameOrPath
|
||||||
supporting = append(supporting, *loc)
|
id.supporting = append(id.supporting, *loc)
|
||||||
}
|
}
|
||||||
|
|
||||||
if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
|
if loc, fm := readDirReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil {
|
||||||
if fm.License != "" {
|
id.readmeLicense = fm.License
|
||||||
lics = pkg.NewLicensesFromValuesWithContext(ctx, fm.License)
|
if id.nameOrPath == "" && len(fm.BaseModel) > 0 {
|
||||||
|
id.nameOrPath = fm.BaseModel[0]
|
||||||
}
|
}
|
||||||
if nameOrPath == "" && len(fm.BaseModel) > 0 {
|
id.supporting = append(id.supporting, *loc)
|
||||||
nameOrPath = fm.BaseModel[0]
|
|
||||||
}
|
}
|
||||||
supporting = append(supporting, *loc)
|
return id
|
||||||
}
|
|
||||||
return nameOrPath, lics, supporting
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichSafeTensorsOCI handles the OCI-artifact case: walk the
|
func resolveSafeTensorsOCIIdentity(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) safeTensorsIdentity {
|
||||||
// vnd.docker.ai.model.file layers (READMEs and HF config.json all ride that
|
|
||||||
// media type — we sniff content to tell them apart), then fall back to the
|
|
||||||
// vnd.docker.ai.license layer through the shared license scanner. It also
|
|
||||||
// pulls the user-supplied image reference off the resolver (when the resolver
|
|
||||||
// implements file.OCIArtifactResolver) and returns its last path segment as a
|
|
||||||
// naming candidate — repacked artifacts like Docker AI vllm images frequently
|
|
||||||
// strip name fields out of every embedded config, so the image ref is often
|
|
||||||
// the only remaining identifier for the model.
|
|
||||||
func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) (nameOrPath, imageRefName string, lics []pkg.License, supporting []file.Location) {
|
|
||||||
ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
|
ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
|
||||||
if !ok {
|
if !ok {
|
||||||
return "", "", nil, nil
|
return safeTensorsIdentity{}
|
||||||
}
|
}
|
||||||
|
|
||||||
modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
|
modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
|
||||||
@ -417,6 +398,7 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
|
|||||||
// Collect config / readme candidates separately so the layer-iteration order
|
// Collect config / readme candidates separately so the layer-iteration order
|
||||||
// returned by the resolver doesn't decide the precedence.
|
// returned by the resolver doesn't decide the precedence.
|
||||||
var configName, readmeName, readmeLicense string
|
var configName, readmeName, readmeLicense string
|
||||||
|
var supporting []file.Location
|
||||||
for _, loc := range modelFileLocs {
|
for _, loc := range modelFileLocs {
|
||||||
if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
|
if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
|
||||||
supporting = append(supporting, loc)
|
supporting = append(supporting, loc)
|
||||||
@ -424,37 +406,19 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Precedence: config.json _name_or_path > README base_model.
|
// Precedence: config.json _name_or_path > README base_model.
|
||||||
if configName != "" {
|
nameOrPath := configName
|
||||||
nameOrPath = configName
|
if nameOrPath == "" {
|
||||||
} else {
|
|
||||||
nameOrPath = readmeName
|
nameOrPath = readmeName
|
||||||
}
|
}
|
||||||
|
|
||||||
// README license takes precedence; fall back to the license layer. For each
|
return safeTensorsIdentity{
|
||||||
// license layer we first try a cheap YAML-frontmatter spdx-id read; layers
|
nameOrPath: nameOrPath,
|
||||||
// without frontmatter fall through to the shared license scanner.
|
fallbackName: ociImageRefBasename(resolver),
|
||||||
switch {
|
readmeLicense: readmeLicense,
|
||||||
case readmeLicense != "":
|
supporting: supporting,
|
||||||
lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
|
|
||||||
default:
|
|
||||||
licLocs, lErr := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
|
|
||||||
if lErr != nil {
|
|
||||||
log.Debugf("failed to list docker AI license layers: %v", lErr)
|
|
||||||
}
|
}
|
||||||
if len(licLocs) > 0 {
|
|
||||||
lics = identifyLicenseLayers(ctx, resolver, licLocs)
|
|
||||||
supporting = append(supporting, licLocs...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
imageRefName = ociImageRefBasename(resolver)
|
|
||||||
return nameOrPath, imageRefName, lics, supporting
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ociImageRefBasename returns the last path segment of the repository portion
|
|
||||||
// of the OCI image reference exposed by the resolver, or "" when the resolver
|
|
||||||
// does not implement OCIArtifactResolver or the reference fails to parse. For
|
|
||||||
// "docker.io/ai/smollm2-vllm:360M" this returns "smollm2-vllm".
|
|
||||||
func ociImageRefBasename(resolver file.Resolver) string {
|
func ociImageRefBasename(resolver file.Resolver) string {
|
||||||
info, ok := resolver.(file.OCIArtifactResolver)
|
info, ok := resolver.(file.OCIArtifactResolver)
|
||||||
if !ok {
|
if !ok {
|
||||||
@ -473,11 +437,7 @@ func ociImageRefBasename(resolver file.Resolver) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// identifyLicenseLayers turns Docker AI license-layer locations into
|
// identifyLicenseLayers turns Docker AI license-layer locations into
|
||||||
// pkg.License values. It first attempts a cheap, exact SPDX-id read from the
|
// pkg.License values.
|
||||||
// layer's YAML frontmatter (the choosealicense.com shape Docker Model Runner
|
|
||||||
// publishes for its AI artifacts); layers without frontmatter fall through to
|
|
||||||
// the shared license scanner. Each returned license is tagged with the layer
|
|
||||||
// location it came from so the SBOM cites its source.
|
|
||||||
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
||||||
var out []pkg.License
|
var out []pkg.License
|
||||||
var scanFallback []file.Location
|
var scanFallback []file.Location
|
||||||
@ -496,9 +456,7 @@ func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []f
|
|||||||
}
|
}
|
||||||
|
|
||||||
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
||||||
// blob and returns the spdx-id declared in its YAML frontmatter, if any. The
|
// blob and returns the spdx-id declared in its YAML frontmatter
|
||||||
// 64 KiB cap is well above any real choosealicense.com frontmatter block while
|
|
||||||
// still bounding memory if the layer turns out to be huge.
|
|
||||||
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
||||||
rc, err := resolver.FileContentsByLocation(loc)
|
rc, err := resolver.FileContentsByLocation(loc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -514,10 +472,7 @@ func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
||||||
// classifies it as README frontmatter or HF config.json based on its leading
|
// classifies it as README frontmatter or HF config.json based on its leading bytes.
|
||||||
// bytes. Side-effects: applies HF config fields onto md, accumulates name and
|
|
||||||
// license candidates via the out-params. Returns true when the layer was
|
|
||||||
// successfully classified (and should be recorded as supporting evidence).
|
|
||||||
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
|
func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
|
||||||
rc, err := resolver.FileContentsByLocation(loc)
|
rc, err := resolver.FileContentsByLocation(loc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -557,10 +512,6 @@ func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pk
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// applyHFConfig folds the subset of HF config.json fields we surface in our
|
|
||||||
// metadata onto md. Fields already populated on md are left alone — earlier
|
|
||||||
// content-derived values (Quantization, TensorCount, etc., from header bytes)
|
|
||||||
// always win over producer-declared ones in case of conflict.
|
|
||||||
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
||||||
if md.Architecture == "" && len(cfg.Architectures) > 0 {
|
if md.Architecture == "" && len(cfg.Architectures) > 0 {
|
||||||
md.Architecture = cfg.Architectures[0]
|
md.Architecture = cfg.Architectures[0]
|
||||||
@ -574,20 +525,9 @@ func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// pickSafeTensorsName implements the documented naming precedence chain:
|
// pickSafeTensorsName implements the documented naming precedence chain:
|
||||||
//
|
// - config.json _name_or_path (path.Base, so "org/Model" → "Model";
|
||||||
// 1. config.json _name_or_path (path.Base, so "org/Model" → "Model";
|
|
||||||
// applies to both dir-scan and OCI groups)
|
// applies to both dir-scan and OCI groups)
|
||||||
// 2. fallback name — the group's source-specific positional identifier:
|
// - fallback name — the group's source-specific positional identifier
|
||||||
// the OCI image-ref repository basename for OCI groups (e.g.
|
|
||||||
// "docker.io/ai/smollm2-vllm:360M" → "smollm2-vllm"), or the parent
|
|
||||||
// directory base name for directory-scan groups (e.g.
|
|
||||||
// "/models/tiny-llama/*.safetensors" → "tiny-llama")
|
|
||||||
//
|
|
||||||
// Returns "" to signal the merge processor should drop the group. There is
|
|
||||||
// intentionally no Architecture-Parameters synthetic and no opaque hash label:
|
|
||||||
// when neither a producer-declared name nor a positional fallback is available
|
|
||||||
// the model is recorded as absent rather than under a label the SBOM consumer
|
|
||||||
// would not recognize.
|
|
||||||
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
|
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
|
||||||
if nameOrPath != "" {
|
if nameOrPath != "" {
|
||||||
return path.Base(nameOrPath)
|
return path.Base(nameOrPath)
|
||||||
@ -595,12 +535,6 @@ func pickSafeTensorsName(nameOrPath, fallbackName string) string {
|
|||||||
return fallbackName
|
return fallbackName
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Enrichment helpers ---------------------------------------------------
|
|
||||||
//
|
|
||||||
// The parsers decode only the safetensors-specific format; every resolver-backed
|
|
||||||
// read (config.json, README, license layers) is centralized here in the merge
|
|
||||||
// processor, along with the types those reads decode into.
|
|
||||||
|
|
||||||
// hfConfig is a minimal projection of Hugging Face config.json fields.
|
// hfConfig is a minimal projection of Hugging Face config.json fields.
|
||||||
type hfConfig struct {
|
type hfConfig struct {
|
||||||
Architectures []string `json:"architectures"`
|
Architectures []string `json:"architectures"`
|
||||||
@ -615,14 +549,7 @@ type readmeFrontmatter struct {
|
|||||||
BaseModel []string `yaml:"base_model"`
|
BaseModel []string `yaml:"base_model"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// findDirHFConfig looks for a config.json beside the model files, walking up
|
// findDirHFConfig looks for a config.json beside the model files
|
||||||
// parent directories until it reaches the scanned source root. The walk needs
|
|
||||||
// no explicit depth bound: the resolver only resolves paths within the scanned
|
|
||||||
// source, so an ancestor above the scan root simply yields no config, and
|
|
||||||
// path.Dir converges on a fixed point ("/" or ".") that terminates the loop.
|
|
||||||
// The first config.json found wins, so the closest one — a sibling, then the
|
|
||||||
// nearest ancestor — supplies both the producer-declared name and the HF fields
|
|
||||||
// applied to the model.
|
|
||||||
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
||||||
for {
|
for {
|
||||||
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
||||||
@ -678,9 +605,7 @@ func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// extractFrontmatterBlock returns the YAML bytes between the first and second
|
// extractFrontmatterBlock returns the YAML bytes between the first and second
|
||||||
// "---" delimiters of a file (stripping a leading BOM and any leading
|
// "---" delimiters of a file
|
||||||
// whitespace), or nil when no closed frontmatter block exists. Shared by every
|
|
||||||
// YAML-frontmatter parser the cataloger needs.
|
|
||||||
func extractFrontmatterBlock(buf []byte) []byte {
|
func extractFrontmatterBlock(buf []byte) []byte {
|
||||||
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
||||||
if !bytes.HasPrefix(trimmed, []byte("---")) {
|
if !bytes.HasPrefix(trimmed, []byte("---")) {
|
||||||
@ -698,8 +623,7 @@ func extractFrontmatterBlock(buf []byte) []byte {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
||||||
// and returns the license and base_model fields. base_model is decoded via
|
// and returns the license and base_model fields.
|
||||||
// yaml.Node so a scalar value ("org/model") doesn't fail the whole block.
|
|
||||||
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
||||||
block := extractFrontmatterBlock(buf)
|
block := extractFrontmatterBlock(buf)
|
||||||
if block == nil {
|
if block == nil {
|
||||||
@ -727,17 +651,11 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
|||||||
return &fm
|
return &fm
|
||||||
}
|
}
|
||||||
|
|
||||||
// licenseFrontmatter holds the fields we lift from a choosealicense.com-style
|
|
||||||
// YAML frontmatter block at the top of a license file (the LICENSE blobs Docker
|
|
||||||
// Model Runner publishes for AI artifacts use this shape).
|
|
||||||
type licenseFrontmatter struct {
|
type licenseFrontmatter struct {
|
||||||
SPDXID string `yaml:"spdx-id"`
|
SPDXID string `yaml:"spdx-id"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseLicenseFrontmatter returns the producer-declared SPDX identifier from a
|
// parseLicenseFrontmatter returns the producer-declared SPDX identifier
|
||||||
// choosealicense.com-style YAML frontmatter block, or "" if the buffer has no
|
|
||||||
// frontmatter or no spdx-id field — caller should fall back to a full license
|
|
||||||
// scan in that case.
|
|
||||||
func parseLicenseFrontmatter(buf []byte) string {
|
func parseLicenseFrontmatter(buf []byte) string {
|
||||||
block := extractFrontmatterBlock(buf)
|
block := extractFrontmatterBlock(buf)
|
||||||
if block == nil {
|
if block == nil {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user