From 0dc4abc0e11697f2dc3a3ec43f53c6bf74ad4c54 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Fri, 5 Jun 2026 03:16:02 -0400 Subject: [PATCH] pr: refactor Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- .../fileresolver/container_image_model.go | 6 +- syft/pkg/cataloger/ai/identity_oci.go | 73 +++++++-------- syft/pkg/cataloger/ai/merge.go | 5 +- syft/pkg/cataloger/ai/parse_safetensors.go | 6 +- .../cataloger/ai/parse_safetensors_model.go | 3 +- .../cataloger/ai/parse_safetensors_test.go | 89 +++++++++++++++++++ syft/pkg/cataloger/ai/processor.go | 12 ++- syft/pkg/safetensors.go | 2 +- .../source/ocimodelsource/oci_model_source.go | 3 +- syft/source/ocimodelsource/registry_client.go | 5 +- 10 files changed, 150 insertions(+), 54 deletions(-) diff --git a/syft/internal/fileresolver/container_image_model.go b/syft/internal/fileresolver/container_image_model.go index cb640a7ad..3cc4a7422 100644 --- a/syft/internal/fileresolver/container_image_model.go +++ b/syft/internal/fileresolver/container_image_model.go @@ -20,8 +20,10 @@ type LayerInfo struct { } // ContainerImageModel is a file.Resolver implementation that provides access to -// GGUF header data fetched from OCI model artifacts via range-GET requests. -// This does not fetch the entire model from the registry, only a sliver of it. +// model header and metadata data (GGUF and SafeTensors headers, the model config +// blob, and companion layers) fetched from OCI model artifacts via range-GET +// requests. This does not fetch the entire model from the registry, only a +// sliver of it. type ContainerImageModel struct { tempDir string // temp directory containing all layer files layerFiles map[string]LayerInfo // digest -> layer info (temp path + media type) diff --git a/syft/pkg/cataloger/ai/identity_oci.go b/syft/pkg/cataloger/ai/identity_oci.go index 63c43e469..1b168d7e0 100644 --- a/syft/pkg/cataloger/ai/identity_oci.go +++ b/syft/pkg/cataloger/ai/identity_oci.go @@ -36,7 +36,21 @@ func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, var configName, readmeName, readmeLicense string var supporting []file.Location for _, loc := range modelFileLocs { - if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) { + cfg, fm := classifyOCIModelFileLayer(resolver, loc) + switch { + case cfg != nil: + applyHFConfig(md, cfg) + if configName == "" { + configName = cfg.NameOrPath + } + supporting = append(supporting, loc) + case fm != nil: + if readmeLicense == "" { + readmeLicense = fm.License + } + if readmeName == "" && len(fm.BaseModel) > 0 { + readmeName = fm.BaseModel[0] + } supporting = append(supporting, loc) } } @@ -53,21 +67,19 @@ func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver, supporting: supporting, } - // License precedence: a README model-card license wins over dedicated - // license layers (mirrors the dir-scan path, where README frontmatter is the - // license source). + // License precedence: a dedicated vnd.docker.ai.license layer is a + // producer-curated signal and outranks the free-text license field in a model + // card's README frontmatter. + licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType) + if err != nil { + log.Debugf("failed to list docker AI license layers: %v", err) + } switch { + case len(licLocs) > 0: + id.licenses = identifyLicenseLayers(ctx, resolver, licLocs) + id.supporting = append(id.supporting, licLocs...) case readmeLicense != "": id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense) - default: - licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType) - if err != nil { - log.Debugf("failed to list docker AI license layers: %v", err) - } - if len(licLocs) > 0 { - id.licenses = identifyLicenseLayers(ctx, resolver, licLocs) - id.supporting = append(id.supporting, licLocs...) - } } return id @@ -100,45 +112,34 @@ func ociImageRefBasename(resolver file.Resolver) string { return path.Base(parsed.Context().RepositoryStr()) } -// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and -// classifies it as README frontmatter or HF config.json based on its leading bytes. -func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool { +// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and decodes +// it as either an HF config.json or a README model card's YAML frontmatter, +// based on its leading bytes. It returns whichever it recognized; both are nil +// when the layer is neither (or fails to decode). The caller owns precedence and +// metadata enrichment. +func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location) (*hfConfig, *readmeFrontmatter) { rc, err := resolver.FileContentsByLocation(loc) if err != nil { - return false + return nil, nil } defer internal.CloseAndLogError(rc, loc.RealPath) buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024)) if err != nil { - return false + return nil, nil } trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") switch { case bytes.HasPrefix(trimmed, []byte("---")): - fm := parseFrontmatter(buf) - if fm == nil { - return false - } - if *license == "" { - *license = fm.License - } - if *readmeName == "" && len(fm.BaseModel) > 0 { - *readmeName = fm.BaseModel[0] - } - return true + return nil, parseFrontmatter(buf) case bytes.HasPrefix(trimmed, []byte("{")): var cfg hfConfig if err := json.Unmarshal(buf, &cfg); err != nil { - return false + return nil, nil } - applyHFConfig(md, &cfg) - if *configName == "" && cfg.NameOrPath != "" { - *configName = cfg.NameOrPath - } - return true + return &cfg, nil } - return false + return nil, nil } // identifyLicenseLayers turns Docker AI license-layer locations into diff --git a/syft/pkg/cataloger/ai/merge.go b/syft/pkg/cataloger/ai/merge.go index 8c0b19167..7351c2576 100644 --- a/syft/pkg/cataloger/ai/merge.go +++ b/syft/pkg/cataloger/ai/merge.go @@ -59,9 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe if merged.TensorCount == 0 { merged.TensorCount = a.TensorCount } - if merged.ShardCount == 0 { - merged.ShardCount = a.ShardCount - } firstNonEmpty(&merged.Parameters, a.Parameters) firstNonEmpty(&merged.TotalSize, a.TotalSize) firstNonEmpty(&merged.Quantization, a.Quantization) @@ -71,7 +68,7 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe // mergeShardsInto folds the per-shard header metadata into merged, returning // the summed shard TensorCount and the list of non-empty per-shard hashes for // the rollup. Shards carry only the content-derived fields (Quantization, -// Parameters, UserMetadata); +// Parameters, UserMetadata), so those are the only fields folded in here. func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) { seenKV := map[string]bool{} for _, s := range shards { diff --git a/syft/pkg/cataloger/ai/parse_safetensors.go b/syft/pkg/cataloger/ai/parse_safetensors.go index 52e48a19e..f88e73d40 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors.go +++ b/syft/pkg/cataloger/ai/parse_safetensors.go @@ -148,10 +148,8 @@ func (h *safeTensorsHeader) metadataHash() string { } // userMetadataKeyValues converts the safetensors __metadata__ map into a -// KeyValues slice sorted by key. We do not use the convention of returning a -// nil slice for an empty input — instead, an empty input maps to an empty -// (length-0, non-nil) KeyValues — so downstream JSON serialization remains -// stable: `omitempty` drops the field either way. +// KeyValues slice sorted by key, so SBOM output is stable across runs. Returns +// nil for empty input (omitempty then drops the field). func userMetadataKeyValues(m map[string]string) pkg.KeyValues { if len(m) == 0 { return nil diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go index d7288f1ad..134ef71b1 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_model.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go @@ -26,11 +26,12 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ return nil, nil, fmt.Errorf("failed to read safetensors header: %w", err) } + // ShardCount is intentionally not set here: the merge processor is the single + // owner of ShardCount and derives it from the number of shards in the group. md := pkg.SafeTensorsModelInfo{ Format: "safetensors", TensorCount: uint64(len(header.tensors)), Quantization: normalizeDType(header.dominantDType()), - ShardCount: 1, UserMetadata: userMetadataKeyValues(header.metadata), MetadataHash: header.metadataHash(), } diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go index 560659d39..dfd586b33 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_test.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go @@ -122,6 +122,27 @@ func TestSafeTensorsCataloger(t *testing.T) { }, }, }, + { + // rung 1 via README: with no config.json, the README model card's + // base_model names the model (path.Base applied), still beating the + // directory fallback ("readme-named"). + name: "README base_model names the model when there is no config.json", + setup: func(t *testing.T) string { + dir := t.TempDir() + modelDir := filepath.Join(dir, "readme-named") + model(t, modelDir) + writeFile(t, filepath.Join(modelDir, "README.md"), + "---\nbase_model:\n - org/base-model-name\n---\n# Card\n") + return dir + }, + expectedPackages: []pkg.Package{ + { + Name: "base-model-name", + Type: pkg.ModelPkg, + Metadata: wantMetadata(""), + }, + }, + }, { // rung 2: no config.json at all, so the model is named after its // immediate parent directory. @@ -265,6 +286,46 @@ func TestSafeTensorsCataloger(t *testing.T) { } } +// TestSafeTensorsCataloger_shardedDirectory covers the primary multi-shard shape: +// several `model-0000N-of-0000M.safetensors` files in one directory. The +// cataloger must group the shards into a single package, sum their tensor counts, +// record the shard count, and roll each shard up into Parts. (The OCI multi-shard +// path is covered separately in TestSafeTensorsMergeProcessor.) +func TestSafeTensorsCataloger_shardedDirectory(t *testing.T) { + userMeta := map[string]string{"format": "pt"} + // Two shards with distinct tensors → distinct per-shard metadata hashes, so + // the merge treats them as separate shards (3 tensors total across 2 shards). + shardA := map[string]safeTensorsEntry{ + "layers.0.weight": {DType: "BF16", Shape: []int64{10, 10}, DataOffsets: []int64{0, 200}}, + } + shardB := map[string]safeTensorsEntry{ + "layers.1.weight": {DType: "BF16", Shape: []int64{10, 10}, DataOffsets: []int64{0, 200}}, + "layers.2.weight": {DType: "BF16", Shape: []int64{10, 10}, DataOffsets: []int64{200, 400}}, + } + + dir := t.TempDir() + modelDir := filepath.Join(dir, "llama-sharded") + require.NoError(t, os.MkdirAll(modelDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model-00001-of-00002.safetensors"), + buildSafeTensorsFile(t, userMeta, shardA), 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model-00002-of-00002.safetensors"), + buildSafeTensorsFile(t, userMeta, shardB), 0o644)) + + pkgtest.NewCatalogTester(). + FromDirectory(t, dir). + ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { + require.Len(t, pkgs, 1) + got := pkgs[0] + assert.Equal(t, "llama-sharded", got.Name, "a sharded model with no config.json is named by its directory") + md := got.Metadata.(pkg.SafeTensorsModelInfo) + assert.Equal(t, 2, md.ShardCount) + assert.Equal(t, uint64(3), md.TensorCount, "tensor counts are summed across shards") + assert.Len(t, md.Parts, 2, "each shard is rolled up into Parts") + assert.Equal(t, "BF16", md.Quantization) + }). + TestCataloger(t, NewSafeTensorsCataloger()) +} + // TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit // a nameless package mirroring the config blob's producer-declared fields, and // emit nothing for non-safetensors formats so the GGUF cataloger can claim the @@ -510,6 +571,34 @@ spdx-id: Apache-2.0 assertHasLicense(t, out[0], "Apache-2.0") }) + t.Run("OCI: license layer wins over a README model-card license", func(t *testing.T) { + // When both a dedicated license layer and a README model-card license are + // present, the producer-curated license layer is authoritative. (If the + // README won, the resolved license would be MIT and this assertion fails.) + dir := t.TempDir() + licensePath := filepath.Join(dir, "LICENSE") + require.NoError(t, os.WriteFile(licensePath, []byte("---\nspdx-id: Apache-2.0\n---\n"), 0o644)) + readmePath := filepath.Join(dir, "README.md") + require.NoError(t, os.WriteFile(readmePath, + []byte("---\nlicense: MIT\nbase_model:\n - org/base\n---\n# Card\n"), 0o644)) + configPath := filepath.Join(dir, "config.json") + require.NoError(t, os.WriteFile(configPath, []byte(`{"_name_or_path":"org/precedence-model"}`), 0o644)) + resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ + dockerAIModelFileMediaType: {file.NewLocation(configPath), file.NewLocation(readmePath)}, + dockerAILicenseMediaType: {file.NewLocation(licensePath)}, + }) + + configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1} + out, _, err := safeTensorsMergeProcessor( + context.Background(), resolver, + []pkg.Package{ociPkg(configMd)}, nil, nil, + ) + require.NoError(t, err) + require.Len(t, out, 1) + assert.Equal(t, "precedence-model", out[0].Name) + assertHasLicense(t, out[0], "Apache-2.0") + }) + t.Run("passes through upstream error", func(t *testing.T) { sentinel := assert.AnError p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"}) diff --git a/syft/pkg/cataloger/ai/processor.go b/syft/pkg/cataloger/ai/processor.go index 7cc5018e2..c685aeb6e 100644 --- a/syft/pkg/cataloger/ai/processor.go +++ b/syft/pkg/cataloger/ai/processor.go @@ -52,9 +52,15 @@ func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg. } // fromOCIArtifact reports whether the packages came from an OCI model artifact. -// That source presents every layer at the virtual path "/", whereas a filesystem -// scan always carries a real file path. A single scan is one source, so the -// first package is representative of the rest. +// That source (the ContainerImageModel resolver) presents every layer at the +// virtual path "/", whereas a filesystem scan always carries a real file path. A +// single scan is one source, so the first package is representative of the rest. +// +// This deliberately keys off the path signal rather than type-asserting the +// resolver to file.OCIMediaTypeResolver: the test harness wraps resolvers in an +// ObservingResolver that implements that interface unconditionally, so an +// interface check would misclassify directory scans as OCI. The "/" path is the +// genuine, testable signal the OCI model source produces. func fromOCIArtifact(pkgs []pkg.Package) bool { loc := primaryEvidenceLocation(pkgs[0]) return loc != nil && loc.RealPath == "/" diff --git a/syft/pkg/safetensors.go b/syft/pkg/safetensors.go index ea752bd91..d0ca8c2dc 100644 --- a/syft/pkg/safetensors.go +++ b/syft/pkg/safetensors.go @@ -34,7 +34,7 @@ type SafeTensorsModelInfo struct { TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"` // TotalSize is the total byte size of tensor data across all shards when known - // (from the Docker AI model config "size" field or the sharded index "total_size"). + // (from the Docker AI model config "size" field). TotalSize string `json:"totalSize,omitempty" cyclonedx:"totalSize"` // ShardCount is the number of .safetensors shards for a sharded model (1 for a diff --git a/syft/source/ocimodelsource/oci_model_source.go b/syft/source/ocimodelsource/oci_model_source.go index 8fd5e3849..0fc697396 100644 --- a/syft/source/ocimodelsource/oci_model_source.go +++ b/syft/source/ocimodelsource/oci_model_source.go @@ -336,7 +336,8 @@ func (s *ociModelSource) Describe() source.Description { } } -// FileResolver returns a file resolver for accessing header of GGUF files. +// FileResolver returns a file resolver for accessing model headers and companion +// metadata (GGUF/SafeTensors headers, the model config blob, and companion layers). func (s *ociModelSource) FileResolver(_ source.Scope) (file.Resolver, error) { return s.resolver, nil } diff --git a/syft/source/ocimodelsource/registry_client.go b/syft/source/ocimodelsource/registry_client.go index d6c274c42..43adfd5e2 100644 --- a/syft/source/ocimodelsource/registry_client.go +++ b/syft/source/ocimodelsource/registry_client.go @@ -286,8 +286,9 @@ func (c *registryClient) fetchBlobRange(ctx context.Context, ref name.Reference, // https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#file-structure data := make([]byte, maxBytes) n, err := io.ReadFull(reader, data) - if err != nil && err != io.ErrUnexpectedEOF { - // ErrUnexpectedEOF is okay - it means the file is smaller than maxBytes + // ErrUnexpectedEOF means the layer is smaller than maxBytes; EOF means it is + // empty. Both mean we read everything there was, not a failure. + if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) && !errors.Is(err, io.EOF) { return nil, fmt.Errorf("failed to read layer data: %w", err) }