test: test cleanup

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2026-06-01 21:49:15 -04:00
parent e88d6d019e
commit 4352ac4691
No known key found for this signature in database
4 changed files with 481 additions and 354 deletions

View File

@ -13,6 +13,7 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest" "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
@ -39,7 +40,7 @@ func buildSafeTensorsFile(t *testing.T, metadata map[string]string, tensors map[
return out return out
} }
func TestSafeTensorsCataloger_singleFile(t *testing.T) { func TestSafeTensorsCataloger(t *testing.T) {
userMeta := map[string]string{"format": "pt"} userMeta := map[string]string{"format": "pt"}
tensors := map[string]safeTensorsEntry{ tensors := map[string]safeTensorsEntry{
"model.embed.weight": {DType: "BF16", Shape: []int64{1000, 16}, DataOffsets: []int64{0, 32000}}, "model.embed.weight": {DType: "BF16", Shape: []int64{1000, 16}, DataOffsets: []int64{0, 32000}},
@ -49,43 +50,61 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) {
// cataloger wires the header hash through to the package metadata. // cataloger wires the header hash through to the package metadata.
wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash() wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash()
dir := t.TempDir() tests := []struct {
modelDir := filepath.Join(dir, "models") name string
require.NoError(t, os.MkdirAll(modelDir, 0o755)) setup func(t *testing.T) string
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model.safetensors"), buildSafeTensorsFile(t, userMeta, tensors), 0o644)) expectedPackages []pkg.Package
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "config.json"), expectedRelationships []artifact.Relationship
[]byte(`{"architectures":["LlamaForCausalLM"],"torch_dtype":"bfloat16","transformers_version":"4.40.0","_name_or_path":"meta-llama/Llama-3-8B"}`), 0o644)) }{
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "README.md"),
[]byte("---\nlicense: Apache-2.0\nbase_model:\n - meta-llama/Llama-3\n---\n# Llama 3\n"), 0o644))
expected := []pkg.Package{
{ {
Name: "Llama-3-8B", name: "single-file model directory with config.json and README",
Type: pkg.ModelPkg, setup: func(t *testing.T) string {
Licenses: pkg.NewLicenseSet( dir := t.TempDir()
pkg.NewLicenseFromFields("Apache-2.0", "", nil), modelDir := filepath.Join(dir, "models")
), require.NoError(t, os.MkdirAll(modelDir, 0o755))
Metadata: pkg.SafeTensorsModelInfo{ require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model.safetensors"), buildSafeTensorsFile(t, userMeta, tensors), 0o644))
Format: "safetensors", require.NoError(t, os.WriteFile(filepath.Join(modelDir, "config.json"),
Architecture: "LlamaForCausalLM", []byte(`{"architectures":["LlamaForCausalLM"],"torch_dtype":"bfloat16","transformers_version":"4.40.0","_name_or_path":"meta-llama/Llama-3-8B"}`), 0o644))
Quantization: "BF16", require.NoError(t, os.WriteFile(filepath.Join(modelDir, "README.md"),
Parameters: "16.26K", []byte("---\nlicense: Apache-2.0\nbase_model:\n - meta-llama/Llama-3\n---\n# Llama 3\n"), 0o644))
TensorCount: 2, return dir
TorchDtype: "bfloat16", },
TransformersVersion: "4.40.0", expectedPackages: []pkg.Package{
ShardCount: 1, {
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}}, Name: "Llama-3-8B",
MetadataHash: wantHash, Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
),
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Architecture: "LlamaForCausalLM",
Quantization: "BF16",
Parameters: "16.26K",
TensorCount: 2,
TorchDtype: "bfloat16",
TransformersVersion: "4.40.0",
ShardCount: 1,
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
MetadataHash: wantHash,
},
},
}, },
}, },
} }
pkgtest.NewCatalogTester(). for _, tt := range tests {
FromDirectory(t, dir). t.Run(tt.name, func(t *testing.T) {
Expects(expected, nil). fixtureDir := tt.setup(t)
IgnoreLocationLayer().
IgnorePackageFields("FoundBy", "Locations"). pkgtest.NewCatalogTester().
TestCataloger(t, NewSafeTensorsCataloger()) FromDirectory(t, fixtureDir).
Expects(tt.expectedPackages, tt.expectedRelationships).
IgnoreLocationLayer().
IgnorePackageFields("FoundBy", "Locations").
TestCataloger(t, NewSafeTensorsCataloger())
})
}
} }
// TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit // TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit
@ -94,31 +113,48 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) {
// artifact. Naming and license resolution happen in the merge processor and are // artifact. Naming and license resolution happen in the merge processor and are
// tested separately under TestSafeTensorsMergeProcessor. // tested separately under TestSafeTensorsMergeProcessor.
func TestParseSafeTensorsOCIConfig(t *testing.T) { func TestParseSafeTensorsOCIConfig(t *testing.T) {
t.Run("emits a nameless package with config-blob fields", func(t *testing.T) { tests := []struct {
blob := []byte(`{"config":{"format":"safetensors","quantization":"Q4_K_M","parameters":"8B","size":"16.00GB","safetensors":{"tensor_count":291}}}`) name string
blob string
expectedPackages []pkg.Package // nil => parser must emit nothing
}{
{
name: "emits a nameless package with config-blob fields",
blob: `{"config":{"format":"safetensors","quantization":"Q4_K_M","parameters":"8B","size":"16.00GB","safetensors":{"tensor_count":291}}}`,
expectedPackages: []pkg.Package{
{
// nameless: the merge processor assigns the name and resolves
// licenses. Config blobs carry no header content, so
// MetadataHash stays empty.
Type: pkg.ModelPkg,
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Quantization: "Q4_K_M",
Parameters: "8B",
TotalSize: "16.00GB",
TensorCount: 291,
},
},
},
},
{
// non-safetensors formats emit nothing so the GGUF cataloger can claim
// the artifact.
name: "ignores non-safetensors format",
blob: `{"config":{"format":"gguf","quantization":"Q4_K_M"}}`,
expectedPackages: nil,
},
}
pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), nil, nil, configReader(blob)) for _, tt := range tests {
require.NoError(t, err) t.Run(tt.name, func(t *testing.T) {
require.Len(t, pkgs, 1) pkgtest.NewCatalogTester().
FromString("/config.json", tt.blob).
p := pkgs[0] Expects(tt.expectedPackages, nil).
assert.Empty(t, p.Name, "config-blob parser must emit nameless; the merge processor names it") IgnorePackageFields("FoundBy", "Locations").
assert.Empty(t, p.Licenses.ToSlice(), "license resolution belongs to the merge processor") TestParser(t, parseSafeTensorsOCIConfig)
md := p.Metadata.(pkg.SafeTensorsModelInfo) })
assert.Equal(t, "safetensors", md.Format) }
assert.Equal(t, "Q4_K_M", md.Quantization)
assert.Equal(t, "8B", md.Parameters)
assert.Equal(t, "16.00GB", md.TotalSize)
assert.Equal(t, uint64(291), md.TensorCount)
assert.Empty(t, md.MetadataHash, "config blobs have no header content to hash")
})
t.Run("ignores non-safetensors format", func(t *testing.T) {
ggufBlob := []byte(`{"config":{"format":"gguf","quantization":"Q4_K_M"}}`)
pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), nil, nil, configReader(ggufBlob))
require.NoError(t, err)
assert.Empty(t, pkgs)
})
} }
// TestSafeTensorsMergeProcessor exercises the merge processor directly with // TestSafeTensorsMergeProcessor exercises the merge processor directly with
@ -147,10 +183,10 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
} }
} }
t.Run("dir scan: dropped when no sibling config.json carries _name_or_path", func(t *testing.T) { t.Run("dir scan: parent directory base name names the group when no config.json is present", func(t *testing.T) {
// Without a config.json the dir-scan path has no name source. There is // Without a config.json the dir-scan path falls through to the
// intentionally no parent-dir fallback (or any opaque fallback), so the // parent directory base name. A Hugging Face-style model dir is named after the
// group is dropped rather than named after the filesystem layout. // model, so "/models/tiny-llama/weights.safetensors" → "tiny-llama".
p := dirPkg("/models/tiny-llama/weights.safetensors", pkg.SafeTensorsModelInfo{ p := dirPkg("/models/tiny-llama/weights.safetensors", pkg.SafeTensorsModelInfo{
Format: "safetensors", Format: "safetensors",
TensorCount: 4, TensorCount: 4,
@ -160,33 +196,46 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
resolver := file.NewMockResolverForPaths() // no config.json / README available resolver := file.NewMockResolverForPaths() // no config.json / README available
out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil) out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil)
require.NoError(t, err) require.NoError(t, err)
assert.Empty(t, out, "dir-scan group with no config.json must be dropped") require.Len(t, out, 1)
assert.Equal(t, "tiny-llama", out[0].Name, "rung 2: parent directory base name")
}) })
t.Run("dir scan: Architecture-Parameters alone does not name the package", func(t *testing.T) { t.Run("dir scan: nested model dirs group and name by immediate parent", func(t *testing.T) {
// Even with rich content-derived metadata (Architecture + Parameters), top := dirPkg("/namea/1.safetensors", pkg.SafeTensorsModelInfo{
// the package must be dropped when there is no producer-declared name. Format: "safetensors", TensorCount: 1, MetadataHash: "aaaa",
// The Arch-Params synthetic rung was removed because it produced labels })
// like "LlamaForCausalLM-2.68B" that SBOM consumers couldn't trace back nested := dirPkg("/namea/nameb/2.safetensors", pkg.SafeTensorsModelInfo{
// to a recognizable model. Format: "safetensors", TensorCount: 1, MetadataHash: "bbbb",
p := dirPkg("/models/tiny/weights.safetensors", pkg.SafeTensorsModelInfo{
Format: "safetensors",
Architecture: "LlamaForCausalLM",
Parameters: "2.68B",
TensorCount: 4,
MetadataHash: "abc",
}) })
resolver := file.NewMockResolverForPaths() resolver := file.NewMockResolverForPaths()
out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{top, nested}, nil, nil)
require.NoError(t, err)
require.Len(t, out, 2)
names := []string{out[0].Name, out[1].Name}
assert.ElementsMatch(t, []string{"namea", "nameb"}, names)
})
t.Run("dir scan: config.json _name_or_path beats the parent directory fallback", func(t *testing.T) {
// When a sibling config.json carries _name_or_path, that name takes precedence over the parent directory base name.
dir := t.TempDir()
require.NoError(t, os.WriteFile(filepath.Join(dir, "config.json"),
[]byte(`{"_name_or_path":"org/preferred-name"}`), 0o644))
stPath := filepath.Join(dir, "weights.safetensors")
p := dirPkg(stPath, pkg.SafeTensorsModelInfo{
Format: "safetensors", TensorCount: 1, MetadataHash: "abc",
})
resolver := file.NewMockResolverForPaths(filepath.Join(dir, "config.json"))
out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil) out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil)
require.NoError(t, err) require.NoError(t, err)
assert.Empty(t, out, "Arch-Params alone is not a name source") require.Len(t, out, 1)
assert.Equal(t, "preferred-name", out[0].Name, "rung 1 (config.json) wins over rung 2 (parent dir)")
}) })
t.Run("OCI: dropped when no name source is available", func(t *testing.T) { t.Run("OCI: dropped when no name source is available", func(t *testing.T) {
// The vllm-style shape: config-blob package + a weight-layer package, // The vllm-style shape: config-blob package + a weight-layer package,
// both at virtual path "/", no model.file companions on the resolver // both at virtual path "/", no model.file companions on the resolver
// AND no image ref. With nothing to derive a name from, the group is // AND no image ref. With nothing to derive a name from, the package is
// dropped — no opaque fallback. // dropped
configMd := pkg.SafeTensorsModelInfo{ configMd := pkg.SafeTensorsModelInfo{
Format: "safetensors", Format: "safetensors",
TensorCount: 5, TensorCount: 5,
@ -209,9 +258,7 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
t.Run("OCI: image-ref last segment names the group when config.json is absent", func(t *testing.T) { t.Run("OCI: image-ref last segment names the group when config.json is absent", func(t *testing.T) {
// vllm-style artifact: a repacked model whose embedded config.json has // vllm-style artifact: a repacked model whose embedded config.json has
// been stripped of _name_or_path. The merge processor falls through to // been stripped of _name_or_path.
// the second rung — the image-reference last segment — so we still emit
// a recognizable model name instead of dropping it.
configMd := pkg.SafeTensorsModelInfo{ configMd := pkg.SafeTensorsModelInfo{
Format: "safetensors", Format: "safetensors",
TensorCount: 290, TensorCount: 290,
@ -235,75 +282,6 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
assert.Equal(t, "smollm2-vllm", out[0].Name, "rung 2: image-ref repository basename") assert.Equal(t, "smollm2-vllm", out[0].Name, "rung 2: image-ref repository basename")
}) })
t.Run("OCI: config.json _name_or_path beats the image-ref fallback", func(t *testing.T) {
// When the embedded config.json carries _name_or_path, rung 1 wins over
// the image ref even if both are present.
dir := t.TempDir()
hfConfigPath := filepath.Join(dir, "config.json")
require.NoError(t, os.WriteFile(hfConfigPath,
[]byte(`{"_name_or_path":"org/preferred-name"}`), 0o644))
resolver := file.NewMockResolverForOCIArtifact(
"docker.io/ai/smollm2-vllm:360M",
map[string][]file.Location{
dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)},
},
)
configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1}
out, _, err := safeTensorsMergeProcessor(
context.Background(), resolver,
[]pkg.Package{ociPkg(configMd)}, nil, nil,
)
require.NoError(t, err)
require.Len(t, out, 1)
assert.Equal(t, "preferred-name", out[0].Name, "rung 1 (config.json) wins over rung 2 (image ref)")
})
t.Run("OCI: merges config + shard and names from companion config.json", func(t *testing.T) {
// Write a single model.file companion blob containing HF config.json so
// the processor can derive _name_or_path and Architecture from it.
dir := t.TempDir()
hfConfigPath := filepath.Join(dir, "config.json")
require.NoError(t, os.WriteFile(hfConfigPath,
[]byte(`{"architectures":["Qwen3ForCausalLM"],"torch_dtype":"bfloat16","_name_or_path":"org/qwen-tiny"}`), 0o644))
resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{
dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)},
})
configMd := pkg.SafeTensorsModelInfo{
Format: "safetensors",
Quantization: "Q4_K_M", // raw producer-declared value
Parameters: "8B",
TotalSize: "16.00GB",
TensorCount: 291,
}
shardMd := pkg.SafeTensorsModelInfo{
Format: "safetensors",
TensorCount: 100, // per-shard count — must NOT be summed onto the aggregate's 291
Quantization: "BF16",
MetadataHash: "deadbeef",
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
}
out, _, err := safeTensorsMergeProcessor(
context.Background(), resolver,
[]pkg.Package{ociPkg(configMd), ociPkg(shardMd)}, nil, nil,
)
require.NoError(t, err)
require.Len(t, out, 1)
got := out[0]
assert.Equal(t, "qwen-tiny", got.Name, "name comes from path.Base(_name_or_path)")
md := got.Metadata.(pkg.SafeTensorsModelInfo)
assert.Equal(t, uint64(291), md.TensorCount, "aggregate TensorCount must win — never double-count by summing the shard")
assert.Equal(t, "16.00GB", md.TotalSize)
assert.Equal(t, "8B", md.Parameters)
assert.Equal(t, "Qwen3ForCausalLM", md.Architecture, "Architecture enriched from companion config.json")
assert.Equal(t, "bfloat16", md.TorchDtype)
assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate Quantization wins over shard's normalized dtype when both present")
assert.Equal(t, "deadbeef", md.MetadataHash, "single-shard rollup is the lone shard's hash")
assert.Equal(t, pkg.KeyValues{{Key: "format", Value: "pt"}}, md.UserMetadata)
assert.Nil(t, md.Parts, "single-shard groups skip Parts; the outer view already exposes everything")
})
t.Run("OCI: multi-shard rollup hashes are stable and sorted", func(t *testing.T) { t.Run("OCI: multi-shard rollup hashes are stable and sorted", func(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
hfConfigPath := filepath.Join(dir, "config.json") hfConfigPath := filepath.Join(dir, "config.json")
@ -389,20 +367,21 @@ spdx-id: Apache-2.0
// //
// Precedence (highest → lowest): // Precedence (highest → lowest):
// 1. config.json _name_or_path (path.Base applied; both dir-scan and OCI) // 1. config.json _name_or_path (path.Base applied; both dir-scan and OCI)
// 2. OCI image-ref last segment (OCI only — empty string for dir scans) // 2. fallback name — OCI image-ref last segment, or dir-scan parent directory
// base name (the merge processor computes the right one per group)
// → drop (empty name) when nothing matches // → drop (empty name) when nothing matches
func TestSafeTensorsNamingPrecedence(t *testing.T) { func TestSafeTensorsNamingPrecedence(t *testing.T) {
cases := []struct { tests := []struct {
name string name string
nameOrPath string nameOrPath string
imageRefName string fallbackName string
want string want string
}{ }{
// rung 1 // rung 1
{ {
name: "rung 1: _name_or_path beats the image-ref fallback", name: "rung 1: _name_or_path beats the fallback",
nameOrPath: "org/MyModel", nameOrPath: "org/MyModel",
imageRefName: "fallback-ref", fallbackName: "fallback-name",
want: "MyModel", want: "MyModel",
}, },
{ {
@ -418,10 +397,15 @@ func TestSafeTensorsNamingPrecedence(t *testing.T) {
// rung 2 // rung 2
{ {
name: "rung 2: image-ref last segment used when _name_or_path is empty", name: "rung 2: OCI image-ref last segment used when _name_or_path is empty",
imageRefName: "smollm2-vllm", fallbackName: "smollm2-vllm",
want: "smollm2-vllm", want: "smollm2-vllm",
}, },
{
name: "rung 2: dir-scan parent directory name used when _name_or_path is empty",
fallbackName: "tiny-llama",
want: "tiny-llama",
},
// drops // drops
{ {
@ -430,10 +414,31 @@ func TestSafeTensorsNamingPrecedence(t *testing.T) {
}, },
} }
for _, tc := range cases { for _, tt := range tests {
t.Run(tc.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
got := pickSafeTensorsName(tc.nameOrPath, tc.imageRefName) got := pickSafeTensorsName(tt.nameOrPath, tt.fallbackName)
assert.Equal(t, tc.want, got) assert.Equal(t, tt.want, got)
})
}
}
// TestSafeTensorsDirName covers the directory-scan fallback name derivation,
// including the degenerate roots that must yield no name.
func TestSafeTensorsDirName(t *testing.T) {
tests := []struct {
groupKey string
want string
}{
{groupKey: "/models/tiny-llama", want: "tiny-llama"},
{groupKey: "/namea", want: "namea"},
{groupKey: "/namea/nameb", want: "nameb"},
{groupKey: "/", want: ""},
{groupKey: ".", want: ""},
{groupKey: "", want: ""},
}
for _, tt := range tests {
t.Run(tt.groupKey, func(t *testing.T) {
assert.Equal(t, tt.want, safeTensorsDirName(tt.groupKey))
}) })
} }
} }
@ -449,19 +454,26 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash() wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash()
t.Run("emits a nameless package with header-derived metadata", func(t *testing.T) { t.Run("emits a nameless package with header-derived metadata", func(t *testing.T) {
reader := file.NewLocationReadCloser(file.NewLocation("/"), io.NopCloser(bytes.NewReader(blob))) // nameless: the merge processor assigns the name. Parameters is the
pkgs, _, err := parseSafeTensorsOCILayer(context.Background(), nil, nil, reader) // summed element count of the two tensors (1024*16 + 16*16 = 16640).
require.NoError(t, err) expected := []pkg.Package{
require.Len(t, pkgs, 1) {
Type: pkg.ModelPkg,
p := pkgs[0] Metadata: pkg.SafeTensorsModelInfo{
assert.Empty(t, p.Name, "weight-layer parser must emit nameless; the merge processor names it") Format: "safetensors",
md := p.Metadata.(pkg.SafeTensorsModelInfo) Parameters: "16.64K",
assert.Equal(t, "safetensors", md.Format) Quantization: "BF16",
assert.Equal(t, uint64(2), md.TensorCount) TensorCount: 2,
assert.Equal(t, "BF16", md.Quantization) UserMetadata: wantUserMetadata,
assert.Equal(t, wantUserMetadata, md.UserMetadata) MetadataHash: wantHash,
assert.Equal(t, wantHash, md.MetadataHash) },
},
}
pkgtest.NewCatalogTester().
FromString("/", string(blob)).
Expects(expected, nil).
IgnorePackageFields("FoundBy", "Locations").
TestParser(t, parseSafeTensorsOCILayer)
}) })
t.Run("merged via processor: aggregate fields preserved, hash lifted from single shard", func(t *testing.T) { t.Run("merged via processor: aggregate fields preserved, hash lifted from single shard", func(t *testing.T) {
@ -535,27 +547,30 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
// Locking in the field values guards against changes to the header parser // Locking in the field values guards against changes to the header parser
// silently breaking on real-world content shape. // silently breaking on real-world content shape.
func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) { func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) {
data, err := os.ReadFile(filepath.Join("testdata", "safetensors", "nomic-embed-475M.header.safetensors")) // nameless before the merge processor runs. The fixture is immutable on disk;
require.NoError(t, err) // the locked field values (notably MetadataHash) guard against the header
require.Greater(t, len(data), 8, "fixture must include the 8-byte length prefix") // parser silently breaking on real-world content shape — if MetadataHash
// changes, either the hash algorithm or the canonicalization changed, both of
// which callers may rely on for cross-source identity.
expected := []pkg.Package{
{
Type: pkg.ModelPkg,
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Parameters: "475.29M",
Quantization: "F32", // every tensor in the captured shard is F32
TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
MetadataHash: "051a14e686673dea",
},
},
}
reader := file.NewLocationReadCloser(file.NewLocation("/"), io.NopCloser(bytes.NewReader(data))) pkgtest.NewCatalogTester().
pkgs, _, err := parseSafeTensorsOCILayer(context.Background(), nil, nil, reader) FromFile(t, filepath.Join("testdata", "safetensors", "nomic-embed-475M.header.safetensors")).
require.NoError(t, err) Expects(expected, nil).
require.Len(t, pkgs, 1) IgnorePackageFields("FoundBy", "Locations").
assert.Empty(t, pkgs[0].Name, "weight-layer packages are nameless before the merge processor runs") TestParser(t, parseSafeTensorsOCILayer)
md := pkgs[0].Metadata.(pkg.SafeTensorsModelInfo)
assert.Equal(t, "safetensors", md.Format)
assert.Equal(t, uint64(148), md.TensorCount, "nomic-embed-v2-moe 475M ships 148 tensor entries in this shard")
assert.Equal(t, "F32", md.Quantization, "every tensor in the captured shard is F32")
assert.Equal(t, "475.29M", md.Parameters)
assert.Equal(t, pkg.KeyValues{{Key: "format", Value: "pt"}}, md.UserMetadata)
// MetadataHash is locked to the exact value the parser produces for this
// captured input. The fixture is immutable on disk; if this value changes
// either the hash algorithm or the canonicalization changed, both of which
// callers may rely on for cross-source identity.
assert.Equal(t, "051a14e686673dea", md.MetadataHash)
} }
func TestSafeTensorsCrossSourceHashParity(t *testing.T) { func TestSafeTensorsCrossSourceHashParity(t *testing.T) {
@ -595,10 +610,6 @@ func TestSafeTensorsCrossSourceHashParity(t *testing.T) {
assert.Equal(t, dirHash, ociHash, "same content via dir scan and OCI weight-layer scan must hash equal") assert.Equal(t, dirHash, ociHash, "same content via dir scan and OCI weight-layer scan must hash equal")
} }
func configReader(blob []byte) file.LocationReadCloser {
return file.NewLocationReadCloser(file.NewLocation("/config.json"), io.NopCloser(bytes.NewReader(blob)))
}
func assertHasLicense(t *testing.T, p pkg.Package, value string) { func assertHasLicense(t *testing.T, p pkg.Package, value string) {
t.Helper() t.Helper()
for _, l := range p.Licenses.ToSlice() { for _, l := range p.Licenses.ToSlice() {
@ -610,28 +621,50 @@ func assertHasLicense(t *testing.T, p pkg.Package, value string) {
} }
func TestReadSafeTensorsHeader(t *testing.T) { func TestReadSafeTensorsHeader(t *testing.T) {
t.Run("valid header", func(t *testing.T) { zeroLength := make([]byte, 8) // length prefix of 0
data := buildSafeTensorsFile(t, map[string]string{"format": "pt"}, map[string]safeTensorsEntry{
"w": {DType: "F32", Shape: []int64{2, 2}, DataOffsets: []int64{0, 16}}, truncatedBody := make([]byte, 8)
binary.LittleEndian.PutUint64(truncatedBody, 100) // claims 100 bytes but supplies none
tests := []struct {
name string
data []byte
wantErr bool
assert func(t *testing.T, h *safeTensorsHeader)
}{
{
name: "valid header",
data: buildSafeTensorsFile(t, map[string]string{"format": "pt"}, map[string]safeTensorsEntry{
"w": {DType: "F32", Shape: []int64{2, 2}, DataOffsets: []int64{0, 16}},
}),
assert: func(t *testing.T, h *safeTensorsHeader) {
assert.Len(t, h.tensors, 1)
assert.Equal(t, "pt", h.metadata["format"])
},
},
{
name: "zero-length header",
data: zeroLength,
wantErr: true,
},
{
name: "truncated body",
data: truncatedBody,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
h, err := readSafeTensorsHeader(bytes.NewReader(tt.data))
if tt.wantErr {
require.Error(t, err)
return
}
require.NoError(t, err)
tt.assert(t, h)
}) })
h, err := readSafeTensorsHeader(bytes.NewReader(data)) }
require.NoError(t, err)
assert.Len(t, h.tensors, 1)
assert.Equal(t, "pt", h.metadata["format"])
})
t.Run("zero-length header", func(t *testing.T) {
var buf [8]byte // length prefix of 0
_, err := readSafeTensorsHeader(bytes.NewReader(buf[:]))
require.Error(t, err)
})
t.Run("truncated body", func(t *testing.T) {
var buf [8]byte
binary.LittleEndian.PutUint64(buf[:], 100) // claims 100 bytes but supplies none
_, err := readSafeTensorsHeader(bytes.NewReader(buf[:]))
require.Error(t, err)
})
} }
func TestSafeTensorsHeader_metadataHash(t *testing.T) { func TestSafeTensorsHeader_metadataHash(t *testing.T) {
@ -680,71 +713,112 @@ func TestSafeTensorsHeader_parameterCountAndDType(t *testing.T) {
} }
func TestNormalizeDType(t *testing.T) { func TestNormalizeDType(t *testing.T) {
cases := map[string]string{ tests := []struct {
"BF16": "BF16", name string
"float16": "F16", in string
"FP32": "F32", want string
"int8": "I8", }{
"U8": "U8", {name: "already canonical BF16", in: "BF16", want: "BF16"},
"bool": "BOOL", {name: "float16 alias", in: "float16", want: "F16"},
"weird": "WEIRD", {name: "FP32 alias", in: "FP32", want: "F32"},
{name: "int8 alias", in: "int8", want: "I8"},
{name: "U8 passthrough", in: "U8", want: "U8"},
{name: "bool", in: "bool", want: "BOOL"},
{name: "unknown value uppercased", in: "weird", want: "WEIRD"},
} }
for in, want := range cases { for _, tt := range tests {
assert.Equalf(t, want, normalizeDType(in), "normalizeDType(%q)", in) t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, normalizeDType(tt.in))
})
} }
} }
func TestFormatParameterCount(t *testing.T) { func TestFormatParameterCount(t *testing.T) {
cases := map[uint64]string{ tests := []struct {
512: "512", name string
16256: "16.26K", in uint64
2_680_000_000: "2.68B", want string
35_000_000: "35.00M", }{
{name: "raw count under 1K", in: 512, want: "512"},
{name: "thousands", in: 16256, want: "16.26K"},
{name: "billions", in: 2_680_000_000, want: "2.68B"},
{name: "millions", in: 35_000_000, want: "35.00M"},
} }
for in, want := range cases { for _, tt := range tests {
assert.Equalf(t, want, formatParameterCount(in), "formatParameterCount(%d)", in) t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, formatParameterCount(tt.in))
})
} }
} }
func TestParseFrontmatter(t *testing.T) { func TestParseFrontmatter(t *testing.T) {
t.Run("list base_model", func(t *testing.T) { tests := []struct {
fm := parseFrontmatter([]byte("---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody")) name string
require.NotNil(t, fm) input string
assert.Equal(t, "mit", fm.License) wantNil bool
assert.Equal(t, []string{"org/Model"}, fm.BaseModel) wantLicense string
}) wantBaseModel []string
}{
{
name: "list base_model",
input: "---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody",
wantLicense: "mit",
wantBaseModel: []string{"org/Model"},
},
{
name: "scalar base_model",
input: "---\nlicense: apache-2.0\nbase_model: org/Model\n---\n",
wantLicense: "apache-2.0",
wantBaseModel: []string{"org/Model"},
},
{
name: "leading BOM",
input: "\xef\xbb\xbf---\nlicense: mit\n---\n",
wantLicense: "mit",
},
{
name: "no frontmatter",
input: "# just a heading\n",
wantNil: true,
},
{
name: "unterminated frontmatter",
input: "---\nlicense: mit\n",
wantNil: true,
},
}
t.Run("scalar base_model", func(t *testing.T) { for _, tt := range tests {
fm := parseFrontmatter([]byte("---\nlicense: apache-2.0\nbase_model: org/Model\n---\n")) t.Run(tt.name, func(t *testing.T) {
require.NotNil(t, fm) fm := parseFrontmatter([]byte(tt.input))
assert.Equal(t, "apache-2.0", fm.License) if tt.wantNil {
assert.Equal(t, []string{"org/Model"}, fm.BaseModel) assert.Nil(t, fm)
}) return
}
t.Run("leading BOM", func(t *testing.T) { require.NotNil(t, fm)
fm := parseFrontmatter([]byte("\xef\xbb\xbf---\nlicense: mit\n---\n")) assert.Equal(t, tt.wantLicense, fm.License)
require.NotNil(t, fm) if tt.wantBaseModel != nil {
assert.Equal(t, "mit", fm.License) assert.Equal(t, tt.wantBaseModel, fm.BaseModel)
}) }
})
t.Run("no frontmatter", func(t *testing.T) { }
assert.Nil(t, parseFrontmatter([]byte("# just a heading\n")))
})
t.Run("unterminated frontmatter", func(t *testing.T) {
assert.Nil(t, parseFrontmatter([]byte("---\nlicense: mit\n")))
})
} }
// TestParseLicenseFrontmatter covers the choosealicense.com-style YAML // TestParseLicenseFrontmatter covers the choosealicense.com-style YAML
// frontmatter Docker Model Runner uses for its license layers. Only spdx-id // frontmatter Docker Model Runner uses for its license layers. Only spdx-id
// is consumed; everything else in the block is ignored. // is consumed; everything else in the block is ignored.
func TestParseLicenseFrontmatter(t *testing.T) { func TestParseLicenseFrontmatter(t *testing.T) {
t.Run("Apache-2.0 (the canonical choosealicense.com shape)", func(t *testing.T) { // The Apache-2.0 case is the exact frontmatter shape from
// This is the exact frontmatter shape from // https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt
// https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt // Docker AI license layers ship a near-identical block.
// Docker AI license layers ship a near-identical block. tests := []struct {
buf := []byte(`--- name string
input string
want string
}{
{
name: "Apache-2.0 (the canonical choosealicense.com shape)",
input: `---
title: Apache License 2.0 title: Apache License 2.0
spdx-id: Apache-2.0 spdx-id: Apache-2.0
redirect_from: /licenses/apache/ redirect_from: /licenses/apache/
@ -780,29 +854,36 @@ limitations:
Apache License Apache License
Version 2.0, January 2004 Version 2.0, January 2004
`) `,
assert.Equal(t, "Apache-2.0", parseLicenseFrontmatter(buf)) want: "Apache-2.0",
}) },
{
name: "MIT with BOM prefix",
input: "\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n",
want: "MIT",
},
{
name: "frontmatter without spdx-id falls through (returns empty)",
input: "---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n",
want: "",
},
{
name: "plain license text without any frontmatter",
input: " Apache License\n Version 2.0, January 2004\n",
want: "",
},
{
name: "unterminated frontmatter block",
input: "---\nspdx-id: MIT\n(never closes)\n",
want: "",
},
}
t.Run("MIT with BOM prefix", func(t *testing.T) { for _, tt := range tests {
buf := []byte("\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n") t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, "MIT", parseLicenseFrontmatter(buf)) assert.Equal(t, tt.want, parseLicenseFrontmatter([]byte(tt.input)))
}) })
}
t.Run("frontmatter without spdx-id falls through (returns empty)", func(t *testing.T) {
buf := []byte("---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n")
assert.Empty(t, parseLicenseFrontmatter(buf))
})
t.Run("plain license text without any frontmatter", func(t *testing.T) {
buf := []byte(" Apache License\n Version 2.0, January 2004\n")
assert.Empty(t, parseLicenseFrontmatter(buf))
})
t.Run("unterminated frontmatter block", func(t *testing.T) {
buf := []byte("---\nspdx-id: MIT\n(never closes)\n")
assert.Empty(t, parseLicenseFrontmatter(buf))
})
} }
func TestDockerAIModelConfigMediaTypes(t *testing.T) { func TestDockerAIModelConfigMediaTypes(t *testing.T) {
@ -816,14 +897,27 @@ func TestDockerAIModelConfigMediaTypes(t *testing.T) {
} }
return false return false
} }
// the known, verified schema versions are consumed
assert.True(t, supported("application/vnd.docker.ai.model.config.v0.1+json")) tests := []struct {
assert.True(t, supported("application/vnd.docker.ai.model.config.v0.2+json")) name string
// unknown/future schema versions are intentionally NOT consumed, to avoid mediaType string
// silently ingesting a potentially breaking config change want bool
assert.False(t, supported("application/vnd.docker.ai.model.config.v0.3+json")) }{
assert.False(t, supported("application/vnd.docker.ai.model.config.v9.9+json")) // the known, verified schema versions are consumed
// sibling layer media types are not matched either {name: "known schema v0.1 is consumed", mediaType: "application/vnd.docker.ai.model.config.v0.1+json", want: true},
assert.False(t, supported("application/vnd.docker.ai.model.file")) {name: "known schema v0.2 is consumed", mediaType: "application/vnd.docker.ai.model.config.v0.2+json", want: true},
assert.False(t, supported("application/vnd.docker.ai.gguf.v3")) // unknown/future schema versions are intentionally NOT consumed, to avoid
// silently ingesting a potentially breaking config change
{name: "unknown schema v0.3 is rejected", mediaType: "application/vnd.docker.ai.model.config.v0.3+json", want: false},
{name: "far-future schema v9.9 is rejected", mediaType: "application/vnd.docker.ai.model.config.v9.9+json", want: false},
// sibling layer media types are not matched either
{name: "sibling model.file layer is not matched", mediaType: "application/vnd.docker.ai.model.file", want: false},
{name: "sibling gguf layer is not matched", mediaType: "application/vnd.docker.ai.gguf.v3", want: false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, supported(tt.mediaType))
})
}
} }

View File

@ -127,10 +127,10 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
out := other out := other
for _, key := range keys { for _, key := range keys {
merged := mergeSafeTensorsGroup(groups[key]) merged := mergeSafeTensorsGroup(groups[key])
nameOrPath, imageRefName := enrichSafeTensorsGroup(ctx, resolver, key, &merged) nameOrPath, fallbackName := enrichSafeTensorsGroup(ctx, resolver, key, &merged)
name := pickSafeTensorsName(nameOrPath, imageRefName) name := pickSafeTensorsName(nameOrPath, fallbackName)
if name == "" { if name == "" {
continue // drop unnameable groups, per design (no opaque fallback) continue // drop groups with no name source and no usable fallback
} }
merged.Name = name merged.Name = name
merged.SetID() merged.SetID()
@ -142,8 +142,6 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
// groupSafeTensorsPackages buckets packages by the parent directory of their // groupSafeTensorsPackages buckets packages by the parent directory of their
// primary-evidence location, or the OCI sentinel when the location lives at // primary-evidence location, or the OCI sentinel when the location lives at
// the ContainerImageModel resolver's virtual "/" path. // the ContainerImageModel resolver's virtual "/" path.
// TODO: assemble a test where there are cases for DIR ran into for a single scan
// - safe tensors at the top level as well as sub directories
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package { func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
out := make(map[string][]pkg.Package) out := make(map[string][]pkg.Package)
for _, p := range pkgs { for _, p := range pkgs {
@ -253,19 +251,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
// mergeShardsInto folds the per-shard header metadata into merged, returning // mergeShardsInto folds the per-shard header metadata into merged, returning
// the summed shard TensorCount and the list of non-empty per-shard hashes for // the summed shard TensorCount and the list of non-empty per-shard hashes for
// the rollup. Architecture / TorchDtype / TransformersVersion are accepted as // the rollup. Shards carry only the content-derived fields (Quantization,
// fallbacks if a shard ever carries them (the current parsers don't, but the // Parameters, UserMetadata); producer-declared fields like Architecture come
// resolver-backed enrichment runs afterwards and won't overwrite anything // from the resolver-backed enrichment that runs afterwards.
// already set, so it's safe to populate them earlier).
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) { func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
seenKV := map[string]bool{} seenKV := map[string]bool{}
for _, s := range shards { for _, s := range shards {
shardTensorTotal += s.TensorCount shardTensorTotal += s.TensorCount
firstNonEmpty(&merged.Quantization, s.Quantization) firstNonEmpty(&merged.Quantization, s.Quantization)
firstNonEmpty(&merged.Parameters, s.Parameters) firstNonEmpty(&merged.Parameters, s.Parameters)
firstNonEmpty(&merged.Architecture, s.Architecture)
firstNonEmpty(&merged.TorchDtype, s.TorchDtype)
firstNonEmpty(&merged.TransformersVersion, s.TransformersVersion)
for _, kv := range s.UserMetadata { for _, kv := range s.UserMetadata {
if seenKV[kv.Key] { if seenKV[kv.Key] {
continue continue
@ -336,10 +330,11 @@ func rollupHash(hashes []string) string {
// merged metadata's Architecture / TorchDtype / TransformersVersion, set the // merged metadata's Architecture / TorchDtype / TransformersVersion, set the
// licenses on the merged package, and attach the location of every consulted // licenses on the merged package, and attach the location of every consulted
// supporting file as SupportingEvidence. Returns two name candidates for the // supporting file as SupportingEvidence. Returns two name candidates for the
// merge processor: nameOrPath (raw _name_or_path from a config.json) and // merge processor: nameOrPath (raw _name_or_path from a config.json) and a
// imageRefName (the last path segment of the OCI image reference, empty for // fallbackName used when no _name_or_path is available — the last path segment
// dir-scan groups). // of the OCI image reference for OCI groups, or the parent directory's base
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath, imageRefName string) { // name for directory-scan groups.
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath, fallbackName string) {
md := merged.Metadata.(pkg.SafeTensorsModelInfo) md := merged.Metadata.(pkg.SafeTensorsModelInfo)
var ( var (
@ -347,9 +342,10 @@ func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKe
supporting []file.Location supporting []file.Location
) )
if groupKey == ociGroupKey { if groupKey == ociGroupKey {
nameOrPath, imageRefName, lics, supporting = enrichSafeTensorsOCI(ctx, resolver, &md) nameOrPath, fallbackName, lics, supporting = enrichSafeTensorsOCI(ctx, resolver, &md)
} else { } else {
nameOrPath, lics, supporting = enrichSafeTensorsDir(ctx, resolver, groupKey, &md) nameOrPath, lics, supporting = enrichSafeTensorsDir(ctx, resolver, groupKey, &md)
fallbackName = safeTensorsDirName(groupKey)
} }
merged.Metadata = md merged.Metadata = md
@ -359,13 +355,28 @@ func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKe
for _, loc := range supporting { for _, loc := range supporting {
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)) merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
} }
return nameOrPath, imageRefName return nameOrPath, fallbackName
} }
// safeTensorsDirName returns the directory-scan naming fallback: the base name
// of the group's parent directory (the group key is already that directory).
// For "/models/tiny-llama" this returns "tiny-llama".
//
// Degenerate keys that carry no meaningful model name return "", so the group
// is dropped rather than labeled with a filesystem artifact. That covers "/",
// ".", ".." and the empty key (path.Base maps "" to ".", so it needs no case
// of its own).
func safeTensorsDirName(groupKey string) string {
	switch base := path.Base(groupKey); base {
	case "/", ".", "..":
		return ""
	default:
		return base
	}
}
// enrichSafeTensorsDir handles the directory-scan case: look for a config.json
// beside the model files (walking up parent directories to the scanned source
// root if no sibling exists) and a sibling README.md.
func enrichSafeTensorsDir(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) { func enrichSafeTensorsDir(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) {
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil { if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
applyHFConfig(md, cfg) applyHFConfig(md, cfg)
nameOrPath = cfg.NameOrPath nameOrPath = cfg.NameOrPath
supporting = append(supporting, *loc) supporting = append(supporting, *loc)
@ -518,9 +529,9 @@ func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pk
if err != nil { if err != nil {
return false return false
} }
trimmed := trimLeadingWhitespace(buf) trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
switch { switch {
case hasPrefix(trimmed, "---"): case bytes.HasPrefix(trimmed, []byte("---")):
fm := parseFrontmatter(buf) fm := parseFrontmatter(buf)
if fm == nil { if fm == nil {
return false return false
@ -532,7 +543,7 @@ func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pk
*readmeName = fm.BaseModel[0] *readmeName = fm.BaseModel[0]
} }
return true return true
case hasPrefix(trimmed, "{"): case bytes.HasPrefix(trimmed, []byte("{")):
var cfg hfConfig var cfg hfConfig
if err := json.Unmarshal(buf, &cfg); err != nil { if err := json.Unmarshal(buf, &cfg); err != nil {
return false return false
@ -566,26 +577,29 @@ func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
// //
// 1. config.json _name_or_path (path.Base, so "org/Model" → "Model"; // 1. config.json _name_or_path (path.Base, so "org/Model" → "Model";
// applies to both dir-scan and OCI groups) // applies to both dir-scan and OCI groups)
// 2. OCI image-ref last segment (OCI-only; the user-supplied artifact // 2. fallback name — the group's source-specific positional identifier:
// reference's repository basename, e.g. // the OCI image-ref repository basename for OCI groups (e.g.
// "docker.io/ai/smollm2-vllm:360M" → "smollm2-vllm") // "docker.io/ai/smollm2-vllm:360M" → "smollm2-vllm"), or the parent
// directory base name for directory-scan groups (e.g.
// "/models/tiny-llama/*.safetensors" → "tiny-llama")
// //
// Returns "" to signal the merge processor should drop the group. There is // Returns "" to signal the merge processor should drop the group. There is
// intentionally no Architecture-Parameters synthetic or parent-directory // intentionally no Architecture-Parameters synthetic and no opaque hash label:
// fallback: an unnameable model is recorded as absent rather than under a // when neither a producer-declared name nor a positional fallback is available
// label the SBOM consumer would not recognize. // the model is recorded as absent rather than under a label the SBOM consumer
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath != "" {
		switch base := path.Base(nameOrPath); base {
		case "/", ".", "..":
			// A _name_or_path such as "." or "./" reduces to a filesystem
			// artifact rather than a model name; prefer the fallback.
		default:
			return base
		}
	}
	// May be "", which tells the caller to drop the group.
	return fallbackName
}
// --- Relocated enrichment helpers ---------------------------------------- // --- Enrichment helpers ---------------------------------------------------
// //
// These types and functions used to live in the parser files; they moved here // The parsers decode only the safetensors-specific format; every resolver-backed
// when the parsers shrank to "just decode the safetensors-specific format" and // read (config.json, README, license layers) is centralized here in the merge
// every resolver-backed read centralized in the merge processor. // processor, along with the types those reads decode into.
// hfConfig is a minimal projection of Hugging Face config.json fields. // hfConfig is a minimal projection of Hugging Face config.json fields.
type hfConfig struct { type hfConfig struct {
@ -601,6 +615,27 @@ type readmeFrontmatter struct {
BaseModel []string `yaml:"base_model"` BaseModel []string `yaml:"base_model"`
} }
// findDirHFConfig searches for a config.json starting in dir and then in each
// successive parent directory, returning the first one that readDirHFConfig
// can produce a config for. The closest config therefore wins: a sibling of
// the model files first, then the nearest ancestor.
//
// No depth limit is needed. path.Dir eventually stops changing its input
// ("/" or "."), and that fixed point ends the walk with (nil, nil).
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
	for current := dir; ; {
		loc, cfg := readDirHFConfig(resolver, path.Join(current, "config.json"))
		if cfg != nil {
			return loc, cfg
		}
		next := path.Dir(current)
		if next == current {
			// path.Dir has converged: there is no further ancestor to try.
			return nil, nil
		}
		current = next
	}
}
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) { func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
locations, err := resolver.FilesByPath(p) locations, err := resolver.FilesByPath(p)
if err != nil || len(locations) == 0 { if err != nil || len(locations) == 0 {
@ -715,18 +750,3 @@ func parseLicenseFrontmatter(buf []byte) string {
} }
return fm.SPDXID return fm.SPDXID
} }
// hasPrefix reports whether the byte slice b begins with the string s.
// An empty s is a prefix of every slice, including a nil one.
func hasPrefix(b []byte, s string) bool {
	if len(s) > len(b) {
		return false
	}
	head := b[:len(s)]
	return string(head) == s
}
// trimLeadingWhitespace returns b with its leading run of spaces, tabs,
// carriage returns and newlines removed and then, if the remainder starts with
// a UTF-8 byte order mark (EF BB BF), with that mark removed as well. Nothing
// after the mark is trimmed. The result shares b's backing array.
func trimLeadingWhitespace(b []byte) []byte {
	start := 0
	for start < len(b) {
		c := b[start]
		if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
			break
		}
		start++
	}
	rest := b[start:]
	if len(rest) >= 3 && string(rest[:3]) == "\xef\xbb\xbf" {
		rest = rest[3:]
	}
	return rest
}

View File

@ -52,6 +52,11 @@ func (b *testGGUFBuilder) withVersion(v uint32) *testGGUFBuilder {
return b return b
} }
// withTensorCount sets the builder's tensorCount field to count and returns
// the same builder so calls can be chained.
func (b *testGGUFBuilder) withTensorCount(count uint64) *testGGUFBuilder {
	b.tensorCount = count
	return b
}
func (b *testGGUFBuilder) withStringKV(key, value string) *testGGUFBuilder { func (b *testGGUFBuilder) withStringKV(key, value string) *testGGUFBuilder {
b.kvPairs = append(b.kvPairs, testKVPair{key: key, valueType: ggufTypeString, value: value}) b.kvPairs = append(b.kvPairs, testKVPair{key: key, valueType: ggufTypeString, value: value})
return b return b
@ -114,3 +119,10 @@ func (b *testGGUFBuilder) build() []byte {
return b.buf.Bytes() return b.buf.Bytes()
} }
// buildInvalidMagic returns the content of a file with an invalid magic
// number: exactly four bytes, the little-endian encoding of 0x12345678.
// The builder's own state is not consulted.
func (b *testGGUFBuilder) buildInvalidMagic() []byte {
	// PutUint32 cannot fail, unlike binary.Write whose error was being
	// discarded, and it avoids the reflection path for a fixed-size value.
	magic := make([]byte, 4)
	binary.LittleEndian.PutUint32(magic, 0x12345678)
	return magic
}

View File

@ -3,13 +3,14 @@ package pkg
// SafeTensorsModelInfo holds the model details extracted from SafeTensors content. // SafeTensorsModelInfo holds the model details extracted from SafeTensors content.
// SafeTensors is a simple, safe serialization format for storing tensors, used // SafeTensors is a simple, safe serialization format for storing tensors, used
// as the default weight format for Hugging Face transformer models. Syft may // as the default weight format for Hugging Face transformer models. Syft may
// populate this struct from three sources: // populate this struct from these sources:
// - a single .safetensors file (header-only parse) // - a single .safetensors file (header-only parse)
// - a sharded model described by model.safetensors.index.json // - the per-shard headers of a multi-shard model, merged into one package
// - a Docker AI OCI model artifact config blob (vnd.docker.ai.model.config.v0.1+json) // - a Docker AI OCI model artifact: the config blob
// (vnd.docker.ai.model.config.v0.1+json) plus each weight layer's header
// //
// The Model Name, License, and Version fields have all been lifted up to be on // Model name, license, and version live on the enclosing syft Package rather
// the syft Package. // than in this struct.
type SafeTensorsModelInfo struct { type SafeTensorsModelInfo struct {
// Format is the source format label (always "safetensors" for this metadata type). // Format is the source format label (always "safetensors" for this metadata type).
// Present because the Docker AI model config blob carries an explicit format field // Present because the Docker AI model config blob carries an explicit format field