mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
test: test cleanup
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
e88d6d019e
commit
4352ac4691
@ -13,6 +13,7 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/anchore/syft/syft/artifact"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
|
||||
@ -39,7 +40,7 @@ func buildSafeTensorsFile(t *testing.T, metadata map[string]string, tensors map[
|
||||
return out
|
||||
}
|
||||
|
||||
func TestSafeTensorsCataloger_singleFile(t *testing.T) {
|
||||
func TestSafeTensorsCataloger(t *testing.T) {
|
||||
userMeta := map[string]string{"format": "pt"}
|
||||
tensors := map[string]safeTensorsEntry{
|
||||
"model.embed.weight": {DType: "BF16", Shape: []int64{1000, 16}, DataOffsets: []int64{0, 32000}},
|
||||
@ -49,43 +50,61 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) {
|
||||
// cataloger wires the header hash through to the package metadata.
|
||||
wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash()
|
||||
|
||||
dir := t.TempDir()
|
||||
modelDir := filepath.Join(dir, "models")
|
||||
require.NoError(t, os.MkdirAll(modelDir, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model.safetensors"), buildSafeTensorsFile(t, userMeta, tensors), 0o644))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "config.json"),
|
||||
[]byte(`{"architectures":["LlamaForCausalLM"],"torch_dtype":"bfloat16","transformers_version":"4.40.0","_name_or_path":"meta-llama/Llama-3-8B"}`), 0o644))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "README.md"),
|
||||
[]byte("---\nlicense: Apache-2.0\nbase_model:\n - meta-llama/Llama-3\n---\n# Llama 3\n"), 0o644))
|
||||
|
||||
expected := []pkg.Package{
|
||||
tests := []struct {
|
||||
name string
|
||||
setup func(t *testing.T) string
|
||||
expectedPackages []pkg.Package
|
||||
expectedRelationships []artifact.Relationship
|
||||
}{
|
||||
{
|
||||
Name: "Llama-3-8B",
|
||||
Type: pkg.ModelPkg,
|
||||
Licenses: pkg.NewLicenseSet(
|
||||
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
|
||||
),
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Architecture: "LlamaForCausalLM",
|
||||
Quantization: "BF16",
|
||||
Parameters: "16.26K",
|
||||
TensorCount: 2,
|
||||
TorchDtype: "bfloat16",
|
||||
TransformersVersion: "4.40.0",
|
||||
ShardCount: 1,
|
||||
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
|
||||
MetadataHash: wantHash,
|
||||
name: "single-file model directory with config.json and README",
|
||||
setup: func(t *testing.T) string {
|
||||
dir := t.TempDir()
|
||||
modelDir := filepath.Join(dir, "models")
|
||||
require.NoError(t, os.MkdirAll(modelDir, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model.safetensors"), buildSafeTensorsFile(t, userMeta, tensors), 0o644))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "config.json"),
|
||||
[]byte(`{"architectures":["LlamaForCausalLM"],"torch_dtype":"bfloat16","transformers_version":"4.40.0","_name_or_path":"meta-llama/Llama-3-8B"}`), 0o644))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "README.md"),
|
||||
[]byte("---\nlicense: Apache-2.0\nbase_model:\n - meta-llama/Llama-3\n---\n# Llama 3\n"), 0o644))
|
||||
return dir
|
||||
},
|
||||
expectedPackages: []pkg.Package{
|
||||
{
|
||||
Name: "Llama-3-8B",
|
||||
Type: pkg.ModelPkg,
|
||||
Licenses: pkg.NewLicenseSet(
|
||||
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
|
||||
),
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Architecture: "LlamaForCausalLM",
|
||||
Quantization: "BF16",
|
||||
Parameters: "16.26K",
|
||||
TensorCount: 2,
|
||||
TorchDtype: "bfloat16",
|
||||
TransformersVersion: "4.40.0",
|
||||
ShardCount: 1,
|
||||
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
|
||||
MetadataHash: wantHash,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, dir).
|
||||
Expects(expected, nil).
|
||||
IgnoreLocationLayer().
|
||||
IgnorePackageFields("FoundBy", "Locations").
|
||||
TestCataloger(t, NewSafeTensorsCataloger())
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
fixtureDir := tt.setup(t)
|
||||
|
||||
pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, fixtureDir).
|
||||
Expects(tt.expectedPackages, tt.expectedRelationships).
|
||||
IgnoreLocationLayer().
|
||||
IgnorePackageFields("FoundBy", "Locations").
|
||||
TestCataloger(t, NewSafeTensorsCataloger())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit
|
||||
@ -94,31 +113,48 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) {
|
||||
// artifact. Naming and license resolution happen in the merge processor and are
|
||||
// tested separately under TestSafeTensorsMergeProcessor.
|
||||
func TestParseSafeTensorsOCIConfig(t *testing.T) {
|
||||
t.Run("emits a nameless package with config-blob fields", func(t *testing.T) {
|
||||
blob := []byte(`{"config":{"format":"safetensors","quantization":"Q4_K_M","parameters":"8B","size":"16.00GB","safetensors":{"tensor_count":291}}}`)
|
||||
tests := []struct {
|
||||
name string
|
||||
blob string
|
||||
expectedPackages []pkg.Package // nil => parser must emit nothing
|
||||
}{
|
||||
{
|
||||
name: "emits a nameless package with config-blob fields",
|
||||
blob: `{"config":{"format":"safetensors","quantization":"Q4_K_M","parameters":"8B","size":"16.00GB","safetensors":{"tensor_count":291}}}`,
|
||||
expectedPackages: []pkg.Package{
|
||||
{
|
||||
// nameless: the merge processor assigns the name and resolves
|
||||
// licenses. Config blobs carry no header content, so
|
||||
// MetadataHash stays empty.
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Quantization: "Q4_K_M",
|
||||
Parameters: "8B",
|
||||
TotalSize: "16.00GB",
|
||||
TensorCount: 291,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
// non-safetensors formats emit nothing so the GGUF cataloger can claim
|
||||
// the artifact.
|
||||
name: "ignores non-safetensors format",
|
||||
blob: `{"config":{"format":"gguf","quantization":"Q4_K_M"}}`,
|
||||
expectedPackages: nil,
|
||||
},
|
||||
}
|
||||
|
||||
pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), nil, nil, configReader(blob))
|
||||
require.NoError(t, err)
|
||||
require.Len(t, pkgs, 1)
|
||||
|
||||
p := pkgs[0]
|
||||
assert.Empty(t, p.Name, "config-blob parser must emit nameless; the merge processor names it")
|
||||
assert.Empty(t, p.Licenses.ToSlice(), "license resolution belongs to the merge processor")
|
||||
md := p.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
assert.Equal(t, "safetensors", md.Format)
|
||||
assert.Equal(t, "Q4_K_M", md.Quantization)
|
||||
assert.Equal(t, "8B", md.Parameters)
|
||||
assert.Equal(t, "16.00GB", md.TotalSize)
|
||||
assert.Equal(t, uint64(291), md.TensorCount)
|
||||
assert.Empty(t, md.MetadataHash, "config blobs have no header content to hash")
|
||||
})
|
||||
|
||||
t.Run("ignores non-safetensors format", func(t *testing.T) {
|
||||
ggufBlob := []byte(`{"config":{"format":"gguf","quantization":"Q4_K_M"}}`)
|
||||
pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), nil, nil, configReader(ggufBlob))
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, pkgs)
|
||||
})
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
pkgtest.NewCatalogTester().
|
||||
FromString("/config.json", tt.blob).
|
||||
Expects(tt.expectedPackages, nil).
|
||||
IgnorePackageFields("FoundBy", "Locations").
|
||||
TestParser(t, parseSafeTensorsOCIConfig)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestSafeTensorsMergeProcessor exercises the merge processor directly with
|
||||
@ -147,10 +183,10 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("dir scan: dropped when no sibling config.json carries _name_or_path", func(t *testing.T) {
|
||||
// Without a config.json the dir-scan path has no name source. There is
|
||||
// intentionally no parent-dir fallback (or any opaque fallback), so the
|
||||
// group is dropped rather than named after the filesystem layout.
|
||||
t.Run("dir scan: parent directory base name names the group when no config.json is present", func(t *testing.T) {
|
||||
// Without a config.json the dir-scan path falls through to the
|
||||
// parent directory base name. A Hugging Face-style model dir is named after the
|
||||
// model, so "/models/tiny-llama/weights.safetensors" → "tiny-llama".
|
||||
p := dirPkg("/models/tiny-llama/weights.safetensors", pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: 4,
|
||||
@ -160,33 +196,46 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
|
||||
resolver := file.NewMockResolverForPaths() // no config.json / README available
|
||||
out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil)
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, out, "dir-scan group with no config.json must be dropped")
|
||||
require.Len(t, out, 1)
|
||||
assert.Equal(t, "tiny-llama", out[0].Name, "rung 2: parent directory base name")
|
||||
})
|
||||
|
||||
t.Run("dir scan: Architecture-Parameters alone does not name the package", func(t *testing.T) {
|
||||
// Even with rich content-derived metadata (Architecture + Parameters),
|
||||
// the package must be dropped when there is no producer-declared name.
|
||||
// The Arch-Params synthetic rung was removed because it produced labels
|
||||
// like "LlamaForCausalLM-2.68B" that SBOM consumers couldn't trace back
|
||||
// to a recognizable model.
|
||||
p := dirPkg("/models/tiny/weights.safetensors", pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Architecture: "LlamaForCausalLM",
|
||||
Parameters: "2.68B",
|
||||
TensorCount: 4,
|
||||
MetadataHash: "abc",
|
||||
t.Run("dir scan: nested model dirs group and name by immediate parent", func(t *testing.T) {
|
||||
top := dirPkg("/namea/1.safetensors", pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors", TensorCount: 1, MetadataHash: "aaaa",
|
||||
})
|
||||
nested := dirPkg("/namea/nameb/2.safetensors", pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors", TensorCount: 1, MetadataHash: "bbbb",
|
||||
})
|
||||
resolver := file.NewMockResolverForPaths()
|
||||
out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{top, nested}, nil, nil)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, out, 2)
|
||||
names := []string{out[0].Name, out[1].Name}
|
||||
assert.ElementsMatch(t, []string{"namea", "nameb"}, names)
|
||||
})
|
||||
|
||||
t.Run("dir scan: config.json _name_or_path beats the parent directory fallback", func(t *testing.T) {
|
||||
// When a sibling config.json carries _name_or_path, it wins over the parent directory fallback.
|
||||
dir := t.TempDir()
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "config.json"),
|
||||
[]byte(`{"_name_or_path":"org/preferred-name"}`), 0o644))
|
||||
stPath := filepath.Join(dir, "weights.safetensors")
|
||||
p := dirPkg(stPath, pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors", TensorCount: 1, MetadataHash: "abc",
|
||||
})
|
||||
resolver := file.NewMockResolverForPaths(filepath.Join(dir, "config.json"))
|
||||
out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil)
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, out, "Arch-Params alone is not a name source")
|
||||
require.Len(t, out, 1)
|
||||
assert.Equal(t, "preferred-name", out[0].Name, "rung 1 (config.json) wins over rung 2 (parent dir)")
|
||||
})
|
||||
|
||||
t.Run("OCI: dropped when no name source is available", func(t *testing.T) {
|
||||
// The vllm-style shape: config-blob package + a weight-layer package,
|
||||
// both at virtual path "/", no model.file companions on the resolver
|
||||
// AND no image ref. With nothing to derive a name from, the group is
|
||||
// dropped — no opaque fallback.
|
||||
// AND no image ref. With nothing to derive a name from, the package is
|
||||
// dropped.
|
||||
configMd := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: 5,
|
||||
@ -209,9 +258,7 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
|
||||
|
||||
t.Run("OCI: image-ref last segment names the group when config.json is absent", func(t *testing.T) {
|
||||
// vllm-style artifact: a repacked model whose embedded config.json has
|
||||
// been stripped of _name_or_path. The merge processor falls through to
|
||||
// the second rung — the image-reference last segment — so we still emit
|
||||
// a recognizable model name instead of dropping it.
|
||||
// been stripped of _name_or_path.
|
||||
configMd := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: 290,
|
||||
@ -235,75 +282,6 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
|
||||
assert.Equal(t, "smollm2-vllm", out[0].Name, "rung 2: image-ref repository basename")
|
||||
})
|
||||
|
||||
t.Run("OCI: config.json _name_or_path beats the image-ref fallback", func(t *testing.T) {
|
||||
// When the embedded config.json carries _name_or_path, rung 1 wins over
|
||||
// the image ref even if both are present.
|
||||
dir := t.TempDir()
|
||||
hfConfigPath := filepath.Join(dir, "config.json")
|
||||
require.NoError(t, os.WriteFile(hfConfigPath,
|
||||
[]byte(`{"_name_or_path":"org/preferred-name"}`), 0o644))
|
||||
resolver := file.NewMockResolverForOCIArtifact(
|
||||
"docker.io/ai/smollm2-vllm:360M",
|
||||
map[string][]file.Location{
|
||||
dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)},
|
||||
},
|
||||
)
|
||||
configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1}
|
||||
out, _, err := safeTensorsMergeProcessor(
|
||||
context.Background(), resolver,
|
||||
[]pkg.Package{ociPkg(configMd)}, nil, nil,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, out, 1)
|
||||
assert.Equal(t, "preferred-name", out[0].Name, "rung 1 (config.json) wins over rung 2 (image ref)")
|
||||
})
|
||||
|
||||
t.Run("OCI: merges config + shard and names from companion config.json", func(t *testing.T) {
|
||||
// Write a single model.file companion blob containing HF config.json so
|
||||
// the processor can derive _name_or_path and Architecture from it.
|
||||
dir := t.TempDir()
|
||||
hfConfigPath := filepath.Join(dir, "config.json")
|
||||
require.NoError(t, os.WriteFile(hfConfigPath,
|
||||
[]byte(`{"architectures":["Qwen3ForCausalLM"],"torch_dtype":"bfloat16","_name_or_path":"org/qwen-tiny"}`), 0o644))
|
||||
resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{
|
||||
dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)},
|
||||
})
|
||||
|
||||
configMd := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Quantization: "Q4_K_M", // raw producer-declared value
|
||||
Parameters: "8B",
|
||||
TotalSize: "16.00GB",
|
||||
TensorCount: 291,
|
||||
}
|
||||
shardMd := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: 100, // per-shard count — must NOT be summed onto the aggregate's 291
|
||||
Quantization: "BF16",
|
||||
MetadataHash: "deadbeef",
|
||||
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
|
||||
}
|
||||
out, _, err := safeTensorsMergeProcessor(
|
||||
context.Background(), resolver,
|
||||
[]pkg.Package{ociPkg(configMd), ociPkg(shardMd)}, nil, nil,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, out, 1)
|
||||
|
||||
got := out[0]
|
||||
assert.Equal(t, "qwen-tiny", got.Name, "name comes from path.Base(_name_or_path)")
|
||||
md := got.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
assert.Equal(t, uint64(291), md.TensorCount, "aggregate TensorCount must win — never double-count by summing the shard")
|
||||
assert.Equal(t, "16.00GB", md.TotalSize)
|
||||
assert.Equal(t, "8B", md.Parameters)
|
||||
assert.Equal(t, "Qwen3ForCausalLM", md.Architecture, "Architecture enriched from companion config.json")
|
||||
assert.Equal(t, "bfloat16", md.TorchDtype)
|
||||
assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate Quantization wins over shard's normalized dtype when both present")
|
||||
assert.Equal(t, "deadbeef", md.MetadataHash, "single-shard rollup is the lone shard's hash")
|
||||
assert.Equal(t, pkg.KeyValues{{Key: "format", Value: "pt"}}, md.UserMetadata)
|
||||
assert.Nil(t, md.Parts, "single-shard groups skip Parts; the outer view already exposes everything")
|
||||
})
|
||||
|
||||
t.Run("OCI: multi-shard rollup hashes are stable and sorted", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
hfConfigPath := filepath.Join(dir, "config.json")
|
||||
@ -389,20 +367,21 @@ spdx-id: Apache-2.0
|
||||
//
|
||||
// Precedence (highest → lowest):
|
||||
// 1. config.json _name_or_path (path.Base applied; both dir-scan and OCI)
|
||||
// 2. OCI image-ref last segment (OCI only — empty string for dir scans)
|
||||
// 2. fallback name — OCI image-ref last segment, or dir-scan parent directory
|
||||
// base name (the merge processor computes the right one per group)
|
||||
// → drop (empty name) when nothing matches
|
||||
func TestSafeTensorsNamingPrecedence(t *testing.T) {
|
||||
cases := []struct {
|
||||
tests := []struct {
|
||||
name string
|
||||
nameOrPath string
|
||||
imageRefName string
|
||||
fallbackName string
|
||||
want string
|
||||
}{
|
||||
// rung 1
|
||||
{
|
||||
name: "rung 1: _name_or_path beats the image-ref fallback",
|
||||
name: "rung 1: _name_or_path beats the fallback",
|
||||
nameOrPath: "org/MyModel",
|
||||
imageRefName: "fallback-ref",
|
||||
fallbackName: "fallback-name",
|
||||
want: "MyModel",
|
||||
},
|
||||
{
|
||||
@ -418,10 +397,15 @@ func TestSafeTensorsNamingPrecedence(t *testing.T) {
|
||||
|
||||
// rung 2
|
||||
{
|
||||
name: "rung 2: image-ref last segment used when _name_or_path is empty",
|
||||
imageRefName: "smollm2-vllm",
|
||||
name: "rung 2: OCI image-ref last segment used when _name_or_path is empty",
|
||||
fallbackName: "smollm2-vllm",
|
||||
want: "smollm2-vllm",
|
||||
},
|
||||
{
|
||||
name: "rung 2: dir-scan parent directory name used when _name_or_path is empty",
|
||||
fallbackName: "tiny-llama",
|
||||
want: "tiny-llama",
|
||||
},
|
||||
|
||||
// drops
|
||||
{
|
||||
@ -430,10 +414,31 @@ func TestSafeTensorsNamingPrecedence(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := pickSafeTensorsName(tc.nameOrPath, tc.imageRefName)
|
||||
assert.Equal(t, tc.want, got)
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := pickSafeTensorsName(tt.nameOrPath, tt.fallbackName)
|
||||
assert.Equal(t, tt.want, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestSafeTensorsDirName covers the directory-scan fallback name derivation,
|
||||
// including the degenerate roots that must yield no name.
|
||||
func TestSafeTensorsDirName(t *testing.T) {
|
||||
tests := []struct {
|
||||
groupKey string
|
||||
want string
|
||||
}{
|
||||
{groupKey: "/models/tiny-llama", want: "tiny-llama"},
|
||||
{groupKey: "/namea", want: "namea"},
|
||||
{groupKey: "/namea/nameb", want: "nameb"},
|
||||
{groupKey: "/", want: ""},
|
||||
{groupKey: ".", want: ""},
|
||||
{groupKey: "", want: ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.groupKey, func(t *testing.T) {
|
||||
assert.Equal(t, tt.want, safeTensorsDirName(tt.groupKey))
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -449,19 +454,26 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
|
||||
wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash()
|
||||
|
||||
t.Run("emits a nameless package with header-derived metadata", func(t *testing.T) {
|
||||
reader := file.NewLocationReadCloser(file.NewLocation("/"), io.NopCloser(bytes.NewReader(blob)))
|
||||
pkgs, _, err := parseSafeTensorsOCILayer(context.Background(), nil, nil, reader)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, pkgs, 1)
|
||||
|
||||
p := pkgs[0]
|
||||
assert.Empty(t, p.Name, "weight-layer parser must emit nameless; the merge processor names it")
|
||||
md := p.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
assert.Equal(t, "safetensors", md.Format)
|
||||
assert.Equal(t, uint64(2), md.TensorCount)
|
||||
assert.Equal(t, "BF16", md.Quantization)
|
||||
assert.Equal(t, wantUserMetadata, md.UserMetadata)
|
||||
assert.Equal(t, wantHash, md.MetadataHash)
|
||||
// nameless: the merge processor assigns the name. Parameters is the
|
||||
// summed element count of the two tensors (1024*16 + 16*16 = 16640).
|
||||
expected := []pkg.Package{
|
||||
{
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Parameters: "16.64K",
|
||||
Quantization: "BF16",
|
||||
TensorCount: 2,
|
||||
UserMetadata: wantUserMetadata,
|
||||
MetadataHash: wantHash,
|
||||
},
|
||||
},
|
||||
}
|
||||
pkgtest.NewCatalogTester().
|
||||
FromString("/", string(blob)).
|
||||
Expects(expected, nil).
|
||||
IgnorePackageFields("FoundBy", "Locations").
|
||||
TestParser(t, parseSafeTensorsOCILayer)
|
||||
})
|
||||
|
||||
t.Run("merged via processor: aggregate fields preserved, hash lifted from single shard", func(t *testing.T) {
|
||||
@ -535,27 +547,30 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
|
||||
// Locking in the field values guards against changes to the header parser
|
||||
// silently breaking on real-world content shape.
|
||||
func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) {
|
||||
data, err := os.ReadFile(filepath.Join("testdata", "safetensors", "nomic-embed-475M.header.safetensors"))
|
||||
require.NoError(t, err)
|
||||
require.Greater(t, len(data), 8, "fixture must include the 8-byte length prefix")
|
||||
// nameless before the merge processor runs. The fixture is immutable on disk;
|
||||
// the locked field values (notably MetadataHash) guard against the header
|
||||
// parser silently breaking on real-world content shape — if MetadataHash
|
||||
// changes, either the hash algorithm or the canonicalization changed, both of
|
||||
// which callers may rely on for cross-source identity.
|
||||
expected := []pkg.Package{
|
||||
{
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Parameters: "475.29M",
|
||||
Quantization: "F32", // every tensor in the captured shard is F32
|
||||
TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
|
||||
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
|
||||
MetadataHash: "051a14e686673dea",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
reader := file.NewLocationReadCloser(file.NewLocation("/"), io.NopCloser(bytes.NewReader(data)))
|
||||
pkgs, _, err := parseSafeTensorsOCILayer(context.Background(), nil, nil, reader)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, pkgs, 1)
|
||||
assert.Empty(t, pkgs[0].Name, "weight-layer packages are nameless before the merge processor runs")
|
||||
|
||||
md := pkgs[0].Metadata.(pkg.SafeTensorsModelInfo)
|
||||
assert.Equal(t, "safetensors", md.Format)
|
||||
assert.Equal(t, uint64(148), md.TensorCount, "nomic-embed-v2-moe 475M ships 148 tensor entries in this shard")
|
||||
assert.Equal(t, "F32", md.Quantization, "every tensor in the captured shard is F32")
|
||||
assert.Equal(t, "475.29M", md.Parameters)
|
||||
assert.Equal(t, pkg.KeyValues{{Key: "format", Value: "pt"}}, md.UserMetadata)
|
||||
// MetadataHash is locked to the exact value the parser produces for this
|
||||
// captured input. The fixture is immutable on disk; if this value changes
|
||||
// either the hash algorithm or the canonicalization changed, both of which
|
||||
// callers may rely on for cross-source identity.
|
||||
assert.Equal(t, "051a14e686673dea", md.MetadataHash)
|
||||
pkgtest.NewCatalogTester().
|
||||
FromFile(t, filepath.Join("testdata", "safetensors", "nomic-embed-475M.header.safetensors")).
|
||||
Expects(expected, nil).
|
||||
IgnorePackageFields("FoundBy", "Locations").
|
||||
TestParser(t, parseSafeTensorsOCILayer)
|
||||
}
|
||||
|
||||
func TestSafeTensorsCrossSourceHashParity(t *testing.T) {
|
||||
@ -595,10 +610,6 @@ func TestSafeTensorsCrossSourceHashParity(t *testing.T) {
|
||||
assert.Equal(t, dirHash, ociHash, "same content via dir scan and OCI weight-layer scan must hash equal")
|
||||
}
|
||||
|
||||
func configReader(blob []byte) file.LocationReadCloser {
|
||||
return file.NewLocationReadCloser(file.NewLocation("/config.json"), io.NopCloser(bytes.NewReader(blob)))
|
||||
}
|
||||
|
||||
func assertHasLicense(t *testing.T, p pkg.Package, value string) {
|
||||
t.Helper()
|
||||
for _, l := range p.Licenses.ToSlice() {
|
||||
@ -610,28 +621,50 @@ func assertHasLicense(t *testing.T, p pkg.Package, value string) {
|
||||
}
|
||||
|
||||
func TestReadSafeTensorsHeader(t *testing.T) {
|
||||
t.Run("valid header", func(t *testing.T) {
|
||||
data := buildSafeTensorsFile(t, map[string]string{"format": "pt"}, map[string]safeTensorsEntry{
|
||||
"w": {DType: "F32", Shape: []int64{2, 2}, DataOffsets: []int64{0, 16}},
|
||||
zeroLength := make([]byte, 8) // length prefix of 0
|
||||
|
||||
truncatedBody := make([]byte, 8)
|
||||
binary.LittleEndian.PutUint64(truncatedBody, 100) // claims 100 bytes but supplies none
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
data []byte
|
||||
wantErr bool
|
||||
assert func(t *testing.T, h *safeTensorsHeader)
|
||||
}{
|
||||
{
|
||||
name: "valid header",
|
||||
data: buildSafeTensorsFile(t, map[string]string{"format": "pt"}, map[string]safeTensorsEntry{
|
||||
"w": {DType: "F32", Shape: []int64{2, 2}, DataOffsets: []int64{0, 16}},
|
||||
}),
|
||||
assert: func(t *testing.T, h *safeTensorsHeader) {
|
||||
assert.Len(t, h.tensors, 1)
|
||||
assert.Equal(t, "pt", h.metadata["format"])
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "zero-length header",
|
||||
data: zeroLength,
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "truncated body",
|
||||
data: truncatedBody,
|
||||
wantErr: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
h, err := readSafeTensorsHeader(bytes.NewReader(tt.data))
|
||||
if tt.wantErr {
|
||||
require.Error(t, err)
|
||||
return
|
||||
}
|
||||
require.NoError(t, err)
|
||||
tt.assert(t, h)
|
||||
})
|
||||
h, err := readSafeTensorsHeader(bytes.NewReader(data))
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, h.tensors, 1)
|
||||
assert.Equal(t, "pt", h.metadata["format"])
|
||||
})
|
||||
|
||||
t.Run("zero-length header", func(t *testing.T) {
|
||||
var buf [8]byte // length prefix of 0
|
||||
_, err := readSafeTensorsHeader(bytes.NewReader(buf[:]))
|
||||
require.Error(t, err)
|
||||
})
|
||||
|
||||
t.Run("truncated body", func(t *testing.T) {
|
||||
var buf [8]byte
|
||||
binary.LittleEndian.PutUint64(buf[:], 100) // claims 100 bytes but supplies none
|
||||
_, err := readSafeTensorsHeader(bytes.NewReader(buf[:]))
|
||||
require.Error(t, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSafeTensorsHeader_metadataHash(t *testing.T) {
|
||||
@ -680,71 +713,112 @@ func TestSafeTensorsHeader_parameterCountAndDType(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestNormalizeDType(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"BF16": "BF16",
|
||||
"float16": "F16",
|
||||
"FP32": "F32",
|
||||
"int8": "I8",
|
||||
"U8": "U8",
|
||||
"bool": "BOOL",
|
||||
"weird": "WEIRD",
|
||||
tests := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{name: "already canonical BF16", in: "BF16", want: "BF16"},
|
||||
{name: "float16 alias", in: "float16", want: "F16"},
|
||||
{name: "FP32 alias", in: "FP32", want: "F32"},
|
||||
{name: "int8 alias", in: "int8", want: "I8"},
|
||||
{name: "U8 passthrough", in: "U8", want: "U8"},
|
||||
{name: "bool", in: "bool", want: "BOOL"},
|
||||
{name: "unknown value uppercased", in: "weird", want: "WEIRD"},
|
||||
}
|
||||
for in, want := range cases {
|
||||
assert.Equalf(t, want, normalizeDType(in), "normalizeDType(%q)", in)
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.want, normalizeDType(tt.in))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatParameterCount(t *testing.T) {
|
||||
cases := map[uint64]string{
|
||||
512: "512",
|
||||
16256: "16.26K",
|
||||
2_680_000_000: "2.68B",
|
||||
35_000_000: "35.00M",
|
||||
tests := []struct {
|
||||
name string
|
||||
in uint64
|
||||
want string
|
||||
}{
|
||||
{name: "raw count under 1K", in: 512, want: "512"},
|
||||
{name: "thousands", in: 16256, want: "16.26K"},
|
||||
{name: "billions", in: 2_680_000_000, want: "2.68B"},
|
||||
{name: "millions", in: 35_000_000, want: "35.00M"},
|
||||
}
|
||||
for in, want := range cases {
|
||||
assert.Equalf(t, want, formatParameterCount(in), "formatParameterCount(%d)", in)
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.want, formatParameterCount(tt.in))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFrontmatter(t *testing.T) {
|
||||
t.Run("list base_model", func(t *testing.T) {
|
||||
fm := parseFrontmatter([]byte("---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody"))
|
||||
require.NotNil(t, fm)
|
||||
assert.Equal(t, "mit", fm.License)
|
||||
assert.Equal(t, []string{"org/Model"}, fm.BaseModel)
|
||||
})
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
wantNil bool
|
||||
wantLicense string
|
||||
wantBaseModel []string
|
||||
}{
|
||||
{
|
||||
name: "list base_model",
|
||||
input: "---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody",
|
||||
wantLicense: "mit",
|
||||
wantBaseModel: []string{"org/Model"},
|
||||
},
|
||||
{
|
||||
name: "scalar base_model",
|
||||
input: "---\nlicense: apache-2.0\nbase_model: org/Model\n---\n",
|
||||
wantLicense: "apache-2.0",
|
||||
wantBaseModel: []string{"org/Model"},
|
||||
},
|
||||
{
|
||||
name: "leading BOM",
|
||||
input: "\xef\xbb\xbf---\nlicense: mit\n---\n",
|
||||
wantLicense: "mit",
|
||||
},
|
||||
{
|
||||
name: "no frontmatter",
|
||||
input: "# just a heading\n",
|
||||
wantNil: true,
|
||||
},
|
||||
{
|
||||
name: "unterminated frontmatter",
|
||||
input: "---\nlicense: mit\n",
|
||||
wantNil: true,
|
||||
},
|
||||
}
|
||||
|
||||
t.Run("scalar base_model", func(t *testing.T) {
|
||||
fm := parseFrontmatter([]byte("---\nlicense: apache-2.0\nbase_model: org/Model\n---\n"))
|
||||
require.NotNil(t, fm)
|
||||
assert.Equal(t, "apache-2.0", fm.License)
|
||||
assert.Equal(t, []string{"org/Model"}, fm.BaseModel)
|
||||
})
|
||||
|
||||
t.Run("leading BOM", func(t *testing.T) {
|
||||
fm := parseFrontmatter([]byte("\xef\xbb\xbf---\nlicense: mit\n---\n"))
|
||||
require.NotNil(t, fm)
|
||||
assert.Equal(t, "mit", fm.License)
|
||||
})
|
||||
|
||||
t.Run("no frontmatter", func(t *testing.T) {
|
||||
assert.Nil(t, parseFrontmatter([]byte("# just a heading\n")))
|
||||
})
|
||||
|
||||
t.Run("unterminated frontmatter", func(t *testing.T) {
|
||||
assert.Nil(t, parseFrontmatter([]byte("---\nlicense: mit\n")))
|
||||
})
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
fm := parseFrontmatter([]byte(tt.input))
|
||||
if tt.wantNil {
|
||||
assert.Nil(t, fm)
|
||||
return
|
||||
}
|
||||
require.NotNil(t, fm)
|
||||
assert.Equal(t, tt.wantLicense, fm.License)
|
||||
if tt.wantBaseModel != nil {
|
||||
assert.Equal(t, tt.wantBaseModel, fm.BaseModel)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestParseLicenseFrontmatter covers the choosealicense.com-style YAML
|
||||
// frontmatter Docker Model Runner uses for its license layers. Only spdx-id
|
||||
// is consumed; everything else in the block is ignored.
|
||||
func TestParseLicenseFrontmatter(t *testing.T) {
|
||||
t.Run("Apache-2.0 (the canonical choosealicense.com shape)", func(t *testing.T) {
|
||||
// This is the exact frontmatter shape from
|
||||
// https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt
|
||||
// Docker AI license layers ship a near-identical block.
|
||||
buf := []byte(`---
|
||||
// The Apache-2.0 case is the exact frontmatter shape from
|
||||
// https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt
|
||||
// Docker AI license layers ship a near-identical block.
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "Apache-2.0 (the canonical choosealicense.com shape)",
|
||||
input: `---
|
||||
title: Apache License 2.0
|
||||
spdx-id: Apache-2.0
|
||||
redirect_from: /licenses/apache/
|
||||
@ -780,29 +854,36 @@ limitations:
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
`)
|
||||
assert.Equal(t, "Apache-2.0", parseLicenseFrontmatter(buf))
|
||||
})
|
||||
`,
|
||||
want: "Apache-2.0",
|
||||
},
|
||||
{
|
||||
name: "MIT with BOM prefix",
|
||||
input: "\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n",
|
||||
want: "MIT",
|
||||
},
|
||||
{
|
||||
name: "frontmatter without spdx-id falls through (returns empty)",
|
||||
input: "---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n",
|
||||
want: "",
|
||||
},
|
||||
{
|
||||
name: "plain license text without any frontmatter",
|
||||
input: " Apache License\n Version 2.0, January 2004\n",
|
||||
want: "",
|
||||
},
|
||||
{
|
||||
name: "unterminated frontmatter block",
|
||||
input: "---\nspdx-id: MIT\n(never closes)\n",
|
||||
want: "",
|
||||
},
|
||||
}
|
||||
|
||||
t.Run("MIT with BOM prefix", func(t *testing.T) {
|
||||
buf := []byte("\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n")
|
||||
assert.Equal(t, "MIT", parseLicenseFrontmatter(buf))
|
||||
})
|
||||
|
||||
t.Run("frontmatter without spdx-id falls through (returns empty)", func(t *testing.T) {
|
||||
buf := []byte("---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n")
|
||||
assert.Empty(t, parseLicenseFrontmatter(buf))
|
||||
})
|
||||
|
||||
t.Run("plain license text without any frontmatter", func(t *testing.T) {
|
||||
buf := []byte(" Apache License\n Version 2.0, January 2004\n")
|
||||
assert.Empty(t, parseLicenseFrontmatter(buf))
|
||||
})
|
||||
|
||||
t.Run("unterminated frontmatter block", func(t *testing.T) {
|
||||
buf := []byte("---\nspdx-id: MIT\n(never closes)\n")
|
||||
assert.Empty(t, parseLicenseFrontmatter(buf))
|
||||
})
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.want, parseLicenseFrontmatter([]byte(tt.input)))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerAIModelConfigMediaTypes(t *testing.T) {
|
||||
@ -816,14 +897,27 @@ func TestDockerAIModelConfigMediaTypes(t *testing.T) {
|
||||
}
|
||||
return false
|
||||
}
|
||||
// the known, verified schema versions are consumed
|
||||
assert.True(t, supported("application/vnd.docker.ai.model.config.v0.1+json"))
|
||||
assert.True(t, supported("application/vnd.docker.ai.model.config.v0.2+json"))
|
||||
// unknown/future schema versions are intentionally NOT consumed, to avoid
|
||||
// silently ingesting a potentially breaking config change
|
||||
assert.False(t, supported("application/vnd.docker.ai.model.config.v0.3+json"))
|
||||
assert.False(t, supported("application/vnd.docker.ai.model.config.v9.9+json"))
|
||||
// sibling layer media types are not matched either
|
||||
assert.False(t, supported("application/vnd.docker.ai.model.file"))
|
||||
assert.False(t, supported("application/vnd.docker.ai.gguf.v3"))
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
mediaType string
|
||||
want bool
|
||||
}{
|
||||
// the known, verified schema versions are consumed
|
||||
{name: "known schema v0.1 is consumed", mediaType: "application/vnd.docker.ai.model.config.v0.1+json", want: true},
|
||||
{name: "known schema v0.2 is consumed", mediaType: "application/vnd.docker.ai.model.config.v0.2+json", want: true},
|
||||
// unknown/future schema versions are intentionally NOT consumed, to avoid
|
||||
// silently ingesting a potentially breaking config change
|
||||
{name: "unknown schema v0.3 is rejected", mediaType: "application/vnd.docker.ai.model.config.v0.3+json", want: false},
|
||||
{name: "far-future schema v9.9 is rejected", mediaType: "application/vnd.docker.ai.model.config.v9.9+json", want: false},
|
||||
// sibling layer media types are not matched either
|
||||
{name: "sibling model.file layer is not matched", mediaType: "application/vnd.docker.ai.model.file", want: false},
|
||||
{name: "sibling gguf layer is not matched", mediaType: "application/vnd.docker.ai.gguf.v3", want: false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.want, supported(tt.mediaType))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -127,10 +127,10 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
|
||||
out := other
|
||||
for _, key := range keys {
|
||||
merged := mergeSafeTensorsGroup(groups[key])
|
||||
nameOrPath, imageRefName := enrichSafeTensorsGroup(ctx, resolver, key, &merged)
|
||||
name := pickSafeTensorsName(nameOrPath, imageRefName)
|
||||
nameOrPath, fallbackName := enrichSafeTensorsGroup(ctx, resolver, key, &merged)
|
||||
name := pickSafeTensorsName(nameOrPath, fallbackName)
|
||||
if name == "" {
|
||||
continue // drop unnameable groups, per design (no opaque fallback)
|
||||
continue // drop groups with no name source and no usable fallback
|
||||
}
|
||||
merged.Name = name
|
||||
merged.SetID()
|
||||
@ -142,8 +142,6 @@ func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs
|
||||
// groupSafeTensorsPackages buckets packages by the parent directory of their
|
||||
// primary-evidence location, or the OCI sentinel when the location lives at
|
||||
// the ContainerImageModel resolver's virtual "/" path.
|
||||
// TODO: assemble a test where there are cases for DIR ran into for a single scan
|
||||
// - safe tensors at the top level as well as sub directories
|
||||
func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
|
||||
out := make(map[string][]pkg.Package)
|
||||
for _, p := range pkgs {
|
||||
@ -253,19 +251,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
|
||||
|
||||
// mergeShardsInto folds the per-shard header metadata into merged, returning
|
||||
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
||||
// the rollup. Architecture / TorchDtype / TransformersVersion are accepted as
|
||||
// fallbacks if a shard ever carries them (the current parsers don't, but the
|
||||
// resolver-backed enrichment runs afterwards and won't overwrite anything
|
||||
// already set, so it's safe to populate them earlier).
|
||||
// the rollup. Shards carry only the content-derived fields (Quantization,
|
||||
// Parameters, UserMetadata); producer-declared fields like Architecture come
|
||||
// from the resolver-backed enrichment that runs afterwards.
|
||||
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
||||
seenKV := map[string]bool{}
|
||||
for _, s := range shards {
|
||||
shardTensorTotal += s.TensorCount
|
||||
firstNonEmpty(&merged.Quantization, s.Quantization)
|
||||
firstNonEmpty(&merged.Parameters, s.Parameters)
|
||||
firstNonEmpty(&merged.Architecture, s.Architecture)
|
||||
firstNonEmpty(&merged.TorchDtype, s.TorchDtype)
|
||||
firstNonEmpty(&merged.TransformersVersion, s.TransformersVersion)
|
||||
for _, kv := range s.UserMetadata {
|
||||
if seenKV[kv.Key] {
|
||||
continue
|
||||
@ -336,10 +330,11 @@ func rollupHash(hashes []string) string {
|
||||
// merged metadata's Architecture / TorchDtype / TransformersVersion, set the
|
||||
// licenses on the merged package, and attach the location of every consulted
|
||||
// supporting file as SupportingEvidence. Returns two name candidates for the
|
||||
// merge processor: nameOrPath (raw _name_or_path from a config.json) and
|
||||
// imageRefName (the last path segment of the OCI image reference, empty for
|
||||
// dir-scan groups).
|
||||
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath, imageRefName string) {
|
||||
// merge processor: nameOrPath (raw _name_or_path from a config.json) and a
|
||||
// fallbackName used when no _name_or_path is available — the last path segment
|
||||
// of the OCI image reference for OCI groups, or the parent directory's base
|
||||
// name for directory-scan groups.
|
||||
func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath, fallbackName string) {
|
||||
md := merged.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
|
||||
var (
|
||||
@ -347,9 +342,10 @@ func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKe
|
||||
supporting []file.Location
|
||||
)
|
||||
if groupKey == ociGroupKey {
|
||||
nameOrPath, imageRefName, lics, supporting = enrichSafeTensorsOCI(ctx, resolver, &md)
|
||||
nameOrPath, fallbackName, lics, supporting = enrichSafeTensorsOCI(ctx, resolver, &md)
|
||||
} else {
|
||||
nameOrPath, lics, supporting = enrichSafeTensorsDir(ctx, resolver, groupKey, &md)
|
||||
fallbackName = safeTensorsDirName(groupKey)
|
||||
}
|
||||
|
||||
merged.Metadata = md
|
||||
@ -359,13 +355,28 @@ func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKe
|
||||
for _, loc := range supporting {
|
||||
merged.Locations.Add(loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation))
|
||||
}
|
||||
return nameOrPath, imageRefName
|
||||
return nameOrPath, fallbackName
|
||||
}
|
||||
|
||||
// enrichSafeTensorsDir handles the directory-scan case: look for sibling
|
||||
// config.json and README.md next to the model files.
|
||||
// safeTensorsDirName derives the naming fallback for a directory-scan group
// from its group key, which callers pass as the directory holding the model
// files. The result is the final path element of that directory, so
// "/models/tiny-llama" yields "tiny-llama".
//
// When the final element is only a filesystem artifact ("/", "." or "") the
// function returns "" instead, which lets the caller treat the group as
// having no fallback name.
func safeTensorsDirName(groupKey string) string {
	// path.Base collapses trailing slashes and maps an empty path to ".",
	// so every degenerate root ends up as one of the rejected values below.
	if name := path.Base(groupKey); name != "/" && name != "." && name != "" {
		return name
	}
	return ""
}
|
||||
|
||||
// enrichSafeTensorsDir handles the directory-scan case: look for a config.json
|
||||
// beside the model files (walking up parent directories to the scanned source
|
||||
// root if no sibling exists) and a sibling README.md.
|
||||
func enrichSafeTensorsDir(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) {
|
||||
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
||||
if loc, cfg := findDirHFConfig(resolver, dir); cfg != nil {
|
||||
applyHFConfig(md, cfg)
|
||||
nameOrPath = cfg.NameOrPath
|
||||
supporting = append(supporting, *loc)
|
||||
@ -518,9 +529,9 @@ func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pk
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
trimmed := trimLeadingWhitespace(buf)
|
||||
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
||||
switch {
|
||||
case hasPrefix(trimmed, "---"):
|
||||
case bytes.HasPrefix(trimmed, []byte("---")):
|
||||
fm := parseFrontmatter(buf)
|
||||
if fm == nil {
|
||||
return false
|
||||
@ -532,7 +543,7 @@ func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pk
|
||||
*readmeName = fm.BaseModel[0]
|
||||
}
|
||||
return true
|
||||
case hasPrefix(trimmed, "{"):
|
||||
case bytes.HasPrefix(trimmed, []byte("{")):
|
||||
var cfg hfConfig
|
||||
if err := json.Unmarshal(buf, &cfg); err != nil {
|
||||
return false
|
||||
@ -566,26 +577,29 @@ func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
|
||||
//
|
||||
// 1. config.json _name_or_path (path.Base, so "org/Model" → "Model";
|
||||
// applies to both dir-scan and OCI groups)
|
||||
// 2. OCI image-ref last segment (OCI-only; the user-supplied artifact
|
||||
// reference's repository basename, e.g.
|
||||
// "docker.io/ai/smollm2-vllm:360M" → "smollm2-vllm")
|
||||
// 2. fallback name — the group's source-specific positional identifier:
|
||||
// the OCI image-ref repository basename for OCI groups (e.g.
|
||||
// "docker.io/ai/smollm2-vllm:360M" → "smollm2-vllm"), or the parent
|
||||
// directory base name for directory-scan groups (e.g.
|
||||
// "/models/tiny-llama/*.safetensors" → "tiny-llama")
|
||||
//
|
||||
// Returns "" to signal the merge processor should drop the group. There is
// intentionally no Architecture-Parameters synthetic and no opaque hash label:
// when neither a producer-declared name nor a positional fallback is available
// the model is recorded as absent rather than under a label the SBOM consumer
// would not recognize.
//
// A nameOrPath whose final path element is only a filesystem artifact (".",
// ".." or "/", as produced by values such as "./" or "/") carries no model
// name. It is treated the same as an empty nameOrPath, so fallbackName is
// returned instead of a meaningless label.
func pickSafeTensorsName(nameOrPath, fallbackName string) string {
	if nameOrPath != "" {
		// path.Base never returns "", but it does return "." / ".." / "/"
		// for paths that consist solely of those elements.
		switch base := path.Base(nameOrPath); base {
		case ".", "..", "/":
			// no usable name in the declared path; fall through to the fallback
		default:
			return base
		}
	}
	return fallbackName
}
|
||||
|
||||
// --- Relocated enrichment helpers ----------------------------------------
|
||||
// --- Enrichment helpers ---------------------------------------------------
|
||||
//
|
||||
// These types and functions used to live in the parser files; they moved here
|
||||
// when the parsers shrank to "just decode the safetensors-specific format" and
|
||||
// every resolver-backed read centralized in the merge processor.
|
||||
// The parsers decode only the safetensors-specific format; every resolver-backed
|
||||
// read (config.json, README, license layers) is centralized here in the merge
|
||||
// processor, along with the types those reads decode into.
|
||||
|
||||
// hfConfig is a minimal projection of Hugging Face config.json fields.
|
||||
type hfConfig struct {
|
||||
@ -601,6 +615,27 @@ type readmeFrontmatter struct {
|
||||
BaseModel []string `yaml:"base_model"`
|
||||
}
|
||||
|
||||
// findDirHFConfig looks for a config.json beside the model files, walking up
|
||||
// parent directories until it reaches the scanned source root. The walk needs
|
||||
// no explicit depth bound: the resolver only resolves paths within the scanned
|
||||
// source, so an ancestor above the scan root simply yields no config, and
|
||||
// path.Dir converges on a fixed point ("/" or ".") that terminates the loop.
|
||||
// The first config.json found wins, so the closest one — a sibling, then the
|
||||
// nearest ancestor — supplies both the producer-declared name and the HF fields
|
||||
// applied to the model.
|
||||
func findDirHFConfig(resolver file.Resolver, dir string) (*file.Location, *hfConfig) {
|
||||
for {
|
||||
if loc, cfg := readDirHFConfig(resolver, path.Join(dir, "config.json")); cfg != nil {
|
||||
return loc, cfg
|
||||
}
|
||||
parent := path.Dir(dir)
|
||||
if parent == dir {
|
||||
return nil, nil // reached the source root
|
||||
}
|
||||
dir = parent
|
||||
}
|
||||
}
|
||||
|
||||
func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
|
||||
locations, err := resolver.FilesByPath(p)
|
||||
if err != nil || len(locations) == 0 {
|
||||
@ -715,18 +750,3 @@ func parseLicenseFrontmatter(buf []byte) string {
|
||||
}
|
||||
return fm.SPDXID
|
||||
}
|
||||
|
||||
// hasPrefix reports whether the byte slice b begins with the bytes of s.
// An empty s is a prefix of every slice, including a nil one.
func hasPrefix(b []byte, s string) bool {
	if len(b) < len(s) {
		return false
	}
	head := b[:len(s)]
	return string(head) == s
}
|
||||
|
||||
// trimLeadingWhitespace returns b with its leading spaces, tabs, carriage
// returns and newlines removed, and then with at most one UTF-8 byte order
// mark (EF BB BF) removed if one immediately follows that whitespace.
//
// The order matters: whitespace that comes after the byte order mark is left
// in place. The result shares b's backing array; nothing is copied.
func trimLeadingWhitespace(b []byte) []byte {
	rest := b
	for len(rest) > 0 {
		c := rest[0]
		if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
			break
		}
		rest = rest[1:]
	}
	if len(rest) >= 3 && rest[0] == 0xEF && rest[1] == 0xBB && rest[2] == 0xBF {
		rest = rest[3:]
	}
	return rest
}
|
||||
|
||||
@ -52,6 +52,11 @@ func (b *testGGUFBuilder) withVersion(v uint32) *testGGUFBuilder {
|
||||
return b
|
||||
}
|
||||
|
||||
func (b *testGGUFBuilder) withTensorCount(count uint64) *testGGUFBuilder {
|
||||
b.tensorCount = count
|
||||
return b
|
||||
}
|
||||
|
||||
func (b *testGGUFBuilder) withStringKV(key, value string) *testGGUFBuilder {
|
||||
b.kvPairs = append(b.kvPairs, testKVPair{key: key, valueType: ggufTypeString, value: value})
|
||||
return b
|
||||
@ -114,3 +119,10 @@ func (b *testGGUFBuilder) build() []byte {
|
||||
|
||||
return b.buf.Bytes()
|
||||
}
|
||||
|
||||
// buildInvalidMagic creates a file with invalid magic number
|
||||
func (b *testGGUFBuilder) buildInvalidMagic() []byte {
|
||||
buf := new(bytes.Buffer)
|
||||
binary.Write(buf, binary.LittleEndian, uint32(0x12345678))
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
@ -3,13 +3,14 @@ package pkg
|
||||
// SafeTensorsModelInfo holds the model details extracted from SafeTensors content.
|
||||
// SafeTensors is a simple, safe serialization format for storing tensors, used
|
||||
// as the default weight format for Hugging Face transformer models. Syft may
|
||||
// populate this struct from three sources:
|
||||
// populate this struct from these sources:
|
||||
// - a single .safetensors file (header-only parse)
|
||||
// - a sharded model described by model.safetensors.index.json
|
||||
// - a Docker AI OCI model artifact config blob (vnd.docker.ai.model.config.v0.1+json)
|
||||
// - the per-shard headers of a multi-shard model, merged into one package
|
||||
// - a Docker AI OCI model artifact: the config blob
|
||||
// (vnd.docker.ai.model.config.v0.1+json) plus each weight layer's header
|
||||
//
|
||||
// The Model Name, License, and Version fields have all been lifted up to be on
|
||||
// the syft Package.
|
||||
// Model name, license, and version live on the enclosing syft Package rather
|
||||
// than in this struct.
|
||||
type SafeTensorsModelInfo struct {
|
||||
// Format is the source format label (always "safetensors" for this metadata type).
|
||||
// Present because the Docker AI model config blob carries an explicit format field
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user