From d12cf9a3e2d8008f919c42a430f65c6d1961e0a0 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Thu, 28 May 2026 13:00:47 -0400 Subject: [PATCH] fix: update userMetadata to use KeyValue Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- schema/json/schema-latest.json | 7 ++---- syft/pkg/cataloger/ai/parse_safetensors.go | 23 +++++++++++++++++++ .../cataloger/ai/parse_safetensors_model.go | 2 +- .../pkg/cataloger/ai/parse_safetensors_oci.go | 2 +- .../cataloger/ai/parse_safetensors_test.go | 9 ++++---- syft/pkg/safetensors.go | 5 ++-- 6 files changed, 35 insertions(+), 13 deletions(-) diff --git a/schema/json/schema-latest.json b/schema/json/schema-latest.json index 8102a8f1d..b21fcd13a 100644 --- a/schema/json/schema-latest.json +++ b/schema/json/schema-latest.json @@ -4129,11 +4129,8 @@ "description": "ShardCount is the number of .safetensors shards for a sharded model (1 for a\nsingle-file model)." }, "userMetadata": { - "additionalProperties": { - "type": "string" - }, - "type": "object", - "description": "UserMetadata is the optional \"__metadata__\" map from a .safetensors file header\n(string-to-string key/values set by the producer)." + "$ref": "#/$defs/KeyValues", + "description": "UserMetadata is the optional \"__metadata__\" map from a .safetensors file header\n(string-to-string key/values set by the producer). Stored as a sorted KeyValues\nslice rather than a Go map so SBOM output is stable across runs." }, "metadataHash": { "type": "string", diff --git a/syft/pkg/cataloger/ai/parse_safetensors.go b/syft/pkg/cataloger/ai/parse_safetensors.go index 3f5de7ed2..6615d7ee6 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors.go +++ b/syft/pkg/cataloger/ai/parse_safetensors.go @@ -9,6 +9,8 @@ import ( "strings" "github.com/cespare/xxhash/v2" + + "github.com/anchore/syft/syft/pkg" ) // SafeTensors file format: [8 bytes u64 LE header size] [N bytes JSON header] [tensor data]. @@ -144,6 +146,27 @@ func (h *safeTensorsHeader) metadataHash() string { return fmt.Sprintf("%016x", xxhash.Sum64(b)) } +// userMetadataKeyValues converts the safetensors __metadata__ map into a +// KeyValues slice sorted by key. We do not use the convention of returning a +// nil slice for an empty input — instead, an empty input maps to an empty +// (length-0, non-nil) KeyValues — so downstream JSON serialization remains +// stable: `omitempty` drops the field either way. +func userMetadataKeyValues(m map[string]string) pkg.KeyValues { + if len(m) == 0 { + return nil + } + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Strings(keys) + out := make(pkg.KeyValues, 0, len(keys)) + for _, k := range keys { + out = append(out, pkg.KeyValue{Key: k, Value: m[k]}) + } + return out +} + // normalizeDType maps a safetensors/torch dtype label to an uppercase quantization // shorthand matching conventions used elsewhere in syft (e.g., BF16, F16, I8). func normalizeDType(dtype string) string { diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go index a1e70d655..0d4ddd6c5 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_model.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go @@ -38,7 +38,7 @@ func parseSafeTensorsFile(_ context.Context, resolver file.Resolver, _ *generic. TensorCount: uint64(len(header.tensors)), Quantization: normalizeDType(header.dominantDType()), ShardCount: 1, - UserMetadata: header.metadata, + UserMetadata: userMetadataKeyValues(header.metadata), MetadataHash: header.metadataHash(), } if p := header.parameterCount(); p > 0 { diff --git a/syft/pkg/cataloger/ai/parse_safetensors_oci.go b/syft/pkg/cataloger/ai/parse_safetensors_oci.go index d5e78f2db..7f5e5f0bb 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_oci.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_oci.go @@ -261,7 +261,7 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env Format: "safetensors", TensorCount: uint64(len(header.tensors)), Quantization: normalizeDType(header.dominantDType()), - UserMetadata: header.metadata, + UserMetadata: userMetadataKeyValues(header.metadata), MetadataHash: header.metadataHash(), } if p := header.parameterCount(); p > 0 { diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go index fb9775b60..6af4c5da1 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_test.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go @@ -74,7 +74,7 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) { TorchDtype: "bfloat16", TransformersVersion: "4.40.0", ShardCount: 1, - UserMetadata: userMeta, + UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}}, MetadataHash: wantHash, }, }, @@ -276,6 +276,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { "layer.1.weight": {DType: "BF16", Shape: []int64{16, 16}, DataOffsets: []int64{32768, 33280}}, } userMeta := map[string]string{"format": "pt"} + wantUserMetadata := pkg.KeyValues{{Key: "format", Value: "pt"}} blob := buildSafeTensorsFile(t, userMeta, tensors) wantHash := (&safeTensorsHeader{metadata: userMeta, tensors: tensors}).metadataHash() @@ -291,7 +292,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { assert.Equal(t, "safetensors", md.Format) assert.Equal(t, uint64(2), md.TensorCount) assert.Equal(t, "BF16", md.Quantization) - assert.Equal(t, userMeta, md.UserMetadata) + assert.Equal(t, wantUserMetadata, md.UserMetadata) assert.Equal(t, wantHash, md.MetadataHash) }) @@ -327,7 +328,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { // MetadataHash is cleared on absorbed parts by the existing merge processor. // What survives is the rest of the per-shard metadata (UserMetadata, TensorCount, // header-derived Quantization). Confirm those are intact. - assert.Equal(t, userMeta, md.Parts[0].UserMetadata) + assert.Equal(t, wantUserMetadata, md.Parts[0].UserMetadata) assert.Equal(t, uint64(2), md.Parts[0].TensorCount) assert.Equal(t, "BF16", md.Parts[0].Quantization, "part keeps the normalized header dtype") }) @@ -357,7 +358,7 @@ func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) { assert.Equal(t, uint64(148), md.TensorCount, "nomic-embed-v2-moe 475M ships 148 tensor entries in this shard") assert.Equal(t, "F32", md.Quantization, "every tensor in the captured shard is F32") assert.Equal(t, "475.29M", md.Parameters) - assert.Equal(t, map[string]string{"format": "pt"}, md.UserMetadata) + assert.Equal(t, pkg.KeyValues{{Key: "format", Value: "pt"}}, md.UserMetadata) // MetadataHash is locked to the exact value the parser produces for this // captured input. The fixture is immutable on disk; if this value changes // either the hash algorithm or the canonicalization changed, both of which diff --git a/syft/pkg/safetensors.go b/syft/pkg/safetensors.go index f1fae79f9..b09060c5f 100644 --- a/syft/pkg/safetensors.go +++ b/syft/pkg/safetensors.go @@ -46,8 +46,9 @@ type SafeTensorsModelInfo struct { ShardCount int `json:"shardCount,omitempty" cyclonedx:"shardCount"` // UserMetadata is the optional "__metadata__" map from a .safetensors file header - // (string-to-string key/values set by the producer). - UserMetadata map[string]string `json:"userMetadata,omitempty" cyclonedx:"userMetadata"` + // (string-to-string key/values set by the producer). Stored as a sorted KeyValues + // slice rather than a Go map so SBOM output is stable across runs. + UserMetadata KeyValues `json:"userMetadata,omitempty" cyclonedx:"userMetadata"` // MetadataHash is an xxhash of the normalized header metadata, providing a stable // identifier for identical model content across repositories or filenames.