From 7e482a26c6628572a6b6793ed428f1b200c47909 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Tue, 30 Jun 2026 15:16:34 -0400 Subject: [PATCH] fix: hash logical content and count safetensors parameters as an integer - metadataHash now covers logical tensor content (name + dtype + shape) plus __metadata__, and excludes DataOffsets. - Parameters becomes a measured uint64 count (matching GGUFFileHeader) instead of a formatted/upstream string. Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- schema/json/schema-16.1.8.json | 4 +- schema/json/schema-latest.json | 4 +- syft/pkg/cataloger/ai/merge.go | 6 ++- syft/pkg/cataloger/ai/merge_test.go | 6 +-- syft/pkg/cataloger/ai/parse_safetensors.go | 26 +++++------ .../cataloger/ai/parse_safetensors_model.go | 20 +-------- .../pkg/cataloger/ai/parse_safetensors_oci.go | 10 ++--- .../cataloger/ai/parse_safetensors_test.go | 44 ++++++------------- syft/pkg/safetensors.go | 7 +-- 9 files changed, 48 insertions(+), 79 deletions(-) diff --git a/schema/json/schema-16.1.8.json b/schema/json/schema-16.1.8.json index 0e8e1527c..6e548d319 100644 --- a/schema/json/schema-16.1.8.json +++ b/schema/json/schema-16.1.8.json @@ -4221,8 +4221,8 @@ "description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")." }, "parameters": { - "type": "string", - "description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"." + "type": "integer", + "description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard." }, "tensorCount": { "type": "integer", diff --git a/schema/json/schema-latest.json b/schema/json/schema-latest.json index 0e8e1527c..6e548d319 100644 --- a/schema/json/schema-latest.json +++ b/schema/json/schema-latest.json @@ -4221,8 +4221,8 @@ "description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")." }, "parameters": { - "type": "string", - "description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"." + "type": "integer", + "description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard." }, "tensorCount": { "type": "integer", diff --git a/syft/pkg/cataloger/ai/merge.go b/syft/pkg/cataloger/ai/merge.go index 7351c2576..a8d2b4e74 100644 --- a/syft/pkg/cataloger/ai/merge.go +++ b/syft/pkg/cataloger/ai/merge.go @@ -59,7 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe if merged.TensorCount == 0 { merged.TensorCount = a.TensorCount } - firstNonEmpty(&merged.Parameters, a.Parameters) firstNonEmpty(&merged.TotalSize, a.TotalSize) firstNonEmpty(&merged.Quantization, a.Quantization) } @@ -69,12 +68,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe // the summed shard TensorCount and the list of non-empty per-shard hashes for // the rollup. Shards carry only the content-derived fields (Quantization, // Parameters, UserMetadata), so those are the only fields folded in here. +// TensorCount and Parameters are summed because each shard holds a distinct +// slice of the model; Quantization takes the first value since all shards share +// one precision. func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) { seenKV := map[string]bool{} for _, s := range shards { shardTensorTotal += s.TensorCount + merged.Parameters += s.Parameters firstNonEmpty(&merged.Quantization, s.Quantization) - firstNonEmpty(&merged.Parameters, s.Parameters) for _, kv := range s.UserMetadata { if seenKV[kv.Key] { continue diff --git a/syft/pkg/cataloger/ai/merge_test.go b/syft/pkg/cataloger/ai/merge_test.go index 06631db53..7276d8896 100644 --- a/syft/pkg/cataloger/ai/merge_test.go +++ b/syft/pkg/cataloger/ai/merge_test.go @@ -27,7 +27,7 @@ func shardMeta(hash string, tensorCount uint64) pkg.SafeTensorsModelInfo { Format: "safetensors", TensorCount: tensorCount, Quantization: "BF16", - Parameters: "1.00K", + Parameters: 1000, MetadataHash: hash, } } @@ -60,6 +60,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) { md := out.Metadata.(pkg.SafeTensorsModelInfo) assert.Equal(t, 3, md.ShardCount) assert.Equal(t, uint64(9), md.TensorCount, "tensor counts are summed across shards") + assert.Equal(t, uint64(3000), md.Parameters, "parameter counts are summed across shards") require.Len(t, md.Parts, 3) assert.Equal(t, []string{"aaaa", "bbbb", "cccc"}, @@ -85,7 +86,6 @@ func TestMergeSafeTensorsGroup(t *testing.T) { Format: "safetensors", TensorCount: 999, TotalSize: "5.00GB", - Parameters: "2.68B", Quantization: "Q4_K_M", } in := []pkg.Package{ @@ -98,7 +98,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) { md := out.Metadata.(pkg.SafeTensorsModelInfo) assert.Equal(t, uint64(999), md.TensorCount, "aggregate TensorCount is authoritative; shard counts are not summed in") assert.Equal(t, "5.00GB", md.TotalSize) - assert.Equal(t, "2.68B", md.Parameters) + assert.Equal(t, uint64(2000), md.Parameters, "parameters are always measured from the shards (summed), not taken from the aggregate") assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate quantization wins over the shard dtype") assert.Equal(t, 2, md.ShardCount, "ShardCount comes from the number of shards, not the aggregate") assert.Equal(t, rollupHash([]string{"aaaa", "bbbb"}), md.MetadataHash, "the content hash still rolls up the shard hashes") diff --git a/syft/pkg/cataloger/ai/parse_safetensors.go b/syft/pkg/cataloger/ai/parse_safetensors.go index 79967401c..bc1ff1402 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors.go +++ b/syft/pkg/cataloger/ai/parse_safetensors.go @@ -51,14 +51,14 @@ func readSafeTensorsHeader(r io.Reader) (*safeTensorsHeader, error) { return nil, fmt.Errorf("safetensors header size %d exceeds maximum %d", headerLen, maxSafeTensorsHeaderSize) } - // Read incrementally rather than pre-allocating headerLen up front - body, err := io.ReadAll(io.LimitReader(r, int64(headerLen))) - if err != nil { + // Read incrementally rather than pre-allocating headerLen up front + body, err := io.ReadAll(io.LimitReader(r, int64(headerLen))) + if err != nil { return nil, fmt.Errorf("failed to read header body: %w", err) } - if uint64(len(body)) != headerLen { - return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen) - } + if uint64(len(body)) != headerLen { + return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen) + } var raw map[string]json.RawMessage if err := json.Unmarshal(body, &raw); err != nil { @@ -131,19 +131,19 @@ func (h *safeTensorsHeader) dominantDType() string { // (name + dtype + shape) plus the __metadata__ map. Tensor keys are sorted to // keep the hash deterministic across producers. func (h *safeTensorsHeader) metadataHash() string { - type logicalEntry struct { - Name string `json:"name"` - DType string `json:"dtype"` - Shape []int64 `json:"shape"` + type logicalEntry struct { + Name string `json:"name"` + DType string `json:"dtype"` + Shape []int64 `json:"shape"` } - entries := make([]logicalEntry, 0, len(h.tensors)) + entries := make([]logicalEntry, 0, len(h.tensors)) for name, t := range h.tensors { - entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape}) + entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape}) } sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name }) type hashInput struct { - Tensors []logicalEntry `json:"tensors"` + Tensors []logicalEntry `json:"tensors"` Metadata map[string]string `json:"metadata,omitempty"` } b, err := json.Marshal(hashInput{Tensors: entries, Metadata: h.metadata}) diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go index 134ef71b1..a9a8f3c66 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_model.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go @@ -31,13 +31,11 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ md := pkg.SafeTensorsModelInfo{ Format: "safetensors", TensorCount: uint64(len(header.tensors)), + Parameters: header.parameterCount(), Quantization: normalizeDType(header.dominantDType()), UserMetadata: userMetadataKeyValues(header.metadata), MetadataHash: header.metadataHash(), } - if p := header.parameterCount(); p > 0 { - md.Parameters = formatParameterCount(p) - } p := newSafeTensorsPackage( &md, @@ -46,21 +44,5 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file") } -// formatParameterCount prints a count like 6_700_000_000 as "6.70B" using -// B/M/K thresholds matching the notation used by Hugging Face and Docker AI -// labels. -func formatParameterCount(n uint64) string { - switch { - case n >= 1_000_000_000: - return fmt.Sprintf("%.2fB", float64(n)/1_000_000_000) - case n >= 1_000_000: - return fmt.Sprintf("%.2fM", float64(n)/1_000_000) - case n >= 1_000: - return fmt.Sprintf("%.2fK", float64(n)/1_000) - default: - return fmt.Sprintf("%d", n) - } -} - // integrity check var _ generic.Parser = parseSafeTensorsFile diff --git a/syft/pkg/cataloger/ai/parse_safetensors_oci.go b/syft/pkg/cataloger/ai/parse_safetensors_oci.go index a20988f79..7fd9f3b3e 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_oci.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_oci.go @@ -39,7 +39,6 @@ type dockerAIModelConfig struct { Config struct { Format string `json:"format"` Quantization string `json:"quantization"` - Parameters string `json:"parameters"` Size string `json:"size"` SafeTensors struct { TensorCount json.Number `json:"tensor_count"` @@ -65,10 +64,13 @@ func parseSafeTensorsOCIConfig(_ context.Context, _ file.Resolver, _ *generic.En return nil, nil, nil } + // Parameters is intentionally not read from the config blob: we measure the + // true parameter count from the SafeTensors layer headers (parseSafeTensorsOCILayer) + // so OCI and directory scans of the same model agree, rather than trusting the + // producer-supplied label here. md := pkg.SafeTensorsModelInfo{ Format: "safetensors", Quantization: cfg.Config.Quantization, - Parameters: cfg.Config.Parameters, TotalSize: cfg.Config.Size, } if n, err := cfg.Config.SafeTensors.TensorCount.Int64(); err == nil && n > 0 { @@ -95,13 +97,11 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env md := pkg.SafeTensorsModelInfo{ Format: "safetensors", TensorCount: uint64(len(header.tensors)), + Parameters: header.parameterCount(), Quantization: normalizeDType(header.dominantDType()), UserMetadata: userMetadataKeyValues(header.metadata), MetadataHash: header.metadataHash(), } - if p := header.parameterCount(); p > 0 { - md.Parameters = formatParameterCount(p) - } p := newSafeTensorsPackage( &md, diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go index dfd586b33..14cfa7762 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_test.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go @@ -83,7 +83,7 @@ func TestSafeTensorsCataloger(t *testing.T) { Format: "safetensors", Architecture: architecture, Quantization: "BF16", - Parameters: "16.26K", + Parameters: 16256, TensorCount: 2, ShardCount: 1, UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}}, @@ -344,12 +344,13 @@ func TestParseSafeTensorsOCIConfig(t *testing.T) { { // nameless: the merge processor assigns the name and resolves // licenses. Config blobs carry no header content, so - // MetadataHash stays empty. + // MetadataHash stays empty. The "parameters" label in the blob is + // intentionally ignored: the true count is measured from the + // SafeTensors layer headers, so Parameters stays zero here. Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{ Format: "safetensors", Quantization: "Q4_K_M", - Parameters: "8B", TotalSize: "16.00GB", TensorCount: 291, }, @@ -708,7 +709,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{ Format: "safetensors", - Parameters: "16.64K", + Parameters: 16640, Quantization: "BF16", TensorCount: 2, UserMetadata: wantUserMetadata, @@ -740,7 +741,6 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{ Format: "safetensors", - Parameters: "2.68B", TotalSize: "5.00GB", Quantization: "Q4_K_M", // raw producer string TensorCount: 9999, @@ -767,11 +767,13 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { got := out[0] assert.Equal(t, "qwen-test", got.Name, "name comes from the companion config.json _name_or_path") md := got.Metadata.(pkg.SafeTensorsModelInfo) - // Aggregate-declared fields win for totals; per-shard count must NOT be - // summed into the aggregate. + // Aggregate-declared TensorCount/TotalSize win as authoritative totals; the + // per-shard TensorCount must NOT be summed into the aggregate. Parameters is + // the exception: it is always measured from the shard layer header (here the + // 16640-element blob), never taken from the config aggregate. assert.Equal(t, uint64(9999), md.TensorCount) assert.Equal(t, "5.00GB", md.TotalSize) - assert.Equal(t, "2.68B", md.Parameters) + assert.Equal(t, uint64(16640), md.Parameters) // Aggregate Quantization wins when set; shard's normalized dtype is the // fallback (not exercised here because the config had Q4_K_M). assert.Equal(t, "Q4_K_M", md.Quantization) @@ -804,11 +806,11 @@ func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) { Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{ Format: "safetensors", - Parameters: "475.29M", - Quantization: "F32", // every tensor in the captured shard is F32 - TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard + Parameters: 475292928, // exact element count summed across the 148 tensors in this shard + Quantization: "F32", // every tensor in the captured shard is F32 + TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}}, - MetadataHash: "051a14e686673dea", + MetadataHash: "6026c28a883ab918", }, }, } @@ -980,24 +982,6 @@ func TestNormalizeDType(t *testing.T) { } } -func TestFormatParameterCount(t *testing.T) { - tests := []struct { - name string - in uint64 - want string - }{ - {name: "raw count under 1K", in: 512, want: "512"}, - {name: "thousands", in: 16256, want: "16.26K"}, - {name: "billions", in: 2_680_000_000, want: "2.68B"}, - {name: "millions", in: 35_000_000, want: "35.00M"}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.want, formatParameterCount(tt.in)) - }) - } -} - func TestParseFrontmatter(t *testing.T) { tests := []struct { name string diff --git a/syft/pkg/safetensors.go b/syft/pkg/safetensors.go index 0e769472b..c1de661d0 100644 --- a/syft/pkg/safetensors.go +++ b/syft/pkg/safetensors.go @@ -18,9 +18,10 @@ type SafeTensorsModelInfo struct { // Quantization describes tensor precision (e.g., "BF16", "F16", "F32", "INT8"). Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"` - // Parameters is the parameter count as reported by upstream. Stored as a string - // because Docker AI and Hugging Face labels use notation like "2.68B" or "35B-A3B". - Parameters string `json:"parameters,omitempty" cyclonedx:"parameters"` + // Parameters is the total number of model parameters, computed from the tensor + // shapes in the SafeTensors header(s). For a sharded model it is the sum across + // every shard. + Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"` // TensorCount is the number of tensor entries in the file header. TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"`