fix: hash logical content and count safetensors parameters as an integer

- metadataHash now covers logical tensor content (name + dtype + shape) plus
  __metadata__, and excludes DataOffsets.

- Parameters becomes a measured uint64 count (matching GGUFFileHeader) instead
  of a formatted/upstream string.

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2026-06-30 15:16:34 -04:00
parent 88ef52f617
commit 7e482a26c6
No known key found for this signature in database
9 changed files with 48 additions and 79 deletions

View File

@ -4221,8 +4221,8 @@
"description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
},
"parameters": {
"type": "string",
"description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
"type": "integer",
"description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
},
"tensorCount": {
"type": "integer",

View File

@ -4221,8 +4221,8 @@
"description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
},
"parameters": {
"type": "string",
"description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
"type": "integer",
"description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
},
"tensorCount": {
"type": "integer",

View File

@ -59,7 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
if merged.TensorCount == 0 {
merged.TensorCount = a.TensorCount
}
firstNonEmpty(&merged.Parameters, a.Parameters)
firstNonEmpty(&merged.TotalSize, a.TotalSize)
firstNonEmpty(&merged.Quantization, a.Quantization)
}
@ -69,12 +68,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
// the summed shard TensorCount and the list of non-empty per-shard hashes for
// the rollup. Shards carry only the content-derived fields (Quantization,
// Parameters, UserMetadata), so those are the only fields folded in here.
// TensorCount and Parameters are summed because each shard holds a distinct
// slice of the model; Quantization takes the first value since all shards share
// one precision.
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
seenKV := map[string]bool{}
for _, s := range shards {
shardTensorTotal += s.TensorCount
merged.Parameters += s.Parameters
firstNonEmpty(&merged.Quantization, s.Quantization)
firstNonEmpty(&merged.Parameters, s.Parameters)
for _, kv := range s.UserMetadata {
if seenKV[kv.Key] {
continue

View File

@ -27,7 +27,7 @@ func shardMeta(hash string, tensorCount uint64) pkg.SafeTensorsModelInfo {
Format: "safetensors",
TensorCount: tensorCount,
Quantization: "BF16",
Parameters: "1.00K",
Parameters: 1000,
MetadataHash: hash,
}
}
@ -60,6 +60,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
md := out.Metadata.(pkg.SafeTensorsModelInfo)
assert.Equal(t, 3, md.ShardCount)
assert.Equal(t, uint64(9), md.TensorCount, "tensor counts are summed across shards")
assert.Equal(t, uint64(3000), md.Parameters, "parameter counts are summed across shards")
require.Len(t, md.Parts, 3)
assert.Equal(t,
[]string{"aaaa", "bbbb", "cccc"},
@ -85,7 +86,6 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
Format: "safetensors",
TensorCount: 999,
TotalSize: "5.00GB",
Parameters: "2.68B",
Quantization: "Q4_K_M",
}
in := []pkg.Package{
@ -98,7 +98,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
md := out.Metadata.(pkg.SafeTensorsModelInfo)
assert.Equal(t, uint64(999), md.TensorCount, "aggregate TensorCount is authoritative; shard counts are not summed in")
assert.Equal(t, "5.00GB", md.TotalSize)
assert.Equal(t, "2.68B", md.Parameters)
assert.Equal(t, uint64(2000), md.Parameters, "parameters are always measured from the shards (summed), not taken from the aggregate")
assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate quantization wins over the shard dtype")
assert.Equal(t, 2, md.ShardCount, "ShardCount comes from the number of shards, not the aggregate")
assert.Equal(t, rollupHash([]string{"aaaa", "bbbb"}), md.MetadataHash, "the content hash still rolls up the shard hashes")

View File

@ -51,14 +51,14 @@ func readSafeTensorsHeader(r io.Reader) (*safeTensorsHeader, error) {
return nil, fmt.Errorf("safetensors header size %d exceeds maximum %d", headerLen, maxSafeTensorsHeaderSize)
}
// Read incrementally rather than pre-allocating headerLen up front
body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
if err != nil {
// Read incrementally rather than pre-allocating headerLen up front
body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
if err != nil {
return nil, fmt.Errorf("failed to read header body: %w", err)
}
if uint64(len(body)) != headerLen {
return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
}
if uint64(len(body)) != headerLen {
return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
}
var raw map[string]json.RawMessage
if err := json.Unmarshal(body, &raw); err != nil {
@ -131,19 +131,19 @@ func (h *safeTensorsHeader) dominantDType() string {
// (name + dtype + shape) plus the __metadata__ map. Tensor keys are sorted to
// keep the hash deterministic across producers.
func (h *safeTensorsHeader) metadataHash() string {
type logicalEntry struct {
Name string `json:"name"`
DType string `json:"dtype"`
Shape []int64 `json:"shape"`
type logicalEntry struct {
Name string `json:"name"`
DType string `json:"dtype"`
Shape []int64 `json:"shape"`
}
entries := make([]logicalEntry, 0, len(h.tensors))
entries := make([]logicalEntry, 0, len(h.tensors))
for name, t := range h.tensors {
entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
}
sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name })
type hashInput struct {
Tensors []logicalEntry `json:"tensors"`
Tensors []logicalEntry `json:"tensors"`
Metadata map[string]string `json:"metadata,omitempty"`
}
b, err := json.Marshal(hashInput{Tensors: entries, Metadata: h.metadata})

View File

@ -31,13 +31,11 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
md := pkg.SafeTensorsModelInfo{
Format: "safetensors",
TensorCount: uint64(len(header.tensors)),
Parameters: header.parameterCount(),
Quantization: normalizeDType(header.dominantDType()),
UserMetadata: userMetadataKeyValues(header.metadata),
MetadataHash: header.metadataHash(),
}
if p := header.parameterCount(); p > 0 {
md.Parameters = formatParameterCount(p)
}
p := newSafeTensorsPackage(
&md,
@ -46,21 +44,5 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file")
}
// formatParameterCount renders a raw parameter count in the compact notation
// used by Hugging Face and Docker AI labels: values below 1_000 print as a
// plain integer, larger values print with two decimals and a K/M/B suffix
// (e.g. 6_700_000_000 -> "6.70B").
//
// A value whose two-decimal rendering would round up to "1000.00" in one unit
// is promoted to the next unit, so 999_999 prints as "1.00M" rather than
// "1000.00K". B is the largest unit, so counts of a trillion or more stay in B.
func formatParameterCount(n uint64) string {
	if n < 1_000 {
		return fmt.Sprintf("%d", n)
	}

	// Units in ascending order; the last entry is the catch-all.
	units := [...]struct {
		divisor uint64
		suffix  string
	}{
		{1_000, "K"},
		{1_000_000, "M"},
		{1_000_000_000, "B"},
	}

	last := len(units) - 1
	for i, u := range units[:last] {
		// Skip units that are too small for n.
		if n >= units[i+1].divisor {
			continue
		}
		scaled := fmt.Sprintf("%.2f", float64(n)/float64(u.divisor))
		// Rounding pushed the mantissa to four digits: use the next unit up.
		if scaled == "1000.00" {
			continue
		}
		return scaled + u.suffix
	}
	return fmt.Sprintf("%.2f", float64(n)/float64(units[last].divisor)) + units[last].suffix
}
// integrity check: compile-time assertion that parseSafeTensorsFile is assignable to generic.Parser
var _ generic.Parser = parseSafeTensorsFile

View File

@ -39,7 +39,6 @@ type dockerAIModelConfig struct {
Config struct {
Format string `json:"format"`
Quantization string `json:"quantization"`
Parameters string `json:"parameters"`
Size string `json:"size"`
SafeTensors struct {
TensorCount json.Number `json:"tensor_count"`
@ -65,10 +64,13 @@ func parseSafeTensorsOCIConfig(_ context.Context, _ file.Resolver, _ *generic.En
return nil, nil, nil
}
// Parameters is intentionally not read from the config blob: we measure the
// true parameter count from the SafeTensors layer headers (parseSafeTensorsOCILayer)
// so OCI and directory scans of the same model agree, rather than trusting the
// producer-supplied label here.
md := pkg.SafeTensorsModelInfo{
Format: "safetensors",
Quantization: cfg.Config.Quantization,
Parameters: cfg.Config.Parameters,
TotalSize: cfg.Config.Size,
}
if n, err := cfg.Config.SafeTensors.TensorCount.Int64(); err == nil && n > 0 {
@ -95,13 +97,11 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env
md := pkg.SafeTensorsModelInfo{
Format: "safetensors",
TensorCount: uint64(len(header.tensors)),
Parameters: header.parameterCount(),
Quantization: normalizeDType(header.dominantDType()),
UserMetadata: userMetadataKeyValues(header.metadata),
MetadataHash: header.metadataHash(),
}
if p := header.parameterCount(); p > 0 {
md.Parameters = formatParameterCount(p)
}
p := newSafeTensorsPackage(
&md,

View File

@ -83,7 +83,7 @@ func TestSafeTensorsCataloger(t *testing.T) {
Format: "safetensors",
Architecture: architecture,
Quantization: "BF16",
Parameters: "16.26K",
Parameters: 16256,
TensorCount: 2,
ShardCount: 1,
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
@ -344,12 +344,13 @@ func TestParseSafeTensorsOCIConfig(t *testing.T) {
{
// nameless: the merge processor assigns the name and resolves
// licenses. Config blobs carry no header content, so
// MetadataHash stays empty.
// MetadataHash stays empty. The "parameters" label in the blob is
// intentionally ignored: the true count is measured from the
// SafeTensors layer headers, so Parameters stays zero here.
Type: pkg.ModelPkg,
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Quantization: "Q4_K_M",
Parameters: "8B",
TotalSize: "16.00GB",
TensorCount: 291,
},
@ -708,7 +709,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
Type: pkg.ModelPkg,
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Parameters: "16.64K",
Parameters: 16640,
Quantization: "BF16",
TensorCount: 2,
UserMetadata: wantUserMetadata,
@ -740,7 +741,6 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
Type: pkg.ModelPkg,
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Parameters: "2.68B",
TotalSize: "5.00GB",
Quantization: "Q4_K_M", // raw producer string
TensorCount: 9999,
@ -767,11 +767,13 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
got := out[0]
assert.Equal(t, "qwen-test", got.Name, "name comes from the companion config.json _name_or_path")
md := got.Metadata.(pkg.SafeTensorsModelInfo)
// Aggregate-declared fields win for totals; per-shard count must NOT be
// summed into the aggregate.
// Aggregate-declared TensorCount/TotalSize win as authoritative totals; the
// per-shard TensorCount must NOT be summed into the aggregate. Parameters is
// the exception: it is always measured from the shard layer header (here the
// 16640-element blob), never taken from the config aggregate.
assert.Equal(t, uint64(9999), md.TensorCount)
assert.Equal(t, "5.00GB", md.TotalSize)
assert.Equal(t, "2.68B", md.Parameters)
assert.Equal(t, uint64(16640), md.Parameters)
// Aggregate Quantization wins when set; shard's normalized dtype is the
// fallback (not exercised here because the config had Q4_K_M).
assert.Equal(t, "Q4_K_M", md.Quantization)
@ -804,11 +806,11 @@ func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) {
Type: pkg.ModelPkg,
Metadata: pkg.SafeTensorsModelInfo{
Format: "safetensors",
Parameters: "475.29M",
Quantization: "F32", // every tensor in the captured shard is F32
TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
Parameters: 475292928, // exact element count summed across the 148 tensors in this shard
Quantization: "F32", // every tensor in the captured shard is F32
TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
MetadataHash: "051a14e686673dea",
MetadataHash: "6026c28a883ab918",
},
},
}
@ -980,24 +982,6 @@ func TestNormalizeDType(t *testing.T) {
}
}
func TestFormatParameterCount(t *testing.T) {
tests := []struct {
name string
in uint64
want string
}{
{name: "raw count under 1K", in: 512, want: "512"},
{name: "thousands", in: 16256, want: "16.26K"},
{name: "billions", in: 2_680_000_000, want: "2.68B"},
{name: "millions", in: 35_000_000, want: "35.00M"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, formatParameterCount(tt.in))
})
}
}
func TestParseFrontmatter(t *testing.T) {
tests := []struct {
name string

View File

@ -18,9 +18,10 @@ type SafeTensorsModelInfo struct {
// Quantization describes tensor precision (e.g., "BF16", "F16", "F32", "INT8").
Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`
// Parameters is the parameter count as reported by upstream. Stored as a string
// because Docker AI and Hugging Face labels use notation like "2.68B" or "35B-A3B".
Parameters string `json:"parameters,omitempty" cyclonedx:"parameters"`
// Parameters is the total number of model parameters, computed from the tensor
// shapes in the SafeTensors header(s). For a sharded model it is the sum across
// every shard.
Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`
// TensorCount is the number of tensor entries in the file header.
TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"`