mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
fix: hash logical content and count safetensors parameters as an integer
- metadataHash now covers logical tensor content (name + dtype + shape) plus __metadata__, and excludes DataOffsets.
- Parameters becomes a measured uint64 count (matching GGUFFileHeader) instead of a formatted/upstream string.

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
88ef52f617
commit
7e482a26c6
@ -4221,8 +4221,8 @@
|
||||
"description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
|
||||
},
|
||||
"parameters": {
|
||||
"type": "string",
|
||||
"description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
|
||||
"type": "integer",
|
||||
"description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
|
||||
},
|
||||
"tensorCount": {
|
||||
"type": "integer",
|
||||
|
||||
@ -4221,8 +4221,8 @@
|
||||
"description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
|
||||
},
|
||||
"parameters": {
|
||||
"type": "string",
|
||||
"description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
|
||||
"type": "integer",
|
||||
"description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
|
||||
},
|
||||
"tensorCount": {
|
||||
"type": "integer",
|
||||
|
||||
@ -59,7 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
|
||||
if merged.TensorCount == 0 {
|
||||
merged.TensorCount = a.TensorCount
|
||||
}
|
||||
firstNonEmpty(&merged.Parameters, a.Parameters)
|
||||
firstNonEmpty(&merged.TotalSize, a.TotalSize)
|
||||
firstNonEmpty(&merged.Quantization, a.Quantization)
|
||||
}
|
||||
@ -69,12 +68,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
|
||||
// the summed shard TensorCount and the list of non-empty per-shard hashes for
|
||||
// the rollup. Shards carry only the content-derived fields (Quantization,
|
||||
// Parameters, UserMetadata), so those are the only fields folded in here.
|
||||
// TensorCount and Parameters are summed because each shard holds a distinct
|
||||
// slice of the model; Quantization takes the first value since all shards share
|
||||
// one precision.
|
||||
func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
|
||||
seenKV := map[string]bool{}
|
||||
for _, s := range shards {
|
||||
shardTensorTotal += s.TensorCount
|
||||
merged.Parameters += s.Parameters
|
||||
firstNonEmpty(&merged.Quantization, s.Quantization)
|
||||
firstNonEmpty(&merged.Parameters, s.Parameters)
|
||||
for _, kv := range s.UserMetadata {
|
||||
if seenKV[kv.Key] {
|
||||
continue
|
||||
|
||||
@ -27,7 +27,7 @@ func shardMeta(hash string, tensorCount uint64) pkg.SafeTensorsModelInfo {
|
||||
Format: "safetensors",
|
||||
TensorCount: tensorCount,
|
||||
Quantization: "BF16",
|
||||
Parameters: "1.00K",
|
||||
Parameters: 1000,
|
||||
MetadataHash: hash,
|
||||
}
|
||||
}
|
||||
@ -60,6 +60,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
|
||||
md := out.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
assert.Equal(t, 3, md.ShardCount)
|
||||
assert.Equal(t, uint64(9), md.TensorCount, "tensor counts are summed across shards")
|
||||
assert.Equal(t, uint64(3000), md.Parameters, "parameter counts are summed across shards")
|
||||
require.Len(t, md.Parts, 3)
|
||||
assert.Equal(t,
|
||||
[]string{"aaaa", "bbbb", "cccc"},
|
||||
@ -85,7 +86,6 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
|
||||
Format: "safetensors",
|
||||
TensorCount: 999,
|
||||
TotalSize: "5.00GB",
|
||||
Parameters: "2.68B",
|
||||
Quantization: "Q4_K_M",
|
||||
}
|
||||
in := []pkg.Package{
|
||||
@ -98,7 +98,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
|
||||
md := out.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
assert.Equal(t, uint64(999), md.TensorCount, "aggregate TensorCount is authoritative; shard counts are not summed in")
|
||||
assert.Equal(t, "5.00GB", md.TotalSize)
|
||||
assert.Equal(t, "2.68B", md.Parameters)
|
||||
assert.Equal(t, uint64(2000), md.Parameters, "parameters are always measured from the shards (summed), not taken from the aggregate")
|
||||
assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate quantization wins over the shard dtype")
|
||||
assert.Equal(t, 2, md.ShardCount, "ShardCount comes from the number of shards, not the aggregate")
|
||||
assert.Equal(t, rollupHash([]string{"aaaa", "bbbb"}), md.MetadataHash, "the content hash still rolls up the shard hashes")
|
||||
|
||||
@ -51,14 +51,14 @@ func readSafeTensorsHeader(r io.Reader) (*safeTensorsHeader, error) {
|
||||
return nil, fmt.Errorf("safetensors header size %d exceeds maximum %d", headerLen, maxSafeTensorsHeaderSize)
|
||||
}
|
||||
|
||||
// Read incrementally rather than pre-allocating headerLen up front
|
||||
body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
|
||||
if err != nil {
|
||||
// Read incrementally rather than pre-allocating headerLen up front
|
||||
body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read header body: %w", err)
|
||||
}
|
||||
if uint64(len(body)) != headerLen {
|
||||
return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
|
||||
}
|
||||
if uint64(len(body)) != headerLen {
|
||||
return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
|
||||
}
|
||||
|
||||
var raw map[string]json.RawMessage
|
||||
if err := json.Unmarshal(body, &raw); err != nil {
|
||||
@ -131,19 +131,19 @@ func (h *safeTensorsHeader) dominantDType() string {
|
||||
// (name + dtype + shape) plus the __metadata__ map. Tensor keys are sorted to
|
||||
// keep the hash deterministic across producers.
|
||||
func (h *safeTensorsHeader) metadataHash() string {
|
||||
type logicalEntry struct {
|
||||
Name string `json:"name"`
|
||||
DType string `json:"dtype"`
|
||||
Shape []int64 `json:"shape"`
|
||||
type logicalEntry struct {
|
||||
Name string `json:"name"`
|
||||
DType string `json:"dtype"`
|
||||
Shape []int64 `json:"shape"`
|
||||
}
|
||||
entries := make([]logicalEntry, 0, len(h.tensors))
|
||||
entries := make([]logicalEntry, 0, len(h.tensors))
|
||||
for name, t := range h.tensors {
|
||||
entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
|
||||
entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
|
||||
}
|
||||
sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name })
|
||||
|
||||
type hashInput struct {
|
||||
Tensors []logicalEntry `json:"tensors"`
|
||||
Tensors []logicalEntry `json:"tensors"`
|
||||
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
}
|
||||
b, err := json.Marshal(hashInput{Tensors: entries, Metadata: h.metadata})
|
||||
|
||||
@ -31,13 +31,11 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
|
||||
md := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: uint64(len(header.tensors)),
|
||||
Parameters: header.parameterCount(),
|
||||
Quantization: normalizeDType(header.dominantDType()),
|
||||
UserMetadata: userMetadataKeyValues(header.metadata),
|
||||
MetadataHash: header.metadataHash(),
|
||||
}
|
||||
if p := header.parameterCount(); p > 0 {
|
||||
md.Parameters = formatParameterCount(p)
|
||||
}
|
||||
|
||||
p := newSafeTensorsPackage(
|
||||
&md,
|
||||
@ -46,21 +44,5 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
|
||||
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file")
|
||||
}
|
||||
|
||||
// formatParameterCount renders a raw parameter count in the short B/M/K
// notation used by Hugging Face and Docker AI labels, e.g. 6_700_000_000
// becomes "6.70B". Counts below 1,000 are printed as plain integers.
//
// The unit is chosen so the two-decimal mantissa never reads "1000.00": a
// count such as 999_999 is reported as "1.00M" rather than "1000.00K".
func formatParameterCount(n uint64) string {
	const (
		thousand = 1_000
		million  = 1_000_000
		billion  = 1_000_000_000
	)
	switch {
	case n >= billion:
		// B is the largest unit, so there is nothing to promote to.
		return fmt.Sprintf("%.2fB", float64(n)/billion)
	case n >= million:
		return formatScaledCount(n, million, "M", "B")
	case n >= thousand:
		return formatScaledCount(n, thousand, "K", "M")
	default:
		return fmt.Sprintf("%d", n)
	}
}

// formatScaledCount prints n/divisor with two decimals followed by unit. When
// rounding lifts the mantissa to "1000.00" (n sits just below the next
// threshold), it returns "1.00" with nextUnit instead.
func formatScaledCount(n uint64, divisor float64, unit, nextUnit string) string {
	mantissa := fmt.Sprintf("%.2f", float64(n)/divisor)
	if mantissa == "1000.00" {
		return "1.00" + nextUnit
	}
	return mantissa + unit
}
|
||||
|
||||
// integrity check
|
||||
var _ generic.Parser = parseSafeTensorsFile
|
||||
|
||||
@ -39,7 +39,6 @@ type dockerAIModelConfig struct {
|
||||
Config struct {
|
||||
Format string `json:"format"`
|
||||
Quantization string `json:"quantization"`
|
||||
Parameters string `json:"parameters"`
|
||||
Size string `json:"size"`
|
||||
SafeTensors struct {
|
||||
TensorCount json.Number `json:"tensor_count"`
|
||||
@ -65,10 +64,13 @@ func parseSafeTensorsOCIConfig(_ context.Context, _ file.Resolver, _ *generic.En
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
// Parameters is intentionally not read from the config blob: we measure the
|
||||
// true parameter count from the SafeTensors layer headers (parseSafeTensorsOCILayer)
|
||||
// so OCI and directory scans of the same model agree, rather than trusting the
|
||||
// producer-supplied label here.
|
||||
md := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Quantization: cfg.Config.Quantization,
|
||||
Parameters: cfg.Config.Parameters,
|
||||
TotalSize: cfg.Config.Size,
|
||||
}
|
||||
if n, err := cfg.Config.SafeTensors.TensorCount.Int64(); err == nil && n > 0 {
|
||||
@ -95,13 +97,11 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env
|
||||
md := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: uint64(len(header.tensors)),
|
||||
Parameters: header.parameterCount(),
|
||||
Quantization: normalizeDType(header.dominantDType()),
|
||||
UserMetadata: userMetadataKeyValues(header.metadata),
|
||||
MetadataHash: header.metadataHash(),
|
||||
}
|
||||
if p := header.parameterCount(); p > 0 {
|
||||
md.Parameters = formatParameterCount(p)
|
||||
}
|
||||
|
||||
p := newSafeTensorsPackage(
|
||||
&md,
|
||||
|
||||
@ -83,7 +83,7 @@ func TestSafeTensorsCataloger(t *testing.T) {
|
||||
Format: "safetensors",
|
||||
Architecture: architecture,
|
||||
Quantization: "BF16",
|
||||
Parameters: "16.26K",
|
||||
Parameters: 16256,
|
||||
TensorCount: 2,
|
||||
ShardCount: 1,
|
||||
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
|
||||
@ -344,12 +344,13 @@ func TestParseSafeTensorsOCIConfig(t *testing.T) {
|
||||
{
|
||||
// nameless: the merge processor assigns the name and resolves
|
||||
// licenses. Config blobs carry no header content, so
|
||||
// MetadataHash stays empty.
|
||||
// MetadataHash stays empty. The "parameters" label in the blob is
|
||||
// intentionally ignored: the true count is measured from the
|
||||
// SafeTensors layer headers, so Parameters stays zero here.
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Quantization: "Q4_K_M",
|
||||
Parameters: "8B",
|
||||
TotalSize: "16.00GB",
|
||||
TensorCount: 291,
|
||||
},
|
||||
@ -708,7 +709,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Parameters: "16.64K",
|
||||
Parameters: 16640,
|
||||
Quantization: "BF16",
|
||||
TensorCount: 2,
|
||||
UserMetadata: wantUserMetadata,
|
||||
@ -740,7 +741,6 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Parameters: "2.68B",
|
||||
TotalSize: "5.00GB",
|
||||
Quantization: "Q4_K_M", // raw producer string
|
||||
TensorCount: 9999,
|
||||
@ -767,11 +767,13 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
|
||||
got := out[0]
|
||||
assert.Equal(t, "qwen-test", got.Name, "name comes from the companion config.json _name_or_path")
|
||||
md := got.Metadata.(pkg.SafeTensorsModelInfo)
|
||||
// Aggregate-declared fields win for totals; per-shard count must NOT be
|
||||
// summed into the aggregate.
|
||||
// Aggregate-declared TensorCount/TotalSize win as authoritative totals; the
|
||||
// per-shard TensorCount must NOT be summed into the aggregate. Parameters is
|
||||
// the exception: it is always measured from the shard layer header (here the
|
||||
// 16640-element blob), never taken from the config aggregate.
|
||||
assert.Equal(t, uint64(9999), md.TensorCount)
|
||||
assert.Equal(t, "5.00GB", md.TotalSize)
|
||||
assert.Equal(t, "2.68B", md.Parameters)
|
||||
assert.Equal(t, uint64(16640), md.Parameters)
|
||||
// Aggregate Quantization wins when set; shard's normalized dtype is the
|
||||
// fallback (not exercised here because the config had Q4_K_M).
|
||||
assert.Equal(t, "Q4_K_M", md.Quantization)
|
||||
@ -804,11 +806,11 @@ func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) {
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
Parameters: "475.29M",
|
||||
Quantization: "F32", // every tensor in the captured shard is F32
|
||||
TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
|
||||
Parameters: 475292928, // exact element count summed across the 148 tensors in this shard
|
||||
Quantization: "F32", // every tensor in the captured shard is F32
|
||||
TensorCount: 148, // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
|
||||
UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
|
||||
MetadataHash: "051a14e686673dea",
|
||||
MetadataHash: "6026c28a883ab918",
|
||||
},
|
||||
},
|
||||
}
|
||||
@ -980,24 +982,6 @@ func TestNormalizeDType(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatParameterCount(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
in uint64
|
||||
want string
|
||||
}{
|
||||
{name: "raw count under 1K", in: 512, want: "512"},
|
||||
{name: "thousands", in: 16256, want: "16.26K"},
|
||||
{name: "billions", in: 2_680_000_000, want: "2.68B"},
|
||||
{name: "millions", in: 35_000_000, want: "35.00M"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.want, formatParameterCount(tt.in))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFrontmatter(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
|
||||
@ -18,9 +18,10 @@ type SafeTensorsModelInfo struct {
|
||||
// Quantization describes tensor precision (e.g., "BF16", "F16", "F32", "INT8").
|
||||
Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`
|
||||
|
||||
// Parameters is the parameter count as reported by upstream. Stored as a string
|
||||
// because Docker AI and Hugging Face labels use notation like "2.68B" or "35B-A3B".
|
||||
Parameters string `json:"parameters,omitempty" cyclonedx:"parameters"`
|
||||
// Parameters is the total number of model parameters, computed from the tensor
|
||||
// shapes in the SafeTensors header(s). For a sharded model it is the sum across
|
||||
// every shard.
|
||||
Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`
|
||||
|
||||
// TensorCount is the number of tensor entries in the file header.
|
||||
TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"`
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user