fix: hash logical content and count safetensors parameters as an integer

- metadataHash now covers logical tensor content (name + dtype + shape) plus __metadata__, and excludes DataOffsets.
- Parameters becomes a measured uint64 count (matching GGUFFileHeader) instead of a formatted/upstream string.

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
2026-07-05 02:28:25 +02:00 · 2026-06-30 15:16:34 -04:00 · 2026-06-30 15:16:34 -04:00 · 7e482a26c6
commit 7e482a26c6
parent 88ef52f617
9 changed files with 48 additions and 79 deletions
--- a/schema/json/schema-16.1.8.json
+++ b/schema/json/schema-16.1.8.json
@ -4221,8 +4221,8 @@
          "description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
        },
        "parameters": {
-          "type": "string",
-          "description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
+          "type": "integer",
+          "description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
        },
        "tensorCount": {
          "type": "integer",
--- a/schema/json/schema-latest.json
+++ b/schema/json/schema-latest.json
@ -4221,8 +4221,8 @@
          "description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
        },
        "parameters": {
-          "type": "string",
-          "description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
+          "type": "integer",
+          "description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
        },
        "tensorCount": {
          "type": "integer",
--- a/syft/pkg/cataloger/ai/merge.go
+++ b/syft/pkg/cataloger/ai/merge.go
@ -59,7 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 		if merged.TensorCount == 0 {
 			merged.TensorCount = a.TensorCount
 		}
-		firstNonEmpty(&merged.Parameters, a.Parameters)
 		firstNonEmpty(&merged.TotalSize, a.TotalSize)
 		firstNonEmpty(&merged.Quantization, a.Quantization)
 	}
@ -69,12 +68,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 // the summed shard TensorCount and the list of non-empty per-shard hashes for
 // the rollup. Shards carry only the content-derived fields (Quantization,
 // Parameters, UserMetadata), so those are the only fields folded in here.
+// TensorCount and Parameters are summed because each shard holds a distinct
+// slice of the model; Quantization keeps the first non-empty value, assuming
+// all shards share one precision (an aggregate value, if set, still wins).
 func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
 	seenKV := map[string]bool{}
 	for _, s := range shards {
 		shardTensorTotal += s.TensorCount
+		merged.Parameters += s.Parameters
 		firstNonEmpty(&merged.Quantization, s.Quantization)
-		firstNonEmpty(&merged.Parameters, s.Parameters)
 		for _, kv := range s.UserMetadata {
 			if seenKV[kv.Key] {
 				continue
--- a/syft/pkg/cataloger/ai/merge_test.go
+++ b/syft/pkg/cataloger/ai/merge_test.go
@ -27,7 +27,7 @@ func shardMeta(hash string, tensorCount uint64) pkg.SafeTensorsModelInfo {
 		Format:       "safetensors",
 		TensorCount:  tensorCount,
 		Quantization: "BF16",
-		Parameters:   "1.00K",
+		Parameters:   1000,
 		MetadataHash: hash,
 	}
 }
@ -60,6 +60,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
 		md := out.Metadata.(pkg.SafeTensorsModelInfo)
 		assert.Equal(t, 3, md.ShardCount)
 		assert.Equal(t, uint64(9), md.TensorCount, "tensor counts are summed across shards")
+		assert.Equal(t, uint64(3000), md.Parameters, "parameter counts are summed across shards")
 		require.Len(t, md.Parts, 3)
 		assert.Equal(t,
 			[]string{"aaaa", "bbbb", "cccc"},
@ -85,7 +86,6 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
 			Format:       "safetensors",
 			TensorCount:  999,
 			TotalSize:    "5.00GB",
-			Parameters:   "2.68B",
 			Quantization: "Q4_K_M",
 		}
 		in := []pkg.Package{
@ -98,7 +98,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
 		md := out.Metadata.(pkg.SafeTensorsModelInfo)
 		assert.Equal(t, uint64(999), md.TensorCount, "aggregate TensorCount is authoritative; shard counts are not summed in")
 		assert.Equal(t, "5.00GB", md.TotalSize)
-		assert.Equal(t, "2.68B", md.Parameters)
+		assert.Equal(t, uint64(2000), md.Parameters, "parameters are always measured from the shards (summed), not taken from the aggregate")
 		assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate quantization wins over the shard dtype")
 		assert.Equal(t, 2, md.ShardCount, "ShardCount comes from the number of shards, not the aggregate")
 		assert.Equal(t, rollupHash([]string{"aaaa", "bbbb"}), md.MetadataHash, "the content hash still rolls up the shard hashes")
--- a/syft/pkg/cataloger/ai/parse_safetensors.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors.go
@ -51,14 +51,14 @@ func readSafeTensorsHeader(r io.Reader) (*safeTensorsHeader, error) {
 		return nil, fmt.Errorf("safetensors header size %d exceeds maximum %d", headerLen, maxSafeTensorsHeaderSize)
 	}

-        // Read incrementally rather than pre-allocating headerLen up front
-        body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
-        if err != nil {
+	// Read incrementally rather than pre-allocating headerLen up front
+	body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
+	if err != nil {
 		return nil, fmt.Errorf("failed to read header body: %w", err)
 	}
-        if uint64(len(body)) != headerLen {
-                return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
-        }
+	if uint64(len(body)) != headerLen {
+		return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
+	}

 	var raw map[string]json.RawMessage
 	if err := json.Unmarshal(body, &raw); err != nil {
@ -131,19 +131,19 @@ func (h *safeTensorsHeader) dominantDType() string {
 // (name + dtype + shape) plus the __metadata__ map. Tensor keys are sorted to
 // keep the hash deterministic across producers.
 func (h *safeTensorsHeader) metadataHash() string {
-        type logicalEntry struct {
-                Name  string  `json:"name"`
-                DType string  `json:"dtype"`
-                Shape []int64 `json:"shape"`
+	type logicalEntry struct {
+		Name  string  `json:"name"`
+		DType string  `json:"dtype"`
+		Shape []int64 `json:"shape"`
 	}
-        entries := make([]logicalEntry, 0, len(h.tensors))
+	entries := make([]logicalEntry, 0, len(h.tensors))
 	for name, t := range h.tensors {
-                entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
+		entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
 	}
 	sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name })

 	type hashInput struct {
-                Tensors  []logicalEntry    `json:"tensors"`
+		Tensors  []logicalEntry    `json:"tensors"`
 		Metadata map[string]string `json:"metadata,omitempty"`
 	}
 	b, err := json.Marshal(hashInput{Tensors: entries, Metadata: h.metadata})
--- a/syft/pkg/cataloger/ai/parse_safetensors_model.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go
@ -31,13 +31,11 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		TensorCount:  uint64(len(header.tensors)),
+		Parameters:   header.parameterCount(),
 		Quantization: normalizeDType(header.dominantDType()),
 		UserMetadata: userMetadataKeyValues(header.metadata),
 		MetadataHash: header.metadataHash(),
 	}
-	if p := header.parameterCount(); p > 0 {
-		md.Parameters = formatParameterCount(p)
-	}

 	p := newSafeTensorsPackage(
 		&md,
@ -46,21 +44,5 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
 	return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file")
 }

-// formatParameterCount prints a count like 6_700_000_000 as "6.70B" using
-// B/M/K thresholds matching the notation used by Hugging Face and Docker AI
-// labels.
-func formatParameterCount(n uint64) string {
-	switch {
-	case n >= 1_000_000_000:
-		return fmt.Sprintf("%.2fB", float64(n)/1_000_000_000)
-	case n >= 1_000_000:
-		return fmt.Sprintf("%.2fM", float64(n)/1_000_000)
-	case n >= 1_000:
-		return fmt.Sprintf("%.2fK", float64(n)/1_000)
-	default:
-		return fmt.Sprintf("%d", n)
-	}
-}
-
 // integrity check
 var _ generic.Parser = parseSafeTensorsFile
--- a/syft/pkg/cataloger/ai/parse_safetensors_oci.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_oci.go
@ -39,7 +39,6 @@ type dockerAIModelConfig struct {
 	Config struct {
 		Format       string `json:"format"`
 		Quantization string `json:"quantization"`
-		Parameters   string `json:"parameters"`
 		Size         string `json:"size"`
 		SafeTensors  struct {
 			TensorCount json.Number `json:"tensor_count"`
@ -65,10 +64,13 @@ func parseSafeTensorsOCIConfig(_ context.Context, _ file.Resolver, _ *generic.En
 		return nil, nil, nil
 	}

+	// Parameters is intentionally not read from the config blob: we measure the
+	// true parameter count from the SafeTensors layer headers (parseSafeTensorsOCILayer)
+	// so OCI and directory scans of the same model agree, rather than trusting the
+	// producer-supplied label here.
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		Quantization: cfg.Config.Quantization,
-		Parameters:   cfg.Config.Parameters,
 		TotalSize:    cfg.Config.Size,
 	}
 	if n, err := cfg.Config.SafeTensors.TensorCount.Int64(); err == nil && n > 0 {
@ -95,13 +97,11 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		TensorCount:  uint64(len(header.tensors)),
+		Parameters:   header.parameterCount(),
 		Quantization: normalizeDType(header.dominantDType()),
 		UserMetadata: userMetadataKeyValues(header.metadata),
 		MetadataHash: header.metadataHash(),
 	}
-	if p := header.parameterCount(); p > 0 {
-		md.Parameters = formatParameterCount(p)
-	}

 	p := newSafeTensorsPackage(
 		&md,
--- a/syft/pkg/cataloger/ai/parse_safetensors_test.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go
@ -83,7 +83,7 @@ func TestSafeTensorsCataloger(t *testing.T) {
 			Format:       "safetensors",
 			Architecture: architecture,
 			Quantization: "BF16",
-			Parameters:   "16.26K",
+			Parameters:   16256,
 			TensorCount:  2,
 			ShardCount:   1,
 			UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
@ -344,12 +344,13 @@ func TestParseSafeTensorsOCIConfig(t *testing.T) {
 				{
 					// nameless: the merge processor assigns the name and resolves
 					// licenses. Config blobs carry no header content, so
-					// MetadataHash stays empty.
+					// MetadataHash stays empty. The "parameters" label in the blob is
+					// intentionally ignored: the true count is measured from the
+					// SafeTensors layer headers, so Parameters stays zero here.
 					Type: pkg.ModelPkg,
 					Metadata: pkg.SafeTensorsModelInfo{
 						Format:       "safetensors",
 						Quantization: "Q4_K_M",
-						Parameters:   "8B",
 						TotalSize:    "16.00GB",
 						TensorCount:  291,
 					},
@ -708,7 +709,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
 				Type: pkg.ModelPkg,
 				Metadata: pkg.SafeTensorsModelInfo{
 					Format:       "safetensors",
-					Parameters:   "16.64K",
+					Parameters:   16640,
 					Quantization: "BF16",
 					TensorCount:  2,
 					UserMetadata: wantUserMetadata,
@ -740,7 +741,6 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
 			Type: pkg.ModelPkg,
 			Metadata: pkg.SafeTensorsModelInfo{
 				Format:       "safetensors",
-				Parameters:   "2.68B",
 				TotalSize:    "5.00GB",
 				Quantization: "Q4_K_M", // raw producer string
 				TensorCount:  9999,
@ -767,11 +767,13 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
 		got := out[0]
 		assert.Equal(t, "qwen-test", got.Name, "name comes from the companion config.json _name_or_path")
 		md := got.Metadata.(pkg.SafeTensorsModelInfo)
-		// Aggregate-declared fields win for totals; per-shard count must NOT be
-		// summed into the aggregate.
+		// Aggregate-declared TensorCount/TotalSize win as authoritative totals; the
+		// per-shard TensorCount must NOT be summed into the aggregate. Parameters is
+		// the exception: it is always measured from the shard layer header (here the
+		// 16640-element blob), never taken from the config aggregate.
 		assert.Equal(t, uint64(9999), md.TensorCount)
 		assert.Equal(t, "5.00GB", md.TotalSize)
-		assert.Equal(t, "2.68B", md.Parameters)
+		assert.Equal(t, uint64(16640), md.Parameters)
 		// Aggregate Quantization wins when set; shard's normalized dtype is the
 		// fallback (not exercised here because the config had Q4_K_M).
 		assert.Equal(t, "Q4_K_M", md.Quantization)
@ -804,11 +806,11 @@ func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) {
 			Type: pkg.ModelPkg,
 			Metadata: pkg.SafeTensorsModelInfo{
 				Format:       "safetensors",
-				Parameters:   "475.29M",
-				Quantization: "F32", // every tensor in the captured shard is F32
-				TensorCount:  148,   // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
+				Parameters:   475292928, // exact element count summed across the 148 tensors in this shard
+				Quantization: "F32",     // every tensor in the captured shard is F32
+				TensorCount:  148,       // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
 				UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
-				MetadataHash: "051a14e686673dea",
+				MetadataHash: "6026c28a883ab918",
 			},
 		},
 	}
@ -980,24 +982,6 @@ func TestNormalizeDType(t *testing.T) {
 	}
 }

-func TestFormatParameterCount(t *testing.T) {
-	tests := []struct {
-		name string
-		in   uint64
-		want string
-	}{
-		{name: "raw count under 1K", in: 512, want: "512"},
-		{name: "thousands", in: 16256, want: "16.26K"},
-		{name: "billions", in: 2_680_000_000, want: "2.68B"},
-		{name: "millions", in: 35_000_000, want: "35.00M"},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			assert.Equal(t, tt.want, formatParameterCount(tt.in))
-		})
-	}
-}
-
 func TestParseFrontmatter(t *testing.T) {
 	tests := []struct {
 		name          string
--- a/syft/pkg/safetensors.go
+++ b/syft/pkg/safetensors.go
@ -18,9 +18,10 @@ type SafeTensorsModelInfo struct {
 	// Quantization describes tensor precision (e.g., "BF16", "F16", "F32", "INT8").
 	Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`

-	// Parameters is the parameter count as reported by upstream. Stored as a string
-	// because Docker AI and Hugging Face labels use notation like "2.68B" or "35B-A3B".
-	Parameters string `json:"parameters,omitempty" cyclonedx:"parameters"`
+	// Parameters is the total number of model parameters, computed from the tensor
+	// shapes in the SafeTensors header(s). For a sharded model it is the sum across
+	// every shard.
+	Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`

 	// TensorCount is the number of tensor entries in the file header.
 	TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"`