From 7e482a26c6628572a6b6793ed428f1b200c47909 Mon Sep 17 00:00:00 2001
From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
Date: Tue, 30 Jun 2026 15:16:34 -0400
Subject: [PATCH] fix: hash logical content and count safetensors parameters as
 an integer

- metadataHash now covers logical tensor content (name + dtype + shape) plus
  __metadata__, and excludes DataOffsets.

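- Parameters becomes a measured uint64 count (matching GGUFFileHeader) instead
  of a formatted/upstream string. The Docker AI config blob's "parameters"
  label is no longer read; for sharded models the per-shard counts are summed.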

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
---
 schema/json/schema-16.1.8.json                |  4 +-
 schema/json/schema-latest.json                |  4 +-
 syft/pkg/cataloger/ai/merge.go                |  6 ++-
 syft/pkg/cataloger/ai/merge_test.go           |  6 +--
 syft/pkg/cataloger/ai/parse_safetensors.go    | 26 +++++------
 .../cataloger/ai/parse_safetensors_model.go   | 20 +--------
 .../pkg/cataloger/ai/parse_safetensors_oci.go | 10 ++---
 .../cataloger/ai/parse_safetensors_test.go    | 44 ++++++-------------
 syft/pkg/safetensors.go                       |  7 +--
 9 files changed, 48 insertions(+), 79 deletions(-)

diff --git a/schema/json/schema-16.1.8.json b/schema/json/schema-16.1.8.json
index 0e8e1527c..6e548d319 100644
--- a/schema/json/schema-16.1.8.json
+++ b/schema/json/schema-16.1.8.json
@@ -4221,8 +4221,8 @@
           "description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
         },
         "parameters": {
-          "type": "string",
-          "description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
+          "type": "integer",
+          "description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
         },
         "tensorCount": {
           "type": "integer",
diff --git a/schema/json/schema-latest.json b/schema/json/schema-latest.json
index 0e8e1527c..6e548d319 100644
--- a/schema/json/schema-latest.json
+++ b/schema/json/schema-latest.json
@@ -4221,8 +4221,8 @@
           "description": "Quantization describes tensor precision (e.g., \"BF16\", \"F16\", \"F32\", \"INT8\")."
         },
         "parameters": {
-          "type": "string",
-          "description": "Parameters is the parameter count as reported by upstream. Stored as a string\nbecause Docker AI and Hugging Face labels use notation like \"2.68B\" or \"35B-A3B\"."
+          "type": "integer",
+          "description": "Parameters is the total number of model parameters, computed from the tensor\nshapes in the SafeTensors header(s). For a sharded model it is the sum across\nevery shard."
         },
         "tensorCount": {
           "type": "integer",
diff --git a/syft/pkg/cataloger/ai/merge.go b/syft/pkg/cataloger/ai/merge.go
index 7351c2576..a8d2b4e74 100644
--- a/syft/pkg/cataloger/ai/merge.go
+++ b/syft/pkg/cataloger/ai/merge.go
@@ -59,7 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 		if merged.TensorCount == 0 {
 			merged.TensorCount = a.TensorCount
 		}
-		firstNonEmpty(&merged.Parameters, a.Parameters)
 		firstNonEmpty(&merged.TotalSize, a.TotalSize)
 		firstNonEmpty(&merged.Quantization, a.Quantization)
 	}
@@ -69,12 +68,15 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 // the summed shard TensorCount and the list of non-empty per-shard hashes for
 // the rollup. Shards carry only the content-derived fields (Quantization,
 // Parameters, UserMetadata), so those are the only fields folded in here.
+// TensorCount and Parameters are summed because each shard holds a distinct
+// slice of the model; Quantization keeps the first non-empty value (one already
+// set on merged wins) since all shards are expected to share one precision.
 func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
 	seenKV := map[string]bool{}
 	for _, s := range shards {
 		shardTensorTotal += s.TensorCount
+		merged.Parameters += s.Parameters
 		firstNonEmpty(&merged.Quantization, s.Quantization)
-		firstNonEmpty(&merged.Parameters, s.Parameters)
 		for _, kv := range s.UserMetadata {
 			if seenKV[kv.Key] {
 				continue
diff --git a/syft/pkg/cataloger/ai/merge_test.go b/syft/pkg/cataloger/ai/merge_test.go
index 06631db53..7276d8896 100644
--- a/syft/pkg/cataloger/ai/merge_test.go
+++ b/syft/pkg/cataloger/ai/merge_test.go
@@ -27,7 +27,7 @@ func shardMeta(hash string, tensorCount uint64) pkg.SafeTensorsModelInfo {
 		Format:       "safetensors",
 		TensorCount:  tensorCount,
 		Quantization: "BF16",
-		Parameters:   "1.00K",
+		Parameters:   1000,
 		MetadataHash: hash,
 	}
 }
@@ -60,6 +60,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
 		md := out.Metadata.(pkg.SafeTensorsModelInfo)
 		assert.Equal(t, 3, md.ShardCount)
 		assert.Equal(t, uint64(9), md.TensorCount, "tensor counts are summed across shards")
+		assert.Equal(t, uint64(3000), md.Parameters, "parameter counts are summed across shards")
 		require.Len(t, md.Parts, 3)
 		assert.Equal(t,
 			[]string{"aaaa", "bbbb", "cccc"},
@@ -85,7 +86,6 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
 			Format:       "safetensors",
 			TensorCount:  999,
 			TotalSize:    "5.00GB",
-			Parameters:   "2.68B",
 			Quantization: "Q4_K_M",
 		}
 		in := []pkg.Package{
@@ -98,7 +98,7 @@ func TestMergeSafeTensorsGroup(t *testing.T) {
 		md := out.Metadata.(pkg.SafeTensorsModelInfo)
 		assert.Equal(t, uint64(999), md.TensorCount, "aggregate TensorCount is authoritative; shard counts are not summed in")
 		assert.Equal(t, "5.00GB", md.TotalSize)
-		assert.Equal(t, "2.68B", md.Parameters)
+		assert.Equal(t, uint64(2000), md.Parameters, "parameters are always measured from the shards (summed), not taken from the aggregate")
 		assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate quantization wins over the shard dtype")
 		assert.Equal(t, 2, md.ShardCount, "ShardCount comes from the number of shards, not the aggregate")
 		assert.Equal(t, rollupHash([]string{"aaaa", "bbbb"}), md.MetadataHash, "the content hash still rolls up the shard hashes")
diff --git a/syft/pkg/cataloger/ai/parse_safetensors.go b/syft/pkg/cataloger/ai/parse_safetensors.go
index 79967401c..bc1ff1402 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors.go
@@ -51,14 +51,14 @@ func readSafeTensorsHeader(r io.Reader) (*safeTensorsHeader, error) {
 		return nil, fmt.Errorf("safetensors header size %d exceeds maximum %d", headerLen, maxSafeTensorsHeaderSize)
 	}
 
-        // Read incrementally rather than pre-allocating headerLen up front
-        body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
-        if err != nil {
+	// Read incrementally rather than pre-allocating headerLen up front
+	body, err := io.ReadAll(io.LimitReader(r, int64(headerLen)))
+	if err != nil {
 		return nil, fmt.Errorf("failed to read header body: %w", err)
 	}
-        if uint64(len(body)) != headerLen {
-                return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
-        }
+	if uint64(len(body)) != headerLen {
+		return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen)
+	}
 
 	var raw map[string]json.RawMessage
 	if err := json.Unmarshal(body, &raw); err != nil {
@@ -131,19 +131,19 @@ func (h *safeTensorsHeader) dominantDType() string {
 // (name + dtype + shape) plus the __metadata__ map. Tensor keys are sorted to
 // keep the hash deterministic across producers.
 func (h *safeTensorsHeader) metadataHash() string {
-        type logicalEntry struct {
-                Name  string  `json:"name"`
-                DType string  `json:"dtype"`
-                Shape []int64 `json:"shape"`
+	type logicalEntry struct {
+		Name  string  `json:"name"`
+		DType string  `json:"dtype"`
+		Shape []int64 `json:"shape"`
 	}
-        entries := make([]logicalEntry, 0, len(h.tensors))
+	entries := make([]logicalEntry, 0, len(h.tensors))
 	for name, t := range h.tensors {
-                entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
+		entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape})
 	}
 	sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name })
 
 	type hashInput struct {
-                Tensors  []logicalEntry    `json:"tensors"`
+		Tensors  []logicalEntry    `json:"tensors"`
 		Metadata map[string]string `json:"metadata,omitempty"`
 	}
 	b, err := json.Marshal(hashInput{Tensors: entries, Metadata: h.metadata})
diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go
index 134ef71b1..a9a8f3c66 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors_model.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go
@@ -31,13 +31,11 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		TensorCount:  uint64(len(header.tensors)),
+		Parameters:   header.parameterCount(),
 		Quantization: normalizeDType(header.dominantDType()),
 		UserMetadata: userMetadataKeyValues(header.metadata),
 		MetadataHash: header.metadataHash(),
 	}
-	if p := header.parameterCount(); p > 0 {
-		md.Parameters = formatParameterCount(p)
-	}
 
 	p := newSafeTensorsPackage(
 		&md,
@@ -46,21 +44,5 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
 	return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file")
 }
 
-// formatParameterCount prints a count like 6_700_000_000 as "6.70B" using
-// B/M/K thresholds matching the notation used by Hugging Face and Docker AI
-// labels.
-func formatParameterCount(n uint64) string {
-	switch {
-	case n >= 1_000_000_000:
-		return fmt.Sprintf("%.2fB", float64(n)/1_000_000_000)
-	case n >= 1_000_000:
-		return fmt.Sprintf("%.2fM", float64(n)/1_000_000)
-	case n >= 1_000:
-		return fmt.Sprintf("%.2fK", float64(n)/1_000)
-	default:
-		return fmt.Sprintf("%d", n)
-	}
-}
-
 // integrity check
 var _ generic.Parser = parseSafeTensorsFile
diff --git a/syft/pkg/cataloger/ai/parse_safetensors_oci.go b/syft/pkg/cataloger/ai/parse_safetensors_oci.go
index a20988f79..7fd9f3b3e 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors_oci.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_oci.go
@@ -39,7 +39,6 @@ type dockerAIModelConfig struct {
 	Config struct {
 		Format       string `json:"format"`
 		Quantization string `json:"quantization"`
-		Parameters   string `json:"parameters"`
 		Size         string `json:"size"`
 		SafeTensors  struct {
 			TensorCount json.Number `json:"tensor_count"`
@@ -65,10 +64,13 @@ func parseSafeTensorsOCIConfig(_ context.Context, _ file.Resolver, _ *generic.En
 		return nil, nil, nil
 	}
 
+	// Parameters is intentionally not read from the config blob: we measure the
+	// true parameter count from the SafeTensors layer headers (parseSafeTensorsOCILayer)
+	// so OCI and directory scans of the same model agree, rather than trusting the
+	// producer-supplied label here.
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		Quantization: cfg.Config.Quantization,
-		Parameters:   cfg.Config.Parameters,
 		TotalSize:    cfg.Config.Size,
 	}
 	if n, err := cfg.Config.SafeTensors.TensorCount.Int64(); err == nil && n > 0 {
@@ -95,13 +97,11 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		TensorCount:  uint64(len(header.tensors)),
+		Parameters:   header.parameterCount(),
 		Quantization: normalizeDType(header.dominantDType()),
 		UserMetadata: userMetadataKeyValues(header.metadata),
 		MetadataHash: header.metadataHash(),
 	}
-	if p := header.parameterCount(); p > 0 {
-		md.Parameters = formatParameterCount(p)
-	}
 
 	p := newSafeTensorsPackage(
 		&md,
diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go
index dfd586b33..14cfa7762 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors_test.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go
@@ -83,7 +83,7 @@ func TestSafeTensorsCataloger(t *testing.T) {
 			Format:       "safetensors",
 			Architecture: architecture,
 			Quantization: "BF16",
-			Parameters:   "16.26K",
+			Parameters:   16256,
 			TensorCount:  2,
 			ShardCount:   1,
 			UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
@@ -344,12 +344,13 @@ func TestParseSafeTensorsOCIConfig(t *testing.T) {
 				{
 					// nameless: the merge processor assigns the name and resolves
 					// licenses. Config blobs carry no header content, so
-					// MetadataHash stays empty.
+					// MetadataHash stays empty. The "parameters" label in the blob is
+					// intentionally ignored: the true count is measured from the
+					// SafeTensors layer headers, so Parameters stays zero here.
 					Type: pkg.ModelPkg,
 					Metadata: pkg.SafeTensorsModelInfo{
 						Format:       "safetensors",
 						Quantization: "Q4_K_M",
-						Parameters:   "8B",
 						TotalSize:    "16.00GB",
 						TensorCount:  291,
 					},
@@ -708,7 +709,7 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
 				Type: pkg.ModelPkg,
 				Metadata: pkg.SafeTensorsModelInfo{
 					Format:       "safetensors",
-					Parameters:   "16.64K",
+					Parameters:   16640,
 					Quantization: "BF16",
 					TensorCount:  2,
 					UserMetadata: wantUserMetadata,
@@ -740,7 +741,6 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
 			Type: pkg.ModelPkg,
 			Metadata: pkg.SafeTensorsModelInfo{
 				Format:       "safetensors",
-				Parameters:   "2.68B",
 				TotalSize:    "5.00GB",
 				Quantization: "Q4_K_M", // raw producer string
 				TensorCount:  9999,
@@ -767,11 +767,13 @@ func TestParseSafeTensorsOCILayer(t *testing.T) {
 		got := out[0]
 		assert.Equal(t, "qwen-test", got.Name, "name comes from the companion config.json _name_or_path")
 		md := got.Metadata.(pkg.SafeTensorsModelInfo)
-		// Aggregate-declared fields win for totals; per-shard count must NOT be
-		// summed into the aggregate.
+		// Aggregate-declared TensorCount/TotalSize win as authoritative totals; the
+		// per-shard TensorCount must NOT be summed into the aggregate. Parameters is
+		// the exception: it is always measured from the shard layer header (here the
+		// 16640-element blob), never taken from the config aggregate.
 		assert.Equal(t, uint64(9999), md.TensorCount)
 		assert.Equal(t, "5.00GB", md.TotalSize)
-		assert.Equal(t, "2.68B", md.Parameters)
+		assert.Equal(t, uint64(16640), md.Parameters)
 		// Aggregate Quantization wins when set; shard's normalized dtype is the
 		// fallback (not exercised here because the config had Q4_K_M).
 		assert.Equal(t, "Q4_K_M", md.Quantization)
@@ -804,11 +806,11 @@ func TestParseSafeTensorsOCILayer_realFixture(t *testing.T) {
 			Type: pkg.ModelPkg,
 			Metadata: pkg.SafeTensorsModelInfo{
 				Format:       "safetensors",
-				Parameters:   "475.29M",
-				Quantization: "F32", // every tensor in the captured shard is F32
-				TensorCount:  148,   // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
+				Parameters:   475292928, // exact element count summed across the 148 tensors in this shard
+				Quantization: "F32",     // every tensor in the captured shard is F32
+				TensorCount:  148,       // nomic-embed-v2-moe 475M ships 148 tensor entries in this shard
 				UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
-				MetadataHash: "051a14e686673dea",
+				MetadataHash: "6026c28a883ab918",
 			},
 		},
 	}
@@ -980,24 +982,6 @@ func TestNormalizeDType(t *testing.T) {
 	}
 }
 
-func TestFormatParameterCount(t *testing.T) {
-	tests := []struct {
-		name string
-		in   uint64
-		want string
-	}{
-		{name: "raw count under 1K", in: 512, want: "512"},
-		{name: "thousands", in: 16256, want: "16.26K"},
-		{name: "billions", in: 2_680_000_000, want: "2.68B"},
-		{name: "millions", in: 35_000_000, want: "35.00M"},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			assert.Equal(t, tt.want, formatParameterCount(tt.in))
-		})
-	}
-}
-
 func TestParseFrontmatter(t *testing.T) {
 	tests := []struct {
 		name          string
diff --git a/syft/pkg/safetensors.go b/syft/pkg/safetensors.go
index 0e769472b..c1de661d0 100644
--- a/syft/pkg/safetensors.go
+++ b/syft/pkg/safetensors.go
@@ -18,9 +18,10 @@ type SafeTensorsModelInfo struct {
 	// Quantization describes tensor precision (e.g., "BF16", "F16", "F32", "INT8").
 	Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`
 
-	// Parameters is the parameter count as reported by upstream. Stored as a string
-	// because Docker AI and Hugging Face labels use notation like "2.68B" or "35B-A3B".
-	Parameters string `json:"parameters,omitempty" cyclonedx:"parameters"`
+	// Parameters is the total number of model parameters, computed from the tensor
+	// shapes in the SafeTensors header(s). For a sharded model it is the sum across
+	// every shard.
+	Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`
 
 	// TensorCount is the number of tensor entries in the file header.
 	TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"`