fix: trim fields to only those from the SafeTensors header

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
2026-07-05 02:28:25 +02:00 · 2026-06-01 23:15:08 -04:00 · 2026-06-01 23:15:08 -04:00 · dbf6dd2eb3
commit dbf6dd2eb3
parent 549f526de0
4 changed files with 15 additions and 41 deletions
--- a/schema/json/schema-latest.json
+++ b/schema/json/schema-latest.json
@@ -4098,7 +4098,7 @@
        },
        "architecture": {
          "type": "string",
-          "description": "Architecture is the model architecture (e.g., \"LlamaForCausalLM\",\n\"Qwen3MoeForConditionalGeneration\"), sourced from the Hugging Face config.json\n\"architectures\" array."
+          "description": "Architecture is the model architecture (e.g., \"LlamaForCausalLM\",\n\"Qwen3MoeForConditionalGeneration\"). It is not present in the SafeTensors\nheader itself; it is enriched from the companion Hugging Face config.json\n\"architectures\" array when one is found alongside the model."
        },
        "quantization": {
          "type": "string",
@@ -4116,14 +4116,6 @@
          "type": "string",
          "description": "TotalSize is the total byte size of tensor data across all shards when known\n(from the Docker AI model config \"size\" field or the sharded index \"total_size\")."
        },
-        "torchDtype": {
-          "type": "string",
-          "description": "TorchDtype is the Hugging Face torch_dtype (e.g., \"bfloat16\", \"float16\")."
-        },
-        "transformersVersion": {
-          "type": "string",
-          "description": "TransformersVersion is the transformers library version recorded in config.json."
-        },
        "shardCount": {
          "type": "integer",
          "description": "ShardCount is the number of .safetensors shards for a sharded model (1 for a\nsingle-file model)."
@@ -4134,7 +4126,7 @@
        },
        "metadataHash": {
          "type": "string",
-          "description": "MetadataHash is an xxhash of the normalized header metadata, providing a stable\nidentifier for identical model content across repositories or filenames."
+          "description": "MetadataHash is an xxhash over the on-disk SafeTensors header (sorted tensor\nentries + __metadata__). It is derived ONLY from the safetensors file bytes —\nnever from OCI manifest, layer descriptor, or config-blob fields — so the same\nmodel content scanned via a directory source and via an OCI image produces the\nsame value. Treat this as the cross-source content fingerprint."
        },
        "parts": {
          "items": {
--- a/syft/pkg/cataloger/ai/parse_safetensors_test.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go
@@ -77,16 +77,14 @@ func TestSafeTensorsCataloger(t *testing.T) {
 						pkg.NewLicenseFromFields("Apache-2.0", "", nil),
 					),
 					Metadata: pkg.SafeTensorsModelInfo{
-						Format:              "safetensors",
+						Format:       "safetensors",
-						Architecture:        "LlamaForCausalLM",
+						Architecture: "LlamaForCausalLM",
-						Quantization:        "BF16",
+						Quantization: "BF16",
-						Parameters:          "16.26K",
+						Parameters:   "16.26K",
-						TensorCount:         2,
+						TensorCount:  2,
-						TorchDtype:          "bfloat16",
+						ShardCount:   1,
-						TransformersVersion: "4.40.0",
+						UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}},
-						ShardCount:          1,
+						MetadataHash: wantHash,
-						UserMetadata:        pkg.KeyValues{{Key: "format", Value: "pt"}},
-						MetadataHash:        wantHash,
 					},
 				},
 			},
--- a/syft/pkg/cataloger/ai/processor.go
+++ b/syft/pkg/cataloger/ai/processor.go
@@ -222,10 +222,7 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 		}
 		firstNonEmpty(&merged.Parameters, a.Parameters)
 		firstNonEmpty(&merged.TotalSize, a.TotalSize)
 		firstNonEmpty(&merged.Architecture, a.Architecture)
 		firstNonEmpty(&merged.Quantization, a.Quantization)
-		firstNonEmpty(&merged.TorchDtype, a.TorchDtype)
-		firstNonEmpty(&merged.TransformersVersion, a.TransformersVersion)
 	}
 }
@@ -516,12 +513,6 @@ func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
 	if md.Architecture == "" && len(cfg.Architectures) > 0 {
 		md.Architecture = cfg.Architectures[0]
 	}
-	if md.TorchDtype == "" {
-		md.TorchDtype = cfg.TorchDtype
-	}
-	if md.TransformersVersion == "" {
-		md.TransformersVersion = cfg.TransformersVersion
-	}
 }
 // pickSafeTensorsName implements the documented naming precedence chain:
@@ -537,10 +528,8 @@ func pickSafeTensorsName(nameOrPath, fallbackName string) string {
 // hfConfig is a minimal projection of Hugging Face config.json fields.
 type hfConfig struct {
-	Architectures       []string `json:"architectures"`
+	Architectures []string `json:"architectures"`
-	TorchDtype          string   `json:"torch_dtype"`
+	NameOrPath    string   `json:"_name_or_path"`
-	TransformersVersion string   `json:"transformers_version"`
-	NameOrPath          string   `json:"_name_or_path"`
 }
 // readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
--- a/syft/pkg/safetensors.go
+++ b/syft/pkg/safetensors.go
@@ -18,8 +18,9 @@ type SafeTensorsModelInfo struct {
 	Format string `json:"format,omitempty" cyclonedx:"format"`
 	// Architecture is the model architecture (e.g., "LlamaForCausalLM",
-	// "Qwen3MoeForConditionalGeneration"), sourced from the Hugging Face config.json
+	// "Qwen3MoeForConditionalGeneration"). It is not present in the SafeTensors
-	// "architectures" array.
+	// header itself; it is enriched from the companion Hugging Face config.json
+	// "architectures" array when one is found alongside the model.
 	Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
 	// Quantization describes tensor precision (e.g., "BF16", "F16", "F32", "INT8").
@@ -36,12 +37,6 @@ type SafeTensorsModelInfo struct {
 	// (from the Docker AI model config "size" field or the sharded index "total_size").
 	TotalSize string `json:"totalSize,omitempty" cyclonedx:"totalSize"`
-	// TorchDtype is the Hugging Face torch_dtype (e.g., "bfloat16", "float16").
-	TorchDtype string `json:"torchDtype,omitempty" cyclonedx:"torchDtype"`
-	// TransformersVersion is the transformers library version recorded in config.json.
-	TransformersVersion string `json:"transformersVersion,omitempty" cyclonedx:"transformersVersion"`
 	// ShardCount is the number of .safetensors shards for a sharded model (1 for a
 	// single-file model).
 	ShardCount int `json:"shardCount,omitempty" cyclonedx:"shardCount"`