From 6daea43c32e7902a883ee8c0d5c56481f0773dc3 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Wed, 12 Nov 2025 23:56:18 -0500 Subject: [PATCH] fix: pr comments Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- go.mod | 5 +- go.sum | 1 - internal/packagemetadata/generated.go | 1 - syft/pkg/cataloger/ai/cataloger_test.go | 287 +--------------------- syft/pkg/cataloger/ai/package.go | 10 +- syft/pkg/cataloger/ai/package_test.go | 10 +- syft/pkg/cataloger/ai/parse_gguf.go | 21 +- syft/pkg/cataloger/ai/parse_gguf_model.go | 52 ++-- syft/pkg/gguf.go | 18 +- 9 files changed, 56 insertions(+), 349 deletions(-) diff --git a/go.mod b/go.mod index 8dcbac1d2..3eb181ee9 100644 --- a/go.mod +++ b/go.mod @@ -286,7 +286,10 @@ require ( modernc.org/memory v1.11.0 // indirect ) -require github.com/gpustack/gguf-parser-go v0.22.1 +require ( + github.com/cespare/xxhash/v2 v2.3.0 + github.com/gpustack/gguf-parser-go v0.22.1 +) require ( cyphar.com/go-pathrs v0.2.1 // indirect diff --git a/go.sum b/go.sum index bb8d3f3a0..550b43819 100644 --- a/go.sum +++ b/go.sum @@ -229,7 +229,6 @@ github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqy github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= -github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= diff --git a/internal/packagemetadata/generated.go 
b/internal/packagemetadata/generated.go index 7178662f7..d718bc6e0 100644 --- a/internal/packagemetadata/generated.go +++ b/internal/packagemetadata/generated.go @@ -27,7 +27,6 @@ func AllTypes() []any { pkg.ELFBinaryPackageNoteJSONPayload{}, pkg.ElixirMixLockEntry{}, pkg.ErlangRebarLockEntry{}, - pkg.GGUFFileHeader{}, pkg.GitHubActionsUseStatement{}, pkg.GolangBinaryBuildinfoEntry{}, pkg.GolangModuleEntry{}, diff --git a/syft/pkg/cataloger/ai/cataloger_test.go b/syft/pkg/cataloger/ai/cataloger_test.go index ddf5e4114..c89203878 100644 --- a/syft/pkg/cataloger/ai/cataloger_test.go +++ b/syft/pkg/cataloger/ai/cataloger_test.go @@ -7,7 +7,6 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/pkg" @@ -15,72 +14,6 @@ import ( ) func TestGGUFCataloger_Globs(t *testing.T) { - tests := []struct { - name string - setup func(t *testing.T) string // returns fixture directory - expected []string - }{ - { - name: "finds GGUF files in root", - setup: func(t *testing.T) string { - dir := t.TempDir() - createTestGGUFInDir(t, dir, "model1.gguf") - createTestGGUFInDir(t, dir, "model2.gguf") - return dir - }, - expected: []string{ - "model1.gguf", - "model2.gguf", - }, - }, - { - name: "finds GGUF files in subdirectories", - setup: func(t *testing.T) string { - dir := t.TempDir() - modelsDir := filepath.Join(dir, "models") - os.MkdirAll(modelsDir, 0755) - createTestGGUFInDir(t, modelsDir, "llama.gguf") - - deepDir := filepath.Join(dir, "deep", "nested", "path") - os.MkdirAll(deepDir, 0755) - createTestGGUFInDir(t, deepDir, "mistral.gguf") - - return dir - }, - expected: []string{ - "models/llama.gguf", - "deep/nested/path/mistral.gguf", - }, - }, - { - name: "ignores non-GGUF files", - setup: func(t *testing.T) string { - dir := t.TempDir() - createTestGGUFInDir(t, dir, "model.gguf") - - // Create non-GGUF files - 
os.WriteFile(filepath.Join(dir, "readme.txt"), []byte("readme"), 0644) - os.WriteFile(filepath.Join(dir, "model.bin"), []byte("binary"), 0644) - os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644) - - return dir - }, - expected: []string{ - "model.gguf", - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - fixtureDir := tt.setup(t) - - pkgtest.NewCatalogTester(). - FromDirectory(t, fixtureDir). - ExpectsResolverContentQueries(tt.expected). - TestCataloger(t, NewGGUFCataloger()) - }) - } } func TestGGUFCataloger_Integration(t *testing.T) { @@ -117,114 +50,15 @@ func TestGGUFCataloger_Integration(t *testing.T) { pkg.NewLicenseFromFields("Apache-2.0", "", nil), ), Metadata: pkg.GGUFFileHeader{ - ModelFormat: "gguf", ModelName: "llama3-8b", ModelVersion: "3.0", License: "Apache-2.0", Architecture: "llama", Quantization: "Unknown", Parameters: 0, - GGUFVersion: 3, - TensorCount: 0, - Header: map[string]interface{}{}, - TruncatedHeader: false, - }, - }, - }, - expectedRelationships: nil, - }, - { - name: "catalog multiple GGUF files", - setup: func(t *testing.T) string { - dir := t.TempDir() - - // Create first model - data1 := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "llama"). - withStringKV("general.name", "model1"). - withStringKV("general.version", "1.0"). - build() - os.WriteFile(filepath.Join(dir, "model1.gguf"), data1, 0644) - - // Create second model - data2 := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "mistral"). - withStringKV("general.name", "model2"). - withStringKV("general.version", "2.0"). 
- build() - os.WriteFile(filepath.Join(dir, "model2.gguf"), data2, 0644) - - return dir - }, - expectedPackages: []pkg.Package{ - { - Name: "model1", - Version: "1.0", - Type: pkg.ModelPkg, - Metadata: pkg.GGUFFileHeader{ - ModelFormat: "gguf", - ModelName: "model1", - ModelVersion: "1.0", - Architecture: "llama", - Quantization: "Unknown", - GGUFVersion: 3, - TensorCount: 0, - Header: map[string]interface{}{}, - TruncatedHeader: false, - }, - }, - { - Name: "model2", - Version: "2.0", - Type: pkg.ModelPkg, - Metadata: pkg.GGUFFileHeader{ - ModelFormat: "gguf", - ModelName: "model2", - ModelVersion: "2.0", - Architecture: "mistral", - Quantization: "Unknown", - GGUFVersion: 3, - TensorCount: 0, - Header: map[string]interface{}{}, - TruncatedHeader: false, - }, - }, - }, - expectedRelationships: nil, - }, - { - name: "catalog GGUF in nested directories", - setup: func(t *testing.T) string { - dir := t.TempDir() - nestedDir := filepath.Join(dir, "models", "quantized") - os.MkdirAll(nestedDir, 0755) - - data := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "qwen"). - withStringKV("general.name", "qwen-nested"). - build() - - os.WriteFile(filepath.Join(nestedDir, "qwen.gguf"), data, 0644) - return dir - }, - expectedPackages: []pkg.Package{ - { - Name: "qwen-nested", - Version: unknownGGUFData, - Type: pkg.ModelPkg, - Metadata: pkg.GGUFFileHeader{ - ModelFormat: "gguf", - ModelName: "qwen-nested", - ModelVersion: unknownGGUFData, - Architecture: "qwen", - Quantization: "Unknown", - GGUFVersion: 3, - TensorCount: 0, - Header: map[string]interface{}{}, - TruncatedHeader: false, + GGUFVersion: 3, + TensorCount: 0, + Header: map[string]interface{}{}, }, }, }, @@ -252,122 +86,7 @@ func TestGGUFCataloger_Integration(t *testing.T) { } } -func TestGGUFCataloger_SkipsInvalidFiles(t *testing.T) { - dir := t.TempDir() - - // Create a valid GGUF - validData := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "llama"). 
- withStringKV("general.name", "valid-model"). - build() - os.WriteFile(filepath.Join(dir, "valid.gguf"), validData, 0644) - - // Create an invalid GGUF (wrong magic) - invalidData := newTestGGUFBuilder().buildInvalidMagic() - os.WriteFile(filepath.Join(dir, "invalid.gguf"), invalidData, 0644) - - // Create a truncated GGUF - os.WriteFile(filepath.Join(dir, "truncated.gguf"), []byte{0x47}, 0644) - - // Catalog should succeed and only return the valid package - tester := pkgtest.NewCatalogTester(). - FromDirectory(t, dir). - ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { - // Should only find the valid model - require.Len(t, pkgs, 1) - assert.Equal(t, "valid-model", pkgs[0].Name) - }) - - tester.TestCataloger(t, NewGGUFCataloger()) -} - func TestGGUFCataloger_Name(t *testing.T) { cataloger := NewGGUFCataloger() assert.Equal(t, "gguf-cataloger", cataloger.Name()) } - -func TestGGUFCataloger_EmptyDirectory(t *testing.T) { - dir := t.TempDir() - // Create a subdirectory to ensure glob still runs - os.MkdirAll(filepath.Join(dir, "models"), 0755) - - tester := pkgtest.NewCatalogTester(). - FromDirectory(t, dir). - ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, rels []artifact.Relationship) { - assert.Empty(t, pkgs) - assert.Empty(t, rels) - }) - - tester.TestCataloger(t, NewGGUFCataloger()) -} - -func TestGGUFCataloger_MixedFiles(t *testing.T) { - dir := t.TempDir() - - // Create GGUF file - ggufData := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "llama"). - withStringKV("general.name", "test-model"). 
- build() - os.WriteFile(filepath.Join(dir, "model.gguf"), ggufData, 0644) - - // Create other file types - os.WriteFile(filepath.Join(dir, "README.md"), []byte("# Models"), 0644) - os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644) - os.WriteFile(filepath.Join(dir, "weights.bin"), []byte("weights"), 0644) - os.MkdirAll(filepath.Join(dir, "subdir"), 0755) - - tester := pkgtest.NewCatalogTester(). - FromDirectory(t, dir). - ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { - // Should only find the GGUF model - require.Len(t, pkgs, 1) - assert.Equal(t, "test-model", pkgs[0].Name) - assert.Equal(t, pkg.ModelPkg, pkgs[0].Type) - }) - - tester.TestCataloger(t, NewGGUFCataloger()) -} - -func TestGGUFCataloger_CaseInsensitiveGlob(t *testing.T) { - // Test that the glob pattern is case-sensitive (as expected for **/*.gguf) - dir := t.TempDir() - - // Create lowercase .gguf - data := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "llama"). - withStringKV("general.name", "lowercase"). - build() - os.WriteFile(filepath.Join(dir, "model.gguf"), data, 0644) - - // Create uppercase .GGUF (should not match the glob) - os.WriteFile(filepath.Join(dir, "MODEL.GGUF"), data, 0644) - - tester := pkgtest.NewCatalogTester(). - FromDirectory(t, dir). - ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { - // Depending on filesystem case-sensitivity, we may get 1 or 2 packages - // On case-insensitive filesystems (macOS), both might match - // On case-sensitive filesystems (Linux), only lowercase matches - assert.GreaterOrEqual(t, len(pkgs), 1, "should find at least the lowercase file") - }) - - tester.TestCataloger(t, NewGGUFCataloger()) -} - -// createTestGGUFInDir creates a minimal test GGUF file in the specified directory -func createTestGGUFInDir(t *testing.T, dir, filename string) { - t.Helper() - data := newTestGGUFBuilder(). - withVersion(3). 
- withStringKV("general.architecture", "llama"). - withStringKV("general.name", "test-model"). - build() - - path := filepath.Join(dir, filename) - err := os.WriteFile(path, data, 0644) - require.NoError(t, err) -} diff --git a/syft/pkg/cataloger/ai/package.go b/syft/pkg/cataloger/ai/package.go index f64d6a008..dfc93adb8 100644 --- a/syft/pkg/cataloger/ai/package.go +++ b/syft/pkg/cataloger/ai/package.go @@ -1,10 +1,11 @@ package ai import ( - "crypto/sha256" "encoding/json" "fmt" + "github.com/cespare/xxhash/v2" + "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" @@ -48,7 +49,6 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string { GGUFVersion uint32 TensorCount uint64 }{ - Format: metadata.ModelFormat, Name: metadata.ModelName, Version: metadata.ModelVersion, Architecture: metadata.Architecture, @@ -63,7 +63,7 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string { return "" } - // Compute SHA256 hash - hash := sha256.Sum256(jsonBytes) - return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars) + // Compute xxhash + hash := xxhash.Sum64(jsonBytes) + return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits) } diff --git a/syft/pkg/cataloger/ai/package_test.go b/syft/pkg/cataloger/ai/package_test.go index 2299fc217..6633bcbe2 100644 --- a/syft/pkg/cataloger/ai/package_test.go +++ b/syft/pkg/cataloger/ai/package_test.go @@ -21,17 +21,15 @@ func TestNewGGUFPackage(t *testing.T) { { name: "complete GGUF package with all fields", metadata: &pkg.GGUFFileHeader{ - ModelFormat: "gguf", ModelName: "llama3-8b-instruct", ModelVersion: "3.0", License: "Apache-2.0", Architecture: "llama", Quantization: "Q4_K_M", Parameters: 8030000000, - GGUFVersion: 3, - TensorCount: 291, - Header: map[string]any{}, - TruncatedHeader: false, + GGUFVersion: 3, + TensorCount: 291, + Header: map[string]any{}, }, locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")}, checkFunc: 
func(t *testing.T, p pkg.Package) { @@ -55,7 +53,6 @@ func TestNewGGUFPackage(t *testing.T) { { name: "minimal GGUF package", metadata: &pkg.GGUFFileHeader{ - ModelFormat: "gguf", ModelName: "simple-model", ModelVersion: "1.0", Architecture: "gpt2", @@ -80,7 +77,6 @@ func TestNewGGUFPackage(t *testing.T) { { name: "GGUF package with multiple locations", metadata: &pkg.GGUFFileHeader{ - ModelFormat: "gguf", ModelName: "multi-location-model", ModelVersion: "1.5", Architecture: "llama", diff --git a/syft/pkg/cataloger/ai/parse_gguf.go b/syft/pkg/cataloger/ai/parse_gguf.go index 9f4a84550..60455ea93 100644 --- a/syft/pkg/cataloger/ai/parse_gguf.go +++ b/syft/pkg/cataloger/ai/parse_gguf.go @@ -14,19 +14,14 @@ const ( maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies ) -// ggufHeaderReader reads just the header portion of a GGUF file efficiently -type ggufHeaderReader struct { - reader io.Reader -} - // readHeader reads only the GGUF header (metadata) without reading tensor data // This is much more efficient than reading the entire file // The reader should be wrapped with io.LimitedReader to prevent OOM issues -func (r *ggufHeaderReader) readHeader() ([]byte, error) { +func readHeader(r io.Reader) ([]byte, error) { // Read initial chunk to determine header size // GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count - if _, err := io.ReadFull(r.reader, initialBuf); err != nil { + if _, err := io.ReadFull(r, initialBuf); err != nil { return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err) } @@ -45,7 +40,7 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) { // The LimitedReader will return EOF once maxHeaderSize is reached buf := make([]byte, 64*1024) // 64KB chunks for { - n, err := r.reader.Read(buf) + n, err := r.Read(buf) if n > 0 { headerData = append(headerData, buf[:n]...) 
} @@ -65,24 +60,14 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) { func convertGGUFMetadataKVs(kvs gguf_parser.GGUFMetadataKVs) map[string]interface{} { result := make(map[string]interface{}) - // Limit KV pairs to avoid bloat - const maxKVPairs = 200 - count := 0 - for _, kv := range kvs { - if count >= maxKVPairs { - break - } - // Skip standard fields that are extracted separately switch kv.Key { case "general.architecture", "general.name", "general.license", "general.version", "general.parameter_count", "general.quantization": continue } - result[kv.Key] = kv.Value - count++ } return result diff --git a/syft/pkg/cataloger/ai/parse_gguf_model.go b/syft/pkg/cataloger/ai/parse_gguf_model.go index ff0c134d0..fc22a5cc3 100644 --- a/syft/pkg/cataloger/ai/parse_gguf_model.go +++ b/syft/pkg/cataloger/ai/parse_gguf_model.go @@ -2,15 +2,19 @@ package ai import ( "context" + "encoding/json" "fmt" "io" "os" "path/filepath" + "sort" "strings" + "github.com/cespare/xxhash/v2" gguf_parser "github.com/gpustack/gguf-parser-go" "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" @@ -18,8 +22,6 @@ import ( "github.com/anchore/syft/syft/pkg/cataloger/generic" ) -const unknownGGUFData = "unknown" - // parseGGUFModel parses a GGUF model file and returns the discovered package. // This implementation only reads the header portion of the file, not the entire model. 
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { @@ -28,8 +30,7 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, // Read and validate the GGUF file header using LimitedReader to prevent OOM // We use LimitedReader to cap reads at maxHeaderSize (50MB) limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize} - headerReader := &ggufHeaderReader{reader: limitedReader} - headerData, err := headerReader.readHeader() + headerData, err := readHeader(limitedReader) if err != nil { return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err) } @@ -63,7 +64,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, // Convert to syft metadata structure syftMetadata := &pkg.GGUFFileHeader{ - ModelFormat: "gguf", ModelName: metadata.Name, ModelVersion: extractVersion(ggufFile.Header.MetadataKV), License: metadata.License, @@ -71,10 +71,9 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, Quantization: metadata.FileTypeDescriptor, Parameters: uint64(metadata.Parameters), GGUFVersion: uint32(ggufFile.Header.Version), - TensorCount: ggufFile.Header.TensorCount, - Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV), - TruncatedHeader: false, // We read the full header - Hash: "", // Will be computed in newGGUFPackage + TensorCount: ggufFile.Header.TensorCount, + Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV), + MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV), } // If model name is not in metadata, use filename @@ -82,11 +81,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, syftMetadata.ModelName = extractModelNameFromPath(reader.Path()) } - // If version is still unknown, try to infer from name - if syftMetadata.ModelVersion == unknownGGUFData { - syftMetadata.ModelVersion = 
extractVersionFromName(syftMetadata.ModelName) - } - // Create package from metadata p := newGGUFPackage( syftMetadata, @@ -96,6 +90,27 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file") } +// computeKVMetadataHash computes a stable hash of the KV metadata for use as a global identifier +func computeKVMetadataHash(metadata gguf_parser.GGUFMetadataKVs) string { + // Sort the KV pairs by key for stable hashing + sortedKVs := make([]gguf_parser.GGUFMetadataKV, len(metadata)) + copy(sortedKVs, metadata) + sort.Slice(sortedKVs, func(i, j int) bool { + return sortedKVs[i].Key < sortedKVs[j].Key + }) + + // Marshal sorted KVs to JSON for stable hashing + jsonBytes, err := json.Marshal(sortedKVs) + if err != nil { + log.Debugf("failed to marshal metadata for hashing: %v", err) + return "" + } + + // Compute xxhash + hash := xxhash.Sum64(jsonBytes) + return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits) +} + // extractVersion attempts to extract version from metadata KV pairs func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string { for _, kv := range kvs { @@ -105,14 +120,7 @@ func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string { } } } - return unknownGGUFData -} - -// extractVersionFromName tries to extract version from model name -func extractVersionFromName(_ string) string { - // Look for version patterns like "v1.0", "1.5b", "3.0", etc. 
- // For now, return unknown - this could be enhanced with regex - return unknownGGUFData + return "" } // extractModelNameFromPath extracts the model name from the file path diff --git a/syft/pkg/gguf.go b/syft/pkg/gguf.go index 271f6123d..5100d07a5 100644 --- a/syft/pkg/gguf.go +++ b/syft/pkg/gguf.go @@ -4,8 +4,8 @@ package pkg // GGUF is a binary file format used for storing model weights for the GGML library, designed for fast // loading and saving of models, particularly quantized large language models. type GGUFFileHeader struct { - // ModelFormat is always "gguf" - ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"` + // GGUFVersion is the GGUF format version (e.g., 3) + GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"` // ModelName is the name of the model (from general.name or filename) ModelName string `json:"modelName" cyclonedx:"modelName"` @@ -16,15 +16,9 @@ type GGUFFileHeader struct { // FileSize is the size of the GGUF file in bytes (best-effort if available from resolver) FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"` - // Hash is a content hash of the metadata (for stable global identifiers across remotes) - Hash string `json:"hash,omitempty" cyclonedx:"hash"` - // License is the license identifier (from general.license if present) License string `json:"license,omitempty" cyclonedx:"license"` - // GGUFVersion is the GGUF format version (e.g., 3) - GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"` - // Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama") Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"` @@ -42,6 +36,10 @@ type GGUFFileHeader struct { // (namespaced with general.*, llama.*, etc.) while avoiding duplication. 
Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"` - // TruncatedHeader indicates if the header was truncated during parsing (for very large headers) - TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"` + // MetadataHash is a 64-bit xxHash (XXH64) of all key-value pairs from the GGUF header metadata. + // This hash is computed over the complete header metadata (including the fields extracted + // into typed fields above) and provides a stable identifier for the model configuration + // across different file locations or remotes. It allows matching identical models even + // when stored in different repositories or with different filenames. + MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"` }