From b1c8478d55763c41fd9a39fac24d7706d83e94a8 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:57:07 -0500 Subject: [PATCH] chore: pr comments Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- internal/packagemetadata/names.go | 2 +- syft/pkg/cataloger/ai/cataloger_test.go | 143 +++++++++++++--- syft/pkg/cataloger/ai/package.go | 8 +- syft/pkg/cataloger/ai/package_test.go | 183 +++++++++++++-------- syft/pkg/cataloger/ai/parse_gguf_model.go | 22 +-- syft/pkg/cataloger/ai/test_builder_test.go | 41 ----- syft/pkg/cataloger/ai/test_helpers_test.go | 1 + syft/pkg/gguf.go | 11 +- 8 files changed, 254 insertions(+), 157 deletions(-) delete mode 100644 syft/pkg/cataloger/ai/test_builder_test.go diff --git a/internal/packagemetadata/names.go b/internal/packagemetadata/names.go index 8862571bf..d7a1bfcf9 100644 --- a/internal/packagemetadata/names.go +++ b/internal/packagemetadata/names.go @@ -124,7 +124,7 @@ var jsonTypes = makeJSONTypes( jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"), jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"), jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"), - jsonNames(pkg.GGUFFileHeader{}, "gguf-file-metadata"), + jsonNames(pkg.GGUFFileHeader{}, "gguf-file-header"), ) func expandLegacyNameVariants(names ...string) []string { diff --git a/syft/pkg/cataloger/ai/cataloger_test.go b/syft/pkg/cataloger/ai/cataloger_test.go index 26cfa16d2..5d9dda2ba 100644 --- a/syft/pkg/cataloger/ai/cataloger_test.go +++ b/syft/pkg/cataloger/ai/cataloger_test.go @@ -5,8 +5,6 @@ import ( "path/filepath" "testing" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest" @@ -37,7 +35,7 @@ func TestGGUFCataloger_Globs(t *testing.T) { } } -func TestGGUFCataloger_Integration(t *testing.T) { +func TestGGUFCataloger(t *testing.T) { tests := []struct { name string setup func(t *testing.T) string @@ -56,6 +54,7 @@ func TestGGUFCataloger_Integration(t *testing.T) { withStringKV("general.license", "Apache-2.0"). withStringKV("general.quantization", "Q4_K_M"). withUint64KV("general.parameter_count", 8030000000). + withStringKV("general.some_random_kv", "foobar"). build() path := filepath.Join(dir, "llama3-8b.gguf") @@ -71,14 +70,125 @@ func TestGGUFCataloger_Integration(t *testing.T) { pkg.NewLicenseFromFields("Apache-2.0", "", nil), ), Metadata: pkg.GGUFFileHeader{ - ModelName: "llama3-8b", - License: "Apache-2.0", - Architecture: "llama", - Quantization: "Unknown", - Parameters: 0, - GGUFVersion: 3, - TensorCount: 0, - Header: map[string]interface{}{}, + Architecture: "llama", + Quantization: "Unknown", + Parameters: 0, + GGUFVersion: 3, + TensorCount: 0, + MetadataKeyValuesHash: "6e3d368066455ce4", + Header: map[string]interface{}{ + "general.some_random_kv": "foobar", + }, + }, + }, + }, + expectedRelationships: nil, + }, + { + name: "catalog GGUF file with minimal metadata", + setup: func(t *testing.T) string { + dir := t.TempDir() + data := newTestGGUFBuilder(). + withVersion(3). + withStringKV("general.architecture", "gpt2"). + withStringKV("general.name", "gpt2-small"). + withStringKV("gpt2.context_length", "1024"). + withUint32KV("gpt2.embedding_length", 768). + build() + + path := filepath.Join(dir, "gpt2-small.gguf") + os.WriteFile(path, data, 0644) + return dir + }, + expectedPackages: []pkg.Package{ + { + Name: "gpt2-small", + Version: "", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet(), + Metadata: pkg.GGUFFileHeader{ + Architecture: "gpt2", + Quantization: "Unknown", + Parameters: 0, + GGUFVersion: 3, + TensorCount: 0, + MetadataKeyValuesHash: "9dc6f23591062a27", + Header: map[string]interface{}{ + "gpt2.context_length": "1024", + "gpt2.embedding_length": uint32(768), + }, + }, + }, + }, + expectedRelationships: nil, + }, + { + name: "catalog multiple GGUF files", + setup: func(t *testing.T) string { + dir := t.TempDir() + + // First model - Llama with custom training data + data1 := newTestGGUFBuilder(). + withVersion(3). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "model-1"). + withStringKV("general.version", "1.0"). + withStringKV("llama.attention.head_count", "32"). + withUint32KV("llama.attention.layer_norm_rms_epsilon", 999). + build() + os.WriteFile(filepath.Join(dir, "model-1.gguf"), data1, 0644) + + // Second model - GPT2 with different config + data2 := newTestGGUFBuilder(). + withVersion(3). + withStringKV("general.architecture", "gpt2"). + withStringKV("general.name", "model-2"). + withStringKV("general.version", "2.0"). + withStringKV("general.license", "MIT"). + withStringKV("gpt2.block_count", "12"). + withUint64KV("tokenizer.ggml.bos_token_id", 50256). + build() + os.WriteFile(filepath.Join(dir, "model-2.gguf"), data2, 0644) + + return dir + }, + expectedPackages: []pkg.Package{ + { + Name: "model-1", + Version: "1.0", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet(), + Metadata: pkg.GGUFFileHeader{ + Architecture: "llama", + Quantization: "Unknown", + Parameters: 0, + GGUFVersion: 3, + TensorCount: 0, + MetadataKeyValuesHash: "57e0dbea7d2efa6e", + Header: map[string]interface{}{ + "llama.attention.head_count": "32", + "llama.attention.layer_norm_rms_epsilon": uint32(999), + }, + }, + }, + { + Name: "model-2", + Version: "2.0", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet( + pkg.NewLicenseFromFields("MIT", "", nil), + ), + Metadata: pkg.GGUFFileHeader{ + Architecture: "gpt2", + Quantization: "Unknown", + Parameters: 0, + GGUFVersion: 3, + TensorCount: 0, + MetadataKeyValuesHash: "f85de1bf9be304bb", + Header: map[string]interface{}{ + "gpt2.block_count": "12", + "tokenizer.ggml.bos_token_id": uint64(50256), + }, }, }, }, @@ -91,17 +201,12 @@ func TestGGUFCataloger_Integration(t *testing.T) { fixtureDir := tt.setup(t) // Use pkgtest to catalog and compare - tester := pkgtest.NewCatalogTester(). + pkgtest.NewCatalogTester(). FromDirectory(t, fixtureDir). Expects(tt.expectedPackages, tt.expectedRelationships). IgnoreLocationLayer(). - IgnorePackageFields("FoundBy", "Locations"). // These are set by the cataloger - WithCompareOptions( - // Ignore MetadataHash as it's computed dynamically - cmpopts.IgnoreFields(pkg.GGUFFileHeader{}, "MetadataHash"), - ) - - tester.TestCataloger(t, NewGGUFCataloger()) + IgnorePackageFields("FoundBy", "Locations"). + TestCataloger(t, NewGGUFCataloger()) }) } } diff --git a/syft/pkg/cataloger/ai/package.go b/syft/pkg/cataloger/ai/package.go index b8c3b59d4..dfdede42d 100644 --- a/syft/pkg/cataloger/ai/package.go +++ b/syft/pkg/cataloger/ai/package.go @@ -5,9 +5,9 @@ import ( "github.com/anchore/syft/syft/pkg" ) -func newGGUFPackage(metadata *pkg.GGUFFileHeader, version string, locations ...file.Location) pkg.Package { +func newGGUFPackage(metadata *pkg.GGUFFileHeader, modelName, version, license string, locations ...file.Location) pkg.Package { p := pkg.Package{ - Name: metadata.ModelName, + Name: modelName, Version: version, Locations: file.NewLocationSet(locations...), Type: pkg.ModelPkg, @@ -18,8 +18,8 @@ func newGGUFPackage(metadata *pkg.GGUFFileHeader, version string, locations ...f } // Add license to the package if present in metadata - if metadata.License != "" { - p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil)) + if license != "" { + p.Licenses.Add(pkg.NewLicenseFromFields(license, "", nil)) } p.SetID() diff --git a/syft/pkg/cataloger/ai/package_test.go b/syft/pkg/cataloger/ai/package_test.go index ef6ff21f8..62d124e8b 100644 --- a/syft/pkg/cataloger/ai/package_test.go +++ b/syft/pkg/cataloger/ai/package_test.go @@ -3,121 +3,158 @@ package ai import ( "testing" - "github.com/google/go-cmp/cmp" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" + "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest" ) func TestNewGGUFPackage(t *testing.T) { tests := []struct { - name string - metadata *pkg.GGUFFileHeader - version string - locations []file.Location - checkFunc func(t *testing.T, p pkg.Package) + name string + metadata *pkg.GGUFFileHeader + input struct { + modelName string + version string + license string + locations []file.Location + } + expected pkg.Package }{ { - name: "complete GGUF package with all fields", - version: "3.0", + name: "complete GGUF package with all fields", + input: struct { + modelName string + version string + license string + locations []file.Location + }{ + modelName: "llama3-8b", + version: "3.0", + license: "Apache-2.0", + locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")}, + }, metadata: &pkg.GGUFFileHeader{ - ModelName: "llama3-8b-instruct", - License: "Apache-2.0", Architecture: "llama", Quantization: "Q4_K_M", Parameters: 8030000000, GGUFVersion: 3, TensorCount: 291, - Header: map[string]any{}, + Header: map[string]any{ + "general.random_kv": "foobar", + }, }, - locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")}, - checkFunc: func(t *testing.T, p pkg.Package) { - if d := cmp.Diff("llama3-8b-instruct", p.Name); d != "" { - t.Errorf("Name mismatch (-want +got):\n%s", d) - } - if d := cmp.Diff("3.0", p.Version); d != "" { - t.Errorf("Version mismatch (-want +got):\n%s", d) - } - if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" { - t.Errorf("Type mismatch (-want +got):\n%s", d) - } - assert.Empty(t, p.PURL, "PURL should not be set for model packages") - assert.Len(t, p.Licenses.ToSlice(), 1) - if d := cmp.Diff("Apache-2.0", p.Licenses.ToSlice()[0].Value); d != "" { - t.Errorf("License value mismatch (-want +got):\n%s", d) - } - assert.NotEmpty(t, p.ID()) + expected: pkg.Package{ + Name: "llama3-8b", + Version: "3.0", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet( + pkg.NewLicenseFromFields("Apache-2.0", "", nil), + ), + Metadata: pkg.GGUFFileHeader{ + Architecture: "llama", + Quantization: "Q4_K_M", + Parameters: 8030000000, + GGUFVersion: 3, + TensorCount: 291, + Header: map[string]any{ + "general.random_kv": "foobar", + }, + }, + Locations: file.NewLocationSet(file.NewLocation("/models/llama3-8b.gguf")), }, }, { - name: "minimal GGUF package", - version: "1.0", + name: "minimal GGUF package", + input: struct { + modelName string + version string + license string + locations []file.Location + }{ + modelName: "gpt2-small", + version: "1.0", + license: "MIT", + locations: []file.Location{file.NewLocation("/models/simple.gguf")}, + }, metadata: &pkg.GGUFFileHeader{ - ModelName: "simple-model", Architecture: "gpt2", GGUFVersion: 3, TensorCount: 50, }, - locations: []file.Location{file.NewLocation("/models/simple.gguf")}, - checkFunc: func(t *testing.T, p pkg.Package) { - if d := cmp.Diff("simple-model", p.Name); d != "" { - t.Errorf("Name mismatch (-want +got):\n%s", d) - } - if d := cmp.Diff("1.0", p.Version); d != "" { - t.Errorf("Version mismatch (-want +got):\n%s", d) - } - if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" { - t.Errorf("Type mismatch (-want +got):\n%s", d) - } - assert.Empty(t, p.PURL, "PURL should not be set for model packages") - assert.Empty(t, p.Licenses.ToSlice()) + expected: pkg.Package{ + Name: "gpt2-small", + Version: "1.0", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet( + pkg.NewLicenseFromFields("MIT", "", nil), + ), + Metadata: pkg.GGUFFileHeader{ + Architecture: "gpt2", + GGUFVersion: 3, + TensorCount: 50, + }, + Locations: file.NewLocationSet(file.NewLocation("/models/simple.gguf")), }, }, { - name: "GGUF package with multiple locations", - version: "1.5", + name: "GGUF package with multiple locations", + input: struct { + modelName string + version string + license string + locations []file.Location + }{ + modelName: "llama-multi", + version: "2.0", + license: "Apache-2.0", + locations: []file.Location{ + file.NewLocation("/models/model1.gguf"), + file.NewLocation("/models/model2.gguf"), + }, + }, metadata: &pkg.GGUFFileHeader{ - ModelName: "multi-location-model", Architecture: "llama", GGUFVersion: 3, TensorCount: 150, }, - locations: []file.Location{ - file.NewLocation("/models/model1.gguf"), - file.NewLocation("/models/model2.gguf"), - }, - checkFunc: func(t *testing.T, p pkg.Package) { - assert.Len(t, p.Locations.ToSlice(), 2) + expected: pkg.Package{ + Name: "llama-multi", + Version: "2.0", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet( + pkg.NewLicenseFromFields("Apache-2.0", "", nil), + ), + Metadata: pkg.GGUFFileHeader{ + Architecture: "llama", + GGUFVersion: 3, + TensorCount: 150, + }, + Locations: file.NewLocationSet( + file.NewLocation("/models/model1.gguf"), + file.NewLocation("/models/model2.gguf"), + ), }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - p := newGGUFPackage(tt.metadata, tt.version, tt.locations...) + actual := newGGUFPackage( + tt.metadata, + tt.input.modelName, + tt.input.version, + tt.input.license, + tt.input.locations..., + ) - if d := cmp.Diff(tt.metadata.ModelName, p.Name); d != "" { - t.Errorf("Name mismatch (-want +got):\n%s", d) - } - if d := cmp.Diff(tt.version, p.Version); d != "" { - t.Errorf("Version mismatch (-want +got):\n%s", d) - } - if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" { - t.Errorf("Type mismatch (-want +got):\n%s", d) - } - - // Verify metadata is attached - metadata, ok := p.Metadata.(pkg.GGUFFileHeader) + // Verify metadata type + _, ok := actual.Metadata.(pkg.GGUFFileHeader) require.True(t, ok, "metadata should be GGUFFileHeader") - if d := cmp.Diff(*tt.metadata, metadata); d != "" { - t.Errorf("Metadata mismatch (-want +got):\n%s", d) - } - if tt.checkFunc != nil { - tt.checkFunc(t, p) - } + // Use AssertPackagesEqual for comprehensive comparison + pkgtest.AssertPackagesEqual(t, tt.expected, actual) }) } } diff --git a/syft/pkg/cataloger/ai/parse_gguf_model.go b/syft/pkg/cataloger/ai/parse_gguf_model.go index e79fef834..d65537013 100644 --- a/syft/pkg/cataloger/ai/parse_gguf_model.go +++ b/syft/pkg/cataloger/ai/parse_gguf_model.go @@ -67,26 +67,26 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, // Convert to syft metadata structure syftMetadata := &pkg.GGUFFileHeader{ - ModelName: metadata.Name, - License: metadata.License, - Architecture: metadata.Architecture, - Quantization: metadata.FileTypeDescriptor, - Parameters: uint64(metadata.Parameters), - GGUFVersion: uint32(ggufFile.Header.Version), - TensorCount: ggufFile.Header.TensorCount, - Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV), - MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV), + Architecture: metadata.Architecture, + Quantization: metadata.FileTypeDescriptor, + Parameters: uint64(metadata.Parameters), + GGUFVersion: uint32(ggufFile.Header.Version), + TensorCount: ggufFile.Header.TensorCount, + Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV), + MetadataKeyValuesHash: computeKVMetadataHash(ggufFile.Header.MetadataKV), } // If model name is not in metadata, use filename - if syftMetadata.ModelName == "" { - syftMetadata.ModelName = extractModelNameFromPath(reader.Path()) + if metadata.Name == "" { + metadata.Name = extractModelNameFromPath(reader.Path()) } // Create package from metadata p := newGGUFPackage( syftMetadata, + metadata.Name, modelVersion, + metadata.License, reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) diff --git a/syft/pkg/cataloger/ai/test_builder_test.go b/syft/pkg/cataloger/ai/test_builder_test.go deleted file mode 100644 index 304f6acf3..000000000 --- a/syft/pkg/cataloger/ai/test_builder_test.go +++ /dev/null @@ -1,41 +0,0 @@ -package ai - -import ( - "fmt" - "os" - - gguf_parser "github.com/gpustack/gguf-parser-go" -) - -func main() { - // Create a test GGUF file - data := newTestGGUFBuilder(). - withVersion(3). - withStringKV("general.architecture", "llama"). - withStringKV("general.name", "test-model"). - build() - - // Write to temp file - tempFile, err := os.CreateTemp("", "test-*.gguf") - if err != nil { - panic(err) - } - defer os.Remove(tempFile.Name()) - - if _, err := tempFile.Write(data); err != nil { - panic(err) - } - tempFile.Close() - - fmt.Printf("Wrote %d bytes to %s\n", len(data), tempFile.Name()) - - // Try to parse it - fmt.Println("Attempting to parse...") - gf, err := gguf_parser.ParseGGUFFile(tempFile.Name(), gguf_parser.SkipLargeMetadata()) - if err != nil { - fmt.Printf("Parse error: %v\n", err) - return - } - - fmt.Printf("Success! Model: %s\n", gf.Metadata().Name) -} diff --git a/syft/pkg/cataloger/ai/test_helpers_test.go b/syft/pkg/cataloger/ai/test_helpers_test.go index 565643523..aeca0dc63 100644 --- a/syft/pkg/cataloger/ai/test_helpers_test.go +++ b/syft/pkg/cataloger/ai/test_helpers_test.go @@ -6,6 +6,7 @@ import ( ) // GGUF type constants for test builder +// https://github.com/ggml-org/ggml/blob/master/docs/gguf.md const ( ggufMagic = 0x46554747 // "GGUF" in little-endian ggufTypeUint8 = 0 diff --git a/syft/pkg/gguf.go b/syft/pkg/gguf.go index 7cb83d31c..aea01491d 100644 --- a/syft/pkg/gguf.go +++ b/syft/pkg/gguf.go @@ -3,19 +3,14 @@ package pkg // GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file. // GGUF is a binary file format used for storing model weights for the GGML library, designed for fast // loading and saving of models, particularly quantized large language models. +// The Model Name, License, and Version fields have all been lifted up to be on the syft Package. type GGUFFileHeader struct { // GGUFVersion is the GGUF format version (e.g., 3) GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"` - // ModelName is the name of the model (from general.name or filename) - ModelName string `json:"modelName" cyclonedx:"modelName"` - // FileSize is the size of the GGUF file in bytes (best-effort if available from resolver) FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"` - // License is the license identifier (from general.license if present) - License string `json:"license,omitempty" cyclonedx:"license"` - // Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama") Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"` @@ -33,10 +28,10 @@ type GGUFFileHeader struct { // (namespaced with general.*, llama.*, etc.) while avoiding duplication. Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"` - // MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata. + // MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata. // This hash is computed over the complete header metadata (including the fields extracted // into typed fields above) and provides a stable identifier for the model configuration // across different file locations or remotes. It allows matching identical models even // when stored in different repositories or with different filenames. - MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"` + MetadataKeyValuesHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"` }