From 1ad4a2752aa89a702242819132cc45bd277c708e Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Tue, 14 Oct 2025 04:41:20 -0400 Subject: [PATCH] test: migrate gguf tests over Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- .../cataloger/aiartifact/cataloger_test.go | 385 ++++++ syft/pkg/cataloger/aiartifact/package.go | 44 +- syft/pkg/cataloger/aiartifact/package_test.go | 185 +++ .../cataloger/aiartifact/parse_gguf_test.go | 1164 +++++++++++++++++ .../cataloger/aiartifact/test_helpers_test.go | 109 ++ 5 files changed, 1845 insertions(+), 42 deletions(-) create mode 100644 syft/pkg/cataloger/aiartifact/cataloger_test.go create mode 100644 syft/pkg/cataloger/aiartifact/package_test.go create mode 100644 syft/pkg/cataloger/aiartifact/parse_gguf_test.go create mode 100644 syft/pkg/cataloger/aiartifact/test_helpers_test.go diff --git a/syft/pkg/cataloger/aiartifact/cataloger_test.go b/syft/pkg/cataloger/aiartifact/cataloger_test.go new file mode 100644 index 000000000..23dc3dcfd --- /dev/null +++ b/syft/pkg/cataloger/aiartifact/cataloger_test.go @@ -0,0 +1,385 @@ +package aiartifact + +import ( + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/anchore/syft/syft/artifact" + "github.com/anchore/syft/syft/pkg" + "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest" +) + +func TestGGUFCataloger_Globs(t *testing.T) { + tests := []struct { + name string + setup func(t *testing.T) string // returns fixture directory + expected []string + }{ + { + name: "finds GGUF files in root", + setup: func(t *testing.T) string { + dir := t.TempDir() + createTestGGUFInDir(t, dir, "model1.gguf") + createTestGGUFInDir(t, dir, "model2.gguf") + return dir + }, + expected: []string{ + "model1.gguf", + "model2.gguf", + }, + }, + { + name: "finds GGUF files in subdirectories", + 
setup: func(t *testing.T) string { + dir := t.TempDir() + modelsDir := filepath.Join(dir, "models") + os.MkdirAll(modelsDir, 0755) + createTestGGUFInDir(t, modelsDir, "llama.gguf") + + deepDir := filepath.Join(dir, "deep", "nested", "path") + os.MkdirAll(deepDir, 0755) + createTestGGUFInDir(t, deepDir, "mistral.gguf") + + return dir + }, + expected: []string{ + "models/llama.gguf", + "deep/nested/path/mistral.gguf", + }, + }, + { + name: "ignores non-GGUF files", + setup: func(t *testing.T) string { + dir := t.TempDir() + createTestGGUFInDir(t, dir, "model.gguf") + + // Create non-GGUF files + os.WriteFile(filepath.Join(dir, "readme.txt"), []byte("readme"), 0644) + os.WriteFile(filepath.Join(dir, "model.bin"), []byte("binary"), 0644) + os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644) + + return dir + }, + expected: []string{ + "model.gguf", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fixtureDir := tt.setup(t) + + tester := pkgtest.NewCatalogTester(). + FromDirectory(t, fixtureDir) + + if len(tt.expected) > 0 { + tester.ExpectsResolverContentQueries(tt.expected) + } + + tester.TestCataloger(t, NewGGUFCataloger()) + }) + } +} + +func TestGGUFCataloger_Integration(t *testing.T) { + tests := []struct { + name string + setup func(t *testing.T) string + expectedPackages []pkg.Package + expectedRelationships []artifact.Relationship + }{ + { + name: "catalog single GGUF file", + setup: func(t *testing.T) string { + dir := t.TempDir() + data := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(291). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "llama3-8b"). + withStringKV("general.version", "3.0"). + withStringKV("general.license", "Apache-2.0"). + withStringKV("general.quantization", "Q4_K_M"). + withUint64KV("general.parameter_count", 8030000000). 
+ build() + + path := filepath.Join(dir, "llama3-8b.gguf") + os.WriteFile(path, data, 0644) + return dir + }, + expectedPackages: []pkg.Package{ + { + Name: "llama3-8b", + Version: "3.0", + Type: pkg.ModelPkg, + Licenses: pkg.NewLicenseSet( + pkg.NewLicenseFromFields("Apache-2.0", "", nil), + ), + Metadata: pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "llama3-8b", + ModelVersion: "3.0", + License: "Apache-2.0", + Architecture: "llama", + Quantization: "Q4_K_M", + Parameters: 8030000000, + GGUFVersion: 3, + TensorCount: 291, + Header: map[string]interface{}{}, + TruncatedHeader: false, + }, + }, + }, + expectedRelationships: nil, + }, + { + name: "catalog multiple GGUF files", + setup: func(t *testing.T) string { + dir := t.TempDir() + + // Create first model + data1 := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(100). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "model1"). + withStringKV("general.version", "1.0"). + build() + os.WriteFile(filepath.Join(dir, "model1.gguf"), data1, 0644) + + // Create second model + data2 := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(200). + withStringKV("general.architecture", "mistral"). + withStringKV("general.name", "model2"). + withStringKV("general.version", "2.0"). 
+ build() + os.WriteFile(filepath.Join(dir, "model2.gguf"), data2, 0644) + + return dir + }, + expectedPackages: []pkg.Package{ + { + Name: "model1", + Version: "1.0", + Type: pkg.ModelPkg, + Metadata: pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "model1", + ModelVersion: "1.0", + Architecture: "llama", + Quantization: unkownGGUFData, + GGUFVersion: 3, + TensorCount: 100, + Header: map[string]interface{}{}, + TruncatedHeader: false, + }, + }, + { + Name: "model2", + Version: "2.0", + Type: pkg.ModelPkg, + Metadata: pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "model2", + ModelVersion: "2.0", + Architecture: "mistral", + Quantization: unkownGGUFData, + GGUFVersion: 3, + TensorCount: 200, + Header: map[string]interface{}{}, + TruncatedHeader: false, + }, + }, + }, + expectedRelationships: nil, + }, + { + name: "catalog GGUF in nested directories", + setup: func(t *testing.T) string { + dir := t.TempDir() + nestedDir := filepath.Join(dir, "models", "quantized") + os.MkdirAll(nestedDir, 0755) + + data := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(150). + withStringKV("general.architecture", "qwen"). + withStringKV("general.name", "qwen-nested"). + build() + + os.WriteFile(filepath.Join(nestedDir, "qwen.gguf"), data, 0644) + return dir + }, + expectedPackages: []pkg.Package{ + { + Name: "qwen-nested", + Version: unkownGGUFData, + Type: pkg.ModelPkg, + Metadata: pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "qwen-nested", + ModelVersion: unkownGGUFData, + Architecture: "qwen", + Quantization: unkownGGUFData, + GGUFVersion: 3, + TensorCount: 150, + Header: map[string]interface{}{}, + TruncatedHeader: false, + }, + }, + }, + expectedRelationships: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fixtureDir := tt.setup(t) + + // Use pkgtest to catalog and compare + tester := pkgtest.NewCatalogTester(). + FromDirectory(t, fixtureDir). 
+ Expects(tt.expectedPackages, tt.expectedRelationships). + IgnoreLocationLayer(). + IgnorePackageFields("FoundBy", "Locations"). // These are set by the cataloger + WithCompareOptions( + // Ignore Hash as it's computed dynamically + cmpopts.IgnoreFields(pkg.GGUFFileMetadata{}, "Hash"), + ) + + tester.TestCataloger(t, NewGGUFCataloger()) + }) + } +} + +func TestGGUFCataloger_SkipsInvalidFiles(t *testing.T) { + dir := t.TempDir() + + // Create a valid GGUF + validData := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(100). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "valid-model"). + build() + os.WriteFile(filepath.Join(dir, "valid.gguf"), validData, 0644) + + // Create an invalid GGUF (wrong magic) + invalidData := newTestGGUFBuilder().buildInvalidMagic() + os.WriteFile(filepath.Join(dir, "invalid.gguf"), invalidData, 0644) + + // Create a truncated GGUF + os.WriteFile(filepath.Join(dir, "truncated.gguf"), []byte{0x47}, 0644) + + // Catalog should succeed and only return the valid package + tester := pkgtest.NewCatalogTester(). + FromDirectory(t, dir). + ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { + // Should only find the valid model + require.Len(t, pkgs, 1) + assert.Equal(t, "valid-model", pkgs[0].Name) + }) + + tester.TestCataloger(t, NewGGUFCataloger()) +} + +func TestGGUFCataloger_Name(t *testing.T) { + cataloger := NewGGUFCataloger() + assert.Equal(t, "gguf-cataloger", cataloger.Name()) +} + +func TestGGUFCataloger_EmptyDirectory(t *testing.T) { + dir := t.TempDir() + // Create a subdirectory to ensure glob still runs + os.MkdirAll(filepath.Join(dir, "models"), 0755) + + tester := pkgtest.NewCatalogTester(). + FromDirectory(t, dir). 
+ ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, rels []artifact.Relationship) { + assert.Empty(t, pkgs) + assert.Empty(t, rels) + }) + + tester.TestCataloger(t, NewGGUFCataloger()) +} + +func TestGGUFCataloger_MixedFiles(t *testing.T) { + dir := t.TempDir() + + // Create GGUF file + ggufData := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(100). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "test-model"). + build() + os.WriteFile(filepath.Join(dir, "model.gguf"), ggufData, 0644) + + // Create other file types + os.WriteFile(filepath.Join(dir, "README.md"), []byte("# Models"), 0644) + os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644) + os.WriteFile(filepath.Join(dir, "weights.bin"), []byte("weights"), 0644) + os.MkdirAll(filepath.Join(dir, "subdir"), 0755) + + tester := pkgtest.NewCatalogTester(). + FromDirectory(t, dir). + ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { + // Should only find the GGUF model + require.Len(t, pkgs, 1) + assert.Equal(t, "test-model", pkgs[0].Name) + assert.Equal(t, pkg.ModelPkg, pkgs[0].Type) + }) + + tester.TestCataloger(t, NewGGUFCataloger()) +} + +func TestGGUFCataloger_CaseInsensitiveGlob(t *testing.T) { + // Test that the glob pattern is case-sensitive (as expected for **/*.gguf) + dir := t.TempDir() + + // Create lowercase .gguf + data := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(100). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "lowercase"). + build() + os.WriteFile(filepath.Join(dir, "model.gguf"), data, 0644) + + // Create uppercase .GGUF (should not match the glob) + os.WriteFile(filepath.Join(dir, "MODEL.GGUF"), data, 0644) + + tester := pkgtest.NewCatalogTester(). + FromDirectory(t, dir). 
+ ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) { + // Depending on filesystem case-sensitivity, we may get 1 or 2 packages + // On case-insensitive filesystems (macOS), both might match + // On case-sensitive filesystems (Linux), only lowercase matches + assert.GreaterOrEqual(t, len(pkgs), 1, "should find at least the lowercase file") + }) + + tester.TestCataloger(t, NewGGUFCataloger()) +} + +// createTestGGUFInDir creates a minimal test GGUF file in the specified directory +func createTestGGUFInDir(t *testing.T, dir, filename string) { + t.Helper() + data := newTestGGUFBuilder(). + withVersion(3). + withTensorCount(100). + withStringKV("general.architecture", "llama"). + withStringKV("general.name", "test-model"). + build() + + path := filepath.Join(dir, filename) + err := os.WriteFile(path, data, 0644) + require.NoError(t, err) +} diff --git a/syft/pkg/cataloger/aiartifact/package.go b/syft/pkg/cataloger/aiartifact/package.go index 76d6b9a9b..334365cc1 100644 --- a/syft/pkg/cataloger/aiartifact/package.go +++ b/syft/pkg/cataloger/aiartifact/package.go @@ -1,9 +1,6 @@ package aiartifact import ( - "fmt" - - "github.com/anchore/packageurl-go" "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" ) @@ -12,11 +9,12 @@ func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location) p := pkg.Package{ Name: metadata.ModelName, Version: metadata.ModelVersion, - PURL: packageURL(metadata), Locations: file.NewLocationSet(locations...), Type: pkg.ModelPkg, Licenses: pkg.NewLicenseSet(), Metadata: *metadata, + // NOTE: PURL is intentionally not set as the package-url spec + // has not yet finalized support for ML model packages } // Add license to the package if present in metadata @@ -28,41 +26,3 @@ func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location) return p } - -// packageURL returns the PURL for the specific GGUF model package (see https://github.com/package-url/purl-spec) 
-func packageURL(metadata *pkg.GGUFFileMetadata) string { - var qualifiers packageurl.Qualifiers - - // Add model-specific qualifiers - if metadata.Architecture != "" { - qualifiers = append(qualifiers, packageurl.Qualifier{ - Key: "arch", - Value: metadata.Architecture, - }) - } - - if metadata.Quantization != "" && metadata.Quantization != "unknown" { - qualifiers = append(qualifiers, packageurl.Qualifier{ - Key: "quantization", - Value: metadata.Quantization, - }) - } - - if metadata.Parameters > 0 { - qualifiers = append(qualifiers, packageurl.Qualifier{ - Key: "parameters", - Value: fmt.Sprintf("%d", metadata.Parameters), - }) - } - - // Use mlmodel as the type for machine learning models in GGUF format - // This follows the PURL spec guidance for ML models - return packageurl.NewPackageURL( - "mlmodel", - "gguf", - metadata.ModelName, - metadata.ModelVersion, - qualifiers, - "", - ).ToString() -} diff --git a/syft/pkg/cataloger/aiartifact/package_test.go b/syft/pkg/cataloger/aiartifact/package_test.go new file mode 100644 index 000000000..72f157590 --- /dev/null +++ b/syft/pkg/cataloger/aiartifact/package_test.go @@ -0,0 +1,185 @@ +package aiartifact + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/pkg" +) + +func TestNewGGUFPackage(t *testing.T) { + tests := []struct { + name string + metadata *pkg.GGUFFileMetadata + locations []file.Location + checkFunc func(t *testing.T, p pkg.Package) + }{ + { + name: "complete GGUF package with all fields", + metadata: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "llama3-8b-instruct", + ModelVersion: "3.0", + License: "Apache-2.0", + Architecture: "llama", + Quantization: "Q4_K_M", + Parameters: 8030000000, + GGUFVersion: 3, + TensorCount: 291, + Header: map[string]interface{}{}, + TruncatedHeader: false, + }, + locations: 
[]file.Location{file.NewLocation("/models/llama3-8b.gguf")}, + checkFunc: func(t *testing.T, p pkg.Package) { + assert.Equal(t, "llama3-8b-instruct", p.Name) + assert.Equal(t, "3.0", p.Version) + assert.Equal(t, pkg.ModelPkg, p.Type) + assert.Empty(t, p.PURL, "PURL should not be set for model packages") + assert.Len(t, p.Licenses.ToSlice(), 1) + assert.Equal(t, "Apache-2.0", p.Licenses.ToSlice()[0].Value) + assert.NotEmpty(t, p.ID()) + }, + }, + { + name: "minimal GGUF package", + metadata: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "simple-model", + ModelVersion: "1.0", + Architecture: "gpt2", + GGUFVersion: 3, + TensorCount: 50, + }, + locations: []file.Location{file.NewLocation("/models/simple.gguf")}, + checkFunc: func(t *testing.T, p pkg.Package) { + assert.Equal(t, "simple-model", p.Name) + assert.Equal(t, "1.0", p.Version) + assert.Equal(t, pkg.ModelPkg, p.Type) + assert.Empty(t, p.PURL, "PURL should not be set for model packages") + assert.Empty(t, p.Licenses.ToSlice()) + }, + }, + { + name: "GGUF package with multiple locations", + metadata: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "multi-location-model", + ModelVersion: "1.5", + Architecture: "llama", + GGUFVersion: 3, + TensorCount: 150, + }, + locations: []file.Location{ + file.NewLocation("/models/model1.gguf"), + file.NewLocation("/models/model2.gguf"), + }, + checkFunc: func(t *testing.T, p pkg.Package) { + assert.Len(t, p.Locations.ToSlice(), 2) + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := newGGUFPackage(tt.metadata, tt.locations...) 
+ + assert.Equal(t, tt.metadata.ModelName, p.Name) + assert.Equal(t, tt.metadata.ModelVersion, p.Version) + assert.Equal(t, pkg.ModelPkg, p.Type) + + // Verify metadata is attached + metadata, ok := p.Metadata.(pkg.GGUFFileMetadata) + require.True(t, ok, "metadata should be GGUFFileMetadata") + assert.Equal(t, *tt.metadata, metadata) + + if tt.checkFunc != nil { + tt.checkFunc(t, p) + } + }) + } +} + +func TestNewGGUFPackage_IDUniqueness(t *testing.T) { + // Test that different packages get different IDs + metadata1 := &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "model-1", + ModelVersion: "1.0", + Architecture: "llama", + GGUFVersion: 3, + TensorCount: 100, + } + + metadata2 := &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "model-2", + ModelVersion: "1.0", + Architecture: "llama", + GGUFVersion: 3, + TensorCount: 100, + } + + loc := file.NewLocation("/models/test.gguf") + p1 := newGGUFPackage(metadata1, loc) + p2 := newGGUFPackage(metadata2, loc) + + assert.NotEqual(t, p1.ID(), p2.ID(), "different packages should have different IDs") +} + +func TestNewGGUFPackage_IDConsistency(t *testing.T) { + // Test that same metadata produces same ID + metadata := &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "test-model", + ModelVersion: "1.0", + Architecture: "llama", + GGUFVersion: 3, + TensorCount: 100, + } + + loc := file.NewLocation("/models/test.gguf") + p1 := newGGUFPackage(metadata, loc) + p2 := newGGUFPackage(metadata, loc) + + assert.Equal(t, p1.ID(), p2.ID(), "identical packages should have identical IDs") +} + +func TestNewGGUFPackage_MetadataPreservation(t *testing.T) { + // Ensure all metadata fields are preserved in the package + metadata := &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "preservation-test", + ModelVersion: "2.0", + License: "MIT", + Architecture: "llama", + Quantization: "Q4_K_M", + Parameters: 7000000000, + GGUFVersion: 3, + TensorCount: 219, + Hash: "abc123", + Header: 
map[string]interface{}{"custom.field": "value"}, + TruncatedHeader: false, + } + + loc := file.NewLocation("/models/test.gguf") + p := newGGUFPackage(metadata, loc) + + extractedMetadata, ok := p.Metadata.(pkg.GGUFFileMetadata) + require.True(t, ok) + + assert.Equal(t, metadata.ModelFormat, extractedMetadata.ModelFormat) + assert.Equal(t, metadata.ModelName, extractedMetadata.ModelName) + assert.Equal(t, metadata.ModelVersion, extractedMetadata.ModelVersion) + assert.Equal(t, metadata.License, extractedMetadata.License) + assert.Equal(t, metadata.Architecture, extractedMetadata.Architecture) + assert.Equal(t, metadata.Quantization, extractedMetadata.Quantization) + assert.Equal(t, metadata.Parameters, extractedMetadata.Parameters) + assert.Equal(t, metadata.GGUFVersion, extractedMetadata.GGUFVersion) + assert.Equal(t, metadata.TensorCount, extractedMetadata.TensorCount) + assert.Equal(t, metadata.Hash, extractedMetadata.Hash) + assert.Equal(t, metadata.TruncatedHeader, extractedMetadata.TruncatedHeader) + assert.Equal(t, metadata.Header, extractedMetadata.Header) +} diff --git a/syft/pkg/cataloger/aiartifact/parse_gguf_test.go b/syft/pkg/cataloger/aiartifact/parse_gguf_test.go new file mode 100644 index 000000000..2a7779618 --- /dev/null +++ b/syft/pkg/cataloger/aiartifact/parse_gguf_test.go @@ -0,0 +1,1164 @@ +package aiartifact + +import ( + "bytes" + "context" + "encoding/binary" + "io" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/anchore/syft/syft/artifact" + "github.com/anchore/syft/syft/file" + "github.com/anchore/syft/syft/pkg" +) + +func TestParseGGUFHeader(t *testing.T) { + tests := []struct { + name string + buildData func() []byte + wantMeta *pkg.GGUFFileMetadata + wantErr bool + }{ + { + name: "standard GGUF with all fields", + buildData: func() []byte { + return newTestGGUFBuilder(). + withVersion(3). + withTensorCount(291). 
+ withStringKV("general.architecture", "llama"). + withStringKV("general.name", "llama3-8b-instruct"). + withStringKV("general.version", "3.0"). + withStringKV("general.license", "Apache-2.0"). + withStringKV("general.quantization", "Q4_K_M"). + withUint64KV("general.parameter_count", 8030000000). + build() + }, + wantMeta: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "llama3-8b-instruct", + ModelVersion: "3.0", + License: "Apache-2.0", + Architecture: "llama", + Quantization: "Q4_K_M", + Parameters: 8030000000, + GGUFVersion: 3, + TensorCount: 291, + Header: map[string]any{}, + TruncatedHeader: false, + }, + }, + { + name: "minimal GGUF with only architecture", + buildData: func() []byte { + return newTestGGUFBuilder(). + withVersion(3). + withTensorCount(100). + withStringKV("general.architecture", "qwen"). + withStringKV("general.name", "qwen2-1.5b"). + build() + }, + wantMeta: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "qwen2-1.5b", + ModelVersion: unkownGGUFData, + Architecture: "qwen", + Quantization: unkownGGUFData, + GGUFVersion: 3, + TensorCount: 100, + Header: map[string]any{}, + TruncatedHeader: false, + }, + }, + { + name: "GGUF v2 (older version)", + buildData: func() []byte { + return newTestGGUFBuilder(). + withVersion(2). + withTensorCount(50). + withStringKV("general.architecture", "gpt2"). + withStringKV("general.name", "gpt2-small"). + build() + }, + wantMeta: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "gpt2-small", + ModelVersion: unkownGGUFData, + Architecture: "gpt2", + Quantization: unkownGGUFData, + GGUFVersion: 2, + TensorCount: 50, + Header: map[string]any{}, + TruncatedHeader: false, + }, + }, + { + name: "GGUF without general.name falls back to location", + buildData: func() []byte { + return newTestGGUFBuilder(). + withVersion(3). + withTensorCount(150). + withStringKV("general.architecture", "llama"). + withStringKV("general.license", "MIT"). 
+ build() + }, + wantMeta: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "test-model", // will be extracted from location + ModelVersion: unkownGGUFData, + Architecture: "llama", + License: "MIT", + Quantization: unkownGGUFData, + GGUFVersion: 3, + TensorCount: 150, + Header: map[string]any{}, + TruncatedHeader: false, + }, + }, + { + name: "GGUF with extra metadata fields in header", + buildData: func() []byte { + return newTestGGUFBuilder(). + withVersion(3). + withTensorCount(200). + withStringKV("general.architecture", "mistral"). + withStringKV("general.name", "mistral-7b"). + withStringKV("llama.attention.head_count", "32"). + withStringKV("llama.embedding_length", "4096"). + build() + }, + wantMeta: &pkg.GGUFFileMetadata{ + ModelFormat: "gguf", + ModelName: "mistral-7b", + ModelVersion: unkownGGUFData, + Architecture: "mistral", + Quantization: unkownGGUFData, + GGUFVersion: 3, + TensorCount: 200, + Header: map[string]any{ + "llama.attention.head_count": "32", + "llama.embedding_length": "4096", + }, + TruncatedHeader: false, + }, + }, + { + name: "invalid magic number", + buildData: func() []byte { + return newTestGGUFBuilder().buildInvalidMagic() + }, + wantErr: true, + }, + { + name: "truncated file (too small)", + buildData: func() []byte { + return []byte{0x47, 0x47} + }, + wantErr: true, + }, + { + name: "empty file", + buildData: func() []byte { + return []byte{} + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data := tt.buildData() + got, err := parseGGUFHeader(data, "/path/to/test-model.gguf") + + if tt.wantErr { + require.Error(t, err) + return + } + + require.NoError(t, err) + require.NotNil(t, got) + + // Don't compare Hash as it's computed + assert.Equal(t, tt.wantMeta.ModelFormat, got.ModelFormat) + assert.Equal(t, tt.wantMeta.ModelVersion, got.ModelVersion) + assert.Equal(t, tt.wantMeta.Architecture, got.Architecture) + assert.Equal(t, tt.wantMeta.Quantization, 
got.Quantization) + assert.Equal(t, tt.wantMeta.GGUFVersion, got.GGUFVersion) + assert.Equal(t, tt.wantMeta.TensorCount, got.TensorCount) + assert.Equal(t, tt.wantMeta.Parameters, got.Parameters) + assert.Equal(t, tt.wantMeta.License, got.License) + assert.Equal(t, tt.wantMeta.TruncatedHeader, got.TruncatedHeader) + + // For the case without general.name, check that filename was used + if tt.name == "GGUF without general.name falls back to location" { + assert.Equal(t, "test-model", got.ModelName) + } else if tt.wantMeta.ModelName != "" { + assert.Equal(t, tt.wantMeta.ModelName, got.ModelName) + } + + // Check Header map + for k, v := range tt.wantMeta.Header { + assert.Equal(t, v, got.Header[k], "Header key %s mismatch", k) + } + + // Hash should be computed + if !tt.wantErr { + assert.NotEmpty(t, got.Hash) + } + }) + } +} + +func TestReadValue(t *testing.T) { + tests := []struct { + name string + valueType uint32 + buildData func() []byte + want interface{} + wantErr bool + }{ + { + name: "uint8", + valueType: ggufTypeUint8, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint8(42)) + return buf.Bytes() + }, + want: uint8(42), + }, + { + name: "int8", + valueType: ggufTypeInt8, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, int8(-42)) + return buf.Bytes() + }, + want: int8(-42), + }, + { + name: "uint32", + valueType: ggufTypeUint32, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint32(12345)) + return buf.Bytes() + }, + want: uint32(12345), + }, + { + name: "uint64", + valueType: ggufTypeUint64, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint64(9876543210)) + return buf.Bytes() + }, + want: uint64(9876543210), + }, + { + name: "float32", + valueType: ggufTypeFloat32, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, 
float32(3.14159)) + return buf.Bytes() + }, + want: float32(3.14159), + }, + { + name: "bool true", + valueType: ggufTypeBool, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint8(1)) + return buf.Bytes() + }, + want: true, + }, + { + name: "bool false", + valueType: ggufTypeBool, + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint8(0)) + return buf.Bytes() + }, + want: false, + }, + { + name: "string", + valueType: ggufTypeString, + buildData: func() []byte { + buf := new(bytes.Buffer) + s := "hello world" + binary.Write(buf, binary.LittleEndian, uint64(len(s))) + buf.WriteString(s) + return buf.Bytes() + }, + want: "hello world", + }, + { + name: "unknown type", + valueType: 99, + buildData: func() []byte { + return []byte{} + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data := tt.buildData() + reader := bytes.NewReader(data) + + got, err := readValue(reader, tt.valueType) + + if tt.wantErr { + require.Error(t, err) + return + } + + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestReadString(t *testing.T) { + tests := []struct { + name string + buildData func() []byte + want string + wantErr bool + }{ + { + name: "normal string", + buildData: func() []byte { + buf := new(bytes.Buffer) + s := "test string" + binary.Write(buf, binary.LittleEndian, uint64(len(s))) + buf.WriteString(s) + return buf.Bytes() + }, + want: "test string", + }, + { + name: "empty string", + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint64(0)) + return buf.Bytes() + }, + want: "", + }, + { + name: "string exceeds max length", + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint64(maxKeyLen+1)) + return buf.Bytes() + }, + wantErr: true, + }, + { + name: "truncated string data", + buildData: func() []byte { + 
buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint64(100)) + buf.WriteString("short") + return buf.Bytes() + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data := tt.buildData() + reader := bytes.NewReader(data) + + got, err := readString(reader) + + if tt.wantErr { + require.Error(t, err) + return + } + + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestReadArray(t *testing.T) { + tests := []struct { + name string + buildData func() []byte + want interface{} + wantErr bool + }{ + { + name: "array of uint32", + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint32(ggufTypeUint32)) // element type + binary.Write(buf, binary.LittleEndian, uint64(3)) // array length + binary.Write(buf, binary.LittleEndian, uint32(1)) + binary.Write(buf, binary.LittleEndian, uint32(2)) + binary.Write(buf, binary.LittleEndian, uint32(3)) + return buf.Bytes() + }, + want: []interface{}{uint32(1), uint32(2), uint32(3)}, + }, + { + name: "empty array", + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint32(ggufTypeUint32)) + binary.Write(buf, binary.LittleEndian, uint64(0)) + return buf.Bytes() + }, + want: ([]interface{})(nil), // Empty array returns nil slice + }, + { + name: "array too large", + buildData: func() []byte { + buf := new(bytes.Buffer) + binary.Write(buf, binary.LittleEndian, uint32(ggufTypeUint32)) + binary.Write(buf, binary.LittleEndian, uint64(10000)) + return buf.Bytes() + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + data := tt.buildData() + reader := bytes.NewReader(data) + + got, err := readArray(reader) + + if tt.wantErr { + require.Error(t, err) + return + } + + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestReadKVPair(t *testing.T) { + tests := []struct { + name string + buildData 
func() []byte
+		wantKey   string
+		wantValue interface{}
+		wantErr   bool
+	}{
+		{
+			name: "string key-value pair",
+			buildData: func() []byte {
+				buf := new(bytes.Buffer)
+				// Write key
+				key := "general.name"
+				binary.Write(buf, binary.LittleEndian, uint64(len(key)))
+				buf.WriteString(key)
+				// Write value type
+				binary.Write(buf, binary.LittleEndian, uint32(ggufTypeString))
+				// Write value
+				value := "test-model"
+				binary.Write(buf, binary.LittleEndian, uint64(len(value)))
+				buf.WriteString(value)
+				return buf.Bytes()
+			},
+			wantKey:   "general.name",
+			wantValue: "test-model",
+		},
+		{
+			name: "uint64 key-value pair",
+			buildData: func() []byte {
+				buf := new(bytes.Buffer)
+				// Write key
+				key := "general.parameter_count"
+				binary.Write(buf, binary.LittleEndian, uint64(len(key)))
+				buf.WriteString(key)
+				// Write value type
+				binary.Write(buf, binary.LittleEndian, uint32(ggufTypeUint64))
+				// Write value
+				binary.Write(buf, binary.LittleEndian, uint64(7000000000))
+				return buf.Bytes()
+			},
+			wantKey:   "general.parameter_count",
+			wantValue: uint64(7000000000),
+		},
+		{
+			name: "key too long",
+			buildData: func() []byte {
+				buf := new(bytes.Buffer)
+				binary.Write(buf, binary.LittleEndian, uint64(maxKeyLen+1))
+				return buf.Bytes()
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			data := tt.buildData()
+			reader := bytes.NewReader(data)
+
+			gotKey, gotValue, err := readKVPair(reader)
+
+			if tt.wantErr {
+				require.Error(t, err)
+				return
+			}
+
+			require.NoError(t, err)
+			assert.Equal(t, tt.wantKey, gotKey)
+			assert.Equal(t, tt.wantValue, gotValue)
+		})
+	}
+}
+
+func TestInferQuantizationFromFilename(t *testing.T) {
+	tests := []struct {
+		name     string
+		filename string
+		want     string
+	}{
+		{
+			name:     "Q4_K_M quantization",
+			filename: "/path/to/model-Q4_K_M.gguf",
+			want:     "Q4_K_M",
+		},
+		{
+			name:     "IQ4_NL quantization",
+			filename: "/path/to/model-IQ4_NL.gguf",
+			want:     "Q4_NL", // The regex [IQ]\d+_[A-Z_]+ matches Q4_NL from IQ4_NL
+		},
+		{
+			name:     "Q5_K_S quantization",
+			filename: "mistral-7b-Q5_K_S.gguf",
+			want:     "Q5_K_S",
+		},
+		{
+			name:     "no quantization in filename",
+			filename: "/path/to/model.gguf",
+			want:     unkownGGUFData,
+		},
+		{
+			name:     "partial match should not match",
+			filename: "/path/to/Q4-model.gguf",
+			want:     unkownGGUFData,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := inferQuantizationFromFilename(tt.filename)
+			assert.Equal(t, tt.want, got)
+		})
+	}
+}
+
+func TestComputeMetadataHash(t *testing.T) {
+	tests := []struct {
+		name     string
+		metadata *pkg.GGUFFileMetadata
+		wantLen  int
+	}{
+		{
+			name: "hash should be consistent",
+			metadata: &pkg.GGUFFileMetadata{
+				ModelFormat:  "gguf",
+				ModelName:    "test-model",
+				ModelVersion: "1.0",
+				Architecture: "llama",
+				GGUFVersion:  3,
+				TensorCount:  100,
+			},
+			wantLen: 16, // 8 bytes = 16 hex chars
+		},
+		{
+			name: "different metadata produces different hash",
+			metadata: &pkg.GGUFFileMetadata{
+				ModelFormat:  "gguf",
+				ModelName:    "different-model",
+				ModelVersion: "2.0",
+				Architecture: "gpt2",
+				GGUFVersion:  2,
+				TensorCount:  200,
+			},
+			wantLen: 16,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			hash := computeMetadataHash(tt.metadata)
+			assert.Len(t, hash, tt.wantLen)
+			assert.NotEmpty(t, hash)
+		})
+	}
+
+	// Test that same metadata produces same hash
+	meta1 := &pkg.GGUFFileMetadata{
+		ModelFormat:  "gguf",
+		ModelName:    "test",
+		ModelVersion: "1.0",
+		Architecture: "llama",
+		GGUFVersion:  3,
+		TensorCount:  100,
+	}
+	meta2 := &pkg.GGUFFileMetadata{
+		ModelFormat:  "gguf",
+		ModelName:    "test",
+		ModelVersion: "1.0",
+		Architecture: "llama",
+		GGUFVersion:  3,
+		TensorCount:  100,
+	}
+	hash1 := computeMetadataHash(meta1)
+	hash2 := computeMetadataHash(meta2)
+	assert.Equal(t, hash1, hash2, "identical metadata should produce identical hash")
+
+	// Test that different metadata produces different hash
+	meta3 := &pkg.GGUFFileMetadata{
+		ModelFormat:  "gguf",
+		ModelName:    "different",
+		ModelVersion: "1.0",
+		Architecture: "llama",
+		GGUFVersion:  3,
+		TensorCount:  100,
+	}
+	hash3 := computeMetadataHash(meta3)
+	assert.NotEqual(t, hash1, hash3, "different metadata should produce different hash")
+}
+
+func TestParseGGUFHeader_EdgeCases(t *testing.T) {
+	tests := []struct {
+		name      string
+		buildData func() []byte
+		location  string
+		wantErr   bool
+		checkFunc func(t *testing.T, meta *pkg.GGUFFileMetadata)
+	}{
+		{
+			name: "excessive KV pairs should error",
+			buildData: func() []byte {
+				buf := new(bytes.Buffer)
+				binary.Write(buf, binary.LittleEndian, uint32(ggufMagic))
+				binary.Write(buf, binary.LittleEndian, uint32(3))
+				binary.Write(buf, binary.LittleEndian, uint64(100))
+				binary.Write(buf, binary.LittleEndian, uint64(maxKVPairs+1)) // Too many
+				return buf.Bytes()
+			},
+			wantErr: true,
+		},
+		{
+			name: "tensor count at maximum should succeed",
+			buildData: func() []byte {
+				return newTestGGUFBuilder().
+					withVersion(3).
+					withTensorCount(maxTensors).
+					withStringKV("general.architecture", "llama").
+					withStringKV("general.name", "large-model").
+					build()
+			},
+			wantErr: false,
+			checkFunc: func(t *testing.T, meta *pkg.GGUFFileMetadata) {
+				assert.Equal(t, uint64(maxTensors), meta.TensorCount)
+			},
+		},
+		{
+			name: "tensor count exceeds maximum should be capped",
+			buildData: func() []byte {
+				return newTestGGUFBuilder().
+					withVersion(3).
+					withTensorCount(maxTensors+1000).
+					withStringKV("general.architecture", "llama").
+					withStringKV("general.name", "huge-model").
+					build()
+			},
+			wantErr: false,
+			checkFunc: func(t *testing.T, meta *pkg.GGUFFileMetadata) {
+				assert.Equal(t, uint64(maxTensors), meta.TensorCount)
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			data := tt.buildData()
+			location := tt.location
+			if location == "" {
+				location = "/test/path.gguf"
+			}
+
+			got, err := parseGGUFHeader(data, location)
+
+			if tt.wantErr {
+				require.Error(t, err)
+				return
+			}
+
+			require.NoError(t, err)
+			if tt.checkFunc != nil {
+				tt.checkFunc(t, got)
+			}
+		})
+	}
+}
+
+func TestReadValue_EOF(t *testing.T) {
+	// Test that reading beyond available data returns appropriate errors
+	tests := []struct {
+		name      string
+		valueType uint32
+		data      []byte
+	}{
+		{
+			name:      "EOF reading uint32",
+			valueType: ggufTypeUint32,
+			data:      []byte{0x01}, // Only 1 byte, need 4
+		},
+		{
+			name:      "EOF reading string length",
+			valueType: ggufTypeString,
+			data:      []byte{0x01}, // Incomplete length
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			reader := bytes.NewReader(tt.data)
+			_, err := readValue(reader, tt.valueType)
+			require.Error(t, err) // must stop on nil: the next line calls err.Error()
+			assert.True(t, err == io.EOF || err == io.ErrUnexpectedEOF || bytes.Contains([]byte(err.Error()), []byte("EOF")))
+		})
+	}
+}
+
+// ============================================================================
+// Integration Tests for parseGGUFModel
+// ============================================================================
+
+func TestParseGGUFModel(t *testing.T) {
+	tests := []struct {
+		name                  string
+		fixture               func(t *testing.T) string // returns path to temp fixture
+		expectedPackages      []pkg.Package
+		expectedRelationships []artifact.Relationship
+		wantErr               bool
+	}{
+		{
+			name: "valid GGUF with complete metadata",
+			fixture: func(t *testing.T) string {
+				return createTempGGUFFile(t, "llama3-8b-q4.gguf",
+					newTestGGUFBuilder().
+						withVersion(3).
+						withTensorCount(291).
+						withStringKV("general.architecture", "llama").
+						withStringKV("general.name", "llama3-8b-instruct").
+						withStringKV("general.version", "3.0").
+						withStringKV("general.license", "Apache-2.0").
+						withStringKV("general.quantization", "Q4_K_M").
+						withUint64KV("general.parameter_count", 8030000000).
+						build(),
+				)
+			},
+			expectedPackages: []pkg.Package{
+				{
+					Name:    "llama3-8b-instruct",
+					Version: "3.0",
+					Type:    pkg.ModelPkg,
+					Metadata: pkg.GGUFFileMetadata{
+						ModelFormat:     "gguf",
+						ModelName:       "llama3-8b-instruct",
+						ModelVersion:    "3.0",
+						License:         "Apache-2.0",
+						Architecture:    "llama",
+						Quantization:    "Q4_K_M",
+						Parameters:      8030000000,
+						GGUFVersion:     3,
+						TensorCount:     291,
+						Header:          map[string]interface{}{},
+						TruncatedHeader: false,
+					},
+				},
+			},
+		},
+		{
+			name: "minimal GGUF file",
+			fixture: func(t *testing.T) string {
+				return createTempGGUFFile(t, "minimal.gguf",
+					newTestGGUFBuilder().
+						withVersion(3).
+						withTensorCount(100).
+						withStringKV("general.architecture", "qwen").
+						withStringKV("general.name", "qwen2-1.5b").
+						build(),
+				)
+			},
+			expectedPackages: []pkg.Package{
+				{
+					Name:    "qwen2-1.5b",
+					Version: unkownGGUFData,
+					Type:    pkg.ModelPkg,
+					Metadata: pkg.GGUFFileMetadata{
+						ModelFormat:     "gguf",
+						ModelName:       "qwen2-1.5b",
+						ModelVersion:    unkownGGUFData,
+						Architecture:    "qwen",
+						Quantization:    unkownGGUFData,
+						GGUFVersion:     3,
+						TensorCount:     100,
+						Header:          map[string]interface{}{},
+						TruncatedHeader: false,
+					},
+				},
+			},
+		},
+		{
+			name: "GGUF without general.name uses filename",
+			fixture: func(t *testing.T) string {
+				return createTempGGUFFile(t, "inferred-name-model.gguf",
+					newTestGGUFBuilder().
+						withVersion(3).
+						withTensorCount(150).
+						withStringKV("general.architecture", "llama").
+						withStringKV("general.license", "MIT").
+						build(),
+				)
+			},
+			expectedPackages: []pkg.Package{
+				{
+					Name:    "inferred-name-model",
+					Version: unkownGGUFData,
+					Type:    pkg.ModelPkg,
+					Metadata: pkg.GGUFFileMetadata{
+						ModelFormat:     "gguf",
+						ModelName:       "inferred-name-model",
+						ModelVersion:    unkownGGUFData,
+						License:         "MIT",
+						Architecture:    "llama",
+						Quantization:    unkownGGUFData,
+						GGUFVersion:     3,
+						TensorCount:     150,
+						Header:          map[string]interface{}{},
+						TruncatedHeader: false,
+					},
+				},
+			},
+		},
+		{
+			name: "GGUF with quantization inferred from filename",
+			fixture: func(t *testing.T) string {
+				return createTempGGUFFile(t, "mistral-7b-Q4_K_M.gguf",
+					newTestGGUFBuilder().
+						withVersion(3).
+						withTensorCount(219).
+						withStringKV("general.architecture", "mistral").
+						withStringKV("general.name", "mistral-7b-instruct").
+						withStringKV("general.version", "0.2").
+						withUint64KV("general.parameter_count", 7240000000).
+						withStringKV("general.quantized_by", "llama.cpp"). // Triggers filename inference
+						build(),
+				)
+			},
+			expectedPackages: []pkg.Package{
+				{
+					Name:    "mistral-7b-instruct",
+					Version: "0.2",
+					Type:    pkg.ModelPkg,
+					Metadata: pkg.GGUFFileMetadata{
+						ModelFormat:     "gguf",
+						ModelName:       "mistral-7b-instruct",
+						ModelVersion:    "0.2",
+						Architecture:    "mistral",
+						Quantization:    "Q4_K_M",
+						Parameters:      7240000000,
+						GGUFVersion:     3,
+						TensorCount:     219,
+						Header:          map[string]interface{}{},
+						TruncatedHeader: false,
+					},
+				},
+			},
+		},
+		{
+			name: "invalid GGUF magic number",
+			fixture: func(t *testing.T) string {
+				return createTempGGUFFile(t, "invalid-magic.gguf",
+					newTestGGUFBuilder().buildInvalidMagic(),
+				)
+			},
+			wantErr: true,
+		},
+		{
+			name: "truncated GGUF file",
+			fixture: func(t *testing.T) string {
+				return createTempGGUFFile(t, "truncated.gguf", []byte{0x47, 0x47})
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			fixturePath := tt.fixture(t)
+			defer os.Remove(fixturePath)
+
+			f, err := os.Open(fixturePath)
+			require.NoError(t, err)
+			defer f.Close()
+
+			location := file.NewLocation(fixturePath)
+			reader := file.LocationReadCloser{
+				Location:   location,
+				ReadCloser: f,
+			}
+
+			ctx := context.Background()
+			pkgs, relationships, err := parseGGUFModel(ctx, nil, nil, reader)
+
+			if tt.wantErr {
+				require.Error(t, err)
+				return
+			}
+
+			require.NoError(t, err)
+			require.Len(t, pkgs, len(tt.expectedPackages))
+
+			// Compare packages (ignoring Hash which is computed)
+			for i, expectedPkg := range tt.expectedPackages {
+				actualPkg := pkgs[i]
+
+				assert.Equal(t, expectedPkg.Name, actualPkg.Name)
+				assert.Equal(t, expectedPkg.Version, actualPkg.Version)
+				assert.Equal(t, expectedPkg.Type, actualPkg.Type)
+				assert.Empty(t, actualPkg.PURL, "PURL should not be set for model packages")
+
+				// Check metadata
+				actualMeta, ok := actualPkg.Metadata.(pkg.GGUFFileMetadata)
+				require.True(t, ok)
+				expectedMeta := expectedPkg.Metadata.(pkg.GGUFFileMetadata)
+
+				assert.Equal(t, expectedMeta.ModelFormat, actualMeta.ModelFormat)
+				assert.Equal(t, expectedMeta.ModelName, actualMeta.ModelName)
+				assert.Equal(t, expectedMeta.ModelVersion, actualMeta.ModelVersion)
+				assert.Equal(t, expectedMeta.License, actualMeta.License)
+				assert.Equal(t, expectedMeta.Architecture, actualMeta.Architecture)
+				assert.Equal(t, expectedMeta.Quantization, actualMeta.Quantization)
+				assert.Equal(t, expectedMeta.Parameters, actualMeta.Parameters)
+				assert.Equal(t, expectedMeta.GGUFVersion, actualMeta.GGUFVersion)
+				assert.Equal(t, expectedMeta.TensorCount, actualMeta.TensorCount)
+				assert.Equal(t, expectedMeta.TruncatedHeader, actualMeta.TruncatedHeader)
+
+				// Hash should be computed
+				assert.NotEmpty(t, actualMeta.Hash)
+			}
+
+			assert.Equal(t, tt.expectedRelationships, relationships)
+		})
+	}
+}
+
+func TestParseGGUFModel_HeaderReadLimit(t *testing.T) {
+	builder := newTestGGUFBuilder().
+		withVersion(3).
+		withTensorCount(100).
+		withStringKV("general.architecture", "llama").
+		withStringKV("general.name", "large-header-model")
+
+	// Add many additional fields; 'A'+i gives each key a distinct printable one-byte suffix
+	for i := 0; i < 50; i++ {
+		builder.withStringKV("custom.field"+string(rune('A'+i)), "value")
+	}
+
+	fixturePath := createTempGGUFFile(t, "large-header.gguf", builder.build())
+	defer os.Remove(fixturePath)
+
+	f, err := os.Open(fixturePath)
+	require.NoError(t, err)
+	defer f.Close()
+
+	reader := file.LocationReadCloser{
+		Location:   file.NewLocation(fixturePath),
+		ReadCloser: f,
+	}
+
+	ctx := context.Background()
+	pkgs, _, err := parseGGUFModel(ctx, nil, nil, reader)
+
+	require.NoError(t, err)
+	require.Len(t, pkgs, 1)
+	assert.Equal(t, "large-header-model", pkgs[0].Name)
+}
+
+func TestParseGGUFModel_EmptyFile(t *testing.T) {
+	tmpDir := t.TempDir()
+	fixturePath := filepath.Join(tmpDir, "empty.gguf")
+	err := os.WriteFile(fixturePath, []byte{}, 0644)
+	require.NoError(t, err)
+
+	f, err := os.Open(fixturePath)
+	require.NoError(t, err)
+	defer f.Close()
+
+	reader := file.LocationReadCloser{
+		Location:   file.NewLocation(fixturePath),
+		ReadCloser: f,
+	}
+
+	ctx := context.Background()
+	_, _, err = parseGGUFModel(ctx, nil, nil, reader)
+
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "too small")
+}
+
+func TestParseGGUFModel_LargeFile(t *testing.T) {
+	// Parsing must still succeed when the header is followed by a large payload
+	builder := newTestGGUFBuilder().
+		withVersion(3).
+		withTensorCount(100).
+		withStringKV("general.architecture", "llama").
+		withStringKV("general.name", "test-model")
+
+	headerData := builder.build()
+
+	// Create a file with header + large padding
+	tmpDir := t.TempDir()
+	fixturePath := filepath.Join(tmpDir, "large.gguf")
+	f, err := os.Create(fixturePath)
+	require.NoError(t, err)
+
+	_, err = f.Write(headerData)
+	require.NoError(t, err)
+
+	// Write 20MB of padding (simulating tensor data)
+	padding := make([]byte, 20*1024*1024)
+	_, err = f.Write(padding)
+	require.NoError(t, err)
+	require.NoError(t, f.Close()) // surface a failed close before reopening for read
+
+	// Parse the file
+	f, err = os.Open(fixturePath)
+	require.NoError(t, err)
+	defer f.Close()
+
+	reader := file.LocationReadCloser{
+		Location:   file.NewLocation(fixturePath),
+		ReadCloser: f,
+	}
+
+	ctx := context.Background()
+	pkgs, _, err := parseGGUFModel(ctx, nil, nil, reader)
+
+	require.NoError(t, err)
+	require.Len(t, pkgs, 1)
+	assert.Equal(t, "test-model", pkgs[0].Name)
+}
+
+func Test_parseGGUFModel_interface(t *testing.T) {
+	// Smoke test: calls parseGGUFModel with (ctx, nil, nil, reader) and checks the basic package fields
+	fixture := createTempGGUFFile(t, "interface-test.gguf",
+		newTestGGUFBuilder().
+			withVersion(3).
+			withTensorCount(100).
+			withStringKV("general.architecture", "llama").
+			withStringKV("general.name", "test").
+			build(),
+	)
+	defer os.Remove(fixture)
+
+	f, err := os.Open(fixture)
+	require.NoError(t, err)
+	defer f.Close()
+
+	reader := file.LocationReadCloser{
+		Location:   file.NewLocation(fixture),
+		ReadCloser: f,
+	}
+
+	ctx := context.Background()
+	pkgs, rels, err := parseGGUFModel(ctx, nil, nil, reader)
+	require.NoError(t, err)
+	require.Len(t, pkgs, 1)
+	assert.Empty(t, rels)
+
+	// Verify basic package structure
+	assert.Equal(t, "test", pkgs[0].Name)
+	assert.Equal(t, unkownGGUFData, pkgs[0].Version)
+	assert.Equal(t, pkg.ModelPkg, pkgs[0].Type)
+}
+
+func TestParseGGUFModel_ReaderClosed(t *testing.T) {
+	// Ensure the reader is properly closed after parsing
+	fixture := createTempGGUFFile(t, "close-test.gguf",
+		newTestGGUFBuilder().
+			withVersion(3).
+			withTensorCount(100).
+			withStringKV("general.architecture", "llama").
+			withStringKV("general.name", "test").
+			build(),
+	)
+	defer os.Remove(fixture)
+
+	f, err := os.Open(fixture)
+	require.NoError(t, err)
+
+	// Wrap in a custom closer to track if Close was called
+	closeCalled := false
+	reader := file.LocationReadCloser{
+		Location: file.NewLocation(fixture),
+		ReadCloser: &testReadCloser{
+			Reader: f,
+			onClose: func() error {
+				closeCalled = true
+				return f.Close()
+			},
+		},
+	}
+
+	ctx := context.Background()
+	_, _, err = parseGGUFModel(ctx, nil, nil, reader)
+	require.NoError(t, err)
+
+	assert.True(t, closeCalled, "reader should be closed after parsing")
+}
+
+// createTempGGUFFile creates a temporary GGUF file for testing
+func createTempGGUFFile(t *testing.T, filename string, data []byte) string {
+	t.Helper()
+	tmpDir := t.TempDir()
+	path := filepath.Join(tmpDir, filename)
+	err := os.WriteFile(path, data, 0644)
+	require.NoError(t, err)
+	return path
+}
+
+// testReadCloser wraps an io.Reader and tracks Close calls
+type testReadCloser struct {
+	io.Reader
+	onClose func() error
+}
+
+func (r *testReadCloser) Close() error {
+	if r.onClose != nil {
+		return r.onClose()
+	}
+	return nil
+}
diff --git a/syft/pkg/cataloger/aiartifact/test_helpers_test.go b/syft/pkg/cataloger/aiartifact/test_helpers_test.go
new file mode 100644
index 000000000..bf2cfd92f
--- /dev/null
+++ b/syft/pkg/cataloger/aiartifact/test_helpers_test.go
@@ -0,0 +1,109 @@
+package aiartifact
+
+import (
+	"bytes"
+	"encoding/binary"
+)
+
+// testGGUFBuilder helps build GGUF files for testing
+type testGGUFBuilder struct {
+	buf         *bytes.Buffer
+	version     uint32
+	tensorCount uint64
+	kvPairs     []testKVPair
+}
+
+type testKVPair struct {
+	key       string
+	valueType uint32
+	value     interface{}
+}
+
+func newTestGGUFBuilder() *testGGUFBuilder {
+	return &testGGUFBuilder{
+		buf:         new(bytes.Buffer),
+		version:     3,
+		tensorCount: 100,
+		kvPairs:     []testKVPair{},
+	}
+}
+
+func (b *testGGUFBuilder) withVersion(v uint32) *testGGUFBuilder {
+	b.version = v
+	return b
+}
+
+func (b *testGGUFBuilder) withTensorCount(count uint64) *testGGUFBuilder {
+	b.tensorCount = count
+	return b
+}
+
+func (b *testGGUFBuilder) withStringKV(key, value string) *testGGUFBuilder {
+	b.kvPairs = append(b.kvPairs, testKVPair{key: key, valueType: ggufTypeString, value: value})
+	return b
+}
+
+func (b *testGGUFBuilder) withUint64KV(key string, value uint64) *testGGUFBuilder {
+	b.kvPairs = append(b.kvPairs, testKVPair{key: key, valueType: ggufTypeUint64, value: value})
+	return b
+}
+
+func (b *testGGUFBuilder) withUint32KV(key string, value uint32) *testGGUFBuilder {
+	b.kvPairs = append(b.kvPairs, testKVPair{key: key, valueType: ggufTypeUint32, value: value})
+	return b
+}
+
+func (b *testGGUFBuilder) writeString(s string) {
+	binary.Write(b.buf, binary.LittleEndian, uint64(len(s)))
+	b.buf.WriteString(s)
+}
+
+func (b *testGGUFBuilder) build() []byte {
+	b.buf.Reset() // start empty so a repeated build() does not append a second header
+	binary.Write(b.buf, binary.LittleEndian, uint32(ggufMagic)) // magic number "GGUF"
+
+	// Write version
+	binary.Write(b.buf, binary.LittleEndian, b.version)
+
+	// Write tensor count
+	binary.Write(b.buf, binary.LittleEndian, b.tensorCount)
+
+	// Write KV count
+	binary.Write(b.buf, binary.LittleEndian, uint64(len(b.kvPairs)))
+
+	// Write KV pairs
+	for _, kv := range b.kvPairs {
+		// Write key
+		b.writeString(kv.key)
+		// Write value type
+		binary.Write(b.buf, binary.LittleEndian, kv.valueType)
+		// Write value based on type
+		switch kv.valueType {
+		case ggufTypeString:
+			b.writeString(kv.value.(string))
+		case ggufTypeUint32:
+			binary.Write(b.buf, binary.LittleEndian, kv.value.(uint32))
+		case ggufTypeUint64:
+			binary.Write(b.buf, binary.LittleEndian, kv.value.(uint64))
+		case ggufTypeUint8:
+			binary.Write(b.buf, binary.LittleEndian, kv.value.(uint8))
+		case ggufTypeInt32:
+			binary.Write(b.buf, binary.LittleEndian, kv.value.(int32))
+		case ggufTypeBool:
+			var v uint8
+			if kv.value.(bool) {
+				v = 1
+			}
+			binary.Write(b.buf, binary.LittleEndian, v)
+		}
+	}
+
+	return append([]byte(nil), b.buf.Bytes()...) // copy: Bytes() aliases the reusable buffer
+}
+
+// buildInvalidMagic creates a file with invalid magic number
+func (b *testGGUFBuilder) buildInvalidMagic() []byte {
+	buf := new(bytes.Buffer)
+	binary.Write(buf, binary.LittleEndian, uint32(0x12345678))
+	return buf.Bytes()
+}