test: migrate gguf tests over

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2025-10-14 04:41:20 -04:00
parent f92b7d2fc9
commit 1ad4a2752a
No known key found for this signature in database
5 changed files with 1845 additions and 42 deletions

View File

@ -0,0 +1,385 @@
package aiartifact
import (
"os"
"path/filepath"
"testing"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
)
// TestGGUFCataloger_Globs builds small fixture directories on disk and checks
// that the GGUF cataloger requests content for exactly the expected paths.
//
// Each case supplies a setup function that populates a fresh temp directory
// and returns its path; expected lists the paths (relative to that directory)
// registered with the tester as the expected resolver content queries.
func TestGGUFCataloger_Globs(t *testing.T) {
	tests := []struct {
		name     string
		setup    func(t *testing.T) string // returns fixture directory
		expected []string
	}{
		{
			name: "finds GGUF files in root",
			setup: func(t *testing.T) string {
				dir := t.TempDir()
				createTestGGUFInDir(t, dir, "model1.gguf")
				createTestGGUFInDir(t, dir, "model2.gguf")
				return dir
			},
			expected: []string{
				"model1.gguf",
				"model2.gguf",
			},
		},
		{
			name: "finds GGUF files in subdirectories",
			setup: func(t *testing.T) string {
				dir := t.TempDir()
				modelsDir := filepath.Join(dir, "models")
				// Fail fast on fixture errors so they are not misreported
				// as cataloger mismatches later on.
				require.NoError(t, os.MkdirAll(modelsDir, 0755))
				createTestGGUFInDir(t, modelsDir, "llama.gguf")
				deepDir := filepath.Join(dir, "deep", "nested", "path")
				require.NoError(t, os.MkdirAll(deepDir, 0755))
				createTestGGUFInDir(t, deepDir, "mistral.gguf")
				return dir
			},
			expected: []string{
				"models/llama.gguf",
				"deep/nested/path/mistral.gguf",
			},
		},
		{
			name: "ignores non-GGUF files",
			setup: func(t *testing.T) string {
				dir := t.TempDir()
				createTestGGUFInDir(t, dir, "model.gguf")
				// Create non-GGUF files
				require.NoError(t, os.WriteFile(filepath.Join(dir, "readme.txt"), []byte("readme"), 0644))
				require.NoError(t, os.WriteFile(filepath.Join(dir, "model.bin"), []byte("binary"), 0644))
				require.NoError(t, os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644))
				return dir
			},
			expected: []string{
				"model.gguf",
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			fixtureDir := tt.setup(t)
			tester := pkgtest.NewCatalogTester().
				FromDirectory(t, fixtureDir)
			// Only register expectations when the case lists any paths.
			if len(tt.expected) > 0 {
				tester.ExpectsResolverContentQueries(tt.expected)
			}
			tester.TestCataloger(t, NewGGUFCataloger())
		})
	}
}
// TestGGUFCataloger_Integration writes GGUF fixtures produced by the test
// builder into a temp directory, runs the cataloger over that directory, and
// compares the resulting packages and relationships with the expected ones.
//
// The comparison ignores the FoundBy and Locations package fields and the
// Hash metadata field (see the tester configuration at the bottom).
func TestGGUFCataloger_Integration(t *testing.T) {
	tests := []struct {
		name                  string
		setup                 func(t *testing.T) string
		expectedPackages      []pkg.Package
		expectedRelationships []artifact.Relationship
	}{
		{
			name: "catalog single GGUF file",
			setup: func(t *testing.T) string {
				dir := t.TempDir()
				data := newTestGGUFBuilder().
					withVersion(3).
					withTensorCount(291).
					withStringKV("general.architecture", "llama").
					withStringKV("general.name", "llama3-8b").
					withStringKV("general.version", "3.0").
					withStringKV("general.license", "Apache-2.0").
					withStringKV("general.quantization", "Q4_K_M").
					withUint64KV("general.parameter_count", 8030000000).
					build()
				path := filepath.Join(dir, "llama3-8b.gguf")
				// Fail fast on fixture errors so they are not misreported
				// as cataloger mismatches later on.
				require.NoError(t, os.WriteFile(path, data, 0644))
				return dir
			},
			expectedPackages: []pkg.Package{
				{
					Name:    "llama3-8b",
					Version: "3.0",
					Type:    pkg.ModelPkg,
					Licenses: pkg.NewLicenseSet(
						pkg.NewLicenseFromFields("Apache-2.0", "", nil),
					),
					Metadata: pkg.GGUFFileMetadata{
						ModelFormat:     "gguf",
						ModelName:       "llama3-8b",
						ModelVersion:    "3.0",
						License:         "Apache-2.0",
						Architecture:    "llama",
						Quantization:    "Q4_K_M",
						Parameters:      8030000000,
						GGUFVersion:     3,
						TensorCount:     291,
						Header:          map[string]interface{}{},
						TruncatedHeader: false,
					},
				},
			},
			expectedRelationships: nil,
		},
		{
			name: "catalog multiple GGUF files",
			setup: func(t *testing.T) string {
				dir := t.TempDir()
				// Create first model
				data1 := newTestGGUFBuilder().
					withVersion(3).
					withTensorCount(100).
					withStringKV("general.architecture", "llama").
					withStringKV("general.name", "model1").
					withStringKV("general.version", "1.0").
					build()
				require.NoError(t, os.WriteFile(filepath.Join(dir, "model1.gguf"), data1, 0644))
				// Create second model
				data2 := newTestGGUFBuilder().
					withVersion(3).
					withTensorCount(200).
					withStringKV("general.architecture", "mistral").
					withStringKV("general.name", "model2").
					withStringKV("general.version", "2.0").
					build()
				require.NoError(t, os.WriteFile(filepath.Join(dir, "model2.gguf"), data2, 0644))
				return dir
			},
			expectedPackages: []pkg.Package{
				{
					Name:    "model1",
					Version: "1.0",
					Type:    pkg.ModelPkg,
					Metadata: pkg.GGUFFileMetadata{
						ModelFormat:     "gguf",
						ModelName:       "model1",
						ModelVersion:    "1.0",
						Architecture:    "llama",
						Quantization:    unkownGGUFData,
						GGUFVersion:     3,
						TensorCount:     100,
						Header:          map[string]interface{}{},
						TruncatedHeader: false,
					},
				},
				{
					Name:    "model2",
					Version: "2.0",
					Type:    pkg.ModelPkg,
					Metadata: pkg.GGUFFileMetadata{
						ModelFormat:     "gguf",
						ModelName:       "model2",
						ModelVersion:    "2.0",
						Architecture:    "mistral",
						Quantization:    unkownGGUFData,
						GGUFVersion:     3,
						TensorCount:     200,
						Header:          map[string]interface{}{},
						TruncatedHeader: false,
					},
				},
			},
			expectedRelationships: nil,
		},
		{
			name: "catalog GGUF in nested directories",
			setup: func(t *testing.T) string {
				dir := t.TempDir()
				nestedDir := filepath.Join(dir, "models", "quantized")
				require.NoError(t, os.MkdirAll(nestedDir, 0755))
				data := newTestGGUFBuilder().
					withVersion(3).
					withTensorCount(150).
					withStringKV("general.architecture", "qwen").
					withStringKV("general.name", "qwen-nested").
					build()
				require.NoError(t, os.WriteFile(filepath.Join(nestedDir, "qwen.gguf"), data, 0644))
				return dir
			},
			expectedPackages: []pkg.Package{
				{
					Name:    "qwen-nested",
					Version: unkownGGUFData,
					Type:    pkg.ModelPkg,
					Metadata: pkg.GGUFFileMetadata{
						ModelFormat:     "gguf",
						ModelName:       "qwen-nested",
						ModelVersion:    unkownGGUFData,
						Architecture:    "qwen",
						Quantization:    unkownGGUFData,
						GGUFVersion:     3,
						TensorCount:     150,
						Header:          map[string]interface{}{},
						TruncatedHeader: false,
					},
				},
			},
			expectedRelationships: nil,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			fixtureDir := tt.setup(t)
			// Use pkgtest to catalog and compare
			tester := pkgtest.NewCatalogTester().
				FromDirectory(t, fixtureDir).
				Expects(tt.expectedPackages, tt.expectedRelationships).
				IgnoreLocationLayer().
				IgnorePackageFields("FoundBy", "Locations"). // These are set by the cataloger
				WithCompareOptions(
					// Ignore Hash as it's computed dynamically
					cmpopts.IgnoreFields(pkg.GGUFFileMetadata{}, "Hash"),
				)
			tester.TestCataloger(t, NewGGUFCataloger())
		})
	}
}
// TestGGUFCataloger_SkipsInvalidFiles places one well-formed GGUF file next to
// two malformed ones (wrong magic number, single-byte file) and asserts that
// cataloging yields exactly one package, named after the well-formed model.
func TestGGUFCataloger_SkipsInvalidFiles(t *testing.T) {
	dir := t.TempDir()
	// Create a valid GGUF
	validData := newTestGGUFBuilder().
		withVersion(3).
		withTensorCount(100).
		withStringKV("general.architecture", "llama").
		withStringKV("general.name", "valid-model").
		build()
	// Fixture writes are checked so a broken setup cannot be mistaken for
	// the cataloger skipping files.
	require.NoError(t, os.WriteFile(filepath.Join(dir, "valid.gguf"), validData, 0644))
	// Create an invalid GGUF (wrong magic)
	invalidData := newTestGGUFBuilder().buildInvalidMagic()
	require.NoError(t, os.WriteFile(filepath.Join(dir, "invalid.gguf"), invalidData, 0644))
	// Create a truncated GGUF
	require.NoError(t, os.WriteFile(filepath.Join(dir, "truncated.gguf"), []byte{0x47}, 0644))
	// Catalog should succeed and only return the valid package
	tester := pkgtest.NewCatalogTester().
		FromDirectory(t, dir).
		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
			// Should only find the valid model
			require.Len(t, pkgs, 1)
			assert.Equal(t, "valid-model", pkgs[0].Name)
		})
	tester.TestCataloger(t, NewGGUFCataloger())
}
// TestGGUFCataloger_Name pins the name reported by the GGUF cataloger.
func TestGGUFCataloger_Name(t *testing.T) {
	got := NewGGUFCataloger().Name()
	assert.Equal(t, "gguf-cataloger", got)
}
func TestGGUFCataloger_EmptyDirectory(t *testing.T) {
dir := t.TempDir()
// Create a subdirectory to ensure glob still runs
os.MkdirAll(filepath.Join(dir, "models"), 0755)
tester := pkgtest.NewCatalogTester().
FromDirectory(t, dir).
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, rels []artifact.Relationship) {
assert.Empty(t, pkgs)
assert.Empty(t, rels)
})
tester.TestCataloger(t, NewGGUFCataloger())
}
// TestGGUFCataloger_MixedFiles puts a single GGUF file alongside unrelated
// files and an empty subdirectory, then asserts that cataloging yields exactly
// one package with the GGUF model's name and the model package type.
func TestGGUFCataloger_MixedFiles(t *testing.T) {
	dir := t.TempDir()
	// Create GGUF file
	ggufData := newTestGGUFBuilder().
		withVersion(3).
		withTensorCount(100).
		withStringKV("general.architecture", "llama").
		withStringKV("general.name", "test-model").
		build()
	// Fixture writes are checked so a broken setup fails here rather than in
	// the package-count assertion below.
	require.NoError(t, os.WriteFile(filepath.Join(dir, "model.gguf"), ggufData, 0644))
	// Create other file types
	require.NoError(t, os.WriteFile(filepath.Join(dir, "README.md"), []byte("# Models"), 0644))
	require.NoError(t, os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644))
	require.NoError(t, os.WriteFile(filepath.Join(dir, "weights.bin"), []byte("weights"), 0644))
	require.NoError(t, os.MkdirAll(filepath.Join(dir, "subdir"), 0755))
	tester := pkgtest.NewCatalogTester().
		FromDirectory(t, dir).
		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
			// Should only find the GGUF model
			require.Len(t, pkgs, 1)
			assert.Equal(t, "test-model", pkgs[0].Name)
			assert.Equal(t, pkg.ModelPkg, pkgs[0].Type)
		})
	tester.TestCataloger(t, NewGGUFCataloger())
}
// TestGGUFCataloger_CaseInsensitiveGlob exercises the cataloger against file
// names that differ only in case ("model.gguf" and "MODEL.GGUF").
//
// The assertion is deliberately loose (at least one package) because the
// number of files that actually exist depends on the host filesystem; see the
// comments in the body.
// NOTE(review): despite the test name, the intent stated below is that the
// glob is case-sensitive — consider renaming the test once confirmed.
func TestGGUFCataloger_CaseInsensitiveGlob(t *testing.T) {
	// Test that the glob pattern is case-sensitive (as expected for **/*.gguf)
	dir := t.TempDir()
	// Create lowercase .gguf
	data := newTestGGUFBuilder().
		withVersion(3).
		withTensorCount(100).
		withStringKV("general.architecture", "llama").
		withStringKV("general.name", "lowercase").
		build()
	require.NoError(t, os.WriteFile(filepath.Join(dir, "model.gguf"), data, 0644))
	// Create uppercase .GGUF (should not match the glob). On a
	// case-insensitive filesystem this name refers to the file written
	// above, so the write replaces its content instead of adding a file.
	require.NoError(t, os.WriteFile(filepath.Join(dir, "MODEL.GGUF"), data, 0644))
	tester := pkgtest.NewCatalogTester().
		FromDirectory(t, dir).
		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
			// The exact count depends on the filesystem and on how the
			// cataloger's glob treats case, so only a lower bound is checked:
			// - case-sensitive filesystem: two files exist and the lowercase
			//   one must be found
			// - case-insensitive filesystem: a single file exists under one
			//   of the two names
			assert.GreaterOrEqual(t, len(pkgs), 1, "should find at least the lowercase file")
		})
	tester.TestCataloger(t, NewGGUFCataloger())
}
// createTestGGUFInDir writes a minimal GGUF fixture called filename into dir.
// The fixture is version 3 with a tensor count of 100, architecture "llama"
// and model name "test-model". The calling test fails immediately if the file
// cannot be written.
func createTestGGUFInDir(t *testing.T, dir, filename string) {
	t.Helper()
	builder := newTestGGUFBuilder().withVersion(3).withTensorCount(100)
	builder.withStringKV("general.architecture", "llama")
	builder.withStringKV("general.name", "test-model")
	target := filepath.Join(dir, filename)
	require.NoError(t, os.WriteFile(target, builder.build(), 0644))
}

View File

@ -1,9 +1,6 @@
package aiartifact package aiartifact
import ( import (
"fmt"
"github.com/anchore/packageurl-go"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg"
) )
@ -12,11 +9,12 @@ func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location)
p := pkg.Package{ p := pkg.Package{
Name: metadata.ModelName, Name: metadata.ModelName,
Version: metadata.ModelVersion, Version: metadata.ModelVersion,
PURL: packageURL(metadata),
Locations: file.NewLocationSet(locations...), Locations: file.NewLocationSet(locations...),
Type: pkg.ModelPkg, Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(), Licenses: pkg.NewLicenseSet(),
Metadata: *metadata, Metadata: *metadata,
// NOTE: PURL is intentionally not set as the package-url spec
// has not yet finalized support for ML model packages
} }
// Add license to the package if present in metadata // Add license to the package if present in metadata
@ -28,41 +26,3 @@ func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location)
return p return p
} }
// packageURL builds the package URL string for a GGUF model package (see
// https://github.com/package-url/purl-spec).
//
// The PURL uses type "mlmodel" and namespace "gguf", with the model name and
// version taken from metadata. Qualifiers are added only for values that are
// present: "arch" for a non-empty architecture, "quantization" for a
// quantization that is neither empty nor "unknown", and "parameters" for a
// parameter count above zero.
func packageURL(metadata *pkg.GGUFFileMetadata) string {
	var qualifiers packageurl.Qualifiers
	addQualifier := func(key, value string) {
		qualifiers = append(qualifiers, packageurl.Qualifier{Key: key, Value: value})
	}

	// Model-specific qualifiers, in a fixed order.
	if metadata.Architecture != "" {
		addQualifier("arch", metadata.Architecture)
	}
	if q := metadata.Quantization; q != "" && q != "unknown" {
		addQualifier("quantization", q)
	}
	if metadata.Parameters > 0 {
		addQualifier("parameters", fmt.Sprintf("%d", metadata.Parameters))
	}

	// "mlmodel" is the PURL type used here for machine learning models in
	// GGUF format.
	purl := packageurl.NewPackageURL(
		"mlmodel",
		"gguf",
		metadata.ModelName,
		metadata.ModelVersion,
		qualifiers,
		"",
	)
	return purl.ToString()
}

View File

@ -0,0 +1,185 @@
package aiartifact
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
)
// TestNewGGUFPackage checks that newGGUFPackage copies name, version and type
// onto the package and attaches the metadata by value.
//
// Every case gets the common assertions in the loop at the bottom; checkFunc
// adds case-specific assertions (PURL left empty, license count and value,
// number of locations).
func TestNewGGUFPackage(t *testing.T) {
	tests := []struct {
		name      string
		metadata  *pkg.GGUFFileMetadata
		locations []file.Location
		checkFunc func(t *testing.T, p pkg.Package)
	}{
		{
			name: "complete GGUF package with all fields",
			metadata: &pkg.GGUFFileMetadata{
				ModelFormat:     "gguf",
				ModelName:       "llama3-8b-instruct",
				ModelVersion:    "3.0",
				License:         "Apache-2.0",
				Architecture:    "llama",
				Quantization:    "Q4_K_M",
				Parameters:      8030000000,
				GGUFVersion:     3,
				TensorCount:     291,
				Header:          map[string]interface{}{},
				TruncatedHeader: false,
			},
			locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
			checkFunc: func(t *testing.T, p pkg.Package) {
				assert.Equal(t, "llama3-8b-instruct", p.Name)
				assert.Equal(t, "3.0", p.Version)
				assert.Equal(t, pkg.ModelPkg, p.Type)
				assert.Empty(t, p.PURL, "PURL should not be set for model packages")
				// require (not assert) so that an unexpected license count
				// stops the test before the slice is indexed below.
				licenses := p.Licenses.ToSlice()
				require.Len(t, licenses, 1)
				assert.Equal(t, "Apache-2.0", licenses[0].Value)
				assert.NotEmpty(t, p.ID())
			},
		},
		{
			name: "minimal GGUF package",
			metadata: &pkg.GGUFFileMetadata{
				ModelFormat:  "gguf",
				ModelName:    "simple-model",
				ModelVersion: "1.0",
				Architecture: "gpt2",
				GGUFVersion:  3,
				TensorCount:  50,
			},
			locations: []file.Location{file.NewLocation("/models/simple.gguf")},
			checkFunc: func(t *testing.T, p pkg.Package) {
				assert.Equal(t, "simple-model", p.Name)
				assert.Equal(t, "1.0", p.Version)
				assert.Equal(t, pkg.ModelPkg, p.Type)
				assert.Empty(t, p.PURL, "PURL should not be set for model packages")
				assert.Empty(t, p.Licenses.ToSlice())
			},
		},
		{
			name: "GGUF package with multiple locations",
			metadata: &pkg.GGUFFileMetadata{
				ModelFormat:  "gguf",
				ModelName:    "multi-location-model",
				ModelVersion: "1.5",
				Architecture: "llama",
				GGUFVersion:  3,
				TensorCount:  150,
			},
			locations: []file.Location{
				file.NewLocation("/models/model1.gguf"),
				file.NewLocation("/models/model2.gguf"),
			},
			checkFunc: func(t *testing.T, p pkg.Package) {
				assert.Len(t, p.Locations.ToSlice(), 2)
			},
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			p := newGGUFPackage(tt.metadata, tt.locations...)
			assert.Equal(t, tt.metadata.ModelName, p.Name)
			assert.Equal(t, tt.metadata.ModelVersion, p.Version)
			assert.Equal(t, pkg.ModelPkg, p.Type)
			// Verify metadata is attached
			metadata, ok := p.Metadata.(pkg.GGUFFileMetadata)
			require.True(t, ok, "metadata should be GGUFFileMetadata")
			assert.Equal(t, *tt.metadata, metadata)
			if tt.checkFunc != nil {
				tt.checkFunc(t, p)
			}
		})
	}
}
// TestNewGGUFPackage_IDUniqueness builds two packages at the same location
// whose metadata differs only in the model name and asserts that their IDs
// differ.
func TestNewGGUFPackage_IDUniqueness(t *testing.T) {
	// metadataFor returns metadata that is identical apart from the name.
	metadataFor := func(modelName string) *pkg.GGUFFileMetadata {
		return &pkg.GGUFFileMetadata{
			ModelFormat:  "gguf",
			ModelName:    modelName,
			ModelVersion: "1.0",
			Architecture: "llama",
			GGUFVersion:  3,
			TensorCount:  100,
		}
	}
	location := file.NewLocation("/models/test.gguf")
	first := newGGUFPackage(metadataFor("model-1"), location)
	second := newGGUFPackage(metadataFor("model-2"), location)
	assert.NotEqual(t, first.ID(), second.ID(), "different packages should have different IDs")
}
// TestNewGGUFPackage_IDConsistency builds a package twice from the same
// metadata and location and asserts that both results carry the same ID.
func TestNewGGUFPackage_IDConsistency(t *testing.T) {
	shared := pkg.GGUFFileMetadata{
		ModelFormat:  "gguf",
		ModelName:    "test-model",
		ModelVersion: "1.0",
		Architecture: "llama",
		GGUFVersion:  3,
		TensorCount:  100,
	}
	location := file.NewLocation("/models/test.gguf")
	firstID := newGGUFPackage(&shared, location).ID()
	secondID := newGGUFPackage(&shared, location).ID()
	assert.Equal(t, firstID, secondID, "identical packages should have identical IDs")
}
// TestNewGGUFPackage_MetadataPreservation populates every metadata field,
// builds a package from it, and compares each field of the metadata attached
// to the package against the input, one field at a time.
func TestNewGGUFPackage_MetadataPreservation(t *testing.T) {
	want := &pkg.GGUFFileMetadata{
		ModelFormat:     "gguf",
		ModelName:       "preservation-test",
		ModelVersion:    "2.0",
		License:         "MIT",
		Architecture:    "llama",
		Quantization:    "Q4_K_M",
		Parameters:      7000000000,
		GGUFVersion:     3,
		TensorCount:     219,
		Hash:            "abc123",
		Header:          map[string]interface{}{"custom.field": "value"},
		TruncatedHeader: false,
	}
	built := newGGUFPackage(want, file.NewLocation("/models/test.gguf"))

	// The package stores the metadata by value, not as a pointer.
	got, isGGUF := built.Metadata.(pkg.GGUFFileMetadata)
	require.True(t, isGGUF)

	assert.Equal(t, want.ModelFormat, got.ModelFormat)
	assert.Equal(t, want.ModelName, got.ModelName)
	assert.Equal(t, want.ModelVersion, got.ModelVersion)
	assert.Equal(t, want.License, got.License)
	assert.Equal(t, want.Architecture, got.Architecture)
	assert.Equal(t, want.Quantization, got.Quantization)
	assert.Equal(t, want.Parameters, got.Parameters)
	assert.Equal(t, want.GGUFVersion, got.GGUFVersion)
	assert.Equal(t, want.TensorCount, got.TensorCount)
	assert.Equal(t, want.Hash, got.Hash)
	assert.Equal(t, want.TruncatedHeader, got.TruncatedHeader)
	assert.Equal(t, want.Header, got.Header)
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,109 @@
package aiartifact
import (
"bytes"
"encoding/binary"
)
type (
	// testGGUFBuilder accumulates the pieces of a GGUF file (version, tensor
	// count and key/value pairs) and serializes them for use in tests.
	testGGUFBuilder struct {
		buf         *bytes.Buffer // destination the serialized bytes are written to
		version     uint32        // value written to the version field
		tensorCount uint64        // value written to the tensor-count field
		kvPairs     []testKVPair  // key/value pairs, written in insertion order
	}

	// testKVPair is one key/value entry together with the type tag that
	// selects how the value is encoded.
	testKVPair struct {
		key       string
		valueType uint32
		value     interface{}
	}
)
// newTestGGUFBuilder returns a builder with an empty output buffer, no
// key/value pairs, version 3 and a tensor count of 100.
func newTestGGUFBuilder() *testGGUFBuilder {
	builder := new(testGGUFBuilder)
	builder.buf = &bytes.Buffer{}
	builder.version = 3
	builder.tensorCount = 100
	builder.kvPairs = []testKVPair{}
	return builder
}
// withVersion sets the version written to the file header and returns the
// builder so calls can be chained.
func (b *testGGUFBuilder) withVersion(v uint32) *testGGUFBuilder {
b.version = v
return b
}
// withTensorCount sets the tensor count written to the file header and
// returns the builder so calls can be chained.
func (b *testGGUFBuilder) withTensorCount(count uint64) *testGGUFBuilder {
b.tensorCount = count
return b
}
// withStringKV queues a string-typed key/value pair and returns the builder
// so calls can be chained. Pairs are kept in the order they were added.
func (b *testGGUFBuilder) withStringKV(key, value string) *testGGUFBuilder {
	pair := testKVPair{
		key:       key,
		valueType: ggufTypeString,
		value:     value,
	}
	b.kvPairs = append(b.kvPairs, pair)
	return b
}
// withUint64KV queues a uint64-typed key/value pair and returns the builder
// so calls can be chained. Pairs are kept in the order they were added.
func (b *testGGUFBuilder) withUint64KV(key string, value uint64) *testGGUFBuilder {
	pair := testKVPair{
		key:       key,
		valueType: ggufTypeUint64,
		value:     value,
	}
	b.kvPairs = append(b.kvPairs, pair)
	return b
}
// withUint32KV queues a uint32-typed key/value pair and returns the builder
// so calls can be chained. Pairs are kept in the order they were added.
func (b *testGGUFBuilder) withUint32KV(key string, value uint32) *testGGUFBuilder {
	pair := testKVPair{
		key:       key,
		valueType: ggufTypeUint32,
		value:     value,
	}
	b.kvPairs = append(b.kvPairs, pair)
	return b
}
// writeString appends s to the output buffer as a length-prefixed string: the
// byte length as a little-endian uint64, followed by the raw bytes of s.
func (b *testGGUFBuilder) writeString(s string) {
	var length [8]byte
	binary.LittleEndian.PutUint64(length[:], uint64(len(s)))
	b.buf.Write(length[:])
	b.buf.WriteString(s)
}
// build serializes the builder's current state and returns the bytes.
//
// Layout, all integers little-endian: the magic number (uint32), the version
// (uint32), the tensor count (uint64), the number of key/value pairs (uint64),
// then each pair as length-prefixed key, value type (uint32) and the value
// encoded according to that type. A pair whose value type is not handled by
// the switch below is written with key and type only, without a value.
//
// Every call starts from a fresh buffer, so build may be called repeatedly
// (also after adding more pairs) and always returns one complete file image;
// slices returned by earlier calls are left untouched. A value whose dynamic
// type does not match its declared value type causes a panic from the type
// assertion.
func (b *testGGUFBuilder) build() []byte {
	// Previously the output was appended to the builder's long-lived buffer,
	// so a second call returned two concatenated file images.
	b.buf = new(bytes.Buffer)

	// Errors from binary.Write are not checked: the destination is an
	// in-memory buffer and every value written is a fixed-size integer.

	// Write magic number "GGUF"
	binary.Write(b.buf, binary.LittleEndian, uint32(ggufMagic))
	// Write version
	binary.Write(b.buf, binary.LittleEndian, b.version)
	// Write tensor count
	binary.Write(b.buf, binary.LittleEndian, b.tensorCount)
	// Write KV count
	binary.Write(b.buf, binary.LittleEndian, uint64(len(b.kvPairs)))
	// Write KV pairs
	for _, kv := range b.kvPairs {
		// Write key
		b.writeString(kv.key)
		// Write value type
		binary.Write(b.buf, binary.LittleEndian, kv.valueType)
		// Write value based on type
		switch kv.valueType {
		case ggufTypeString:
			b.writeString(kv.value.(string))
		case ggufTypeUint32:
			binary.Write(b.buf, binary.LittleEndian, kv.value.(uint32))
		case ggufTypeUint64:
			binary.Write(b.buf, binary.LittleEndian, kv.value.(uint64))
		case ggufTypeUint8:
			binary.Write(b.buf, binary.LittleEndian, kv.value.(uint8))
		case ggufTypeInt32:
			binary.Write(b.buf, binary.LittleEndian, kv.value.(int32))
		case ggufTypeBool:
			// Booleans are encoded as a single byte: 1 for true, 0 for false.
			var v uint8
			if kv.value.(bool) {
				v = 1
			}
			binary.Write(b.buf, binary.LittleEndian, v)
		}
	}
	return b.buf.Bytes()
}
// buildInvalidMagic returns a four-byte payload holding the little-endian
// encoding of 0x12345678, used as a file whose magic number is invalid. It
// does not read or modify the builder's state.
func (b *testGGUFBuilder) buildInvalidMagic() []byte {
	bogus := make([]byte, 4)
	binary.LittleEndian.PutUint32(bogus, uint32(0x12345678))
	return bogus
}