chore: pr comments

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2025-11-13 14:57:07 -05:00
parent 9609ce2b36
commit b1c8478d55
No known key found for this signature in database
8 changed files with 254 additions and 157 deletions

View File

@ -124,7 +124,7 @@ var jsonTypes = makeJSONTypes(
jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
jsonNames(pkg.GGUFFileHeader{}, "gguf-file-metadata"),
jsonNames(pkg.GGUFFileHeader{}, "gguf-file-header"),
)
func expandLegacyNameVariants(names ...string) []string {

View File

@ -5,8 +5,6 @@ import (
"path/filepath"
"testing"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
@ -37,7 +35,7 @@ func TestGGUFCataloger_Globs(t *testing.T) {
}
}
func TestGGUFCataloger_Integration(t *testing.T) {
func TestGGUFCataloger(t *testing.T) {
tests := []struct {
name string
setup func(t *testing.T) string
@ -56,6 +54,7 @@ func TestGGUFCataloger_Integration(t *testing.T) {
withStringKV("general.license", "Apache-2.0").
withStringKV("general.quantization", "Q4_K_M").
withUint64KV("general.parameter_count", 8030000000).
withStringKV("general.some_random_kv", "foobar").
build()
path := filepath.Join(dir, "llama3-8b.gguf")
@ -71,14 +70,125 @@ func TestGGUFCataloger_Integration(t *testing.T) {
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
),
Metadata: pkg.GGUFFileHeader{
ModelName: "llama3-8b",
License: "Apache-2.0",
Architecture: "llama",
Quantization: "Unknown",
Parameters: 0,
GGUFVersion: 3,
TensorCount: 0,
Header: map[string]interface{}{},
MetadataKeyValuesHash: "6e3d368066455ce4",
Header: map[string]interface{}{
"general.some_random_kv": "foobar",
},
},
},
},
expectedRelationships: nil,
},
{
name: "catalog GGUF file with minimal metadata",
setup: func(t *testing.T) string {
dir := t.TempDir()
data := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "gpt2").
withStringKV("general.name", "gpt2-small").
withStringKV("gpt2.context_length", "1024").
withUint32KV("gpt2.embedding_length", 768).
build()
path := filepath.Join(dir, "gpt2-small.gguf")
os.WriteFile(path, data, 0644)
return dir
},
expectedPackages: []pkg.Package{
{
Name: "gpt2-small",
Version: "",
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(),
Metadata: pkg.GGUFFileHeader{
Architecture: "gpt2",
Quantization: "Unknown",
Parameters: 0,
GGUFVersion: 3,
TensorCount: 0,
MetadataKeyValuesHash: "9dc6f23591062a27",
Header: map[string]interface{}{
"gpt2.context_length": "1024",
"gpt2.embedding_length": uint32(768),
},
},
},
},
expectedRelationships: nil,
},
{
name: "catalog multiple GGUF files",
setup: func(t *testing.T) string {
dir := t.TempDir()
// First model - Llama with custom training data
data1 := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "model-1").
withStringKV("general.version", "1.0").
withStringKV("llama.attention.head_count", "32").
withUint32KV("llama.attention.layer_norm_rms_epsilon", 999).
build()
os.WriteFile(filepath.Join(dir, "model-1.gguf"), data1, 0644)
// Second model - GPT2 with different config
data2 := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "gpt2").
withStringKV("general.name", "model-2").
withStringKV("general.version", "2.0").
withStringKV("general.license", "MIT").
withStringKV("gpt2.block_count", "12").
withUint64KV("tokenizer.ggml.bos_token_id", 50256).
build()
os.WriteFile(filepath.Join(dir, "model-2.gguf"), data2, 0644)
return dir
},
expectedPackages: []pkg.Package{
{
Name: "model-1",
Version: "1.0",
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(),
Metadata: pkg.GGUFFileHeader{
Architecture: "llama",
Quantization: "Unknown",
Parameters: 0,
GGUFVersion: 3,
TensorCount: 0,
MetadataKeyValuesHash: "57e0dbea7d2efa6e",
Header: map[string]interface{}{
"llama.attention.head_count": "32",
"llama.attention.layer_norm_rms_epsilon": uint32(999),
},
},
},
{
Name: "model-2",
Version: "2.0",
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(
pkg.NewLicenseFromFields("MIT", "", nil),
),
Metadata: pkg.GGUFFileHeader{
Architecture: "gpt2",
Quantization: "Unknown",
Parameters: 0,
GGUFVersion: 3,
TensorCount: 0,
MetadataKeyValuesHash: "f85de1bf9be304bb",
Header: map[string]interface{}{
"gpt2.block_count": "12",
"tokenizer.ggml.bos_token_id": uint64(50256),
},
},
},
},
@ -91,17 +201,12 @@ func TestGGUFCataloger_Integration(t *testing.T) {
fixtureDir := tt.setup(t)
// Use pkgtest to catalog and compare
tester := pkgtest.NewCatalogTester().
pkgtest.NewCatalogTester().
FromDirectory(t, fixtureDir).
Expects(tt.expectedPackages, tt.expectedRelationships).
IgnoreLocationLayer().
IgnorePackageFields("FoundBy", "Locations"). // These are set by the cataloger
WithCompareOptions(
// Ignore MetadataHash as it's computed dynamically
cmpopts.IgnoreFields(pkg.GGUFFileHeader{}, "MetadataHash"),
)
tester.TestCataloger(t, NewGGUFCataloger())
IgnorePackageFields("FoundBy", "Locations").
TestCataloger(t, NewGGUFCataloger())
})
}
}

View File

@ -5,9 +5,9 @@ import (
"github.com/anchore/syft/syft/pkg"
)
func newGGUFPackage(metadata *pkg.GGUFFileHeader, version string, locations ...file.Location) pkg.Package {
func newGGUFPackage(metadata *pkg.GGUFFileHeader, modelName, version, license string, locations ...file.Location) pkg.Package {
p := pkg.Package{
Name: metadata.ModelName,
Name: modelName,
Version: version,
Locations: file.NewLocationSet(locations...),
Type: pkg.ModelPkg,
@ -18,8 +18,8 @@ func newGGUFPackage(metadata *pkg.GGUFFileHeader, version string, locations ...f
}
// Add license to the package if present in metadata
if metadata.License != "" {
p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil))
if license != "" {
p.Licenses.Add(pkg.NewLicenseFromFields(license, "", nil))
}
p.SetID()

View File

@ -3,121 +3,158 @@ package ai
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
)
func TestNewGGUFPackage(t *testing.T) {
tests := []struct {
name string
metadata *pkg.GGUFFileHeader
input struct {
modelName string
version string
license string
locations []file.Location
checkFunc func(t *testing.T, p pkg.Package)
}
expected pkg.Package
}{
{
name: "complete GGUF package with all fields",
input: struct {
modelName string
version string
license string
locations []file.Location
}{
modelName: "llama3-8b",
version: "3.0",
license: "Apache-2.0",
locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
},
metadata: &pkg.GGUFFileHeader{
ModelName: "llama3-8b-instruct",
License: "Apache-2.0",
Architecture: "llama",
Quantization: "Q4_K_M",
Parameters: 8030000000,
GGUFVersion: 3,
TensorCount: 291,
Header: map[string]any{},
Header: map[string]any{
"general.random_kv": "foobar",
},
locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
checkFunc: func(t *testing.T, p pkg.Package) {
if d := cmp.Diff("llama3-8b-instruct", p.Name); d != "" {
t.Errorf("Name mismatch (-want +got):\n%s", d)
}
if d := cmp.Diff("3.0", p.Version); d != "" {
t.Errorf("Version mismatch (-want +got):\n%s", d)
}
if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
t.Errorf("Type mismatch (-want +got):\n%s", d)
}
assert.Empty(t, p.PURL, "PURL should not be set for model packages")
assert.Len(t, p.Licenses.ToSlice(), 1)
if d := cmp.Diff("Apache-2.0", p.Licenses.ToSlice()[0].Value); d != "" {
t.Errorf("License value mismatch (-want +got):\n%s", d)
}
assert.NotEmpty(t, p.ID())
},
expected: pkg.Package{
Name: "llama3-8b",
Version: "3.0",
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
),
Metadata: pkg.GGUFFileHeader{
Architecture: "llama",
Quantization: "Q4_K_M",
Parameters: 8030000000,
GGUFVersion: 3,
TensorCount: 291,
Header: map[string]any{
"general.random_kv": "foobar",
},
},
Locations: file.NewLocationSet(file.NewLocation("/models/llama3-8b.gguf")),
},
},
{
name: "minimal GGUF package",
input: struct {
modelName string
version string
license string
locations []file.Location
}{
modelName: "gpt2-small",
version: "1.0",
license: "MIT",
locations: []file.Location{file.NewLocation("/models/simple.gguf")},
},
metadata: &pkg.GGUFFileHeader{
ModelName: "simple-model",
Architecture: "gpt2",
GGUFVersion: 3,
TensorCount: 50,
},
locations: []file.Location{file.NewLocation("/models/simple.gguf")},
checkFunc: func(t *testing.T, p pkg.Package) {
if d := cmp.Diff("simple-model", p.Name); d != "" {
t.Errorf("Name mismatch (-want +got):\n%s", d)
}
if d := cmp.Diff("1.0", p.Version); d != "" {
t.Errorf("Version mismatch (-want +got):\n%s", d)
}
if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
t.Errorf("Type mismatch (-want +got):\n%s", d)
}
assert.Empty(t, p.PURL, "PURL should not be set for model packages")
assert.Empty(t, p.Licenses.ToSlice())
expected: pkg.Package{
Name: "gpt2-small",
Version: "1.0",
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(
pkg.NewLicenseFromFields("MIT", "", nil),
),
Metadata: pkg.GGUFFileHeader{
Architecture: "gpt2",
GGUFVersion: 3,
TensorCount: 50,
},
Locations: file.NewLocationSet(file.NewLocation("/models/simple.gguf")),
},
},
{
name: "GGUF package with multiple locations",
version: "1.5",
metadata: &pkg.GGUFFileHeader{
ModelName: "multi-location-model",
Architecture: "llama",
GGUFVersion: 3,
TensorCount: 150,
},
input: struct {
modelName string
version string
license string
locations []file.Location
}{
modelName: "llama-multi",
version: "2.0",
license: "Apache-2.0",
locations: []file.Location{
file.NewLocation("/models/model1.gguf"),
file.NewLocation("/models/model2.gguf"),
},
checkFunc: func(t *testing.T, p pkg.Package) {
assert.Len(t, p.Locations.ToSlice(), 2)
},
metadata: &pkg.GGUFFileHeader{
Architecture: "llama",
GGUFVersion: 3,
TensorCount: 150,
},
expected: pkg.Package{
Name: "llama-multi",
Version: "2.0",
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
),
Metadata: pkg.GGUFFileHeader{
Architecture: "llama",
GGUFVersion: 3,
TensorCount: 150,
},
Locations: file.NewLocationSet(
file.NewLocation("/models/model1.gguf"),
file.NewLocation("/models/model2.gguf"),
),
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
p := newGGUFPackage(tt.metadata, tt.version, tt.locations...)
actual := newGGUFPackage(
tt.metadata,
tt.input.modelName,
tt.input.version,
tt.input.license,
tt.input.locations...,
)
if d := cmp.Diff(tt.metadata.ModelName, p.Name); d != "" {
t.Errorf("Name mismatch (-want +got):\n%s", d)
}
if d := cmp.Diff(tt.version, p.Version); d != "" {
t.Errorf("Version mismatch (-want +got):\n%s", d)
}
if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
t.Errorf("Type mismatch (-want +got):\n%s", d)
}
// Verify metadata is attached
metadata, ok := p.Metadata.(pkg.GGUFFileHeader)
// Verify metadata type
_, ok := actual.Metadata.(pkg.GGUFFileHeader)
require.True(t, ok, "metadata should be GGUFFileHeader")
if d := cmp.Diff(*tt.metadata, metadata); d != "" {
t.Errorf("Metadata mismatch (-want +got):\n%s", d)
}
if tt.checkFunc != nil {
tt.checkFunc(t, p)
}
// Use AssertPackagesEqual for comprehensive comparison
pkgtest.AssertPackagesEqual(t, tt.expected, actual)
})
}
}

View File

@ -67,26 +67,26 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
// Convert to syft metadata structure
syftMetadata := &pkg.GGUFFileHeader{
ModelName: metadata.Name,
License: metadata.License,
Architecture: metadata.Architecture,
Quantization: metadata.FileTypeDescriptor,
Parameters: uint64(metadata.Parameters),
GGUFVersion: uint32(ggufFile.Header.Version),
TensorCount: ggufFile.Header.TensorCount,
Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
MetadataKeyValuesHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
}
// If model name is not in metadata, use filename
if syftMetadata.ModelName == "" {
syftMetadata.ModelName = extractModelNameFromPath(reader.Path())
if metadata.Name == "" {
metadata.Name = extractModelNameFromPath(reader.Path())
}
// Create package from metadata
p := newGGUFPackage(
syftMetadata,
metadata.Name,
modelVersion,
metadata.License,
reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
)

View File

@ -1,41 +0,0 @@
package ai
import (
"fmt"
"os"
gguf_parser "github.com/gpustack/gguf-parser-go"
)
func main() {
// Create a test GGUF file
data := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "test-model").
build()
// Write to temp file
tempFile, err := os.CreateTemp("", "test-*.gguf")
if err != nil {
panic(err)
}
defer os.Remove(tempFile.Name())
if _, err := tempFile.Write(data); err != nil {
panic(err)
}
tempFile.Close()
fmt.Printf("Wrote %d bytes to %s\n", len(data), tempFile.Name())
// Try to parse it
fmt.Println("Attempting to parse...")
gf, err := gguf_parser.ParseGGUFFile(tempFile.Name(), gguf_parser.SkipLargeMetadata())
if err != nil {
fmt.Printf("Parse error: %v\n", err)
return
}
fmt.Printf("Success! Model: %s\n", gf.Metadata().Name)
}

View File

@ -6,6 +6,7 @@ import (
)
// GGUF type constants for test builder
// https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
const (
ggufMagic = 0x46554747 // "GGUF" in little-endian
ggufTypeUint8 = 0

View File

@ -3,19 +3,14 @@ package pkg
// GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
// loading and saving of models, particularly quantized large language models.
// The Model Name, License, and Version fields have all been lifted up to be on the syft Package.
type GGUFFileHeader struct {
// GGUFVersion is the GGUF format version (e.g., 3)
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
// ModelName is the name of the model (from general.name or filename)
ModelName string `json:"modelName" cyclonedx:"modelName"`
// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
// License is the license identifier (from general.license if present)
License string `json:"license,omitempty" cyclonedx:"license"`
// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
@ -33,10 +28,10 @@ type GGUFFileHeader struct {
// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
// MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.
// MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.
// This hash is computed over the complete header metadata (including the fields extracted
// into typed fields above) and provides a stable identifier for the model configuration
// across different file locations or remotes. It allows matching identical models even
// when stored in different repositories or with different filenames.
MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
MetadataKeyValuesHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
}