fix: pr comments

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
Christopher Phillips 2025-11-12 23:56:18 -05:00
parent 9b31c0480f
commit 6daea43c32
9 changed files with 56 additions and 349 deletions

go.mod
View File

@ -286,7 +286,10 @@ require (
modernc.org/memory v1.11.0 // indirect
)
require github.com/gpustack/gguf-parser-go v0.22.1
require (
github.com/cespare/xxhash/v2 v2.3.0
github.com/gpustack/gguf-parser-go v0.22.1
)
require (
cyphar.com/go-pathrs v0.2.1 // indirect

go.sum
View File

@ -229,7 +229,6 @@ github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqy
github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=

View File

@ -27,7 +27,6 @@ func AllTypes() []any {
pkg.ELFBinaryPackageNoteJSONPayload{},
pkg.ElixirMixLockEntry{},
pkg.ErlangRebarLockEntry{},
pkg.GGUFFileHeader{},
pkg.GitHubActionsUseStatement{},
pkg.GolangBinaryBuildinfoEntry{},
pkg.GolangModuleEntry{},

View File

@ -7,7 +7,6 @@ import (
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/pkg"
@ -15,72 +14,6 @@ import (
)
func TestGGUFCataloger_Globs(t *testing.T) {
tests := []struct {
name string
setup func(t *testing.T) string // returns fixture directory
expected []string
}{
{
name: "finds GGUF files in root",
setup: func(t *testing.T) string {
dir := t.TempDir()
createTestGGUFInDir(t, dir, "model1.gguf")
createTestGGUFInDir(t, dir, "model2.gguf")
return dir
},
expected: []string{
"model1.gguf",
"model2.gguf",
},
},
{
name: "finds GGUF files in subdirectories",
setup: func(t *testing.T) string {
dir := t.TempDir()
modelsDir := filepath.Join(dir, "models")
os.MkdirAll(modelsDir, 0755)
createTestGGUFInDir(t, modelsDir, "llama.gguf")
deepDir := filepath.Join(dir, "deep", "nested", "path")
os.MkdirAll(deepDir, 0755)
createTestGGUFInDir(t, deepDir, "mistral.gguf")
return dir
},
expected: []string{
"models/llama.gguf",
"deep/nested/path/mistral.gguf",
},
},
{
name: "ignores non-GGUF files",
setup: func(t *testing.T) string {
dir := t.TempDir()
createTestGGUFInDir(t, dir, "model.gguf")
// Create non-GGUF files
os.WriteFile(filepath.Join(dir, "readme.txt"), []byte("readme"), 0644)
os.WriteFile(filepath.Join(dir, "model.bin"), []byte("binary"), 0644)
os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644)
return dir
},
expected: []string{
"model.gguf",
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
fixtureDir := tt.setup(t)
pkgtest.NewCatalogTester().
FromDirectory(t, fixtureDir).
ExpectsResolverContentQueries(tt.expected).
TestCataloger(t, NewGGUFCataloger())
})
}
}
func TestGGUFCataloger_Integration(t *testing.T) {
@ -117,114 +50,15 @@ func TestGGUFCataloger_Integration(t *testing.T) {
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
),
Metadata: pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "llama3-8b",
ModelVersion: "3.0",
License: "Apache-2.0",
Architecture: "llama",
Quantization: "Unknown",
Parameters: 0,
GGUFVersion: 3,
TensorCount: 0,
Header: map[string]interface{}{},
TruncatedHeader: false,
},
},
},
expectedRelationships: nil,
},
{
name: "catalog multiple GGUF files",
setup: func(t *testing.T) string {
dir := t.TempDir()
// Create first model
data1 := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "model1").
withStringKV("general.version", "1.0").
build()
os.WriteFile(filepath.Join(dir, "model1.gguf"), data1, 0644)
// Create second model
data2 := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "mistral").
withStringKV("general.name", "model2").
withStringKV("general.version", "2.0").
build()
os.WriteFile(filepath.Join(dir, "model2.gguf"), data2, 0644)
return dir
},
expectedPackages: []pkg.Package{
{
Name: "model1",
Version: "1.0",
Type: pkg.ModelPkg,
Metadata: pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "model1",
ModelVersion: "1.0",
Architecture: "llama",
Quantization: "Unknown",
GGUFVersion: 3,
TensorCount: 0,
Header: map[string]interface{}{},
TruncatedHeader: false,
},
},
{
Name: "model2",
Version: "2.0",
Type: pkg.ModelPkg,
Metadata: pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "model2",
ModelVersion: "2.0",
Architecture: "mistral",
Quantization: "Unknown",
GGUFVersion: 3,
TensorCount: 0,
Header: map[string]interface{}{},
TruncatedHeader: false,
},
},
},
expectedRelationships: nil,
},
{
name: "catalog GGUF in nested directories",
setup: func(t *testing.T) string {
dir := t.TempDir()
nestedDir := filepath.Join(dir, "models", "quantized")
os.MkdirAll(nestedDir, 0755)
data := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "qwen").
withStringKV("general.name", "qwen-nested").
build()
os.WriteFile(filepath.Join(nestedDir, "qwen.gguf"), data, 0644)
return dir
},
expectedPackages: []pkg.Package{
{
Name: "qwen-nested",
Version: unknownGGUFData,
Type: pkg.ModelPkg,
Metadata: pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "qwen-nested",
ModelVersion: unknownGGUFData,
Architecture: "qwen",
Quantization: "Unknown",
GGUFVersion: 3,
TensorCount: 0,
Header: map[string]interface{}{},
TruncatedHeader: false,
GGUFVersion: 3,
TensorCount: 0,
Header: map[string]interface{}{},
},
},
},
@ -252,122 +86,7 @@ func TestGGUFCataloger_Integration(t *testing.T) {
}
}
func TestGGUFCataloger_SkipsInvalidFiles(t *testing.T) {
dir := t.TempDir()
// Create a valid GGUF
validData := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "valid-model").
build()
os.WriteFile(filepath.Join(dir, "valid.gguf"), validData, 0644)
// Create an invalid GGUF (wrong magic)
invalidData := newTestGGUFBuilder().buildInvalidMagic()
os.WriteFile(filepath.Join(dir, "invalid.gguf"), invalidData, 0644)
// Create a truncated GGUF
os.WriteFile(filepath.Join(dir, "truncated.gguf"), []byte{0x47}, 0644)
// Catalog should succeed and only return the valid package
tester := pkgtest.NewCatalogTester().
FromDirectory(t, dir).
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
// Should only find the valid model
require.Len(t, pkgs, 1)
assert.Equal(t, "valid-model", pkgs[0].Name)
})
tester.TestCataloger(t, NewGGUFCataloger())
}
func TestGGUFCataloger_Name(t *testing.T) {
cataloger := NewGGUFCataloger()
assert.Equal(t, "gguf-cataloger", cataloger.Name())
}
func TestGGUFCataloger_EmptyDirectory(t *testing.T) {
dir := t.TempDir()
// Create a subdirectory to ensure glob still runs
os.MkdirAll(filepath.Join(dir, "models"), 0755)
tester := pkgtest.NewCatalogTester().
FromDirectory(t, dir).
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, rels []artifact.Relationship) {
assert.Empty(t, pkgs)
assert.Empty(t, rels)
})
tester.TestCataloger(t, NewGGUFCataloger())
}
func TestGGUFCataloger_MixedFiles(t *testing.T) {
dir := t.TempDir()
// Create GGUF file
ggufData := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "test-model").
build()
os.WriteFile(filepath.Join(dir, "model.gguf"), ggufData, 0644)
// Create other file types
os.WriteFile(filepath.Join(dir, "README.md"), []byte("# Models"), 0644)
os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644)
os.WriteFile(filepath.Join(dir, "weights.bin"), []byte("weights"), 0644)
os.MkdirAll(filepath.Join(dir, "subdir"), 0755)
tester := pkgtest.NewCatalogTester().
FromDirectory(t, dir).
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
// Should only find the GGUF model
require.Len(t, pkgs, 1)
assert.Equal(t, "test-model", pkgs[0].Name)
assert.Equal(t, pkg.ModelPkg, pkgs[0].Type)
})
tester.TestCataloger(t, NewGGUFCataloger())
}
func TestGGUFCataloger_CaseInsensitiveGlob(t *testing.T) {
// Test that the glob pattern is case-sensitive (as expected for **/*.gguf)
dir := t.TempDir()
// Create lowercase .gguf
data := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "lowercase").
build()
os.WriteFile(filepath.Join(dir, "model.gguf"), data, 0644)
// Create uppercase .GGUF (should not match the glob)
os.WriteFile(filepath.Join(dir, "MODEL.GGUF"), data, 0644)
tester := pkgtest.NewCatalogTester().
FromDirectory(t, dir).
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
// Depending on filesystem case-sensitivity, we may get 1 or 2 packages
// On case-insensitive filesystems (macOS), both might match
// On case-sensitive filesystems (Linux), only lowercase matches
assert.GreaterOrEqual(t, len(pkgs), 1, "should find at least the lowercase file")
})
tester.TestCataloger(t, NewGGUFCataloger())
}
// createTestGGUFInDir creates a minimal test GGUF file in the specified directory
func createTestGGUFInDir(t *testing.T, dir, filename string) {
t.Helper()
data := newTestGGUFBuilder().
withVersion(3).
withStringKV("general.architecture", "llama").
withStringKV("general.name", "test-model").
build()
path := filepath.Join(dir, filename)
err := os.WriteFile(path, data, 0644)
require.NoError(t, err)
}

View File

@ -1,10 +1,11 @@
package ai
import (
"crypto/sha256"
"encoding/json"
"fmt"
"github.com/cespare/xxhash/v2"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
@ -48,7 +49,6 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string {
GGUFVersion uint32
TensorCount uint64
}{
Format: metadata.ModelFormat,
Name: metadata.ModelName,
Version: metadata.ModelVersion,
Architecture: metadata.Architecture,
@ -63,7 +63,7 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string {
return ""
}
// Compute SHA256 hash
hash := sha256.Sum256(jsonBytes)
return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars)
// Compute xxhash
hash := xxhash.Sum64(jsonBytes)
return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
}
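For reference, a minimal sketch (not part of the diff) of what the new hashing path produces: xxhash.Sum64 yields a 64-bit digest, and the %016x verb always renders it as 16 zero-padded hex characters, the same width the old 8-byte SHA-256 prefix produced.
package main
import (
	"fmt"
	"github.com/cespare/xxhash/v2"
)
func main() {
	// Illustrative only: jsonBytes stands in for the marshaled metadata struct above.
	jsonBytes := []byte(`{"Name":"llama3-8b","Architecture":"llama"}`)
	fmt.Printf("%016x\n", xxhash.Sum64(jsonBytes)) // always 16 hex chars
}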

View File

@ -21,17 +21,15 @@ func TestNewGGUFPackage(t *testing.T) {
{
name: "complete GGUF package with all fields",
metadata: &pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "llama3-8b-instruct",
ModelVersion: "3.0",
License: "Apache-2.0",
Architecture: "llama",
Quantization: "Q4_K_M",
Parameters: 8030000000,
GGUFVersion: 3,
TensorCount: 291,
Header: map[string]any{},
TruncatedHeader: false,
GGUFVersion: 3,
TensorCount: 291,
Header: map[string]any{},
},
locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
checkFunc: func(t *testing.T, p pkg.Package) {
@ -55,7 +53,6 @@ func TestNewGGUFPackage(t *testing.T) {
{
name: "minimal GGUF package",
metadata: &pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "simple-model",
ModelVersion: "1.0",
Architecture: "gpt2",
@ -80,7 +77,6 @@ func TestNewGGUFPackage(t *testing.T) {
{
name: "GGUF package with multiple locations",
metadata: &pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: "multi-location-model",
ModelVersion: "1.5",
Architecture: "llama",

View File

@ -14,19 +14,14 @@ const (
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
)
// ggufHeaderReader reads just the header portion of a GGUF file efficiently
type ggufHeaderReader struct {
reader io.Reader
}
// readHeader reads only the GGUF header (metadata) without reading tensor data
// This is much more efficient than reading the entire file
// The reader should be wrapped with io.LimitedReader to prevent OOM issues
func (r *ggufHeaderReader) readHeader() ([]byte, error) {
func readHeader(r io.Reader) ([]byte, error) {
// Read initial chunk to determine header size
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
if _, err := io.ReadFull(r.reader, initialBuf); err != nil {
if _, err := io.ReadFull(r, initialBuf); err != nil {
return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
}
@ -45,7 +40,7 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) {
// The LimitedReader will return EOF once maxHeaderSize is reached
buf := make([]byte, 64*1024) // 64KB chunks
for {
n, err := r.reader.Read(buf)
n, err := r.Read(buf)
if n > 0 {
headerData = append(headerData, buf[:n]...)
}
@ -65,24 +60,14 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) {
func convertGGUFMetadataKVs(kvs gguf_parser.GGUFMetadataKVs) map[string]interface{} {
result := make(map[string]interface{})
// Limit KV pairs to avoid bloat
const maxKVPairs = 200
count := 0
for _, kv := range kvs {
if count >= maxKVPairs {
break
}
// Skip standard fields that are extracted separately
switch kv.Key {
case "general.architecture", "general.name", "general.license",
"general.version", "general.parameter_count", "general.quantization":
continue
}
result[kv.Key] = kv.Value
count++
}
return result
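To make the effect of the simplified conversion concrete, a hypothetical example (not from the commit; it assumes gguf_parser.GGUFMetadataKV exposes Key and Value as used above): standard fields stay out of the Header map, and everything else is carried through with no pair-count cap.
// Hypothetical usage, not part of this commit.
kvs := gguf_parser.GGUFMetadataKVs{
	{Key: "general.name", Value: "llama3-8b"},          // extracted into ModelName, skipped here
	{Key: "general.architecture", Value: "llama"},      // extracted into Architecture, skipped here
	{Key: "llama.context_length", Value: uint32(8192)}, // retained in the Header map
}
header := convertGGUFMetadataKVs(kvs)
// header == map[string]interface{}{"llama.context_length": uint32(8192)}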

View File

@ -2,15 +2,19 @@ package ai
import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"github.com/cespare/xxhash/v2"
gguf_parser "github.com/gpustack/gguf-parser-go"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file"
@ -18,8 +22,6 @@ import (
"github.com/anchore/syft/syft/pkg/cataloger/generic"
)
const unknownGGUFData = "unknown"
// parseGGUFModel parses a GGUF model file and returns the discovered package.
// This implementation only reads the header portion of the file, not the entire model.
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
@ -28,8 +30,7 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
// Read and validate the GGUF file header using LimitedReader to prevent OOM
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
headerReader := &ggufHeaderReader{reader: limitedReader}
headerData, err := headerReader.readHeader()
headerData, err := readHeader(limitedReader)
if err != nil {
return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
}
@ -63,7 +64,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
// Convert to syft metadata structure
syftMetadata := &pkg.GGUFFileHeader{
ModelFormat: "gguf",
ModelName: metadata.Name,
ModelVersion: extractVersion(ggufFile.Header.MetadataKV),
License: metadata.License,
@ -71,10 +71,9 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
Quantization: metadata.FileTypeDescriptor,
Parameters: uint64(metadata.Parameters),
GGUFVersion: uint32(ggufFile.Header.Version),
TensorCount: ggufFile.Header.TensorCount,
Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
TruncatedHeader: false, // We read the full header
Hash: "", // Will be computed in newGGUFPackage
TensorCount: ggufFile.Header.TensorCount,
Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
}
// If model name is not in metadata, use filename
@ -82,11 +81,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
syftMetadata.ModelName = extractModelNameFromPath(reader.Path())
}
// If version is still unknown, try to infer from name
if syftMetadata.ModelVersion == unknownGGUFData {
syftMetadata.ModelVersion = extractVersionFromName(syftMetadata.ModelName)
}
// Create package from metadata
p := newGGUFPackage(
syftMetadata,
@ -96,6 +90,27 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
}
// computeKVMetadataHash computes a stable hash of the KV metadata for use as a global identifier
func computeKVMetadataHash(metadata gguf_parser.GGUFMetadataKVs) string {
// Sort the KV pairs by key for stable hashing
sortedKVs := make([]gguf_parser.GGUFMetadataKV, len(metadata))
copy(sortedKVs, metadata)
sort.Slice(sortedKVs, func(i, j int) bool {
return sortedKVs[i].Key < sortedKVs[j].Key
})
// Marshal sorted KVs to JSON for stable hashing
jsonBytes, err := json.Marshal(sortedKVs)
if err != nil {
log.Debugf("failed to marshal metadata for hashing: %v", err)
return ""
}
// Compute xxhash
hash := xxhash.Sum64(jsonBytes)
return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
}
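As a quick illustration of why the pairs are sorted before marshaling (a hypothetical snippet, not part of the commit; it assumes GGUFMetadataKV is constructible with Key and Value as used above), the identifier should be independent of the on-disk KV order:
// Hypothetical check, not part of this commit.
a := gguf_parser.GGUFMetadataKVs{
	{Key: "general.name", Value: "m"},
	{Key: "general.architecture", Value: "llama"},
}
b := gguf_parser.GGUFMetadataKVs{
	{Key: "general.architecture", Value: "llama"},
	{Key: "general.name", Value: "m"},
}
fmt.Println(computeKVMetadataHash(a) == computeKVMetadataHash(b)) // expected: true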
// extractVersion attempts to extract version from metadata KV pairs
func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
for _, kv := range kvs {
@ -105,14 +120,7 @@ func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
}
}
}
return unknownGGUFData
}
// extractVersionFromName tries to extract version from model name
func extractVersionFromName(_ string) string {
// Look for version patterns like "v1.0", "1.5b", "3.0", etc.
// For now, return unknown - this could be enhanced with regex
return unknownGGUFData
return ""
}
// extractModelNameFromPath extracts the model name from the file path

View File

@ -4,8 +4,8 @@ package pkg
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
// loading and saving of models, particularly quantized large language models.
type GGUFFileHeader struct {
// ModelFormat is always "gguf"
ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"`
// GGUFVersion is the GGUF format version (e.g., 3)
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
// ModelName is the name of the model (from general.name or filename)
ModelName string `json:"modelName" cyclonedx:"modelName"`
@ -16,15 +16,9 @@ type GGUFFileHeader struct {
// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
// Hash is a content hash of the metadata (for stable global identifiers across remotes)
Hash string `json:"hash,omitempty" cyclonedx:"hash"`
// License is the license identifier (from general.license if present)
License string `json:"license,omitempty" cyclonedx:"license"`
// GGUFVersion is the GGUF format version (e.g., 3)
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
@ -42,6 +36,10 @@ type GGUFFileHeader struct {
// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
// TruncatedHeader indicates if the header was truncated during parsing (for very large headers)
TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"`
// MetadataHash is an xxhash (XXH64) hash of all key-value pairs from the GGUF header metadata.
// This hash is computed over the complete header metadata (including the fields extracted
// into typed fields above) and provides a stable identifier for the model configuration
// across different file locations or remotes. It allows matching identical models even
// when stored in different repositories or with different filenames.
MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
}
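For orientation, a hypothetical populated value (illustrative contents only; field names follow the struct above and its usage elsewhere in this commit):
// Hypothetical example, not part of this commit.
hdr := pkg.GGUFFileHeader{
	GGUFVersion:  3,
	ModelName:    "llama3-8b-instruct",
	ModelVersion: "3.0",
	License:      "Apache-2.0",
	Architecture: "llama",
	Quantization: "Q4_K_M",
	Parameters:   8030000000,
	TensorCount:  291,
	Header:       map[string]interface{}{"llama.context_length": uint32(8192)},
	MetadataHash: "<xxhash of the full KV metadata, 16 hex chars>",
}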