fix: pr comments

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>

parent 9b31c0480f
commit 6daea43c32
go.mod (5 changes)

@@ -286,7 +286,10 @@ require (
 	modernc.org/memory v1.11.0 // indirect
 )
 
-require github.com/gpustack/gguf-parser-go v0.22.1
+require (
+	github.com/cespare/xxhash/v2 v2.3.0
+	github.com/gpustack/gguf-parser-go v0.22.1
+)
 
 require (
 	cyphar.com/go-pathrs v0.2.1 // indirect
go.sum (1 change)

@@ -229,7 +229,6 @@ github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqy
 github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
 github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
-github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
 github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
 github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
@@ -27,7 +27,6 @@ func AllTypes() []any {
 		pkg.ELFBinaryPackageNoteJSONPayload{},
 		pkg.ElixirMixLockEntry{},
 		pkg.ErlangRebarLockEntry{},
-		pkg.GGUFFileHeader{},
 		pkg.GitHubActionsUseStatement{},
 		pkg.GolangBinaryBuildinfoEntry{},
 		pkg.GolangModuleEntry{},
@@ -7,7 +7,6 @@ import (
 
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 
 	"github.com/anchore/syft/syft/artifact"
 	"github.com/anchore/syft/syft/pkg"
@@ -15,72 +14,6 @@ import (
 )
 
 func TestGGUFCataloger_Globs(t *testing.T) {
-	tests := []struct {
-		name     string
-		setup    func(t *testing.T) string // returns fixture directory
-		expected []string
-	}{
-		{
-			name: "finds GGUF files in root",
-			setup: func(t *testing.T) string {
-				dir := t.TempDir()
-				createTestGGUFInDir(t, dir, "model1.gguf")
-				createTestGGUFInDir(t, dir, "model2.gguf")
-				return dir
-			},
-			expected: []string{
-				"model1.gguf",
-				"model2.gguf",
-			},
-		},
-		{
-			name: "finds GGUF files in subdirectories",
-			setup: func(t *testing.T) string {
-				dir := t.TempDir()
-				modelsDir := filepath.Join(dir, "models")
-				os.MkdirAll(modelsDir, 0755)
-				createTestGGUFInDir(t, modelsDir, "llama.gguf")
-
-				deepDir := filepath.Join(dir, "deep", "nested", "path")
-				os.MkdirAll(deepDir, 0755)
-				createTestGGUFInDir(t, deepDir, "mistral.gguf")
-
-				return dir
-			},
-			expected: []string{
-				"models/llama.gguf",
-				"deep/nested/path/mistral.gguf",
-			},
-		},
-		{
-			name: "ignores non-GGUF files",
-			setup: func(t *testing.T) string {
-				dir := t.TempDir()
-				createTestGGUFInDir(t, dir, "model.gguf")
-
-				// Create non-GGUF files
-				os.WriteFile(filepath.Join(dir, "readme.txt"), []byte("readme"), 0644)
-				os.WriteFile(filepath.Join(dir, "model.bin"), []byte("binary"), 0644)
-				os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644)
-
-				return dir
-			},
-			expected: []string{
-				"model.gguf",
-			},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			fixtureDir := tt.setup(t)
-
-			pkgtest.NewCatalogTester().
-				FromDirectory(t, fixtureDir).
-				ExpectsResolverContentQueries(tt.expected).
-				TestCataloger(t, NewGGUFCataloger())
-		})
-	}
 }
 
 func TestGGUFCataloger_Integration(t *testing.T) {
@@ -117,114 +50,15 @@ func TestGGUFCataloger_Integration(t *testing.T) {
 						pkg.NewLicenseFromFields("Apache-2.0", "", nil),
 					),
 					Metadata: pkg.GGUFFileHeader{
-						ModelFormat:     "gguf",
 						ModelName:       "llama3-8b",
 						ModelVersion:    "3.0",
 						License:         "Apache-2.0",
 						Architecture:    "llama",
 						Quantization:    "Unknown",
 						Parameters:      0,
 						GGUFVersion:     3,
 						TensorCount:     0,
 						Header:          map[string]interface{}{},
-						TruncatedHeader: false,
-					},
-				},
-			},
-			expectedRelationships: nil,
-		},
-		{
-			name: "catalog multiple GGUF files",
-			setup: func(t *testing.T) string {
-				dir := t.TempDir()
-
-				// Create first model
-				data1 := newTestGGUFBuilder().
-					withVersion(3).
-					withStringKV("general.architecture", "llama").
-					withStringKV("general.name", "model1").
-					withStringKV("general.version", "1.0").
-					build()
-				os.WriteFile(filepath.Join(dir, "model1.gguf"), data1, 0644)
-
-				// Create second model
-				data2 := newTestGGUFBuilder().
-					withVersion(3).
-					withStringKV("general.architecture", "mistral").
-					withStringKV("general.name", "model2").
-					withStringKV("general.version", "2.0").
-					build()
-				os.WriteFile(filepath.Join(dir, "model2.gguf"), data2, 0644)
-
-				return dir
-			},
-			expectedPackages: []pkg.Package{
-				{
-					Name:    "model1",
-					Version: "1.0",
-					Type:    pkg.ModelPkg,
-					Metadata: pkg.GGUFFileHeader{
-						ModelFormat:     "gguf",
-						ModelName:       "model1",
-						ModelVersion:    "1.0",
-						Architecture:    "llama",
-						Quantization:    "Unknown",
-						GGUFVersion:     3,
-						TensorCount:     0,
-						Header:          map[string]interface{}{},
-						TruncatedHeader: false,
-					},
-				},
-				{
-					Name:    "model2",
-					Version: "2.0",
-					Type:    pkg.ModelPkg,
-					Metadata: pkg.GGUFFileHeader{
-						ModelFormat:     "gguf",
-						ModelName:       "model2",
-						ModelVersion:    "2.0",
-						Architecture:    "mistral",
-						Quantization:    "Unknown",
-						GGUFVersion:     3,
-						TensorCount:     0,
-						Header:          map[string]interface{}{},
-						TruncatedHeader: false,
-					},
-				},
-			},
-			expectedRelationships: nil,
-		},
-		{
-			name: "catalog GGUF in nested directories",
-			setup: func(t *testing.T) string {
-				dir := t.TempDir()
-				nestedDir := filepath.Join(dir, "models", "quantized")
-				os.MkdirAll(nestedDir, 0755)
-
-				data := newTestGGUFBuilder().
-					withVersion(3).
-					withStringKV("general.architecture", "qwen").
-					withStringKV("general.name", "qwen-nested").
-					build()
-
-				os.WriteFile(filepath.Join(nestedDir, "qwen.gguf"), data, 0644)
-				return dir
-			},
-			expectedPackages: []pkg.Package{
-				{
-					Name:    "qwen-nested",
-					Version: unknownGGUFData,
-					Type:    pkg.ModelPkg,
-					Metadata: pkg.GGUFFileHeader{
-						ModelFormat:     "gguf",
-						ModelName:       "qwen-nested",
-						ModelVersion:    unknownGGUFData,
-						Architecture:    "qwen",
-						Quantization:    "Unknown",
-						GGUFVersion:     3,
-						TensorCount:     0,
-						Header:          map[string]interface{}{},
-						TruncatedHeader: false,
 					},
 				},
 			},
@@ -252,122 +86,7 @@ func TestGGUFCataloger_Integration(t *testing.T) {
 	}
 }
 
-func TestGGUFCataloger_SkipsInvalidFiles(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create a valid GGUF
-	validData := newTestGGUFBuilder().
-		withVersion(3).
-		withStringKV("general.architecture", "llama").
-		withStringKV("general.name", "valid-model").
-		build()
-	os.WriteFile(filepath.Join(dir, "valid.gguf"), validData, 0644)
-
-	// Create an invalid GGUF (wrong magic)
-	invalidData := newTestGGUFBuilder().buildInvalidMagic()
-	os.WriteFile(filepath.Join(dir, "invalid.gguf"), invalidData, 0644)
-
-	// Create a truncated GGUF
-	os.WriteFile(filepath.Join(dir, "truncated.gguf"), []byte{0x47}, 0644)
-
-	// Catalog should succeed and only return the valid package
-	tester := pkgtest.NewCatalogTester().
-		FromDirectory(t, dir).
-		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
-			// Should only find the valid model
-			require.Len(t, pkgs, 1)
-			assert.Equal(t, "valid-model", pkgs[0].Name)
-		})
-
-	tester.TestCataloger(t, NewGGUFCataloger())
-}
-
 func TestGGUFCataloger_Name(t *testing.T) {
 	cataloger := NewGGUFCataloger()
 	assert.Equal(t, "gguf-cataloger", cataloger.Name())
 }
-
-func TestGGUFCataloger_EmptyDirectory(t *testing.T) {
-	dir := t.TempDir()
-	// Create a subdirectory to ensure glob still runs
-	os.MkdirAll(filepath.Join(dir, "models"), 0755)
-
-	tester := pkgtest.NewCatalogTester().
-		FromDirectory(t, dir).
-		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, rels []artifact.Relationship) {
-			assert.Empty(t, pkgs)
-			assert.Empty(t, rels)
-		})
-
-	tester.TestCataloger(t, NewGGUFCataloger())
-}
-
-func TestGGUFCataloger_MixedFiles(t *testing.T) {
-	dir := t.TempDir()
-
-	// Create GGUF file
-	ggufData := newTestGGUFBuilder().
-		withVersion(3).
-		withStringKV("general.architecture", "llama").
-		withStringKV("general.name", "test-model").
-		build()
-	os.WriteFile(filepath.Join(dir, "model.gguf"), ggufData, 0644)
-
-	// Create other file types
-	os.WriteFile(filepath.Join(dir, "README.md"), []byte("# Models"), 0644)
-	os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644)
-	os.WriteFile(filepath.Join(dir, "weights.bin"), []byte("weights"), 0644)
-	os.MkdirAll(filepath.Join(dir, "subdir"), 0755)
-
-	tester := pkgtest.NewCatalogTester().
-		FromDirectory(t, dir).
-		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
-			// Should only find the GGUF model
-			require.Len(t, pkgs, 1)
-			assert.Equal(t, "test-model", pkgs[0].Name)
-			assert.Equal(t, pkg.ModelPkg, pkgs[0].Type)
-		})
-
-	tester.TestCataloger(t, NewGGUFCataloger())
-}
-
-func TestGGUFCataloger_CaseInsensitiveGlob(t *testing.T) {
-	// Test that the glob pattern is case-sensitive (as expected for **/*.gguf)
-	dir := t.TempDir()
-
-	// Create lowercase .gguf
-	data := newTestGGUFBuilder().
-		withVersion(3).
-		withStringKV("general.architecture", "llama").
-		withStringKV("general.name", "lowercase").
-		build()
-	os.WriteFile(filepath.Join(dir, "model.gguf"), data, 0644)
-
-	// Create uppercase .GGUF (should not match the glob)
-	os.WriteFile(filepath.Join(dir, "MODEL.GGUF"), data, 0644)
-
-	tester := pkgtest.NewCatalogTester().
-		FromDirectory(t, dir).
-		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
-			// Depending on filesystem case-sensitivity, we may get 1 or 2 packages
-			// On case-insensitive filesystems (macOS), both might match
-			// On case-sensitive filesystems (Linux), only lowercase matches
-			assert.GreaterOrEqual(t, len(pkgs), 1, "should find at least the lowercase file")
-		})
-
-	tester.TestCataloger(t, NewGGUFCataloger())
-}
-
-// createTestGGUFInDir creates a minimal test GGUF file in the specified directory
-func createTestGGUFInDir(t *testing.T, dir, filename string) {
-	t.Helper()
-	data := newTestGGUFBuilder().
-		withVersion(3).
-		withStringKV("general.architecture", "llama").
-		withStringKV("general.name", "test-model").
-		build()
-
-	path := filepath.Join(dir, filename)
-	err := os.WriteFile(path, data, 0644)
-	require.NoError(t, err)
-}
@@ -1,10 +1,11 @@
 package ai
 
 import (
-	"crypto/sha256"
 	"encoding/json"
 	"fmt"
 
+	"github.com/cespare/xxhash/v2"
+
 	"github.com/anchore/syft/internal/log"
 	"github.com/anchore/syft/syft/file"
 	"github.com/anchore/syft/syft/pkg"
@@ -48,7 +49,6 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string {
 		GGUFVersion  uint32
 		TensorCount  uint64
 	}{
-		Format:       metadata.ModelFormat,
 		Name:         metadata.ModelName,
 		Version:      metadata.ModelVersion,
 		Architecture: metadata.Architecture,
@@ -63,7 +63,7 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string {
 		return ""
 	}
 
-	// Compute SHA256 hash
-	hash := sha256.Sum256(jsonBytes)
-	return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars)
+	// Compute xxhash
+	hash := xxhash.Sum64(jsonBytes)
+	return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
 }
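The hunk above swaps the metadata hash from a truncated SHA-256 digest to a 64-bit xxhash. Both render as 16 hex characters: xxhash.Sum64 returns a uint64, and %016x zero-pads it to 16 digits, matching the width of the first 8 bytes of a SHA-256 sum. A minimal standalone sketch of the two formats (the payload here is a hypothetical metadata JSON blob, not anything from this commit):

    package main

    import (
    	"crypto/sha256"
    	"fmt"

    	"github.com/cespare/xxhash/v2"
    )

    func main() {
    	payload := []byte(`{"name":"llama3-8b"}`) // hypothetical metadata JSON

    	sum := sha256.Sum256(payload)
    	fmt.Printf("%x\n", sum[:8]) // old form: first 8 bytes of SHA-256 -> 16 hex chars

    	fmt.Printf("%016x\n", xxhash.Sum64(payload)) // new form: 64-bit xxhash -> 16 hex chars
    }

xxhash is not a cryptographic hash, which appears acceptable here since the value serves as a stable identifier rather than an integrity check.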
@@ -21,17 +21,15 @@ func TestNewGGUFPackage(t *testing.T) {
 		{
 			name: "complete GGUF package with all fields",
 			metadata: &pkg.GGUFFileHeader{
-				ModelFormat:     "gguf",
 				ModelName:       "llama3-8b-instruct",
 				ModelVersion:    "3.0",
 				License:         "Apache-2.0",
 				Architecture:    "llama",
 				Quantization:    "Q4_K_M",
 				Parameters:      8030000000,
 				GGUFVersion:     3,
 				TensorCount:     291,
 				Header:          map[string]any{},
-				TruncatedHeader: false,
 			},
 			locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
 			checkFunc: func(t *testing.T, p pkg.Package) {
@@ -55,7 +53,6 @@ func TestNewGGUFPackage(t *testing.T) {
 		{
 			name: "minimal GGUF package",
 			metadata: &pkg.GGUFFileHeader{
-				ModelFormat:  "gguf",
 				ModelName:    "simple-model",
 				ModelVersion: "1.0",
 				Architecture: "gpt2",
@@ -80,7 +77,6 @@ func TestNewGGUFPackage(t *testing.T) {
 		{
 			name: "GGUF package with multiple locations",
 			metadata: &pkg.GGUFFileHeader{
-				ModelFormat:  "gguf",
 				ModelName:    "multi-location-model",
 				ModelVersion: "1.5",
 				Architecture: "llama",
@@ -14,19 +14,14 @@ const (
 	maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
 )
 
-// ggufHeaderReader reads just the header portion of a GGUF file efficiently
-type ggufHeaderReader struct {
-	reader io.Reader
-}
-
 // readHeader reads only the GGUF header (metadata) without reading tensor data
 // This is much more efficient than reading the entire file
 // The reader should be wrapped with io.LimitedReader to prevent OOM issues
-func (r *ggufHeaderReader) readHeader() ([]byte, error) {
+func readHeader(r io.Reader) ([]byte, error) {
 	// Read initial chunk to determine header size
 	// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
 	initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
-	if _, err := io.ReadFull(r.reader, initialBuf); err != nil {
+	if _, err := io.ReadFull(r, initialBuf); err != nil {
 		return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
 	}
 
@@ -45,7 +40,7 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) {
 	// The LimitedReader will return EOF once maxHeaderSize is reached
 	buf := make([]byte, 64*1024) // 64KB chunks
 	for {
-		n, err := r.reader.Read(buf)
+		n, err := r.Read(buf)
 		if n > 0 {
 			headerData = append(headerData, buf[:n]...)
 		}
@@ -65,24 +60,14 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) {
 func convertGGUFMetadataKVs(kvs gguf_parser.GGUFMetadataKVs) map[string]interface{} {
 	result := make(map[string]interface{})
 
-	// Limit KV pairs to avoid bloat
-	const maxKVPairs = 200
-	count := 0
-
 	for _, kv := range kvs {
-		if count >= maxKVPairs {
-			break
-		}
-
 		// Skip standard fields that are extracted separately
 		switch kv.Key {
 		case "general.architecture", "general.name", "general.license",
 			"general.version", "general.parameter_count", "general.quantization":
 			continue
 		}
 
 		result[kv.Key] = kv.Value
-		count++
 	}
 
 	return result
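The 24-byte initialBuf read in readHeader corresponds to the fixed GGUF prefix called out in the comment: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8). As a rough sketch of decoding that prefix, assuming little-endian byte order and a hypothetical prefix struct (neither is a type from this diff):

    package main

    import (
    	"bytes"
    	"encoding/binary"
    	"fmt"
    )

    // ggufPrefix is a hypothetical mirror of the fixed 24-byte GGUF prefix.
    type ggufPrefix struct {
    	Magic       [4]byte // "GGUF"
    	Version     uint32
    	TensorCount uint64
    	MetadataKV  uint64
    }

    func main() {
    	// Hypothetical prefix bytes: magic "GGUF", version 3, 0 tensors, 2 KV pairs.
    	raw := []byte{
    		'G', 'G', 'U', 'F',
    		3, 0, 0, 0,
    		0, 0, 0, 0, 0, 0, 0, 0,
    		2, 0, 0, 0, 0, 0, 0, 0,
    	}

    	var p ggufPrefix
    	if err := binary.Read(bytes.NewReader(raw), binary.LittleEndian, &p); err != nil {
    		panic(err)
    	}
    	fmt.Printf("magic=%s version=%d tensors=%d kvs=%d\n", p.Magic[:], p.Version, p.TensorCount, p.MetadataKV)
    }

The variable-length metadata KVs and tensor info follow this prefix, which is why the real code keeps reading in 64KB chunks under an io.LimitedReader rather than sizing the header up front.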
@@ -2,15 +2,19 @@ package ai
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"os"
 	"path/filepath"
+	"sort"
 	"strings"
 
+	"github.com/cespare/xxhash/v2"
 	gguf_parser "github.com/gpustack/gguf-parser-go"
 
 	"github.com/anchore/syft/internal"
+	"github.com/anchore/syft/internal/log"
 	"github.com/anchore/syft/internal/unknown"
 	"github.com/anchore/syft/syft/artifact"
 	"github.com/anchore/syft/syft/file"
@@ -18,8 +22,6 @@ import (
 	"github.com/anchore/syft/syft/pkg/cataloger/generic"
 )
 
-const unknownGGUFData = "unknown"
-
 // parseGGUFModel parses a GGUF model file and returns the discovered package.
 // This implementation only reads the header portion of the file, not the entire model.
 func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
@@ -28,8 +30,7 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
 	// Read and validate the GGUF file header using LimitedReader to prevent OOM
 	// We use LimitedReader to cap reads at maxHeaderSize (50MB)
 	limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
-	headerReader := &ggufHeaderReader{reader: limitedReader}
-	headerData, err := headerReader.readHeader()
+	headerData, err := readHeader(limitedReader)
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
 	}
@@ -63,7 +64,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
 
 	// Convert to syft metadata structure
 	syftMetadata := &pkg.GGUFFileHeader{
-		ModelFormat:  "gguf",
 		ModelName:    metadata.Name,
 		ModelVersion: extractVersion(ggufFile.Header.MetadataKV),
 		License:      metadata.License,
@@ -71,10 +71,9 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
 		Quantization: metadata.FileTypeDescriptor,
 		Parameters:   uint64(metadata.Parameters),
 		GGUFVersion:  uint32(ggufFile.Header.Version),
 		TensorCount:  ggufFile.Header.TensorCount,
 		Header:       convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
-		TruncatedHeader: false, // We read the full header
-		Hash:            "",    // Will be computed in newGGUFPackage
+		MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
 	}
 
 	// If model name is not in metadata, use filename
@@ -82,11 +81,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
 		syftMetadata.ModelName = extractModelNameFromPath(reader.Path())
 	}
 
-	// If version is still unknown, try to infer from name
-	if syftMetadata.ModelVersion == unknownGGUFData {
-		syftMetadata.ModelVersion = extractVersionFromName(syftMetadata.ModelName)
-	}
-
 	// Create package from metadata
 	p := newGGUFPackage(
 		syftMetadata,
@@ -96,6 +90,27 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
 	return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
 }
 
+// computeKVMetadataHash computes a stable hash of the KV metadata for use as a global identifier
+func computeKVMetadataHash(metadata gguf_parser.GGUFMetadataKVs) string {
+	// Sort the KV pairs by key for stable hashing
+	sortedKVs := make([]gguf_parser.GGUFMetadataKV, len(metadata))
+	copy(sortedKVs, metadata)
+	sort.Slice(sortedKVs, func(i, j int) bool {
+		return sortedKVs[i].Key < sortedKVs[j].Key
+	})
+
+	// Marshal sorted KVs to JSON for stable hashing
+	jsonBytes, err := json.Marshal(sortedKVs)
+	if err != nil {
+		log.Debugf("failed to marshal metadata for hashing: %v", err)
+		return ""
+	}
+
+	// Compute xxhash
+	hash := xxhash.Sum64(jsonBytes)
+	return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
+}
+
 // extractVersion attempts to extract version from metadata KV pairs
 func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
 	for _, kv := range kvs {
@@ -105,14 +120,7 @@ func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
 			}
 		}
 	}
-	return unknownGGUFData
-}
-
-// extractVersionFromName tries to extract version from model name
-func extractVersionFromName(_ string) string {
-	// Look for version patterns like "v1.0", "1.5b", "3.0", etc.
-	// For now, return unknown - this could be enhanced with regex
-	return unknownGGUFData
-}
+	return ""
 }
 
 // extractModelNameFromPath extracts the model name from the file path
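The computeKVMetadataHash function added above sorts the KV pairs before marshaling, so two files whose header metadata differs only in key order produce the same hash. A self-contained illustration of that property, using a hypothetical local kv type as a stand-in for gguf_parser.GGUFMetadataKV:

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"sort"

    	"github.com/cespare/xxhash/v2"
    )

    // kv is a hypothetical stand-in for gguf_parser.GGUFMetadataKV.
    type kv struct {
    	Key   string `json:"key"`
    	Value any    `json:"value"`
    }

    // stableHash sorts by key, marshals to JSON, and hashes, mirroring the approach in the diff.
    func stableHash(kvs []kv) string {
    	sorted := make([]kv, len(kvs))
    	copy(sorted, kvs)
    	sort.Slice(sorted, func(i, j int) bool { return sorted[i].Key < sorted[j].Key })

    	b, err := json.Marshal(sorted)
    	if err != nil {
    		return ""
    	}
    	return fmt.Sprintf("%016x", xxhash.Sum64(b))
    }

    func main() {
    	a := []kv{{"general.architecture", "llama"}, {"general.name", "m"}}
    	b := []kv{{"general.name", "m"}, {"general.architecture", "llama"}}
    	fmt.Println(stableHash(a) == stableHash(b)) // true: key order does not matter
    }

Note that extractVersion now returns "" rather than a sentinel string when no version key is found, since the unknownGGUFData constant and the name-based version fallback were removed.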
@@ -4,8 +4,8 @@ package pkg
 // GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
 // loading and saving of models, particularly quantized large language models.
 type GGUFFileHeader struct {
-	// ModelFormat is always "gguf"
-	ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"`
+	// GGUFVersion is the GGUF format version (e.g., 3)
+	GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
 
 	// ModelName is the name of the model (from general.name or filename)
 	ModelName string `json:"modelName" cyclonedx:"modelName"`
@@ -16,15 +16,9 @@ type GGUFFileHeader struct {
 	// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
 	FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
 
-	// Hash is a content hash of the metadata (for stable global identifiers across remotes)
-	Hash string `json:"hash,omitempty" cyclonedx:"hash"`
-
 	// License is the license identifier (from general.license if present)
 	License string `json:"license,omitempty" cyclonedx:"license"`
 
-	// GGUFVersion is the GGUF format version (e.g., 3)
-	GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
-
 	// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
 	Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
 
@@ -42,6 +36,10 @@ type GGUFFileHeader struct {
 	// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
 	Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
 
-	// TruncatedHeader indicates if the header was truncated during parsing (for very large headers)
-	TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"`
+	// MetadataHash is an xx64 hash of all key-value pairs from the GGUF header metadata.
+	// This hash is computed over the complete header metadata (including the fields extracted
+	// into typed fields above) and provides a stable identifier for the model configuration
+	// across different file locations or remotes. It allows matching identical models even
+	// when stored in different repositories or with different filenames.
+	MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
 }
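Given these tags, a populated header serializes metadataHash alongside the other fields, while empty optional fields drop out via omitempty. A sketch using a pared-down mirror of the struct, limited to fields visible in this diff (not the full syft type):

    package main

    import (
    	"encoding/json"
    	"fmt"
    )

    // header mirrors a subset of pkg.GGUFFileHeader's JSON tags from the diff above.
    type header struct {
    	GGUFVersion  uint32 `json:"ggufVersion"`
    	ModelName    string `json:"modelName"`
    	License      string `json:"license,omitempty"`
    	Architecture string `json:"architecture,omitempty"`
    	MetadataHash string `json:"metadataHash,omitempty"`
    }

    func main() {
    	h := header{GGUFVersion: 3, ModelName: "llama3-8b", Architecture: "llama", MetadataHash: "0011223344556677"}
    	out, _ := json.Marshal(h)
    	fmt.Println(string(out)) // license is omitted because it is empty
    }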