mirror of
https://github.com/anchore/syft.git
synced 2025-11-17 08:23:15 +01:00
fix: pr comments
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
9b31c0480f
commit
6daea43c32
5
go.mod
5
go.mod
@ -286,7 +286,10 @@ require (
|
||||
modernc.org/memory v1.11.0 // indirect
|
||||
)
|
||||
|
||||
require github.com/gpustack/gguf-parser-go v0.22.1
|
||||
require (
|
||||
github.com/cespare/xxhash/v2 v2.3.0
|
||||
github.com/gpustack/gguf-parser-go v0.22.1
|
||||
)
|
||||
|
||||
require (
|
||||
cyphar.com/go-pathrs v0.2.1 // indirect
|
||||
|
||||
1
go.sum
1
go.sum
@ -229,7 +229,6 @@ github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqy
|
||||
github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
||||
github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
||||
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
|
||||
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
|
||||
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
|
||||
@ -27,7 +27,6 @@ func AllTypes() []any {
|
||||
pkg.ELFBinaryPackageNoteJSONPayload{},
|
||||
pkg.ElixirMixLockEntry{},
|
||||
pkg.ErlangRebarLockEntry{},
|
||||
pkg.GGUFFileHeader{},
|
||||
pkg.GitHubActionsUseStatement{},
|
||||
pkg.GolangBinaryBuildinfoEntry{},
|
||||
pkg.GolangModuleEntry{},
|
||||
|
||||
@ -7,7 +7,6 @@ import (
|
||||
|
||||
"github.com/google/go-cmp/cmp/cmpopts"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/anchore/syft/syft/artifact"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
@ -15,72 +14,6 @@ import (
|
||||
)
|
||||
|
||||
func TestGGUFCataloger_Globs(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
setup func(t *testing.T) string // returns fixture directory
|
||||
expected []string
|
||||
}{
|
||||
{
|
||||
name: "finds GGUF files in root",
|
||||
setup: func(t *testing.T) string {
|
||||
dir := t.TempDir()
|
||||
createTestGGUFInDir(t, dir, "model1.gguf")
|
||||
createTestGGUFInDir(t, dir, "model2.gguf")
|
||||
return dir
|
||||
},
|
||||
expected: []string{
|
||||
"model1.gguf",
|
||||
"model2.gguf",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "finds GGUF files in subdirectories",
|
||||
setup: func(t *testing.T) string {
|
||||
dir := t.TempDir()
|
||||
modelsDir := filepath.Join(dir, "models")
|
||||
os.MkdirAll(modelsDir, 0755)
|
||||
createTestGGUFInDir(t, modelsDir, "llama.gguf")
|
||||
|
||||
deepDir := filepath.Join(dir, "deep", "nested", "path")
|
||||
os.MkdirAll(deepDir, 0755)
|
||||
createTestGGUFInDir(t, deepDir, "mistral.gguf")
|
||||
|
||||
return dir
|
||||
},
|
||||
expected: []string{
|
||||
"models/llama.gguf",
|
||||
"deep/nested/path/mistral.gguf",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "ignores non-GGUF files",
|
||||
setup: func(t *testing.T) string {
|
||||
dir := t.TempDir()
|
||||
createTestGGUFInDir(t, dir, "model.gguf")
|
||||
|
||||
// Create non-GGUF files
|
||||
os.WriteFile(filepath.Join(dir, "readme.txt"), []byte("readme"), 0644)
|
||||
os.WriteFile(filepath.Join(dir, "model.bin"), []byte("binary"), 0644)
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644)
|
||||
|
||||
return dir
|
||||
},
|
||||
expected: []string{
|
||||
"model.gguf",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
fixtureDir := tt.setup(t)
|
||||
|
||||
pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, fixtureDir).
|
||||
ExpectsResolverContentQueries(tt.expected).
|
||||
TestCataloger(t, NewGGUFCataloger())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGGUFCataloger_Integration(t *testing.T) {
|
||||
@ -117,7 +50,6 @@ func TestGGUFCataloger_Integration(t *testing.T) {
|
||||
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
|
||||
),
|
||||
Metadata: pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "llama3-8b",
|
||||
ModelVersion: "3.0",
|
||||
License: "Apache-2.0",
|
||||
@ -127,104 +59,6 @@ func TestGGUFCataloger_Integration(t *testing.T) {
|
||||
GGUFVersion: 3,
|
||||
TensorCount: 0,
|
||||
Header: map[string]interface{}{},
|
||||
TruncatedHeader: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedRelationships: nil,
|
||||
},
|
||||
{
|
||||
name: "catalog multiple GGUF files",
|
||||
setup: func(t *testing.T) string {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Create first model
|
||||
data1 := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "llama").
|
||||
withStringKV("general.name", "model1").
|
||||
withStringKV("general.version", "1.0").
|
||||
build()
|
||||
os.WriteFile(filepath.Join(dir, "model1.gguf"), data1, 0644)
|
||||
|
||||
// Create second model
|
||||
data2 := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "mistral").
|
||||
withStringKV("general.name", "model2").
|
||||
withStringKV("general.version", "2.0").
|
||||
build()
|
||||
os.WriteFile(filepath.Join(dir, "model2.gguf"), data2, 0644)
|
||||
|
||||
return dir
|
||||
},
|
||||
expectedPackages: []pkg.Package{
|
||||
{
|
||||
Name: "model1",
|
||||
Version: "1.0",
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "model1",
|
||||
ModelVersion: "1.0",
|
||||
Architecture: "llama",
|
||||
Quantization: "Unknown",
|
||||
GGUFVersion: 3,
|
||||
TensorCount: 0,
|
||||
Header: map[string]interface{}{},
|
||||
TruncatedHeader: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "model2",
|
||||
Version: "2.0",
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "model2",
|
||||
ModelVersion: "2.0",
|
||||
Architecture: "mistral",
|
||||
Quantization: "Unknown",
|
||||
GGUFVersion: 3,
|
||||
TensorCount: 0,
|
||||
Header: map[string]interface{}{},
|
||||
TruncatedHeader: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedRelationships: nil,
|
||||
},
|
||||
{
|
||||
name: "catalog GGUF in nested directories",
|
||||
setup: func(t *testing.T) string {
|
||||
dir := t.TempDir()
|
||||
nestedDir := filepath.Join(dir, "models", "quantized")
|
||||
os.MkdirAll(nestedDir, 0755)
|
||||
|
||||
data := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "qwen").
|
||||
withStringKV("general.name", "qwen-nested").
|
||||
build()
|
||||
|
||||
os.WriteFile(filepath.Join(nestedDir, "qwen.gguf"), data, 0644)
|
||||
return dir
|
||||
},
|
||||
expectedPackages: []pkg.Package{
|
||||
{
|
||||
Name: "qwen-nested",
|
||||
Version: unknownGGUFData,
|
||||
Type: pkg.ModelPkg,
|
||||
Metadata: pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "qwen-nested",
|
||||
ModelVersion: unknownGGUFData,
|
||||
Architecture: "qwen",
|
||||
Quantization: "Unknown",
|
||||
GGUFVersion: 3,
|
||||
TensorCount: 0,
|
||||
Header: map[string]interface{}{},
|
||||
TruncatedHeader: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -252,122 +86,7 @@ func TestGGUFCataloger_Integration(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestGGUFCataloger_SkipsInvalidFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Create a valid GGUF
|
||||
validData := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "llama").
|
||||
withStringKV("general.name", "valid-model").
|
||||
build()
|
||||
os.WriteFile(filepath.Join(dir, "valid.gguf"), validData, 0644)
|
||||
|
||||
// Create an invalid GGUF (wrong magic)
|
||||
invalidData := newTestGGUFBuilder().buildInvalidMagic()
|
||||
os.WriteFile(filepath.Join(dir, "invalid.gguf"), invalidData, 0644)
|
||||
|
||||
// Create a truncated GGUF
|
||||
os.WriteFile(filepath.Join(dir, "truncated.gguf"), []byte{0x47}, 0644)
|
||||
|
||||
// Catalog should succeed and only return the valid package
|
||||
tester := pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, dir).
|
||||
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
|
||||
// Should only find the valid model
|
||||
require.Len(t, pkgs, 1)
|
||||
assert.Equal(t, "valid-model", pkgs[0].Name)
|
||||
})
|
||||
|
||||
tester.TestCataloger(t, NewGGUFCataloger())
|
||||
}
|
||||
|
||||
func TestGGUFCataloger_Name(t *testing.T) {
|
||||
cataloger := NewGGUFCataloger()
|
||||
assert.Equal(t, "gguf-cataloger", cataloger.Name())
|
||||
}
|
||||
|
||||
func TestGGUFCataloger_EmptyDirectory(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
// Create a subdirectory to ensure glob still runs
|
||||
os.MkdirAll(filepath.Join(dir, "models"), 0755)
|
||||
|
||||
tester := pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, dir).
|
||||
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, rels []artifact.Relationship) {
|
||||
assert.Empty(t, pkgs)
|
||||
assert.Empty(t, rels)
|
||||
})
|
||||
|
||||
tester.TestCataloger(t, NewGGUFCataloger())
|
||||
}
|
||||
|
||||
func TestGGUFCataloger_MixedFiles(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
// Create GGUF file
|
||||
ggufData := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "llama").
|
||||
withStringKV("general.name", "test-model").
|
||||
build()
|
||||
os.WriteFile(filepath.Join(dir, "model.gguf"), ggufData, 0644)
|
||||
|
||||
// Create other file types
|
||||
os.WriteFile(filepath.Join(dir, "README.md"), []byte("# Models"), 0644)
|
||||
os.WriteFile(filepath.Join(dir, "config.json"), []byte("{}"), 0644)
|
||||
os.WriteFile(filepath.Join(dir, "weights.bin"), []byte("weights"), 0644)
|
||||
os.MkdirAll(filepath.Join(dir, "subdir"), 0755)
|
||||
|
||||
tester := pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, dir).
|
||||
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
|
||||
// Should only find the GGUF model
|
||||
require.Len(t, pkgs, 1)
|
||||
assert.Equal(t, "test-model", pkgs[0].Name)
|
||||
assert.Equal(t, pkg.ModelPkg, pkgs[0].Type)
|
||||
})
|
||||
|
||||
tester.TestCataloger(t, NewGGUFCataloger())
|
||||
}
|
||||
|
||||
func TestGGUFCataloger_CaseInsensitiveGlob(t *testing.T) {
|
||||
// Test that the glob pattern is case-sensitive (as expected for **/*.gguf)
|
||||
dir := t.TempDir()
|
||||
|
||||
// Create lowercase .gguf
|
||||
data := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "llama").
|
||||
withStringKV("general.name", "lowercase").
|
||||
build()
|
||||
os.WriteFile(filepath.Join(dir, "model.gguf"), data, 0644)
|
||||
|
||||
// Create uppercase .GGUF (should not match the glob)
|
||||
os.WriteFile(filepath.Join(dir, "MODEL.GGUF"), data, 0644)
|
||||
|
||||
tester := pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, dir).
|
||||
ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
|
||||
// Depending on filesystem case-sensitivity, we may get 1 or 2 packages
|
||||
// On case-insensitive filesystems (macOS), both might match
|
||||
// On case-sensitive filesystems (Linux), only lowercase matches
|
||||
assert.GreaterOrEqual(t, len(pkgs), 1, "should find at least the lowercase file")
|
||||
})
|
||||
|
||||
tester.TestCataloger(t, NewGGUFCataloger())
|
||||
}
|
||||
|
||||
// createTestGGUFInDir creates a minimal test GGUF file in the specified directory
|
||||
func createTestGGUFInDir(t *testing.T, dir, filename string) {
|
||||
t.Helper()
|
||||
data := newTestGGUFBuilder().
|
||||
withVersion(3).
|
||||
withStringKV("general.architecture", "llama").
|
||||
withStringKV("general.name", "test-model").
|
||||
build()
|
||||
|
||||
path := filepath.Join(dir, filename)
|
||||
err := os.WriteFile(path, data, 0644)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"github.com/cespare/xxhash/v2"
|
||||
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
@ -48,7 +49,6 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string {
|
||||
GGUFVersion uint32
|
||||
TensorCount uint64
|
||||
}{
|
||||
Format: metadata.ModelFormat,
|
||||
Name: metadata.ModelName,
|
||||
Version: metadata.ModelVersion,
|
||||
Architecture: metadata.Architecture,
|
||||
@ -63,7 +63,7 @@ func computeMetadataHash(metadata *pkg.GGUFFileHeader) string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compute SHA256 hash
|
||||
hash := sha256.Sum256(jsonBytes)
|
||||
return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars)
|
||||
// Compute xxhash
|
||||
hash := xxhash.Sum64(jsonBytes)
|
||||
return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
|
||||
}
|
||||
|
||||
@ -21,7 +21,6 @@ func TestNewGGUFPackage(t *testing.T) {
|
||||
{
|
||||
name: "complete GGUF package with all fields",
|
||||
metadata: &pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "llama3-8b-instruct",
|
||||
ModelVersion: "3.0",
|
||||
License: "Apache-2.0",
|
||||
@ -31,7 +30,6 @@ func TestNewGGUFPackage(t *testing.T) {
|
||||
GGUFVersion: 3,
|
||||
TensorCount: 291,
|
||||
Header: map[string]any{},
|
||||
TruncatedHeader: false,
|
||||
},
|
||||
locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
|
||||
checkFunc: func(t *testing.T, p pkg.Package) {
|
||||
@ -55,7 +53,6 @@ func TestNewGGUFPackage(t *testing.T) {
|
||||
{
|
||||
name: "minimal GGUF package",
|
||||
metadata: &pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "simple-model",
|
||||
ModelVersion: "1.0",
|
||||
Architecture: "gpt2",
|
||||
@ -80,7 +77,6 @@ func TestNewGGUFPackage(t *testing.T) {
|
||||
{
|
||||
name: "GGUF package with multiple locations",
|
||||
metadata: &pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: "multi-location-model",
|
||||
ModelVersion: "1.5",
|
||||
Architecture: "llama",
|
||||
|
||||
@ -14,19 +14,14 @@ const (
|
||||
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
|
||||
)
|
||||
|
||||
// ggufHeaderReader reads just the header portion of a GGUF file efficiently
|
||||
type ggufHeaderReader struct {
|
||||
reader io.Reader
|
||||
}
|
||||
|
||||
// readHeader reads only the GGUF header (metadata) without reading tensor data
|
||||
// This is much more efficient than reading the entire file
|
||||
// The reader should be wrapped with io.LimitedReader to prevent OOM issues
|
||||
func (r *ggufHeaderReader) readHeader() ([]byte, error) {
|
||||
func readHeader(r io.Reader) ([]byte, error) {
|
||||
// Read initial chunk to determine header size
|
||||
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
|
||||
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
|
||||
if _, err := io.ReadFull(r.reader, initialBuf); err != nil {
|
||||
if _, err := io.ReadFull(r, initialBuf); err != nil {
|
||||
return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
|
||||
}
|
||||
|
||||
@ -45,7 +40,7 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) {
|
||||
// The LimitedReader will return EOF once maxHeaderSize is reached
|
||||
buf := make([]byte, 64*1024) // 64KB chunks
|
||||
for {
|
||||
n, err := r.reader.Read(buf)
|
||||
n, err := r.Read(buf)
|
||||
if n > 0 {
|
||||
headerData = append(headerData, buf[:n]...)
|
||||
}
|
||||
@ -65,24 +60,14 @@ func (r *ggufHeaderReader) readHeader() ([]byte, error) {
|
||||
func convertGGUFMetadataKVs(kvs gguf_parser.GGUFMetadataKVs) map[string]interface{} {
|
||||
result := make(map[string]interface{})
|
||||
|
||||
// Limit KV pairs to avoid bloat
|
||||
const maxKVPairs = 200
|
||||
count := 0
|
||||
|
||||
for _, kv := range kvs {
|
||||
if count >= maxKVPairs {
|
||||
break
|
||||
}
|
||||
|
||||
// Skip standard fields that are extracted separately
|
||||
switch kv.Key {
|
||||
case "general.architecture", "general.name", "general.license",
|
||||
"general.version", "general.parameter_count", "general.quantization":
|
||||
continue
|
||||
}
|
||||
|
||||
result[kv.Key] = kv.Value
|
||||
count++
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
@ -2,15 +2,19 @@ package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/cespare/xxhash/v2"
|
||||
gguf_parser "github.com/gpustack/gguf-parser-go"
|
||||
|
||||
"github.com/anchore/syft/internal"
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/internal/unknown"
|
||||
"github.com/anchore/syft/syft/artifact"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
@ -18,8 +22,6 @@ import (
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/generic"
|
||||
)
|
||||
|
||||
const unknownGGUFData = "unknown"
|
||||
|
||||
// parseGGUFModel parses a GGUF model file and returns the discovered package.
|
||||
// This implementation only reads the header portion of the file, not the entire model.
|
||||
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
@ -28,8 +30,7 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
||||
// Read and validate the GGUF file header using LimitedReader to prevent OOM
|
||||
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
|
||||
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
|
||||
headerReader := &ggufHeaderReader{reader: limitedReader}
|
||||
headerData, err := headerReader.readHeader()
|
||||
headerData, err := readHeader(limitedReader)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
|
||||
}
|
||||
@ -63,7 +64,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
||||
|
||||
// Convert to syft metadata structure
|
||||
syftMetadata := &pkg.GGUFFileHeader{
|
||||
ModelFormat: "gguf",
|
||||
ModelName: metadata.Name,
|
||||
ModelVersion: extractVersion(ggufFile.Header.MetadataKV),
|
||||
License: metadata.License,
|
||||
@ -73,8 +73,7 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
||||
GGUFVersion: uint32(ggufFile.Header.Version),
|
||||
TensorCount: ggufFile.Header.TensorCount,
|
||||
Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
|
||||
TruncatedHeader: false, // We read the full header
|
||||
Hash: "", // Will be computed in newGGUFPackage
|
||||
MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
|
||||
}
|
||||
|
||||
// If model name is not in metadata, use filename
|
||||
@ -82,11 +81,6 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
||||
syftMetadata.ModelName = extractModelNameFromPath(reader.Path())
|
||||
}
|
||||
|
||||
// If version is still unknown, try to infer from name
|
||||
if syftMetadata.ModelVersion == unknownGGUFData {
|
||||
syftMetadata.ModelVersion = extractVersionFromName(syftMetadata.ModelName)
|
||||
}
|
||||
|
||||
// Create package from metadata
|
||||
p := newGGUFPackage(
|
||||
syftMetadata,
|
||||
@ -96,6 +90,27 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
||||
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
|
||||
}
|
||||
|
||||
// computeKVMetadataHash computes a stable hash of the KV metadata for use as a global identifier
|
||||
func computeKVMetadataHash(metadata gguf_parser.GGUFMetadataKVs) string {
|
||||
// Sort the KV pairs by key for stable hashing
|
||||
sortedKVs := make([]gguf_parser.GGUFMetadataKV, len(metadata))
|
||||
copy(sortedKVs, metadata)
|
||||
sort.Slice(sortedKVs, func(i, j int) bool {
|
||||
return sortedKVs[i].Key < sortedKVs[j].Key
|
||||
})
|
||||
|
||||
// Marshal sorted KVs to JSON for stable hashing
|
||||
jsonBytes, err := json.Marshal(sortedKVs)
|
||||
if err != nil {
|
||||
log.Debugf("failed to marshal metadata for hashing: %v", err)
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compute xxhash
|
||||
hash := xxhash.Sum64(jsonBytes)
|
||||
return fmt.Sprintf("%016x", hash) // 16 hex chars (64 bits)
|
||||
}
|
||||
|
||||
// extractVersion attempts to extract version from metadata KV pairs
|
||||
func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
|
||||
for _, kv := range kvs {
|
||||
@ -105,14 +120,7 @@ func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
|
||||
}
|
||||
}
|
||||
}
|
||||
return unknownGGUFData
|
||||
}
|
||||
|
||||
// extractVersionFromName tries to extract version from model name
|
||||
func extractVersionFromName(_ string) string {
|
||||
// Look for version patterns like "v1.0", "1.5b", "3.0", etc.
|
||||
// For now, return unknown - this could be enhanced with regex
|
||||
return unknownGGUFData
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractModelNameFromPath extracts the model name from the file path
|
||||
|
||||
@ -4,8 +4,8 @@ package pkg
|
||||
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
|
||||
// loading and saving of models, particularly quantized large language models.
|
||||
type GGUFFileHeader struct {
|
||||
// ModelFormat is always "gguf"
|
||||
ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"`
|
||||
// GGUFVersion is the GGUF format version (e.g., 3)
|
||||
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
|
||||
|
||||
// ModelName is the name of the model (from general.name or filename)
|
||||
ModelName string `json:"modelName" cyclonedx:"modelName"`
|
||||
@ -16,15 +16,9 @@ type GGUFFileHeader struct {
|
||||
// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
|
||||
FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
|
||||
|
||||
// Hash is a content hash of the metadata (for stable global identifiers across remotes)
|
||||
Hash string `json:"hash,omitempty" cyclonedx:"hash"`
|
||||
|
||||
// License is the license identifier (from general.license if present)
|
||||
License string `json:"license,omitempty" cyclonedx:"license"`
|
||||
|
||||
// GGUFVersion is the GGUF format version (e.g., 3)
|
||||
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
|
||||
|
||||
// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
|
||||
Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
|
||||
|
||||
@ -42,6 +36,10 @@ type GGUFFileHeader struct {
|
||||
// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
|
||||
Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
|
||||
|
||||
// TruncatedHeader indicates if the header was truncated during parsing (for very large headers)
|
||||
TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"`
|
||||
// MetadataHash is an xxHash64 (XXH64) hash of all key-value pairs from the GGUF header metadata.
|
||||
// This hash is computed over the complete header metadata (including the fields extracted
|
||||
// into typed fields above) and provides a stable identifier for the model configuration
|
||||
// across different file locations or remotes. It allows matching identical models even
|
||||
// when stored in different repositories or with different filenames.
|
||||
MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user