Compare commits


No commits in common. "f1839215c64884d317f660fb5c271697ddb05c0f" and "9609ce2b366803548de6d75e93cce0a2e7eff447" have entirely different histories.

11 changed files with 235 additions and 180 deletions

View File

@@ -124,7 +124,7 @@ var jsonTypes = makeJSONTypes(
 	jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
 	jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
 	jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
-	jsonNames(pkg.GGUFFileHeader{}, "gguf-file-header"),
+	jsonNames(pkg.GGUFFileHeader{}, "gguf-file-metadata"),
 )

 func expandLegacyNameVariants(names ...string) []string {

View File

@@ -1433,16 +1433,24 @@
       ],
       "description": "FileMetadataEntry contains filesystem-level metadata attributes such as permissions, ownership, type, and size for a cataloged file."
     },
-    "GgufFileHeader": {
+    "GgufFileMetadata": {
       "properties": {
         "ggufVersion": {
          "type": "integer",
          "description": "GGUFVersion is the GGUF format version (e.g., 3)"
        },
+        "modelName": {
+          "type": "string",
+          "description": "ModelName is the name of the model (from general.name or filename)"
+        },
        "fileSize": {
          "type": "integer",
          "description": "FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)"
        },
+        "license": {
+          "type": "string",
+          "description": "License is the license identifier (from general.license if present)"
+        },
        "architecture": {
          "type": "string",
          "description": "Architecture is the model architecture (from general.architecture, e.g., \"qwen3moe\", \"llama\")"
@@ -1461,16 +1469,17 @@
        },
        "header": {
          "type": "object",
-          "description": "RemainingKeyValues contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
+          "description": "Header contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
        },
        "metadataHash": {
          "type": "string",
-          "description": "MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
+          "description": "MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
        }
      },
      "type": "object",
      "required": [
        "ggufVersion",
+        "modelName",
        "tensorCount"
      ],
      "description": "GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file."
@@ -2622,7 +2631,7 @@
          "$ref": "#/$defs/ErlangRebarLockEntry"
        },
        {
-          "$ref": "#/$defs/GgufFileHeader"
+          "$ref": "#/$defs/GgufFileMetadata"
        },
        {
          "$ref": "#/$defs/GithubActionsUseStatement"

View File

@@ -1433,16 +1433,24 @@
       ],
       "description": "FileMetadataEntry contains filesystem-level metadata attributes such as permissions, ownership, type, and size for a cataloged file."
     },
-    "GgufFileHeader": {
+    "GgufFileMetadata": {
       "properties": {
         "ggufVersion": {
          "type": "integer",
          "description": "GGUFVersion is the GGUF format version (e.g., 3)"
        },
+        "modelName": {
+          "type": "string",
+          "description": "ModelName is the name of the model (from general.name or filename)"
+        },
        "fileSize": {
          "type": "integer",
          "description": "FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)"
        },
+        "license": {
+          "type": "string",
+          "description": "License is the license identifier (from general.license if present)"
+        },
        "architecture": {
          "type": "string",
          "description": "Architecture is the model architecture (from general.architecture, e.g., \"qwen3moe\", \"llama\")"
@@ -1461,16 +1469,17 @@
        },
        "header": {
          "type": "object",
-          "description": "RemainingKeyValues contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
+          "description": "Header contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
        },
        "metadataHash": {
          "type": "string",
-          "description": "MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
+          "description": "MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
        }
      },
      "type": "object",
      "required": [
        "ggufVersion",
+        "modelName",
        "tensorCount"
      ],
      "description": "GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file."
@@ -2622,7 +2631,7 @@
          "$ref": "#/$defs/ErlangRebarLockEntry"
        },
        {
-          "$ref": "#/$defs/GgufFileHeader"
+          "$ref": "#/$defs/GgufFileMetadata"
        },
        {
          "$ref": "#/$defs/GithubActionsUseStatement"

View File

@@ -5,6 +5,8 @@ import (
     "path/filepath"
     "testing"

+    "github.com/google/go-cmp/cmp/cmpopts"
+
     "github.com/anchore/syft/syft/artifact"
     "github.com/anchore/syft/syft/pkg"
     "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
@@ -35,7 +37,7 @@ func TestGGUFCataloger_Globs(t *testing.T) {
     }
 }

-func TestGGUFCataloger(t *testing.T) {
+func TestGGUFCataloger_Integration(t *testing.T) {
     tests := []struct {
         name                  string
         setup                 func(t *testing.T) string
@@ -54,7 +56,6 @@ func TestGGUFCataloger(t *testing.T) {
                 withStringKV("general.license", "Apache-2.0").
                 withStringKV("general.quantization", "Q4_K_M").
                 withUint64KV("general.parameter_count", 8030000000).
-                withStringKV("general.some_random_kv", "foobar").
                 build()

             path := filepath.Join(dir, "llama3-8b.gguf")
@@ -70,53 +71,14 @@ func TestGGUFCataloger(t *testing.T) {
                         pkg.NewLicenseFromFields("Apache-2.0", "", nil),
                     ),
                     Metadata: pkg.GGUFFileHeader{
-                        Architecture:          "llama",
-                        Quantization:          "Unknown",
-                        Parameters:            0,
-                        GGUFVersion:           3,
-                        TensorCount:           0,
-                        MetadataKeyValuesHash: "6e3d368066455ce4",
-                        RemainingKeyValues: map[string]interface{}{
-                            "general.some_random_kv": "foobar",
-                        },
-                    },
-                },
-            },
-            expectedRelationships: nil,
-        },
-        {
-            name: "catalog GGUF file with minimal metadata",
-            setup: func(t *testing.T) string {
-                dir := t.TempDir()
-                data := newTestGGUFBuilder().
-                    withVersion(3).
-                    withStringKV("general.architecture", "gpt2").
-                    withStringKV("general.name", "gpt2-small").
-                    withStringKV("gpt2.context_length", "1024").
-                    withUint32KV("gpt2.embedding_length", 768).
-                    build()
-
-                path := filepath.Join(dir, "gpt2-small.gguf")
-                os.WriteFile(path, data, 0644)
-                return dir
-            },
-            expectedPackages: []pkg.Package{
-                {
-                    Name:     "gpt2-small",
-                    Version:  "",
-                    Type:     pkg.ModelPkg,
-                    Licenses: pkg.NewLicenseSet(),
-                    Metadata: pkg.GGUFFileHeader{
-                        Architecture:          "gpt2",
-                        Quantization:          "Unknown",
-                        Parameters:            0,
-                        GGUFVersion:           3,
-                        TensorCount:           0,
-                        MetadataKeyValuesHash: "9dc6f23591062a27",
-                        RemainingKeyValues: map[string]interface{}{
-                            "gpt2.context_length":   "1024",
-                            "gpt2.embedding_length": uint32(768),
-                        },
+                        ModelName:    "llama3-8b",
+                        License:      "Apache-2.0",
+                        Architecture: "llama",
+                        Quantization: "Unknown",
+                        Parameters:   0,
+                        GGUFVersion:  3,
+                        TensorCount:  0,
+                        Header:       map[string]interface{}{},
                     },
                 },
             },
@@ -129,12 +91,17 @@ func TestGGUFCataloger(t *testing.T) {
             fixtureDir := tt.setup(t)

             // Use pkgtest to catalog and compare
-            pkgtest.NewCatalogTester().
+            tester := pkgtest.NewCatalogTester().
                 FromDirectory(t, fixtureDir).
                 Expects(tt.expectedPackages, tt.expectedRelationships).
                 IgnoreLocationLayer().
-                IgnorePackageFields("FoundBy", "Locations").
-                TestCataloger(t, NewGGUFCataloger())
+                IgnorePackageFields("FoundBy", "Locations"). // These are set by the cataloger
+                WithCompareOptions(
+                    // Ignore MetadataHash as it's computed dynamically
+                    cmpopts.IgnoreFields(pkg.GGUFFileHeader{}, "MetadataHash"),
+                )
+
+            tester.TestCataloger(t, NewGGUFCataloger())
         })
     }
 }

View File

@@ -5,17 +5,23 @@ import (
     "github.com/anchore/syft/syft/pkg"
 )

-func newGGUFPackage(metadata *pkg.GGUFFileHeader, modelName, version, license string, locations ...file.Location) pkg.Package {
+func newGGUFPackage(metadata *pkg.GGUFFileHeader, version string, locations ...file.Location) pkg.Package {
     p := pkg.Package{
-        Name:      modelName,
+        Name:      metadata.ModelName,
         Version:   version,
         Locations: file.NewLocationSet(locations...),
         Type:      pkg.ModelPkg,
-        Licenses:  pkg.NewLicenseSet(pkg.NewLicensesFromValues(license)...),
+        Licenses:  pkg.NewLicenseSet(),
         Metadata:  *metadata,
         // NOTE: PURL is intentionally not set as the package-url spec
         // has not yet finalized support for ML model packages
     }

+    // Add license to the package if present in metadata
+    if metadata.License != "" {
+        p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil))
+    }
+
     p.SetID()
     return p
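For orientation, here is a minimal sketch (not part of the diff) of how the reworked constructor might be called after this change. It assumes it lives inside the same ai cataloger package (newGGUFPackage is unexported); the function name examplePackage and the metadata values are illustrative only.

package ai

import (
    "fmt"

    "github.com/anchore/syft/syft/file"
    "github.com/anchore/syft/syft/pkg"
)

// examplePackage builds a package from already-parsed header metadata.
// Name now comes from metadata.ModelName, and a non-empty metadata.License
// is added to p.Licenses by newGGUFPackage itself.
func examplePackage() pkg.Package {
    meta := &pkg.GGUFFileHeader{
        ModelName:    "example-model", // illustrative value
        License:      "Apache-2.0",
        Architecture: "llama",
        GGUFVersion:  3,
    }
    p := newGGUFPackage(meta, "1.0", file.NewLocation("/models/example.gguf"))
    fmt.Println(p.Name, p.Licenses.ToSlice())
    return p
}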

View File

@@ -3,119 +3,121 @@ package ai
 import (
     "testing"

+    "github.com/google/go-cmp/cmp"
+    "github.com/stretchr/testify/assert"
     "github.com/stretchr/testify/require"

     "github.com/anchore/syft/syft/file"
     "github.com/anchore/syft/syft/pkg"
-    "github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
 )

 func TestNewGGUFPackage(t *testing.T) {
     tests := []struct {
         name     string
         metadata *pkg.GGUFFileHeader
-        input    struct {
-            modelName string
-            version   string
-            license   string
-            locations []file.Location
-        }
-        expected pkg.Package
+        version   string
+        locations []file.Location
+        checkFunc func(t *testing.T, p pkg.Package)
     }{
         {
             name: "complete GGUF package with all fields",
-            input: struct {
-                modelName string
-                version   string
-                license   string
-                locations []file.Location
-            }{
-                modelName: "llama3-8b",
-                version:   "3.0",
-                license:   "Apache-2.0",
-                locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
-            },
+            version: "3.0",
             metadata: &pkg.GGUFFileHeader{
+                ModelName:    "llama3-8b-instruct",
+                License:      "Apache-2.0",
                 Architecture: "llama",
                 Quantization: "Q4_K_M",
                 Parameters:   8030000000,
                 GGUFVersion:  3,
                 TensorCount:  291,
-                RemainingKeyValues: map[string]any{
-                    "general.random_kv": "foobar",
-                },
+                Header:       map[string]any{},
             },
-            expected: pkg.Package{
-                Name:    "llama3-8b",
-                Version: "3.0",
-                Type:    pkg.ModelPkg,
-                Licenses: pkg.NewLicenseSet(
-                    pkg.NewLicenseFromFields("Apache-2.0", "", nil),
-                ),
-                Metadata: pkg.GGUFFileHeader{
-                    Architecture: "llama",
-                    Quantization: "Q4_K_M",
-                    Parameters:   8030000000,
-                    GGUFVersion:  3,
-                    TensorCount:  291,
-                    RemainingKeyValues: map[string]any{
-                        "general.random_kv": "foobar",
-                    },
-                },
-                Locations: file.NewLocationSet(file.NewLocation("/models/llama3-8b.gguf")),
+            locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
+            checkFunc: func(t *testing.T, p pkg.Package) {
+                if d := cmp.Diff("llama3-8b-instruct", p.Name); d != "" {
+                    t.Errorf("Name mismatch (-want +got):\n%s", d)
+                }
+                if d := cmp.Diff("3.0", p.Version); d != "" {
+                    t.Errorf("Version mismatch (-want +got):\n%s", d)
+                }
+                if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
+                    t.Errorf("Type mismatch (-want +got):\n%s", d)
+                }
+                assert.Empty(t, p.PURL, "PURL should not be set for model packages")
+                assert.Len(t, p.Licenses.ToSlice(), 1)
+                if d := cmp.Diff("Apache-2.0", p.Licenses.ToSlice()[0].Value); d != "" {
+                    t.Errorf("License value mismatch (-want +got):\n%s", d)
+                }
+                assert.NotEmpty(t, p.ID())
             },
         },
         {
             name: "minimal GGUF package",
-            input: struct {
-                modelName string
-                version   string
-                license   string
-                locations []file.Location
-            }{
-                modelName: "gpt2-small",
-                version:   "1.0",
-                license:   "MIT",
-                locations: []file.Location{file.NewLocation("/models/simple.gguf")},
-            },
+            version: "1.0",
             metadata: &pkg.GGUFFileHeader{
+                ModelName:    "simple-model",
                 Architecture: "gpt2",
                 GGUFVersion:  3,
                 TensorCount:  50,
             },
-            expected: pkg.Package{
-                Name:    "gpt2-small",
-                Version: "1.0",
-                Type:    pkg.ModelPkg,
-                Licenses: pkg.NewLicenseSet(
-                    pkg.NewLicenseFromFields("MIT", "", nil),
-                ),
-                Metadata: pkg.GGUFFileHeader{
-                    Architecture: "gpt2",
-                    GGUFVersion:  3,
-                    TensorCount:  50,
-                },
-                Locations: file.NewLocationSet(file.NewLocation("/models/simple.gguf")),
+            locations: []file.Location{file.NewLocation("/models/simple.gguf")},
+            checkFunc: func(t *testing.T, p pkg.Package) {
+                if d := cmp.Diff("simple-model", p.Name); d != "" {
+                    t.Errorf("Name mismatch (-want +got):\n%s", d)
+                }
+                if d := cmp.Diff("1.0", p.Version); d != "" {
+                    t.Errorf("Version mismatch (-want +got):\n%s", d)
+                }
+                if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
+                    t.Errorf("Type mismatch (-want +got):\n%s", d)
+                }
+                assert.Empty(t, p.PURL, "PURL should not be set for model packages")
+                assert.Empty(t, p.Licenses.ToSlice())
+            },
+        },
+        {
+            name:    "GGUF package with multiple locations",
+            version: "1.5",
+            metadata: &pkg.GGUFFileHeader{
+                ModelName:    "multi-location-model",
+                Architecture: "llama",
+                GGUFVersion:  3,
+                TensorCount:  150,
+            },
+            locations: []file.Location{
+                file.NewLocation("/models/model1.gguf"),
+                file.NewLocation("/models/model2.gguf"),
+            },
+            checkFunc: func(t *testing.T, p pkg.Package) {
+                assert.Len(t, p.Locations.ToSlice(), 2)
             },
         },
     }

     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
-            actual := newGGUFPackage(
-                tt.metadata,
-                tt.input.modelName,
-                tt.input.version,
-                tt.input.license,
-                tt.input.locations...,
-            )
+            p := newGGUFPackage(tt.metadata, tt.version, tt.locations...)

-            // Verify metadata type
-            _, ok := actual.Metadata.(pkg.GGUFFileHeader)
+            if d := cmp.Diff(tt.metadata.ModelName, p.Name); d != "" {
+                t.Errorf("Name mismatch (-want +got):\n%s", d)
+            }
+            if d := cmp.Diff(tt.version, p.Version); d != "" {
+                t.Errorf("Version mismatch (-want +got):\n%s", d)
+            }
+            if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
+                t.Errorf("Type mismatch (-want +got):\n%s", d)
+            }
+
+            // Verify metadata is attached
+            metadata, ok := p.Metadata.(pkg.GGUFFileHeader)
             require.True(t, ok, "metadata should be GGUFFileHeader")
+            if d := cmp.Diff(*tt.metadata, metadata); d != "" {
+                t.Errorf("Metadata mismatch (-want +got):\n%s", d)
+            }

-            // Use AssertPackagesEqual for comprehensive comparison
-            pkgtest.AssertPackagesEqual(t, tt.expected, actual)
+            if tt.checkFunc != nil {
+                tt.checkFunc(t, p)
+            }
         })
     }
 }

View File

@@ -14,35 +14,46 @@ const (
     maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
 )

-// copyHeader copies the GGUF header from the reader to the writer.
-// It validates the magic number first, then copies the rest of the data.
-// The reader should be wrapped with io.LimitedReader to prevent OOM issues.
-func copyHeader(w io.Writer, r io.Reader) error {
-    // Read initial chunk to validate magic number
+// readHeader reads only the GGUF header (metadata) without reading tensor data
+// This is much more efficient than reading the entire file
+// The reader should be wrapped with io.LimitedReader to prevent OOM issues
+func readHeader(r io.Reader) ([]byte, error) {
+    // Read initial chunk to determine header size
     // GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
     initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
     if _, err := io.ReadFull(r, initialBuf); err != nil {
-        return fmt.Errorf("failed to read GGUF header prefix: %w", err)
+        return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
     }

     // Verify magic number
     magic := binary.LittleEndian.Uint32(initialBuf[0:4])
     if magic != ggufMagicNumber {
-        return fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
+        return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
     }

-    // Write the initial buffer to the writer
-    if _, err := w.Write(initialBuf); err != nil {
-        return fmt.Errorf("failed to write GGUF header prefix: %w", err)
-    }
+    // We need to read the metadata KV pairs to know the full header size
+    // The io.LimitedReader wrapping this reader ensures we don't read more than maxHeaderSize
+    headerData := make([]byte, 0, 1024*1024) // Start with 1MB capacity
+    headerData = append(headerData, initialBuf...)

-    // Copy the rest of the header from reader to writer
+    // Read the rest of the header in larger chunks for efficiency
     // The LimitedReader will return EOF once maxHeaderSize is reached
-    if _, err := io.Copy(w, r); err != nil {
-        return fmt.Errorf("failed to copy GGUF header: %w", err)
+    buf := make([]byte, 64*1024) // 64KB chunks
+    for {
+        n, err := r.Read(buf)
+        if n > 0 {
+            headerData = append(headerData, buf[:n]...)
+        }
+        if err == io.EOF {
+            // Reached end of file or limit, we have all available data
+            break
+        }
+        if err != nil {
+            return nil, fmt.Errorf("failed to read GGUF header: %w", err)
+        }
     }

-    return nil
+    return headerData, nil
 }

 // Helper to convert gguf_parser metadata to simpler types
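The same bounded-read pattern can be seen in isolation below. This is a standalone sketch using only the standard library (readBounded and the values are illustrative, not from the PR); it mirrors how readHeader drains a reader capped by io.LimitedReader so memory use is bounded by the limit rather than the file size.

package main

import (
    "bytes"
    "fmt"
    "io"
)

// readBounded drains a reader through io.LimitedReader in 64KB chunks,
// so at most `limit` bytes are ever buffered, as in readHeader above.
func readBounded(r io.Reader, limit int64) ([]byte, error) {
    lr := &io.LimitedReader{R: r, N: limit}
    out := make([]byte, 0, 1024)
    buf := make([]byte, 64*1024)
    for {
        n, err := lr.Read(buf)
        if n > 0 {
            out = append(out, buf[:n]...)
        }
        if err == io.EOF {
            return out, nil // source exhausted or limit reached
        }
        if err != nil {
            return nil, fmt.Errorf("read failed: %w", err)
        }
    }
}

func main() {
    data, _ := readBounded(bytes.NewReader(bytes.Repeat([]byte{0xAB}, 1000)), 256)
    fmt.Println(len(data)) // prints 256: the limit bounds the read
}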

View File

@@ -27,6 +27,14 @@ import (
 func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
     defer internal.CloseAndLogError(reader, reader.Path())

+    // Read and validate the GGUF file header using LimitedReader to prevent OOM
+    // We use LimitedReader to cap reads at maxHeaderSize (50MB)
+    limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
+    headerData, err := readHeader(limitedReader)
+    if err != nil {
+        return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
+    }
+
     // Create a temporary file for the library to parse
     // The library requires a file path, so we create a temp file
     tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
@@ -36,12 +44,10 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
     tempPath := tempFile.Name()
     defer os.Remove(tempPath)

-    // Copy and validate the GGUF file header using LimitedReader to prevent OOM
-    // We use LimitedReader to cap reads at maxHeaderSize (50MB)
-    limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
-    if err := copyHeader(tempFile, limitedReader); err != nil {
+    // Write the validated header data to temp file
+    if _, err := tempFile.Write(headerData); err != nil {
         tempFile.Close()
-        return nil, nil, fmt.Errorf("failed to copy GGUF header: %w", err)
+        return nil, nil, fmt.Errorf("failed to write to temp file: %w", err)
     }

     tempFile.Close()
@@ -61,26 +67,26 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,

     // Convert to syft metadata structure
     syftMetadata := &pkg.GGUFFileHeader{
-        Architecture:          metadata.Architecture,
-        Quantization:          metadata.FileTypeDescriptor,
-        Parameters:            uint64(metadata.Parameters),
-        GGUFVersion:           uint32(ggufFile.Header.Version),
-        TensorCount:           ggufFile.Header.TensorCount,
-        RemainingKeyValues:    convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
-        MetadataKeyValuesHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
+        ModelName:    metadata.Name,
+        License:      metadata.License,
+        Architecture: metadata.Architecture,
+        Quantization: metadata.FileTypeDescriptor,
+        Parameters:   uint64(metadata.Parameters),
+        GGUFVersion:  uint32(ggufFile.Header.Version),
+        TensorCount:  ggufFile.Header.TensorCount,
+        Header:       convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
+        MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
     }

     // If model name is not in metadata, use filename
-    if metadata.Name == "" {
-        metadata.Name = extractModelNameFromPath(reader.Path())
+    if syftMetadata.ModelName == "" {
+        syftMetadata.ModelName = extractModelNameFromPath(reader.Path())
     }

     // Create package from metadata
     p := newGGUFPackage(
         syftMetadata,
-        metadata.Name,
         modelVersion,
-        metadata.License,
         reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
     )
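Since model name, license, and the remaining header fields now live on the metadata struct, a downstream consumer would read them back off the cataloged package via a type assertion, as the tests above do. The sketch below (not part of the diff; printGGUFInfo is an illustrative name, only the pkg.GGUFFileHeader type and its fields come from this change) shows that access pattern.

package main

import (
    "fmt"

    "github.com/anchore/syft/syft/pkg"
)

// printGGUFInfo inspects a cataloged package and, if it carries GGUF metadata,
// prints the renamed fields (ModelName, Header, MetadataHash).
func printGGUFInfo(p pkg.Package) {
    meta, ok := p.Metadata.(pkg.GGUFFileHeader)
    if !ok {
        return // not a GGUF model package
    }
    fmt.Println(meta.ModelName, meta.Architecture, meta.MetadataHash)
    for k, v := range meta.Header {
        fmt.Printf("  %s=%v\n", k, v)
    }
}

func main() {} // placeholder entry point so the sketch compiles standalone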

View File

@@ -0,0 +1,41 @@
+package ai
+
+import (
+	"fmt"
+	"os"
+
+	gguf_parser "github.com/gpustack/gguf-parser-go"
+)
+
+func main() {
+	// Create a test GGUF file
+	data := newTestGGUFBuilder().
+		withVersion(3).
+		withStringKV("general.architecture", "llama").
+		withStringKV("general.name", "test-model").
+		build()
+
+	// Write to temp file
+	tempFile, err := os.CreateTemp("", "test-*.gguf")
+	if err != nil {
+		panic(err)
+	}
+	defer os.Remove(tempFile.Name())
+
+	if _, err := tempFile.Write(data); err != nil {
+		panic(err)
+	}
+	tempFile.Close()
+
+	fmt.Printf("Wrote %d bytes to %s\n", len(data), tempFile.Name())
+
+	// Try to parse it
+	fmt.Println("Attempting to parse...")
+	gf, err := gguf_parser.ParseGGUFFile(tempFile.Name(), gguf_parser.SkipLargeMetadata())
+	if err != nil {
+		fmt.Printf("Parse error: %v\n", err)
+		return
+	}
+
+	fmt.Printf("Success! Model: %s\n", gf.Metadata().Name)
+}

View File

@@ -6,7 +6,6 @@ import (
 )

 // GGUF type constants for test builder
-// https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
 const (
     ggufMagic     = 0x46554747 // "GGUF" in little-endian
     ggufTypeUint8 = 0

View File

@@ -3,14 +3,19 @@ package pkg
 // GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
 // GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
 // loading and saving of models, particularly quantized large language models.
-// The Model Name, License, and Version fields have all been lifted up to be on the syft Package.
 type GGUFFileHeader struct {
     // GGUFVersion is the GGUF format version (e.g., 3)
     GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`

+    // ModelName is the name of the model (from general.name or filename)
+    ModelName string `json:"modelName" cyclonedx:"modelName"`
+
     // FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
     FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`

+    // License is the license identifier (from general.license if present)
+    License string `json:"license,omitempty" cyclonedx:"license"`
+
     // Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
     Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
@@ -23,15 +28,15 @@ type GGUFFileHeader struct {
     // TensorCount is the number of tensors in the model
     TensorCount uint64 `json:"tensorCount" cyclonedx:"tensorCount"`

-    // RemainingKeyValues contains the remaining key-value pairs from the GGUF header that are not already
+    // Header contains the remaining key-value pairs from the GGUF header that are not already
     // represented as typed fields above. This preserves additional metadata fields for reference
     // (namespaced with general.*, llama.*, etc.) while avoiding duplication.
-    RemainingKeyValues map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
+    Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`

-    // MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.
+    // MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.
     // This hash is computed over the complete header metadata (including the fields extracted
     // into typed fields above) and provides a stable identifier for the model configuration
     // across different file locations or remotes. It allows matching identical models even
     // when stored in different repositories or with different filenames.
-    MetadataKeyValuesHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
+    MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
 }
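To tie the struct rename back to the schema diff above, the sketch below marshals the renamed struct with encoding/json; assuming the type has no custom marshaller, the output keys follow the json tags shown above (ggufVersion, modelName, header, metadataHash, ...). All field values here are illustrative placeholders, not taken from any fixture.

package main

import (
    "encoding/json"
    "fmt"

    "github.com/anchore/syft/syft/pkg"
)

func main() {
    meta := pkg.GGUFFileHeader{
        GGUFVersion:  3,
        ModelName:    "example-model", // illustrative value
        Architecture: "llama",
        TensorCount:  291,
        Header:       map[string]interface{}{"general.quantization": "Q4_K_M"},
        MetadataHash: "0000000000000000", // placeholder; the real value is an xx64 hash
    }
    out, _ := json.MarshalIndent(meta, "", "  ")
    fmt.Println(string(out))
}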