mirror of
https://github.com/anchore/syft.git
synced 2025-11-17 00:13:15 +01:00
Compare commits
4 Commits
9609ce2b36...f1839215c6
| Author | SHA1 | Date | |
|---|---|---|---|
| | f1839215c6 | | |
| | 8706ff8310 | | |
| | e58e6317d2 | | |
| | b1c8478d55 | | |
@ -124,7 +124,7 @@ var jsonTypes = makeJSONTypes(
|
|||||||
jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
|
jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
|
||||||
jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
|
jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
|
||||||
jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
|
jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
|
||||||
jsonNames(pkg.GGUFFileHeader{}, "gguf-file-metadata"),
|
jsonNames(pkg.GGUFFileHeader{}, "gguf-file-header"),
|
||||||
)
|
)
|
||||||
|
|
||||||
func expandLegacyNameVariants(names ...string) []string {
|
func expandLegacyNameVariants(names ...string) []string {
|
||||||
|
|||||||
@ -1433,24 +1433,16 @@
|
|||||||
],
|
],
|
||||||
"description": "FileMetadataEntry contains filesystem-level metadata attributes such as permissions, ownership, type, and size for a cataloged file."
|
"description": "FileMetadataEntry contains filesystem-level metadata attributes such as permissions, ownership, type, and size for a cataloged file."
|
||||||
},
|
},
|
||||||
"GgufFileMetadata": {
|
"GgufFileHeader": {
|
||||||
"properties": {
|
"properties": {
|
||||||
"ggufVersion": {
|
"ggufVersion": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "GGUFVersion is the GGUF format version (e.g., 3)"
|
"description": "GGUFVersion is the GGUF format version (e.g., 3)"
|
||||||
},
|
},
|
||||||
"modelName": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "ModelName is the name of the model (from general.name or filename)"
|
|
||||||
},
|
|
||||||
"fileSize": {
|
"fileSize": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)"
|
"description": "FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)"
|
||||||
},
|
},
|
||||||
"license": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "License is the license identifier (from general.license if present)"
|
|
||||||
},
|
|
||||||
"architecture": {
|
"architecture": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Architecture is the model architecture (from general.architecture, e.g., \"qwen3moe\", \"llama\")"
|
"description": "Architecture is the model architecture (from general.architecture, e.g., \"qwen3moe\", \"llama\")"
|
||||||
@ -1469,17 +1461,16 @@
|
|||||||
},
|
},
|
||||||
"header": {
|
"header": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"description": "Header contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
|
"description": "RemainingKeyValues contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
|
||||||
},
|
},
|
||||||
"metadataHash": {
|
"metadataHash": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
|
"description": "MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
"ggufVersion",
|
"ggufVersion",
|
||||||
"modelName",
|
|
||||||
"tensorCount"
|
"tensorCount"
|
||||||
],
|
],
|
||||||
"description": "GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file."
|
"description": "GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file."
|
||||||
@ -2631,7 +2622,7 @@
|
|||||||
"$ref": "#/$defs/ErlangRebarLockEntry"
|
"$ref": "#/$defs/ErlangRebarLockEntry"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/$defs/GgufFileMetadata"
|
"$ref": "#/$defs/GgufFileHeader"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/$defs/GithubActionsUseStatement"
|
"$ref": "#/$defs/GithubActionsUseStatement"
|
||||||
|
|||||||
@ -1433,24 +1433,16 @@
|
|||||||
],
|
],
|
||||||
"description": "FileMetadataEntry contains filesystem-level metadata attributes such as permissions, ownership, type, and size for a cataloged file."
|
"description": "FileMetadataEntry contains filesystem-level metadata attributes such as permissions, ownership, type, and size for a cataloged file."
|
||||||
},
|
},
|
||||||
"GgufFileMetadata": {
|
"GgufFileHeader": {
|
||||||
"properties": {
|
"properties": {
|
||||||
"ggufVersion": {
|
"ggufVersion": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "GGUFVersion is the GGUF format version (e.g., 3)"
|
"description": "GGUFVersion is the GGUF format version (e.g., 3)"
|
||||||
},
|
},
|
||||||
"modelName": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "ModelName is the name of the model (from general.name or filename)"
|
|
||||||
},
|
|
||||||
"fileSize": {
|
"fileSize": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"description": "FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)"
|
"description": "FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)"
|
||||||
},
|
},
|
||||||
"license": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "License is the license identifier (from general.license if present)"
|
|
||||||
},
|
|
||||||
"architecture": {
|
"architecture": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Architecture is the model architecture (from general.architecture, e.g., \"qwen3moe\", \"llama\")"
|
"description": "Architecture is the model architecture (from general.architecture, e.g., \"qwen3moe\", \"llama\")"
|
||||||
@ -1469,17 +1461,16 @@
|
|||||||
},
|
},
|
||||||
"header": {
|
"header": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"description": "Header contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
|
"description": "RemainingKeyValues contains the remaining key-value pairs from the GGUF header that are not already\nrepresented as typed fields above. This preserves additional metadata fields for reference\n(namespaced with general.*, llama.*, etc.) while avoiding duplication."
|
||||||
},
|
},
|
||||||
"metadataHash": {
|
"metadataHash": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
|
"description": "MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"required": [
|
"required": [
|
||||||
"ggufVersion",
|
"ggufVersion",
|
||||||
"modelName",
|
|
||||||
"tensorCount"
|
"tensorCount"
|
||||||
],
|
],
|
||||||
"description": "GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file."
|
"description": "GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file."
|
||||||
@ -2631,7 +2622,7 @@
|
|||||||
"$ref": "#/$defs/ErlangRebarLockEntry"
|
"$ref": "#/$defs/ErlangRebarLockEntry"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/$defs/GgufFileMetadata"
|
"$ref": "#/$defs/GgufFileHeader"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/$defs/GithubActionsUseStatement"
|
"$ref": "#/$defs/GithubActionsUseStatement"
|
||||||
|
|||||||
@ -5,8 +5,6 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp/cmpopts"
|
|
||||||
|
|
||||||
"github.com/anchore/syft/syft/artifact"
|
"github.com/anchore/syft/syft/artifact"
|
||||||
"github.com/anchore/syft/syft/pkg"
|
"github.com/anchore/syft/syft/pkg"
|
||||||
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
|
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
|
||||||
@ -37,7 +35,7 @@ func TestGGUFCataloger_Globs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestGGUFCataloger_Integration(t *testing.T) {
|
func TestGGUFCataloger(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
setup func(t *testing.T) string
|
setup func(t *testing.T) string
|
||||||
@ -56,6 +54,7 @@ func TestGGUFCataloger_Integration(t *testing.T) {
|
|||||||
withStringKV("general.license", "Apache-2.0").
|
withStringKV("general.license", "Apache-2.0").
|
||||||
withStringKV("general.quantization", "Q4_K_M").
|
withStringKV("general.quantization", "Q4_K_M").
|
||||||
withUint64KV("general.parameter_count", 8030000000).
|
withUint64KV("general.parameter_count", 8030000000).
|
||||||
|
withStringKV("general.some_random_kv", "foobar").
|
||||||
build()
|
build()
|
||||||
|
|
||||||
path := filepath.Join(dir, "llama3-8b.gguf")
|
path := filepath.Join(dir, "llama3-8b.gguf")
|
||||||
@ -71,14 +70,53 @@ func TestGGUFCataloger_Integration(t *testing.T) {
|
|||||||
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
|
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
|
||||||
),
|
),
|
||||||
Metadata: pkg.GGUFFileHeader{
|
Metadata: pkg.GGUFFileHeader{
|
||||||
ModelName: "llama3-8b",
|
Architecture: "llama",
|
||||||
License: "Apache-2.0",
|
Quantization: "Unknown",
|
||||||
Architecture: "llama",
|
Parameters: 0,
|
||||||
Quantization: "Unknown",
|
GGUFVersion: 3,
|
||||||
Parameters: 0,
|
TensorCount: 0,
|
||||||
GGUFVersion: 3,
|
MetadataKeyValuesHash: "6e3d368066455ce4",
|
||||||
TensorCount: 0,
|
RemainingKeyValues: map[string]interface{}{
|
||||||
Header: map[string]interface{}{},
|
"general.some_random_kv": "foobar",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expectedRelationships: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "catalog GGUF file with minimal metadata",
|
||||||
|
setup: func(t *testing.T) string {
|
||||||
|
dir := t.TempDir()
|
||||||
|
data := newTestGGUFBuilder().
|
||||||
|
withVersion(3).
|
||||||
|
withStringKV("general.architecture", "gpt2").
|
||||||
|
withStringKV("general.name", "gpt2-small").
|
||||||
|
withStringKV("gpt2.context_length", "1024").
|
||||||
|
withUint32KV("gpt2.embedding_length", 768).
|
||||||
|
build()
|
||||||
|
|
||||||
|
path := filepath.Join(dir, "gpt2-small.gguf")
|
||||||
|
os.WriteFile(path, data, 0644)
|
||||||
|
return dir
|
||||||
|
},
|
||||||
|
expectedPackages: []pkg.Package{
|
||||||
|
{
|
||||||
|
Name: "gpt2-small",
|
||||||
|
Version: "",
|
||||||
|
Type: pkg.ModelPkg,
|
||||||
|
Licenses: pkg.NewLicenseSet(),
|
||||||
|
Metadata: pkg.GGUFFileHeader{
|
||||||
|
Architecture: "gpt2",
|
||||||
|
Quantization: "Unknown",
|
||||||
|
Parameters: 0,
|
||||||
|
GGUFVersion: 3,
|
||||||
|
TensorCount: 0,
|
||||||
|
MetadataKeyValuesHash: "9dc6f23591062a27",
|
||||||
|
RemainingKeyValues: map[string]interface{}{
|
||||||
|
"gpt2.context_length": "1024",
|
||||||
|
"gpt2.embedding_length": uint32(768),
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -91,17 +129,12 @@ func TestGGUFCataloger_Integration(t *testing.T) {
|
|||||||
fixtureDir := tt.setup(t)
|
fixtureDir := tt.setup(t)
|
||||||
|
|
||||||
// Use pkgtest to catalog and compare
|
// Use pkgtest to catalog and compare
|
||||||
tester := pkgtest.NewCatalogTester().
|
pkgtest.NewCatalogTester().
|
||||||
FromDirectory(t, fixtureDir).
|
FromDirectory(t, fixtureDir).
|
||||||
Expects(tt.expectedPackages, tt.expectedRelationships).
|
Expects(tt.expectedPackages, tt.expectedRelationships).
|
||||||
IgnoreLocationLayer().
|
IgnoreLocationLayer().
|
||||||
IgnorePackageFields("FoundBy", "Locations"). // These are set by the cataloger
|
IgnorePackageFields("FoundBy", "Locations").
|
||||||
WithCompareOptions(
|
TestCataloger(t, NewGGUFCataloger())
|
||||||
// Ignore MetadataHash as it's computed dynamically
|
|
||||||
cmpopts.IgnoreFields(pkg.GGUFFileHeader{}, "MetadataHash"),
|
|
||||||
)
|
|
||||||
|
|
||||||
tester.TestCataloger(t, NewGGUFCataloger())
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,23 +5,17 @@ import (
|
|||||||
"github.com/anchore/syft/syft/pkg"
|
"github.com/anchore/syft/syft/pkg"
|
||||||
)
|
)
|
||||||
|
|
||||||
func newGGUFPackage(metadata *pkg.GGUFFileHeader, version string, locations ...file.Location) pkg.Package {
|
func newGGUFPackage(metadata *pkg.GGUFFileHeader, modelName, version, license string, locations ...file.Location) pkg.Package {
|
||||||
p := pkg.Package{
|
p := pkg.Package{
|
||||||
Name: metadata.ModelName,
|
Name: modelName,
|
||||||
Version: version,
|
Version: version,
|
||||||
Locations: file.NewLocationSet(locations...),
|
Locations: file.NewLocationSet(locations...),
|
||||||
Type: pkg.ModelPkg,
|
Type: pkg.ModelPkg,
|
||||||
Licenses: pkg.NewLicenseSet(),
|
Licenses: pkg.NewLicenseSet(pkg.NewLicensesFromValues(license)...),
|
||||||
Metadata: *metadata,
|
Metadata: *metadata,
|
||||||
// NOTE: PURL is intentionally not set as the package-url spec
|
// NOTE: PURL is intentionally not set as the package-url spec
|
||||||
// has not yet finalized support for ML model packages
|
// has not yet finalized support for ML model packages
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add license to the package if present in metadata
|
|
||||||
if metadata.License != "" {
|
|
||||||
p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil))
|
|
||||||
}
|
|
||||||
|
|
||||||
p.SetID()
|
p.SetID()
|
||||||
|
|
||||||
return p
|
return p
|
||||||
|
|||||||
@ -3,121 +3,119 @@ package ai
|
|||||||
import (
|
import (
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
|
||||||
"github.com/stretchr/testify/assert"
|
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
"github.com/anchore/syft/syft/file"
|
"github.com/anchore/syft/syft/file"
|
||||||
"github.com/anchore/syft/syft/pkg"
|
"github.com/anchore/syft/syft/pkg"
|
||||||
|
"github.com/anchore/syft/syft/pkg/cataloger/internal/pkgtest"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestNewGGUFPackage(t *testing.T) {
|
func TestNewGGUFPackage(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
metadata *pkg.GGUFFileHeader
|
metadata *pkg.GGUFFileHeader
|
||||||
version string
|
input struct {
|
||||||
locations []file.Location
|
modelName string
|
||||||
checkFunc func(t *testing.T, p pkg.Package)
|
version string
|
||||||
|
license string
|
||||||
|
locations []file.Location
|
||||||
|
}
|
||||||
|
expected pkg.Package
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "complete GGUF package with all fields",
|
name: "complete GGUF package with all fields",
|
||||||
version: "3.0",
|
input: struct {
|
||||||
|
modelName string
|
||||||
|
version string
|
||||||
|
license string
|
||||||
|
locations []file.Location
|
||||||
|
}{
|
||||||
|
modelName: "llama3-8b",
|
||||||
|
version: "3.0",
|
||||||
|
license: "Apache-2.0",
|
||||||
|
locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
|
||||||
|
},
|
||||||
metadata: &pkg.GGUFFileHeader{
|
metadata: &pkg.GGUFFileHeader{
|
||||||
ModelName: "llama3-8b-instruct",
|
|
||||||
License: "Apache-2.0",
|
|
||||||
Architecture: "llama",
|
Architecture: "llama",
|
||||||
Quantization: "Q4_K_M",
|
Quantization: "Q4_K_M",
|
||||||
Parameters: 8030000000,
|
Parameters: 8030000000,
|
||||||
GGUFVersion: 3,
|
GGUFVersion: 3,
|
||||||
TensorCount: 291,
|
TensorCount: 291,
|
||||||
Header: map[string]any{},
|
RemainingKeyValues: map[string]any{
|
||||||
|
"general.random_kv": "foobar",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
locations: []file.Location{file.NewLocation("/models/llama3-8b.gguf")},
|
expected: pkg.Package{
|
||||||
checkFunc: func(t *testing.T, p pkg.Package) {
|
Name: "llama3-8b",
|
||||||
if d := cmp.Diff("llama3-8b-instruct", p.Name); d != "" {
|
Version: "3.0",
|
||||||
t.Errorf("Name mismatch (-want +got):\n%s", d)
|
Type: pkg.ModelPkg,
|
||||||
}
|
Licenses: pkg.NewLicenseSet(
|
||||||
if d := cmp.Diff("3.0", p.Version); d != "" {
|
pkg.NewLicenseFromFields("Apache-2.0", "", nil),
|
||||||
t.Errorf("Version mismatch (-want +got):\n%s", d)
|
),
|
||||||
}
|
Metadata: pkg.GGUFFileHeader{
|
||||||
if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
|
Architecture: "llama",
|
||||||
t.Errorf("Type mismatch (-want +got):\n%s", d)
|
Quantization: "Q4_K_M",
|
||||||
}
|
Parameters: 8030000000,
|
||||||
assert.Empty(t, p.PURL, "PURL should not be set for model packages")
|
GGUFVersion: 3,
|
||||||
assert.Len(t, p.Licenses.ToSlice(), 1)
|
TensorCount: 291,
|
||||||
if d := cmp.Diff("Apache-2.0", p.Licenses.ToSlice()[0].Value); d != "" {
|
RemainingKeyValues: map[string]any{
|
||||||
t.Errorf("License value mismatch (-want +got):\n%s", d)
|
"general.random_kv": "foobar",
|
||||||
}
|
},
|
||||||
assert.NotEmpty(t, p.ID())
|
},
|
||||||
|
Locations: file.NewLocationSet(file.NewLocation("/models/llama3-8b.gguf")),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "minimal GGUF package",
|
name: "minimal GGUF package",
|
||||||
version: "1.0",
|
input: struct {
|
||||||
|
modelName string
|
||||||
|
version string
|
||||||
|
license string
|
||||||
|
locations []file.Location
|
||||||
|
}{
|
||||||
|
modelName: "gpt2-small",
|
||||||
|
version: "1.0",
|
||||||
|
license: "MIT",
|
||||||
|
locations: []file.Location{file.NewLocation("/models/simple.gguf")},
|
||||||
|
},
|
||||||
metadata: &pkg.GGUFFileHeader{
|
metadata: &pkg.GGUFFileHeader{
|
||||||
ModelName: "simple-model",
|
|
||||||
Architecture: "gpt2",
|
Architecture: "gpt2",
|
||||||
GGUFVersion: 3,
|
GGUFVersion: 3,
|
||||||
TensorCount: 50,
|
TensorCount: 50,
|
||||||
},
|
},
|
||||||
locations: []file.Location{file.NewLocation("/models/simple.gguf")},
|
expected: pkg.Package{
|
||||||
checkFunc: func(t *testing.T, p pkg.Package) {
|
Name: "gpt2-small",
|
||||||
if d := cmp.Diff("simple-model", p.Name); d != "" {
|
Version: "1.0",
|
||||||
t.Errorf("Name mismatch (-want +got):\n%s", d)
|
Type: pkg.ModelPkg,
|
||||||
}
|
Licenses: pkg.NewLicenseSet(
|
||||||
if d := cmp.Diff("1.0", p.Version); d != "" {
|
pkg.NewLicenseFromFields("MIT", "", nil),
|
||||||
t.Errorf("Version mismatch (-want +got):\n%s", d)
|
),
|
||||||
}
|
Metadata: pkg.GGUFFileHeader{
|
||||||
if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
|
Architecture: "gpt2",
|
||||||
t.Errorf("Type mismatch (-want +got):\n%s", d)
|
GGUFVersion: 3,
|
||||||
}
|
TensorCount: 50,
|
||||||
assert.Empty(t, p.PURL, "PURL should not be set for model packages")
|
},
|
||||||
assert.Empty(t, p.Licenses.ToSlice())
|
Locations: file.NewLocationSet(file.NewLocation("/models/simple.gguf")),
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "GGUF package with multiple locations",
|
|
||||||
version: "1.5",
|
|
||||||
metadata: &pkg.GGUFFileHeader{
|
|
||||||
ModelName: "multi-location-model",
|
|
||||||
Architecture: "llama",
|
|
||||||
GGUFVersion: 3,
|
|
||||||
TensorCount: 150,
|
|
||||||
},
|
|
||||||
locations: []file.Location{
|
|
||||||
file.NewLocation("/models/model1.gguf"),
|
|
||||||
file.NewLocation("/models/model2.gguf"),
|
|
||||||
},
|
|
||||||
checkFunc: func(t *testing.T, p pkg.Package) {
|
|
||||||
assert.Len(t, p.Locations.ToSlice(), 2)
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
p := newGGUFPackage(tt.metadata, tt.version, tt.locations...)
|
actual := newGGUFPackage(
|
||||||
|
tt.metadata,
|
||||||
|
tt.input.modelName,
|
||||||
|
tt.input.version,
|
||||||
|
tt.input.license,
|
||||||
|
tt.input.locations...,
|
||||||
|
)
|
||||||
|
|
||||||
if d := cmp.Diff(tt.metadata.ModelName, p.Name); d != "" {
|
// Verify metadata type
|
||||||
t.Errorf("Name mismatch (-want +got):\n%s", d)
|
_, ok := actual.Metadata.(pkg.GGUFFileHeader)
|
||||||
}
|
|
||||||
if d := cmp.Diff(tt.version, p.Version); d != "" {
|
|
||||||
t.Errorf("Version mismatch (-want +got):\n%s", d)
|
|
||||||
}
|
|
||||||
if d := cmp.Diff(pkg.ModelPkg, p.Type); d != "" {
|
|
||||||
t.Errorf("Type mismatch (-want +got):\n%s", d)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Verify metadata is attached
|
|
||||||
metadata, ok := p.Metadata.(pkg.GGUFFileHeader)
|
|
||||||
require.True(t, ok, "metadata should be GGUFFileHeader")
|
require.True(t, ok, "metadata should be GGUFFileHeader")
|
||||||
if d := cmp.Diff(*tt.metadata, metadata); d != "" {
|
|
||||||
t.Errorf("Metadata mismatch (-want +got):\n%s", d)
|
|
||||||
}
|
|
||||||
|
|
||||||
if tt.checkFunc != nil {
|
// Use AssertPackagesEqual for comprehensive comparison
|
||||||
tt.checkFunc(t, p)
|
pkgtest.AssertPackagesEqual(t, tt.expected, actual)
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,46 +14,35 @@ const (
|
|||||||
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
|
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
|
||||||
)
|
)
|
||||||
|
|
||||||
// readHeader reads only the GGUF header (metadata) without reading tensor data
|
// copyHeader copies the GGUF header from the reader to the writer.
|
||||||
// This is much more efficient than reading the entire file
|
// It validates the magic number first, then copies the rest of the data.
|
||||||
// The reader should be wrapped with io.LimitedReader to prevent OOM issues
|
// The reader should be wrapped with io.LimitedReader to prevent OOM issues.
|
||||||
func readHeader(r io.Reader) ([]byte, error) {
|
func copyHeader(w io.Writer, r io.Reader) error {
|
||||||
// Read initial chunk to determine header size
|
// Read initial chunk to validate magic number
|
||||||
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
|
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
|
||||||
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
|
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
|
||||||
if _, err := io.ReadFull(r, initialBuf); err != nil {
|
if _, err := io.ReadFull(r, initialBuf); err != nil {
|
||||||
return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
|
return fmt.Errorf("failed to read GGUF header prefix: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify magic number
|
// Verify magic number
|
||||||
magic := binary.LittleEndian.Uint32(initialBuf[0:4])
|
magic := binary.LittleEndian.Uint32(initialBuf[0:4])
|
||||||
if magic != ggufMagicNumber {
|
if magic != ggufMagicNumber {
|
||||||
return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
|
return fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We need to read the metadata KV pairs to know the full header size
|
// Write the initial buffer to the writer
|
||||||
// The io.LimitedReader wrapping this reader ensures we don't read more than maxHeaderSize
|
if _, err := w.Write(initialBuf); err != nil {
|
||||||
headerData := make([]byte, 0, 1024*1024) // Start with 1MB capacity
|
return fmt.Errorf("failed to write GGUF header prefix: %w", err)
|
||||||
headerData = append(headerData, initialBuf...)
|
}
|
||||||
|
|
||||||
// Read the rest of the header in larger chunks for efficiency
|
// Copy the rest of the header from reader to writer
|
||||||
// The LimitedReader will return EOF once maxHeaderSize is reached
|
// The LimitedReader will return EOF once maxHeaderSize is reached
|
||||||
buf := make([]byte, 64*1024) // 64KB chunks
|
if _, err := io.Copy(w, r); err != nil {
|
||||||
for {
|
return fmt.Errorf("failed to copy GGUF header: %w", err)
|
||||||
n, err := r.Read(buf)
|
|
||||||
if n > 0 {
|
|
||||||
headerData = append(headerData, buf[:n]...)
|
|
||||||
}
|
|
||||||
if err == io.EOF {
|
|
||||||
// Reached end of file or limit, we have all available data
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to read GGUF header: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return headerData, nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to convert gguf_parser metadata to simpler types
|
// Helper to convert gguf_parser metadata to simpler types
|
||||||
|
|||||||
@ -27,14 +27,6 @@ import (
|
|||||||
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
||||||
defer internal.CloseAndLogError(reader, reader.Path())
|
defer internal.CloseAndLogError(reader, reader.Path())
|
||||||
|
|
||||||
// Read and validate the GGUF file header using LimitedReader to prevent OOM
|
|
||||||
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
|
|
||||||
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
|
|
||||||
headerData, err := readHeader(limitedReader)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a temporary file for the library to parse
|
// Create a temporary file for the library to parse
|
||||||
// The library requires a file path, so we create a temp file
|
// The library requires a file path, so we create a temp file
|
||||||
tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
|
tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
|
||||||
@ -44,10 +36,12 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
|||||||
tempPath := tempFile.Name()
|
tempPath := tempFile.Name()
|
||||||
defer os.Remove(tempPath)
|
defer os.Remove(tempPath)
|
||||||
|
|
||||||
// Write the validated header data to temp file
|
// Copy and validate the GGUF file header using LimitedReader to prevent OOM
|
||||||
if _, err := tempFile.Write(headerData); err != nil {
|
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
|
||||||
|
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
|
||||||
|
if err := copyHeader(tempFile, limitedReader); err != nil {
|
||||||
tempFile.Close()
|
tempFile.Close()
|
||||||
return nil, nil, fmt.Errorf("failed to write to temp file: %w", err)
|
return nil, nil, fmt.Errorf("failed to copy GGUF header: %w", err)
|
||||||
}
|
}
|
||||||
tempFile.Close()
|
tempFile.Close()
|
||||||
|
|
||||||
@ -67,26 +61,26 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
|||||||
|
|
||||||
// Convert to syft metadata structure
|
// Convert to syft metadata structure
|
||||||
syftMetadata := &pkg.GGUFFileHeader{
|
syftMetadata := &pkg.GGUFFileHeader{
|
||||||
ModelName: metadata.Name,
|
Architecture: metadata.Architecture,
|
||||||
License: metadata.License,
|
Quantization: metadata.FileTypeDescriptor,
|
||||||
Architecture: metadata.Architecture,
|
Parameters: uint64(metadata.Parameters),
|
||||||
Quantization: metadata.FileTypeDescriptor,
|
GGUFVersion: uint32(ggufFile.Header.Version),
|
||||||
Parameters: uint64(metadata.Parameters),
|
TensorCount: ggufFile.Header.TensorCount,
|
||||||
GGUFVersion: uint32(ggufFile.Header.Version),
|
RemainingKeyValues: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
|
||||||
TensorCount: ggufFile.Header.TensorCount,
|
MetadataKeyValuesHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
|
||||||
Header: convertGGUFMetadataKVs(ggufFile.Header.MetadataKV),
|
|
||||||
MetadataHash: computeKVMetadataHash(ggufFile.Header.MetadataKV),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// If model name is not in metadata, use filename
|
// If model name is not in metadata, use filename
|
||||||
if syftMetadata.ModelName == "" {
|
if metadata.Name == "" {
|
||||||
syftMetadata.ModelName = extractModelNameFromPath(reader.Path())
|
metadata.Name = extractModelNameFromPath(reader.Path())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create package from metadata
|
// Create package from metadata
|
||||||
p := newGGUFPackage(
|
p := newGGUFPackage(
|
||||||
syftMetadata,
|
syftMetadata,
|
||||||
|
metadata.Name,
|
||||||
modelVersion,
|
modelVersion,
|
||||||
|
metadata.License,
|
||||||
reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
|
reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -1,41 +0,0 @@
|
|||||||
package ai
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
|
|
||||||
gguf_parser "github.com/gpustack/gguf-parser-go"
|
|
||||||
)
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
// Create a test GGUF file
|
|
||||||
data := newTestGGUFBuilder().
|
|
||||||
withVersion(3).
|
|
||||||
withStringKV("general.architecture", "llama").
|
|
||||||
withStringKV("general.name", "test-model").
|
|
||||||
build()
|
|
||||||
|
|
||||||
// Write to temp file
|
|
||||||
tempFile, err := os.CreateTemp("", "test-*.gguf")
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
defer os.Remove(tempFile.Name())
|
|
||||||
|
|
||||||
if _, err := tempFile.Write(data); err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
tempFile.Close()
|
|
||||||
|
|
||||||
fmt.Printf("Wrote %d bytes to %s\n", len(data), tempFile.Name())
|
|
||||||
|
|
||||||
// Try to parse it
|
|
||||||
fmt.Println("Attempting to parse...")
|
|
||||||
gf, err := gguf_parser.ParseGGUFFile(tempFile.Name(), gguf_parser.SkipLargeMetadata())
|
|
||||||
if err != nil {
|
|
||||||
fmt.Printf("Parse error: %v\n", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Success! Model: %s\n", gf.Metadata().Name)
|
|
||||||
}
|
|
||||||
@ -6,6 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// GGUF type constants for test builder
|
// GGUF type constants for test builder
|
||||||
|
// https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
|
||||||
const (
|
const (
|
||||||
ggufMagic = 0x46554747 // "GGUF" in little-endian
|
ggufMagic = 0x46554747 // "GGUF" in little-endian
|
||||||
ggufTypeUint8 = 0
|
ggufTypeUint8 = 0
|
||||||
|
|||||||
@ -3,19 +3,14 @@ package pkg
|
|||||||
// GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
|
// GGUFFileHeader represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
|
||||||
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
|
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
|
||||||
// loading and saving of models, particularly quantized large language models.
|
// loading and saving of models, particularly quantized large language models.
|
||||||
|
// The Model Name, License, and Version fields have all been lifted up to be on the syft Package.
|
||||||
type GGUFFileHeader struct {
|
type GGUFFileHeader struct {
|
||||||
// GGUFVersion is the GGUF format version (e.g., 3)
|
// GGUFVersion is the GGUF format version (e.g., 3)
|
||||||
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
|
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
|
||||||
|
|
||||||
// ModelName is the name of the model (from general.name or filename)
|
|
||||||
ModelName string `json:"modelName" cyclonedx:"modelName"`
|
|
||||||
|
|
||||||
// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
|
// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
|
||||||
FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
|
FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
|
||||||
|
|
||||||
// License is the license identifier (from general.license if present)
|
|
||||||
License string `json:"license,omitempty" cyclonedx:"license"`
|
|
||||||
|
|
||||||
// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
|
// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
|
||||||
Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
|
Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
|
||||||
|
|
||||||
@ -28,15 +23,15 @@ type GGUFFileHeader struct {
|
|||||||
// TensorCount is the number of tensors in the model
|
// TensorCount is the number of tensors in the model
|
||||||
TensorCount uint64 `json:"tensorCount" cyclonedx:"tensorCount"`
|
TensorCount uint64 `json:"tensorCount" cyclonedx:"tensorCount"`
|
||||||
|
|
||||||
// Header contains the remaining key-value pairs from the GGUF header that are not already
|
// RemainingKeyValues contains the remaining key-value pairs from the GGUF header that are not already
|
||||||
// represented as typed fields above. This preserves additional metadata fields for reference
|
// represented as typed fields above. This preserves additional metadata fields for reference
|
||||||
// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
|
// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
|
||||||
Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
|
RemainingKeyValues map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
|
||||||
|
|
||||||
// MetadataHash is a xx64 hash of all key-value pairs from the GGUF header metadata.
|
// MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.
|
||||||
// This hash is computed over the complete header metadata (including the fields extracted
|
// This hash is computed over the complete header metadata (including the fields extracted
|
||||||
// into typed fields above) and provides a stable identifier for the model configuration
|
// into typed fields above) and provides a stable identifier for the model configuration
|
||||||
// across different file locations or remotes. It allows matching identical models even
|
// across different file locations or remotes. It allows matching identical models even
|
||||||
// when stored in different repositories or with different filenames.
|
// when stored in different repositories or with different filenames.
|
||||||
MetadataHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
|
MetadataKeyValuesHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user