Mirror of https://github.com/anchore/syft.git (synced 2025-11-17 16:33:21 +01:00)

Commit 6ceef5fe4a (parent 2e100f33f3)

feat: migrate gguf parser to separate PR from oci

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
.gitignore (vendored, 2 lines added)

@@ -73,3 +73,5 @@ cosign.pub
 __pycache__/
 *.py[cod]
 *$py.class
+
+
@@ -124,6 +124,7 @@ var jsonTypes = makeJSONTypes(
 	jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
 	jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
 	jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
+	jsonNames(pkg.GGUFFileMetadata{}, "gguf-file-metadata", "GGUFFileMetadata"),
 )
 
 func expandLegacyNameVariants(names ...string) []string {
@@ -37,6 +37,7 @@ import (
 	"github.com/anchore/syft/syft/pkg/cataloger/swipl"
 	"github.com/anchore/syft/syft/pkg/cataloger/terraform"
 	"github.com/anchore/syft/syft/pkg/cataloger/wordpress"
+	"github.com/anchore/syft/syft/pkg/cataloger/aiartifact"
 )
 
 const (
@@ -178,6 +179,7 @@ func DefaultPackageTaskFactories() Factories {
 		newSimplePackageTaskFactory(homebrew.NewCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, "homebrew"),
 		newSimplePackageTaskFactory(conda.NewCondaMetaCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.PackageTag, "conda"),
 		newSimplePackageTaskFactory(snap.NewCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, "snap"),
+		newSimplePackageTaskFactory(aiartifact.NewGGUFCataloger, pkgcataloging.DirectoryTag, pkgcataloging.ImageTag, "ai-artifact", "model", "gguf", "ml"),
 
 		// deprecated catalogers ////////////////////////////////////////
 		// these are catalogers that should not be selectable other than specific inclusion via name or "deprecated" tag (to remain backwards compatible)
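With the factory above registered, the new cataloger is selectable by its name or by any of its tags ("ai-artifact", "model", "gguf", "ml"). An invocation sketch, assuming syft's existing --select-catalogers flag and a hypothetical local directory of model files:

	syft scan dir:./models --select-catalogers gguf -o syft-json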
@@ -42,6 +42,8 @@ func EncodeComponent(p pkg.Package, supplier string, locationSorter func(a, b fi
 	componentType := cyclonedx.ComponentTypeLibrary
 	if p.Type == pkg.BinaryPkg {
 		componentType = cyclonedx.ComponentTypeApplication
+	} else if p.Type == pkg.ModelPkg {
+		componentType = cyclonedx.ComponentTypeMachineLearningModel
 	}
 
 	return cyclonedx.Component{
@@ -62,7 +62,7 @@ func collectPackages(component *cyclonedx.Component, s *sbom.SBOM, idMap map[str
 	switch component.Type {
 	case cyclonedx.ComponentTypeOS:
 	case cyclonedx.ComponentTypeContainer:
-	case cyclonedx.ComponentTypeApplication, cyclonedx.ComponentTypeFramework, cyclonedx.ComponentTypeLibrary:
+	case cyclonedx.ComponentTypeApplication, cyclonedx.ComponentTypeFramework, cyclonedx.ComponentTypeLibrary, cyclonedx.ComponentTypeMachineLearningModel:
 		p := decodeComponent(component)
 		idMap[component.BOMRef] = p
 		if component.BOMRef != "" {
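Taken together, these two hunks make the CycloneDX mapping symmetric: ModelPkg encodes to a machine-learning-model component, and decoding accepts such components back as packages. A minimal sketch of the encode direction (assuming, as the hunk implies but does not show, that EncodeComponent returns a cyclonedx.Component):

	// sketch only; would live alongside EncodeComponent in the same package
	func ExampleEncodeComponent_model() {
		p := pkg.Package{Name: "tiny-llm", Version: "1.0", Type: pkg.ModelPkg}
		c := EncodeComponent(p, "", nil) // empty supplier, no location sorter
		fmt.Println(c.Type)
		// Output: machine-learning-model
	}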
syft/pkg/cataloger/aiartifact/cataloger.go (new file, 16 lines)

@@ -0,0 +1,16 @@
+/*
+Package aiartifact provides concrete Cataloger implementations for AI artifacts and machine learning models,
+including support for GGUF (GPT-Generated Unified Format) model files.
+*/
+package aiartifact
+
+import (
+	"github.com/anchore/syft/syft/pkg"
+	"github.com/anchore/syft/syft/pkg/cataloger/generic"
+)
+
+// NewGGUFCataloger returns a new cataloger instance for GGUF model files.
+func NewGGUFCataloger() pkg.Cataloger {
+	return generic.NewCataloger("gguf-cataloger").
+		WithParserByGlobs(parseGGUFModel, "**/*.gguf")
+}
syft/pkg/cataloger/aiartifact/package.go (new file, 68 lines)

@@ -0,0 +1,68 @@
+package aiartifact
+
+import (
+	"fmt"
+
+	"github.com/anchore/packageurl-go"
+	"github.com/anchore/syft/syft/file"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location) pkg.Package {
+	p := pkg.Package{
+		Name:      metadata.ModelName,
+		Version:   metadata.ModelVersion,
+		PURL:      packageURL(metadata),
+		Locations: file.NewLocationSet(locations...),
+		Type:      pkg.ModelPkg,
+		Licenses:  pkg.NewLicenseSet(),
+		Metadata:  *metadata,
+	}
+
+	// Add license to the package if present in metadata
+	if metadata.License != "" {
+		p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil))
+	}
+
+	p.SetID()
+
+	return p
+}
+
+// packageURL returns the PURL for the specific GGUF model package (see https://github.com/package-url/purl-spec)
+func packageURL(metadata *pkg.GGUFFileMetadata) string {
+	var qualifiers packageurl.Qualifiers
+
+	// Add model-specific qualifiers
+	if metadata.Architecture != "" {
+		qualifiers = append(qualifiers, packageurl.Qualifier{
+			Key:   "arch",
+			Value: metadata.Architecture,
+		})
+	}
+
+	if metadata.Quantization != "" && metadata.Quantization != "unknown" {
+		qualifiers = append(qualifiers, packageurl.Qualifier{
+			Key:   "quantization",
+			Value: metadata.Quantization,
+		})
+	}
+
+	if metadata.Parameters > 0 {
+		qualifiers = append(qualifiers, packageurl.Qualifier{
+			Key:   "parameters",
+			Value: fmt.Sprintf("%d", metadata.Parameters),
+		})
+	}
+
+	// Use mlmodel as the type for machine learning models in GGUF format
+	// This follows the PURL spec guidance for ML models
+	return packageurl.NewPackageURL(
+		"mlmodel",
+		"gguf",
+		metadata.ModelName,
+		metadata.ModelVersion,
+		qualifiers,
+		"",
+	).ToString()
+}
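For reference, a sketch of the PURL this helper builds; the metadata values are hypothetical, and the qualifier order follows the append order above:

	func Example_packageURL() {
		md := &pkg.GGUFFileMetadata{
			ModelName:    "llama-3-8b-instruct",
			ModelVersion: "unknown",
			Architecture: "llama",
			Quantization: "Q4_K_M",
		}
		fmt.Println(packageURL(md))
		// Output: pkg:mlmodel/gguf/llama-3-8b-instruct@unknown?arch=llama&quantization=Q4_K_M
	}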
syft/pkg/cataloger/aiartifact/parse_gguf.go (new file, 344 lines)

@@ -0,0 +1,344 @@
+package aiartifact
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"io"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/anchore/syft/internal/log"
+	"github.com/anchore/syft/syft/pkg"
+)
+
+// GGUF file format constants
+const (
+	ggufMagic   = 0x46554747 // "GGUF" in little-endian
+	maxKVPairs  = 10000      // Safety limit for KV pairs
+	maxKeyLen   = 65535      // Maximum key length
+	maxTensors  = 100000     // Safety limit for tensors
+	maxHeaderKV = 200        // Maximum KV pairs to include in Header map (to avoid bloat)
+)
+
+// GGUF value types (from GGUF spec)
+const (
+	ggufTypeUint8   = 0
+	ggufTypeInt8    = 1
+	ggufTypeUint16  = 2
+	ggufTypeInt16   = 3
+	ggufTypeUint32  = 4
+	ggufTypeInt32   = 5
+	ggufTypeFloat32 = 6
+	ggufTypeUint64  = 7
+	ggufTypeInt64   = 8
+	ggufTypeFloat64 = 9
+	ggufTypeBool    = 10
+	ggufTypeString  = 11
+	ggufTypeArray   = 12
+)
+
+// parseGGUFHeader parses the header of a GGUF file from raw bytes and extracts metadata
+func parseGGUFHeader(data []byte, location string) (*pkg.GGUFFileMetadata, error) {
+	reader := bytes.NewReader(data)
+
+	// Read magic number
+	var magic uint32
+	if err := binary.Read(reader, binary.LittleEndian, &magic); err != nil {
+		return nil, fmt.Errorf("failed to read magic number: %w", err)
+	}
+
+	if magic != ggufMagic {
+		return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
+	}
+
+	// Read version
+	var version uint32
+	if err := binary.Read(reader, binary.LittleEndian, &version); err != nil {
+		return nil, fmt.Errorf("failed to read version: %w", err)
+	}
+
+	// Read tensor count
+	var tensorCount uint64
+	if err := binary.Read(reader, binary.LittleEndian, &tensorCount); err != nil {
+		return nil, fmt.Errorf("failed to read tensor count: %w", err)
+	}
+
+	if tensorCount > maxTensors {
+		log.Warnf("GGUF file has suspicious tensor count: %d (max: %d)", tensorCount, maxTensors)
+		tensorCount = maxTensors
+	}
+
+	// Read metadata KV count
+	var kvCount uint64
+	if err := binary.Read(reader, binary.LittleEndian, &kvCount); err != nil {
+		return nil, fmt.Errorf("failed to read KV count: %w", err)
+	}
+
+	if kvCount > maxKVPairs {
+		log.Warnf("GGUF file has suspicious KV count: %d (max: %d)", kvCount, maxKVPairs)
+		return nil, fmt.Errorf("KV count exceeds safety limit: %d", kvCount)
+	}
+
+	// Parse metadata key-value pairs
+	kvMap := make(map[string]interface{})
+	truncated := false
+
+	for i := uint64(0); i < kvCount; i++ {
+		key, value, err := readKVPair(reader)
+		if err != nil {
+			log.Warnf("failed to read KV pair %d: %v", i, err)
+			truncated = true
+			break
+		}
+		if len(kvMap) < maxHeaderKV {
+			kvMap[key] = value
+		} else {
+			truncated = true
+		}
+	}
+
+	// Extract common metadata fields
+	metadata := &pkg.GGUFFileMetadata{
+		ModelFormat:     "gguf",
+		GGUFVersion:     version,
+		TensorCount:     tensorCount,
+		Header:          kvMap,
+		TruncatedHeader: truncated,
+	}
+
+	// Extract known fields from KV map and remove them to avoid duplication in Header
+	if arch, ok := kvMap["general.architecture"].(string); ok {
+		metadata.Architecture = arch
+		delete(kvMap, "general.architecture")
+	}
+
+	if name, ok := kvMap["general.name"].(string); ok {
+		metadata.ModelName = name
+		delete(kvMap, "general.name")
+	} else {
+		// Fall back to filename if general.name not present
+		filename := filepath.Base(location)
+		metadata.ModelName = strings.TrimSuffix(filename, filepath.Ext(filename))
+	}
+
+	if license, ok := kvMap["general.license"].(string); ok {
+		metadata.License = license
+		delete(kvMap, "general.license")
+	}
+
+	if version, ok := kvMap["general.version"].(string); ok {
+		metadata.ModelVersion = version
+		delete(kvMap, "general.version")
+	} else {
+		metadata.ModelVersion = "unknown"
+	}
+
+	// Extract parameters count if present
+	if params, ok := kvMap["general.parameter_count"].(uint64); ok {
+		metadata.Parameters = params
+		delete(kvMap, "general.parameter_count")
+	}
+
+	// Try to infer quantization from general.quantization or from filename
+	if quant, ok := kvMap["general.quantization"].(string); ok {
+		metadata.Quantization = quant
+		delete(kvMap, "general.quantization")
+	} else if quantizedBy, ok := kvMap["general.quantized_by"].(string); ok && quantizedBy != "" {
+		// If quantized but no explicit quantization field, try to extract from filename
+		metadata.Quantization = inferQuantizationFromFilename(location)
+		// Note: we keep general.quantized_by in Header since it's not directly mapped to a field
+	} else {
+		metadata.Quantization = "unknown"
+	}
+
+	// Compute hash of metadata for stable identifier
+	metadata.Hash = computeMetadataHash(metadata)
+
+	return metadata, nil
+}
+
+// readKVPair reads a single key-value pair from the GGUF header
+func readKVPair(reader io.Reader) (string, interface{}, error) {
+	// Read key length
+	var keyLen uint64
+	if err := binary.Read(reader, binary.LittleEndian, &keyLen); err != nil {
+		return "", nil, fmt.Errorf("failed to read key length: %w", err)
+	}
+
+	if keyLen > maxKeyLen {
+		return "", nil, fmt.Errorf("key length exceeds maximum: %d", keyLen)
+	}
+
+	// Read key
+	keyBytes := make([]byte, keyLen)
+	if _, err := io.ReadFull(reader, keyBytes); err != nil {
+		return "", nil, fmt.Errorf("failed to read key: %w", err)
+	}
+	key := string(keyBytes)
+
+	// Read value type
+	var valueType uint32
+	if err := binary.Read(reader, binary.LittleEndian, &valueType); err != nil {
+		return "", nil, fmt.Errorf("failed to read value type: %w", err)
+	}
+
+	// Read value based on type
+	value, err := readValue(reader, valueType)
+	if err != nil {
+		return "", nil, fmt.Errorf("failed to read value for key %s: %w", key, err)
+	}
+
+	return key, value, nil
+}
+
+// readValue reads a value based on its type
+func readValue(reader io.Reader, valueType uint32) (interface{}, error) {
+	switch valueType {
+	case ggufTypeUint8:
+		var v uint8
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeInt8:
+		var v int8
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeUint16:
+		var v uint16
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeInt16:
+		var v int16
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeUint32:
+		var v uint32
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeInt32:
+		var v int32
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeFloat32:
+		var v float32
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeUint64:
+		var v uint64
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeInt64:
+		var v int64
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeFloat64:
+		var v float64
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v, err
+	case ggufTypeBool:
+		var v uint8
+		err := binary.Read(reader, binary.LittleEndian, &v)
+		return v != 0, err
+	case ggufTypeString:
+		return readString(reader)
+	case ggufTypeArray:
+		return readArray(reader)
+	default:
+		return nil, fmt.Errorf("unknown value type: %d", valueType)
+	}
+}
+
+// readString reads a length-prefixed UTF-8 string
+func readString(reader io.Reader) (string, error) {
+	var length uint64
+	if err := binary.Read(reader, binary.LittleEndian, &length); err != nil {
+		return "", fmt.Errorf("failed to read string length: %w", err)
+	}
+
+	if length > maxKeyLen {
+		return "", fmt.Errorf("string length exceeds maximum: %d", length)
+	}
+
+	strBytes := make([]byte, length)
+	if _, err := io.ReadFull(reader, strBytes); err != nil {
+		return "", fmt.Errorf("failed to read string: %w", err)
+	}
+
+	return string(strBytes), nil
+}
+
+// readArray reads an array value
+func readArray(reader io.Reader) (interface{}, error) {
+	// Read array element type
+	var elemType uint32
+	if err := binary.Read(reader, binary.LittleEndian, &elemType); err != nil {
+		return nil, fmt.Errorf("failed to read array element type: %w", err)
+	}
+
+	// Read array length
+	var length uint64
+	if err := binary.Read(reader, binary.LittleEndian, &length); err != nil {
+		return nil, fmt.Errorf("failed to read array length: %w", err)
+	}
+
+	if length > 1000 {
+		// Limit array size to avoid memory issues
+		return nil, fmt.Errorf("array length too large: %d", length)
+	}
+
+	// Read array elements
+	var elements []interface{}
+	for i := uint64(0); i < length; i++ {
+		value, err := readValue(reader, elemType)
+		if err != nil {
+			return nil, fmt.Errorf("failed to read array element %d: %w", i, err)
+		}
+		elements = append(elements, value)
+	}
+
+	return elements, nil
+}
+
+// inferQuantizationFromFilename attempts to extract quantization info from filename
+func inferQuantizationFromFilename(filename string) string {
+	// Common quantization patterns: Q4_K_M, IQ4_NL, Q5_K_S, etc.
+	quantPattern := regexp.MustCompile(`[IQ]\d+_[A-Z_]+`)
+	if match := quantPattern.FindString(filename); match != "" {
+		return match
+	}
+	return "unknown"
+}
+
+// computeMetadataHash computes a stable hash of the metadata for use as a global identifier
+func computeMetadataHash(metadata *pkg.GGUFFileMetadata) string {
+	// Create a stable representation of the metadata
+	hashData := struct {
+		Format       string
+		Name         string
+		Version      string
+		Architecture string
+		GGUFVersion  uint32
+		TensorCount  uint64
+	}{
+		Format:       metadata.ModelFormat,
+		Name:         metadata.ModelName,
+		Version:      metadata.ModelVersion,
+		Architecture: metadata.Architecture,
+		GGUFVersion:  metadata.GGUFVersion,
+		TensorCount:  metadata.TensorCount,
+	}
+
+	// Marshal to JSON for stable hashing
+	jsonBytes, err := json.Marshal(hashData)
+	if err != nil {
+		log.Warnf("failed to marshal metadata for hashing: %v", err)
+		return ""
+	}
+
+	// Compute SHA256 hash
+	hash := sha256.Sum256(jsonBytes)
+	return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars)
+}
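The binary layout the parser expects is easiest to see assembled by hand. Below is a test-style sketch (not part of the commit; the test name and fixture values are hypothetical) that builds the smallest well-formed header — magic, version, tensor count, KV count, then one string KV pair — and runs it through parseGGUFHeader:

	func TestParseMinimalGGUFHeader(t *testing.T) {
		var buf bytes.Buffer
		binary.Write(&buf, binary.LittleEndian, uint32(ggufMagic)) // "GGUF" magic
		binary.Write(&buf, binary.LittleEndian, uint32(3))         // GGUF version
		binary.Write(&buf, binary.LittleEndian, uint64(0))         // tensor count
		binary.Write(&buf, binary.LittleEndian, uint64(1))         // metadata KV count

		// single KV pair: general.name = "tiny"
		// (length-prefixed key, uint32 type tag, length-prefixed string value)
		key, val := "general.name", "tiny"
		binary.Write(&buf, binary.LittleEndian, uint64(len(key)))
		buf.WriteString(key)
		binary.Write(&buf, binary.LittleEndian, uint32(ggufTypeString))
		binary.Write(&buf, binary.LittleEndian, uint64(len(val)))
		buf.WriteString(val)

		md, err := parseGGUFHeader(buf.Bytes(), "models/tiny.gguf")
		if err != nil {
			t.Fatal(err)
		}
		if md.ModelName != "tiny" || md.GGUFVersion != 3 || md.ModelVersion != "unknown" {
			t.Fatalf("unexpected metadata: %+v", md)
		}
	}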
syft/pkg/cataloger/aiartifact/parse_gguf_model.go (new file, 68 lines)

@@ -0,0 +1,68 @@
+package aiartifact
+
+import (
+	"context"
+	"fmt"
+	"io"
+
+	"github.com/anchore/syft/internal"
+	"github.com/anchore/syft/internal/log"
+	"github.com/anchore/syft/internal/unknown"
+	"github.com/anchore/syft/syft/artifact"
+	"github.com/anchore/syft/syft/file"
+	"github.com/anchore/syft/syft/pkg"
+	"github.com/anchore/syft/syft/pkg/cataloger/generic"
+)
+
+// parseGGUFModel parses a GGUF model file and returns the discovered package.
+func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
+	defer internal.CloseAndLogError(reader, reader.Location.Path())
+
+	// Read header (we'll read a reasonable amount to parse the header without reading entire file)
+	// GGUF headers are typically < 1MB, but we'll use a 10MB limit to be safe
+	const maxHeaderSize = 10 * 1024 * 1024
+	limitedReader := io.LimitReader(reader, maxHeaderSize)
+
+	// We need to buffer the data because we need to check magic and parse
+	headerData := make([]byte, 0, 8192) // Start with 8KB buffer
+	buf := make([]byte, 8192)
+	for {
+		n, err := limitedReader.Read(buf)
+		if n > 0 {
+			headerData = append(headerData, buf[:n]...)
+		}
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return nil, nil, fmt.Errorf("error reading file: %w", err)
+		}
+		// Stop if we've read enough for a reasonable header
+		if len(headerData) > maxHeaderSize {
+			log.Warnf("GGUF header at %s exceeds max size, truncating", reader.Location.Path())
+			break
+		}
+	}
+
+	// Check if this is actually a GGUF file
+	if len(headerData) < 4 {
+		return nil, nil, fmt.Errorf("file too small to be a valid GGUF file")
+	}
+
+	// Parse the GGUF header
+	metadata, err := parseGGUFHeader(headerData, reader.Location.Path())
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to parse GGUF file: %w", err)
+	}
+
+	// Create package from metadata
+	p := newGGUFPackage(
+		metadata,
+		reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
+	)
+
+	return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
+}
+
+// integrity check
+var _ generic.Parser = parseGGUFModel
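A sketch of driving the parser directly, outside the generic cataloger — the fixture path is hypothetical, while file.NewLocation and file.NewLocationReadCloser are existing syft helpers; note that parseGGUFModel closes the reader itself via the defer above:

	func TestParseGGUFModelFromDisk(t *testing.T) {
		f, err := os.Open("testdata/tiny.gguf") // hypothetical fixture
		if err != nil {
			t.Fatal(err)
		}
		loc := file.NewLocation("testdata/tiny.gguf")
		pkgs, _, err := parseGGUFModel(context.Background(), nil, nil, file.NewLocationReadCloser(loc, f))
		if err != nil {
			t.Fatal(err)
		}
		if len(pkgs) != 1 || pkgs[0].Type != pkg.ModelPkg {
			t.Fatalf("unexpected packages: %+v", pkgs)
		}
	}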
syft/pkg/gguf.go (new file, 47 lines)

@@ -0,0 +1,47 @@
+package pkg
+
+// GGUFFileMetadata represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
+// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
+// loading and saving of models, particularly quantized large language models.
+type GGUFFileMetadata struct {
+	// ModelFormat is always "gguf"
+	ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"`
+
+	// ModelName is the name of the model (from general.name or filename)
+	ModelName string `json:"modelName" cyclonedx:"modelName"`
+
+	// ModelVersion is the version of the model (if available in header, else "unknown")
+	ModelVersion string `json:"modelVersion,omitempty" cyclonedx:"modelVersion"`
+
+	// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
+	FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
+
+	// Hash is a content hash of the metadata (for stable global identifiers across remotes)
+	Hash string `json:"hash,omitempty" cyclonedx:"hash"`
+
+	// License is the license identifier (from general.license if present)
+	License string `json:"license,omitempty" cyclonedx:"license"`
+
+	// GGUFVersion is the GGUF format version (e.g., 3)
+	GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
+
+	// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
+	Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
+
+	// Quantization is the quantization type (e.g., "IQ4_NL", "Q4_K_M")
+	Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`
+
+	// Parameters is the number of model parameters (if present in header)
+	Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`
+
+	// TensorCount is the number of tensors in the model
+	TensorCount uint64 `json:"tensorCount" cyclonedx:"tensorCount"`
+
+	// Header contains the remaining key-value pairs from the GGUF header that are not already
+	// represented as typed fields above. This preserves additional metadata fields for reference
+	// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
+	Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
+
+	// TruncatedHeader indicates if the header was truncated during parsing (for very large headers)
+	TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"`
+}
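The json tags above yield camelCase keys, with most fields dropped when empty via omitempty. A sketch of one serialized entry (values illustrative; encoding/json and fmt imports elided):

	func ExampleGGUFFileMetadata() {
		md := GGUFFileMetadata{ModelFormat: "gguf", ModelName: "tiny-llm", GGUFVersion: 3, TensorCount: 201}
		out, _ := json.Marshal(md)
		fmt.Println(string(out))
		// Output: {"modelFormat":"gguf","modelName":"tiny-llm","ggufVersion":3,"tensorCount":201}
	}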
@@ -54,6 +54,7 @@ const (
 	TerraformPkg       Type = "terraform"
 	WordpressPluginPkg Type = "wordpress-plugin"
 	HomebrewPkg        Type = "homebrew"
+	ModelPkg           Type = "model"
 )
 
 // AllPkgs represents all supported package types
@@ -98,6 +99,7 @@ var AllPkgs = []Type{
 	TerraformPkg,
 	WordpressPluginPkg,
 	HomebrewPkg,
+	ModelPkg,
 }
 
 // PackageURLType returns the PURL package type for the current package.
@@ -174,6 +176,8 @@ func (t Type) PackageURLType() string {
 		return "wordpress-plugin"
 	case HomebrewPkg:
 		return "homebrew"
+	case ModelPkg:
+		return "generic/model"
 	default:
 		// TODO: should this be a "generic" purl type instead?
 		return ""
@@ -262,6 +266,8 @@ func TypeByName(name string) Type {
 		return WordpressPluginPkg
 	case "homebrew":
 		return HomebrewPkg
+	case "model":
+		return ModelPkg
 	default:
 		return UnknownPkg
 	}
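A quick round-trip check of the new type, per the hunks above:

	fmt.Println(pkg.TypeByName("model") == pkg.ModelPkg) // true
	fmt.Println(pkg.ModelPkg.PackageURLType())           // generic/model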