feat: migrate gguf parser to separate PR from oci

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
Author: Christopher Phillips 2025-10-14 02:46:34 -04:00
Parent: 2e100f33f3
Commit: 6ceef5fe4a

11 changed files with 557 additions and 1 deletion

.gitignore (vendored): 2 changes

@@ -73,3 +73,5 @@ cosign.pub
__pycache__/
*.py[cod]
*$py.class

@@ -124,6 +124,7 @@ var jsonTypes = makeJSONTypes(
jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
jsonNames(pkg.GGUFFileMetadata{}, "gguf-file-metadata", "GGUFFileMetadata"),
)

func expandLegacyNameVariants(names ...string) []string {

@@ -37,6 +37,7 @@ import (
"github.com/anchore/syft/syft/pkg/cataloger/swipl"
"github.com/anchore/syft/syft/pkg/cataloger/terraform"
"github.com/anchore/syft/syft/pkg/cataloger/wordpress"
"github.com/anchore/syft/syft/pkg/cataloger/aiartifact"
)

const (

@@ -178,6 +179,7 @@ func DefaultPackageTaskFactories() Factories {
newSimplePackageTaskFactory(homebrew.NewCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, "homebrew"),
newSimplePackageTaskFactory(conda.NewCondaMetaCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.PackageTag, "conda"),
newSimplePackageTaskFactory(snap.NewCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, "snap"),
newSimplePackageTaskFactory(aiartifact.NewGGUFCataloger, pkgcataloging.DirectoryTag, pkgcataloging.ImageTag, "ai-artifact", "model", "gguf", "ml"),
// deprecated catalogers ////////////////////////////////////////
// these are catalogers that should not be selectable other than specific inclusion via name or "deprecated" tag (to remain backwards compatible)

@@ -42,6 +42,8 @@ func EncodeComponent(p pkg.Package, supplier string, locationSorter func(a, b fi
componentType := cyclonedx.ComponentTypeLibrary
if p.Type == pkg.BinaryPkg {
componentType = cyclonedx.ComponentTypeApplication
} else if p.Type == pkg.ModelPkg {
componentType = cyclonedx.ComponentTypeMachineLearningModel
}
return cyclonedx.Component{

@@ -62,7 +62,7 @@ func collectPackages(component *cyclonedx.Component, s *sbom.SBOM, idMap map[str
switch component.Type {
case cyclonedx.ComponentTypeOS:
case cyclonedx.ComponentTypeContainer:
case cyclonedx.ComponentTypeApplication, cyclonedx.ComponentTypeFramework, cyclonedx.ComponentTypeLibrary, cyclonedx.ComponentTypeMachineLearningModel:
p := decodeComponent(component)
idMap[component.BOMRef] = p
if component.BOMRef != "" {

@@ -0,0 +1,16 @@
/*
Package aiartifact provides concrete Cataloger implementations for AI artifacts and machine learning models,
including support for GGUF (GPT-Generated Unified Format) model files.
*/
package aiartifact
import (
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/generic"
)
// NewGGUFCataloger returns a new cataloger instance for GGUF model files.
func NewGGUFCataloger() pkg.Cataloger {
return generic.NewCataloger("gguf-cataloger").
WithParserByGlobs(parseGGUFModel, "**/*.gguf")
}
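For context, a standalone sketch of how that glob behaves: syft resolvers use doublestar-style matching, so "**/*.gguf" picks up .gguf files at any directory depth, including the root. This illustration assumes the github.com/bmatcuk/doublestar/v4 library to model the matching; the resolver's internals may differ.

package main

import (
	"fmt"

	"github.com/bmatcuk/doublestar/v4"
)

func main() {
	// "**" matches zero or more path segments, so root-level files match too
	for _, path := range []string{
		"model.gguf",                       // match
		"models/llama/llama-3-Q4_K_M.gguf", // match
		"models/README.md",                 // no match
	} {
		ok, _ := doublestar.Match("**/*.gguf", path)
		fmt.Println(path, ok)
	}
}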

@@ -0,0 +1,68 @@
package aiartifact
import (
"fmt"
"github.com/anchore/packageurl-go"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
)
func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location) pkg.Package {
p := pkg.Package{
Name: metadata.ModelName,
Version: metadata.ModelVersion,
PURL: packageURL(metadata),
Locations: file.NewLocationSet(locations...),
Type: pkg.ModelPkg,
Licenses: pkg.NewLicenseSet(),
Metadata: *metadata,
}
// Add license to the package if present in metadata
if metadata.License != "" {
p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil))
}
p.SetID()
return p
}
// packageURL returns the PURL for the specific GGUF model package (see https://github.com/package-url/purl-spec)
func packageURL(metadata *pkg.GGUFFileMetadata) string {
var qualifiers packageurl.Qualifiers
// Add model-specific qualifiers
if metadata.Architecture != "" {
qualifiers = append(qualifiers, packageurl.Qualifier{
Key: "arch",
Value: metadata.Architecture,
})
}
if metadata.Quantization != "" && metadata.Quantization != "unknown" {
qualifiers = append(qualifiers, packageurl.Qualifier{
Key: "quantization",
Value: metadata.Quantization,
})
}
if metadata.Parameters > 0 {
qualifiers = append(qualifiers, packageurl.Qualifier{
Key: "parameters",
Value: fmt.Sprintf("%d", metadata.Parameters),
})
}
// Use mlmodel as the type for machine learning models in GGUF format
// This follows the PURL spec guidance for ML models
return packageurl.NewPackageURL(
"mlmodel",
"gguf",
metadata.ModelName,
metadata.ModelVersion,
qualifiers,
"",
).ToString()
}
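To make the output concrete, here is a minimal sketch mirroring the qualifier logic above with illustrative values (the model name, version, and parameter count are hypothetical, not from a real file):

package main

import (
	"fmt"

	"github.com/anchore/packageurl-go"
)

func main() {
	qualifiers := packageurl.Qualifiers{
		{Key: "arch", Value: "llama"},
		{Key: "parameters", Value: "8030261248"},
		{Key: "quantization", Value: "Q4_K_M"},
	}
	purl := packageurl.NewPackageURL("mlmodel", "gguf", "llama-3-8b-instruct", "1.0", qualifiers, "")
	fmt.Println(purl.ToString())
	// pkg:mlmodel/gguf/llama-3-8b-instruct@1.0?arch=llama&parameters=8030261248&quantization=Q4_K_M
	// (exact qualifier encoding may vary slightly by library version)
}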

@@ -0,0 +1,344 @@
package aiartifact
import (
"bytes"
"crypto/sha256"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"path/filepath"
"regexp"
"strings"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/pkg"
)
// GGUF file format constants
const (
ggufMagic = 0x46554747 // "GGUF" in little-endian
maxKVPairs = 10000 // Safety limit for KV pairs
maxKeyLen = 65535 // Maximum key length
maxTensors = 100000 // Safety limit for tensors
maxHeaderKV = 200 // Maximum KV pairs to include in Header map (to avoid bloat)
)
// GGUF value types (from GGUF spec)
const (
ggufTypeUint8 = 0
ggufTypeInt8 = 1
ggufTypeUint16 = 2
ggufTypeInt16 = 3
ggufTypeUint32 = 4
ggufTypeInt32 = 5
ggufTypeFloat32 = 6
ggufTypeUint64 = 7
ggufTypeInt64 = 8
ggufTypeFloat64 = 9
ggufTypeBool = 10
ggufTypeString = 11
ggufTypeArray = 12
)
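The layout these constants describe is: a 4-byte magic, a 4-byte version, an 8-byte tensor count, an 8-byte KV count, then kvCount pairs of (length-prefixed key, 4-byte type tag, typed value). A self-contained sketch that assembles a minimal header of this shape, useful as a mental model or test fixture; all values are illustrative:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

func main() {
	buf := &bytes.Buffer{}
	binary.Write(buf, binary.LittleEndian, uint32(0x46554747)) // magic "GGUF"
	binary.Write(buf, binary.LittleEndian, uint32(3))          // GGUF version
	binary.Write(buf, binary.LittleEndian, uint64(0))          // tensor count
	binary.Write(buf, binary.LittleEndian, uint64(1))          // KV count

	// one KV pair: general.architecture = "llama" (type 11 = string)
	key, val := "general.architecture", "llama"
	binary.Write(buf, binary.LittleEndian, uint64(len(key)))
	buf.WriteString(key)
	binary.Write(buf, binary.LittleEndian, uint32(11))
	binary.Write(buf, binary.LittleEndian, uint64(len(val)))
	buf.WriteString(val)

	fmt.Printf("%d bytes, starts: % x\n", buf.Len(), buf.Bytes()[:8])
}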
// parseGGUFHeader parses the header of a GGUF file from raw bytes and extracts metadata
func parseGGUFHeader(data []byte, location string) (*pkg.GGUFFileMetadata, error) {
reader := bytes.NewReader(data)
// Read magic number
var magic uint32
if err := binary.Read(reader, binary.LittleEndian, &magic); err != nil {
return nil, fmt.Errorf("failed to read magic number: %w", err)
}
if magic != ggufMagic {
return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
}
// Read version
var version uint32
if err := binary.Read(reader, binary.LittleEndian, &version); err != nil {
return nil, fmt.Errorf("failed to read version: %w", err)
}
// Read tensor count
var tensorCount uint64
if err := binary.Read(reader, binary.LittleEndian, &tensorCount); err != nil {
return nil, fmt.Errorf("failed to read tensor count: %w", err)
}
if tensorCount > maxTensors {
log.Warnf("GGUF file has suspicious tensor count: %d (max: %d)", tensorCount, maxTensors)
tensorCount = maxTensors
}
// Read metadata KV count
var kvCount uint64
if err := binary.Read(reader, binary.LittleEndian, &kvCount); err != nil {
return nil, fmt.Errorf("failed to read KV count: %w", err)
}
if kvCount > maxKVPairs {
log.Warnf("GGUF file has suspicious KV count: %d (max: %d)", kvCount, maxKVPairs)
return nil, fmt.Errorf("KV count exceeds safety limit: %d", kvCount)
}
// Parse metadata key-value pairs
kvMap := make(map[string]interface{})
truncated := false
for i := uint64(0); i < kvCount; i++ {
key, value, err := readKVPair(reader)
if err != nil {
log.Warnf("failed to read KV pair %d: %v", i, err)
truncated = true
break
}
if len(kvMap) < maxHeaderKV {
kvMap[key] = value
} else {
truncated = true
}
}
// Extract common metadata fields
metadata := &pkg.GGUFFileMetadata{
ModelFormat: "gguf",
GGUFVersion: version,
TensorCount: tensorCount,
Header: kvMap,
TruncatedHeader: truncated,
}
// Extract known fields from KV map and remove them to avoid duplication in Header
if arch, ok := kvMap["general.architecture"].(string); ok {
metadata.Architecture = arch
delete(kvMap, "general.architecture")
}
if name, ok := kvMap["general.name"].(string); ok {
metadata.ModelName = name
delete(kvMap, "general.name")
} else {
// Fall back to filename if general.name not present
filename := filepath.Base(location)
metadata.ModelName = strings.TrimSuffix(filename, filepath.Ext(filename))
}
if license, ok := kvMap["general.license"].(string); ok {
metadata.License = license
delete(kvMap, "general.license")
}
if version, ok := kvMap["general.version"].(string); ok {
metadata.ModelVersion = version
delete(kvMap, "general.version")
} else {
metadata.ModelVersion = "unknown"
}
// Extract parameters count if present
if params, ok := kvMap["general.parameter_count"].(uint64); ok {
metadata.Parameters = params
delete(kvMap, "general.parameter_count")
}
// Try to infer quantization from general.quantization or from filename
if quant, ok := kvMap["general.quantization"].(string); ok {
metadata.Quantization = quant
delete(kvMap, "general.quantization")
} else if quantizedBy, ok := kvMap["general.quantized_by"].(string); ok && quantizedBy != "" {
// If quantized but no explicit quantization field, try to extract from filename
metadata.Quantization = inferQuantizationFromFilename(location)
// Note: we keep general.quantized_by in Header since it's not directly mapped to a field
} else {
metadata.Quantization = "unknown"
}
// Compute hash of metadata for stable identifier
metadata.Hash = computeMetadataHash(metadata)
return metadata, nil
}
// readKVPair reads a single key-value pair from the GGUF header
func readKVPair(reader io.Reader) (string, interface{}, error) {
// Read key length
var keyLen uint64
if err := binary.Read(reader, binary.LittleEndian, &keyLen); err != nil {
return "", nil, fmt.Errorf("failed to read key length: %w", err)
}
if keyLen > maxKeyLen {
return "", nil, fmt.Errorf("key length exceeds maximum: %d", keyLen)
}
// Read key
keyBytes := make([]byte, keyLen)
if _, err := io.ReadFull(reader, keyBytes); err != nil {
return "", nil, fmt.Errorf("failed to read key: %w", err)
}
key := string(keyBytes)
// Read value type
var valueType uint32
if err := binary.Read(reader, binary.LittleEndian, &valueType); err != nil {
return "", nil, fmt.Errorf("failed to read value type: %w", err)
}
// Read value based on type
value, err := readValue(reader, valueType)
if err != nil {
return "", nil, fmt.Errorf("failed to read value for key %s: %w", key, err)
}
return key, value, nil
}
// readValue reads a value based on its type
func readValue(reader io.Reader, valueType uint32) (interface{}, error) {
switch valueType {
case ggufTypeUint8:
var v uint8
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeInt8:
var v int8
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeUint16:
var v uint16
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeInt16:
var v int16
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeUint32:
var v uint32
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeInt32:
var v int32
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeFloat32:
var v float32
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeUint64:
var v uint64
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeInt64:
var v int64
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeFloat64:
var v float64
err := binary.Read(reader, binary.LittleEndian, &v)
return v, err
case ggufTypeBool:
var v uint8
err := binary.Read(reader, binary.LittleEndian, &v)
return v != 0, err
case ggufTypeString:
return readString(reader)
case ggufTypeArray:
return readArray(reader)
default:
return nil, fmt.Errorf("unknown value type: %d", valueType)
}
}
// readString reads a length-prefixed UTF-8 string
func readString(reader io.Reader) (string, error) {
var length uint64
if err := binary.Read(reader, binary.LittleEndian, &length); err != nil {
return "", fmt.Errorf("failed to read string length: %w", err)
}
if length > maxKeyLen {
return "", fmt.Errorf("string length exceeds maximum: %d", length)
}
strBytes := make([]byte, length)
if _, err := io.ReadFull(reader, strBytes); err != nil {
return "", fmt.Errorf("failed to read string: %w", err)
}
return string(strBytes), nil
}
// readArray reads an array value
func readArray(reader io.Reader) (interface{}, error) {
// Read array element type
var elemType uint32
if err := binary.Read(reader, binary.LittleEndian, &elemType); err != nil {
return nil, fmt.Errorf("failed to read array element type: %w", err)
}
// Read array length
var length uint64
if err := binary.Read(reader, binary.LittleEndian, &length); err != nil {
return nil, fmt.Errorf("failed to read array length: %w", err)
}
if length > 1000 {
// Limit array size to avoid memory issues
return nil, fmt.Errorf("array length too large: %d", length)
}
// Read array elements
var elements []interface{}
for i := uint64(0); i < length; i++ {
value, err := readValue(reader, elemType)
if err != nil {
return nil, fmt.Errorf("failed to read array element %d: %w", i, err)
}
elements = append(elements, value)
}
return elements, nil
}
// inferQuantizationFromFilename attempts to extract quantization info from filename
func inferQuantizationFromFilename(filename string) string {
// Common quantization patterns: Q4_0, Q4_K_M, Q5_K_S, IQ4_NL, etc.
// The leading "I" is optional (IQ-series quants) and digits may follow the
// underscore (e.g. Q8_0).
quantPattern := regexp.MustCompile(`I?Q\d+_[A-Z0-9_]+`)
if match := quantPattern.FindString(filename); match != "" {
return match
}
return "unknown"
}
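A quick standalone check of this pattern against common filename shapes (the filenames are made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	quantPattern := regexp.MustCompile(`I?Q\d+_[A-Z0-9_]+`)
	for _, name := range []string{
		"llama-3-8b-instruct-Q4_K_M.gguf", // -> "Q4_K_M"
		"mistral-7b-IQ4_NL.gguf",          // -> "IQ4_NL"
		"qwen2-7b-Q8_0.gguf",              // -> "Q8_0"
		"plain-model.gguf",                // -> "" (caller falls back to "unknown")
	} {
		fmt.Printf("%-35s %q\n", name, quantPattern.FindString(name))
	}
}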
// computeMetadataHash computes a stable hash of the metadata for use as a global identifier
func computeMetadataHash(metadata *pkg.GGUFFileMetadata) string {
// Create a stable representation of the metadata
hashData := struct {
Format string
Name string
Version string
Architecture string
GGUFVersion uint32
TensorCount uint64
}{
Format: metadata.ModelFormat,
Name: metadata.ModelName,
Version: metadata.ModelVersion,
Architecture: metadata.Architecture,
GGUFVersion: metadata.GGUFVersion,
TensorCount: metadata.TensorCount,
}
// Marshal to JSON for stable hashing
jsonBytes, err := json.Marshal(hashData)
if err != nil {
log.Warnf("failed to marshal metadata for hashing: %v", err)
return ""
}
// Compute SHA256 hash
hash := sha256.Sum256(jsonBytes)
return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars)
}
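A standalone sketch of the same scheme with illustrative values; JSON marshaling of a struct emits fields in declaration order, so the digest is stable across runs and machines:

package main

import (
	"crypto/sha256"
	"encoding/json"
	"fmt"
)

func main() {
	hashData := struct {
		Format       string
		Name         string
		Version      string
		Architecture string
		GGUFVersion  uint32
		TensorCount  uint64
	}{"gguf", "llama-3-8b-instruct", "unknown", "llama", 3, 291}

	jsonBytes, _ := json.Marshal(hashData) // marshaling a plain struct cannot fail here
	sum := sha256.Sum256(jsonBytes)
	fmt.Printf("%x\n", sum[:8]) // first 8 bytes = 16 hex chars
}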

@@ -0,0 +1,68 @@
package aiartifact
import (
"context"
"fmt"
"io"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/generic"
)
// parseGGUFModel parses a GGUF model file and returns the discovered package.
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
defer internal.CloseAndLogError(reader, reader.Location.Path())
// Read only the header into memory; model files can be many gigabytes, so
// avoid reading the entire file. GGUF headers are typically < 1MB, but use a
// 10MB cap to be safe.
const maxHeaderSize = 10 * 1024 * 1024
limitedReader := io.LimitReader(reader, maxHeaderSize)
// We need to buffer the data because we need to check magic and parse
headerData := make([]byte, 0, 8192) // Start with 8KB buffer
buf := make([]byte, 8192)
for {
n, err := limitedReader.Read(buf)
if n > 0 {
headerData = append(headerData, buf[:n]...)
}
if err == io.EOF {
break
}
if err != nil {
return nil, nil, fmt.Errorf("error reading file: %w", err)
}
// Stop once we've hit the cap; LimitReader returns at most maxHeaderSize
// bytes, so the comparison must be >= to ever fire
if len(headerData) >= maxHeaderSize {
log.Warnf("GGUF header at %s exceeds max size, truncating", reader.Location.Path())
break
}
}
// Check if this is actually a GGUF file
if len(headerData) < 4 {
return nil, nil, fmt.Errorf("file too small to be a valid GGUF file")
}
// Parse the GGUF header
metadata, err := parseGGUFHeader(headerData, reader.Location.Path())
if err != nil {
return nil, nil, fmt.Errorf("failed to parse GGUF file: %w", err)
}
// Create package from metadata
p := newGGUFPackage(
metadata,
reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
)
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
}
// integrity check
var _ generic.Parser = parseGGUFModel
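A sketch of how this parser could be exercised in a unit test, assuming a hypothetical buildMinimalGGUF helper that returns bytes shaped like the header-layout example earlier (this test is not part of the commit):

package aiartifact

import (
	"bytes"
	"context"
	"io"
	"testing"

	"github.com/anchore/syft/syft/file"
)

func TestParseGGUFModel_minimalHeader(t *testing.T) {
	data := buildMinimalGGUF() // hypothetical fixture builder
	lrc := file.NewLocationReadCloser(
		file.NewLocation("models/llama.gguf"),
		io.NopCloser(bytes.NewReader(data)),
	)
	pkgs, _, err := parseGGUFModel(context.Background(), nil, nil, lrc)
	if err != nil {
		t.Fatalf("parse failed: %v", err)
	}
	if len(pkgs) != 1 {
		t.Fatalf("expected 1 package, got %d", len(pkgs))
	}
}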

syft/pkg/gguf.go (new file): 47 lines

@@ -0,0 +1,47 @@
package pkg
// GGUFFileMetadata represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
// loading and saving of models, particularly quantized large language models.
type GGUFFileMetadata struct {
// ModelFormat is always "gguf"
ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"`
// ModelName is the name of the model (from general.name or filename)
ModelName string `json:"modelName" cyclonedx:"modelName"`
// ModelVersion is the version of the model (if available in header, else "unknown")
ModelVersion string `json:"modelVersion,omitempty" cyclonedx:"modelVersion"`
// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver)
FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`
// Hash is a content hash of the metadata (for stable global identifiers across remotes)
Hash string `json:"hash,omitempty" cyclonedx:"hash"`
// License is the license identifier (from general.license if present)
License string `json:"license,omitempty" cyclonedx:"license"`
// GGUFVersion is the GGUF format version (e.g., 3)
GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`
// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`
// Quantization is the quantization type (e.g., "IQ4_NL", "Q4_K_M")
Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`
// Parameters is the number of model parameters (if present in header)
Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`
// TensorCount is the number of tensors in the model
TensorCount uint64 `json:"tensorCount" cyclonedx:"tensorCount"`
// Header contains the remaining key-value pairs from the GGUF header that are not already
// represented as typed fields above. This preserves additional metadata fields for reference
// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`
// TruncatedHeader indicates if the header was truncated during parsing (for very large headers)
TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"`
}
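For reference, a hypothetical serialization of this struct for an 8B llama model; every value below is illustrative, and the hash is a placeholder in the short form produced by computeMetadataHash above:

{
  "modelFormat": "gguf",
  "modelName": "llama-3-8b-instruct",
  "modelVersion": "unknown",
  "hash": "3f2a9c1d8e4b7a60",
  "license": "apache-2.0",
  "ggufVersion": 3,
  "architecture": "llama",
  "quantization": "Q4_K_M",
  "parameters": 8030261248,
  "tensorCount": 291,
  "header": {
    "general.quantized_by": "example-org"
  }
}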

@@ -54,6 +54,7 @@ const (
TerraformPkg Type = "terraform"
WordpressPluginPkg Type = "wordpress-plugin"
HomebrewPkg Type = "homebrew"
ModelPkg Type = "model"
)

// AllPkgs represents all supported package types
@@ -98,6 +99,7 @@ var AllPkgs = []Type{
TerraformPkg,
WordpressPluginPkg,
HomebrewPkg,
ModelPkg,
}

// PackageURLType returns the PURL package type for the current package.
@@ -174,6 +176,8 @@ func (t Type) PackageURLType() string {
return "wordpress-plugin"
case HomebrewPkg:
return "homebrew"
case ModelPkg:
return "generic/model"
default:
// TODO: should this be a "generic" purl type instead?
return ""

@@ -262,6 +266,8 @@ func TypeByName(name string) Type {
return WordpressPluginPkg
case "homebrew":
return HomebrewPkg
case "model":
return ModelPkg
default:
return UnknownPkg
}