mirror of
https://github.com/anchore/syft.git
synced 2025-11-17 16:33:21 +01:00
feat: migrate gguf parser to separate PR from oci
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
760bd9a50a
commit
3f117a3eb5
2
.gitignore
vendored
2
.gitignore
vendored
@ -70,3 +70,5 @@ cosign.pub
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
|
||||
|
||||
@ -122,6 +122,7 @@ var jsonTypes = makeJSONTypes(
|
||||
jsonNames(pkg.TerraformLockProviderEntry{}, "terraform-lock-provider-entry"),
|
||||
jsonNames(pkg.DotnetPackagesLockEntry{}, "dotnet-packages-lock-entry"),
|
||||
jsonNames(pkg.CondaMetaPackage{}, "conda-metadata-entry", "CondaPackageMetadata"),
|
||||
jsonNames(pkg.GGUFFileMetadata{}, "gguf-file-metadata", "GGUFFileMetadata"),
|
||||
)
|
||||
|
||||
func expandLegacyNameVariants(names ...string) []string {
|
||||
|
||||
@ -37,6 +37,7 @@ import (
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/swipl"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/terraform"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/wordpress"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/aiartifact"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -175,6 +176,7 @@ func DefaultPackageTaskFactories() Factories {
|
||||
newSimplePackageTaskFactory(homebrew.NewCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, "homebrew"),
|
||||
newSimplePackageTaskFactory(conda.NewCondaMetaCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.PackageTag, "conda"),
|
||||
newSimplePackageTaskFactory(snap.NewCataloger, pkgcataloging.DirectoryTag, pkgcataloging.InstalledTag, pkgcataloging.ImageTag, "snap"),
|
||||
newSimplePackageTaskFactory(aiartifact.NewGGUFCataloger, pkgcataloging.DirectoryTag, pkgcataloging.ImageTag, "ai-artifact", "model", "gguf", "ml"),
|
||||
|
||||
// deprecated catalogers ////////////////////////////////////////
|
||||
// these are catalogers that should not be selectable other than specific inclusion via name or "deprecated" tag (to remain backwards compatible)
|
||||
|
||||
@ -42,6 +42,8 @@ func EncodeComponent(p pkg.Package, supplier string, locationSorter func(a, b fi
|
||||
componentType := cyclonedx.ComponentTypeLibrary
|
||||
if p.Type == pkg.BinaryPkg {
|
||||
componentType = cyclonedx.ComponentTypeApplication
|
||||
} else if p.Type == pkg.ModelPkg {
|
||||
componentType = cyclonedx.ComponentTypeMachineLearningModel
|
||||
}
|
||||
|
||||
return cyclonedx.Component{
|
||||
|
||||
@ -62,7 +62,7 @@ func collectPackages(component *cyclonedx.Component, s *sbom.SBOM, idMap map[str
|
||||
switch component.Type {
|
||||
case cyclonedx.ComponentTypeOS:
|
||||
case cyclonedx.ComponentTypeContainer:
|
||||
case cyclonedx.ComponentTypeApplication, cyclonedx.ComponentTypeFramework, cyclonedx.ComponentTypeLibrary:
|
||||
case cyclonedx.ComponentTypeApplication, cyclonedx.ComponentTypeFramework, cyclonedx.ComponentTypeLibrary, cyclonedx.ComponentTypeMachineLearningModel:
|
||||
p := decodeComponent(component)
|
||||
idMap[component.BOMRef] = p
|
||||
if component.BOMRef != "" {
|
||||
|
||||
16
syft/pkg/cataloger/aiartifact/cataloger.go
Normal file
16
syft/pkg/cataloger/aiartifact/cataloger.go
Normal file
@ -0,0 +1,16 @@
|
||||
/*
|
||||
Package aiartifact provides concrete Cataloger implementations for AI artifacts and machine learning models,
|
||||
including support for GGUF (GPT-Generated Unified Format) model files.
|
||||
*/
|
||||
package aiartifact
|
||||
|
||||
import (
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/generic"
|
||||
)
|
||||
|
||||
// NewGGUFCataloger returns a new cataloger instance for GGUF model files.
|
||||
func NewGGUFCataloger() pkg.Cataloger {
|
||||
return generic.NewCataloger("gguf-cataloger").
|
||||
WithParserByGlobs(parseGGUFModel, "**/*.gguf")
|
||||
}
|
||||
68
syft/pkg/cataloger/aiartifact/package.go
Normal file
68
syft/pkg/cataloger/aiartifact/package.go
Normal file
@ -0,0 +1,68 @@
|
||||
package aiartifact
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/anchore/packageurl-go"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
)
|
||||
|
||||
func newGGUFPackage(metadata *pkg.GGUFFileMetadata, locations ...file.Location) pkg.Package {
|
||||
p := pkg.Package{
|
||||
Name: metadata.ModelName,
|
||||
Version: metadata.ModelVersion,
|
||||
PURL: packageURL(metadata),
|
||||
Locations: file.NewLocationSet(locations...),
|
||||
Type: pkg.ModelPkg,
|
||||
Licenses: pkg.NewLicenseSet(),
|
||||
Metadata: *metadata,
|
||||
}
|
||||
|
||||
// Add license to the package if present in metadata
|
||||
if metadata.License != "" {
|
||||
p.Licenses.Add(pkg.NewLicenseFromFields(metadata.License, "", nil))
|
||||
}
|
||||
|
||||
p.SetID()
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
// packageURL returns the PURL for the specific GGUF model package (see https://github.com/package-url/purl-spec)
|
||||
func packageURL(metadata *pkg.GGUFFileMetadata) string {
|
||||
var qualifiers packageurl.Qualifiers
|
||||
|
||||
// Add model-specific qualifiers
|
||||
if metadata.Architecture != "" {
|
||||
qualifiers = append(qualifiers, packageurl.Qualifier{
|
||||
Key: "arch",
|
||||
Value: metadata.Architecture,
|
||||
})
|
||||
}
|
||||
|
||||
if metadata.Quantization != "" && metadata.Quantization != "unknown" {
|
||||
qualifiers = append(qualifiers, packageurl.Qualifier{
|
||||
Key: "quantization",
|
||||
Value: metadata.Quantization,
|
||||
})
|
||||
}
|
||||
|
||||
if metadata.Parameters > 0 {
|
||||
qualifiers = append(qualifiers, packageurl.Qualifier{
|
||||
Key: "parameters",
|
||||
Value: fmt.Sprintf("%d", metadata.Parameters),
|
||||
})
|
||||
}
|
||||
|
||||
// Use mlmodel as the type for machine learning models in GGUF format
|
||||
// This follows the PURL spec guidance for ML models
|
||||
return packageurl.NewPackageURL(
|
||||
"mlmodel",
|
||||
"gguf",
|
||||
metadata.ModelName,
|
||||
metadata.ModelVersion,
|
||||
qualifiers,
|
||||
"",
|
||||
).ToString()
|
||||
}
|
||||
344
syft/pkg/cataloger/aiartifact/parse_gguf.go
Normal file
344
syft/pkg/cataloger/aiartifact/parse_gguf.go
Normal file
@ -0,0 +1,344 @@
|
||||
package aiartifact
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
)
|
||||
|
||||
// GGUF file format constants
|
||||
const (
|
||||
ggufMagic = 0x46554747 // "GGUF" in little-endian
|
||||
maxKVPairs = 10000 // Safety limit for KV pairs
|
||||
maxKeyLen = 65535 // Maximum key length
|
||||
maxTensors = 100000 // Safety limit for tensors
|
||||
maxHeaderKV = 200 // Maximum KV pairs to include in Header map (to avoid bloat)
|
||||
)
|
||||
|
||||
// GGUF value types (from GGUF spec)
|
||||
const (
|
||||
ggufTypeUint8 = 0
|
||||
ggufTypeInt8 = 1
|
||||
ggufTypeUint16 = 2
|
||||
ggufTypeInt16 = 3
|
||||
ggufTypeUint32 = 4
|
||||
ggufTypeInt32 = 5
|
||||
ggufTypeFloat32 = 6
|
||||
ggufTypeUint64 = 7
|
||||
ggufTypeInt64 = 8
|
||||
ggufTypeFloat64 = 9
|
||||
ggufTypeBool = 10
|
||||
ggufTypeString = 11
|
||||
ggufTypeArray = 12
|
||||
)
|
||||
|
||||
// parseGGUFHeader parses the header of a GGUF file from raw bytes and extracts metadata.
//
// data must start with the GGUF magic; location is the file path, used only as a
// fallback source for the model name and quantization. Returns an error if the
// magic is wrong, a fixed-size header field cannot be read, or the declared KV
// count exceeds the safety limit. A failure while reading an individual KV pair
// does not abort parsing: the loop stops and TruncatedHeader is set instead.
func parseGGUFHeader(data []byte, location string) (*pkg.GGUFFileMetadata, error) {
	reader := bytes.NewReader(data)
	// Read magic number (GGUF is little-endian throughout)
	var magic uint32
	if err := binary.Read(reader, binary.LittleEndian, &magic); err != nil {
		return nil, fmt.Errorf("failed to read magic number: %w", err)
	}

	if magic != ggufMagic {
		return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
	}

	// Read version
	var version uint32
	if err := binary.Read(reader, binary.LittleEndian, &version); err != nil {
		return nil, fmt.Errorf("failed to read version: %w", err)
	}

	// Read tensor count
	var tensorCount uint64
	if err := binary.Read(reader, binary.LittleEndian, &tensorCount); err != nil {
		return nil, fmt.Errorf("failed to read tensor count: %w", err)
	}

	// Suspicious tensor counts are clamped (not fatal) so the reported value
	// stays bounded; note the raw value is discarded.
	if tensorCount > maxTensors {
		log.Warnf("GGUF file has suspicious tensor count: %d (max: %d)", tensorCount, maxTensors)
		tensorCount = maxTensors
	}

	// Read metadata KV count
	var kvCount uint64
	if err := binary.Read(reader, binary.LittleEndian, &kvCount); err != nil {
		return nil, fmt.Errorf("failed to read KV count: %w", err)
	}

	// Unlike the tensor count, an oversized KV count is fatal: the KV section
	// would have to be walked entry by entry to skip it.
	if kvCount > maxKVPairs {
		log.Warnf("GGUF file has suspicious KV count: %d (max: %d)", kvCount, maxKVPairs)
		return nil, fmt.Errorf("KV count exceeds safety limit: %d", kvCount)
	}

	// Parse metadata key-value pairs. At most maxHeaderKV entries are kept;
	// extra pairs are still consumed from the stream but dropped, and the
	// result is marked truncated.
	kvMap := make(map[string]interface{})
	truncated := false

	for i := uint64(0); i < kvCount; i++ {
		key, value, err := readKVPair(reader)
		if err != nil {
			// Best-effort: keep whatever was parsed so far.
			log.Warnf("failed to read KV pair %d: %v", i, err)
			truncated = true
			break
		}
		if len(kvMap) < maxHeaderKV {
			kvMap[key] = value
		} else {
			truncated = true
		}
	}

	// Extract common metadata fields. NOTE: metadata.Header aliases kvMap, so
	// the delete() calls below intentionally mutate the Header map too.
	metadata := &pkg.GGUFFileMetadata{
		ModelFormat:     "gguf",
		GGUFVersion:     version,
		TensorCount:     tensorCount,
		Header:          kvMap,
		TruncatedHeader: truncated,
	}

	// Extract known fields from KV map and remove them to avoid duplication in Header
	if arch, ok := kvMap["general.architecture"].(string); ok {
		metadata.Architecture = arch
		delete(kvMap, "general.architecture")
	}

	if name, ok := kvMap["general.name"].(string); ok {
		metadata.ModelName = name
		delete(kvMap, "general.name")
	} else {
		// Fall back to filename (without extension) if general.name not present
		filename := filepath.Base(location)
		metadata.ModelName = strings.TrimSuffix(filename, filepath.Ext(filename))
	}

	if license, ok := kvMap["general.license"].(string); ok {
		metadata.License = license
		delete(kvMap, "general.license")
	}

	// This `version` deliberately shadows the uint32 GGUF format version above;
	// here it is the model's own version string.
	if version, ok := kvMap["general.version"].(string); ok {
		metadata.ModelVersion = version
		delete(kvMap, "general.version")
	} else {
		metadata.ModelVersion = "unknown"
	}

	// Extract parameters count if present. NOTE(review): this type assertion
	// only succeeds when the header stored the count as a uint64 — other
	// numeric encodings (e.g. uint32) are silently left in Header; confirm
	// whether that is intended.
	if params, ok := kvMap["general.parameter_count"].(uint64); ok {
		metadata.Parameters = params
		delete(kvMap, "general.parameter_count")
	}

	// Try to infer quantization from general.quantization or from filename
	if quant, ok := kvMap["general.quantization"].(string); ok {
		metadata.Quantization = quant
		delete(kvMap, "general.quantization")
	} else if quantizedBy, ok := kvMap["general.quantized_by"].(string); ok && quantizedBy != "" {
		// If quantized but no explicit quantization field, try to extract from filename
		metadata.Quantization = inferQuantizationFromFilename(location)
		// Note: we keep general.quantized_by in Header since it's not directly mapped to a field
	} else {
		metadata.Quantization = "unknown"
	}

	// Compute hash of metadata for stable identifier (derived from the typed
	// fields only, not the Header map).
	metadata.Hash = computeMetadataHash(metadata)

	return metadata, nil
}
|
||||
|
||||
// readKVPair reads a single key-value pair from the GGUF header
|
||||
func readKVPair(reader io.Reader) (string, interface{}, error) {
|
||||
// Read key length
|
||||
var keyLen uint64
|
||||
if err := binary.Read(reader, binary.LittleEndian, &keyLen); err != nil {
|
||||
return "", nil, fmt.Errorf("failed to read key length: %w", err)
|
||||
}
|
||||
|
||||
if keyLen > maxKeyLen {
|
||||
return "", nil, fmt.Errorf("key length exceeds maximum: %d", keyLen)
|
||||
}
|
||||
|
||||
// Read key
|
||||
keyBytes := make([]byte, keyLen)
|
||||
if _, err := io.ReadFull(reader, keyBytes); err != nil {
|
||||
return "", nil, fmt.Errorf("failed to read key: %w", err)
|
||||
}
|
||||
key := string(keyBytes)
|
||||
|
||||
// Read value type
|
||||
var valueType uint32
|
||||
if err := binary.Read(reader, binary.LittleEndian, &valueType); err != nil {
|
||||
return "", nil, fmt.Errorf("failed to read value type: %w", err)
|
||||
}
|
||||
|
||||
// Read value based on type
|
||||
value, err := readValue(reader, valueType)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("failed to read value for key %s: %w", key, err)
|
||||
}
|
||||
|
||||
return key, value, nil
|
||||
}
|
||||
|
||||
// readValue reads a value based on its type
|
||||
func readValue(reader io.Reader, valueType uint32) (interface{}, error) {
|
||||
switch valueType {
|
||||
case ggufTypeUint8:
|
||||
var v uint8
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeInt8:
|
||||
var v int8
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeUint16:
|
||||
var v uint16
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeInt16:
|
||||
var v int16
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeUint32:
|
||||
var v uint32
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeInt32:
|
||||
var v int32
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeFloat32:
|
||||
var v float32
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeUint64:
|
||||
var v uint64
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeInt64:
|
||||
var v int64
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeFloat64:
|
||||
var v float64
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v, err
|
||||
case ggufTypeBool:
|
||||
var v uint8
|
||||
err := binary.Read(reader, binary.LittleEndian, &v)
|
||||
return v != 0, err
|
||||
case ggufTypeString:
|
||||
return readString(reader)
|
||||
case ggufTypeArray:
|
||||
return readArray(reader)
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown value type: %d", valueType)
|
||||
}
|
||||
}
|
||||
|
||||
// readString reads a length-prefixed UTF-8 string
|
||||
func readString(reader io.Reader) (string, error) {
|
||||
var length uint64
|
||||
if err := binary.Read(reader, binary.LittleEndian, &length); err != nil {
|
||||
return "", fmt.Errorf("failed to read string length: %w", err)
|
||||
}
|
||||
|
||||
if length > maxKeyLen {
|
||||
return "", fmt.Errorf("string length exceeds maximum: %d", length)
|
||||
}
|
||||
|
||||
strBytes := make([]byte, length)
|
||||
if _, err := io.ReadFull(reader, strBytes); err != nil {
|
||||
return "", fmt.Errorf("failed to read string: %w", err)
|
||||
}
|
||||
|
||||
return string(strBytes), nil
|
||||
}
|
||||
|
||||
// readArray reads an array value
|
||||
func readArray(reader io.Reader) (interface{}, error) {
|
||||
// Read array element type
|
||||
var elemType uint32
|
||||
if err := binary.Read(reader, binary.LittleEndian, &elemType); err != nil {
|
||||
return nil, fmt.Errorf("failed to read array element type: %w", err)
|
||||
}
|
||||
|
||||
// Read array length
|
||||
var length uint64
|
||||
if err := binary.Read(reader, binary.LittleEndian, &length); err != nil {
|
||||
return nil, fmt.Errorf("failed to read array length: %w", err)
|
||||
}
|
||||
|
||||
if length > 1000 {
|
||||
// Limit array size to avoid memory issues
|
||||
return nil, fmt.Errorf("array length too large: %d", length)
|
||||
}
|
||||
|
||||
// Read array elements
|
||||
var elements []interface{}
|
||||
for i := uint64(0); i < length; i++ {
|
||||
value, err := readValue(reader, elemType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read array element %d: %w", i, err)
|
||||
}
|
||||
elements = append(elements, value)
|
||||
}
|
||||
|
||||
return elements, nil
|
||||
}
|
||||
|
||||
// quantPattern matches common GGUF quantization labels embedded in filenames,
// e.g. "Q4_K_M", "Q5_K_S", "Q8_0", and importance-matrix variants like "IQ4_NL".
// Compiled once at package scope so hot callers don't recompile it per call.
//
// Fixes over the previous inline pattern `[IQ]\d+_[A-Z_]+`:
//   - `[IQ]` was a character class matching a single letter, so "IQ4_NL"
//     matched as "Q4_NL"; `I?Q` matches the optional "I" prefix correctly.
//   - `[A-Z_]+` excluded digits, so labels like "Q8_0" never matched at all.
var quantPattern = regexp.MustCompile(`I?Q\d+_[A-Z0-9_]+`)

// inferQuantizationFromFilename attempts to extract quantization info from filename.
// It returns the matched label, or "unknown" when no recognizable quantization
// token is present.
func inferQuantizationFromFilename(filename string) string {
	if match := quantPattern.FindString(filename); match != "" {
		return match
	}
	return "unknown"
}
|
||||
|
||||
// computeMetadataHash computes a stable hash of the metadata for use as a global identifier
|
||||
func computeMetadataHash(metadata *pkg.GGUFFileMetadata) string {
|
||||
// Create a stable representation of the metadata
|
||||
hashData := struct {
|
||||
Format string
|
||||
Name string
|
||||
Version string
|
||||
Architecture string
|
||||
GGUFVersion uint32
|
||||
TensorCount uint64
|
||||
}{
|
||||
Format: metadata.ModelFormat,
|
||||
Name: metadata.ModelName,
|
||||
Version: metadata.ModelVersion,
|
||||
Architecture: metadata.Architecture,
|
||||
GGUFVersion: metadata.GGUFVersion,
|
||||
TensorCount: metadata.TensorCount,
|
||||
}
|
||||
|
||||
// Marshal to JSON for stable hashing
|
||||
jsonBytes, err := json.Marshal(hashData)
|
||||
if err != nil {
|
||||
log.Warnf("failed to marshal metadata for hashing: %v", err)
|
||||
return ""
|
||||
}
|
||||
|
||||
// Compute SHA256 hash
|
||||
hash := sha256.Sum256(jsonBytes)
|
||||
return fmt.Sprintf("%x", hash[:8]) // Use first 8 bytes (16 hex chars)
|
||||
}
|
||||
|
||||
68
syft/pkg/cataloger/aiartifact/parse_gguf_model.go
Normal file
68
syft/pkg/cataloger/aiartifact/parse_gguf_model.go
Normal file
@ -0,0 +1,68 @@
|
||||
package aiartifact
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/anchore/syft/internal"
|
||||
"github.com/anchore/syft/internal/log"
|
||||
"github.com/anchore/syft/internal/unknown"
|
||||
"github.com/anchore/syft/syft/artifact"
|
||||
"github.com/anchore/syft/syft/file"
|
||||
"github.com/anchore/syft/syft/pkg"
|
||||
"github.com/anchore/syft/syft/pkg/cataloger/generic"
|
||||
)
|
||||
|
||||
// parseGGUFModel parses a GGUF model file and returns the discovered package.
|
||||
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
defer internal.CloseAndLogError(reader, reader.Location.Path())
|
||||
|
||||
// Read header (we'll read a reasonable amount to parse the header without reading entire file)
|
||||
// GGUF headers are typically < 1MB, but we'll use a 10MB limit to be safe
|
||||
const maxHeaderSize = 10 * 1024 * 1024
|
||||
limitedReader := io.LimitReader(reader, maxHeaderSize)
|
||||
|
||||
// We need to buffer the data because we need to check magic and parse
|
||||
headerData := make([]byte, 0, 8192) // Start with 8KB buffer
|
||||
buf := make([]byte, 8192)
|
||||
for {
|
||||
n, err := limitedReader.Read(buf)
|
||||
if n > 0 {
|
||||
headerData = append(headerData, buf[:n]...)
|
||||
}
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("error reading file: %w", err)
|
||||
}
|
||||
// Stop if we've read enough for a reasonable header
|
||||
if len(headerData) > maxHeaderSize {
|
||||
log.Warnf("GGUF header at %s exceeds max size, truncating", reader.Location.Path())
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Check if this is actually a GGUF file
|
||||
if len(headerData) < 4 {
|
||||
return nil, nil, fmt.Errorf("file too small to be a valid GGUF file")
|
||||
}
|
||||
|
||||
// Parse the GGUF header
|
||||
metadata, err := parseGGUFHeader(headerData, reader.Location.Path())
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to parse GGUF file: %w", err)
|
||||
}
|
||||
|
||||
// Create package from metadata
|
||||
p := newGGUFPackage(
|
||||
metadata,
|
||||
reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
|
||||
)
|
||||
|
||||
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse GGUF file")
|
||||
}
|
||||
|
||||
// integrity check
|
||||
var _ generic.Parser = parseGGUFModel
|
||||
47
syft/pkg/gguf.go
Normal file
47
syft/pkg/gguf.go
Normal file
@ -0,0 +1,47 @@
|
||||
package pkg
|
||||
|
||||
// GGUFFileMetadata represents metadata extracted from a GGUF (GPT-Generated Unified Format) model file.
// GGUF is a binary file format used for storing model weights for the GGML library, designed for fast
// loading and saving of models, particularly quantized large language models.
//
// NOTE: field order is significant — it determines JSON marshaling order.
type GGUFFileMetadata struct {
	// ModelFormat is always "gguf"
	ModelFormat string `json:"modelFormat" cyclonedx:"modelFormat"`

	// ModelName is the name of the model (from general.name, falling back to the filename)
	ModelName string `json:"modelName" cyclonedx:"modelName"`

	// ModelVersion is the version of the model (from general.version if available, else "unknown")
	ModelVersion string `json:"modelVersion,omitempty" cyclonedx:"modelVersion"`

	// FileSize is the size of the GGUF file in bytes (best-effort if available from resolver).
	// NOTE(review): not populated by the header parser in this change — presumably
	// filled in by callers with resolver data; confirm.
	FileSize int64 `json:"fileSize,omitempty" cyclonedx:"fileSize"`

	// Hash is a content hash of the identity-relevant metadata fields
	// (for stable global identifiers across remotes)
	Hash string `json:"hash,omitempty" cyclonedx:"hash"`

	// License is the license identifier (from general.license if present)
	License string `json:"license,omitempty" cyclonedx:"license"`

	// GGUFVersion is the GGUF format version (e.g., 3)
	GGUFVersion uint32 `json:"ggufVersion" cyclonedx:"ggufVersion"`

	// Architecture is the model architecture (from general.architecture, e.g., "qwen3moe", "llama")
	Architecture string `json:"architecture,omitempty" cyclonedx:"architecture"`

	// Quantization is the quantization type (e.g., "IQ4_NL", "Q4_K_M"); "unknown" when undeterminable
	Quantization string `json:"quantization,omitempty" cyclonedx:"quantization"`

	// Parameters is the number of model parameters (from general.parameter_count if present)
	Parameters uint64 `json:"parameters,omitempty" cyclonedx:"parameters"`

	// TensorCount is the number of tensors in the model (clamped to the parser's safety limit)
	TensorCount uint64 `json:"tensorCount" cyclonedx:"tensorCount"`

	// Header contains the remaining key-value pairs from the GGUF header that are not already
	// represented as typed fields above. This preserves additional metadata fields for reference
	// (namespaced with general.*, llama.*, etc.) while avoiding duplication.
	Header map[string]interface{} `json:"header,omitempty" cyclonedx:"header"`

	// TruncatedHeader indicates if the header was truncated during parsing (for very large headers
	// or when a key-value pair failed to decode)
	TruncatedHeader bool `json:"truncatedHeader,omitempty" cyclonedx:"truncatedHeader"`
}
|
||||
@ -50,6 +50,7 @@ const (
|
||||
TerraformPkg Type = "terraform"
|
||||
WordpressPluginPkg Type = "wordpress-plugin"
|
||||
HomebrewPkg Type = "homebrew"
|
||||
ModelPkg Type = "model"
|
||||
)
|
||||
|
||||
// AllPkgs represents all supported package types
|
||||
@ -94,6 +95,7 @@ var AllPkgs = []Type{
|
||||
TerraformPkg,
|
||||
WordpressPluginPkg,
|
||||
HomebrewPkg,
|
||||
ModelPkg,
|
||||
}
|
||||
|
||||
// PackageURLType returns the PURL package type for the current package.
|
||||
@ -170,6 +172,8 @@ func (t Type) PackageURLType() string {
|
||||
return "wordpress-plugin"
|
||||
case HomebrewPkg:
|
||||
return "homebrew"
|
||||
case ModelPkg:
|
||||
return "generic/model"
|
||||
default:
|
||||
// TODO: should this be a "generic" purl type instead?
|
||||
return ""
|
||||
@ -258,6 +262,8 @@ func TypeByName(name string) Type {
|
||||
return WordpressPluginPkg
|
||||
case "homebrew":
|
||||
return HomebrewPkg
|
||||
case "model":
|
||||
return ModelPkg
|
||||
default:
|
||||
return UnknownPkg
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user