feat: Add support for scanning GGUF models from OCI registries (#4335)

---------
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
Co-authored-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
Christopher Angelo Phillips 2026-02-09 16:05:52 -05:00 committed by GitHub
parent 3a23cfff1d
commit 2c5e193f7a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
51 changed files with 6127 additions and 238 deletions

View File

@ -59,7 +59,9 @@ const (
nonImageSchemeHelp = ` {{.appName}} {{.command}} dir:path/to/yourproject read directly from a path on disk (any directory) nonImageSchemeHelp = ` {{.appName}} {{.command}} dir:path/to/yourproject read directly from a path on disk (any directory)
{{.appName}} {{.command}} file:path/to/yourproject/file read directly from a path on disk (any single file) {{.appName}} {{.command}} file:path/to/yourproject/file read directly from a path on disk (any single file)
` `
scanSchemeHelp = "\n " + schemeHelpHeader + "\n" + imageSchemeHelp + nonImageSchemeHelp modelSchemeHelp = ` {{.appName}} {{.command}} oci-model-registry:ai/llama3.2 scan an OCI model artifact from a registry (e.g. Docker Hub AI models)
`
scanSchemeHelp = "\n " + schemeHelpHeader + "\n" + imageSchemeHelp + modelSchemeHelp + nonImageSchemeHelp
scanHelp = scanExample + scanSchemeHelp scanHelp = scanExample + scanSchemeHelp
) )

View File

@ -3,10 +3,12 @@ package internal
const ( const (
// JSONSchemaVersion is the current schema version output by the JSON encoder // JSONSchemaVersion is the current schema version output by the JSON encoder
// This is roughly following the "SchemaVer" guidelines for versioning the JSON schema. Please see schema/json/README.md for details on how to increment. // This is roughly following the "SchemaVer" guidelines for versioning the JSON schema. Please see schema/json/README.md for details on how to increment.
JSONSchemaVersion = "16.1.2" JSONSchemaVersion = "16.1.3"
// Changelog // Changelog
// 16.1.0 - reformulated the python pdm fields (added "URL" and removed the unused "path" field). // 16.1.0 - reformulated the python pdm fields (added "URL" and removed the unused "path" field).
// 16.1.1 - correct elf package osCpe field according to the document of systemd (also add appCpe field) // 16.1.1 - correct elf package osCpe field according to the document of systemd (also add appCpe field)
// 16.1.2 - placeholder for 16.1.2 changelog
// 16.1.3 - add GGUFFileParts to GGUFFileHeader metadata
) )

View File

@ -82,12 +82,29 @@ func assembleTypeContainer(items []any) (any, map[string]string) {
return reflect.New(structType).Elem().Interface(), mapping return reflect.New(structType).Elem().Interface(), mapping
} }
//nolint:funlen
func build() *jsonschema.Schema { func build() *jsonschema.Schema {
// create metadata mapping first so we can use it in the Namer function for self-referential types
pkgMetadataContainer, pkgMetadataMapping := assembleTypeContainer(packagemetadata.AllTypes())
pkgMetadataContainerType := reflect.TypeOf(pkgMetadataContainer)
// create a set of valid metadata display names for lookup
// (since Namer now returns display names, the schema definitions use display names as keys)
pkgMetadataDisplayNames := make(map[string]struct{}, len(pkgMetadataMapping))
for _, displayName := range pkgMetadataMapping {
pkgMetadataDisplayNames[displayName] = struct{}{}
}
reflector := &jsonschema.Reflector{ reflector := &jsonschema.Reflector{
BaseSchemaID: schemaID(), BaseSchemaID: schemaID(),
AllowAdditionalProperties: true, AllowAdditionalProperties: true,
Namer: func(r reflect.Type) string { Namer: func(r reflect.Type) string {
return strings.TrimPrefix(r.Name(), "JSON") name := strings.TrimPrefix(r.Name(), "JSON")
// if this is a metadata type, use the mapped name for consistent references
if mappedName, ok := pkgMetadataMapping[name]; ok {
return mappedName
}
return name
}, },
CommentMap: make(map[string]string), CommentMap: make(map[string]string),
} }
@ -123,9 +140,6 @@ func build() *jsonschema.Schema {
copyAliasFieldComments(reflector.CommentMap, repoRoot) copyAliasFieldComments(reflector.CommentMap, repoRoot)
} }
pkgMetadataContainer, pkgMetadataMapping := assembleTypeContainer(packagemetadata.AllTypes())
pkgMetadataContainerType := reflect.TypeOf(pkgMetadataContainer)
// srcMetadataContainer := assembleTypeContainer(sourcemetadata.AllTypes()) // srcMetadataContainer := assembleTypeContainer(sourcemetadata.AllTypes())
// srcMetadataContainerType := reflect.TypeOf(srcMetadataContainer) // srcMetadataContainerType := reflect.TypeOf(srcMetadataContainer)
@ -144,11 +158,10 @@ func build() *jsonschema.Schema {
continue continue
} }
displayName, ok := pkgMetadataMapping[typeName] if _, ok := pkgMetadataDisplayNames[typeName]; ok {
if ok { // this is a package metadata type (typeName is already the display name from Namer)
// this is a package metadata type... documentSchema.Definitions[typeName] = definition
documentSchema.Definitions[displayName] = definition metadataNames = append(metadataNames, typeName)
metadataNames = append(metadataNames, displayName)
} else { } else {
// this is a type that the metadata type uses (e.g. DpkgFileRecord) // this is a type that the metadata type uses (e.g. DpkgFileRecord)
documentSchema.Definitions[typeName] = definition documentSchema.Definitions[typeName] = definition

View File

@ -26,6 +26,7 @@ var knownNonMetadataTypeNames = strset.New(
// known to be metadata types themselves. Adding to this list will prevent the removal of the type from the schema. // known to be metadata types themselves. Adding to this list will prevent the removal of the type from the schema.
var knownMetadataTypeNames = strset.New( var knownMetadataTypeNames = strset.New(
"DotnetPortableExecutableEntry", "DotnetPortableExecutableEntry",
"GGUFFileHeader",
) )
func DiscoverTypeNames() ([]string, error) { func DiscoverTypeNames() ([]string, error) {

View File

@ -94,25 +94,33 @@ func findMetadataDefinitionNamesInFile(path string) ([]string, []string, error)
// loop over all types declared in the type declaration // loop over all types declared in the type declaration
for _, typ := range spec.Specs { for _, typ := range spec.Specs {
// check if the type is a struct type typeSpec, ok := typ.(*ast.TypeSpec)
spec, ok := typ.(*ast.TypeSpec) if !ok || typeSpec.Type == nil {
if !ok || spec.Type == nil {
continue continue
} }
structType, ok := spec.Type.(*ast.StructType) name := typeSpec.Name.String()
if !ok {
continue
}
// check if the struct type ends with "Metadata"
name := spec.Name.String()
// only look for exported types that end with "Metadata" // only look for exported types that end with "Metadata"
if isMetadataTypeCandidate(name) { if !isMetadataTypeCandidate(name) {
// print the full declaration of the struct type continue
metadataDefinitions = append(metadataDefinitions, name) }
metadataDefinitions = append(metadataDefinitions, name)
// handle struct types (e.g., "type FooMetadata struct {...}")
if structType, ok := typeSpec.Type.(*ast.StructType); ok {
usedTypeNames = append(usedTypeNames, typeNamesUsedInStruct(structType)...) usedTypeNames = append(usedTypeNames, typeNamesUsedInStruct(structType)...)
continue
}
// handle type definitions from another type (e.g., "type FooMetadata BarMetadata")
// if the base type is NOT a metadata candidate, track it as used
// (e.g., we want both ImageMetadata and OCIModelMetadata which is an alias to it)
if ident, ok := typeSpec.Type.(*ast.Ident); ok {
if !isMetadataTypeCandidate(ident.Name) {
usedTypeNames = append(usedTypeNames, ident.Name)
}
} }
} }
} }

View File

@ -6,5 +6,5 @@ import "github.com/anchore/syft/syft/source"
// AllTypes returns a list of all source metadata types that syft supports (that are represented in the source.Description.Metadata field). // AllTypes returns a list of all source metadata types that syft supports (that are represented in the source.Description.Metadata field).
func AllTypes() []any { func AllTypes() []any {
return []any{source.DirectoryMetadata{}, source.FileMetadata{}, source.ImageMetadata{}, source.SnapMetadata{}} return []any{source.DirectoryMetadata{}, source.FileMetadata{}, source.ImageMetadata{}, source.OCIModelMetadata{}, source.SnapMetadata{}}
} }

View File

@ -12,6 +12,7 @@ var jsonNameFromType = map[reflect.Type][]string{
reflect.TypeOf(source.FileMetadata{}): {"file"}, reflect.TypeOf(source.FileMetadata{}): {"file"},
reflect.TypeOf(source.ImageMetadata{}): {"image"}, reflect.TypeOf(source.ImageMetadata{}): {"image"},
reflect.TypeOf(source.SnapMetadata{}): {"snap"}, reflect.TypeOf(source.SnapMetadata{}): {"snap"},
reflect.TypeOf(source.OCIModelMetadata{}): {"oci-model"},
} }
func AllTypeNames() []string { func AllTypeNames() []string {

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
{ {
"$schema": "https://json-schema.org/draft/2020-12/schema", "$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "anchore.io/schema/syft/json/16.1.2/document", "$id": "anchore.io/schema/syft/json/16.1.3/document",
"$ref": "#/$defs/Document", "$ref": "#/$defs/Document",
"$defs": { "$defs": {
"AlpmDbEntry": { "AlpmDbEntry": {
@ -1478,6 +1478,13 @@
"metadataHash": { "metadataHash": {
"type": "string", "type": "string",
"description": "MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames." "description": "MetadataKeyValuesHash is a xx64 hash of all key-value pairs from the GGUF header metadata.\nThis hash is computed over the complete header metadata (including the fields extracted\ninto typed fields above) and provides a stable identifier for the model configuration\nacross different file locations or remotes. It allows matching identical models even\nwhen stored in different repositories or with different filenames."
},
"parts": {
"items": {
"$ref": "#/$defs/GgufFileHeader"
},
"type": "array",
"description": "Parts contains headers from additional GGUF files that were merged\ninto this package during post-processing (e.g., from OCI layers without model names)."
} }
}, },
"type": "object", "type": "object",

View File

@ -477,7 +477,7 @@ func (c *CreateSBOMConfig) Create(ctx context.Context, src source.Source) (*sbom
func findDefaultTags(src source.Description) ([]string, error) { func findDefaultTags(src source.Description) ([]string, error) {
switch m := src.Metadata.(type) { switch m := src.Metadata.(type) {
case source.ImageMetadata: case source.ImageMetadata, source.OCIModelMetadata:
return []string{pkgcataloging.ImageTag, filecataloging.FileTag}, nil return []string{pkgcataloging.ImageTag, filecataloging.FileTag}, nil
case source.FileMetadata, source.DirectoryMetadata: case source.FileMetadata, source.DirectoryMetadata:
return []string{pkgcataloging.DirectoryTag, filecataloging.FileTag}, nil return []string{pkgcataloging.DirectoryTag, filecataloging.FileTag}, nil

View File

@ -13,16 +13,18 @@ import (
) )
var _ Resolver = (*MockResolver)(nil) var _ Resolver = (*MockResolver)(nil)
var _ OCIMediaTypeResolver = (*MockResolver)(nil)
// MockResolver implements the FileResolver interface and is intended for use *only in test code*. // MockResolver implements the FileResolver interface and is intended for use *only in test code*.
// It provides an implementation that can resolve local filesystem paths using only a provided discrete list of file // It provides an implementation that can resolve local filesystem paths using only a provided discrete list of file
// paths, which are typically paths to test fixtures. // paths, which are typically paths to test fixtures.
type MockResolver struct { type MockResolver struct {
locations []Location locations []Location
metadata map[Coordinates]Metadata metadata map[Coordinates]Metadata
mimeTypeIndex map[string][]Location mimeTypeIndex map[string][]Location
extension map[string][]Location mediaTypeIndex map[string][]Location
basename map[string][]Location extension map[string][]Location
basename map[string][]Location
} }
// NewMockResolverForPaths creates a new MockResolver, where the only resolvable // NewMockResolverForPaths creates a new MockResolver, where the only resolvable
@ -72,6 +74,34 @@ func NewMockResolverForPathsWithMetadata(metadata map[Coordinates]Metadata) *Moc
} }
} }
// NewMockResolverForMediaTypes creates a MockResolver that can resolve files by media type.
// The mediaTypes map specifies which locations should be returned for each media type.
// Extension and basename indexes are also populated so the corresponding lookup APIs work.
func NewMockResolverForMediaTypes(mediaTypes map[string][]Location) *MockResolver {
	r := &MockResolver{
		metadata:       make(map[Coordinates]Metadata),
		mediaTypeIndex: make(map[string][]Location),
		extension:      make(map[string][]Location),
		basename:       make(map[string][]Location),
	}
	for mediaType, locs := range mediaTypes {
		r.mediaTypeIndex[mediaType] = append(r.mediaTypeIndex[mediaType], locs...)
		for _, loc := range locs {
			r.locations = append(r.locations, loc)
			// index by extension and basename so FilesByExtension/FilesByBasename resolve too
			ext := path.Ext(loc.RealPath)
			r.extension[ext] = append(r.extension[ext], loc)
			bn := path.Base(loc.RealPath)
			r.basename[bn] = append(r.basename[bn], loc)
		}
	}
	return r
}
// HasPath indicates if the given path exists in the underlying source. // HasPath indicates if the given path exists in the underlying source.
func (r MockResolver) HasPath(path string) bool { func (r MockResolver) HasPath(path string) bool {
for _, l := range r.locations { for _, l := range r.locations {
@ -189,6 +219,14 @@ func (r MockResolver) FilesByMIMEType(types ...string) ([]Location, error) {
return locations, nil return locations, nil
} }
// FilesByMediaType returns every location registered under any of the given media types.
func (r MockResolver) FilesByMediaType(types ...string) ([]Location, error) {
	var results []Location
	for _, mediaType := range types {
		results = append(results, r.mediaTypeIndex[mediaType]...)
	}
	return results, nil
}
func (r MockResolver) FilesByExtension(extensions ...string) ([]Location, error) { func (r MockResolver) FilesByExtension(extensions ...string) ([]Location, error) {
var results []Location var results []Location
for _, ext := range extensions { for _, ext := range extensions {

View File

@ -52,6 +52,17 @@ type PathResolver interface {
RelativeFileByPath(_ Location, path string) *Location RelativeFileByPath(_ Location, path string) *Location
} }
// OCIMediaTypeResolver resolves single files as a layer in an OCI artifact for a given media type.
type OCIMediaTypeResolver interface {
	// FilesByMediaType fetches a set of file references whose contents have been classified as one of the given media types.
	// The implementation for this may vary; however, this was first implemented to classify AI blobs stored in OCI images.
	// The following considerations should be made when implementing:
	// - only return locations to files (NOT directories)
	// - locations for the implementer should be "/" and the fsid should be the layer digest the blob was found in
	// - locations should be used with the FileContents API to return readers to the temporary data
	FilesByMediaType(types ...string) ([]Location, error)
}
// LocationResolver provides iteration over all file locations in a source. // LocationResolver provides iteration over all file locations in a source.
type LocationResolver interface { type LocationResolver interface {
// AllLocations returns a channel of all file references from the underlying source. // AllLocations returns a channel of all file references from the underlying source.

View File

@ -35,6 +35,7 @@ const (
spdxPrimaryPurposeOther = "OTHER" spdxPrimaryPurposeOther = "OTHER"
prefixImage = "Image" prefixImage = "Image"
prefixOCIModel = "OCIModel"
prefixDirectory = "Directory" prefixDirectory = "Directory"
prefixFile = "File" prefixFile = "File"
prefixSnap = "Snap" prefixSnap = "Snap"
@ -215,6 +216,36 @@ func toRootPackage(s source.Description) *spdx.Package {
} }
} }
case source.OCIModelMetadata:
prefix = prefixOCIModel
purpose = spdxPrimaryPurposeContainer
qualifiers := packageurl.Qualifiers{
{
Key: "arch",
Value: m.Architecture,
},
}
ref, _ := reference.Parse(m.UserInput)
if ref, ok := ref.(reference.NamedTagged); ok {
qualifiers = append(qualifiers, packageurl.Qualifier{
Key: "tag",
Value: ref.Tag(),
})
}
c := toChecksum(m.ManifestDigest)
if c != nil {
checksums = append(checksums, *c)
purl = &packageurl.PackageURL{
Type: "oci",
Name: s.Name,
Version: m.ManifestDigest,
Qualifiers: qualifiers,
}
}
case source.DirectoryMetadata: case source.DirectoryMetadata:
prefix = prefixDirectory prefix = prefixDirectory
purpose = spdxPrimaryPurposeFile purpose = spdxPrimaryPurposeFile

View File

@ -316,6 +316,81 @@ func Test_toFormatModel(t *testing.T) {
}, },
}, },
}, },
{
name: "oci-model",
in: sbom.SBOM{
Source: source.Description{
Name: "llama",
Version: "sha256:d34db33f",
Supplier: "Model Provider",
Metadata: source.OCIModelMetadata{
UserInput: "model-repo/llama:latest",
ManifestDigest: "sha256:d34db33f",
},
},
Artifacts: sbom.Artifacts{
Packages: pkg.NewCollection(pkg.Package{
Name: "pkg-1",
Version: "version-1",
}),
},
},
expected: &spdx.Document{
SPDXIdentifier: "DOCUMENT",
SPDXVersion: spdx.Version,
DataLicense: spdx.DataLicense,
DocumentName: "llama",
Packages: []*spdx.Package{
{
PackageSPDXIdentifier: "Package-pkg-1-pkg-1",
PackageName: "pkg-1",
PackageVersion: "version-1",
PackageSupplier: &spdx.Supplier{
Supplier: "Model Provider",
SupplierType: "Organization",
},
},
{
PackageSPDXIdentifier: "DocumentRoot-OCIModel-llama",
PackageName: "llama",
PackageVersion: "sha256:d34db33f",
PrimaryPackagePurpose: "CONTAINER",
PackageChecksums: []spdx.Checksum{{Algorithm: "SHA256", Value: "d34db33f"}},
PackageExternalReferences: []*v2_3.PackageExternalReference{
{
Category: "PACKAGE-MANAGER",
RefType: "purl",
Locator: "pkg:oci/llama@sha256%3Ad34db33f?arch=&tag=latest",
},
},
PackageSupplier: &spdx.Supplier{
Supplier: "Model Provider",
SupplierType: "Organization",
},
},
},
Relationships: []*spdx.Relationship{
{
RefA: spdx.DocElementID{
ElementRefID: "DocumentRoot-OCIModel-llama",
},
RefB: spdx.DocElementID{
ElementRefID: "Package-pkg-1-pkg-1",
},
Relationship: spdx.RelationshipContains,
},
{
RefA: spdx.DocElementID{
ElementRefID: "DOCUMENT",
},
RefB: spdx.DocElementID{
ElementRefID: "DocumentRoot-OCIModel-llama",
},
Relationship: spdx.RelationshipDescribes,
},
},
},
},
} }
for _, test := range tests { for _, test := range tests {

View File

@ -122,6 +122,9 @@ func toPath(s source.Description, p pkg.Package) string {
case source.ImageMetadata: case source.ImageMetadata:
image := strings.ReplaceAll(metadata.UserInput, ":/", "//") image := strings.ReplaceAll(metadata.UserInput, ":/", "//")
return fmt.Sprintf("%s:/%s", image, packagePath) return fmt.Sprintf("%s:/%s", image, packagePath)
case source.OCIModelMetadata:
image := strings.ReplaceAll(metadata.UserInput, ":/", "//")
return fmt.Sprintf("%s:/%s", image, packagePath)
case source.FileMetadata: case source.FileMetadata:
path := trimRelative(metadata.Path) path := trimRelative(metadata.Path)
if isArchive(metadata.Path) { if isArchive(metadata.Path) {

View File

@ -178,6 +178,11 @@ func Test_toGithubModel(t *testing.T) {
metadata: source.SnapMetadata{}, metadata: source.SnapMetadata{},
testPath: "name:/etc", testPath: "name:/etc",
}, },
{
name: "oci-model",
metadata: source.OCIModelMetadata{UserInput: "model-repo/llama:latest"},
testPath: "model-repo/llama:latest:/etc",
},
} }
for _, test := range tests { for _, test := range tests {

View File

@ -12,6 +12,8 @@ func DocumentName(src source.Description) string {
switch metadata := src.Metadata.(type) { switch metadata := src.Metadata.(type) {
case source.ImageMetadata: case source.ImageMetadata:
return metadata.UserInput return metadata.UserInput
case source.OCIModelMetadata:
return metadata.UserInput
case source.DirectoryMetadata: case source.DirectoryMetadata:
return metadata.Path return metadata.Path
case source.FileMetadata: case source.FileMetadata:

View File

@ -54,6 +54,17 @@ func Test_DocumentName(t *testing.T) {
}, },
expected: "some/name", expected: "some/name",
}, },
{
name: "oci-model",
srcMetadata: source.Description{
Metadata: source.OCIModelMetadata{
UserInput: "model-repo/name:tag",
ID: "id",
ManifestDigest: "digest",
},
},
expected: "model-repo/name:tag",
},
{ {
name: "named", name: "named",
srcMetadata: source.Description{ srcMetadata: source.Description{

View File

@ -14,6 +14,7 @@ import (
const ( const (
InputImage = "image" InputImage = "image"
InputOCIModel = "oci-model"
InputDirectory = "dir" InputDirectory = "dir"
InputFile = "file" InputFile = "file"
InputSnap = "snap" InputSnap = "snap"
@ -30,6 +31,8 @@ func DocumentNamespace(name string, src source.Description, desc sbom.Descriptor
switch src.Metadata.(type) { switch src.Metadata.(type) {
case source.ImageMetadata: case source.ImageMetadata:
input = InputImage input = InputImage
case source.OCIModelMetadata:
input = InputOCIModel
case source.DirectoryMetadata: case source.DirectoryMetadata:
input = InputDirectory input = InputDirectory
case source.FileMetadata: case source.FileMetadata:

View File

@ -61,6 +61,18 @@ func Test_DocumentNamespace(t *testing.T) {
}, },
expected: "https://anchore.com/syft/snap/my-name-", expected: "https://anchore.com/syft/snap/my-name-",
}, },
{
name: "oci-model",
inputName: "my-name",
src: source.Description{
Metadata: source.OCIModelMetadata{
UserInput: "model-repo/name:tag",
ID: "id",
ManifestDigest: "digest",
},
},
expected: "https://anchore.com/syft/oci-model/my-name-",
},
} }
for _, test := range tests { for _, test := range tests {
t.Run(test.name, func(t *testing.T) { t.Run(test.name, func(t *testing.T) {

View File

@ -83,7 +83,7 @@ func SourceInfo(p pkg.Package) string {
case pkg.TerraformPkg: case pkg.TerraformPkg:
answer = "acquired package info from Terraform dependency lock file" answer = "acquired package info from Terraform dependency lock file"
case pkg.ModelPkg: case pkg.ModelPkg:
answer = "acquired package info from AI artifact (e.g. GGUF File" answer = "acquired package info from AI artifact (e.g. GGUF File)"
default: default:
answer = "acquired package info from the following paths" answer = "acquired package info from the following paths"
} }

View File

@ -190,6 +190,37 @@ func TestSource_UnmarshalJSON(t *testing.T) {
}, },
}, },
}, },
{
name: "oci-model",
input: []byte(`{
"id": "foobar",
"type": "oci-model",
"metadata": {
"userInput": "model-repo/llama:latest",
"imageID": "sha256:e7b300aee9f9bf3433d32bc9305bfdd22183beb59d933b48d77ab56ba53a197a",
"manifestDigest": "sha256:e515aad2ed234a5072c4d2ef86a1cb77d5bfe4b11aa865d9214875734c4eeb3c",
"mediaType": "application/vnd.oci.image.manifest.v1+json",
"tags": [],
"imageSize": 5576169,
"layers": [],
"repoDigests": []
}
}`),
expected: &Source{
ID: "foobar",
Type: "oci-model",
Metadata: source.OCIModelMetadata{
UserInput: "model-repo/llama:latest",
ID: "sha256:e7b300aee9f9bf3433d32bc9305bfdd22183beb59d933b48d77ab56ba53a197a",
ManifestDigest: "sha256:e515aad2ed234a5072c4d2ef86a1cb77d5bfe4b11aa865d9214875734c4eeb3c",
MediaType: "application/vnd.oci.image.manifest.v1+json",
Tags: []string{},
Size: 5576169,
Layers: []source.LayerMetadata{},
RepoDigests: []string{},
},
},
},
{ {
name: "unknown source type", name: "unknown source type",
input: []byte(`{ input: []byte(`{

View File

@ -325,7 +325,17 @@ func toSourceModel(src source.Description) model.Source {
Metadata: src.Metadata, Metadata: src.Metadata,
} }
if metadata, ok := src.Metadata.(source.ImageMetadata); ok { switch metadata := src.Metadata.(type) {
case source.ImageMetadata:
// ensure that empty collections are not shown as null
if metadata.RepoDigests == nil {
metadata.RepoDigests = []string{}
}
if metadata.Tags == nil {
metadata.Tags = []string{}
}
m.Metadata = metadata
case source.OCIModelMetadata:
// ensure that empty collections are not shown as null // ensure that empty collections are not shown as null
if metadata.RepoDigests == nil { if metadata.RepoDigests == nil {
metadata.RepoDigests = []string{} metadata.RepoDigests = []string{}

View File

@ -161,6 +161,34 @@ func Test_toSourceModel(t *testing.T) {
}, },
}, },
}, },
{
name: "oci-model",
src: source.Description{
ID: "test-id",
Name: "some-name",
Version: "some-version",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
},
},
expected: model.Source{
ID: "test-id",
Name: "some-name",
Version: "some-version",
Type: "oci-model",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
RepoDigests: []string{},
Tags: []string{},
},
},
},
// below are regression tests for when the name/version are not provided // below are regression tests for when the name/version are not provided
// historically we've hoisted up the name/version from the metadata, now it is a simple pass-through // historically we've hoisted up the name/version from the metadata, now it is a simple pass-through
{ {
@ -225,6 +253,30 @@ func Test_toSourceModel(t *testing.T) {
}, },
}, },
}, },
{
name: "oci-model - no name/version",
src: source.Description{
ID: "test-id",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
},
},
expected: model.Source{
ID: "test-id",
Type: "oci-model",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
RepoDigests: []string{},
Tags: []string{},
},
},
},
} }
for _, test := range tests { for _, test := range tests {
t.Run(test.name, func(t *testing.T) { t.Run(test.name, func(t *testing.T) {

View File

@ -130,6 +130,32 @@ func Test_toSyftSourceData(t *testing.T) {
}, },
}, },
}, },
{
name: "oci-model",
src: model.Source{
ID: "the-id",
Name: "some-name",
Version: "some-version",
Type: "oci-model",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
},
},
expected: &source.Description{
ID: "the-id",
Name: "some-name",
Version: "some-version",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
},
},
},
// below are regression tests for when the name/version are not provided // below are regression tests for when the name/version are not provided
// historically we've hoisted up the name/version from the metadata, now it is a simple pass-through // historically we've hoisted up the name/version from the metadata, now it is a simple pass-through
{ {
@ -192,6 +218,28 @@ func Test_toSyftSourceData(t *testing.T) {
}, },
}, },
}, },
{
name: "oci-model - no name/version",
src: model.Source{
ID: "the-id",
Type: "oci-model",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
},
},
expected: &source.Description{
ID: "the-id",
Metadata: source.OCIModelMetadata{
UserInput: "user-input",
ID: "id...",
ManifestDigest: "digest...",
MediaType: "type...",
},
},
},
} }
for _, test := range tests { for _, test := range tests {
t.Run(test.name, func(t *testing.T) { t.Run(test.name, func(t *testing.T) {

View File

@ -65,6 +65,8 @@ func validateSourcePlatform(src source.Source, cfg *GetSourceConfig) error {
switch meta.(type) { switch meta.(type) {
case *source.ImageMetadata, source.ImageMetadata: case *source.ImageMetadata, source.ImageMetadata:
return nil return nil
case *source.OCIModelMetadata, source.OCIModelMetadata:
return nil
case *source.SnapMetadata, source.SnapMetadata: case *source.SnapMetadata, source.SnapMetadata:
return nil return nil
default: default:

View File

@ -31,7 +31,8 @@ func TestGetProviders_Sources(t *testing.T) {
t.Errorf("Expected no error for Sources parameter, got: %v", err) t.Errorf("Expected no error for Sources parameter, got: %v", err)
} }
if len(providers) != 1 { // Registry tag has two providers: OCIModel and Image
t.Errorf("Expected 1 providers, got %d", len(providers)) if len(providers) != 2 {
t.Errorf("Expected 2 providers, got %d", len(providers))
} }
} }

View File

@ -111,6 +111,10 @@ func TestValidateSourcePlatform_SupportedMetadataTypes(t *testing.T) {
metadata: source.FileMetadata{}, metadata: source.FileMetadata{},
wantErr: require.Error, wantErr: require.Error,
}, },
{
name: "oci-model",
metadata: source.OCIModelMetadata{},
},
} }
for _, tt := range tests { for _, tt := range tests {

View File

@ -0,0 +1,141 @@
package fileresolver
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"github.com/anchore/syft/syft/file"
)
var _ file.Resolver = (*ContainerImageModel)(nil)
var _ file.OCIMediaTypeResolver = (*ContainerImageModel)(nil)
// LayerInfo holds information about an OCI model layer file stored on disk.
// Entries are keyed by layer digest in ContainerImageModel.layerFiles.
type LayerInfo struct {
	TempPath  string // Path to the temp file on disk holding the fetched layer data
	MediaType string // OCI media type of the layer
}
// ContainerImageModel is a file.Resolver implementation that provides access to
// GGUF header data fetched from OCI model artifacts via range-GET requests.
// This does not fetch the entire model from the registry, only a sliver of it.
type ContainerImageModel struct {
	tempDir    string                   // temp directory containing all layer files
	layerFiles map[string]LayerInfo     // digest -> layer info (temp path + media type)
	locations  map[string]file.Location // digest -> precomputed virtual location (RealPath "/", FileSystemID = digest)
}
// NewContainerImageModel creates a new resolver with the given temp directory and layer files.
// For every entry in layerFiles a virtual location is precomputed with RealPath "/",
// AccessPath "/", and the layer digest as the FileSystemID.
func NewContainerImageModel(tempDir string, layerFiles map[string]LayerInfo) *ContainerImageModel {
	resolver := &ContainerImageModel{
		tempDir:    tempDir,
		layerFiles: layerFiles,
		locations:  make(map[string]file.Location, len(layerFiles)),
	}
	for layerDigest := range layerFiles {
		// each layer is exposed as a single virtual file at "/" whose
		// filesystem ID is the layer digest
		resolver.locations[layerDigest] = file.NewVirtualLocationFromCoordinates(
			file.NewCoordinates("/", layerDigest), "/",
		)
	}
	return resolver
}
// FilesByMediaType returns locations for layers matching the given media type patterns.
// Patterns support glob-style matching (e.g., "application/vnd.docker.ai*").
func (r *ContainerImageModel) FilesByMediaType(types ...string) ([]file.Location, error) {
	var results []file.Location
	for digest, layer := range r.layerFiles {
		matchedAny := false
		for _, pattern := range types {
			ok, err := filepath.Match(pattern, layer.MediaType)
			if err != nil {
				return nil, fmt.Errorf("invalid media type pattern %q: %w", pattern, err)
			}
			if ok {
				matchedAny = true
				break // a single pattern match is enough; avoid duplicate locations
			}
		}
		if !matchedAny {
			continue
		}
		if loc, exists := r.locations[digest]; exists {
			results = append(results, loc)
		}
	}
	return results, nil
}
// FileContentsByLocation returns the contents of the file at the given location.
// The location's FileSystemID contains the layer digest, which is used to look up the temp file.
// This method is used as part of the content selection in the generic cataloger when locations
// are returned by searching for contents by media type.
func (r *ContainerImageModel) FileContentsByLocation(location file.Location) (io.ReadCloser, error) {
	// the FileSystemID carries the layer digest used as the lookup key
	layer, found := r.layerFiles[location.FileSystemID]
	if !found {
		return nil, fmt.Errorf("no file found for digest %q", location.FileSystemID)
	}
	return os.Open(layer.TempPath)
}
// FileMetadataByLocation returns metadata for the file at the given location.
// No per-file metadata is tracked for OCI model layers, so the zero value is returned.
func (r *ContainerImageModel) FileMetadataByLocation(_ file.Location) (file.Metadata, error) {
	return file.Metadata{}, nil
}
// HasPath checks if the given path exists in the resolver.
// Every layer is exposed under the single virtual path "/", so that is the
// only path that can exist (and only when at least one layer is present).
func (r *ContainerImageModel) HasPath(path string) bool {
	return path == "/" && len(r.layerFiles) > 0
}
// FilesByPath is not supported for OCI model artifacts; it always returns no results.
func (r *ContainerImageModel) FilesByPath(_ ...string) ([]file.Location, error) {
	return nil, nil
}

// FilesByGlob is not supported for OCI model artifacts; it always returns no results.
func (r *ContainerImageModel) FilesByGlob(_ ...string) ([]file.Location, error) {
	return nil, nil
}

// FilesByMIMEType always returns no results: MIME type detection is not
// performed on OCI model artifact layers (selection is by OCI media type instead).
func (r *ContainerImageModel) FilesByMIMEType(_ ...string) ([]file.Location, error) {
	return nil, nil
}

// RelativeFileByPath always returns nil: OCI model artifacts expose no layer
// hierarchy to resolve relative paths against.
func (r *ContainerImageModel) RelativeFileByPath(_ file.Location, _ string) *file.Location {
	return nil
}
// AllLocations streams every known layer location over the returned channel.
// The channel is closed once all locations are sent or ctx is canceled.
func (r *ContainerImageModel) AllLocations(ctx context.Context) <-chan file.Location {
	out := make(chan file.Location)
	go func() {
		defer close(out)
		for _, location := range r.locations {
			select {
			case out <- location:
			case <-ctx.Done():
				return
			}
		}
	}()
	return out
}

View File

@ -0,0 +1,130 @@
package fileresolver
import (
"io"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/file"
)
const ggufLayerMediaType = "application/vnd.docker.ai.gguf.v3"
// TestOCIModelResolver_FilesByMediaType verifies that media type selection
// supports exact matches, glob patterns, non-matches, and multiple patterns
// across multiple layers.
func TestOCIModelResolver_FilesByMediaType(t *testing.T) {
	tempDir := t.TempDir()
	tests := []struct {
		name       string
		layerFiles map[string]LayerInfo
		patterns   []string
		expected   int // expected number of matched locations
	}{
		{
			name: "exact match GGUF",
			layerFiles: map[string]LayerInfo{
				"sha256:abc123": {TempPath: filepath.Join(tempDir, "f1"), MediaType: ggufLayerMediaType},
			},
			patterns: []string{ggufLayerMediaType},
			expected: 1,
		},
		{
			name: "glob match docker ai",
			layerFiles: map[string]LayerInfo{
				"sha256:abc123": {TempPath: filepath.Join(tempDir, "f1"), MediaType: ggufLayerMediaType},
			},
			patterns: []string{"application/vnd.docker.ai*"},
			expected: 1,
		},
		{
			name: "no match",
			layerFiles: map[string]LayerInfo{
				"sha256:abc123": {TempPath: filepath.Join(tempDir, "f1"), MediaType: ggufLayerMediaType},
			},
			patterns: []string{"application/json"},
			expected: 0,
		},
		{
			name: "multiple patterns match multiple files",
			layerFiles: map[string]LayerInfo{
				"sha256:abc123": {TempPath: filepath.Join(tempDir, "f1"), MediaType: ggufLayerMediaType},
				"sha256:def456": {TempPath: filepath.Join(tempDir, "f2"), MediaType: "application/octet-stream"},
			},
			patterns: []string{ggufLayerMediaType, "application/octet-stream"},
			expected: 2,
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			resolver := NewContainerImageModel(tempDir, test.layerFiles)
			locations, err := resolver.FilesByMediaType(test.patterns...)
			require.NoError(t, err)
			assert.Len(t, locations, test.expected)
		})
	}
}
// TestOCIModelResolver_FileContentsByLocation verifies that content lookup by
// a location's FileSystemID (the layer digest) returns the backing temp file
// contents, and that an unknown digest produces an error.
func TestOCIModelResolver_FileContentsByLocation(t *testing.T) {
	tempDir := t.TempDir()
	content := []byte("test gguf content")
	tempFile := filepath.Join(tempDir, "test.gguf")
	require.NoError(t, os.WriteFile(tempFile, content, 0600))
	digest := "sha256:abc123"
	layerFiles := map[string]LayerInfo{
		digest: {TempPath: tempFile, MediaType: ggufLayerMediaType},
	}
	resolver := NewContainerImageModel(tempDir, layerFiles)
	tests := []struct {
		name      string
		digest    string
		wantErr   bool
		wantData  []byte
		errSubstr string // expected substring of the error message when wantErr
	}{
		{
			name:     "valid location returns content",
			digest:   digest,
			wantErr:  false,
			wantData: content,
		},
		{
			name:      "invalid digest returns error",
			digest:    "sha256:invalid",
			wantErr:   true,
			errSubstr: "no file found for digest",
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			// build a location the same way the resolver does: digest as FileSystemID
			loc := file.NewVirtualLocationFromCoordinates(
				file.NewCoordinates("/", test.digest),
				"/",
			)
			reader, err := resolver.FileContentsByLocation(loc)
			if test.wantErr {
				require.Error(t, err)
				assert.Contains(t, err.Error(), test.errSubstr)
				return
			}
			require.NoError(t, err)
			defer reader.Close()
			data, err := io.ReadAll(reader)
			require.NoError(t, err)
			assert.Equal(t, test.wantData, data)
		})
	}
}

View File

@ -9,8 +9,17 @@ import (
"github.com/anchore/syft/syft/pkg/cataloger/generic" "github.com/anchore/syft/syft/pkg/cataloger/generic"
) )
const (
catalogerName = "gguf-cataloger"
ggufLayerMediaType = "application/vnd.docker.ai*"
)
// NewGGUFCataloger returns a new cataloger instance for GGUF model files. // NewGGUFCataloger returns a new cataloger instance for GGUF model files.
// It supports both traditional file-based discovery and OCI layer-aware discovery
// when the source for the SBOM is the oci model source
func NewGGUFCataloger() pkg.Cataloger { func NewGGUFCataloger() pkg.Cataloger {
return generic.NewCataloger("gguf-cataloger"). return generic.NewCataloger(catalogerName).
WithParserByGlobs(parseGGUFModel, "**/*.gguf") WithParserByGlobs(parseGGUFModel, "**/*.gguf").
WithParserByMediaType(parseGGUFModel, ggufLayerMediaType).
WithProcessors(ggufMergeProcessor)
} }

View File

@ -122,6 +122,10 @@ func extractVersion(kvs gguf_parser.GGUFMetadataKVs) string {
// extractModelNameFromPath extracts the model name from the file path // extractModelNameFromPath extracts the model name from the file path
func extractModelNameFromPath(path string) string { func extractModelNameFromPath(path string) string {
// we do not want to return a name from filepath if it's not a distinct gguf file
if !strings.Contains(path, ".gguf") {
return ""
}
// Get the base filename // Get the base filename
base := filepath.Base(path) base := filepath.Base(path)

View File

@ -0,0 +1,59 @@
package ai
import (
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/pkg"
)
// ggufMergeProcessor collapses GGUF packages discovered across OCI layers into
// the named model package(s). Layers without a model name only contribute
// metadata: when exactly one named package exists, their headers are attached
// to it via GGUFFileHeader.Parts. Nameless packages are never returned on
// their own, and when multiple named packages exist the nameless headers are
// dropped, since layer order is not guaranteed to match the original image and
// they cannot be attributed to a specific model.
func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
	if err != nil || len(pkgs) == 0 {
		return pkgs, rels, err
	}

	var named []pkg.Package
	var orphanHeaders []pkg.GGUFFileHeader
	for _, p := range pkgs {
		if p.Name != "" {
			named = append(named, p)
			continue
		}
		header, ok := p.Metadata.(pkg.GGUFFileHeader)
		if !ok {
			continue
		}
		// nameless parts should not carry their own key-value hash
		header.MetadataKeyValuesHash = ""
		orphanHeaders = append(orphanHeaders, header)
	}

	switch {
	case len(named) == 0:
		// nothing identifiable to report
		return nil, rels, err
	case len(named) == 1 && len(orphanHeaders) > 0:
		if header, ok := named[0].Metadata.(pkg.GGUFFileHeader); ok {
			header.Parts = orphanHeaders
			named[0].Metadata = header
		}
	}
	return named, rels, err
}

View File

@ -0,0 +1,63 @@
package ai
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/anchore/syft/syft/pkg"
)
// Test_ggufMergeProcessor covers the three merge outcomes: a single named
// package absorbing nameless headers into Parts, multiple named packages
// passing through unmerged, and all-nameless input producing no packages.
func Test_ggufMergeProcessor(t *testing.T) {
	tests := []struct {
		name              string
		pkgs              []pkg.Package
		wantPkgCount      int
		wantFilePartCount int // expected len(Parts) on the surviving package
	}{
		{
			name: "single named package merges nameless headers",
			pkgs: []pkg.Package{
				{Name: "model", Metadata: pkg.GGUFFileHeader{MetadataKeyValuesHash: "abc"}},
				{Name: "", Metadata: pkg.GGUFFileHeader{MetadataKeyValuesHash: "part1"}},
				{Name: "", Metadata: pkg.GGUFFileHeader{MetadataKeyValuesHash: "part2"}},
			},
			wantPkgCount:      1,
			wantFilePartCount: 2,
		},
		{
			name: "multiple named packages returns all without merging",
			pkgs: []pkg.Package{
				{Name: "model1", Metadata: pkg.GGUFFileHeader{}},
				{Name: "model2", Metadata: pkg.GGUFFileHeader{}},
				{Name: "", Metadata: pkg.GGUFFileHeader{}},
			},
			wantPkgCount:      2,
			wantFilePartCount: 0,
		},
		{
			name: "no named packages returns empty result",
			pkgs: []pkg.Package{
				{Name: "", Metadata: pkg.GGUFFileHeader{}},
				{Name: "", Metadata: pkg.GGUFFileHeader{}},
			},
			wantPkgCount:      0,
			wantFilePartCount: 0,
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			got, _, err := ggufMergeProcessor(test.pkgs, nil, nil)
			require.NoError(t, err)
			assert.Len(t, got, test.wantPkgCount)
			if test.wantPkgCount == 1 && test.wantFilePartCount > 0 {
				header, ok := got[0].Metadata.(pkg.GGUFFileHeader)
				require.True(t, ok)
				assert.Len(t, header.Parts, test.wantFilePartCount)
			}
		})
	}
}

View File

@ -2164,6 +2164,11 @@ func (p *panicyResolver) FilesByMIMEType(_ ...string) ([]file.Location, error) {
return nil, errors.New("not implemented") return nil, errors.New("not implemented")
} }
// FilesByMediaType records that a search was attempted and then fails,
// mirroring the behavior of the other panicyResolver search methods.
func (p *panicyResolver) FilesByMediaType(_ ...string) ([]file.Location, error) {
	p.searchCalled = true
	return nil, errors.New("not implemented")
}
func (p *panicyResolver) RelativeFileByPath(_ file.Location, _ string) *file.Location { func (p *panicyResolver) RelativeFileByPath(_ file.Location, _ string) *file.Location {
return nil return nil
} }

View File

@ -114,6 +114,26 @@ func (c *Cataloger) WithParserByPath(parser Parser, paths ...string) *Cataloger
return c return c
} }
// WithParserByMediaType registers a parser that is invoked for files selected by
// OCI layer media type (glob patterns supported, e.g. "application/vnd.docker.ai*").
// The request generator is a no-op when the resolver does not implement
// file.OCIMediaTypeResolver (i.e. for non-OCI-model sources).
func (c *Cataloger) WithParserByMediaType(parser Parser, types ...string) *Cataloger {
	c.requesters = append(c.requesters,
		func(resolver file.Resolver, _ Environment) []request {
			// only log the search once we know this resolver supports media types;
			// most resolvers do not, and this path should be silent for them
			ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
			if !ok {
				return nil
			}
			log.WithFields("mediatypes", types).Trace("searching content matching mediatypes")
			matches, err := ociResolver.FilesByMediaType(types...)
			if err != nil {
				// do not fail the catalog run, but surface the search failure
				// instead of silently returning no requests
				log.WithFields("error", err, "mediatypes", types).Warn("unable to resolve files by media type")
				return nil
			}
			return makeRequests(parser, matches)
		},
	)
	return c
}
func (c *Cataloger) WithProcessors(processors ...Processor) *Cataloger { func (c *Cataloger) WithProcessors(processors ...Processor) *Cataloger {
for _, p := range processors { for _, p := range processors {
c.processors = append(c.processors, processorWrapper{Processor: p}) c.processors = append(c.processors, processorWrapper{Processor: p})

View File

@ -138,6 +138,10 @@ func (m spyReturningFileResolver) FilesByMIMEType(types ...string) ([]file.Locat
return m.m.FilesByMIMEType(types...) return m.m.FilesByMIMEType(types...)
} }
// FilesByMediaType delegates to the wrapped mock resolver.
func (m spyReturningFileResolver) FilesByMediaType(types ...string) ([]file.Location, error) {
	return m.m.FilesByMediaType(types...)
}
func (m spyReturningFileResolver) RelativeFileByPath(f file.Location, path string) *file.Location { func (m spyReturningFileResolver) RelativeFileByPath(f file.Location, path string) *file.Location {
return m.m.RelativeFileByPath(f, path) return m.m.RelativeFileByPath(f, path)
} }
@ -189,6 +193,55 @@ func TestClosesFileOnParserPanic(t *testing.T) {
require.True(t, spy.closed) require.True(t, spy.closed)
} }
// Test_CatalogerWithParserByMediaType verifies the end-to-end media-type
// selection path: a resolver that maps media types to locations should have
// all matching files routed through the registered parser, with resulting
// packages attributed to the cataloger via FoundBy.
func Test_CatalogerWithParserByMediaType(t *testing.T) {
	allParsedPaths := make(map[string]bool)
	parser := func(_ context.Context, resolver file.Resolver, env *Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
		allParsedPaths[reader.Path()] = true
		contents, err := io.ReadAll(reader)
		require.NoError(t, err)
		if len(contents) == 0 {
			return nil, nil, nil
		}
		// the package name echoes the file contents so assertions can tie
		// packages back to fixtures
		p := pkg.Package{
			Name:      string(contents),
			Locations: file.NewLocationSet(reader.Location),
		}
		return []pkg.Package{p}, nil, nil
	}
	upstream := "media-type-cataloger"
	// Create locations with test fixtures that exist on disk
	loc1 := file.NewLocation("test-fixtures/a-path.txt")
	loc2 := file.NewLocation("test-fixtures/another-path.txt")
	// Create a mock resolver that maps media types to locations
	resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{
		"application/vnd.test.model": {loc1, loc2},
	})
	cataloger := NewCataloger(upstream).
		WithParserByMediaType(parser, "application/vnd.test.model")
	actualPkgs, _, err := cataloger.Catalog(context.Background(), resolver)
	assert.NoError(t, err)
	// Verify both files were parsed
	assert.True(t, allParsedPaths["test-fixtures/a-path.txt"], "expected a-path.txt to be parsed")
	assert.True(t, allParsedPaths["test-fixtures/another-path.txt"], "expected another-path.txt to be parsed")
	// Verify packages were created
	assert.Len(t, actualPkgs, 2)
	// Verify FoundBy is set correctly
	for _, p := range actualPkgs {
		assert.Equal(t, upstream, p.FoundBy)
	}
}
func Test_genericCatalogerReturnsErrors(t *testing.T) { func Test_genericCatalogerReturnsErrors(t *testing.T) {
genericErrorReturning := NewCataloger("error returning").WithParserByGlobs(func(ctx context.Context, resolver file.Resolver, environment *Environment, locationReader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { genericErrorReturning := NewCataloger("error returning").WithParserByGlobs(func(ctx context.Context, resolver file.Resolver, environment *Environment, locationReader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
return []pkg.Package{ return []pkg.Package{

View File

@ -208,6 +208,11 @@ func (r *ObservingResolver) FilesByMIMEType(types ...string) ([]file.Location, e
return locs, err return locs, err
} }
// FilesByMediaType returns files matching the given media types.
// NOTE(review): unlike the other search methods on ObservingResolver, this stub
// neither records the call nor consults canned responses — confirm that
// observation is not needed for media-type queries before asserting on it.
func (r *ObservingResolver) FilesByMediaType(_ ...string) ([]file.Location, error) {
	return nil, nil
}
// RelativeFileByPath returns a file at a path relative to the given location. // RelativeFileByPath returns a file at a path relative to the given location.
func (r *ObservingResolver) RelativeFileByPath(location file.Location, path string) *file.Location { func (r *ObservingResolver) RelativeFileByPath(location file.Location, path string) *file.Location {
const methodName = "RelativeFileByPath" const methodName = "RelativeFileByPath"

View File

@ -80,6 +80,10 @@ func (r *rpmdbTestFileResolverMock) FilesByMIMEType(...string) ([]file.Location,
return nil, fmt.Errorf("not implemented") return nil, fmt.Errorf("not implemented")
} }
// FilesByMediaType satisfies the resolver surface for tests; rpmdb parsing
// never queries by media type, so this is intentionally unimplemented.
func (r *rpmdbTestFileResolverMock) FilesByMediaType(...string) ([]file.Location, error) {
	return nil, fmt.Errorf("not implemented")
}
func TestParseRpmDB(t *testing.T) { func TestParseRpmDB(t *testing.T) {
ctx := context.TODO() ctx := context.TODO()
packagesLocation := file.NewLocation("test-fixtures/Packages") packagesLocation := file.NewLocation("test-fixtures/Packages")

View File

@ -34,4 +34,8 @@ type GGUFFileHeader struct {
// across different file locations or remotes. It allows matching identical models even // across different file locations or remotes. It allows matching identical models even
// when stored in different repositories or with different filenames. // when stored in different repositories or with different filenames.
MetadataKeyValuesHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"` MetadataKeyValuesHash string `json:"metadataHash,omitempty" cyclonedx:"metadataHash"`
// Parts contains headers from additional GGUF files that were merged
// into this package during post-processing (e.g., from OCI layers without model names).
Parts []GGUFFileHeader `json:"parts,omitempty" cyclonedx:"parts"`
} }

View File

@ -17,6 +17,7 @@ type ImageMetadata struct {
Variant string `json:"architectureVariant,omitempty"` Variant string `json:"architectureVariant,omitempty"`
OS string `json:"os"` OS string `json:"os"`
Labels map[string]string `json:"labels,omitempty"` Labels map[string]string `json:"labels,omitempty"`
Annotations map[string]string `json:"annotations,omitempty" id:"-"` // critical: do not consider annotations as an identifiable part of the source image
} }
// LayerMetadata represents all static metadata that defines what a container image layer is. // LayerMetadata represents all static metadata that defines what a container image layer is.

View File

@ -0,0 +1,66 @@
package internal
import (
"fmt"
"github.com/opencontainers/go-digest"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/source"
)
// DeriveImageID derives an artifact ID from the given image metadata. The order of data precedence is:
//  1. prefer a digest of the raw container image manifest
//  2. if no manifest digest is available, calculate a chain ID from the image layer metadata
//  3. if no layer metadata is available, use the user input string
//
// In all cases, if an alias is provided it is additionally folded into the ID
// calculation, so the same image scanned under different aliases is considered
// logically different.
func DeriveImageID(alias source.Alias, metadata source.ImageMetadata) artifact.ID {
	var seed string
	if len(metadata.RawManifest) > 0 {
		seed = digest.Canonical.FromBytes(metadata.RawManifest).String()
	} else {
		// fall back to a chain ID when no raw manifest is available
		// https://github.com/opencontainers/image-spec/blob/main/config.md#layer-chainid
		seed = calculateChainID(metadata.Layers)
		if seed == "" {
			// TODO what happens here if image has no layers? is this case possible?
			seed = digest.Canonical.FromString(metadata.UserInput).String()
		}
	}

	if !alias.IsEmpty() {
		// fold the alias into the ID so logically distinct scans of the same image differ
		seed = digest.Canonical.FromString(seed + fmt.Sprintf(":%s@%s", alias.Name, alias.Version)).String()
	}
	return ArtifactIDFromDigest(seed)
}
// calculateChainID computes a chain ID over the given layer metadata, or ""
// when there are no layers.
// https://github.com/opencontainers/image-spec/blob/main/config.md#layer-chainid
func calculateChainID(lm []source.LayerMetadata) string {
	if len(lm) == 0 {
		return ""
	}
	// DiffID(L0) = digest of layer 0
	// https://github.com/anchore/stereoscope/blob/1b1b744a919964f38d14e1416fb3f25221b761ce/pkg/image/layer_metadata.go#L19-L32
	return chain(lm[0].Digest, lm[1:])
}

// chain folds each remaining layer digest into the running chain ID.
// NOTE(review): the "<layer digest> <chain id>" operand order is pinned by the
// cross-source snapshot tests for ID stability; do not reorder it.
func chain(chainID string, layers []source.LayerMetadata) string {
	for _, layer := range layers {
		chainID = digest.Canonical.FromString(layer.Digest + " " + chainID).String()
	}
	return chainID
}

View File

@ -0,0 +1,278 @@
package internal
import (
"crypto/sha256"
"fmt"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/source"
)
// TestDeriveImageID exercises the documented precedence order
// (raw manifest > chain ID > user input) and the effect of an alias on the
// derived ID, for both the highest- and lowest-precedence inputs.
func TestDeriveImageID(t *testing.T) {
	tests := []struct {
		name     string
		alias    source.Alias
		metadata source.ImageMetadata
		want     artifact.ID
	}{
		{
			name: "use raw manifest over chain ID or user input",
			metadata: source.ImageMetadata{
				UserInput: "user-input",
				Layers: []source.LayerMetadata{
					{
						Digest: "a",
					},
					{
						Digest: "b",
					},
					{
						Digest: "c",
					},
				},
				RawManifest: []byte("raw-manifest"),
			},
			// expected ID is the sha256 of the raw manifest bytes
			want: func() artifact.ID {
				hasher := sha256.New()
				hasher.Write([]byte("raw-manifest"))
				return artifact.ID(fmt.Sprintf("%x", hasher.Sum(nil)))
			}(),
		},
		{
			name: "use chain ID over user input",
			metadata: source.ImageMetadata{
				Layers: []source.LayerMetadata{
					{
						Digest: "a",
					},
					{
						Digest: "b",
					},
					{
						Digest: "c",
					},
				},
			},
			// expected ID is the layer chain ID with the algorithm prefix stripped
			want: func() artifact.ID {
				metadata := []source.LayerMetadata{
					{
						Digest: "a",
					},
					{
						Digest: "b",
					},
					{
						Digest: "c",
					},
				}
				return artifact.ID(strings.TrimPrefix(calculateChainID(metadata), "sha256:"))
			}(),
		},
		{
			name: "use user input last",
			metadata: source.ImageMetadata{
				UserInput: "user-input",
			},
			want: func() artifact.ID {
				hasher := sha256.New()
				hasher.Write([]byte("user-input"))
				return artifact.ID(fmt.Sprintf("%x", hasher.Sum(nil)))
			}(),
		},
		{
			name: "without alias (first)",
			metadata: source.ImageMetadata{
				UserInput: "user-input",
				Layers: []source.LayerMetadata{
					{
						Digest: "a",
					},
					{
						Digest: "b",
					},
					{
						Digest: "c",
					},
				},
				RawManifest: []byte("raw-manifest"),
			},
			want: "85298926ecd92ed57688f13039017160cd728f04dd0d2d10a10629007106f107",
		},
		{
			name: "always consider alias (first)",
			alias: source.Alias{
				Name:    "alias",
				Version: "version",
			},
			metadata: source.ImageMetadata{
				UserInput: "user-input",
				Layers: []source.LayerMetadata{
					{
						Digest: "a",
					},
					{
						Digest: "b",
					},
					{
						Digest: "c",
					},
				},
				RawManifest: []byte("raw-manifest"),
			},
			want: "a8717e42449960c1dd4963f2f22bd69c7c105e7e82445be0a65aa1825d62ff0d",
		},
		{
			name: "without alias (last)",
			metadata: source.ImageMetadata{
				UserInput: "user-input",
			},
			want: "ab0dff627d80b9753193d7280bec8f45e8ec6b4cb0912c6fffcf7cd782d9739e",
		},
		{
			name: "always consider alias (last)",
			alias: source.Alias{
				Name:    "alias",
				Version: "version",
			},
			metadata: source.ImageMetadata{
				UserInput: "user-input",
			},
			want: "fe86c0eecd5654d3c0c0b2176aa394aef6440347c241aa8d9b628dfdde4287cf",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			assert.Equal(t, tt.want, DeriveImageID(tt.alias, tt.metadata))
		})
	}
}
// TestDeriveImageID_CrossSourceConsistency ensures the same metadata produces
// identical IDs regardless of whether the source is stereoscope-based or OCI
// model-based — both source types use DeriveImageID with ImageMetadata. It
// captures known-good snapshot IDs that must remain stable across refactors.
//
// IMPORTANT: If any of these tests fail after a refactor, it means the artifact ID generation
// has changed and will break consistency between stereoscope images and OCI model sources.
func TestDeriveImageID_CrossSourceConsistency(t *testing.T) {
	tests := []struct {
		name     string
		alias    source.Alias
		metadata source.ImageMetadata
		wantID   artifact.ID
	}{
		{
			name: "raw manifest with layers - typical container image",
			metadata: source.ImageMetadata{
				UserInput:      "docker.io/library/alpine:latest",
				ManifestDigest: "sha256:abc123",
				Layers: []source.LayerMetadata{
					{Digest: "sha256:layer1", MediaType: "application/vnd.oci.image.layer.v1.tar+gzip", Size: 1000},
					{Digest: "sha256:layer2", MediaType: "application/vnd.oci.image.layer.v1.tar+gzip", Size: 2000},
				},
				RawManifest: []byte(`{"schemaVersion":2,"mediaType":"application/vnd.oci.image.manifest.v1+json"}`),
			},
			// snapshot: this ID must remain stable for stereoscope/oci-model consistency
			wantID: "b22c7289dd3b4785a3795c90e15d16bd66bd29b444b8974fe29ed0443ce50405",
		},
		{
			name: "raw manifest only - minimal image",
			metadata: source.ImageMetadata{
				RawManifest: []byte(`{"schemaVersion":2}`),
			},
			// snapshot: this ID must remain stable
			wantID: "bafebd36189ad3688b7b3915ea55d461e0bfcfbdde11e54b0a123999fb6be50f",
		},
		{
			name: "chain ID fallback - no raw manifest",
			metadata: source.ImageMetadata{
				UserInput: "some-image",
				Layers: []source.LayerMetadata{
					{Digest: "sha256:aaa111"},
					{Digest: "sha256:bbb222"},
				},
			},
			// snapshot: chain ID calculation must remain stable
			wantID: "0ba9c8d271e6708871505d362e37267c5fb7910066c04d3115b89ba4d34aa180",
		},
		{
			name: "user input fallback - no manifest or layers",
			metadata: source.ImageMetadata{
				UserInput: "registry.example.com/org/model:v1.0",
			},
			// snapshot: user input hash must remain stable
			wantID: "a5a8733a3ba3eb99a8ebebcd40c4053f9b896ea6e2217ebc6e885573f20baccf",
		},
		{
			name: "with alias - same image different logical identity",
			alias: source.Alias{
				Name:    "my-custom-name",
				Version: "1.0.0",
			},
			metadata: source.ImageMetadata{
				RawManifest: []byte(`{"schemaVersion":2}`),
			},
			// snapshot: alias must affect ID deterministically
			wantID: "9eae41c0efc30023368c29089bac007f2c9d0b40a0ee034081a17c4c22f55ac6",
		},
		{
			name: "annotations has no effect on ID",
			metadata: source.ImageMetadata{
				UserInput: "registry.example.com/org/model:v1.0",
				Annotations: map[string]string{
					"annotation1": "value1",
				},
			},
			// snapshot: user input hash must remain stable
			wantID: "a5a8733a3ba3eb99a8ebebcd40c4053f9b896ea6e2217ebc6e885573f20baccf",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := DeriveImageID(tt.alias, tt.metadata)
			assert.Equal(t, tt.wantID, got, "ID must remain stable for cross-source consistency")
		})
	}
}
// TestCalculateChainID verifies the chain ID computation for the empty,
// single-layer, and multi-layer cases, pinning the multi-layer value as a
// snapshot.
func TestCalculateChainID(t *testing.T) {
	tests := []struct {
		name   string
		layers []source.LayerMetadata
		want   string
	}{
		{
			name:   "empty layers returns empty string",
			layers: []source.LayerMetadata{},
			want:   "",
		},
		{
			name: "single layer returns digest",
			layers: []source.LayerMetadata{
				{Digest: "sha256:abc123"},
			},
			want: "sha256:abc123",
		},
		{
			name: "multiple layers calculates chain ID",
			layers: []source.LayerMetadata{
				{Digest: "a"},
				{Digest: "b"},
				{Digest: "c"},
			},
			// snapshot - this value should not change
			want: "sha256:1dfe230e220ef0e6bc0a8978d23d72b95769e76a62879a5f49267d8c007ab43d",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			assert.Equal(t, tt.want, calculateChainID(tt.layers))
		})
	}
}

View File

@ -0,0 +1,4 @@
package source
// OCIModelMetadata is an AI model from an OCI registry, which is a specialized form of ImageMetadata.
// It is a distinct named type (not an alias) with an identical field layout, so
// values convert directly to/from ImageMetadata where shared logic needs them.
type OCIModelMetadata ImageMetadata

View File

@ -0,0 +1,245 @@
package ocimodelsource
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"github.com/google/go-containerregistry/pkg/name"
v1 "github.com/google/go-containerregistry/pkg/v1"
"github.com/anchore/stereoscope/pkg/image"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/internal/fileresolver"
"github.com/anchore/syft/syft/source"
"github.com/anchore/syft/syft/source/internal"
)
// compile-time check that ociModelSource satisfies source.Source
var _ source.Source = (*ociModelSource)(nil)

// Config holds the input configuration for an OCI model artifact source.
type Config struct {
	// Reference is the user-supplied OCI reference for the model artifact.
	Reference string
	// RegistryOptions carries registry connection/credential settings.
	RegistryOptions *image.RegistryOptions
	// Alias optionally overrides the name/version reported for this source.
	Alias source.Alias
}

// ociModelSource implements the source.Source interface for OCI model artifacts.
type ociModelSource struct {
	id        artifact.ID // derived from the image metadata (and alias, if any)
	reference string      // original user-supplied reference
	alias     source.Alias
	metadata  source.OCIModelMetadata
	tempDir   string // holds fetched GGUF layer headers; removed on Close
	// resolver serves layer contents from tempDir and supports media-type search
	resolver interface {
		file.Resolver
		file.OCIMediaTypeResolver
	}
	mutex *sync.Mutex // guards tempDir/resolver so Close is safe to call multiple times
}
// NewFromRegistry creates a new OCI model source by fetching the model artifact
// from a registry. GGUF layer headers are downloaded into a temp directory that
// is cleaned up when the returned source is closed.
func NewFromRegistry(ctx context.Context, cfg Config) (source.Source, error) {
	client := newRegistryClient(cfg.RegistryOptions)

	art, err := validateAndFetchArtifact(ctx, client, cfg.Reference)
	if err != nil {
		return nil, err
	}

	tempDir, resolver, err := fetchAndStoreGGUFHeaders(ctx, client, art)
	if err != nil {
		return nil, err
	}

	metadata := buildMetadata(art)
	return &ociModelSource{
		id:        internal.DeriveImageID(cfg.Alias, source.ImageMetadata(metadata)),
		reference: cfg.Reference,
		alias:     cfg.Alias,
		metadata:  metadata,
		tempDir:   tempDir,
		resolver:  resolver,
		mutex:     &sync.Mutex{},
	}, nil
}
// validateAndFetchArtifact fetches and validates a model artifact in a single
// registry call. The returned artifact is guaranteed to have at least one GGUF layer.
func validateAndFetchArtifact(ctx context.Context, client *registryClient, reference string) (*modelArtifact, error) {
	art, err := client.fetchModelArtifact(ctx, reference)
	if err != nil {
		// errNotModelArtifact is wrapped, so callers can use errors.Is() to check
		return nil, err
	}
	if len(art.GGUFLayers) > 0 {
		return art, nil
	}
	return nil, fmt.Errorf("model artifact has no GGUF layers")
}
// fetchAndStoreGGUFHeaders fetches GGUF layer headers and stores them in temp files.
// On any fetch failure the temp directory is removed (best effort) before the
// fetch error is returned; on success the caller owns the returned directory.
func fetchAndStoreGGUFHeaders(ctx context.Context, client *registryClient, art *modelArtifact) (string, *fileresolver.ContainerImageModel, error) {
	tempDir, err := os.MkdirTemp("", "syft-oci-gguf")
	if err != nil {
		return "", nil, fmt.Errorf("failed to create temp directory: %w", err)
	}

	layerFiles := make(map[string]fileresolver.LayerInfo, len(art.GGUFLayers))
	for _, layer := range art.GGUFLayers {
		li, err := fetchSingleGGUFHeader(ctx, client, art.Reference, layer, tempDir)
		if err != nil {
			// bug fix: the log message describes the RemoveAll failure, so it must
			// report osErr — the original logged the fetch error (err) instead
			if osErr := os.RemoveAll(tempDir); osErr != nil {
				log.Errorf("unable to remove temp directory (%s): %v", tempDir, osErr)
			}
			return "", nil, err
		}
		layerFiles[layer.Digest.String()] = li
	}

	return tempDir, fileresolver.NewContainerImageModel(tempDir, layerFiles), nil
}
// fetchSingleGGUFHeader fetches the first maxHeaderBytes of a single GGUF layer
// blob and writes them to a temp file, returning the resulting layer info.
func fetchSingleGGUFHeader(ctx context.Context, client *registryClient, ref name.Reference, layer v1.Descriptor, tempDir string) (fileresolver.LayerInfo, error) {
	headerData, err := client.fetchBlobRange(ctx, ref, layer.Digest, maxHeaderBytes)
	if err != nil {
		return fileresolver.LayerInfo{}, fmt.Errorf("failed to fetch GGUF layer header: %w", err)
	}

	// digests contain ":" which is not safe in file names on all platforms
	fileName := strings.ReplaceAll(layer.Digest.String(), ":", "-") + ".gguf"
	destination := filepath.Join(tempDir, fileName)
	if err := os.WriteFile(destination, headerData, 0600); err != nil {
		return fileresolver.LayerInfo{}, fmt.Errorf("failed to write temp file: %w", err)
	}

	return fileresolver.LayerInfo{
		TempPath:  destination,
		MediaType: string(layer.MediaType),
	}, nil
}
// buildMetadata constructs OCIModelMetadata from a fetched modelArtifact,
// translating manifest layers, reference tag, repo digest, and config fields.
func buildMetadata(art *modelArtifact) source.OCIModelMetadata {
	layers := make([]source.LayerMetadata, 0, len(art.Manifest.Layers))
	for _, layer := range art.Manifest.Layers {
		layers = append(layers, source.LayerMetadata{
			MediaType: string(layer.MediaType),
			Digest:    layer.Digest.String(),
			Size:      layer.Size,
		})
	}

	// a tag is only known when the reference itself carries one
	var tags []string
	if tagged, ok := art.Reference.(interface{ TagStr() string }); ok {
		if tag := tagged.TagStr(); tag != "" {
			tags = append(tags, tag)
		}
	}

	var repoDigests []string
	if art.ManifestDigest != "" {
		repoDigests = append(repoDigests, art.Reference.Context().String()+"@"+art.ManifestDigest)
	}

	return source.OCIModelMetadata{
		UserInput:      art.Reference.String(),
		ID:             art.ManifestDigest,
		ManifestDigest: art.ManifestDigest,
		MediaType:      string(art.Manifest.MediaType),
		Tags:           tags,
		Size:           calculateTotalSize(layers),
		Layers:         layers,
		RawManifest:    art.RawManifest,
		RawConfig:      art.RawConfig,
		RepoDigests:    repoDigests,
		Architecture:   art.Config.Architecture,
		Variant:        art.Config.Variant,
		OS:             art.Config.OS,
		Labels:         art.Config.Config.Labels,
		Annotations:    extractManifestAnnotations(art.Manifest),
	}
}
// extractManifestAnnotations returns the manifest's annotations, or an empty
// (non-nil) map when the manifest is missing or has no annotations.
func extractManifestAnnotations(manifest *v1.Manifest) map[string]string {
	if manifest != nil && manifest.Annotations != nil {
		return manifest.Annotations
	}
	return make(map[string]string)
}

// calculateTotalSize sums the sizes of all given layers.
func calculateTotalSize(layers []source.LayerMetadata) int64 {
	var total int64
	for i := range layers {
		total += layers[i].Size
	}
	return total
}
// ID returns the artifact ID.
func (s *ociModelSource) ID() artifact.ID {
	return s.id
}

// Describe returns a description of the source, preferring any user-provided
// alias values over the raw reference.
func (s *ociModelSource) Describe() source.Description {
	// use a local that does not shadow the imported "name" package
	displayName := s.reference
	var version, supplier string
	if !s.alias.IsEmpty() {
		if s.alias.Name != "" {
			displayName = s.alias.Name
		}
		version = s.alias.Version
		supplier = s.alias.Supplier
	}
	return source.Description{
		ID:       string(s.id),
		Name:     displayName,
		Version:  version,
		Supplier: supplier,
		Metadata: s.metadata,
	}
}

// FileResolver returns a file resolver for accessing header of GGUF files.
func (s *ociModelSource) FileResolver(_ source.Scope) (file.Resolver, error) {
	return s.resolver, nil
}
// Close removes the temp directory holding fetched layer headers. It is
// idempotent: subsequent calls are no-ops.
func (s *ociModelSource) Close() error {
	s.mutex.Lock()
	defer s.mutex.Unlock()

	dir := s.tempDir
	if dir == "" {
		// already closed
		return nil
	}
	s.tempDir = ""
	s.resolver = nil
	return os.RemoveAll(dir)
}

View File

@ -0,0 +1,36 @@
package ocimodelsource
import (
"context"
"github.com/anchore/stereoscope/pkg/image"
"github.com/anchore/syft/syft/source"
)
type ociModelSourceProvider struct {
reference string
registryOpts *image.RegistryOptions
alias source.Alias
}
// NewSourceProvider creates a new OCI model artifact source provider.
func NewSourceProvider(reference string, registryOpts *image.RegistryOptions, alias source.Alias) source.Provider {
	provider := ociModelSourceProvider{
		reference:    reference,
		registryOpts: registryOpts,
		alias:        alias,
	}
	return &provider
}
// Name returns the constant provider name, "oci-model".
func (p *ociModelSourceProvider) Name() string {
	return "oci-model"
}
// Provide resolves the configured reference into a source by fetching the
// model artifact from its registry.
func (p *ociModelSourceProvider) Provide(ctx context.Context) (source.Source, error) {
	return NewFromRegistry(ctx, Config{
		Reference:       p.reference,
		RegistryOptions: p.registryOpts,
		Alias:           p.alias,
	})
}

View File

@ -0,0 +1,217 @@
package ocimodelsource
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
"github.com/google/go-containerregistry/pkg/authn"
"github.com/google/go-containerregistry/pkg/name"
v1 "github.com/google/go-containerregistry/pkg/v1"
"github.com/google/go-containerregistry/pkg/v1/remote"
"github.com/anchore/stereoscope/pkg/image"
)
// errNotModelArtifact is returned when a reference does not point to a model artifact.
var errNotModelArtifact = errors.New("not an OCI model artifact")

const (
	// Model artifact media types as per Docker's OCI artifacts for AI model packaging
	// Reference: https://www.docker.com/blog/oci-artifacts-for-ai-model-packaging/
	// The config prefix is matched as a prefix (see isModelArtifact) so any
	// version suffix (e.g. "v1+json") is accepted.
	modelConfigMediaTypePrefix = "application/vnd.docker.ai.model.config."
	ggufLayerMediaType         = "application/vnd.docker.ai.gguf.v3"

	// Maximum bytes to read/return for GGUF headers
	maxHeaderBytes = 8 * 1024 * 1024 // 8 MB
)
// registryClient handles OCI registry interactions for model artifacts.
type registryClient struct {
	options []remote.Option // base remote options (auth/transport); shared by all fetches
}
// newRegistryClient creates a new registry client with authentication from RegistryOptions.
func newRegistryClient(registryOpts *image.RegistryOptions) *registryClient {
	return &registryClient{
		options: buildRemoteOptions(registryOpts),
	}
}
// buildRemoteOptions converts stereoscope RegistryOptions to go-containerregistry remote.Options.
//
// A nil input yields no options (anonymous access over the default transport).
func buildRemoteOptions(registryOpts *image.RegistryOptions) []remote.Option {
	var opts []remote.Option
	if registryOpts == nil {
		return opts
	}

	// Build authenticator
	opts = append(opts, remote.WithAuth(buildAuthenticator(registryOpts)))

	// Handle transport settings. The branches must be mutually exclusive:
	// only the last remote.WithTransport option takes effect, so previously
	// setting both InsecureSkipTLSVerify and InsecureUseHTTP caused the
	// plain-HTTP transport to silently discard the TLS-skipping config.
	switch {
	case registryOpts.InsecureSkipTLSVerify:
		if transport, ok := remote.DefaultTransport.(*http.Transport); ok {
			// clone so the process-wide default transport is not mutated
			transport = transport.Clone()
			if transport.TLSClientConfig == nil {
				transport.TLSClientConfig = &tls.Config{
					MinVersion: tls.VersionTLS12,
				}
			}
			transport.TLSClientConfig.InsecureSkipVerify = true //#nosec G402 -- user explicitly requested insecure TLS
			opts = append(opts, remote.WithTransport(transport))
		}
	case registryOpts.InsecureUseHTTP:
		opts = append(opts, remote.WithTransport(http.DefaultTransport))
	}

	return opts
}
// buildAuthenticator creates an authn.Authenticator from RegistryOptions.
// Token credentials take precedence over username/password; with no usable
// credentials the anonymous authenticator is returned.
func buildAuthenticator(registryOpts *image.RegistryOptions) authn.Authenticator {
	if len(registryOpts.Credentials) == 0 {
		return authn.Anonymous
	}

	// Use the first credential set (we could enhance this to match by authority)
	cred := registryOpts.Credentials[0]
	switch {
	case cred.Token != "":
		return &authn.Bearer{Token: cred.Token}
	case cred.Username != "" || cred.Password != "":
		return &authn.Basic{
			Username: cred.Username,
			Password: cred.Password,
		}
	}

	// Fall back to anonymous authenticator
	return authn.Anonymous
}
// modelArtifact represents a parsed OCI model artifact.
type modelArtifact struct {
	Reference      name.Reference  // the parsed registry reference the artifact was fetched from
	Manifest       *v1.Manifest    // decoded OCI manifest
	Config         *v1.ConfigFile  // decoded image config
	RawManifest    []byte          // raw manifest bytes as served by the registry
	RawConfig      []byte          // raw config bytes as served by the registry
	ManifestDigest string          // digest of the manifest (e.g. "sha256:...")
	GGUFLayers     []v1.Descriptor // layers whose media type marks them as GGUF model data
}
// fetchModelArtifact fetches and validates an OCI model artifact for the given
// reference string, returning its parsed manifest, config, and GGUF layer
// descriptors. It returns errNotModelArtifact (wrapped) when the reference
// resolves to something other than a model artifact.
func (c *registryClient) fetchModelArtifact(ctx context.Context, refStr string) (*modelArtifact, error) {
	ref, err := name.ParseReference(refStr)
	if err != nil {
		return nil, fmt.Errorf("failed to parse reference %q: %w", refStr, err)
	}

	// copy the shared options before appending: appending directly to
	// c.options can write into its backing array when it has spare capacity,
	// corrupting options for concurrent or subsequent calls
	opts := make([]remote.Option, 0, len(c.options)+1)
	opts = append(opts, c.options...)
	opts = append(opts, remote.WithContext(ctx))

	desc, err := remote.Get(ref, opts...)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch descriptor: %w", err)
	}

	manifest := &v1.Manifest{}
	if err := json.Unmarshal(desc.Manifest, manifest); err != nil {
		return nil, fmt.Errorf("failed to unmarshal manifest: %w", err)
	}

	// reject anything that is not a model artifact before downloading more
	if !isModelArtifact(manifest) {
		return nil, fmt.Errorf("%w (config media type: %s)", errNotModelArtifact, manifest.Config.MediaType)
	}

	img, err := desc.Image()
	if err != nil {
		return nil, fmt.Errorf("failed to get image: %w", err)
	}

	configFile, err := img.ConfigFile()
	if err != nil {
		return nil, fmt.Errorf("failed to get config file: %w", err)
	}

	rawConfig, err := img.RawConfigFile()
	if err != nil {
		return nil, fmt.Errorf("failed to get raw config: %w", err)
	}

	ggufLayers := extractGGUFLayers(manifest)

	return &modelArtifact{
		Reference:      ref,
		Manifest:       manifest,
		Config:         configFile,
		RawManifest:    desc.Manifest,
		RawConfig:      rawConfig,
		ManifestDigest: desc.Digest.String(),
		GGUFLayers:     ggufLayers,
	}, nil
}
// isModelArtifact checks if the manifest represents a model artifact.
// The check is a prefix match on the config media type, so any versioned
// suffix (e.g. "v1+json") after the prefix is accepted.
func isModelArtifact(manifest *v1.Manifest) bool {
	return strings.HasPrefix(string(manifest.Config.MediaType), modelConfigMediaTypePrefix)
}
// extractGGUFLayers extracts GGUF layer descriptors from the manifest.
// Returns nil when the manifest has no GGUF layers.
func extractGGUFLayers(manifest *v1.Manifest) []v1.Descriptor {
	var matches []v1.Descriptor
	for _, desc := range manifest.Layers {
		if string(desc.MediaType) != ggufLayerMediaType {
			continue
		}
		matches = append(matches, desc)
	}
	return matches
}
// fetchBlobRange reads at most maxBytes from the start of the blob identified
// by digest in the repository of ref. It is used to fetch only the GGUF header
// portion of a (potentially multi-gigabyte) model layer.
func (c *registryClient) fetchBlobRange(ctx context.Context, ref name.Reference, digest v1.Hash, maxBytes int64) ([]byte, error) {
	repo := ref.Context()

	// copy the shared options before appending: appending directly to
	// c.options can write into its backing array when it has spare capacity,
	// corrupting options for concurrent or subsequent calls
	opts := make([]remote.Option, 0, len(c.options)+1)
	opts = append(opts, c.options...)
	opts = append(opts, remote.WithContext(ctx))

	layer, err := remote.Layer(repo.Digest(digest.String()), opts...)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch layer: %w", err)
	}

	reader, err := layer.Compressed()
	if err != nil {
		return nil, fmt.Errorf("failed to get layer reader: %w", err)
	}
	// this defer is what causes the download to stop
	// 1. io.ReadFull(reader, data) reads exactly maxBytes into the buffer
	// 2. The function returns with data[:n]
	// 3. defer reader.Close() executes, closing the HTTP response body
	// 4. Closing the response body closes the underlying TCP connection
	// 5. The server receives TCP FIN/RST and stops sending
	// note: some data is already in flight when we close so we will see > maxBytes over the wire
	// the full image will not download given we terminate the reader early here
	defer reader.Close()

	// Note: this is not some arbitrary number picked out of the blue.
	// This is based on the specification of header data found here:
	// https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#file-structure
	data := make([]byte, maxBytes)
	n, err := io.ReadFull(reader, data)
	if err != nil && err != io.ErrUnexpectedEOF {
		// ErrUnexpectedEOF is okay - it means the file is smaller than maxBytes
		return nil, fmt.Errorf("failed to read layer data: %w", err)
	}

	return data[:n], nil
}

View File

@ -0,0 +1,114 @@
package ocimodelsource
import (
"testing"
v1 "github.com/google/go-containerregistry/pkg/v1"
"github.com/google/go-containerregistry/pkg/v1/types"
"github.com/stretchr/testify/assert"
)
func TestIsModelArtifact(t *testing.T) {
tests := []struct {
name string
manifest *v1.Manifest
expected bool
}{
{
name: "valid model artifact",
manifest: &v1.Manifest{
Config: v1.Descriptor{
MediaType: modelConfigMediaTypePrefix + "v1+json",
},
},
expected: true,
},
{
name: "container image",
manifest: &v1.Manifest{
Config: v1.Descriptor{
MediaType: types.DockerConfigJSON,
},
},
expected: false,
},
{
name: "empty media type",
manifest: &v1.Manifest{
Config: v1.Descriptor{
MediaType: "",
},
},
expected: false,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
result := isModelArtifact(test.manifest)
assert.Equal(t, test.expected, result)
})
}
}
// TestExtractGGUFLayers verifies that only layers with the GGUF media type
// are selected from a manifest's layer list.
func TestExtractGGUFLayers(t *testing.T) {
	// gguf builds a GGUF layer descriptor with the given digest hex.
	gguf := func(hex string) v1.Descriptor {
		return v1.Descriptor{
			MediaType: types.MediaType(ggufLayerMediaType),
			Digest:    v1.Hash{Algorithm: "sha256", Hex: hex},
		}
	}
	// docker builds an ordinary (non-GGUF) layer descriptor.
	docker := func(hex string) v1.Descriptor {
		return v1.Descriptor{
			MediaType: types.DockerLayer,
			Digest:    v1.Hash{Algorithm: "sha256", Hex: hex},
		}
	}

	cases := []struct {
		name   string
		layers []v1.Descriptor
		want   int
	}{
		{name: "single GGUF layer", layers: []v1.Descriptor{gguf("abc")}, want: 1},
		{name: "multiple GGUF layers", layers: []v1.Descriptor{gguf("abc"), gguf("def")}, want: 2},
		{name: "mixed layers", layers: []v1.Descriptor{gguf("abc"), docker("def"), gguf("ghi")}, want: 2},
		{name: "no GGUF layers", layers: []v1.Descriptor{{MediaType: types.DockerLayer}}, want: 0},
		{name: "empty layers", layers: []v1.Descriptor{}, want: 0},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			result := extractGGUFLayers(&v1.Manifest{Layers: tc.layers})
			assert.Len(t, result, tc.want)
		})
	}
}

View File

@ -7,6 +7,7 @@ import (
"github.com/anchore/syft/syft/source" "github.com/anchore/syft/syft/source"
"github.com/anchore/syft/syft/source/directorysource" "github.com/anchore/syft/syft/source/directorysource"
"github.com/anchore/syft/syft/source/filesource" "github.com/anchore/syft/syft/source/filesource"
"github.com/anchore/syft/syft/source/ocimodelsource"
"github.com/anchore/syft/syft/source/snapsource" "github.com/anchore/syft/syft/source/snapsource"
"github.com/anchore/syft/syft/source/stereoscopesource" "github.com/anchore/syft/syft/source/stereoscopesource"
) )
@ -43,6 +44,14 @@ func All(userInput string, cfg *Config) []collections.TaggedValue[source.Provide
// --from docker, registry, etc. // --from docker, registry, etc.
Join(stereoscopeProviders.Select(PullTag)...). Join(stereoscopeProviders.Select(PullTag)...).
// --from oci-model, registry (for select cases only)
// OCI model artifacts with header-only fetching
// note: we don't want to use the "pull" tag since it's not actually pulling the full image,
// instead we want to match on registry since these models are stored in OCI registries.
// This does mean that this must be placed after the pull provider, which is ideal since we don't want to
// unnecessarily pull registry headers first if the more common case is the pull providers.
Join(tagProvider(ocimodelsource.NewSourceProvider(userInput, cfg.RegistryOptions, cfg.Alias), "registry")).
// --from snap (remote only) // --from snap (remote only)
Join(tagProvider(snapsource.NewRemoteSourceProvider(userInput, cfg.Exclude, cfg.DigestAlgorithms, cfg.Alias), SnapTag)) Join(tagProvider(snapsource.NewRemoteSourceProvider(userInput, cfg.Exclude, cfg.DigestAlgorithms, cfg.Alias), SnapTag))
} }

View File

@ -5,7 +5,6 @@ import (
"github.com/bmatcuk/doublestar/v4" "github.com/bmatcuk/doublestar/v4"
"github.com/distribution/reference" "github.com/distribution/reference"
"github.com/opencontainers/go-digest"
"github.com/anchore/stereoscope/pkg/image" "github.com/anchore/stereoscope/pkg/image"
"github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/log"
@ -36,7 +35,7 @@ type stereoscopeImageSource struct {
func New(img *image.Image, cfg ImageConfig) source.Source { func New(img *image.Image, cfg ImageConfig) source.Source {
metadata := imageMetadataFromStereoscopeImage(img, cfg.Reference) metadata := imageMetadataFromStereoscopeImage(img, cfg.Reference)
return &stereoscopeImageSource{ return &stereoscopeImageSource{
id: deriveIDFromStereoscopeImage(cfg.Alias, metadata), id: internal.DeriveImageID(cfg.Alias, metadata),
config: cfg, config: cfg,
image: img, image: img,
metadata: metadata, metadata: metadata,
@ -163,61 +162,6 @@ func imageMetadataFromStereoscopeImage(img *image.Image, reference string) sourc
} }
} }
// deriveIDFromStereoscopeImage derives an artifact ID from the given image metadata. The order of data precedence is:
// 1. prefer a digest of the raw container image manifest
// 2. if no manifest digest is available, calculate a chain ID from the image layer metadata
// 3. if no layer metadata is available, use the user input string
//
// in all cases, if an alias is provided, it is additionally considered in the ID calculation. This allows for the
// same image to be scanned multiple times with different aliases and be considered logically different.
func deriveIDFromStereoscopeImage(alias source.Alias, metadata source.ImageMetadata) artifact.ID {
var input string
if len(metadata.RawManifest) > 0 {
input = digest.Canonical.FromBytes(metadata.RawManifest).String()
} else {
// calculate chain ID for image sources where manifestDigest is not available
// https://github.com/opencontainers/image-spec/blob/main/config.md#layer-chainid
input = calculateChainID(metadata.Layers)
if input == "" {
// TODO what happens here if image has no layers?
// is this case possible?
input = digest.Canonical.FromString(metadata.UserInput).String()
}
}
if !alias.IsEmpty() {
// if the user provided an alias, we want to consider that in the artifact ID. This way if the user
// scans the same item but is considered to be logically different, then ID will express that.
aliasStr := fmt.Sprintf(":%s@%s", alias.Name, alias.Version)
input = digest.Canonical.FromString(input + aliasStr).String()
}
return internal.ArtifactIDFromDigest(input)
}
func calculateChainID(lm []source.LayerMetadata) string {
if len(lm) < 1 {
return ""
}
// DiffID(L0) = digest of layer 0
// https://github.com/anchore/stereoscope/blob/1b1b744a919964f38d14e1416fb3f25221b761ce/pkg/image/layer_metadata.go#L19-L32
chainID := lm[0].Digest
id := chain(chainID, lm[1:])
return id
}
func chain(chainID string, layers []source.LayerMetadata) string {
if len(layers) < 1 {
return chainID
}
chainID = digest.Canonical.FromString(layers[0].Digest + " " + chainID).String()
return chain(chainID, layers[1:])
}
func getImageExclusionFunction(exclusions []string) func(string) bool { func getImageExclusionFunction(exclusions []string) func(string) bool {
if len(exclusions) == 0 { if len(exclusions) == 0 {
return nil return nil

View File

@ -2,8 +2,6 @@ package stereoscopesource
import ( import (
"context" "context"
"crypto/sha256"
"fmt"
"strings" "strings"
"testing" "testing"
@ -12,7 +10,6 @@ import (
"github.com/anchore/stereoscope" "github.com/anchore/stereoscope"
"github.com/anchore/stereoscope/pkg/imagetest" "github.com/anchore/stereoscope/pkg/imagetest"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/internal/testutil" "github.com/anchore/syft/syft/internal/testutil"
"github.com/anchore/syft/syft/source" "github.com/anchore/syft/syft/source"
) )
@ -112,146 +109,6 @@ func Test_StereoscopeImage_Exclusions(t *testing.T) {
} }
} }
func Test_StereoscopeImageSource_ID(t *testing.T) {
tests := []struct {
name string
alias source.Alias
metadata source.ImageMetadata
want artifact.ID
}{
{
name: "use raw manifest over chain ID or user input",
metadata: source.ImageMetadata{
UserInput: "user-input",
Layers: []source.LayerMetadata{
{
Digest: "a",
},
{
Digest: "b",
},
{
Digest: "c",
},
},
RawManifest: []byte("raw-manifest"),
},
want: func() artifact.ID {
hasher := sha256.New()
hasher.Write([]byte("raw-manifest"))
return artifact.ID(fmt.Sprintf("%x", hasher.Sum(nil)))
}(),
},
{
name: "use chain ID over user input",
metadata: source.ImageMetadata{
//UserInput: "user-input",
Layers: []source.LayerMetadata{
{
Digest: "a",
},
{
Digest: "b",
},
{
Digest: "c",
},
},
},
want: func() artifact.ID {
metadata := []source.LayerMetadata{
{
Digest: "a",
},
{
Digest: "b",
},
{
Digest: "c",
},
}
return artifact.ID(strings.TrimPrefix(calculateChainID(metadata), "sha256:"))
}(),
},
{
name: "use user input last",
metadata: source.ImageMetadata{
UserInput: "user-input",
},
want: func() artifact.ID {
hasher := sha256.New()
hasher.Write([]byte("user-input"))
return artifact.ID(fmt.Sprintf("%x", hasher.Sum(nil)))
}(),
},
{
name: "without alias (first)",
metadata: source.ImageMetadata{
UserInput: "user-input",
Layers: []source.LayerMetadata{
{
Digest: "a",
},
{
Digest: "b",
},
{
Digest: "c",
},
},
RawManifest: []byte("raw-manifest"),
},
want: "85298926ecd92ed57688f13039017160cd728f04dd0d2d10a10629007106f107",
},
{
name: "always consider alias (first)",
alias: source.Alias{
Name: "alias",
Version: "version",
},
metadata: source.ImageMetadata{
UserInput: "user-input",
Layers: []source.LayerMetadata{
{
Digest: "a",
},
{
Digest: "b",
},
{
Digest: "c",
},
},
RawManifest: []byte("raw-manifest"),
},
want: "a8717e42449960c1dd4963f2f22bd69c7c105e7e82445be0a65aa1825d62ff0d",
},
{
name: "without alias (last)",
metadata: source.ImageMetadata{
UserInput: "user-input",
},
want: "ab0dff627d80b9753193d7280bec8f45e8ec6b4cb0912c6fffcf7cd782d9739e",
},
{
name: "always consider alias (last)",
alias: source.Alias{
Name: "alias",
Version: "version",
},
metadata: source.ImageMetadata{
UserInput: "user-input",
},
want: "fe86c0eecd5654d3c0c0b2176aa394aef6440347c241aa8d9b628dfdde4287cf",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, deriveIDFromStereoscopeImage(tt.alias, tt.metadata))
})
}
}
func Test_Describe(t *testing.T) { func Test_Describe(t *testing.T) {
tests := []struct { tests := []struct {
name string name string