diff --git a/syft/pkg/cataloger/ai/cataloger.go b/syft/pkg/cataloger/ai/cataloger.go index 4735bb744..d5ddea316 100644 --- a/syft/pkg/cataloger/ai/cataloger.go +++ b/syft/pkg/cataloger/ai/cataloger.go @@ -40,5 +40,5 @@ func NewSafeTensorsCataloger() pkg.Cataloger { WithParserByGlobs(parseSafeTensorsIndex, "**/*.safetensors.index.json"). WithParserByMediaType(parseSafeTensorsOCIConfig, dockerAIModelConfigMediaTypes...). WithParserByMediaType(parseSafeTensorsOCILayer, dockerAISafeTensorsMediaType). - WithProcessors(safeTensorsMergeProcessor) + WithResolvingProcessors(safeTensorsMergeProcessor) } diff --git a/syft/pkg/cataloger/ai/package.go b/syft/pkg/cataloger/ai/package.go index b6043933b..c82ab494c 100644 --- a/syft/pkg/cataloger/ai/package.go +++ b/syft/pkg/cataloger/ai/package.go @@ -21,13 +21,15 @@ func newGGUFPackage(metadata *pkg.GGUFFileHeader, modelName, version, license st return p } -func newSafeTensorsPackage(metadata *pkg.SafeTensorsModelInfo, modelName, version, license string, locations ...file.Location) pkg.Package { +// newSafeTensorsPackage creates a SafeTensors package with the given metadata +// and locations. Name and Licenses are intentionally not set here — the +// safetensors cataloger emits nameless packages from every parser, and the +// merge processor is the single owner of naming, license resolution, and +// supporting-evidence attachment. +func newSafeTensorsPackage(metadata *pkg.SafeTensorsModelInfo, locations ...file.Location) pkg.Package { p := pkg.Package{ - Name: modelName, - Version: version, Locations: file.NewLocationSet(locations...), Type: pkg.ModelPkg, - Licenses: pkg.NewLicenseSet(pkg.NewLicensesFromValues(license)...), Metadata: *metadata, // PURL is intentionally not set: package-url has not yet finalized ML model support. 
} diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go index 0d4ddd6c5..97c4ac4be 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_model.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go @@ -1,20 +1,13 @@ package ai import ( - "bytes" "context" "encoding/json" "fmt" "io" - "path" - "path/filepath" "strconv" - "strings" - - "gopkg.in/yaml.v3" "github.com/anchore/syft/internal" - "github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" @@ -22,10 +15,11 @@ import ( "github.com/anchore/syft/syft/pkg/cataloger/generic" ) -// parseSafeTensorsFile parses a single .safetensors file by reading only its -// JSON header, then enriches the resulting package with metadata from sibling -// config.json and README.md files when the resolver can find them. -func parseSafeTensorsFile(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { +// parseSafeTensorsFile decodes the JSON header of a single .safetensors file +// and emits a nameless package whose metadata is derived purely from the +// header bytes. Naming, license resolution, sibling enrichment, and cross- +// shard rollup are all the responsibility of safeTensorsMergeProcessor. +func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { defer internal.CloseAndLogError(reader, reader.Path()) header, _, err := readSafeTensorsHeader(&io.LimitedReader{R: reader, N: maxSafeTensorsHeaderSize + 8}) @@ -45,27 +39,19 @@ func parseSafeTensorsFile(_ context.Context, resolver file.Resolver, _ *generic. 
md.Parameters = formatParameterCount(p) } - name, license := enrichFromSiblings(resolver, reader.Path(), &md) - if name == "" { - name = modelNameFromPath(reader.Path()) - } - p := newSafeTensorsPackage( &md, - name, - "", - license, reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) - return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file") } -// parseSafeTensorsIndex parses a model.safetensors.index.json file for a sharded -// model. The index lists every tensor and the shard file it lives in; from this -// we derive tensor count, unique shard count, and (when present) the producer- -// declared total_size. -func parseSafeTensorsIndex(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { +// parseSafeTensorsIndex decodes a model.safetensors.index.json file for a +// sharded model and emits a nameless package recording tensor count, unique +// shard count, and (when present) the producer-declared total_size. Like +// parseSafeTensorsFile, naming and sibling enrichment happen in the merge +// processor. 
+func parseSafeTensorsIndex(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { defer internal.CloseAndLogError(reader, reader.Path()) var doc struct { @@ -92,179 +78,16 @@ func parseSafeTensorsIndex(_ context.Context, resolver file.Resolver, _ *generic md.TotalSize = formatByteSize(doc.Metadata.TotalSize.String()) } - name, license := enrichFromSiblings(resolver, reader.Path(), &md) - if name == "" { - name = modelNameFromIndexPath(reader.Path()) - } - p := newSafeTensorsPackage( &md, - name, - "", - license, reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) - return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors index") } -// enrichFromSiblings looks for a sibling config.json and README.md next to the -// safetensors artifact and folds their values into the metadata struct. It -// returns a name and license derived from those sources, with the caller free -// to fall back to a filename-derived default. 
-func enrichFromSiblings(resolver file.Resolver, sourcePath string, md *pkg.SafeTensorsModelInfo) (name, license string) { - if resolver == nil { - return "", "" - } - dir := path.Dir(sourcePath) - - if cfg := readSiblingJSON(resolver, path.Join(dir, "config.json")); cfg != nil { - if md.Architecture == "" && len(cfg.Architectures) > 0 { - md.Architecture = cfg.Architectures[0] - } - if md.TorchDtype == "" { - md.TorchDtype = cfg.TorchDtype - } - if md.TransformersVersion == "" { - md.TransformersVersion = cfg.TransformersVersion - } - if cfg.NameOrPath != "" { - name = path.Base(cfg.NameOrPath) - } - } - - if fm := readReadmeFrontmatter(resolver, path.Join(dir, "README.md")); fm != nil { - if license == "" { - license = fm.License - } - if name == "" && len(fm.BaseModel) > 0 { - name = path.Base(fm.BaseModel[0]) - } - } - - return name, license -} - -// hfConfig is a minimal projection of Hugging Face config.json fields we care about. -type hfConfig struct { - Architectures []string `json:"architectures"` - TorchDtype string `json:"torch_dtype"` - TransformersVersion string `json:"transformers_version"` - NameOrPath string `json:"_name_or_path"` -} - -func readSiblingJSON(resolver file.Resolver, p string) *hfConfig { - locations, err := resolver.FilesByPath(p) - if err != nil || len(locations) == 0 { - return nil - } - rc, err := resolver.FileContentsByLocation(locations[0]) - if err != nil { - return nil - } - defer internal.CloseAndLogError(rc, p) - - var cfg hfConfig - if err := json.NewDecoder(rc).Decode(&cfg); err != nil { - log.Debugf("failed to decode %s: %v", p, err) - return nil - } - return &cfg -} - -// readmeFrontmatter holds the subset of YAML frontmatter fields we extract. -type readmeFrontmatter struct { - License string `yaml:"license"` - BaseModel []string `yaml:"base_model"` -} - -// readReadmeFrontmatter extracts the leading YAML frontmatter block from a README. -// The block is delimited by "---" lines at the start of the file. 
-func readReadmeFrontmatter(resolver file.Resolver, p string) *readmeFrontmatter { - locations, err := resolver.FilesByPath(p) - if err != nil || len(locations) == 0 { - return nil - } - rc, err := resolver.FileContentsByLocation(locations[0]) - if err != nil { - return nil - } - defer internal.CloseAndLogError(rc, p) - - buf, err := io.ReadAll(io.LimitReader(rc, 1024*1024)) - if err != nil { - return nil - } - return parseFrontmatter(buf) -} - -// parseFrontmatter pulls the YAML block between the first and second "---" lines -// of a file (if present) and decodes known fields from it. -func parseFrontmatter(buf []byte) *readmeFrontmatter { - trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") - if !bytes.HasPrefix(trimmed, []byte("---")) { - return nil - } - rest := trimmed[3:] - // trim the newline directly following the opening delimiter - if i := bytes.IndexByte(rest, '\n'); i >= 0 { - rest = rest[i+1:] - } - end := bytes.Index(rest, []byte("\n---")) - if end < 0 { - return nil - } - - // base_model may be either a scalar ("org/model") or a sequence; decode it - // as a yaml.Node so a scalar value does not fail the whole block. - var raw struct { - License string `yaml:"license"` - BaseModel yaml.Node `yaml:"base_model"` - } - if err := yaml.Unmarshal(rest[:end], &raw); err != nil { - log.Debugf("failed to parse README frontmatter: %v", err) - return nil - } - - fm := readmeFrontmatter{License: raw.License} - switch raw.BaseModel.Kind { - case yaml.ScalarNode: - if raw.BaseModel.Value != "" { - fm.BaseModel = []string{raw.BaseModel.Value} - } - case yaml.SequenceNode: - _ = raw.BaseModel.Decode(&fm.BaseModel) - } - return &fm -} - -// defaultModelName is the fallback package name when no model name can be -// derived from sibling files, the file path, or OCI companion layers. -const defaultModelName = "safetensors-model" - -// modelNameFromPath turns "/models/foo/model.safetensors" into "foo". -// For a bare filename "weights.safetensors" we return "weights". 
-func modelNameFromPath(p string) string { - base := strings.TrimSuffix(filepath.Base(p), ".safetensors") - dir := filepath.Base(filepath.Dir(p)) - if dir != "" && dir != "." && dir != string(filepath.Separator) { - return dir - } - return base -} - -// modelNameFromIndexPath derives a model name from the index filename's parent -// directory, defaulting to "safetensors-model" if no useful directory name exists. -func modelNameFromIndexPath(p string) string { - dir := filepath.Base(filepath.Dir(p)) - if dir != "" && dir != "." && dir != string(filepath.Separator) { - return dir - } - return defaultModelName -} - -// formatParameterCount prints a count like 6_700_000_000 as "6.7B" using B/M/K -// thresholds matching the notation used by Hugging Face and Docker AI labels. +// formatParameterCount prints a count like 6_700_000_000 as "6.70B" using +// B/M/K thresholds matching the notation used by Hugging Face and Docker AI +// labels. func formatParameterCount(n uint64) string { switch { case n >= 1_000_000_000: @@ -278,9 +101,9 @@ func formatParameterCount(n uint64) string { } } -// formatByteSize turns a numeric string (bytes) into a human-friendly size like -// "71.90GB". Non-numeric inputs are passed through unchanged so we never lose -// producer-declared strings such as "71.90GB". +// formatByteSize turns a numeric string (bytes) into a human-friendly size +// like "71.90GB". Non-numeric inputs are passed through unchanged so producer- +// declared strings (e.g. "71.90GB" from a Docker AI config blob) survive. 
func formatByteSize(s string) string { n, err := strconv.ParseUint(s, 10, 64) if err != nil || n == 0 { diff --git a/syft/pkg/cataloger/ai/parse_safetensors_oci.go b/syft/pkg/cataloger/ai/parse_safetensors_oci.go index 7f5e5f0bb..6449c2aca 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_oci.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_oci.go @@ -8,7 +8,6 @@ import ( "strings" "github.com/anchore/syft/internal" - "github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/file" @@ -48,13 +47,13 @@ type dockerAIModelConfig struct { } `json:"config"` } -// parseSafeTensorsOCIConfig parses a Docker AI model-config blob. When the blob -// advertises format=="safetensors" it emits a single named package whose -// metadata is enriched by scanning sibling OCI layers (README.md for license + -// base_model name, config.json for architecture, LICENSE text for a license -// fallback). For any other format it emits nothing so the GGUF cataloger can -// claim the image. -func parseSafeTensorsOCIConfig(_ context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { +// parseSafeTensorsOCIConfig decodes the Docker AI model-config blob and emits +// a nameless package whose metadata mirrors the producer-declared aggregate +// fields (Format, Quantization, Parameters, Size, TensorCount). For any +// format other than "safetensors" it emits nothing so the GGUF cataloger can +// claim the artifact. Naming, license, and HF-companion enrichment all run +// once per group in safeTensorsMergeProcessor. 
+func parseSafeTensorsOCIConfig(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { defer internal.CloseAndLogError(reader, reader.Path()) body, err := io.ReadAll(io.LimitReader(reader, 1024*1024)) @@ -81,174 +80,18 @@ func parseSafeTensorsOCIConfig(_ context.Context, resolver file.Resolver, _ *gen md.TensorCount = uint64(n) } - name, license := enrichFromDockerAILayers(resolver, &md) - if name == "" { - name = defaultModelName - } - p := newSafeTensorsPackage( &md, - name, - "", - license, reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) - return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse docker AI safetensors config") } -// enrichFromDockerAILayers walks sibling Docker AI layers via the OCI resolver -// and mines them for a model name, architecture, and license. README.md carries -// YAML frontmatter with license + base_model; HF config.json carries -// architectures/torch_dtype/transformers_version; the vnd.docker.ai.license -// blob is plain license text. -func enrichFromDockerAILayers(resolver file.Resolver, md *pkg.SafeTensorsModelInfo) (name, license string) { - ociResolver, ok := resolver.(file.OCIMediaTypeResolver) - if !ok { - return "", "" - } - - modelFileLocations, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType) - if err != nil { - log.Debugf("failed to list docker AI model-file layers: %v", err) - } - - // Collect name candidates separately so precedence does not depend on the - // order the resolver returns layers in. config.json's _name_or_path wins over - // a README base_model, matching enrichFromSiblings. 
- var configName, readmeName string - for _, loc := range modelFileLocations { - readAndClassifyDockerAILayer(resolver, loc, md, &configName, &readmeName, &license) - } - - name = configName - if name == "" { - name = readmeName - } - - if license == "" { - license = readDockerAILicense(resolver, ociResolver) - } - - return name, license -} - -// readAndClassifyDockerAILayer fetches a single Docker AI model-file layer and -// passes its contents to classifyAndMerge. Split out from the calling loop so -// the resolver handle is closed via defer on every iteration. -func readAndClassifyDockerAILayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) { - rc, err := resolver.FileContentsByLocation(loc) - if err != nil { - return - } - defer internal.CloseAndLogError(rc, loc.RealPath) - - buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024)) - if err != nil { - return - } - classifyAndMerge(buf, md, configName, readmeName, license) -} - -// classifyAndMerge sniffs a vnd.docker.ai.model.file blob (which can be README.md, -// config.json, generation_config.json, tokenizer.json, etc.) and folds useful -// fields into the metadata struct and out-parameters. 
-func classifyAndMerge(buf []byte, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) { - trimmed := trimLeadingWhitespace(buf) - switch { - case hasPrefix(trimmed, "---"): - if fm := parseFrontmatter(buf); fm != nil { - if *license == "" { - *license = fm.License - } - if *readmeName == "" && len(fm.BaseModel) > 0 { - *readmeName = lastPathSegment(fm.BaseModel[0]) - } - } - case hasPrefix(trimmed, "{"): - var cfg hfConfig - if err := json.Unmarshal(buf, &cfg); err != nil { - return - } - if md.Architecture == "" && len(cfg.Architectures) > 0 { - md.Architecture = cfg.Architectures[0] - } - if md.TorchDtype == "" { - md.TorchDtype = cfg.TorchDtype - } - if md.TransformersVersion == "" { - md.TransformersVersion = cfg.TransformersVersion - } - if *configName == "" && cfg.NameOrPath != "" { - *configName = lastPathSegment(cfg.NameOrPath) - } - } -} - -// readDockerAILicense extracts a short license identifier from the first line -// of a vnd.docker.ai.license layer. Docker packages the full license text, so -// we only peek at a prefix looking for well-known titles like "Apache License". 
-func readDockerAILicense(resolver file.Resolver, ociResolver file.OCIMediaTypeResolver) string { - locations, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType) - if err != nil || len(locations) == 0 { - return "" - } - rc, err := resolver.FileContentsByLocation(locations[0]) - if err != nil { - return "" - } - defer internal.CloseAndLogError(rc, locations[0].RealPath) - - buf, err := io.ReadAll(io.LimitReader(rc, 2048)) - if err != nil { - return "" - } - text := strings.ToLower(string(buf)) - switch { - case strings.Contains(text, "apache license") && strings.Contains(text, "version 2.0"): - return "Apache-2.0" - case strings.Contains(text, "mit license"): - return "MIT" - case strings.Contains(text, "bsd 3-clause"): - return "BSD-3-Clause" - case strings.Contains(text, "bsd 2-clause"): - return "BSD-2-Clause" - case strings.Contains(text, "gnu general public license") && strings.Contains(text, "version 3"): - return "GPL-3.0" - } - return "" -} - -func hasPrefix(b []byte, s string) bool { - return len(b) >= len(s) && string(b[:len(s)]) == s -} - -func trimLeadingWhitespace(b []byte) []byte { - i := 0 - for i < len(b) && (b[i] == ' ' || b[i] == '\t' || b[i] == '\r' || b[i] == '\n') { - i++ - } - // strip a leading UTF-8 BOM if present - if len(b)-i >= 3 && b[i] == 0xEF && b[i+1] == 0xBB && b[i+2] == 0xBF { - i += 3 - } - return b[i:] -} - -func lastPathSegment(s string) string { - if i := strings.LastIndexAny(s, "/\\"); i >= 0 { - return s[i+1:] - } - return s -} - -// parseSafeTensorsOCILayer parses a SafeTensors weight layer from an OCI model -// artifact by reading only its JSON header (the layer is fetched up to a small -// byte cap by the source layer; tensor data is never downloaded). It emits a -// nameless package so safeTensorsMergeProcessor folds the result into the -// config-derived named package as a Part. 
The point of this parser is to give -// OCI scans the same content-derived fields the directory-scan path produces: -// real tensor count, normalized quantization, __metadata__, and MetadataHash. +// parseSafeTensorsOCILayer decodes the JSON header of a SafeTensors weight +// layer fetched from an OCI model artifact (the source layer caps each layer +// at a small prefix; tensor data is never downloaded). It emits a nameless +// package; safeTensorsMergeProcessor folds it into the artifact's group and +// rolls per-shard fields up into the final merged package. func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { defer internal.CloseAndLogError(reader, reader.Path()) @@ -268,17 +111,10 @@ func parseSafeTensorsOCILayer(_ context.Context, _ file.Resolver, _ *generic.Env md.Parameters = formatParameterCount(p) } - // Emit nameless; safeTensorsMergeProcessor will absorb this into the - // config-derived named package as a Part. The merge runs even when only - // nameless packages exist, in which case the result is dropped. p := newSafeTensorsPackage( &md, - "", - "", - "", reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), ) - return []pkg.Package{p}, nil, nil } diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go index afdda7fab..d3a9e04e8 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_test.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go @@ -124,167 +124,230 @@ func TestSafeTensorsCataloger_shardedIndex(t *testing.T) { TestCataloger(t, NewSafeTensorsCataloger()) } +// TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit +// a nameless package mirroring the config blob's producer-declared fields, and +// emit nothing for non-safetensors formats so the GGUF cataloger can claim the +// artifact. 
Naming and license resolution happen in the merge processor and are +// tested separately under TestSafeTensorsMergeProcessor. func TestParseSafeTensorsOCIConfig(t *testing.T) { - configBlob := []byte(`{"config":{"format":"safetensors","quantization":"Q4_K_M","parameters":"8B","size":"16.00GB","safetensors":{"tensor_count":291}}}`) + t.Run("emits a nameless package with config-blob fields", func(t *testing.T) { + blob := []byte(`{"config":{"format":"safetensors","quantization":"Q4_K_M","parameters":"8B","size":"16.00GB","safetensors":{"tensor_count":291}}}`) - t.Run("enriches from companion layers", func(t *testing.T) { - dir := t.TempDir() - readmePath := filepath.Join(dir, "README.md") - require.NoError(t, os.WriteFile(readmePath, - []byte("---\nlicense: mit\nbase_model:\n - org/My-Model\n---\n# card\n"), 0o644)) - hfConfigPath := filepath.Join(dir, "config.json") - require.NoError(t, os.WriteFile(hfConfigPath, - []byte(`{"architectures":["Qwen3ForCausalLM"],"torch_dtype":"bfloat16"}`), 0o644)) - - resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ - dockerAIModelFileMediaType: {file.NewLocation(readmePath), file.NewLocation(hfConfigPath)}, - }) - - pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), resolver, nil, configReader(configBlob)) + pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), nil, nil, configReader(blob)) require.NoError(t, err) require.Len(t, pkgs, 1) p := pkgs[0] - assert.Equal(t, "My-Model", p.Name) - assert.Equal(t, pkg.ModelPkg, p.Type) - assertHasLicense(t, p, "mit") - + assert.Empty(t, p.Name, "config-blob parser must emit nameless; the merge processor names it") + assert.Empty(t, p.Licenses.ToSlice(), "license resolution belongs to the merge processor") md := p.Metadata.(pkg.SafeTensorsModelInfo) assert.Equal(t, "safetensors", md.Format) - assert.Equal(t, "Qwen3ForCausalLM", md.Architecture) - assert.Equal(t, "bfloat16", md.TorchDtype) assert.Equal(t, "Q4_K_M", md.Quantization) 
assert.Equal(t, "8B", md.Parameters) assert.Equal(t, "16.00GB", md.TotalSize) assert.Equal(t, uint64(291), md.TensorCount) - }) - - t.Run("falls back to license layer", func(t *testing.T) { - dir := t.TempDir() - readmePath := filepath.Join(dir, "README.md") - require.NoError(t, os.WriteFile(readmePath, - []byte("---\nbase_model:\n - org/My-Model\n---\n"), 0o644)) - licensePath := filepath.Join(dir, "LICENSE") - require.NoError(t, os.WriteFile(licensePath, - []byte(" Apache License\n Version 2.0, January 2004\n"), 0o644)) - - resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ - dockerAIModelFileMediaType: {file.NewLocation(readmePath)}, - dockerAILicenseMediaType: {file.NewLocation(licensePath)}, - }) - - pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), resolver, nil, configReader(configBlob)) - require.NoError(t, err) - require.Len(t, pkgs, 1) - assertHasLicense(t, pkgs[0], "Apache-2.0") - }) - - t.Run("config _name_or_path wins over README base_model regardless of layer order", func(t *testing.T) { - dir := t.TempDir() - readmePath := filepath.Join(dir, "README.md") - require.NoError(t, os.WriteFile(readmePath, []byte("---\nbase_model:\n - org/Readme-Name\n---\n"), 0o644)) - hfConfigPath := filepath.Join(dir, "config.json") - require.NoError(t, os.WriteFile(hfConfigPath, []byte(`{"_name_or_path":"org/Config-Name"}`), 0o644)) - - // both layer orderings must yield the same (config-derived) name - orderings := [][]file.Location{ - {file.NewLocation(readmePath), file.NewLocation(hfConfigPath)}, - {file.NewLocation(hfConfigPath), file.NewLocation(readmePath)}, - } - for _, locs := range orderings { - resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ - dockerAIModelFileMediaType: locs, - }) - pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), resolver, nil, configReader(configBlob)) - require.NoError(t, err) - require.Len(t, pkgs, 1) - assert.Equal(t, "Config-Name", pkgs[0].Name) - } - }) - - 
t.Run("falls back to default name when none derivable", func(t *testing.T) { - resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{}) - - pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), resolver, nil, configReader(configBlob)) - require.NoError(t, err) - require.Len(t, pkgs, 1) - assert.Equal(t, "safetensors-model", pkgs[0].Name, "model must still be emitted, not dropped") + assert.Empty(t, md.MetadataHash, "config blobs have no header content to hash") }) t.Run("ignores non-safetensors format", func(t *testing.T) { ggufBlob := []byte(`{"config":{"format":"gguf","quantization":"Q4_K_M"}}`) - resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{}) - - pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), resolver, nil, configReader(ggufBlob)) + pkgs, _, err := parseSafeTensorsOCIConfig(context.Background(), nil, nil, configReader(ggufBlob)) require.NoError(t, err) assert.Empty(t, pkgs) }) } +// TestSafeTensorsMergeProcessor exercises the merge processor directly with +// synthetic input. The full-cataloger integration tests cover the realistic +// happy paths; this focuses on grouping, the naming precedence chain, the +// drop-when-unnameable rule, and cross-shard rollup. 
func TestSafeTensorsMergeProcessor(t *testing.T) { - named := pkg.Package{Name: "model-a", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "aaaa"}} - nameless := pkg.Package{Name: "", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "bbbb"}} - - t.Run("preserves the part's MetadataHash when the named package already has one", func(t *testing.T) { - out, _, err := safeTensorsMergeProcessor([]pkg.Package{named, nameless}, nil, nil) - require.NoError(t, err) - require.Len(t, out, 1) - assert.Equal(t, "model-a", out[0].Name) - md := out[0].Metadata.(pkg.SafeTensorsModelInfo) - require.Len(t, md.Parts, 1) - assert.Equal(t, "bbbb", md.Parts[0].MetadataHash, "part hash must survive: it is the cross-source fingerprint") - assert.Equal(t, "aaaa", md.MetadataHash, "named package's own hash is not overwritten") - assert.Equal(t, 1, md.ShardCount) - }) - - t.Run("lifts the single part's MetadataHash to top-level when named has none", func(t *testing.T) { - // This is the OCI single-shard shape: the config-blob parser produces a - // named package with no hash; the weight-layer parser produces a nameless - // part with the real header hash. Top-level should land in the same field - // a dir-scan single-file would populate, so callers can correlate them. 
- namedNoHash := pkg.Package{Name: "model-b", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors"}} - part := pkg.Package{Name: "", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "deadbeef"}} - out, _, err := safeTensorsMergeProcessor([]pkg.Package{namedNoHash, part}, nil, nil) - require.NoError(t, err) - require.Len(t, out, 1) - md := out[0].Metadata.(pkg.SafeTensorsModelInfo) - assert.Equal(t, "deadbeef", md.MetadataHash, "single-shard lift makes OCI top-level match dir-scan top-level") - require.Len(t, md.Parts, 1) - assert.Equal(t, "deadbeef", md.Parts[0].MetadataHash, "part also retains its hash") - }) - - t.Run("multi-shard preserves per-part hashes and sorts deterministically", func(t *testing.T) { - // Three nameless layer packages absorbed into one named config-derived package. - // Top-level MetadataHash stays empty (no canonical single hash for a sharded - // model — callers must combine the per-shard hashes themselves). - namedNoHash := pkg.Package{Name: "model-c", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors"}} - parts := []pkg.Package{ - {Name: "", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "cccc"}}, - {Name: "", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "aaaa"}}, - {Name: "", Type: pkg.ModelPkg, Metadata: pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "bbbb"}}, + dirPkg := func(realPath string, md pkg.SafeTensorsModelInfo) pkg.Package { + return pkg.Package{ + Type: pkg.ModelPkg, + Metadata: md, + Locations: file.NewLocationSet( + file.NewLocation(realPath). 
+ WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), + ), } - out, _, err := safeTensorsMergeProcessor(append([]pkg.Package{namedNoHash}, parts...), nil, nil) + } + ociPkg := func(md pkg.SafeTensorsModelInfo) pkg.Package { + return pkg.Package{ + Type: pkg.ModelPkg, + Metadata: md, + Locations: file.NewLocationSet( + file.NewLocation("/"). + WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), + ), + } + } + + t.Run("dir scan: parent-dir fallback names a bare safetensors with no siblings", func(t *testing.T) { + // case #1: weights.safetensors in /models/tiny-llama/ with no config.json + // or README. The processor cannot derive a producer name and Architecture + // is empty, so it lands on the parent-dir rung. + p := dirPkg("/models/tiny-llama/weights.safetensors", pkg.SafeTensorsModelInfo{ + Format: "safetensors", + TensorCount: 4, + Quantization: "BF16", + MetadataHash: "abc", + }) + resolver := file.NewMockResolverForPaths() // no config.json / README available + out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil) require.NoError(t, err) require.Len(t, out, 1) - md := out[0].Metadata.(pkg.SafeTensorsModelInfo) - assert.Equal(t, 3, md.ShardCount) - assert.Empty(t, md.MetadataHash, "multi-shard leaves top-level hash unset") - require.Len(t, md.Parts, 3) - // Parts sorted by MetadataHash for deterministic SBOM output regardless of resolver order. - assert.Equal(t, []string{"aaaa", "bbbb", "cccc"}, []string{md.Parts[0].MetadataHash, md.Parts[1].MetadataHash, md.Parts[2].MetadataHash}) + assert.Equal(t, "tiny-llama", out[0].Name) }) - t.Run("drops result when no named package", func(t *testing.T) { - out, _, err := safeTensorsMergeProcessor([]pkg.Package{nameless}, nil, nil) + t.Run("dir scan: parent-dir fallback rescues a metadata-only header", func(t *testing.T) { + // case #3: header carries only __metadata__, no tensors.
Parameters and + // Architecture are both empty, so Arch-Parameters can't fire either — + // the parent-dir fallback is the only thing that names the package. + p := dirPkg("/scan/edge/headeronly/model.safetensors", pkg.SafeTensorsModelInfo{ + Format: "safetensors", + MetadataHash: "xyz", + UserMetadata: pkg.KeyValues{{Key: "producer", Value: "stgen"}}, + }) + resolver := file.NewMockResolverForPaths() + out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil) require.NoError(t, err) - assert.Empty(t, out) + require.Len(t, out, 1) + assert.Equal(t, "headeronly", out[0].Name) + }) + + t.Run("dir scan: Architecture-Parameters synthetic wins over parent-dir", func(t *testing.T) { + // Architecture and Parameters are both populated → synthetic wins over + // the parent-dir fallback. _name_or_path is not available (no sibling + // config.json mock). + p := dirPkg("/models/tiny/weights.safetensors", pkg.SafeTensorsModelInfo{ + Format: "safetensors", + Architecture: "LlamaForCausalLM", + Parameters: "2.68B", + TensorCount: 4, + MetadataHash: "abc", + }) + resolver := file.NewMockResolverForPaths() + out, _, err := safeTensorsMergeProcessor(context.Background(), resolver, []pkg.Package{p}, nil, nil) + require.NoError(t, err) + require.Len(t, out, 1) + assert.Equal(t, "LlamaForCausalLM-2.68B", out[0].Name) + }) + + t.Run("OCI: dropped when no name source is available", func(t *testing.T) { + // The vllm-style shape: config-blob package + a weight-layer package, + // both at virtual path "/", no model.file companions on the resolver. + // With nothing to derive a name from, the group is dropped (no opaque + // fallback / no parent-dir option for OCI). 
+ configMd := pkg.SafeTensorsModelInfo{ + Format: "safetensors", + TensorCount: 5, + TotalSize: "1GB", + } + shardMd := pkg.SafeTensorsModelInfo{ + Format: "safetensors", + TensorCount: 5, + Quantization: "BF16", + MetadataHash: "deadbeef", + } + resolver := file.NewMockResolverForMediaTypes(nil) + out, _, err := safeTensorsMergeProcessor( + context.Background(), resolver, + []pkg.Package{ociPkg(configMd), ociPkg(shardMd)}, nil, nil, + ) + require.NoError(t, err) + assert.Empty(t, out, "OCI group with no naming source must be dropped") + }) + + t.Run("OCI: merges config + shard and names from companion config.json", func(t *testing.T) { + // Write a single model.file companion blob containing HF config.json so + // the processor can derive _name_or_path and Architecture from it. + dir := t.TempDir() + hfConfigPath := filepath.Join(dir, "config.json") + require.NoError(t, os.WriteFile(hfConfigPath, + []byte(`{"architectures":["Qwen3ForCausalLM"],"torch_dtype":"bfloat16","_name_or_path":"org/qwen-tiny"}`), 0o644)) + resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ + dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)}, + }) + + configMd := pkg.SafeTensorsModelInfo{ + Format: "safetensors", + Quantization: "Q4_K_M", // raw producer-declared value + Parameters: "8B", + TotalSize: "16.00GB", + TensorCount: 291, + } + shardMd := pkg.SafeTensorsModelInfo{ + Format: "safetensors", + TensorCount: 100, // per-shard count — must NOT be summed onto the aggregate's 291 + Quantization: "BF16", + MetadataHash: "deadbeef", + UserMetadata: pkg.KeyValues{{Key: "format", Value: "pt"}}, + } + out, _, err := safeTensorsMergeProcessor( + context.Background(), resolver, + []pkg.Package{ociPkg(configMd), ociPkg(shardMd)}, nil, nil, + ) + require.NoError(t, err) + require.Len(t, out, 1) + + got := out[0] + assert.Equal(t, "qwen-tiny", got.Name, "name comes from path.Base(_name_or_path)") + md := got.Metadata.(pkg.SafeTensorsModelInfo) + assert.Equal(t, 
uint64(291), md.TensorCount, "aggregate TensorCount must win — never double-count by summing the shard") + assert.Equal(t, "16.00GB", md.TotalSize) + assert.Equal(t, "8B", md.Parameters) + assert.Equal(t, "Qwen3ForCausalLM", md.Architecture, "Architecture enriched from companion config.json") + assert.Equal(t, "bfloat16", md.TorchDtype) + assert.Equal(t, "Q4_K_M", md.Quantization, "aggregate Quantization wins over shard's normalized dtype when both present") + assert.Equal(t, "deadbeef", md.MetadataHash, "single-shard rollup is the lone shard's hash") + assert.Equal(t, pkg.KeyValues{{Key: "format", Value: "pt"}}, md.UserMetadata) + assert.Nil(t, md.Parts, "single-shard groups skip Parts; the outer view already exposes everything") + }) + + t.Run("OCI: multi-shard rollup hashes are stable and sorted", func(t *testing.T) { + dir := t.TempDir() + hfConfigPath := filepath.Join(dir, "config.json") + require.NoError(t, os.WriteFile(hfConfigPath, + []byte(`{"architectures":["X"],"_name_or_path":"org/multi"}`), 0o644)) + resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ + dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)}, + }) + + configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 9, TotalSize: "3GB"} + shard := func(hash string, cnt uint64) pkg.SafeTensorsModelInfo { + return pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: cnt, Quantization: "BF16", MetadataHash: hash} + } + in := []pkg.Package{ + ociPkg(configMd), + ociPkg(shard("cccc", 3)), + ociPkg(shard("aaaa", 3)), + ociPkg(shard("bbbb", 3)), + } + out1, _, err := safeTensorsMergeProcessor(context.Background(), resolver, in, nil, nil) + require.NoError(t, err) + require.Len(t, out1, 1) + md1 := out1[0].Metadata.(pkg.SafeTensorsModelInfo) + require.Len(t, md1.Parts, 3) + // Parts deterministically sorted by MetadataHash. 
+ assert.Equal(t, + []string{"aaaa", "bbbb", "cccc"}, + []string{md1.Parts[0].MetadataHash, md1.Parts[1].MetadataHash, md1.Parts[2].MetadataHash}, + ) + // Rollup hash is stable across input ordering. + shuffled := []pkg.Package{ociPkg(shard("bbbb", 3)), ociPkg(configMd), ociPkg(shard("aaaa", 3)), ociPkg(shard("cccc", 3))} + out2, _, err := safeTensorsMergeProcessor(context.Background(), resolver, shuffled, nil, nil) + require.NoError(t, err) + md2 := out2[0].Metadata.(pkg.SafeTensorsModelInfo) + assert.Equal(t, md1.MetadataHash, md2.MetadataHash, "rollup hash must not depend on input order") }) t.Run("passes through upstream error", func(t *testing.T) { sentinel := assert.AnError - out, _, err := safeTensorsMergeProcessor([]pkg.Package{named}, nil, sentinel) + p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"}) + out, _, err := safeTensorsMergeProcessor(context.Background(), nil, []pkg.Package{p}, nil, sentinel) assert.Equal(t, sentinel, err) - assert.Len(t, out, 1) + assert.Equal(t, []pkg.Package{p}, out) }) } @@ -314,42 +377,65 @@ func TestParseSafeTensorsOCILayer(t *testing.T) { assert.Equal(t, wantHash, md.MetadataHash) }) - t.Run("merges with config-derived named package and lifts ShardCount", func(t *testing.T) { - // Synthesize what the OCI scan would produce: one config-derived named - // package + one weight-layer derived nameless package. Run them through - // the merge processor and assert the result looks like a complete model. 
- configMd := pkg.SafeTensorsModelInfo{ - Format: "safetensors", - Architecture: "Qwen3ForCausalLM", - Parameters: "2.68B", - TotalSize: "5.00GB", - Quantization: "Q4_K_M", // raw producer string - } - named := pkg.Package{Name: "qwen", Type: pkg.ModelPkg, Metadata: configMd} + t.Run("merged via processor: aggregate fields preserved, hash lifted from single shard", func(t *testing.T) { + // Synthesize the OCI single-shard shape: a config-blob-derived nameless + // package + the weight-layer parser's nameless package (both at virtual + // path "/"). With a companion HF config.json on the resolver to provide + // _name_or_path, the merge processor produces a single named model. + dir := t.TempDir() + hfConfigPath := filepath.Join(dir, "config.json") + require.NoError(t, os.WriteFile(hfConfigPath, + []byte(`{"architectures":["Qwen3ForCausalLM"],"_name_or_path":"org/qwen-test"}`), 0o644)) + resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ + dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)}, + }) - reader := file.NewLocationReadCloser(file.NewLocation("/"), io.NopCloser(bytes.NewReader(blob))) + configPkg := pkg.Package{ + Type: pkg.ModelPkg, + Metadata: pkg.SafeTensorsModelInfo{ + Format: "safetensors", + Parameters: "2.68B", + TotalSize: "5.00GB", + Quantization: "Q4_K_M", // raw producer string + TensorCount: 9999, + }, + Locations: file.NewLocationSet( + file.NewLocation("/").WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), + ), + } + reader := file.NewLocationReadCloser( + file.NewLocation("/").WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), + io.NopCloser(bytes.NewReader(blob)), + ) layerPkgs, _, err := parseSafeTensorsOCILayer(context.Background(), nil, nil, reader) require.NoError(t, err) require.Len(t, layerPkgs, 1) - out, _, err := safeTensorsMergeProcessor(append([]pkg.Package{named}, layerPkgs...), nil, nil) + out, _, err := safeTensorsMergeProcessor( + 
context.Background(), resolver, + append([]pkg.Package{configPkg}, layerPkgs...), nil, nil, + ) require.NoError(t, err) require.Len(t, out, 1) - md := out[0].Metadata.(pkg.SafeTensorsModelInfo) - assert.Equal(t, 1, md.ShardCount, "merge processor should set ShardCount from absorbed parts") - // Producer-declared top-level fields are preserved. - assert.Equal(t, "Qwen3ForCausalLM", md.Architecture) + got := out[0] + assert.Equal(t, "qwen-test", got.Name, "name comes from the companion config.json _name_or_path") + md := got.Metadata.(pkg.SafeTensorsModelInfo) + // Aggregate-declared fields win for totals; per-shard count must NOT be + // summed into the aggregate. + assert.Equal(t, uint64(9999), md.TensorCount) + assert.Equal(t, "5.00GB", md.TotalSize) + assert.Equal(t, "2.68B", md.Parameters) + // Aggregate Quantization wins when set; shard's normalized dtype is the + // fallback (not exercised here because the config had Q4_K_M). assert.Equal(t, "Q4_K_M", md.Quantization) - // Single-shard: the header-derived MetadataHash is lifted to top-level so - // it matches the field a dir-scan would populate. - assert.Equal(t, wantHash, md.MetadataHash, "single-shard OCI scan must expose the hash at the same field as a dir scan") - // The full per-shard breakdown is also preserved under Parts. - require.Len(t, md.Parts, 1) - assert.Equal(t, wantHash, md.Parts[0].MetadataHash) - assert.Equal(t, wantUserMetadata, md.Parts[0].UserMetadata) - assert.Equal(t, uint64(2), md.Parts[0].TensorCount) - assert.Equal(t, "BF16", md.Parts[0].Quantization, "part keeps the normalized header dtype") + // Architecture comes from companion HF config.json enrichment. + assert.Equal(t, "Qwen3ForCausalLM", md.Architecture) + // Single-shard groups skip Parts; the rollup hash is the lone shard's hash. 
+ assert.Nil(t, md.Parts) + assert.Equal(t, wantHash, md.MetadataHash) + assert.Equal(t, wantUserMetadata, md.UserMetadata) + assert.Equal(t, 1, md.ShardCount) }) } @@ -577,13 +663,6 @@ func TestParseFrontmatter(t *testing.T) { }) } -func TestModelNameFromPath(t *testing.T) { - assert.Equal(t, "foo", modelNameFromPath("/models/foo/model.safetensors")) - assert.Equal(t, "weights", modelNameFromPath("weights.safetensors")) - assert.Equal(t, "my-model", modelNameFromIndexPath("/models/my-model/model.safetensors.index.json")) - assert.Equal(t, "safetensors-model", modelNameFromIndexPath("model.safetensors.index.json")) -} - func TestDockerAIModelConfigMediaTypes(t *testing.T) { // supported mirrors how the resolver matches: filepath.Match each registered // media type against a layer's media type. diff --git a/syft/pkg/cataloger/ai/processor.go b/syft/pkg/cataloger/ai/processor.go index 0181820e5..9ad14f8c0 100644 --- a/syft/pkg/cataloger/ai/processor.go +++ b/syft/pkg/cataloger/ai/processor.go @@ -1,12 +1,32 @@ package ai import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "path" "sort" + "strings" + "github.com/cespare/xxhash/v2" + "gopkg.in/yaml.v3" + + "github.com/anchore/syft/internal" + "github.com/anchore/syft/internal/log" "github.com/anchore/syft/syft/artifact" + "github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/pkg" + "github.com/anchore/syft/syft/pkg/cataloger/internal/licenses" ) +// ociGroupKey is the sentinel grouping key for every safetensors package that +// originated from an OCI model artifact. The ContainerImageModel resolver gives +// each layer the virtual RealPath "/" regardless of layer media type, so all +// safetensors packages from a single OCI scan collapse into one group. +const ociGroupKey = "@oci@" + // ggufMergeProcessor consolidates multiple GGUF packages into a single package // representing the AI model. When scanning OCI images with multiple layers, // each layer may produce a separate package. 
This processor finds the package
@@ -16,7 +36,6 @@ func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err er
 	if err != nil {
 		return pkgs, rels, err
 	}
-
 	if len(pkgs) == 0 {
 		return pkgs, rels, err
 	}
@@ -55,69 +74,563 @@ func ggufMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err er
 		}
 	}
 
-	// Largest number of key value
-
 	return namedPkgs, rels, err
 }
 
-// safeTensorsMergeProcessor mirrors ggufMergeProcessor for SafeTensors packages.
-// When scanning an OCI AI artifact, the model-config blob produces one named
-// package and each safetensors weight layer produces a nameless package. The
-// nameless packages are absorbed into the named one's Parts slice.
+// safeTensorsMergeProcessor is the single owner of naming, license resolution,
+// HF config.json mining, cross-shard rollup, and supporting-evidence attachment
+// for safetensors packages. The parsers feeding it are intentionally minimal:
+// they only decode the safetensors-specific format and emit nameless packages
+// with content-derived metadata. An upstream error or empty input is returned untouched. Otherwise:
 //
-// MetadataHash is intentionally preserved on absorbed parts: it is derived
-// purely from the on-disk safetensors header (see SafeTensorsModelInfo doc),
-// so it acts as the cross-source content fingerprint. For a single-shard
-// model we also copy it up to the named package's top-level MetadataHash so
-// that an OCI scan and a directory scan of the same single .safetensors file
-// expose the hash at the same field — `md.MetadataHash` — without callers
-// having to inspect Parts.
-func safeTensorsMergeProcessor(pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
-	if err != nil {
-		return pkgs, rels, err
-	}
-	if len(pkgs) == 0 {
+//  1. groups packages by the parent directory of their primary location (or a
+//     single sentinel for OCI artifacts, since the ContainerImageModel resolver
+//     puts every layer at "/"); packages with no location at all are dropped;
+//  2. merges the per-shard metadata (tensor count, first non-empty dtype, total
+//     size, UserMetadata, rollup MetadataHash) into one package per group;
+//  3. enriches the merged package by consulting the resolver ONCE per group —
+//     sibling config.json + README.md for dir scans, the model-file companion
+//     layers + license layer for OCI — and attaches those locations as
+//     supporting evidence;
+//  4. picks a name via the precedence chain
+//     config.json _name_or_path → Architecture-Parameters → parent-dir
+//     and drops the group when none of those produced a name (no opaque
+//     fallback / no MetadataHash-as-name).
+func safeTensorsMergeProcessor(ctx context.Context, resolver file.Resolver, pkgs []pkg.Package, rels []artifact.Relationship, err error) ([]pkg.Package, []artifact.Relationship, error) {
+	if err != nil || len(pkgs) == 0 {
 		return pkgs, rels, err
 	}
 
-	var namedPkgs []pkg.Package
-	var namelessParts []pkg.SafeTensorsModelInfo
+	// Defensively split off non-safetensors packages — the cataloger only emits
+	// SafeTensorsModelInfo today; anything else is passed through unchanged and
+	// ends up ahead of the merged models in the returned slice.
+	var stPkgs, other []pkg.Package
 	for _, p := range pkgs {
-		if p.Name != "" {
-			namedPkgs = append(namedPkgs, p)
+		if _, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
+			stPkgs = append(stPkgs, p)
 			continue
 		}
-		if md, ok := p.Metadata.(pkg.SafeTensorsModelInfo); ok {
-			namelessParts = append(namelessParts, md)
+		other = append(other, p)
+	}
+	if len(stPkgs) == 0 {
+		return pkgs, rels, err
+	}
+
+	groups := groupSafeTensorsPackages(stPkgs)
+
+	// Sort the group keys so the output order doesn't depend on map iteration.
+	keys := make([]string, 0, len(groups))
+	for k := range groups {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+
+	out := other
+	for _, key := range keys {
+		merged := mergeSafeTensorsGroup(groups[key])
+		nameOrPath := enrichSafeTensorsGroup(ctx, resolver, key, &merged)
+		name := pickSafeTensorsName(merged, key, nameOrPath)
+		if name == "" {
+			continue // drop unnameable groups, per design (no opaque fallback)
 		}
+		merged.Name = name
+		merged.SetID()
+		out = append(out, merged)
 	}
-
-	if len(namedPkgs) == 0 {
-		return nil, rels, err
-	}
-
-	if len(namedPkgs) == 1 && len(namelessParts) > 0 {
-		// Sort by MetadataHash so OCI layer order (map iteration) doesn't leak
-		// into the SBOM output.
-		sort.Slice(namelessParts, func(i, j int) bool {
-			return namelessParts[i].MetadataHash < namelessParts[j].MetadataHash
-		})
-		winner := &namedPkgs[0]
-		if md, ok := winner.Metadata.(pkg.SafeTensorsModelInfo); ok {
-			md.Parts = namelessParts
-			// Trust per-shard headers over the producer-declared shard count.
-			md.ShardCount = len(namelessParts)
-			// Single-shard: lift the part's content fingerprint to the top
-			// level so the field placement matches a dir-scan single file.
-			// Only lift when the named package has no hash of its own (the
-			// OCI config-blob parser never sets one; dir-scan paths never
-			// produce nameless parts, so they don't reach this branch).
-			if len(namelessParts) == 1 && md.MetadataHash == "" {
-				md.MetadataHash = namelessParts[0].MetadataHash
-			}
-			winner.Metadata = md
-		}
-	}
-
-	return namedPkgs, rels, err
+	return out, rels, nil
+}
+
+// groupSafeTensorsPackages buckets packages by the parent directory of their
+// primary-evidence location, or the OCI sentinel when the location lives at
+// the ContainerImageModel resolver's virtual "/" path.
+func groupSafeTensorsPackages(pkgs []pkg.Package) map[string][]pkg.Package {
+	buckets := make(map[string][]pkg.Package)
+	for _, candidate := range pkgs {
+		// An empty key means the package carries no location to group by, so
+		// it is left out of every bucket.
+		if key := safeTensorsGroupKey(candidate); key != "" {
+			buckets[key] = append(buckets[key], candidate)
+		}
+	}
+	return buckets
+}
+
+// safeTensorsGroupKey derives the grouping key for one package: "" when the
+// package has no location, the OCI sentinel when its chosen location is the
+// virtual "/" path, and otherwise the directory containing that location.
+func safeTensorsGroupKey(p pkg.Package) string {
+	loc := primaryEvidenceLocation(p)
+	switch {
+	case loc == nil:
+		return ""
+	case loc.RealPath == "/":
+		return ociGroupKey
+	default:
+		return path.Dir(loc.RealPath)
+	}
+}
+
+// primaryEvidenceLocation returns the first location annotated as primary
+// evidence, falling back to the first location of the set when none is
+// annotated, and nil when the package has no locations at all.
+func primaryEvidenceLocation(p pkg.Package) *file.Location {
+	all := p.Locations.ToSlice()
+	if len(all) == 0 {
+		return nil
+	}
+	for idx := range all {
+		annotations := all[idx].Annotations
+		if annotations != nil && annotations[pkg.EvidenceAnnotationKey] == pkg.PrimaryEvidenceAnnotation {
+			return &all[idx]
+		}
+	}
+	return &all[0]
+}
+
+// mergeSafeTensorsGroup folds a group's per-member metadata into a single
+// package. Members are classified into two buckets to avoid double-counting:
+//
+//   - "aggregate" members have producer-declared totals (TensorCount, TotalSize,
+//     ShardCount, Parameters) but no MetadataHash — these are the Docker AI
+//     config blob and the sharded-index file.
+//   - "shard" members have a content-derived MetadataHash and per-shard counts —
+//     these are the individual .safetensors header parsers, both dir-scan and
+//     OCI weight-layer.
+//
+// Aggregate values are the source of truth for the merged totals when present;
+// shards contribute Quantization, UserMetadata, the rollup MetadataHash, and
+// (for multi-shard models) the Parts breakdown.
+func mergeSafeTensorsGroup(members []pkg.Package) pkg.Package {
+	locSet := unionLocations(members)
+	aggregates, shards := bucketSafeTensorsMembers(members)
+
+	// Fold shards in MetadataHash order rather than arrival order. Several
+	// merged fields are "first non-empty wins" (Quantization, Parameters, ...)
+	// and UserMetadata keeps the first value per key, so without a fixed order
+	// the result would depend on how the resolver happened to enumerate the
+	// shard files or OCI layers. The sort is stable, so shards with equal
+	// hashes keep their relative input order. shards is a slice freshly built
+	// by bucketSafeTensorsMembers, so sorting it in place touches no caller data.
+	sort.SliceStable(shards, func(i, j int) bool {
+		return shards[i].MetadataHash < shards[j].MetadataHash
+	})
+
+	merged := pkg.SafeTensorsModelInfo{Format: "safetensors"}
+	mergeAggregatesInto(&merged, aggregates)
+	shardTensorTotal, hashes := mergeShardsInto(&merged, shards)
+
+	// Aggregate-declared totals win; the summed per-shard count is only a
+	// fallback when no aggregate supplied one.
+	if merged.TensorCount == 0 {
+		merged.TensorCount = shardTensorTotal
+	}
+	if merged.ShardCount == 0 {
+		if len(shards) > 0 {
+			merged.ShardCount = len(shards)
+		} else {
+			// Aggregate-only group with no declared count: report one shard.
+			merged.ShardCount = 1
+		}
+	}
+	merged.MetadataHash = rollupHash(hashes)
+
+	// Parts only carry value for multi-shard models; for a single shard the
+	// outer view already exposes every per-shard field. shards is already in
+	// MetadataHash order, so the copy is deterministic as-is.
+	if len(shards) > 1 {
+		merged.Parts = append([]pkg.SafeTensorsModelInfo(nil), shards...)
+	}
+
+	return pkg.Package{
+		Locations: locSet,
+		Type:      pkg.ModelPkg,
+		Metadata:  merged,
+	}
+}
+
+// mergeAggregatesInto folds aggregate-declared totals (config blob or sharded
+// index) into merged. First non-empty wins, so the order aggregates are passed
+// in determines tie-breaking — in practice there is one config blob and one
+// index per group, never two of the same kind.
+func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.SafeTensorsModelInfo) {
+	for i := range aggregates {
+		agg := &aggregates[i]
+
+		// Numeric totals: keep whatever an earlier aggregate already supplied.
+		if merged.TensorCount == 0 {
+			merged.TensorCount = agg.TensorCount
+		}
+		if merged.ShardCount == 0 {
+			merged.ShardCount = agg.ShardCount
+		}
+
+		// String fields share the same "first non-empty wins" rule.
+		for _, field := range []struct {
+			dst *string
+			src string
+		}{
+			{&merged.Parameters, agg.Parameters},
+			{&merged.TotalSize, agg.TotalSize},
+			{&merged.Architecture, agg.Architecture},
+			{&merged.Quantization, agg.Quantization},
+			{&merged.TorchDtype, agg.TorchDtype},
+			{&merged.TransformersVersion, agg.TransformersVersion},
+		} {
+			firstNonEmpty(field.dst, field.src)
+		}
+	}
+}
+
+// mergeShardsInto folds the per-shard header metadata into merged, returning
+// the summed shard TensorCount and the list of non-empty per-shard hashes for
+// the rollup. Architecture / TorchDtype / TransformersVersion are accepted as
+// fallbacks if a shard ever carries them (the current parsers don't, but the
+// resolver-backed enrichment runs afterwards and won't overwrite anything
+// already set, so it's safe to populate them earlier).
+func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
+	knownKeys := make(map[string]struct{})
+	for i := range shards {
+		shard := &shards[i]
+		shardTensorTotal += shard.TensorCount
+
+		firstNonEmpty(&merged.Quantization, shard.Quantization)
+		firstNonEmpty(&merged.Parameters, shard.Parameters)
+		firstNonEmpty(&merged.Architecture, shard.Architecture)
+		firstNonEmpty(&merged.TorchDtype, shard.TorchDtype)
+		firstNonEmpty(&merged.TransformersVersion, shard.TransformersVersion)
+
+		// Union of user metadata across shards; the first value seen for a
+		// key is the one that is kept.
+		for _, entry := range shard.UserMetadata {
+			if _, dup := knownKeys[entry.Key]; dup {
+				continue
+			}
+			knownKeys[entry.Key] = struct{}{}
+			merged.UserMetadata = append(merged.UserMetadata, entry)
+		}
+
+		if shard.MetadataHash == "" {
+			continue
+		}
+		hashes = append(hashes, shard.MetadataHash)
+	}
+	return shardTensorTotal, hashes
+}
+
+// firstNonEmpty stores v in *dst only while *dst is still the empty string.
+func firstNonEmpty(dst *string, v string) {
+	if *dst != "" {
+		return
+	}
+	*dst = v
+}
+
+// unionLocations collects the locations of all members into one set.
+func unionLocations(members []pkg.Package) file.LocationSet {
+	union := file.NewLocationSet()
+	for i := range members {
+		for _, loc := range members[i].Locations.ToSlice() {
+			union.Add(loc)
+		}
+	}
+	return union
+}
+
+// bucketSafeTensorsMembers separates group members by whether their metadata
+// carries a MetadataHash: those without one are returned as aggregates (Docker
+// AI config blob or sharded index), those with one as shards (header parsers).
+// Members whose metadata is not SafeTensorsModelInfo are ignored.
+func bucketSafeTensorsMembers(members []pkg.Package) (aggregates, shards []pkg.SafeTensorsModelInfo) {
+	for i := range members {
+		info, isSafeTensors := members[i].Metadata.(pkg.SafeTensorsModelInfo)
+		if !isSafeTensors {
+			continue
+		}
+		if info.MetadataHash == "" {
+			aggregates = append(aggregates, info)
+		} else {
+			shards = append(shards, info)
+		}
+	}
+	return aggregates, shards
+}
+
+// rollupHash combines per-member content hashes into one value that does not
+// depend on their order. No hashes yields ""; a single hash is returned
+// unchanged, so a single-file dir scan and an OCI scan with one safetensors
+// layer surface the same value. For several hashes the result is the xxhash,
+// as 16 hex digits, of the sorted hashes joined with "|".
+func rollupHash(hashes []string) string {
+	switch len(hashes) {
+	case 0:
+		return ""
+	case 1:
+		return hashes[0]
+	}
+	ordered := make([]string, len(hashes))
+	copy(ordered, hashes)
+	sort.Strings(ordered)
+	joined := strings.Join(ordered, "|")
+	return fmt.Sprintf("%016x", xxhash.Sum64String(joined))
+}
+
+// enrichSafeTensorsGroup consults the resolver for the group's companion files
+// and applies what it finds to merged: config-derived metadata fields, the
+// license set (only when at least one license was found), and one
+// supporting-evidence location per consulted file. It returns the raw
+// _name_or_path candidate; the caller applies path.Base when naming.
+func enrichSafeTensorsGroup(ctx context.Context, resolver file.Resolver, groupKey string, merged *pkg.Package) (nameOrPath string) {
+	info := merged.Metadata.(pkg.SafeTensorsModelInfo)
+
+	var (
+		found    []pkg.License
+		evidence []file.Location
+	)
+	switch groupKey {
+	case ociGroupKey:
+		nameOrPath, found, evidence = enrichSafeTensorsOCI(ctx, resolver, &info)
+	default:
+		nameOrPath, found, evidence = enrichSafeTensorsDir(ctx, resolver, groupKey, &info)
+	}
+
+	merged.Metadata = info
+	if len(found) > 0 {
+		merged.Licenses = pkg.NewLicenseSet(found...)
+	}
+	for _, loc := range evidence {
+		annotated := loc.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.SupportingEvidenceAnnotation)
+		merged.Locations.Add(annotated)
+	}
+	return nameOrPath
+}
+
+// enrichSafeTensorsDir handles the directory-scan case: look for sibling
+// config.json and README.md next to the model files.
+func enrichSafeTensorsDir(ctx context.Context, resolver file.Resolver, dir string, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) {
+	configPath := path.Join(dir, "config.json")
+	if configLoc, cfg := readDirHFConfig(resolver, configPath); cfg != nil {
+		applyHFConfig(md, cfg)
+		nameOrPath = cfg.NameOrPath
+		supporting = append(supporting, *configLoc)
+	}
+
+	readmePath := path.Join(dir, "README.md")
+	readmeLoc, frontmatter := readDirReadmeFrontmatter(resolver, readmePath)
+	if frontmatter == nil {
+		return nameOrPath, lics, supporting
+	}
+
+	if frontmatter.License != "" {
+		lics = pkg.NewLicensesFromValuesWithContext(ctx, frontmatter.License)
+	}
+	// The README's base_model is only a fallback for a missing _name_or_path.
+	if nameOrPath == "" && len(frontmatter.BaseModel) > 0 {
+		nameOrPath = frontmatter.BaseModel[0]
+	}
+	supporting = append(supporting, *readmeLoc)
+	return nameOrPath, lics, supporting
+}
+
+// enrichSafeTensorsOCI handles the OCI-artifact case. Every
+// vnd.docker.ai.model.file layer is content-sniffed (READMEs and HF
+// config.json share that media type); when no README declared a license, the
+// vnd.docker.ai.license layers are handed to the shared license scanner.
+// A resolver that cannot look up layers by media type yields no enrichment.
+func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.SafeTensorsModelInfo) (nameOrPath string, lics []pkg.License, supporting []file.Location) {
+	ociResolver, ok := resolver.(file.OCIMediaTypeResolver)
+	if !ok {
+		return "", nil, nil
+	}
+
+	modelFileLocs, err := ociResolver.FilesByMediaType(dockerAIModelFileMediaType)
+	if err != nil {
+		log.Debugf("failed to list docker AI model-file layers: %v", err)
+	}
+
+	// Config and README candidates are accumulated separately so that the
+	// order in which the resolver returns layers doesn't decide precedence.
+	var fromConfig, fromReadme, readmeLicense string
+	for _, layer := range modelFileLocs {
+		classified := classifyOCIModelFileLayer(resolver, layer, md, &fromConfig, &fromReadme, &readmeLicense)
+		if !classified {
+			continue
+		}
+		supporting = append(supporting, layer)
+	}
+
+	// Precedence: config.json _name_or_path > README base_model.
+	nameOrPath = fromReadme
+	if fromConfig != "" {
+		nameOrPath = fromConfig
+	}
+
+	// A README-declared license takes precedence over the license layer.
+	if readmeLicense != "" {
+		lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
+		return nameOrPath, lics, supporting
+	}
+
+	// Otherwise fall back to the license layer via the shared scanner (which
+	// understands SPDX text far better than a hand-rolled substring match).
+	licLocs, lErr := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
+	if lErr != nil {
+		log.Debugf("failed to list docker AI license layers: %v", lErr)
+	}
+	if len(licLocs) > 0 {
+		lics = licenses.FindAtLocations(ctx, resolver, licLocs...)
+		supporting = append(supporting, licLocs...)
+	}
+	return nameOrPath, lics, supporting
+}
+
+// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
+// classifies it as README frontmatter or HF config.json based on its leading
+// bytes. Side-effects: applies HF config fields onto md, accumulates name and
+// license candidates via the out-params. Returns true when the layer was
+// successfully classified (and should be recorded as supporting evidence).
+func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
+	rc, err := resolver.FileContentsByLocation(loc)
+	if err != nil {
+		return false
+	}
+	defer internal.CloseAndLogError(rc, loc.RealPath)
+
+	buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
+	if err != nil {
+		return false
+	}
+	trimmed := trimLeadingWhitespace(buf)
+	switch {
+	case hasPrefix(trimmed, "---"):
+		// parseFrontmatter does its own leading-byte trimming, so it gets the
+		// raw buffer.
+		fm := parseFrontmatter(buf)
+		if fm == nil {
+			return false
+		}
+		if *license == "" {
+			*license = fm.License
+		}
+		if *readmeName == "" && len(fm.BaseModel) > 0 {
+			*readmeName = fm.BaseModel[0]
+		}
+		return true
+	case hasPrefix(trimmed, "{"):
+		// Decode the same trimmed bytes that were sniffed: encoding/json
+		// rejects a leading UTF-8 byte-order mark, so decoding the raw buffer
+		// would fail for a BOM-prefixed config.json that the sniff accepted.
+		var cfg hfConfig
+		if err := json.Unmarshal(trimmed, &cfg); err != nil {
+			return false
+		}
+		applyHFConfig(md, &cfg)
+		if *configName == "" && cfg.NameOrPath != "" {
+			*configName = cfg.NameOrPath
+		}
+		return true
+	}
+	return false
+}
+
+// applyHFConfig folds the subset of HF config.json fields we surface in our
+// metadata onto md. Fields already populated on md are left alone — earlier
+// content-derived values (Quantization, TensorCount, etc., from header bytes)
+// always win over producer-declared ones in case of conflict.
+func applyHFConfig(md *pkg.SafeTensorsModelInfo, cfg *hfConfig) {
+	if md.Architecture == "" && len(cfg.Architectures) > 0 {
+		md.Architecture = cfg.Architectures[0]
+	}
+	if md.TorchDtype == "" {
+		md.TorchDtype = cfg.TorchDtype
+	}
+	if md.TransformersVersion == "" {
+		md.TransformersVersion = cfg.TransformersVersion
+	}
+}
+
+// pickSafeTensorsName implements the documented naming precedence chain:
+//
+//  1. config.json _name_or_path (path.Base, so "org/Model" → "Model"); skipped
+//     when the base is not a usable name (".", ".." or "/")
+//  2. OCI manifest title (deferred to a follow-up; reserved here)
+//  3. Architecture-Parameters synthetic (only when both are populated)
+//  4. parent directory of the group (dir-scan only — OCI has no useful path)
+//
+// Returns "" to signal the merge processor should drop the group rather than
+// invent a name.
+func pickSafeTensorsName(merged pkg.Package, groupKey, nameOrPath string) string {
+	md, _ := merged.Metadata.(pkg.SafeTensorsModelInfo)
+
+	if nameOrPath != "" {
+		// A _name_or_path such as "." or "./" or "/" has no meaningful base;
+		// fall through to the next rung instead of emitting a package that is
+		// literally named "." or "/". This mirrors the guard on the
+		// parent-directory rung below.
+		if base := path.Base(nameOrPath); base != "." && base != ".." && base != "/" {
+			return base
+		}
+	}
+	// 2. OCI manifest title — follow-up.
+
+	if md.Architecture != "" && md.Parameters != "" {
+		return md.Architecture + "-" + md.Parameters
+	}
+
+	if groupKey != ociGroupKey {
+		base := path.Base(groupKey)
+		if base != "" && base != "." && base != "/" {
+			return base
+		}
+	}
+	return ""
+}
+
+// --- Relocated enrichment helpers ----------------------------------------
+//
+// These types and functions used to live in the parser files; they moved here
+// when the parsers shrank to "just decode the safetensors-specific format" and
+// every resolver-backed read centralized in the merge processor.
+
+// hfConfig is a minimal projection of Hugging Face config.json fields.
+type hfConfig struct {
+	Architectures       []string `json:"architectures"`
+	TorchDtype          string   `json:"torch_dtype"`
+	TransformersVersion string   `json:"transformers_version"`
+	NameOrPath          string   `json:"_name_or_path"`
+}
+
+// readmeFrontmatter holds the subset of YAML frontmatter fields we extract.
+type readmeFrontmatter struct {
+	License   string   `yaml:"license"`
+	BaseModel []string `yaml:"base_model"`
+}
+
+// readDirHFConfig resolves p, decodes the first match as a Hugging Face
+// config.json, and returns that location together with the decoded config.
+// Both results are nil when the path cannot be resolved or opened, or when the
+// JSON does not decode.
+func readDirHFConfig(resolver file.Resolver, p string) (*file.Location, *hfConfig) {
+	matches, err := resolver.FilesByPath(p)
+	if err != nil {
+		return nil, nil
+	}
+	if len(matches) == 0 {
+		return nil, nil
+	}
+
+	contents, err := resolver.FileContentsByLocation(matches[0])
+	if err != nil {
+		return nil, nil
+	}
+	defer internal.CloseAndLogError(contents, p)
+
+	decoded := new(hfConfig)
+	if decodeErr := json.NewDecoder(contents).Decode(decoded); decodeErr != nil {
+		log.Debugf("failed to decode %s: %v", p, decodeErr)
+		return nil, nil
+	}
+	return &matches[0], decoded
+}
+
+// readDirReadmeFrontmatter resolves p, reads at most 1 MiB of the first match,
+// and returns that location together with its parsed frontmatter. Both results
+// are nil when the path cannot be resolved, opened or read, or when the
+// content has no parseable frontmatter.
+func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, *readmeFrontmatter) {
+	matches, err := resolver.FilesByPath(p)
+	if err != nil {
+		return nil, nil
+	}
+	if len(matches) == 0 {
+		return nil, nil
+	}
+
+	contents, err := resolver.FileContentsByLocation(matches[0])
+	if err != nil {
+		return nil, nil
+	}
+	defer internal.CloseAndLogError(contents, p)
+
+	head, err := io.ReadAll(io.LimitReader(contents, 1024*1024))
+	if err != nil {
+		return nil, nil
+	}
+	if parsed := parseFrontmatter(head); parsed != nil {
+		return &matches[0], parsed
+	}
+	return nil, nil
+}
+
+// parseFrontmatter pulls the YAML block between the first and second "---"
+// lines of a file (if present) and decodes the fields we care about. base_model
+// is decoded via yaml.Node so a scalar value ("org/model") doesn't fail the
+// whole block.
+func parseFrontmatter(buf []byte) *readmeFrontmatter {
+	// Use the same leading-byte rule as the OCI content sniffer so the two can
+	// never disagree about whether a file starts with frontmatter.
+	trimmed := trimLeadingWhitespace(buf)
+	if !hasPrefix(trimmed, "---") {
+		return nil
+	}
+	// Skip the remainder of the opening delimiter line.
+	rest := trimmed[3:]
+	if i := bytes.IndexByte(rest, '\n'); i >= 0 {
+		rest = rest[i+1:]
+	}
+	// The block ends at the next line that starts with "---"; without a
+	// closing delimiter there is no frontmatter.
+	end := bytes.Index(rest, []byte("\n---"))
+	if end < 0 {
+		return nil
+	}
+
+	var raw struct {
+		License   string    `yaml:"license"`
+		BaseModel yaml.Node `yaml:"base_model"`
+	}
+	if err := yaml.Unmarshal(rest[:end], &raw); err != nil {
+		log.Debugf("failed to parse README frontmatter: %v", err)
+		return nil
+	}
+
+	fm := readmeFrontmatter{License: raw.License}
+	switch raw.BaseModel.Kind {
+	case yaml.ScalarNode:
+		if raw.BaseModel.Value != "" {
+			fm.BaseModel = []string{raw.BaseModel.Value}
+		}
+	case yaml.SequenceNode:
+		// Best effort: a sequence that does not decode into strings leaves
+		// BaseModel unset instead of discarding the license as well.
+		_ = raw.BaseModel.Decode(&fm.BaseModel)
+	}
+	return &fm
+}
+
+// hasPrefix reports whether b begins with the bytes of s.
+func hasPrefix(b []byte, s string) bool {
+	return bytes.HasPrefix(b, []byte(s))
+}
+
+// trimLeadingWhitespace returns b without any leading run of spaces, tabs,
+// carriage returns, newlines and UTF-8 byte-order marks, in whatever order
+// they appear. A byte-order mark followed by blank lines is therefore stripped
+// just like blank lines followed by a byte-order mark.
+func trimLeadingWhitespace(b []byte) []byte {
+	return bytes.TrimLeft(b, "\xef\xbb\xbf \t\r\n")
+}