From 15bd509e156b9c1159594d787513da7a32ea613e Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Fri, 29 May 2026 03:39:23 -0400 Subject: [PATCH] fix: remove index parsing Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- syft/pkg/cataloger/ai/cataloger.go | 13 ++- .../cataloger/ai/parse_safetensors_model.go | 83 ++----------------- .../cataloger/ai/parse_safetensors_test.go | 49 ----------- 3 files changed, 15 insertions(+), 130 deletions(-) diff --git a/syft/pkg/cataloger/ai/cataloger.go b/syft/pkg/cataloger/ai/cataloger.go index d5ddea316..5b7a75882 100644 --- a/syft/pkg/cataloger/ai/cataloger.go +++ b/syft/pkg/cataloger/ai/cataloger.go @@ -26,18 +26,23 @@ func NewGGUFCataloger() pkg.Cataloger { } // NewSafeTensorsCataloger returns a cataloger for SafeTensors model files, -// covering four discovery paths: -// - **/*.safetensors files (single-file models; header-only parse) -// - **/model.safetensors.index.json files (sharded models) +// covering three discovery paths: +// - **/*.safetensors files (single-file models and individual shards; +// header-only parse) // - application/vnd.docker.ai.model.config.v0.1+json / v0.2+json OCI layers // (Docker Model Runner artifacts whose config advertises format=="safetensors") // - application/vnd.docker.ai.safetensors OCI layers (per-shard JSON headers, // fetched as a prefix by the OCI model source; emitted as nameless // packages and merged into the config-derived package as Parts) +// +// model.safetensors.index.json files are intentionally not parsed today: the +// index describes how tensors map to shards but contributes no metadata the +// cataloger can't derive from the shard headers themselves. If a model is +// distributed as just an index.json with no accompanying shard files, the +// cataloger emits nothing for that directory. func NewSafeTensorsCataloger() pkg.Cataloger { return generic.NewCataloger(safeTensorsCatalogerName). WithParserByGlobs(parseSafeTensorsFile, "**/*.safetensors"). - WithParserByGlobs(parseSafeTensorsIndex, "**/*.safetensors.index.json"). WithParserByMediaType(parseSafeTensorsOCIConfig, dockerAIModelConfigMediaTypes...). WithParserByMediaType(parseSafeTensorsOCILayer, dockerAISafeTensorsMediaType). WithResolvingProcessors(safeTensorsMergeProcessor) diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go index 97c4ac4be..28bf11b78 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_model.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go @@ -2,10 +2,8 @@ package ai import ( "context" - "encoding/json" "fmt" "io" - "strconv" "github.com/anchore/syft/internal" "github.com/anchore/syft/internal/unknown" @@ -16,9 +14,10 @@ import ( ) // parseSafeTensorsFile decodes the JSON header of a single .safetensors file -// and emits a nameless package whose metadata is derived purely from the -// header bytes. Naming, license resolution, sibling enrichment, and cross- -// shard rollup are all the responsibility of safeTensorsMergeProcessor. +// (also called once per shard for sharded models) and emits a nameless package +// whose metadata is derived purely from the header bytes. Naming, license +// resolution, sibling enrichment, and cross-shard rollup are all handled by +// safeTensorsMergeProcessor. func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { defer internal.CloseAndLogError(reader, reader.Path()) @@ -46,45 +45,6 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file") } -// parseSafeTensorsIndex decodes a model.safetensors.index.json file for a -// sharded model and emits a nameless package recording tensor count, unique -// shard count, and (when present) the producer-declared total_size. Like -// parseSafeTensorsFile, naming and sibling enrichment happen in the merge -// processor. -func parseSafeTensorsIndex(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { - defer internal.CloseAndLogError(reader, reader.Path()) - - var doc struct { - Metadata struct { - TotalSize json.Number `json:"total_size"` - } `json:"metadata"` - WeightMap map[string]string `json:"weight_map"` - } - if err := json.NewDecoder(reader).Decode(&doc); err != nil { - return nil, nil, fmt.Errorf("failed to decode safetensors index JSON: %w", err) - } - - shards := make(map[string]struct{}, 4) - for _, shard := range doc.WeightMap { - shards[shard] = struct{}{} - } - - md := pkg.SafeTensorsModelInfo{ - Format: "safetensors", - TensorCount: uint64(len(doc.WeightMap)), - ShardCount: len(shards), - } - if doc.Metadata.TotalSize != "" { - md.TotalSize = formatByteSize(doc.Metadata.TotalSize.String()) - } - - p := newSafeTensorsPackage( - &md, - reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation), - ) - return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors index") -} - // formatParameterCount prints a count like 6_700_000_000 as "6.70B" using // B/M/K thresholds matching the notation used by Hugging Face and Docker AI // labels. @@ -101,36 +61,5 @@ func formatParameterCount(n uint64) string { } } -// formatByteSize turns a numeric string (bytes) into a human-friendly size -// like "71.90GB". Non-numeric inputs are passed through unchanged so producer- -// declared strings (e.g. "71.90GB" from a Docker AI config blob) survive. -func formatByteSize(s string) string { - n, err := strconv.ParseUint(s, 10, 64) - if err != nil || n == 0 { - return s - } - const ( - kb = 1024 - mb = kb * 1024 - gb = mb * 1024 - tb = gb * 1024 - ) - switch { - case n >= tb: - return fmt.Sprintf("%.2fTB", float64(n)/float64(tb)) - case n >= gb: - return fmt.Sprintf("%.2fGB", float64(n)/float64(gb)) - case n >= mb: - return fmt.Sprintf("%.2fMB", float64(n)/float64(mb)) - case n >= kb: - return fmt.Sprintf("%.2fKB", float64(n)/float64(kb)) - default: - return fmt.Sprintf("%dB", n) - } -} - -// integrity checks -var ( - _ generic.Parser = parseSafeTensorsFile - _ generic.Parser = parseSafeTensorsIndex -) +// integrity check +var _ generic.Parser = parseSafeTensorsFile diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go index 5b6513d52..7efe86d38 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_test.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go @@ -88,42 +88,6 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) { TestCataloger(t, NewSafeTensorsCataloger()) } -func TestSafeTensorsCataloger_shardedIndex(t *testing.T) { - dir := t.TempDir() - modelDir := filepath.Join(dir, "my-model") - require.NoError(t, os.MkdirAll(modelDir, 0o755)) - index := `{ - "metadata": {"total_size": 16000000000}, - "weight_map": { - "layer.0.weight": "model-00001-of-00002.safetensors", - "layer.1.weight": "model-00001-of-00002.safetensors", - "layer.2.weight": "model-00002-of-00002.safetensors" - } - }` - require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model.safetensors.index.json"), []byte(index), 0o644)) - - expected := []pkg.Package{ - { - Name: "my-model", - Type: pkg.ModelPkg, - Licenses: pkg.NewLicenseSet(), - Metadata: pkg.SafeTensorsModelInfo{ - Format: "safetensors", - TensorCount: 3, - ShardCount: 2, - TotalSize: "14.90GB", - }, - }, - } - - pkgtest.NewCatalogTester(). - FromDirectory(t, dir). - Expects(expected, nil). - IgnoreLocationLayer(). - IgnorePackageFields("FoundBy", "Locations"). - TestCataloger(t, NewSafeTensorsCataloger()) -} - // TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit // a nameless package mirroring the config blob's producer-declared fields, and // emit nothing for non-safetensors formats so the GGUF cataloger can claim the @@ -737,19 +701,6 @@ func TestFormatParameterCount(t *testing.T) { } } -func TestFormatByteSize(t *testing.T) { - cases := map[string]string{ - "16000000000": "14.90GB", - "2048": "2.00KB", - "500": "500B", - "71.90GB": "71.90GB", // non-numeric passes through unchanged - "": "", - } - for in, want := range cases { - assert.Equalf(t, want, formatByteSize(in), "formatByteSize(%q)", in) - } -} - func TestParseFrontmatter(t *testing.T) { t.Run("list base_model", func(t *testing.T) { fm := parseFrontmatter([]byte("---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody"))