fix: remove index parsing

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2026-05-29 03:39:23 -04:00
parent 19ea799cd2
commit 15bd509e15
No known key found for this signature in database
3 changed files with 15 additions and 130 deletions

View File

@ -26,18 +26,23 @@ func NewGGUFCataloger() pkg.Cataloger {
}
// NewSafeTensorsCataloger returns a cataloger for SafeTensors model files,
// covering four discovery paths:
// - **/*.safetensors files (single-file models; header-only parse)
// - **/model.safetensors.index.json files (sharded models)
// covering three discovery paths:
// - **/*.safetensors files (single-file models and individual shards;
// header-only parse)
// - application/vnd.docker.ai.model.config.v0.1+json / v0.2+json OCI layers
// (Docker Model Runner artifacts whose config advertises format=="safetensors")
// - application/vnd.docker.ai.safetensors OCI layers (per-shard JSON headers,
// fetched as a prefix by the OCI model source; emitted as nameless
// packages and merged into the config-derived package as Parts)
//
// model.safetensors.index.json files are intentionally not parsed today: the
// index describes how tensors map to shards but contributes no metadata the
// cataloger can't derive from the shard headers themselves. If a model is
// distributed as just an index.json with no accompanying shard files, the
// cataloger emits nothing for that directory.
func NewSafeTensorsCataloger() pkg.Cataloger {
return generic.NewCataloger(safeTensorsCatalogerName).
WithParserByGlobs(parseSafeTensorsFile, "**/*.safetensors").
WithParserByGlobs(parseSafeTensorsIndex, "**/*.safetensors.index.json").
WithParserByMediaType(parseSafeTensorsOCIConfig, dockerAIModelConfigMediaTypes...).
WithParserByMediaType(parseSafeTensorsOCILayer, dockerAISafeTensorsMediaType).
WithResolvingProcessors(safeTensorsMergeProcessor)

View File

@ -2,10 +2,8 @@ package ai
import (
"context"
"encoding/json"
"fmt"
"io"
"strconv"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/unknown"
@ -16,9 +14,10 @@ import (
)
// parseSafeTensorsFile decodes the JSON header of a single .safetensors file
// and emits a nameless package whose metadata is derived purely from the
// header bytes. Naming, license resolution, sibling enrichment, and cross-
// shard rollup are all the responsibility of safeTensorsMergeProcessor.
// (also called once per shard for sharded models) and emits a nameless package
// whose metadata is derived purely from the header bytes. Naming, license
// resolution, sibling enrichment, and cross-shard rollup are all handled by
// safeTensorsMergeProcessor.
func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
defer internal.CloseAndLogError(reader, reader.Path())
@ -46,45 +45,6 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file")
}
// parseSafeTensorsIndex reads a model.safetensors.index.json document and
// summarizes it as a single nameless package. The summary records the number
// of weight_map entries (tensor count), the number of distinct shard file
// names those entries point at (shard count), and the producer-declared
// metadata.total_size when the document provides one. A JSON decoding failure
// is returned as a wrapped error and no package is emitted.
func parseSafeTensorsIndex(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
	defer internal.CloseAndLogError(reader, reader.Path())

	// Only the fields needed for the summary are declared; the decoder
	// ignores anything else present in the document.
	type indexMetadata struct {
		TotalSize json.Number `json:"total_size"`
	}
	var index struct {
		Metadata  indexMetadata     `json:"metadata"`
		WeightMap map[string]string `json:"weight_map"`
	}
	if err := json.NewDecoder(reader).Decode(&index); err != nil {
		return nil, nil, fmt.Errorf("failed to decode safetensors index JSON: %w", err)
	}

	// Several weight_map entries may name the same shard file, so collapse
	// the values into a set before counting.
	uniqueShards := make(map[string]struct{}, 4)
	for _, shardFile := range index.WeightMap {
		uniqueShards[shardFile] = struct{}{}
	}

	info := pkg.SafeTensorsModelInfo{
		Format:      "safetensors",
		TensorCount: uint64(len(index.WeightMap)),
		ShardCount:  len(uniqueShards),
	}
	// An absent total_size leaves the json.Number empty; keep TotalSize unset then.
	if declared := index.Metadata.TotalSize; declared != "" {
		info.TotalSize = formatByteSize(declared.String())
	}

	evidence := reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)
	pkgs := []pkg.Package{newSafeTensorsPackage(&info, evidence)}

	return pkgs, nil, unknown.IfEmptyf(pkgs, "unable to parse safetensors index")
}
// formatParameterCount prints a count like 6_700_000_000 as "6.70B" using
// B/M/K thresholds matching the notation used by Hugging Face and Docker AI
// labels.
@ -101,36 +61,5 @@ func formatParameterCount(n uint64) string {
}
}
// formatByteSize renders a decimal byte count given as a string (for example
// "16000000000") in a compact form such as "14.90GB", using 1024-based units
// and two decimal places for KB and above; values below 1024 are printed as
// whole bytes ("500B").
//
// Anything that does not parse as an unsigned base-10 integer is returned
// untouched, so a size that a producer already formatted (e.g. "71.90GB")
// is preserved. A parsed value of zero is likewise returned as given.
func formatByteSize(s string) string {
	size, err := strconv.ParseUint(s, 10, 64)
	if err != nil || size == 0 {
		return s
	}

	// Ordered largest first so the first threshold reached picks the unit.
	units := [...]struct {
		suffix    string
		threshold uint64
	}{
		{suffix: "TB", threshold: 1 << 40},
		{suffix: "GB", threshold: 1 << 30},
		{suffix: "MB", threshold: 1 << 20},
		{suffix: "KB", threshold: 1 << 10},
	}
	for _, u := range units {
		if size >= u.threshold {
			return fmt.Sprintf("%.2f%s", float64(size)/float64(u.threshold), u.suffix)
		}
	}

	return fmt.Sprintf("%dB", size)
}
// integrity checks: compile-time assertions that both parser functions are
// assignable to generic.Parser; the blank identifier discards the values.
var (
_ generic.Parser = parseSafeTensorsFile
_ generic.Parser = parseSafeTensorsIndex
)
// integrity check: compile-time assertion that parseSafeTensorsFile is
// assignable to generic.Parser; the blank identifier discards the value.
var _ generic.Parser = parseSafeTensorsFile

View File

@ -88,42 +88,6 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) {
TestCataloger(t, NewSafeTensorsCataloger())
}
// TestSafeTensorsCataloger_shardedIndex runs the SafeTensors cataloger over a
// temporary directory that contains only a model.safetensors.index.json file
// and expects a single package whose name matches the enclosing directory
// ("my-model") and whose metadata reflects the index contents.
func TestSafeTensorsCataloger_shardedIndex(t *testing.T) {
	root := t.TempDir()
	modelDir := filepath.Join(root, "my-model")
	require.NoError(t, os.MkdirAll(modelDir, 0o755))

	// Three tensors spread over two distinct shard files.
	const indexJSON = `{
"metadata": {"total_size": 16000000000},
"weight_map": {
"layer.0.weight": "model-00001-of-00002.safetensors",
"layer.1.weight": "model-00001-of-00002.safetensors",
"layer.2.weight": "model-00002-of-00002.safetensors"
}
}`
	indexPath := filepath.Join(modelDir, "model.safetensors.index.json")
	require.NoError(t, os.WriteFile(indexPath, []byte(indexJSON), 0o644))

	want := pkg.Package{
		Name:     "my-model",
		Type:     pkg.ModelPkg,
		Licenses: pkg.NewLicenseSet(),
		Metadata: pkg.SafeTensorsModelInfo{
			Format:      "safetensors",
			TensorCount: 3,
			ShardCount:  2,
			TotalSize:   "14.90GB",
		},
	}

	// FoundBy and Locations are excluded from the comparison.
	tester := pkgtest.NewCatalogTester().
		FromDirectory(t, root).
		Expects([]pkg.Package{want}, nil).
		IgnoreLocationLayer().
		IgnorePackageFields("FoundBy", "Locations")
	tester.TestCataloger(t, NewSafeTensorsCataloger())
}
// TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit
// a nameless package mirroring the config blob's producer-declared fields, and
// emit nothing for non-safetensors formats so the GGUF cataloger can claim the
@ -737,19 +701,6 @@ func TestFormatParameterCount(t *testing.T) {
}
}
// TestFormatByteSize checks formatByteSize against a fixed table covering
// the GB, KB and plain-byte outputs as well as inputs that are handed back
// unchanged.
func TestFormatByteSize(t *testing.T) {
	tests := []struct {
		input string
		want  string
	}{
		{input: "16000000000", want: "14.90GB"},
		{input: "2048", want: "2.00KB"},
		{input: "500", want: "500B"},
		{input: "71.90GB", want: "71.90GB"}, // non-numeric passes through unchanged
		{input: "", want: ""},
	}

	for _, tc := range tests {
		assert.Equalf(t, tc.want, formatByteSize(tc.input), "formatByteSize(%q)", tc.input)
	}
}
func TestParseFrontmatter(t *testing.T) {
t.Run("list base_model", func(t *testing.T) {
fm := parseFrontmatter([]byte("---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody"))