mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
fix: remove index parsing
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
19ea799cd2
commit
15bd509e15
@ -26,18 +26,23 @@ func NewGGUFCataloger() pkg.Cataloger {
|
||||
}
|
||||
|
||||
// NewSafeTensorsCataloger returns a cataloger for SafeTensors model files,
|
||||
// covering four discovery paths:
|
||||
// - **/*.safetensors files (single-file models; header-only parse)
|
||||
// - **/model.safetensors.index.json files (sharded models)
|
||||
// covering three discovery paths:
|
||||
// - **/*.safetensors files (single-file models and individual shards;
|
||||
// header-only parse)
|
||||
// - application/vnd.docker.ai.model.config.v0.1+json / v0.2+json OCI layers
|
||||
// (Docker Model Runner artifacts whose config advertises format=="safetensors")
|
||||
// - application/vnd.docker.ai.safetensors OCI layers (per-shard JSON headers,
|
||||
// fetched as a prefix by the OCI model source; emitted as nameless
|
||||
// packages and merged into the config-derived package as Parts)
|
||||
//
|
||||
// model.safetensors.index.json files are intentionally not parsed today: the
|
||||
// index describes how tensors map to shards but contributes no metadata the
|
||||
// cataloger can't derive from the shard headers themselves. If a model is
|
||||
// distributed as just an index.json with no accompanying shard files, the
|
||||
// cataloger emits nothing for that directory.
|
||||
func NewSafeTensorsCataloger() pkg.Cataloger {
|
||||
return generic.NewCataloger(safeTensorsCatalogerName).
|
||||
WithParserByGlobs(parseSafeTensorsFile, "**/*.safetensors").
|
||||
WithParserByGlobs(parseSafeTensorsIndex, "**/*.safetensors.index.json").
|
||||
WithParserByMediaType(parseSafeTensorsOCIConfig, dockerAIModelConfigMediaTypes...).
|
||||
WithParserByMediaType(parseSafeTensorsOCILayer, dockerAISafeTensorsMediaType).
|
||||
WithResolvingProcessors(safeTensorsMergeProcessor)
|
||||
|
||||
@ -2,10 +2,8 @@ package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
|
||||
"github.com/anchore/syft/internal"
|
||||
"github.com/anchore/syft/internal/unknown"
|
||||
@ -16,9 +14,10 @@ import (
|
||||
)
|
||||
|
||||
// parseSafeTensorsFile decodes the JSON header of a single .safetensors file
|
||||
// and emits a nameless package whose metadata is derived purely from the
|
||||
// header bytes. Naming, license resolution, sibling enrichment, and cross-
|
||||
// shard rollup are all the responsibility of safeTensorsMergeProcessor.
|
||||
// (also called once per shard for sharded models) and emits a nameless package
|
||||
// whose metadata is derived purely from the header bytes. Naming, license
|
||||
// resolution, sibling enrichment, and cross-shard rollup are all handled by
|
||||
// safeTensorsMergeProcessor.
|
||||
func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
defer internal.CloseAndLogError(reader, reader.Path())
|
||||
|
||||
@ -46,45 +45,6 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
|
||||
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors file")
|
||||
}
|
||||
|
||||
// parseSafeTensorsIndex decodes a model.safetensors.index.json file for a
|
||||
// sharded model and emits a nameless package recording tensor count, unique
|
||||
// shard count, and (when present) the producer-declared total_size. Like
|
||||
// parseSafeTensorsFile, naming and sibling enrichment happen in the merge
|
||||
// processor.
|
||||
func parseSafeTensorsIndex(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
||||
defer internal.CloseAndLogError(reader, reader.Path())
|
||||
|
||||
var doc struct {
|
||||
Metadata struct {
|
||||
TotalSize json.Number `json:"total_size"`
|
||||
} `json:"metadata"`
|
||||
WeightMap map[string]string `json:"weight_map"`
|
||||
}
|
||||
if err := json.NewDecoder(reader).Decode(&doc); err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to decode safetensors index JSON: %w", err)
|
||||
}
|
||||
|
||||
shards := make(map[string]struct{}, 4)
|
||||
for _, shard := range doc.WeightMap {
|
||||
shards[shard] = struct{}{}
|
||||
}
|
||||
|
||||
md := pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: uint64(len(doc.WeightMap)),
|
||||
ShardCount: len(shards),
|
||||
}
|
||||
if doc.Metadata.TotalSize != "" {
|
||||
md.TotalSize = formatByteSize(doc.Metadata.TotalSize.String())
|
||||
}
|
||||
|
||||
p := newSafeTensorsPackage(
|
||||
&md,
|
||||
reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation),
|
||||
)
|
||||
return []pkg.Package{p}, nil, unknown.IfEmptyf([]pkg.Package{p}, "unable to parse safetensors index")
|
||||
}
|
||||
|
||||
// formatParameterCount prints a count like 6_700_000_000 as "6.70B" using
|
||||
// B/M/K thresholds matching the notation used by Hugging Face and Docker AI
|
||||
// labels.
|
||||
@ -101,36 +61,5 @@ func formatParameterCount(n uint64) string {
|
||||
}
|
||||
}
|
||||
|
||||
// formatByteSize turns a numeric string (a byte count in base 10) into a
// human-friendly size such as "71.90GB", using 1024-based units.
//
// Inputs that do not parse as an unsigned 64-bit integer are returned
// unchanged so producer-declared strings (e.g. "71.90GB" from a Docker AI
// config blob) survive. A parsed value of zero is also returned unchanged.
//
// Values below 1024 are printed as whole bytes ("500B"). Larger values are
// printed with two decimals in the largest unit (KB, MB, GB, TB) that keeps
// the number below 1024. If rounding to two decimals would print "1024.00" of
// a unit (e.g. 1048575 bytes), the value is promoted to the next unit so the
// result reads "1.00MB" rather than "1024.00KB". TB is the largest unit, so
// very large values stay in TB.
func formatByteSize(s string) string {
	n, err := strconv.ParseUint(s, 10, 64)
	if err != nil || n == 0 {
		return s
	}

	const step = 1024
	if n < step {
		return strconv.FormatUint(n, 10) + "B"
	}

	units := [...]string{"KB", "MB", "GB", "TB"}

	// Dividing by a power of two is exact in float64, so stepping unit by unit
	// gives the same quotient as dividing by the unit size directly.
	scaled := float64(n) / step
	idx := 0
	text := fmt.Sprintf("%.2f", scaled)
	// Move up a unit while the value is still >= 1024, or while two-decimal
	// rounding would display it as exactly 1024.00 of the current unit.
	for idx < len(units)-1 && (scaled >= step || text == "1024.00") {
		scaled /= step
		idx++
		text = fmt.Sprintf("%.2f", scaled)
	}
	return text + units[idx]
}
|
||||
|
||||
// integrity checks
|
||||
var (
|
||||
_ generic.Parser = parseSafeTensorsFile
|
||||
_ generic.Parser = parseSafeTensorsIndex
|
||||
)
|
||||
// integrity check
|
||||
var _ generic.Parser = parseSafeTensorsFile
|
||||
|
||||
@ -88,42 +88,6 @@ func TestSafeTensorsCataloger_singleFile(t *testing.T) {
|
||||
TestCataloger(t, NewSafeTensorsCataloger())
|
||||
}
|
||||
|
||||
func TestSafeTensorsCataloger_shardedIndex(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
modelDir := filepath.Join(dir, "my-model")
|
||||
require.NoError(t, os.MkdirAll(modelDir, 0o755))
|
||||
index := `{
|
||||
"metadata": {"total_size": 16000000000},
|
||||
"weight_map": {
|
||||
"layer.0.weight": "model-00001-of-00002.safetensors",
|
||||
"layer.1.weight": "model-00001-of-00002.safetensors",
|
||||
"layer.2.weight": "model-00002-of-00002.safetensors"
|
||||
}
|
||||
}`
|
||||
require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model.safetensors.index.json"), []byte(index), 0o644))
|
||||
|
||||
expected := []pkg.Package{
|
||||
{
|
||||
Name: "my-model",
|
||||
Type: pkg.ModelPkg,
|
||||
Licenses: pkg.NewLicenseSet(),
|
||||
Metadata: pkg.SafeTensorsModelInfo{
|
||||
Format: "safetensors",
|
||||
TensorCount: 3,
|
||||
ShardCount: 2,
|
||||
TotalSize: "14.90GB",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
pkgtest.NewCatalogTester().
|
||||
FromDirectory(t, dir).
|
||||
Expects(expected, nil).
|
||||
IgnoreLocationLayer().
|
||||
IgnorePackageFields("FoundBy", "Locations").
|
||||
TestCataloger(t, NewSafeTensorsCataloger())
|
||||
}
|
||||
|
||||
// TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit
|
||||
// a nameless package mirroring the config blob's producer-declared fields, and
|
||||
// emit nothing for non-safetensors formats so the GGUF cataloger can claim the
|
||||
@ -737,19 +701,6 @@ func TestFormatParameterCount(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFormatByteSize(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"16000000000": "14.90GB",
|
||||
"2048": "2.00KB",
|
||||
"500": "500B",
|
||||
"71.90GB": "71.90GB", // non-numeric passes through unchanged
|
||||
"": "",
|
||||
}
|
||||
for in, want := range cases {
|
||||
assert.Equalf(t, want, formatByteSize(in), "formatByteSize(%q)", in)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseFrontmatter(t *testing.T) {
|
||||
t.Run("list base_model", func(t *testing.T) {
|
||||
fm := parseFrontmatter([]byte("---\nlicense: mit\nbase_model:\n - org/Model\n---\nbody"))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user