From 4d59bdbb7fc89ccdce05276cb0fe331bca524324 Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Tue, 30 Jun 2026 14:57:06 -0400 Subject: [PATCH] fix: bound safetensors header read to content size readSafeTensorsHeader pre-allocated the declared header length, which is read straight from the file and bounded only by the 100MB ceiling. A short file declaring a huge header could force a large allocation it never fills. Read incrementally via io.ReadAll(io.LimitReader(...)) and verify the full header was actually present. Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- syft/pkg/cataloger/ai/parse_safetensors.go | 26 +++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/syft/pkg/cataloger/ai/parse_safetensors.go b/syft/pkg/cataloger/ai/parse_safetensors.go index f88e73d40..79967401c 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors.go +++ b/syft/pkg/cataloger/ai/parse_safetensors.go @@ -51,10 +51,14 @@ func readSafeTensorsHeader(r io.Reader) (*safeTensorsHeader, error) { return nil, fmt.Errorf("safetensors header size %d exceeds maximum %d", headerLen, maxSafeTensorsHeaderSize) } - body := make([]byte, headerLen) - if _, err := io.ReadFull(r, body); err != nil { + // Read incrementally rather than pre-allocating headerLen up front + body, err := io.ReadAll(io.LimitReader(r, int64(headerLen))) + if err != nil { return nil, fmt.Errorf("failed to read header body: %w", err) } + if uint64(len(body)) != headerLen { + return nil, fmt.Errorf("safetensors header truncated: read %d of %d bytes", len(body), headerLen) + } var raw map[string]json.RawMessage if err := json.Unmarshal(body, &raw); err != nil { @@ -123,21 +127,23 @@ func (h *safeTensorsHeader) dominantDType() string { return best } -// metadataHash returns a stable xxhash64 over the tensor entries + __metadata__. -// Tensor keys are sorted to keep the hash deterministic across producers. 
+// metadataHash returns a stable xxhash64 over the logical tensor content +// (name + dtype + shape) plus the __metadata__ map. Tensor keys are sorted to +// keep the hash deterministic across producers. func (h *safeTensorsHeader) metadataHash() string { - type entry struct { - Name string `json:"name"` - Entry safeTensorsEntry `json:"entry"` + type logicalEntry struct { + Name string `json:"name"` + DType string `json:"dtype"` + Shape []int64 `json:"shape"` } - entries := make([]entry, 0, len(h.tensors)) + entries := make([]logicalEntry, 0, len(h.tensors)) for name, t := range h.tensors { - entries = append(entries, entry{Name: name, Entry: t}) + entries = append(entries, logicalEntry{Name: name, DType: t.DType, Shape: t.Shape}) } sort.Slice(entries, func(i, j int) bool { return entries[i].Name < entries[j].Name }) type hashInput struct { - Tensors []entry `json:"tensors"` + Tensors []logicalEntry `json:"tensors"` Metadata map[string]string `json:"metadata,omitempty"` } b, err := json.Marshal(hashInput{Tensors: entries, Metadata: h.metadata})