From 8706ff83102336fc3e92f1cd1a0dd6e7d846248e Mon Sep 17 00:00:00 2001 From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:18:21 -0500 Subject: [PATCH] chore: more idiomatic copy/reader usage Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com> --- syft/pkg/cataloger/ai/cataloger_test.go | 4 +-- syft/pkg/cataloger/ai/parse_gguf.go | 41 +++++++++-------------- syft/pkg/cataloger/ai/parse_gguf_model.go | 16 +++------ 3 files changed, 22 insertions(+), 39 deletions(-) diff --git a/syft/pkg/cataloger/ai/cataloger_test.go b/syft/pkg/cataloger/ai/cataloger_test.go index e71f2ece3..cb4b7573f 100644 --- a/syft/pkg/cataloger/ai/cataloger_test.go +++ b/syft/pkg/cataloger/ai/cataloger_test.go @@ -76,7 +76,7 @@ func TestGGUFCataloger(t *testing.T) { GGUFVersion: 3, TensorCount: 0, MetadataKeyValuesHash: "6e3d368066455ce4", - Header: map[string]interface{}{ + RemainingKeyValues: map[string]interface{}{ "general.some_random_kv": "foobar", }, }, @@ -113,7 +113,7 @@ func TestGGUFCataloger(t *testing.T) { GGUFVersion: 3, TensorCount: 0, MetadataKeyValuesHash: "9dc6f23591062a27", - Header: map[string]interface{}{ + RemainingKeyValues: map[string]interface{}{ "gpt2.context_length": "1024", "gpt2.embedding_length": uint32(768), }, diff --git a/syft/pkg/cataloger/ai/parse_gguf.go b/syft/pkg/cataloger/ai/parse_gguf.go index 60455ea93..3a1eb473f 100644 --- a/syft/pkg/cataloger/ai/parse_gguf.go +++ b/syft/pkg/cataloger/ai/parse_gguf.go @@ -14,46 +14,35 @@ const ( maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies ) -// readHeader reads only the GGUF header (metadata) without reading tensor data -// This is much more efficient than reading the entire file -// The reader should be wrapped with io.LimitedReader to prevent OOM issues -func readHeader(r io.Reader) ([]byte, error) { - // Read initial chunk to determine header size +// copyHeader copies the GGUF header from the reader to the writer. +// It validates the magic number first, then copies the rest of the data. +// The reader should be wrapped with io.LimitedReader to prevent OOM issues. +func copyHeader(w io.Writer, r io.Reader) error { + // Read initial chunk to validate magic number // GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count if _, err := io.ReadFull(r, initialBuf); err != nil { - return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err) + return fmt.Errorf("failed to read GGUF header prefix: %w", err) } // Verify magic number magic := binary.LittleEndian.Uint32(initialBuf[0:4]) if magic != ggufMagicNumber { - return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic) + return fmt.Errorf("invalid GGUF magic number: 0x%08X", magic) } - // We need to read the metadata KV pairs to know the full header size - // The io.LimitedReader wrapping this reader ensures we don't read more than maxHeaderSize - headerData := make([]byte, 0, 1024*1024) // Start with 1MB capacity - headerData = append(headerData, initialBuf...) + // Write the initial buffer to the writer + if _, err := w.Write(initialBuf); err != nil { + return fmt.Errorf("failed to write GGUF header prefix: %w", err) + } - // Read the rest of the header in larger chunks for efficiency + // Copy the rest of the header from reader to writer // The LimitedReader will return EOF once maxHeaderSize is reached - buf := make([]byte, 64*1024) // 64KB chunks - for { - n, err := r.Read(buf) - if n > 0 { - headerData = append(headerData, buf[:n]...) - } - if err == io.EOF { - // Reached end of file or limit, we have all available data - break - } - if err != nil { - return nil, fmt.Errorf("failed to read GGUF header: %w", err) - } + if _, err := io.Copy(w, r); err != nil { + return fmt.Errorf("failed to copy GGUF header: %w", err) } - return headerData, nil + return nil } // Helper to convert gguf_parser metadata to simpler types diff --git a/syft/pkg/cataloger/ai/parse_gguf_model.go b/syft/pkg/cataloger/ai/parse_gguf_model.go index a2553a7e6..74deb4199 100644 --- a/syft/pkg/cataloger/ai/parse_gguf_model.go +++ b/syft/pkg/cataloger/ai/parse_gguf_model.go @@ -27,14 +27,6 @@ import ( func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { defer internal.CloseAndLogError(reader, reader.Path()) - // Read and validate the GGUF file header using LimitedReader to prevent OOM - // We use LimitedReader to cap reads at maxHeaderSize (50MB) - limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize} - headerData, err := readHeader(limitedReader) - if err != nil { - return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err) - } - // Create a temporary file for the library to parse // The library requires a file path, so we create a temp file tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf") @@ -44,10 +36,12 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, tempPath := tempFile.Name() defer os.Remove(tempPath) - // Write the validated header data to temp file - if _, err := tempFile.Write(headerData); err != nil { + // Copy and validate the GGUF file header using LimitedReader to prevent OOM + // We use LimitedReader to cap reads at maxHeaderSize (50MB) + limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize} + if err := copyHeader(tempFile, limitedReader); err != nil { tempFile.Close() - return nil, nil, fmt.Errorf("failed to write to temp file: %w", err) + return nil, nil, fmt.Errorf("failed to copy GGUF header: %w", err) } tempFile.Close()