Mirror of https://github.com/anchore/syft.git (synced 2025-11-17 08:23:15 +01:00)
chore: more idiomatic copy/reader usage
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent: e58e6317d2
commit: 8706ff8310
@ -76,7 +76,7 @@ func TestGGUFCataloger(t *testing.T) {
|
|||||||
GGUFVersion: 3,
|
GGUFVersion: 3,
|
||||||
TensorCount: 0,
|
TensorCount: 0,
|
||||||
MetadataKeyValuesHash: "6e3d368066455ce4",
|
MetadataKeyValuesHash: "6e3d368066455ce4",
|
||||||
Header: map[string]interface{}{
|
RemainingKeyValues: map[string]interface{}{
|
||||||
"general.some_random_kv": "foobar",
|
"general.some_random_kv": "foobar",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -113,7 +113,7 @@ func TestGGUFCataloger(t *testing.T) {
|
|||||||
GGUFVersion: 3,
|
GGUFVersion: 3,
|
||||||
TensorCount: 0,
|
TensorCount: 0,
|
||||||
MetadataKeyValuesHash: "9dc6f23591062a27",
|
MetadataKeyValuesHash: "9dc6f23591062a27",
|
||||||
Header: map[string]interface{}{
|
RemainingKeyValues: map[string]interface{}{
|
||||||
"gpt2.context_length": "1024",
|
"gpt2.context_length": "1024",
|
||||||
"gpt2.embedding_length": uint32(768),
|
"gpt2.embedding_length": uint32(768),
|
||||||
},
|
},
|
||||||
|
|||||||
@ -14,46 +14,35 @@ const (
|
|||||||
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
|
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
|
||||||
)
|
)
|
||||||
|
|
||||||
// readHeader reads only the GGUF header (metadata) without reading tensor data
|
// copyHeader copies the GGUF header from the reader to the writer.
|
||||||
// This is much more efficient than reading the entire file
|
// It validates the magic number first, then copies the rest of the data.
|
||||||
// The reader should be wrapped with io.LimitedReader to prevent OOM issues
|
// The reader should be wrapped with io.LimitedReader to prevent OOM issues.
|
||||||
func readHeader(r io.Reader) ([]byte, error) {
|
func copyHeader(w io.Writer, r io.Reader) error {
|
||||||
// Read initial chunk to determine header size
|
// Read initial chunk to validate magic number
|
||||||
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
|
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
|
||||||
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
|
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
|
||||||
if _, err := io.ReadFull(r, initialBuf); err != nil {
|
if _, err := io.ReadFull(r, initialBuf); err != nil {
|
||||||
return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
|
return fmt.Errorf("failed to read GGUF header prefix: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify magic number
|
// Verify magic number
|
||||||
magic := binary.LittleEndian.Uint32(initialBuf[0:4])
|
magic := binary.LittleEndian.Uint32(initialBuf[0:4])
|
||||||
if magic != ggufMagicNumber {
|
if magic != ggufMagicNumber {
|
||||||
return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
|
return fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We need to read the metadata KV pairs to know the full header size
|
// Write the initial buffer to the writer
|
||||||
// The io.LimitedReader wrapping this reader ensures we don't read more than maxHeaderSize
|
if _, err := w.Write(initialBuf); err != nil {
|
||||||
headerData := make([]byte, 0, 1024*1024) // Start with 1MB capacity
|
return fmt.Errorf("failed to write GGUF header prefix: %w", err)
|
||||||
headerData = append(headerData, initialBuf...)
|
}
|
||||||
|
|
||||||
// Read the rest of the header in larger chunks for efficiency
|
// Copy the rest of the header from reader to writer
|
||||||
// The LimitedReader will return EOF once maxHeaderSize is reached
|
// The LimitedReader will return EOF once maxHeaderSize is reached
|
||||||
buf := make([]byte, 64*1024) // 64KB chunks
|
if _, err := io.Copy(w, r); err != nil {
|
||||||
for {
|
return fmt.Errorf("failed to copy GGUF header: %w", err)
|
||||||
n, err := r.Read(buf)
|
|
||||||
if n > 0 {
|
|
||||||
headerData = append(headerData, buf[:n]...)
|
|
||||||
}
|
|
||||||
if err == io.EOF {
|
|
||||||
// Reached end of file or limit, we have all available data
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to read GGUF header: %w", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return headerData, nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to convert gguf_parser metadata to simpler types
|
// Helper to convert gguf_parser metadata to simpler types
|
||||||
|
|||||||
@ -27,14 +27,6 @@ import (
|
|||||||
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
|
||||||
defer internal.CloseAndLogError(reader, reader.Path())
|
defer internal.CloseAndLogError(reader, reader.Path())
|
||||||
|
|
||||||
// Read and validate the GGUF file header using LimitedReader to prevent OOM
|
|
||||||
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
|
|
||||||
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
|
|
||||||
headerData, err := readHeader(limitedReader)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a temporary file for the library to parse
|
// Create a temporary file for the library to parse
|
||||||
// The library requires a file path, so we create a temp file
|
// The library requires a file path, so we create a temp file
|
||||||
tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
|
tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
|
||||||
@ -44,10 +36,12 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
|
|||||||
tempPath := tempFile.Name()
|
tempPath := tempFile.Name()
|
||||||
defer os.Remove(tempPath)
|
defer os.Remove(tempPath)
|
||||||
|
|
||||||
// Write the validated header data to temp file
|
// Copy and validate the GGUF file header using LimitedReader to prevent OOM
|
||||||
if _, err := tempFile.Write(headerData); err != nil {
|
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
|
||||||
|
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
|
||||||
|
if err := copyHeader(tempFile, limitedReader); err != nil {
|
||||||
tempFile.Close()
|
tempFile.Close()
|
||||||
return nil, nil, fmt.Errorf("failed to write to temp file: %w", err)
|
return nil, nil, fmt.Errorf("failed to copy GGUF header: %w", err)
|
||||||
}
|
}
|
||||||
tempFile.Close()
|
tempFile.Close()
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user