Use reader when scanning for package versions over reading entire binary into memory (#3558)

* use streaming readers

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* replace redis search patterns

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* address PR feedback

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
Alex Goodman 2025-01-02 17:12:37 -05:00 committed by GitHub
parent 470c2ff04c
commit cbce129bb9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 294 additions and 42 deletions

View File

@ -1,6 +1,11 @@
package internal
import "regexp"
import (
"io"
"regexp"
)
const readerChunkSize = 1024 * 1024
// MatchNamedCaptureGroups takes a regular expression and string and returns all of the named capture group results in a map.
// This is only for the first match in the regex. Callers shouldn't be providing regexes with multiple capture groups with the same name.
@ -32,6 +37,89 @@ func MatchNamedCaptureGroups(regEx *regexp.Regexp, content string) map[string]st
return results
}
// MatchNamedCaptureGroupsFromReader streams r in chunks of readerChunkSize and returns the named capture groups of
// the first chunk window in which re matches. Each window is the current chunk prefixed with the trailing half of
// the previous chunk, so a match is only guaranteed to be seen across a chunk boundary when it is no longer than
// half a chunk.
//
// A nil map (and nil error) is returned when nothing was captured, which includes the case where re matched but
// declares no named groups. A non-nil error is returned only when chunk processing reports one.
func MatchNamedCaptureGroupsFromReader(re *regexp.Regexp, r io.Reader) (map[string]string, error) {
	captured := map[string]string{}

	// the handler fills `captured` as a side effect; the boolean "matched" result is not needed here
	if _, err := processReaderInChunks(r, readerChunkSize, matchNamedCaptureGroupsHandler(re, captured)); err != nil {
		return nil, err
	}

	if len(captured) > 0 {
		return captured, nil
	}
	return nil, nil
}
// MatchAnyFromReader matches any of the provided regular expressions from a reader, assuming the pattern fits within
// 1.5x the reader chunk size (1MB * 1.5).
func MatchAnyFromReader(r io.Reader, res ...*regexp.Regexp) (bool, error) {
return processReaderInChunks(r, readerChunkSize, matchAnyHandler(res))
}
// matchNamedCaptureGroupsHandler builds a chunk handler that looks for the first match of re in the given data.
// On a match, every named capture group is written into results (keyed by group name) and the handler reports
// true; unnamed groups and the whole-match group are ignored. Without a match results is left untouched and the
// handler reports false. The returned handler never produces an error.
func matchNamedCaptureGroupsHandler(re *regexp.Regexp, results map[string]string) func(data []byte) (bool, error) {
	return func(data []byte) (bool, error) {
		submatches := re.FindSubmatch(data)
		if submatches == nil {
			return false, nil
		}

		for idx, group := range re.SubexpNames() {
			// index 0 is the whole match; an empty name is an unnamed group
			if idx == 0 || group == "" {
				continue
			}
			results[group] = string(submatches[idx])
		}
		return true, nil
	}
}
// matchAnyHandler builds a chunk handler that reports true as soon as any of the given regular expressions matches
// the data, and false when none of them do (including when no expressions were given). The returned handler never
// produces an error.
func matchAnyHandler(res []*regexp.Regexp) func(data []byte) (bool, error) {
	return func(data []byte) (bool, error) {
		found := false
		for i := 0; i < len(res) && !found; i++ {
			found = res[i].Match(data)
		}
		return found, nil
	}
}
// processReaderInChunks reads rdr in chunks of chunkSize bytes and calls handler once per chunk. Every call after
// the first receives the trailing half (chunkSize/2 bytes) of the previous chunk followed by the current chunk, so
// that content straddling a chunk boundary is still visible to the handler as long as it is no longer than half a
// chunk.
//
// Processing stops as soon as the handler reports a match (returns true, nil) or returns an error (returns false
// and that error). When the reader is exhausted without a match the result is false, nil. Read errors other than
// end-of-input are returned to the caller, but only after any bytes delivered alongside the error have been offered
// to the handler. A chunkSize below 1 cannot make progress and is rejected with io.ErrShortBuffer.
func processReaderInChunks(rdr io.Reader, chunkSize int, handler func(data []byte) (bool, error)) (bool, error) {
	if chunkSize < 1 {
		// a zero-length read buffer would loop forever and a negative size cannot be allocated
		return false, io.ErrShortBuffer
	}

	half := chunkSize / 2

	// layout: buf[:half] holds the tail of the previous chunk, buf[half:] receives the current chunk
	buf := make([]byte, half+chunkSize)

	// the first window has no previous chunk, so it begins where the freshly read data begins
	start := half

	for {
		// io.ReadFull keeps reading until the chunk is full or the input ends, which means a short read can only
		// ever be the final chunk and the overlap below is always taken from a complete chunk.
		n, readErr := io.ReadFull(rdr, buf[half:])

		// always offer the bytes we did get to the handler before looking at the read error
		if n > 0 {
			matched, handlerErr := handler(buf[start : half+n])
			if handlerErr != nil {
				return false, handlerErr
			}
			if matched {
				return true, nil
			}
		}

		switch readErr {
		case nil:
			// full chunk read, more input may follow
		case io.EOF, io.ErrUnexpectedEOF:
			// input exhausted (possibly after a final partial chunk)
			return false, nil
		default:
			return false, readErr
		}

		// n == chunkSize here: carry the last half chunk over as the prefix of the next window
		copy(buf[:half], buf[n:half+n])
		start = 0
	}
}
func isEmptyMap(m map[string]string) bool {
if len(m) == 0 {
return true

View File

@ -2,9 +2,11 @@ package internal
import (
"regexp"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestMatchCaptureGroups(t *testing.T) {
@ -68,3 +70,149 @@ func TestMatchCaptureGroups(t *testing.T) {
})
}
}
// TestMatchNamedCaptureGroupsFromReader checks the captured groups returned for small inputs that fit in a single
// chunk: single and multiple named groups, a non-matching input, and an empty reader (which yields a nil map and no
// error).
func TestMatchNamedCaptureGroupsFromReader(t *testing.T) {
	type testCase struct {
		name    string
		pattern string
		input   string
		want    map[string]string
		wantErr require.ErrorAssertionFunc
	}

	cases := []testCase{
		{
			name:    "match single group",
			pattern: `(?P<key>[^1-9]+)`,
			input:   "key",
			want:    map[string]string{"key": "key"},
			wantErr: require.NoError,
		},
		{
			name:    "match multiple groups",
			pattern: `(?P<key>[^1-9]+):(?P<value>\w+)`,
			input:   "key:value",
			want:    map[string]string{"key": "key", "value": "value"},
			wantErr: require.NoError,
		},
		{
			name:    "no match",
			pattern: `(?P<key>[^1-9]+)`,
			input:   "2345",
			want:    nil,
			wantErr: require.NoError,
		},
		{
			name:    "error empty reader",
			pattern: `(?P<key>\w+)`,
			input:   "",
			want:    nil,
			wantErr: require.NoError,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			compiled := regexp.MustCompile(tc.pattern)

			got, err := MatchNamedCaptureGroupsFromReader(compiled, strings.NewReader(tc.input))

			tc.wantErr(t, err)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestMatchAnyFromReader checks the boolean result for small inputs that fit in a single chunk: one matching
// pattern, several patterns where a later one matches, no matching pattern, and an empty reader (which yields false
// and no error).
func TestMatchAnyFromReader(t *testing.T) {
	type testCase struct {
		name     string
		input    string
		patterns []*regexp.Regexp
		want     bool
		wantErr  require.ErrorAssertionFunc
	}

	cases := []testCase{
		{
			name:     "match single pattern",
			input:    "hello world",
			patterns: []*regexp.Regexp{regexp.MustCompile(`hello`)},
			want:     true,
			wantErr:  require.NoError,
		},
		{
			name:     "match multiple patterns",
			input:    "test case",
			patterns: []*regexp.Regexp{regexp.MustCompile(`case`), regexp.MustCompile(`test`)},
			want:     true,
			wantErr:  require.NoError,
		},
		{
			name:     "no match",
			input:    "nothing here",
			patterns: []*regexp.Regexp{regexp.MustCompile(`absent`)},
			want:     false,
			wantErr:  require.NoError,
		},
		{
			name:     "error empty reader",
			input:    "",
			patterns: []*regexp.Regexp{regexp.MustCompile(`match`)},
			want:     false,
			wantErr:  require.NoError,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := MatchAnyFromReader(strings.NewReader(tc.input), tc.patterns...)

			tc.wantErr(t, err)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestProcessReaderInChunks_ChunkBoundaries verifies the exact windows handed to the handler for a tiny chunk size:
// the first window is the first chunk alone and every later window is prefixed with the last half of the previous
// chunk. returnOnChunk is the zero-based handler call that reports a match (-1 means the handler never matches).
func TestProcessReaderInChunks_ChunkBoundaries(t *testing.T) {
	type testCase struct {
		name          string
		input         string
		chunkSize     int
		expectedCalls []string
		returnOnChunk int
		wantErr       require.ErrorAssertionFunc
	}

	cases := []testCase{
		{
			name:          "go case",
			input:         "123456789012345",
			chunkSize:     4,
			returnOnChunk: 2,
			expectedCalls: []string{"1234", "345678", "789012"},
			wantErr:       require.NoError,
		},
		{
			name:          "no match",
			input:         "123456789012345",
			chunkSize:     4,
			returnOnChunk: -1,
			expectedCalls: []string{"1234", "345678", "789012", "12345"},
			wantErr:       require.NoError,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			var seen []string

			// record every window and report a match on the configured call; the index of the current call is
			// the number of windows recorded before it
			recorder := func(data []byte) (bool, error) {
				seen = append(seen, string(data))
				return len(seen)-1 == tc.returnOnChunk, nil
			}

			got, err := processReaderInChunks(strings.NewReader(tc.input), tc.chunkSize, recorder)
			tc.wantErr(t, err)

			if tc.returnOnChunk != -1 {
				assert.True(t, got)
			} else {
				assert.False(t, got)
			}
			assert.Equal(t, tc.expectedCalls, seen)
		})
	}
}

View File

@ -18,6 +18,7 @@ import (
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/cpe"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/internal/unionreader"
"github.com/anchore/syft/syft/pkg"
)
@ -74,9 +75,9 @@ func (cfg Classifier) MarshalJSON() ([]byte, error) {
type EvidenceMatcher func(classifier Classifier, context matcherContext) ([]pkg.Package, error)
type matcherContext struct {
resolver file.Resolver
location file.Location
getContents func(resolver matcherContext) ([]byte, error)
resolver file.Resolver
location file.Location
getReader func(resolver matcherContext) (unionreader.UnionReader, error)
}
func evidenceMatchers(matchers ...EvidenceMatcher) EvidenceMatcher {
@ -124,12 +125,15 @@ func fileNameTemplateVersionMatcher(fileNamePattern string, contentTemplate stri
return nil, fmt.Errorf("unable to compile rendered regex=%q: %w", patternBuf.String(), err)
}
contents, err := getContents(context)
contents, err := getReader(context)
if err != nil {
return nil, fmt.Errorf("unable to get read contents for file: %w", err)
}
matchMetadata := internal.MatchNamedCaptureGroups(tmplPattern, string(contents))
matchMetadata, err := internal.MatchNamedCaptureGroupsFromReader(tmplPattern, contents)
if err != nil {
return nil, fmt.Errorf("unable to match version: %w", err)
}
p := newClassifierPackage(classifier, context.location, matchMetadata)
if p == nil {
@ -143,12 +147,15 @@ func fileNameTemplateVersionMatcher(fileNamePattern string, contentTemplate stri
func FileContentsVersionMatcher(pattern string) EvidenceMatcher {
pat := regexp.MustCompile(pattern)
return func(classifier Classifier, context matcherContext) ([]pkg.Package, error) {
contents, err := getContents(context)
contents, err := getReader(context)
if err != nil {
return nil, fmt.Errorf("unable to get read contents for file: %w", err)
}
matchMetadata := internal.MatchNamedCaptureGroups(pat, string(contents))
matchMetadata, err := internal.MatchNamedCaptureGroupsFromReader(pat, contents)
if err != nil {
return nil, fmt.Errorf("unable to match version: %w", err)
}
// Convert {major: 1, minor: 2, patch: 3} to "1.2.3"
_, versionOk := matchMetadata["version"]
@ -183,14 +190,16 @@ func matchExcluding(matcher EvidenceMatcher, contentPatternsToExclude ...string)
nonMatchPatterns = append(nonMatchPatterns, regexp.MustCompile(p))
}
return func(classifier Classifier, context matcherContext) ([]pkg.Package, error) {
contents, err := getContents(context)
contents, err := getReader(context)
if err != nil {
return nil, fmt.Errorf("unable to get read contents for file: %w", err)
}
for _, nonMatch := range nonMatchPatterns {
if nonMatch.Match(contents) {
return nil, nil
}
matches, err := internal.MatchAnyFromReader(contents, nonMatchPatterns...)
if err != nil {
return nil, fmt.Errorf("unable to match content: %w", err)
}
if matches {
return nil, nil
}
return matcher(classifier, context)
}
@ -214,9 +223,9 @@ func sharedLibraryLookup(sharedLibraryPattern string, sharedLibraryMatcher Evide
}
for _, libraryLocation := range locations {
newResolver := matcherContext{
resolver: context.resolver,
location: libraryLocation,
getContents: context.getContents,
resolver: context.resolver,
location: libraryLocation,
getReader: context.getReader,
}
newResolver.location = libraryLocation
pkgs, err := sharedLibraryMatcher(classifier, newResolver)
@ -253,23 +262,16 @@ func mustPURL(purl string) packageurl.PackageURL {
return p
}
func getContents(context matcherContext) ([]byte, error) {
if context.getContents != nil {
return context.getContents(context)
func getReader(context matcherContext) (unionreader.UnionReader, error) {
if context.getReader != nil {
return context.getReader(context)
}
reader, err := context.resolver.FileContentsByLocation(context.location)
reader, err := context.resolver.FileContentsByLocation(context.location) //nolint:gocritic
if err != nil {
return nil, err
}
defer internal.CloseAndLogError(reader, context.location.AccessPath)
// TODO: there may be room for improvement here, as this may use an excessive amount of memory. Alternate approach is to leverage a RuneReader.
contents, err := io.ReadAll(reader)
if err != nil {
return nil, fmt.Errorf("unable to get contents for file: %w", err)
}
return contents, nil
return unionreader.GetUnionReader(reader)
}
// singleCPE returns a []cpe.CPE with Source: Generated based on the cpe string or panics if the
@ -287,14 +289,13 @@ func singleCPE(cpeString string, source ...cpe.Source) []cpe.CPE {
// sharedLibraries returns a list of all shared libraries found within a binary, currently
// supporting: elf, macho, and windows pe
func sharedLibraries(context matcherContext) ([]string, error) {
contents, err := getContents(context)
contents, err := getReader(context)
if err != nil {
return nil, err
}
defer internal.CloseAndLogError(contents, context.location.RealPath)
r := bytes.NewReader(contents)
e, _ := elf.NewFile(r)
e, _ := elf.NewFile(contents)
if e != nil {
symbols, err := e.ImportedLibraries()
if err != nil {
@ -302,8 +303,11 @@ func sharedLibraries(context matcherContext) ([]string, error) {
}
return symbols, nil
}
if _, err := contents.Seek(0, io.SeekStart); err != nil {
return nil, fmt.Errorf("unable to seek to beginning of file: %w", err)
}
m, _ := macho.NewFile(r)
m, _ := macho.NewFile(contents)
if m != nil {
symbols, err := m.ImportedLibraries()
if err != nil {
@ -311,8 +315,11 @@ func sharedLibraries(context matcherContext) ([]string, error) {
}
return symbols, nil
}
if _, err := contents.Seek(0, io.SeekStart); err != nil {
return nil, fmt.Errorf("unable to seek to beginning of file: %w", err)
}
p, _ := pe.NewFile(r)
p, _ := pe.NewFile(contents)
if p != nil {
symbols, err := p.ImportedLibraries()
if err != nil {
@ -320,6 +327,9 @@ func sharedLibraries(context matcherContext) ([]string, error) {
}
return symbols, nil
}
if _, err := contents.Seek(0, io.SeekStart); err != nil {
return nil, fmt.Errorf("unable to seek to beginning of file: %w", err)
}
return nil, nil
}

View File

@ -1,6 +1,8 @@
package binary
import (
"bytes"
"io"
"testing"
"github.com/stretchr/testify/assert"
@ -9,6 +11,7 @@ import (
"github.com/anchore/packageurl-go"
"github.com/anchore/syft/syft/cpe"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/internal/unionreader"
)
func Test_ClassifierCPEs(t *testing.T) {
@ -162,12 +165,12 @@ func TestFileContentsVersionMatcher(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
mockGetContent := func(context matcherContext) ([]byte, error) {
return []byte(tt.data), nil
mockGetContent := func(context matcherContext) (unionreader.UnionReader, error) {
return unionreader.GetUnionReader(io.NopCloser(bytes.NewBufferString(tt.data)))
}
fn := FileContentsVersionMatcher(tt.pattern)
p, err := fn(Classifier{}, matcherContext{
getContents: mockGetContent,
getReader: mockGetContent,
})
if err != nil {

View File

@ -77,8 +77,12 @@ func DefaultClassifiers() []Classifier {
Class: "redis-binary",
FileGlob: "**/redis-server",
EvidenceMatcher: evidenceMatchers(
FileContentsVersionMatcher(`(?s)payload %5.*?(?P<version>\d.\d\.\d\d*)[a-z0-9]{12,15}-[0-9]{19}`),
FileContentsVersionMatcher(`(?s)\x00(?P<version>\d.\d\.\d\d*)[a-z0-9]{12,15}-[0-9]{19}\x00.*?payload %5`),
// matches most recent versions of redis (~v7), e.g. "7.0.14buildkitsandbox-1702957741000000000"
FileContentsVersionMatcher(`[^\d](?P<version>\d+.\d+\.\d+)buildkitsandbox-\d+`),
// matches against older versions of redis (~v3 - v6), e.g. "4.0.11841ce7054bd9-1542359302000000000"
FileContentsVersionMatcher(`[^\d](?P<version>[0-9]+\.[0-9]+\.[0-9]+)\w{12}-\d+`),
// matches against older versions of redis (~v2), e.g. "Server started, Redis version 2.8.23"
FileContentsVersionMatcher(`Redis version (?P<version>[0-9]+\.[0-9]+\.[0-9]+)`),
),
Package: "redis",
PURL: mustPURL("pkg:generic/redis@version"),

View File

@ -262,12 +262,11 @@ func (c *goBinaryCataloger) findMainModuleVersion(metadata *pkg.GolangBinaryBuil
}
func extractVersionFromContents(reader io.Reader) string {
contents, err := io.ReadAll(reader)
matchMetadata, err := internal.MatchNamedCaptureGroupsFromReader(semverPattern, reader)
if err != nil {
log.WithFields("error", err).Trace("unable to read from go binary reader")
log.WithFields("error", err).Trace("unable to extract version from go binary reader")
return ""
}
matchMetadata := internal.MatchNamedCaptureGroups(semverPattern, string(contents))
version, ok := matchMetadata["version"]
if ok {