feat: parallelize catalogers per-file and hash contents in parallel (#3636)

Signed-off-by: Keith Zantow <kzantow@gmail.com>
This commit is contained in:
Keith Zantow 2025-03-26 11:10:08 -04:00 committed by GitHub
parent dbe29ed4ab
commit 4a9437808e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 161 additions and 118 deletions

View File

@ -68,6 +68,7 @@ var _ interface {
} = (*Catalog)(nil) } = (*Catalog)(nil)
func DefaultCatalog() Catalog { func DefaultCatalog() Catalog {
cfg := syft.DefaultCreateSBOMConfig()
return Catalog{ return Catalog{
Compliance: defaultComplianceConfig(), Compliance: defaultComplianceConfig(),
Scope: source.SquashedScope.String(), Scope: source.SquashedScope.String(),
@ -81,7 +82,7 @@ func DefaultCatalog() Catalog {
Relationships: defaultRelationshipsConfig(), Relationships: defaultRelationshipsConfig(),
Unknowns: defaultUnknowns(), Unknowns: defaultUnknowns(),
Source: defaultSourceConfig(), Source: defaultSourceConfig(),
Parallelism: 1, Parallelism: cfg.Parallelism,
} }
} }
@ -222,6 +223,9 @@ func (cfg *Catalog) AddFlags(flags clio.FlagSet) {
flags.StringArrayVarP(&cfg.Catalogers, "catalogers", "", flags.StringArrayVarP(&cfg.Catalogers, "catalogers", "",
"enable one or more package catalogers") "enable one or more package catalogers")
flags.IntVarP(&cfg.Parallelism, "parallelism", "",
"number of cataloger workers to run in parallel")
if pfp, ok := flags.(fangs.PFlagSetProvider); ok { if pfp, ok := flags.(fangs.PFlagSetProvider); ok {
if err := pfp.PFlagSet().MarkDeprecated("catalogers", "use: override-default-catalogers and select-catalogers"); err != nil { if err := pfp.PFlagSet().MarkDeprecated("catalogers", "use: override-default-catalogers and select-catalogers"); err != nil {
panic(err) panic(err)
@ -250,7 +254,8 @@ func (cfg *Catalog) AddFlags(flags clio.FlagSet) {
} }
func (cfg *Catalog) DescribeFields(descriptions fangs.FieldDescriptionSet) { func (cfg *Catalog) DescribeFields(descriptions fangs.FieldDescriptionSet) {
descriptions.Add(&cfg.Parallelism, "number of cataloger workers to run in parallel") descriptions.Add(&cfg.Parallelism, `number of cataloger workers to run in parallel
by default, when set to 0: this will be based on runtime.NumCPU * 4, if set to less than 0 it will be unbounded`)
descriptions.Add(&cfg.Enrich, fmt.Sprintf(`Enable data enrichment operations, which can utilize services such as Maven Central and NPM. descriptions.Add(&cfg.Enrich, fmt.Sprintf(`Enable data enrichment operations, which can utilize services such as Maven Central and NPM.
By default all enrichment is disabled, use: all to enable everything. By default all enrichment is disabled, use: all to enable everything.

2
go.mod
View File

@ -260,6 +260,8 @@ require (
modernc.org/memory v1.8.2 // indirect modernc.org/memory v1.8.2 // indirect
) )
require github.com/anchore/go-sync v0.0.0-20250326131806-4eda43a485b6
retract ( retract (
v0.53.2 v0.53.2
v0.53.1 // Published accidentally with incorrect license in dependencies v0.53.1 // Published accidentally with incorrect license in dependencies

2
go.sum
View File

@ -114,6 +114,8 @@ github.com/anchore/go-macholibre v0.0.0-20220308212642-53e6d0aaf6fb h1:iDMnx6LIj
github.com/anchore/go-macholibre v0.0.0-20220308212642-53e6d0aaf6fb/go.mod h1:DmTY2Mfcv38hsHbG78xMiTDdxFtkHpgYNVDPsF2TgHk= github.com/anchore/go-macholibre v0.0.0-20220308212642-53e6d0aaf6fb/go.mod h1:DmTY2Mfcv38hsHbG78xMiTDdxFtkHpgYNVDPsF2TgHk=
github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092 h1:aM1rlcoLz8y5B2r4tTLMiVTrMtpfY0O8EScKJxaSaEc= github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092 h1:aM1rlcoLz8y5B2r4tTLMiVTrMtpfY0O8EScKJxaSaEc=
github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092/go.mod h1:rYqSE9HbjzpHTI74vwPvae4ZVYZd1lue2ta6xHPdblA= github.com/anchore/go-struct-converter v0.0.0-20221118182256-c68fdcfa2092/go.mod h1:rYqSE9HbjzpHTI74vwPvae4ZVYZd1lue2ta6xHPdblA=
github.com/anchore/go-sync v0.0.0-20250326131806-4eda43a485b6 h1:Ha+LSCVuXYSYGi7wIkJK6G8g6jI3LH7y6LbyEVyp4Io=
github.com/anchore/go-sync v0.0.0-20250326131806-4eda43a485b6/go.mod h1:+9oM3XUy8iea/vWj9FhZ9bQGUBN8JpPxxJm5Wbcx9XM=
github.com/anchore/go-testutils v0.0.0-20200925183923-d5f45b0d3c04 h1:VzprUTpc0vW0nnNKJfJieyH/TZ9UYAnTZs5/gHTdAe8= github.com/anchore/go-testutils v0.0.0-20200925183923-d5f45b0d3c04 h1:VzprUTpc0vW0nnNKJfJieyH/TZ9UYAnTZs5/gHTdAe8=
github.com/anchore/go-testutils v0.0.0-20200925183923-d5f45b0d3c04/go.mod h1:6dK64g27Qi1qGQZ67gFmBFvEHScy0/C8qhQhNe5B5pQ= github.com/anchore/go-testutils v0.0.0-20200925183923-d5f45b0d3c04/go.mod h1:6dK64g27Qi1qGQZ67gFmBFvEHScy0/C8qhQhNe5B5pQ=
github.com/anchore/go-version v1.2.2-0.20200701162849-18adb9c92b9b h1:e1bmaoJfZVsCYMrIZBpFxwV26CbsuoEh5muXD5I1Ods= github.com/anchore/go-version v1.2.2-0.20200701162849-18adb9c92b9b h1:e1bmaoJfZVsCYMrIZBpFxwV26CbsuoEh5muXD5I1Ods=

View File

@ -1,12 +1,15 @@
package file package file
import ( import (
"context"
"crypto" "crypto"
"fmt" "fmt"
"hash" "hash"
"io" "io"
"strings" "strings"
"github.com/anchore/go-sync"
"github.com/anchore/syft/syft/cataloging"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
) )
@ -21,7 +24,7 @@ func supportedHashAlgorithms() []crypto.Hash {
} }
} }
func NewDigestsFromFile(closer io.ReadCloser, hashes []crypto.Hash) ([]file.Digest, error) { func NewDigestsFromFile(ctx context.Context, closer io.ReadCloser, hashes []crypto.Hash) ([]file.Digest, error) {
hashes = NormalizeHashes(hashes) hashes = NormalizeHashes(hashes)
// create a set of hasher objects tied together with a single writer to feed content into // create a set of hasher objects tied together with a single writer to feed content into
hashers := make([]hash.Hash, len(hashes)) hashers := make([]hash.Hash, len(hashes))
@ -31,7 +34,7 @@ func NewDigestsFromFile(closer io.ReadCloser, hashes []crypto.Hash) ([]file.Dige
writers[idx] = hashers[idx] writers[idx] = hashers[idx]
} }
size, err := io.Copy(io.MultiWriter(writers...), closer) size, err := io.Copy(sync.ParallelWriter(ctx, cataloging.ExecutorCPU, writers...), closer)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -1,6 +1,7 @@
package file package file
import ( import (
"context"
"crypto" "crypto"
"os" "os"
"testing" "testing"
@ -81,7 +82,7 @@ func TestNewDigestsFromFile(t *testing.T) {
fh, err := os.Open(tt.fixture) fh, err := os.Open(tt.fixture)
require.NoError(t, err) require.NoError(t, err)
got, err := NewDigestsFromFile(fh, tt.hashes) got, err := NewDigestsFromFile(context.TODO(), fh, tt.hashes)
tt.wantErr(t, err) tt.wantErr(t, err)
if err != nil { if err != nil {
return return

View File

@ -5,11 +5,8 @@ import (
"fmt" "fmt"
"runtime/debug" "runtime/debug"
"slices" "slices"
"sync"
"time" "time"
"github.com/hashicorp/go-multierror"
"github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/sbomsync" "github.com/anchore/syft/internal/sbomsync"
"github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/internal/unknown"
@ -18,64 +15,17 @@ import (
"github.com/anchore/syft/syft/sbom" "github.com/anchore/syft/syft/sbom"
) )
type Executor struct { func RunTask(ctx context.Context, tsk Task, resolver file.Resolver, s sbomsync.Builder, prog *monitor.CatalogerTaskProgress) error {
numWorkers int
tasks chan Task
}
func NewTaskExecutor(tasks []Task, numWorkers int) *Executor {
p := &Executor{
numWorkers: numWorkers,
tasks: make(chan Task, len(tasks)),
}
for i := range tasks {
p.tasks <- tasks[i]
}
close(p.tasks)
return p
}
func (p *Executor) Execute(ctx context.Context, resolver file.Resolver, s sbomsync.Builder, prog *monitor.CatalogerTaskProgress) error {
var lock sync.Mutex
withLock := func(fn func()) {
lock.Lock()
defer lock.Unlock()
fn()
}
var errs error
wg := &sync.WaitGroup{}
for i := 0; i < p.numWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for {
tsk, ok := <-p.tasks
if !ok {
return
}
err := runTaskSafely(ctx, tsk, resolver, s) err := runTaskSafely(ctx, tsk, resolver, s)
unknowns, remainingErrors := unknown.ExtractCoordinateErrors(err) unknowns, remainingErrors := unknown.ExtractCoordinateErrors(err)
if len(unknowns) > 0 { if len(unknowns) > 0 {
appendUnknowns(s, tsk.Name(), unknowns) appendUnknowns(s, tsk.Name(), unknowns)
} }
if remainingErrors != nil { if remainingErrors != nil {
withLock(func() {
errs = multierror.Append(errs, fmt.Errorf("failed to run task: %w", remainingErrors))
prog.SetError(remainingErrors) prog.SetError(remainingErrors)
})
} }
prog.Increment() prog.Increment()
} return remainingErrors
}()
}
wg.Wait()
return errs
} }
func appendUnknowns(builder sbomsync.Builder, taskName string, unknowns []unknown.CoordinateError) { func appendUnknowns(builder sbomsync.Builder, taskName string, unknowns []unknown.CoordinateError) {

View File

@ -16,9 +16,8 @@ func Test_TaskExecutor_PanicHandling(t *testing.T) {
tsk := NewTask("panicking-cataloger", func(_ context.Context, _ file.Resolver, _ sbomsync.Builder) error { tsk := NewTask("panicking-cataloger", func(_ context.Context, _ file.Resolver, _ sbomsync.Builder) error {
panic("something bad happened") panic("something bad happened")
}) })
ex := NewTaskExecutor([]Task{tsk}, 1)
err := ex.Execute(context.Background(), nil, nil, &monitor.CatalogerTaskProgress{ err := RunTask(context.Background(), tsk, nil, nil, &monitor.CatalogerTaskProgress{
Manual: progress.NewManual(-1), Manual: progress.NewManual(-1),
}) })

View File

@ -120,14 +120,14 @@ func newExecutableCatalogerTaskFactory(tags ...string) factory {
} }
func newExecutableCatalogerTask(selection file.Selection, cfg executable.Config, tags ...string) Task { func newExecutableCatalogerTask(selection file.Selection, cfg executable.Config, tags ...string) Task {
fn := func(_ context.Context, resolver file.Resolver, builder sbomsync.Builder) error { fn := func(ctx context.Context, resolver file.Resolver, builder sbomsync.Builder) error {
if selection == file.NoFilesSelection { if selection == file.NoFilesSelection {
return nil return nil
} }
accessor := builder.(sbomsync.Accessor) accessor := builder.(sbomsync.Accessor)
result, err := executable.NewCataloger(cfg).Catalog(resolver) result, err := executable.NewCataloger(cfg).CatalogCtx(ctx, resolver)
accessor.WriteToSBOM(func(sbom *sbom.SBOM) { accessor.WriteToSBOM(func(sbom *sbom.SBOM) {
sbom.Artifacts.Executables = result sbom.Artifacts.Executables = result

View File

@ -0,0 +1,8 @@
package cataloging
// Names of the shared executors that parallel work is dispatched to.
const (
	// ExecutorCPU names the executor used for CPU-intensive parallel work,
	// such as hashing the full contents of a file.
	ExecutorCPU = "cpu"

	// ExecutorFile names the executor used for parallel file reading work,
	// such as cataloging.
	ExecutorFile = "file"
)

View File

@ -3,16 +3,19 @@ package syft
import ( import (
"context" "context"
"fmt" "fmt"
"runtime"
"sort" "sort"
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
"github.com/scylladb/go-set/strset" "github.com/scylladb/go-set/strset"
"github.com/anchore/go-sync"
"github.com/anchore/syft/internal/bus" "github.com/anchore/syft/internal/bus"
"github.com/anchore/syft/internal/licenses" "github.com/anchore/syft/internal/licenses"
"github.com/anchore/syft/internal/sbomsync" "github.com/anchore/syft/internal/sbomsync"
"github.com/anchore/syft/internal/task" "github.com/anchore/syft/internal/task"
"github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/cataloging"
"github.com/anchore/syft/syft/event/monitor" "github.com/anchore/syft/syft/event/monitor"
"github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/sbom" "github.com/anchore/syft/syft/sbom"
@ -62,22 +65,20 @@ func CreateSBOM(ctx context.Context, src source.Source, cfg *CreateSBOMConfig) (
}, },
} }
// inject a single license scanner and content config for all package cataloging tasks into context // setup everything we need in context: license scanner, executors, etc.
licenseScanner, err := licenses.NewDefaultScanner( ctx, err = setupContext(ctx, cfg)
licenses.WithIncludeLicenseContent(cfg.Licenses.IncludeUnkownLicenseContent),
licenses.WithCoverage(cfg.Licenses.Coverage),
)
if err != nil { if err != nil {
return nil, fmt.Errorf("could not build licenseScanner for cataloging: %w", err) return nil, err
} }
ctx = licenses.SetContextLicenseScanner(ctx, licenseScanner)
catalogingProgress := monitorCatalogingTask(src.ID(), taskGroups) catalogingProgress := monitorCatalogingTask(src.ID(), taskGroups)
packageCatalogingProgress := monitorPackageCatalogingTask() packageCatalogingProgress := monitorPackageCatalogingTask()
builder := sbomsync.NewBuilder(&s, monitorPackageCount(packageCatalogingProgress)) builder := sbomsync.NewBuilder(&s, monitorPackageCount(packageCatalogingProgress))
for i := range taskGroups { for i := range taskGroups {
err := task.NewTaskExecutor(taskGroups[i], cfg.Parallelism).Execute(ctx, resolver, builder, catalogingProgress) err = sync.Collect(&ctx, cataloging.ExecutorFile, sync.ToSeq(taskGroups[i]), func(t task.Task) (any, error) {
return nil, task.RunTask(ctx, t, resolver, builder, catalogingProgress)
}, nil)
if err != nil { if err != nil {
// TODO: tie this to the open progress monitors... // TODO: tie this to the open progress monitors...
return nil, fmt.Errorf("failed to run tasks: %w", err) return nil, fmt.Errorf("failed to run tasks: %w", err)
@ -90,6 +91,53 @@ func CreateSBOM(ctx context.Context, src source.Source, cfg *CreateSBOMConfig) (
return &s, nil return &s, nil
} }
// setupContext derives the context used for cataloging from ctx: it first
// registers the parallel executors and then attaches the license scanner.
// Any error comes from the license scanner setup and is returned unchanged.
func setupContext(ctx context.Context, cfg *CreateSBOMConfig) (context.Context, error) {
	// executors go in first; this step cannot fail
	withExecutors := setContextExecutors(ctx, cfg)

	// the license scanner is layered on top of the executor-enabled context
	return setContextLicenseScanner(withExecutors, cfg)
}
// setContextLicenseScanner builds one license scanner from cfg.Licenses and
// stores it in the returned context, so that all package cataloging tasks share
// the same scanner and content configuration.
//
// If the scanner cannot be built, a nil context and a wrapped error are returned.
//
// NOTE(review): cfg is dereferenced without a nil check here, unlike
// setContextExecutors; presumably callers always pass a non-nil config — confirm.
func setContextLicenseScanner(ctx context.Context, cfg *CreateSBOMConfig) (context.Context, error) {
	includeContent := licenses.WithIncludeLicenseContent(cfg.Licenses.IncludeUnkownLicenseContent)
	coverage := licenses.WithCoverage(cfg.Licenses.Coverage)

	scanner, err := licenses.NewDefaultScanner(includeContent, coverage)
	if err != nil {
		return nil, fmt.Errorf("could not build licenseScanner for cataloging: %w", err)
	}

	return licenses.SetContextLicenseScanner(ctx, scanner), nil
}
// setContextExecutors registers the CPU and file executors on ctx, sized from
// cfg.Parallelism (a nil cfg is treated as a requested parallelism of 0).
// An executor that is already present on ctx is left in place.
//
// The requested value is translated before being handed to the executor, whose
// own convention is: 0 runs serially without goroutines, 1 allows at most one
// goroutine, and a negative value is unbounded.
//
//   - 0 requested:   use the default, runtime.NumCPU() * 4
//   - 1 requested:   run serially (executor value 0) to avoid goroutine overhead
//   - -99 requested: executor value 1, a special case to catch incorrect
//     executor usage during testing
//   - anything else: passed through unchanged
func setContextExecutors(ctx context.Context, cfg *CreateSBOMConfig) context.Context {
	var requested int
	if cfg != nil {
		requested = cfg.Parallelism
	}

	limit := requested
	switch {
	case requested == 0:
		limit = runtime.NumCPU() * 4
	case requested == 1:
		limit = 0 // run in serial, don't spawn goroutines
	case requested == -99:
		limit = 1 // special case to catch incorrect executor usage during testing
	}

	// one executor per dimension we want to coordinate bounds for
	if !sync.HasContextExecutor(ctx, cataloging.ExecutorCPU) {
		ctx = sync.SetContextExecutor(ctx, cataloging.ExecutorCPU, sync.NewExecutor(limit))
	}
	if !sync.HasContextExecutor(ctx, cataloging.ExecutorFile) {
		ctx = sync.SetContextExecutor(ctx, cataloging.ExecutorFile, sync.NewExecutor(limit))
	}

	return ctx
}
func monitorPackageCount(prog *monitor.CatalogerTaskProgress) func(s *sbom.SBOM) { func monitorPackageCount(prog *monitor.CatalogerTaskProgress) func(s *sbom.SBOM) {
return func(s *sbom.SBOM) { return func(s *sbom.SBOM) {
count := humanize.Comma(int64(s.Artifacts.Packages.PackageCount())) count := humanize.Comma(int64(s.Artifacts.Packages.PackageCount()))

View File

@ -49,7 +49,7 @@ func DefaultCreateSBOMConfig() *CreateSBOMConfig {
Packages: pkgcataloging.DefaultConfig(), Packages: pkgcataloging.DefaultConfig(),
Licenses: cataloging.DefaultLicenseConfig(), Licenses: cataloging.DefaultLicenseConfig(),
Files: filecataloging.DefaultConfig(), Files: filecataloging.DefaultConfig(),
Parallelism: 1, Parallelism: 0, // use default: run in parallel based on number of CPUs
packageTaskFactories: task.DefaultPackageTaskFactories(), packageTaskFactories: task.DefaultPackageTaskFactories(),
// library consumers are free to override the tool values to fit their needs, however, we have some sane defaults // library consumers are free to override the tool values to fit their needs, however, we have some sane defaults
@ -91,10 +91,6 @@ func (c *CreateSBOMConfig) WithTool(name, version string, cfg ...any) *CreateSBO
// WithParallelism allows for setting the number of concurrent cataloging tasks that can be performed at once // WithParallelism allows for setting the number of concurrent cataloging tasks that can be performed at once
func (c *CreateSBOMConfig) WithParallelism(p int) *CreateSBOMConfig { func (c *CreateSBOMConfig) WithParallelism(p int) *CreateSBOMConfig {
if p < 1 {
// TODO: warn?
p = 1
}
c.Parallelism = p c.Parallelism = p
return c return c
} }

View File

@ -2,6 +2,7 @@ package executable
import ( import (
"bytes" "bytes"
"context"
"debug/elf" "debug/elf"
"debug/macho" "debug/macho"
"encoding/binary" "encoding/binary"
@ -11,11 +12,13 @@ import (
"github.com/bmatcuk/doublestar/v4" "github.com/bmatcuk/doublestar/v4"
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
"github.com/anchore/go-sync"
"github.com/anchore/syft/internal" "github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/bus" "github.com/anchore/syft/internal/bus"
"github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/mimetype" "github.com/anchore/syft/internal/mimetype"
"github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/cataloging"
"github.com/anchore/syft/syft/event/monitor" "github.com/anchore/syft/syft/event/monitor"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/internal/unionreader" "github.com/anchore/syft/syft/internal/unionreader"
@ -46,8 +49,10 @@ func NewCataloger(cfg Config) *Cataloger {
} }
func (i *Cataloger) Catalog(resolver file.Resolver) (map[file.Coordinates]file.Executable, error) { func (i *Cataloger) Catalog(resolver file.Resolver) (map[file.Coordinates]file.Executable, error) {
var errs error return i.CatalogCtx(context.Background(), resolver)
}
func (i *Cataloger) CatalogCtx(ctx context.Context, resolver file.Resolver) (map[file.Coordinates]file.Executable, error) {
locs, err := resolver.FilesByMIMEType(i.config.MIMETypes...) locs, err := resolver.FilesByMIMEType(i.config.MIMETypes...)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to get file locations for binaries: %w", err) return nil, fmt.Errorf("unable to get file locations for binaries: %w", err)
@ -61,19 +66,20 @@ func (i *Cataloger) Catalog(resolver file.Resolver) (map[file.Coordinates]file.E
prog := catalogingProgress(int64(len(locs))) prog := catalogingProgress(int64(len(locs)))
results := make(map[file.Coordinates]file.Executable) results := make(map[file.Coordinates]file.Executable)
for _, loc := range locs { errs := sync.Collect(&ctx, cataloging.ExecutorFile, sync.ToSeq(locs), func(loc file.Location) (*file.Executable, error) {
prog.AtomicStage.Set(loc.Path()) prog.AtomicStage.Set(loc.Path())
exec, err := processExecutableLocation(loc, resolver) exec, err := processExecutableLocation(loc, resolver)
if err != nil { if err != nil {
errs = unknown.Append(errs, loc, err) err = unknown.New(loc, err)
} }
return exec, err
}, func(loc file.Location, exec *file.Executable) {
if exec != nil { if exec != nil {
prog.Increment() prog.Increment()
results[loc.Coordinates] = *exec results[loc.Coordinates] = *exec
} }
} })
log.Debugf("executable cataloger processed %d files", len(results)) log.Debugf("executable cataloger processed %d files", len(results))

View File

@ -8,12 +8,14 @@ import (
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
"github.com/anchore/go-sync"
stereoscopeFile "github.com/anchore/stereoscope/pkg/file" stereoscopeFile "github.com/anchore/stereoscope/pkg/file"
"github.com/anchore/syft/internal" "github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/bus" "github.com/anchore/syft/internal/bus"
intFile "github.com/anchore/syft/internal/file" intFile "github.com/anchore/syft/internal/file"
"github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/cataloging"
"github.com/anchore/syft/syft/event/monitor" "github.com/anchore/syft/syft/event/monitor"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
intCataloger "github.com/anchore/syft/syft/file/cataloger/internal" intCataloger "github.com/anchore/syft/syft/file/cataloger/internal"
@ -34,7 +36,6 @@ func NewCataloger(hashes []crypto.Hash) *Cataloger {
func (i *Cataloger) Catalog(ctx context.Context, resolver file.Resolver, coordinates ...file.Coordinates) (map[file.Coordinates][]file.Digest, error) { func (i *Cataloger) Catalog(ctx context.Context, resolver file.Resolver, coordinates ...file.Coordinates) (map[file.Coordinates][]file.Digest, error) {
results := make(map[file.Coordinates][]file.Digest) results := make(map[file.Coordinates][]file.Digest)
var locations []file.Location var locations []file.Location
var errs error
if len(coordinates) == 0 { if len(coordinates) == 0 {
locations = intCataloger.AllRegularFiles(ctx, resolver) locations = intCataloger.AllRegularFiles(ctx, resolver)
@ -49,41 +50,44 @@ func (i *Cataloger) Catalog(ctx context.Context, resolver file.Resolver, coordin
} }
prog := catalogingProgress(int64(len(locations))) prog := catalogingProgress(int64(len(locations)))
for _, location := range locations {
result, err := i.catalogLocation(resolver, location) err := sync.Collect(&ctx, cataloging.ExecutorFile, sync.ToSeq(locations), func(location file.Location) ([]file.Digest, error) {
result, err := i.catalogLocation(ctx, resolver, location)
if errors.Is(err, ErrUndigestableFile) { if errors.Is(err, ErrUndigestableFile) {
continue return nil, nil
} }
prog.AtomicStage.Set(location.Path()) prog.AtomicStage.Set(location.Path())
if internal.IsErrPathPermission(err) { if internal.IsErrPathPermission(err) {
log.Debugf("file digests cataloger skipping %q: %+v", location.RealPath, err) log.Debugf("file digests cataloger skipping %q: %+v", location.RealPath, err)
errs = unknown.Append(errs, location, err) return nil, unknown.New(location, err)
continue
} }
if err != nil { if err != nil {
prog.SetError(err) prog.SetError(err)
errs = unknown.Append(errs, location, err) return nil, unknown.New(location, err)
continue
} }
prog.Increment() prog.Increment()
results[location.Coordinates] = result return result, nil
}, func(location file.Location, digests []file.Digest) {
if len(digests) > 0 {
results[location.Coordinates] = digests
} }
})
log.Debugf("file digests cataloger processed %d files", prog.Current()) log.Debugf("file digests cataloger processed %d files", prog.Current())
prog.AtomicStage.Set(fmt.Sprintf("%s files", humanize.Comma(prog.Current()))) prog.AtomicStage.Set(fmt.Sprintf("%s files", humanize.Comma(prog.Current())))
prog.SetCompleted() prog.SetCompleted()
return results, errs return results, err
} }
func (i *Cataloger) catalogLocation(resolver file.Resolver, location file.Location) ([]file.Digest, error) { func (i *Cataloger) catalogLocation(ctx context.Context, resolver file.Resolver, location file.Location) ([]file.Digest, error) {
meta, err := resolver.FileMetadataByLocation(location) meta, err := resolver.FileMetadataByLocation(location)
if err != nil { if err != nil {
return nil, err return nil, err
@ -100,7 +104,7 @@ func (i *Cataloger) catalogLocation(resolver file.Resolver, location file.Locati
} }
defer internal.CloseAndLogError(contentReader, location.AccessPath) defer internal.CloseAndLogError(contentReader, location.AccessPath)
digests, err := intFile.NewDigestsFromFile(contentReader, i.hashes) digests, err := intFile.NewDigestsFromFile(ctx, contentReader, i.hashes)
if err != nil { if err != nil {
return nil, internal.ErrPath{Context: "digests-cataloger", Path: location.RealPath, Err: err} return nil, internal.ErrPath{Context: "digests-cataloger", Path: location.RealPath, Err: err}
} }

View File

@ -65,13 +65,13 @@ func TestDigestsCataloger(t *testing.T) {
name: "md5", name: "md5",
digests: []crypto.Hash{crypto.MD5}, digests: []crypto.Hash{crypto.MD5},
files: []string{"test-fixtures/last/empty/empty", "test-fixtures/last/path.txt"}, files: []string{"test-fixtures/last/empty/empty", "test-fixtures/last/path.txt"},
expected: testDigests(t, "test-fixtures/last", []string{"empty/empty", "path.txt"}, crypto.MD5), expected: testDigests(t, "test-fixtures/last", []string{"path.txt"}, crypto.MD5),
}, },
{ {
name: "md5-sha1-sha256", name: "md5-sha1-sha256",
digests: []crypto.Hash{crypto.MD5, crypto.SHA1, crypto.SHA256}, digests: []crypto.Hash{crypto.MD5, crypto.SHA1, crypto.SHA256},
files: []string{"test-fixtures/last/empty/empty", "test-fixtures/last/path.txt"}, files: []string{"test-fixtures/last/empty/empty", "test-fixtures/last/path.txt"},
expected: testDigests(t, "test-fixtures/last", []string{"empty/empty", "path.txt"}, crypto.MD5, crypto.SHA1, crypto.SHA256), expected: testDigests(t, "test-fixtures/last", []string{"path.txt"}, crypto.MD5, crypto.SHA1, crypto.SHA256),
}, },
} }

View File

@ -26,7 +26,7 @@ func newDpkgPackage(d pkg.DpkgDBEntry, dbLocation file.Location, resolver file.R
// TODO: separate pr to license refactor, but explore extracting dpkg-specific license parsing into a separate function // TODO: separate pr to license refactor, but explore extracting dpkg-specific license parsing into a separate function
var licenses []pkg.License var licenses []pkg.License
locations := file.NewLocationSet(dbLocation.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)) locations := file.NewLocationSet(dbLocation)
locations.Add(evidence...) locations.Add(evidence...)
p := pkg.Package{ p := pkg.Package{

View File

@ -13,10 +13,12 @@ import (
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
"github.com/go-viper/mapstructure/v2" "github.com/go-viper/mapstructure/v2"
"github.com/anchore/go-sync"
"github.com/anchore/syft/internal" "github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/cataloging"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/generic" "github.com/anchore/syft/syft/pkg/cataloger/generic"
@ -28,17 +30,17 @@ var (
) )
// parseDpkgDB reads a dpkg database "status" file (and surrounding data files) and returns the packages and relationships found. // parseDpkgDB reads a dpkg database "status" file (and surrounding data files) and returns the packages and relationships found.
func parseDpkgDB(_ context.Context, resolver file.Resolver, env *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) { func parseDpkgDB(ctx context.Context, resolver file.Resolver, env *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
metadata, err := parseDpkgStatus(reader) metadata, err := parseDpkgStatus(reader)
if err != nil { if err != nil {
return nil, nil, fmt.Errorf("unable to catalog dpkg DB=%q: %w", reader.RealPath, err) return nil, nil, fmt.Errorf("unable to catalog dpkg DB=%q: %w", reader.RealPath, err)
} }
dbLoc := reader.Location.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)
var pkgs []pkg.Package var pkgs []pkg.Package
for _, m := range metadata { _ = sync.CollectSlice(&ctx, cataloging.ExecutorFile, sync.ToSeq(metadata), func(m pkg.DpkgDBEntry) (pkg.Package, error) {
p := newDpkgPackage(m, reader.Location, resolver, env.LinuxRelease, findDpkgInfoFiles(m.Package, resolver, reader.Location)...) return newDpkgPackage(m, dbLoc, resolver, env.LinuxRelease, findDpkgInfoFiles(m.Package, resolver, reader.Location)...), nil
pkgs = append(pkgs, p) }, &pkgs)
}
return pkgs, nil, unknown.IfEmptyf(pkgs, "unable to determine packages") return pkgs, nil, unknown.IfEmptyf(pkgs, "unable to determine packages")
} }

View File

@ -4,10 +4,12 @@ import (
"context" "context"
"github.com/anchore/go-logger" "github.com/anchore/go-logger"
"github.com/anchore/go-sync"
"github.com/anchore/syft/internal" "github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log" "github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/unknown" "github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/artifact" "github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/cataloging"
"github.com/anchore/syft/syft/file" "github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/linux" "github.com/anchore/syft/syft/linux"
"github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg"
@ -161,7 +163,11 @@ func (c *Cataloger) Catalog(ctx context.Context, resolver file.Resolver) ([]pkg.
LinuxRelease: linux.IdentifyRelease(resolver), LinuxRelease: linux.IdentifyRelease(resolver),
} }
for _, req := range c.selectFiles(resolver) { type result struct {
pkgs []pkg.Package
rels []artifact.Relationship
}
errs = sync.Collect(&ctx, cataloging.ExecutorFile, sync.ToSeq(c.selectFiles(resolver)), func(req request) (result, error) {
location, parser := req.Location, req.Parser location, parser := req.Location, req.Parser
log.WithFields("path", location.RealPath).Trace("parsing file contents") log.WithFields("path", location.RealPath).Trace("parsing file contents")
@ -171,14 +177,14 @@ func (c *Cataloger) Catalog(ctx context.Context, resolver file.Resolver) ([]pkg.
// parsers may return errors and valid packages / relationships // parsers may return errors and valid packages / relationships
errs = unknown.Append(errs, location, err) errs = unknown.Append(errs, location, err)
} }
return result{discoveredPackages, discoveredRelationships}, errs
for _, p := range discoveredPackages { }, func(_ request, res result) {
for _, p := range res.pkgs {
p.FoundBy = c.upstreamCataloger p.FoundBy = c.upstreamCataloger
packages = append(packages, p) packages = append(packages, p)
} }
relationships = append(relationships, res.rels...)
relationships = append(relationships, discoveredRelationships...) })
}
return c.process(ctx, resolver, packages, relationships, errs) return c.process(ctx, resolver, packages, relationships, errs)
} }

View File

@ -249,7 +249,7 @@ func (j *archiveParser) discoverMainPackage(ctx context.Context) (*pkg.Package,
} }
// grab and assign digest for the entire archive // grab and assign digest for the entire archive
digests, err := getDigestsFromArchive(j.archivePath) digests, err := getDigestsFromArchive(ctx, j.archivePath)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -475,7 +475,7 @@ func (j *archiveParser) discoverPkgsFromAllMavenFiles(ctx context.Context, paren
return pkgs, nil return pkgs, nil
} }
func getDigestsFromArchive(archivePath string) ([]file.Digest, error) { func getDigestsFromArchive(ctx context.Context, archivePath string) ([]file.Digest, error) {
archiveCloser, err := os.Open(archivePath) archiveCloser, err := os.Open(archivePath)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err) return nil, fmt.Errorf("unable to open archive path (%s): %w", archivePath, err)
@ -483,7 +483,7 @@ func getDigestsFromArchive(archivePath string) ([]file.Digest, error) {
defer internal.CloseAndLogError(archiveCloser, archivePath) defer internal.CloseAndLogError(archiveCloser, archivePath)
// grab and assign digest for the entire archive // grab and assign digest for the entire archive
digests, err := intFile.NewDigestsFromFile(archiveCloser, javaArchiveHashes) digests, err := intFile.NewDigestsFromFile(ctx, archiveCloser, javaArchiveHashes)
if err != nil { if err != nil {
log.Debugf("failed to create digest for file=%q: %+v", archivePath, err) log.Debugf("failed to create digest for file=%q: %+v", archivePath, err)
} }

View File

@ -1,6 +1,7 @@
package filesource package filesource
import ( import (
"context"
"crypto" "crypto"
"fmt" "fmt"
"os" "os"
@ -68,7 +69,7 @@ func New(cfg Config) (source.Source, error) {
defer fh.Close() defer fh.Close()
digests, err = intFile.NewDigestsFromFile(fh, cfg.DigestAlgorithms) digests, err = intFile.NewDigestsFromFile(context.TODO(), fh, cfg.DigestAlgorithms)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to calculate digests for file=%q: %w", cfg.Path, err) return nil, fmt.Errorf("unable to calculate digests for file=%q: %w", cfg.Path, err)
} }

View File

@ -354,7 +354,17 @@ func TestPackagesCmdFlags(t *testing.T) {
args: []string{"scan", "-vvv", "-o", "json", coverageImage}, args: []string{"scan", "-vvv", "-o", "json", coverageImage},
assertions: []traitAssertion{ assertions: []traitAssertion{
// the application config in the log matches that of what we expect to have been configured. // the application config in the log matches that of what we expect to have been configured.
assertInOutput(`parallelism: 1`), assertInOutput(`parallelism: 0`),
assertPackageCount(coverageImageSquashedPackageCount),
assertSuccessfulReturnCode,
},
},
{
name: "parallelism-flag",
args: []string{"scan", "-vvv", "--parallelism", "2", "-o", "json", coverageImage},
assertions: []traitAssertion{
// the application config in the log matches that of what we expect to have been configured.
assertInOutput(`parallelism: 2`),
assertPackageCount(coverageImageSquashedPackageCount), assertPackageCount(coverageImageSquashedPackageCount),
assertSuccessfulReturnCode, assertSuccessfulReturnCode,
}, },