From e4a3e433b60186b1e1d77f5d9aa88a26a51b8840 Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Fri, 11 Dec 2020 17:10:45 -0500 Subject: [PATCH] add content requester and refactor python cataloger to use it Signed-off-by: Alex Goodman --- syft/cataloger/python/package_cataloger.go | 102 +++++++----------- .../python/package_cataloger_test.go | 49 ++++++--- syft/source/content_requester.go | 48 +++++++++ syft/source/file_data.go | 6 ++ 4 files changed, 132 insertions(+), 73 deletions(-) create mode 100644 syft/source/content_requester.go create mode 100644 syft/source/file_data.go diff --git a/syft/cataloger/python/package_cataloger.go b/syft/cataloger/python/package_cataloger.go index d8ead13e2..247e305e3 100644 --- a/syft/cataloger/python/package_cataloger.go +++ b/syft/cataloger/python/package_cataloger.go @@ -18,6 +18,12 @@ const ( wheelMetadataGlob = "**/*dist-info/METADATA" ) +type pythonPackageData struct { + Metadata source.FileData + FileRecord *source.FileData + TopPackage *source.FileData +} + type PackageCataloger struct{} // NewPythonPackageCataloger returns a new cataloger for python packages within egg or wheel installation directories. @@ -32,55 +38,43 @@ func (c *PackageCataloger) Name() string { // Catalog is given an object to resolve file references and content, this function returns any discovered Packages after analyzing python egg and wheel installations. func (c *PackageCataloger) Catalog(resolver source.Resolver) ([]pkg.Package, error) { - // nolint:prealloc - var fileMatches []source.Location - - for _, glob := range []string{eggMetadataGlob, wheelMetadataGlob} { - matches, err := resolver.FilesByGlob(glob) - if err != nil { - return nil, fmt.Errorf("failed to find files by glob: %s", glob) - } - fileMatches = append(fileMatches, matches...) 
- } - - request, entries := filesOfInterest(resolver, fileMatches) - if err := getContents(resolver, request); err != nil { + entries, err := c.getPythonPackageEntries(resolver) + if err != nil { return nil, err } - var pkgs []pkg.Package + var packages []pkg.Package for _, entry := range entries { p, err := c.catalogEggOrWheel(entry) if err != nil { return nil, fmt.Errorf("unable to catalog python package=%+v: %w", entry.Metadata.Location.Path, err) } if p != nil { - pkgs = append(pkgs, *p) + packages = append(packages, *p) } } - return pkgs, nil + return packages, nil } -type FileData struct { - Location source.Location - Contents string -} +func (c *PackageCataloger) getPythonPackageEntries(resolver source.Resolver) ([]*pythonPackageData, error) { + var metadataLocations []source.Location -type pythonEntry struct { - Metadata FileData - FileRecord *FileData - TopPackage *FileData -} - -func filesOfInterest(resolver source.FileResolver, metadataLocations []source.Location) (map[source.Location]*FileData, []*pythonEntry) { - var request = make(map[source.Location]*FileData) - var entries []*pythonEntry - for _, metadataLocation := range metadataLocations { + // find all primary record paths + matches, err := resolver.FilesByGlob(eggMetadataGlob, wheelMetadataGlob) + if err != nil { + return nil, fmt.Errorf("failed to find files by glob: %w", err) + } + metadataLocations = append(metadataLocations, matches...) + // for every primary record path, craft all secondary record paths and build a request object to gather all file contents for each record + var requester = source.NewContentRequester() + var entries = make([]*pythonPackageData, len(metadataLocations)) + for i, metadataLocation := range metadataLocations { // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory // or for an image... 
for an image the METADATA file may be present within multiple layers, so it is important - // to reconcile the RECORD path to the same layer (or the next adjacent lower layer). + // to reconcile the RECORD path to the same layer (or the next adjacent lower layer). The same is true with + // the top_level.txt file. // lets find the RECORD file relative to the directory where the METADATA file resides (in path AND layer structure) recordPath := filepath.Join(filepath.Dir(metadataLocation.Path), "RECORD") @@ -91,52 +85,39 @@ func filesOfInterest(resolver source.FileResolver, metadataLocations []source.Lo topLevelPath := filepath.Join(parentDir, "top_level.txt") topLevelLocation := resolver.RelativeFileByPath(metadataLocation, topLevelPath) - entry := &pythonEntry{ - Metadata: FileData{ + // build an entry that will later be populated with contents when the request is executed + entry := &pythonPackageData{ + Metadata: source.FileData{ Location: metadataLocation, }, } - request[entry.Metadata.Location] = &entry.Metadata + requester.Add(&entry.Metadata) if recordLocation != nil { - entry.FileRecord = &FileData{ + entry.FileRecord = &source.FileData{ Location: *recordLocation, } - request[entry.FileRecord.Location] = entry.FileRecord + requester.Add(entry.FileRecord) } if topLevelLocation != nil { - entry.TopPackage = &FileData{ + entry.TopPackage = &source.FileData{ Location: *topLevelLocation, } - request[entry.TopPackage.Location] = entry.TopPackage + requester.Add(entry.TopPackage) } - entries = append(entries, entry) - } - return request, entries -} - -func getContents(resolver source.ContentResolver, request map[source.Location]*FileData) error { - var locations []source.Location - for l := range request { - locations = append(locations, l) + // keep the entry for processing later + entries[i] = entry } - response, err := resolver.MultipleFileContentsByLocation(locations) - if err != nil { - return err - } - - for l, contents := range response { - 
request[l].Contents = contents - } - return nil + // return the set of entries and execute the request for fetching contents + return entries, requester.Execute(resolver) } // catalogEggOrWheel takes the primary metadata file reference and returns the python package it represents. -func (c *PackageCataloger) catalogEggOrWheel(entry *pythonEntry) (*pkg.Package, error) { +func (c *PackageCataloger) catalogEggOrWheel(entry *pythonPackageData) (*pkg.Package, error) { metadata, sources, err := c.assembleEggOrWheelMetadata(entry) if err != nil { return nil, err @@ -161,7 +142,7 @@ func (c *PackageCataloger) catalogEggOrWheel(entry *pythonEntry) (*pkg.Package, } // assembleEggOrWheelMetadata discovers and accumulates python package metadata from multiple file sources and returns a single metadata object as well as a list of files where the metadata was derived from. -func (c *PackageCataloger) assembleEggOrWheelMetadata(entry *pythonEntry) (*pkg.PythonPackageMetadata, []source.Location, error) { +func (c *PackageCataloger) assembleEggOrWheelMetadata(entry *pythonPackageData) (*pkg.PythonPackageMetadata, []source.Location, error) { var sources = []source.Location{entry.Metadata.Location} metadata, err := parseWheelOrEggMetadata(entry.Metadata.Location.Path, strings.NewReader(entry.Metadata.Contents)) @@ -189,7 +170,7 @@ func (c *PackageCataloger) assembleEggOrWheelMetadata(entry *pythonEntry) (*pkg. } // fetchRecordFiles finds a corresponding RECORD file for the given python package metadata file and returns the set of file records contained. -func (c *PackageCataloger) fetchRecordFiles(entry *FileData) (files []pkg.PythonFileRecord, sources []source.Location, err error) { +func (c *PackageCataloger) fetchRecordFiles(entry *source.FileData) (files []pkg.PythonFileRecord, sources []source.Location, err error) { // we've been given a file reference to a specific wheel METADATA file. note: this may be for a directory // or for an image... 
for an image the METADATA file may be present within multiple layers, so it is important // to reconcile the RECORD path to the same layer (or the next adjacent lower layer). @@ -209,9 +190,8 @@ func (c *PackageCataloger) fetchRecordFiles(entry *FileData) (files []pkg.Python } // fetchTopLevelPackages finds a corresponding top_level.txt file for the given python package metadata file and returns the set of package names contained. -func (c *PackageCataloger) fetchTopLevelPackages(entry *FileData) (pkgs []string, sources []source.Location, err error) { +func (c *PackageCataloger) fetchTopLevelPackages(entry *source.FileData) (pkgs []string, sources []source.Location, err error) { if entry == nil { - // TODO log.Warnf("missing python package top_level.txt (package=!!)") return nil, nil, nil } diff --git a/syft/cataloger/python/package_cataloger_test.go b/syft/cataloger/python/package_cataloger_test.go index 2ad16da73..c94b7f564 100644 --- a/syft/cataloger/python/package_cataloger_test.go +++ b/syft/cataloger/python/package_cataloger_test.go @@ -8,12 +8,15 @@ import ( "strings" "testing" + "github.com/anchore/syft/internal/file" + "github.com/anchore/syft/syft/source" "github.com/anchore/syft/syft/pkg" "github.com/go-test/deep" ) +// TODO: make this generic (based on maps of source.FileData) and make a generic mock to move to the source pkg type pythonTestResolverMock struct { metadataReader io.Reader recordReader io.Reader @@ -68,21 +71,21 @@ func newTestResolver(metaPath, recordPath, topPath string) *pythonTestResolverMo } } -func (r *pythonTestResolverMock) FileContentsByLocation(ref source.Location) (string, error) { +func (r *pythonTestResolverMock) FileContentsByLocation(location source.Location) (string, error) { switch { - case r.topLevelRef != nil && ref.Path == r.topLevelRef.Path: + case r.topLevelRef != nil && location.Path == r.topLevelRef.Path: b, err := ioutil.ReadAll(r.topLevelReader) if err != nil { return "", err } return string(b), nil - case 
ref.Path == r.metadataRef.Path: + case location.Path == r.metadataRef.Path: b, err := ioutil.ReadAll(r.metadataReader) if err != nil { return "", err } return string(b), nil - case ref.Path == r.recordRef.Path: + case location.Path == r.recordRef.Path: b, err := ioutil.ReadAll(r.recordReader) if err != nil { return "", err @@ -92,16 +95,36 @@ func (r *pythonTestResolverMock) FileContentsByLocation(ref source.Location) (st return "", fmt.Errorf("invalid value given") } -func (r *pythonTestResolverMock) MultipleFileContentsByLocation(_ []source.Location) (map[source.Location]string, error) { - return nil, fmt.Errorf("not implemented") +func (r *pythonTestResolverMock) MultipleFileContentsByLocation(locations []source.Location) (map[source.Location]string, error) { + var results = make(map[source.Location]string) + var err error + for _, l := range locations { + results[l], err = r.FileContentsByLocation(l) + if err != nil { + return nil, err + } + } + + return results, nil } func (r *pythonTestResolverMock) FilesByPath(_ ...string) ([]source.Location, error) { return nil, fmt.Errorf("not implemented") } -func (r *pythonTestResolverMock) FilesByGlob(_ ...string) ([]source.Location, error) { - return nil, fmt.Errorf("not implemented") +func (r *pythonTestResolverMock) FilesByGlob(patterns ...string) ([]source.Location, error) { + var results []source.Location + for _, pattern := range patterns { + for _, l := range []*source.Location{r.topLevelRef, r.metadataRef, r.recordRef} { + if l == nil { + continue + } + if file.GlobMatch(pattern, l.Path) { + results = append(results, *l) + } + } + } + return results, nil } func (r *pythonTestResolverMock) RelativeFileByPath(_ source.Location, path string) *source.Location { switch { @@ -224,14 +247,16 @@ func TestPythonPackageWheelCataloger(t *testing.T) { } // end patching expected values with runtime data... 
- pyPkgCataloger := NewPythonPackageCataloger() - - actual, err := pyPkgCataloger.catalogEggOrWheel(resolver, *resolver.metadataRef) + actual, err := NewPythonPackageCataloger().Catalog(resolver) if err != nil { t.Fatalf("failed to catalog python package: %+v", err) } - for _, d := range deep.Equal(actual, &test.ExpectedPackage) { + if len(actual) != 1 { + t.Fatalf("unexpected length: %d", len(actual)) + } + + for _, d := range deep.Equal(actual[0], test.ExpectedPackage) { t.Errorf("diff: %+v", d) } }) diff --git a/syft/source/content_requester.go b/syft/source/content_requester.go new file mode 100644 index 000000000..ea7b7b5a8 --- /dev/null +++ b/syft/source/content_requester.go @@ -0,0 +1,48 @@ +package source + +import "sync" + +type ContentRequester struct { + request map[Location][]*FileData + lock sync.Mutex +} + +func NewContentRequester(data ...*FileData) *ContentRequester { + requester := &ContentRequester{ + request: make(map[Location][]*FileData), + } + for _, d := range data { + requester.Add(d) + } + return requester +} + +func (b *ContentRequester) Add(data *FileData) { + b.lock.Lock() + defer b.lock.Unlock() + b.request[data.Location] = append(b.request[data.Location], data) +} + +func (b *ContentRequester) Execute(resolver ContentResolver) error { + b.lock.Lock() + defer b.lock.Unlock() + + var locations = make([]Location, len(b.request)) + idx := 0 + for l := range b.request { + locations[idx] = l + idx++ + } + + response, err := resolver.MultipleFileContentsByLocation(locations) + if err != nil { + return err + } + + for l, contents := range response { + for i := range b.request[l] { + b.request[l][i].Contents = contents + } + } + return nil +} diff --git a/syft/source/file_data.go b/syft/source/file_data.go new file mode 100644 index 000000000..843acc4a5 --- /dev/null +++ b/syft/source/file_data.go @@ -0,0 +1,6 @@ +package source + +type FileData struct { + Location Location + Contents string +}