syft/syft/pkg/cataloger/golang/parse_go_mod.go
Keith Zantow a400c675fc
feat: license file search (#4327)
Signed-off-by: Keith Zantow <kzantow@gmail.com>
2025-11-03 14:16:05 -05:00

530 lines
18 KiB
Go

package golang
import (
"bufio"
"context"
"fmt"
"go/build"
"io"
"path/filepath"
"slices"
"sort"
"strings"
"github.com/spf13/afero"
"golang.org/x/mod/modfile"
"golang.org/x/tools/go/packages"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/internal/unknown"
"github.com/anchore/syft/syft/artifact"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/internal/fileresolver"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/generic"
)
// goModCataloger catalogs Go modules starting from a go.mod file.
type goModCataloger struct {
// licenseResolver looks up licenses for modules that are known only from
// go.mod require/replace entries (see createGoModPackages and applyReplaceDirectives).
licenseResolver goLicenseResolver
}
// newGoModCataloger builds a goModCataloger whose license resolver is created
// from the given cataloger configuration.
func newGoModCataloger(opts CatalogerConfig) *goModCataloger {
	cataloger := new(goModCataloger)
	cataloger.licenseResolver = newGoLicenseResolver(modFileCatalogerName, opts)
	return cataloger
}
// parseGoModFile catalogs the Go modules described by a go.mod file.
//
// Two sources are combined: modules discovered by loading the source with the
// go toolchain (loadPackages/catalogModules), and modules listed in the go.mod
// file itself. go.mod entries are only used for modules that source analysis did
// not find; replace and exclude directives are then applied to those entries.
//
// It returns the sorted packages, the module-to-module relationships found by
// source analysis, and any "unknown" errors collected while loading packages.
// A go.mod that cannot be read or parsed yields a nil result and the error.
func (c *goModCataloger) parseGoModFile(ctx context.Context, resolver file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
	// go.sum digests are best-effort: a failure is logged and cataloging continues
	digests, sumErr := parseGoSumFile(resolver, reader)
	if sumErr != nil {
		log.Debugf("unable to get go.sum: %v", sumErr)
	}

	// a scan root is only known when the resolver is a directory resolver
	var scanRoot string
	if dirResolver, isDir := resolver.(*fileresolver.Directory); isDir && dirResolver != nil {
		scanRoot = dirResolver.Chroot.Base()
	}

	// source analysis using the go toolchain, if available
	modDir := filepath.Dir(string(reader.Location.Reference().RealPath))
	srcPkgs, srcModules, srcDeps, unknownErr := c.loadPackages(modDir, reader.Location)
	srcCataloged, srcModuleToPkg := c.catalogModules(ctx, scanRoot, srcPkgs, srcModules, reader, digests)
	relationships := buildModuleRelationships(srcCataloged, srcDeps, srcModuleToPkg)

	// base case: parse the go.mod file itself
	modFile, err := c.parseModFileContents(reader)
	if err != nil {
		return nil, nil, err
	}

	// only go.mod entries NOT found by source analysis become packages here
	fromGoMod := c.createGoModPackages(ctx, resolver, modFile, srcModules, reader, digests)
	c.applyReplaceDirectives(ctx, resolver, modFile, fromGoMod, reader, digests)
	c.applyExcludeDirectives(modFile, fromGoMod)

	return c.assembleResults(srcCataloged, fromGoMod), relationships, unknownErr
}
// loadPackages uses golang.org/x/tools/go/packages to load every package reachable
// from the module rooted at modDir and derives module and dependency information.
//
// A failure of packages.Load itself is only logged. Errors attached to individual
// root packages are logged and recorded as "unknown" errors against loc, then the
// loaded packages are handed to visitPackages, whose results are returned as-is.
func (c *goModCataloger) loadPackages(modDir string, loc file.Location) (pkgs map[string][]pkgInfo, modules map[string]*packages.Module, dependencies map[string][]string, unknownErr error) {
	cfg := &packages.Config{
		// Mode flags control what information is loaded for each package.
		// Performance impact increases significantly with each additional flag:
		//
		// packages.NeedModule - Required for module metadata (path, version, replace directives).
		// Essential for SBOM generation. Minimal performance impact.
		//
		// packages.NeedName - Required for package names & package Path. Minimal performance impact.
		// Needed to identify packages and filter out standard library packages.
		//
		// packages.NeedFiles - Loads source file paths for each package.
		// Moderate performance impact as it requires filesystem traversal.
		// Required for license discovery.
		//
		// packages.NeedDeps - Loads the dependency graph between packages.
		// High performance impact as it builds the complete import graph.
		// Critical for generating accurate dependency relationships in SBOM.
		//
		// packages.NeedImports - Loads import information for each package.
		// High performance impact, especially with large codebases.
		// Required for building module-to-module dependency mappings.
		//
		// Adding flags like NeedTypes, NeedSyntax, or NeedTypesInfo would dramatically
		// increase memory usage and processing time (10x+ slower) but are not needed
		// for SBOM generation as we only require dependency and module metadata.
		Mode:  packages.NeedModule | packages.NeedName | packages.NeedFiles | packages.NeedDeps | packages.NeedImports,
		Dir:   modDir,
		Tests: true,
	}

	// From Go documentation: "all" expands to all packages in the main module
	// and their dependencies, including dependencies needed by tests.
	//
	// The special pattern "all" specifies all the active modules,
	// first the main module and then dependencies sorted by module path.
	// A pattern containing "..." specifies the active modules whose module paths match the pattern.
	// On implementation we could not find a test case that differentiated between all and ...
	// There may be a case where ... is non inclusive so we default to all for the inclusive guarantee
	rootPkgs, err := packages.Load(cfg, "all")
	if err != nil {
		log.Debugf("error loading packages: %v", err)
	}

	// Log and record per-package load errors, but keep processing. The error that is
	// recorded must be the package's own error (e), not the result of packages.Load
	// (err), which is typically nil whenever there are root packages to iterate.
	for _, p := range rootPkgs {
		for _, e := range p.Errors {
			log.Debugf("package load error for %s: %v", p.PkgPath, e)
			unknownErr = unknown.Append(unknownErr, loc, e)
		}
	}

	// note: dependencies have already pruned local imports and only focuses on module => module dependencies
	return c.visitPackages(rootPkgs, loc, unknownErr)
}
// pkgInfo describes one Go package found during source analysis together with
// the module it belongs to. Values are compared with == when de-duplicating
// the per-module package lists in visitPackages.
type pkgInfo struct {
// pkgPath is the import path of the package.
pkgPath string
// modulePath is the path of the module that provides the package
// (after any replace directive has been applied, see newModule).
modulePath string
// pkgDir is the directory containing the package's source code.
pkgDir string
// moduleDir is the directory containing the module's source code; it may be empty.
moduleDir string
}
// visitPackages walks the import graph starting at rootPkgs and collects, per module:
//   - pkgs: the packages seen inside that module (keyed by the effective module path);
//     these are later used to find license files for modules with several licenses
//   - modules: the effective module (replace applied) keyed by the ORIGINAL module path
//   - dependencies: module => imported-module paths (imports within the same module are dropped)
//
// uke carries unknown errors collected earlier; errors attached to visited packages
// are appended to it (against loc) and the combined value is returned as unknownErr.
func (c *goModCataloger) visitPackages(
	rootPkgs []*packages.Package,
	loc file.Location,
	uke error,
) (pkgs map[string][]pkgInfo, modules map[string]*packages.Module, dependencies map[string][]string, unknownErr error) {
	// persist unknown errs from previous parts of the catalog
	unknownErr = uke

	pkgs = map[string][]pkgInfo{}
	modules = map[string]*packages.Module{}
	dependencies = map[string][]string{}

	// the returned bool tells packages.Visit whether to descend into p's imports
	visit := func(p *packages.Package) bool {
		// a package with load errors is recorded and its imports are not followed
		if len(p.Errors) > 0 {
			for _, loadErr := range p.Errors {
				unknownErr = unknown.Append(unknownErr, loc, loadErr)
			}
			return false
		}

		// skip for common causes (no module info, standard library)
		if shouldSkipVisit(p) {
			return false
		}

		// NOTE: no ignore-path filtering happens here. Ignoring a package should not
		// stop the walk, since the full picture is needed for license discovery.

		// without any source file there is nothing to record, but imports are still walked
		pkgDir := resolvePkgDir(p)
		if pkgDir == "" {
			return true
		}

		module := newModule(p.Module)

		// An empty module.Dir does not stop processing: dependencies and package
		// metadata are still collected, only license discovery is affected.
		// A known cause is vendoring, where some module metadata is lost; other
		// causes (e.g. replace directives pointing at inaccessible paths) are logged.
		if module.Dir == "" && !strings.Contains(pkgDir, "/vendor/") {
			log.Debugf("module %s does not have dir and it's not vendored", module.Path)
		}

		// extract module => module dependencies
		for _, imp := range p.Imports {
			if imp.Module == nil || imp.Module.Path == module.Path {
				continue
			}
			dependencies[module.Path] = append(dependencies[module.Path], imp.Module.Path)
		}

		entry := pkgInfo{
			pkgPath:    p.PkgPath,
			modulePath: module.Path,
			pkgDir:     pkgDir,
			moduleDir:  module.Dir,
		}
		if known := pkgs[module.Path]; !slices.Contains(known, entry) { // avoid duplicates
			pkgs[module.Path] = append(known, entry)
		}

		modules[p.Module.Path] = module
		return true
	}

	packages.Visit(rootPkgs, visit, nil)

	return pkgs, modules, dependencies, unknownErr
}
// catalogModules creates one syft package per Go module found by the go toolchain.
//
// Modules whose path is "main" or a relative import are skipped. Licenses are
// resolved from the module's packages on the OS filesystem, and the go.sum digest
// for "<path> <version>" (if any) is attached as source metadata.
//
// It returns the created packages and a lookup from module path to package.
func (c *goModCataloger) catalogModules(
	ctx context.Context,
	scanRoot string,
	pkgs map[string][]pkgInfo,
	modules map[string]*packages.Module,
	reader file.LocationReadCloser,
	digests map[string]string,
) ([]pkg.Package, map[string]artifact.Identifiable) {
	cataloged := make([]pkg.Package, 0, len(modules))
	byModulePath := make(map[string]artifact.Identifiable)

	for _, mod := range modules {
		// Relative-import modules are already accounted for under their full module
		// paths elsewhere in syft's cataloging (e.g. ../../ seen from a nested go.mod is
		// covered when the parent go.mod is parsed). Local modules using the "main"
		// heuristic (no module naming, sometimes common before go module support)
		// are not built as syft packages either.
		if isRelativeImportOrMain(mod.Path) {
			continue
		}

		// licenses are looked up out of source, so those locations are NOT included in the SBOM
		licenses := resolveModuleLicenses(ctx, scanRoot, pkgs[mod.Path], afero.NewOsFs())
		digestKey := fmt.Sprintf("%s %s", mod.Path, mod.Version)

		p := pkg.Package{
			Name:      mod.Path,
			Version:   mod.Version,
			Locations: file.NewLocationSet(reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)),
			Licenses:  licenses,
			Language:  pkg.Go,
			Type:      pkg.GoModulePkg,
			PURL:      packageURL(mod.Path, mod.Version),
			Metadata:  createSourceMetadata(digests[digestKey]),
		}
		p.SetID()

		byModulePath[mod.Path] = p
		cataloged = append(cataloged, p)
	}

	return cataloged, byModulePath
}
// buildModuleRelationships creates dependency-of relationships between Go modules.
//
// For every package in syftPkgs, each entry of dependencies[package name] that has
// a package in moduleToPkg yields one relationship "dependency -> dependent" of type
// DependencyOfRelationship. Self references and repeated pairs are skipped.
func buildModuleRelationships(
	syftPkgs []pkg.Package,
	dependencies map[string][]string,
	moduleToPkg map[string]artifact.Identifiable,
) []artifact.Relationship {
	var relationships []artifact.Relationship
	emitted := map[string]struct{}{}

	for _, parent := range syftPkgs {
		for _, depPath := range dependencies[parent.Name] {
			if depPath == parent.Name {
				continue
			}

			dep, known := moduleToPkg[depPath]
			if !known {
				continue
			}

			// one relationship per (parent, dependency) pair
			pairKey := string(parent.ID()) + string(dep.ID())
			if _, dup := emitted[pairKey]; dup {
				continue
			}
			emitted[pairKey] = struct{}{}

			relationships = append(relationships, artifact.Relationship{
				From: dep,    // dep
				To:   parent, // parent
				Type: artifact.DependencyOfRelationship,
			})
		}
	}

	return relationships
}
// parseModFileContents reads the whole go.mod from reader and parses it with
// modfile.Parse, using the reader's real path as the file name in parse errors.
func (c *goModCataloger) parseModFileContents(reader file.LocationReadCloser) (*modfile.File, error) {
	raw, readErr := io.ReadAll(reader)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read go module: %w", readErr)
	}

	parsed, parseErr := modfile.Parse(reader.RealPath, raw, nil)
	if parseErr != nil {
		return nil, fmt.Errorf("failed to parse go module: %w", parseErr)
	}

	return parsed, nil
}
// createGoModPackages builds a package for every go.mod require entry that source
// analysis did not already find. De-duplication against source analysis is done by
// checking whether the required module path is a key of sourceModules.
// The result is keyed by module path.
func (c *goModCataloger) createGoModPackages(ctx context.Context, resolver file.Resolver, modFile *modfile.File, sourceModules map[string]*packages.Module, reader file.LocationReadCloser, digests map[string]string) map[string]pkg.Package {
	result := make(map[string]pkg.Package)

	for _, req := range modFile.Require {
		modPath, modVersion := req.Mod.Path, req.Mod.Version

		// already cataloged from source
		if _, fromSource := sourceModules[modPath]; fromSource {
			continue
		}

		lics := c.licenseResolver.getLicenses(ctx, resolver, modPath, modVersion)

		p := pkg.Package{
			Name:      modPath,
			Version:   modVersion,
			Licenses:  pkg.NewLicenseSet(lics...),
			Locations: file.NewLocationSet(reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)),
			PURL:      packageURL(modPath, modVersion),
			Language:  pkg.Go,
			Type:      pkg.GoModulePkg,
			Metadata: pkg.GolangModuleEntry{
				H1Digest: digests[fmt.Sprintf("%s %s", modPath, modVersion)],
			},
		}
		p.SetID()

		result[modPath] = p
	}

	return result
}
// applyReplaceDirectives applies the go.mod replace directives to goModPackages in place.
//
// When the replacement is another module path, the old entry is removed and a package
// named after the new path is stored. When the replacement starts with "." or "/"
// (a filesystem path), the package keeps the old module path as its name. In both cases
// the version and license lookup use the replacement's path and version.
func (c *goModCataloger) applyReplaceDirectives(ctx context.Context, resolver file.Resolver, modFile *modfile.File, goModPackages map[string]pkg.Package, reader file.LocationReadCloser, digests map[string]string) {
	for _, rep := range modFile.Replace {
		lics := c.licenseResolver.getLicenses(ctx, resolver, rep.New.Path, rep.New.Version)

		// default to the original name; switch to the new path for module replacements
		name := rep.Old.Path
		pointsAtFilesystem := strings.HasPrefix(rep.New.Path, ".") || strings.HasPrefix(rep.New.Path, "/")
		if !pointsAtFilesystem {
			name = rep.New.Path
			delete(goModPackages, rep.Old.Path)
		}

		replaced := pkg.Package{
			Name:      name,
			Version:   rep.New.Version,
			Licenses:  pkg.NewLicenseSet(lics...),
			Locations: file.NewLocationSet(reader.WithAnnotation(pkg.EvidenceAnnotationKey, pkg.PrimaryEvidenceAnnotation)),
			PURL:      packageURL(name, rep.New.Version),
			Language:  pkg.Go,
			Type:      pkg.GoModulePkg,
			Metadata: pkg.GolangModuleEntry{
				H1Digest: digests[fmt.Sprintf("%s %s", name, rep.New.Version)],
			},
		}
		replaced.SetID()

		goModPackages[name] = replaced
	}
}
// applyExcludeDirectives removes packages from goModPackages that are excluded by
// the go.mod exclude directives.
//
// An exclude directive names a specific module version, so a package is only removed
// when both its module path and its version match the directive. A module that is
// required at a different version than the excluded one is kept.
func (c *goModCataloger) applyExcludeDirectives(modFile *modfile.File, goModPackages map[string]pkg.Package) {
	for _, ex := range modFile.Exclude {
		p, found := goModPackages[ex.Mod.Path]
		if !found || p.Version != ex.Mod.Version {
			continue
		}
		delete(goModPackages, ex.Mod.Path)
	}
}
// assembleResults merges the packages found by source analysis with the packages
// derived from go.mod and returns them stably sorted by package name.
// The result is always a non-nil slice.
func (c *goModCataloger) assembleResults(catalogedPkgs []pkg.Package, goModPackages map[string]pkg.Package) []pkg.Package {
	combined := make([]pkg.Package, 0, len(catalogedPkgs)+len(goModPackages))
	combined = append(combined, catalogedPkgs...)
	for modPath := range goModPackages {
		combined = append(combined, goModPackages[modPath])
	}

	byName := func(a, b int) bool {
		return combined[a].Name < combined[b].Name
	}
	sort.SliceStable(combined, byName)

	return combined
}
// parseGoSumFile reads the go.sum that sits next to the given go.mod and returns
// a map from "<module path> <version>" to the hash recorded on that line.
//
// The go.sum path is derived from the go.mod path by swapping the ".mod" suffix for
// ".sum" and is resolved relative to the go.mod location. An error is returned when
// no resolver is given, the file cannot be resolved or opened, or reading it fails;
// in the last case the entries parsed so far are returned alongside the error.
func parseGoSumFile(resolver file.Resolver, reader file.LocationReadCloser) (map[string]string, error) {
	out := map[string]string{}

	if resolver == nil {
		return out, fmt.Errorf("no resolver provided")
	}

	goSumPath := strings.TrimSuffix(reader.RealPath, ".mod") + ".sum"
	goSumLocation := resolver.RelativeFileByPath(reader.Location, goSumPath)
	if goSumLocation == nil {
		return nil, fmt.Errorf("unable to resolve: %s", goSumPath)
	}

	contents, err := resolver.FileContentsByLocation(*goSumLocation)
	if err != nil {
		return nil, err
	}
	defer internal.CloseAndLogError(contents, goSumLocation.AccessPath)

	// go.sum has the format like:
	// github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
	// github.com/BurntSushi/toml v0.4.1 h1:GaI7EiDXDRfa8VshkTj7Fym7ha+y8/XxIgD2okUIjLw=
	// github.com/BurntSushi/toml v0.4.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
	scanner := bufio.NewScanner(contents)
	for scanner.Scan() {
		// Fields tolerates repeated or surrounding whitespace; lines with fewer
		// than three fields are not go.sum entries and are ignored
		parts := strings.Fields(scanner.Text())
		if len(parts) < 3 {
			continue
		}
		out[fmt.Sprintf("%s %s", parts[0], parts[1])] = parts[2]
	}

	// Scan returning false can mean a read failure or an over-long line, not just EOF
	if err := scanner.Err(); err != nil {
		return out, fmt.Errorf("unable to read %s: %w", goSumPath, err)
	}

	return out, nil
}
// createSourceMetadata builds the metadata entry for a package found through source
// analysis: the given go.sum digest plus the OS, architecture, build tags
// (comma-joined) and cgo setting taken from build.Default.
func createSourceMetadata(h1Digest string) pkg.GolangSourceEntry {
	buildCtx := &build.Default

	var entry pkg.GolangSourceEntry
	entry.H1Digest = h1Digest
	entry.OperatingSystem = buildCtx.GOOS
	entry.Architecture = buildCtx.GOARCH
	entry.BuildTags = strings.Join(buildCtx.BuildTags, ",")
	entry.CgoEnabled = buildCtx.CgoEnabled
	return entry
}
// resolvePkgDir returns the directory of the package's first known file, checking
// GoFiles, then CompiledGoFiles, then OtherFiles. It returns "" when the package
// lists no files at all.
func resolvePkgDir(p *packages.Package) string {
	for _, files := range [][]string{p.GoFiles, p.CompiledGoFiles, p.OtherFiles} {
		if len(files) > 0 {
			return filepath.Dir(files[0])
		}
	}
	return ""
}
// shouldSkipVisit reports whether a package should be ignored during the walk:
// either it carries no module information, or it belongs to the standard library.
func shouldSkipVisit(p *packages.Package) bool {
	return p.Module == nil || isStdLib(p)
}
// isStdLib returns true if this package is part of the Go standard library.
//
// A package counts as standard library when it is named "unsafe", or when its first
// Go file lives under build.Default.GOROOT. Packages without Go files (other than
// "unsafe") and any package when GOROOT is unknown are reported as non-stdlib.
func isStdLib(pkg *packages.Package) bool {
	if pkg.Name == "unsafe" {
		// Special case unsafe stdlib, because it does not contain go files.
		return true
	}
	if len(pkg.GoFiles) == 0 {
		return false
	}

	goroot := build.Default.GOROOT
	if goroot == "" {
		// With an empty GOROOT the prefix below would be just the path separator,
		// which matches every absolute path and would misclassify all packages
		// as standard library.
		return false
	}

	// compare against "<GOROOT><sep>" so that sibling directories sharing the
	// GOROOT name as a prefix do not match
	sep := string(filepath.Separator)
	if !strings.HasSuffix(goroot, sep) {
		goroot += sep
	}
	return strings.HasPrefix(pkg.GoFiles[0], goroot)
}
// newModule returns a copy of the effective module for mod: the replacement module
// when mod has a replace directive, otherwise mod itself. The input is not modified.
//
// Example of a module with replace directive: k8s.io/kubernetes => k8s.io/kubernetes v1.11.1
//
//	{
//	  "Path": "k8s.io/kubernetes",
//	  "Version": "v0.17.9",
//	  "Replace": {
//	    "Path": "k8s.io/kubernetes",
//	    "Version": "v1.11.1",
//	    "Time": "2018-07-17T04:20:29Z",
//	    "Dir": "/home/gongyuan_kubeflow_org/go/pkg/mod/k8s.io/kubernetes@v1.11.1",
//	    "GoMod": "/home/gongyuan_kubeflow_org/go/pkg/mod/cache/download/k8s.io/kubernetes/@v/v1.11.1.mod"
//	  },
//	  "Dir": "/home/gongyuan_kubeflow_org/go/pkg/mod/k8s.io/kubernetes@v1.11.1",
//	  "GoMod": "/home/gongyuan_kubeflow_org/go/pkg/mod/cache/download/k8s.io/kubernetes/@v/v1.11.1.mod"
//	}
//
// The version field in particular must come from the replacement. It has not been
// confirmed, but overriding the entire struct may also be what is needed for replace
// directives that point at local folders.
func newModule(mod *packages.Module) *packages.Module {
	effective := mod
	if mod.Replace != nil {
		effective = mod.Replace
	}

	clone := *effective
	return &clone
}
// isRelativeImportOrMain reports whether p is the literal "main" or a local
// import path as defined by build.IsLocalImport (".", "..", "./...", "../...").
func isRelativeImportOrMain(p string) bool {
	return p == "main" || build.IsLocalImport(p)
}