syft/syft/internal/fileresolver/directory_indexer.go

package fileresolver

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path"
	"path/filepath"
	"strings"

	"github.com/wagoodman/go-partybus"
	"github.com/wagoodman/go-progress"

	"github.com/anchore/stereoscope/pkg/file"
	"github.com/anchore/stereoscope/pkg/filetree"
	"github.com/anchore/syft/internal"
	"github.com/anchore/syft/internal/bus"
	"github.com/anchore/syft/internal/log"
	"github.com/anchore/syft/syft/event"
	"github.com/anchore/syft/syft/internal/windows"
)

type PathIndexVisitor func(string, os.FileInfo, error) error

type directoryIndexer struct {
	path              string
	base              string
	pathIndexVisitors []PathIndexVisitor
	errPaths          map[string]error
	tree              filetree.ReadWriter
	index             filetree.Index
}

func newDirectoryIndexer(path, base string, visitors ...PathIndexVisitor) *directoryIndexer {
	i := &directoryIndexer{
		path:              path,
		base:              base,
		tree:              filetree.New(),
		index:             filetree.NewIndex(),
		pathIndexVisitors: append([]PathIndexVisitor{requireFileInfo, disallowByFileType, disallowUnixSystemRuntimePath}, visitors...),
		errPaths:          make(map[string]error),
	}

	// these additional stateful visitors should be the first thing considered when walking / indexing
	i.pathIndexVisitors = append(
		[]PathIndexVisitor{
			i.disallowRevisitingVisitor,
			i.disallowFileAccessErr,
		},
		i.pathIndexVisitors...,
	)

	return i
}

func (r *directoryIndexer) build() (filetree.Reader, filetree.IndexReader, error) {
	return r.tree, r.index, indexAllRoots(r.path, r.indexTree)
}

func indexAllRoots(root string, indexer func(string, *progress.Stage) ([]string, error)) error {
	// why account for multiple roots? To cover cases when there is a symlink that references above the root path,
	// in which case we need to additionally index where the link resolves to. it's for this reason why the filetree
	// must be relative to the root of the filesystem (and not just relative to the given path).
	pathsToIndex := []string{root}
	fullPathsMap := map[string]struct{}{}

	stager, prog := indexingProgress(root)
	defer prog.SetCompleted()
loop:
	for {
		var currentPath string
		switch len(pathsToIndex) {
		case 0:
			break loop
		case 1:
			currentPath, pathsToIndex = pathsToIndex[0], nil
		default:
			currentPath, pathsToIndex = pathsToIndex[0], pathsToIndex[1:]
		}

		additionalRoots, err := indexer(currentPath, stager)
		if err != nil {
			return fmt.Errorf("unable to index filesystem path=%q: %w", currentPath, err)
		}

		for _, newRoot := range additionalRoots {
			if _, ok := fullPathsMap[newRoot]; !ok {
				fullPathsMap[newRoot] = struct{}{}
				pathsToIndex = append(pathsToIndex, newRoot)
			}
		}
	}

	return nil
}

func (r *directoryIndexer) indexTree(root string, stager *progress.Stage) ([]string, error) {
	log.WithFields("path", root).Trace("indexing filetree")

	var roots []string
	var err error

	root, err = filepath.Abs(root)
	if err != nil {
		return nil, err
	}

	// we want to be able to index single files with the directory resolver. However, we should also allow for attempting
	// to index paths that do not exist (that is, a root that does not exist is not an error case that should stop indexing).
	// For this reason we look for an opportunity to discover if the given root is a file, and if so add a single root,
	// but continue forth with index regardless if the given root path exists or not.
	fi, err := os.Stat(root)
	if err != nil && fi != nil && !fi.IsDir() {
		// note: we want to index the path regardless of an error stat-ing the path
		newRoot, _ := r.indexPath(root, fi, nil)
		if newRoot != "" {
			roots = append(roots, newRoot)
		}
		return roots, nil
	}

	shouldIndexFullTree, err := isRealPath(root)
	if err != nil {
		return nil, err
	}

	if !shouldIndexFullTree {
		newRoots, err := r.indexBranch(root, stager)
		if err != nil {
			return nil, fmt.Errorf("unable to index branch=%q: %w", root, err)
		}

		roots = append(roots, newRoots...)

		return roots, nil
	}

	err = filepath.Walk(root,
		func(path string, info os.FileInfo, err error) error {
			stager.Current = path

			newRoot, err := r.indexPath(path, info, err)

			if err != nil {
				return err
			}

			if newRoot != "" {
				roots = append(roots, newRoot)
			}

			return nil
		})

	if err != nil {
		return nil, fmt.Errorf("unable to index root=%q: %w", root, err)
	}

	return roots, nil
}

func isRealPath(root string) (bool, error) {
	rootParent := filepath.Clean(filepath.Dir(root))

	realRootParent, err := filepath.EvalSymlinks(rootParent)
	if err != nil {
		return false, err
	}

	realRootParent = filepath.Clean(realRootParent)

	return rootParent == realRootParent, nil
}

func (r *directoryIndexer) indexBranch(root string, stager *progress.Stage) ([]string, error) {
	rootRealPath, err := filepath.EvalSymlinks(root)
	if err != nil {
		return nil, err
	}

	// there is a symlink within the path to the root, we need to index the real root parent first
	// then capture the symlinks to the root path
	roots, err := r.indexTree(rootRealPath, stager)
	if err != nil {
		return nil, fmt.Errorf("unable to index real root=%q: %w", rootRealPath, err)
	}

	// walk down all ancestor paths and shallow-add non-existing elements to the tree
	for idx, p := range allContainedPaths(root) {
		var targetPath string
		if idx != 0 {
			parent := path.Dir(p)
			cleanParent, err := filepath.EvalSymlinks(parent)
			if err != nil {
				return nil, fmt.Errorf("unable to evaluate symlink for contained path parent=%q: %w", parent, err)
			}
			targetPath = filepath.Join(cleanParent, filepath.Base(p))
		} else {
			targetPath = p
		}

		stager.Current = targetPath

		lstat, err := os.Lstat(targetPath)
		newRoot, err := r.indexPath(targetPath, lstat, err)
		if err != nil && !errors.Is(err, ErrSkipPath) && !errors.Is(err, fs.SkipDir) {
			return nil, fmt.Errorf("unable to index ancestor path=%q: %w", targetPath, err)
		}
		if newRoot != "" {
			roots = append(roots, newRoot)
		}
	}

	return roots, nil
}

func allContainedPaths(p string) []string {
	var all []string
	var currentPath string

	cleanPath := strings.TrimSpace(p)

	if cleanPath == "" {
		return nil
	}

	// iterate through all parts of the path, replacing path elements with link resolutions where possible.
	for idx, part := range strings.Split(filepath.Clean(cleanPath), file.DirSeparator) {
		if idx == 0 && part == "" {
			currentPath = file.DirSeparator
			continue
		}

		// cumulatively gather where we are currently at and provide a rich object
		currentPath = path.Join(currentPath, part)
		all = append(all, currentPath)
	}
	return all
}

func (r *directoryIndexer) indexPath(path string, info os.FileInfo, err error) (string, error) {
	// ignore any path which a filter function returns true
	for _, filterFn := range r.pathIndexVisitors {
		if filterFn == nil {
			continue
		}

		if filterErr := filterFn(path, info, err); filterErr != nil {
			if errors.Is(filterErr, fs.SkipDir) {
				// signal to walk() to skip this directory entirely (even if we're processing a file)
				return "", filterErr
			}
			// skip this path but don't affect walk() trajectory
			return "", nil
		}
	}

	if info == nil {
		// walk may not be able to provide a FileInfo object, don't allow for this to stop indexing; keep track of the paths and continue.
		r.errPaths[path] = fmt.Errorf("no file info observable at path=%q", path)
		return "", nil
	}

	// here we check to see if we need to normalize paths to posix on the way in coming from windows
	if windows.HostRunningOnWindows() {
		path = windows.ToPosix(path)
	}

	newRoot, err := r.addPathToIndex(path, info)
	if r.isFileAccessErr(path, err) {
		return "", nil
	}

	return newRoot, nil
}

func (r *directoryIndexer) disallowFileAccessErr(path string, _ os.FileInfo, err error) error {
	if r.isFileAccessErr(path, err) {
		return ErrSkipPath
	}
	return nil
}

func (r *directoryIndexer) isFileAccessErr(path string, err error) bool {
	// don't allow for errors to stop indexing, keep track of the paths and continue.
	if err != nil {
		log.Warnf("unable to access path=%q: %+v", path, err)
		r.errPaths[path] = err
		return true
	}
	return false
}

func (r directoryIndexer) addPathToIndex(p string, info os.FileInfo) (string, error) {
	switch t := file.TypeFromMode(info.Mode()); t {
	case file.TypeSymLink:
		return r.addSymlinkToIndex(p, info)
	case file.TypeDirectory:
		return "", r.addDirectoryToIndex(p, info)
	case file.TypeRegular:
		return "", r.addFileToIndex(p, info)
	default:
		return "", fmt.Errorf("unsupported file type: %s", t)
	}
}

func (r directoryIndexer) addDirectoryToIndex(p string, info os.FileInfo) error {
	ref, err := r.tree.AddDir(file.Path(p))
	if err != nil {
		return err
	}

	metadata := file.NewMetadataFromPath(p, info)
	r.index.Add(*ref, metadata)

	return nil
}

func (r directoryIndexer) addFileToIndex(p string, info os.FileInfo) error {
	ref, err := r.tree.AddFile(file.Path(p))
	if err != nil {
		return err
	}

	metadata := file.NewMetadataFromPath(p, info)
	r.index.Add(*ref, metadata)

	return nil
}

func (r directoryIndexer) addSymlinkToIndex(p string, info os.FileInfo) (string, error) {
	linkTarget, err := os.Readlink(p)
	if err != nil {
		isOnWindows := windows.HostRunningOnWindows()
		if isOnWindows {
			p = windows.FromPosix(p)
		}

		linkTarget, err = filepath.EvalSymlinks(p)

		if isOnWindows {
			p = windows.ToPosix(p)
		}

		if err != nil {
			return "", fmt.Errorf("unable to readlink for path=%q: %w", p, err)
		}
	}

	if filepath.IsAbs(linkTarget) {
		linkTarget = filepath.Clean(linkTarget)
		// if the link is absolute (e.g, /bin/ls -> /bin/busybox) we need to
		// resolve relative to the root of the base directory, if it is not already
		// prefixed with a volume name
		if filepath.VolumeName(linkTarget) == "" {
			linkTarget = filepath.Join(r.base, filepath.Clean(linkTarget))
		}
	} else {
		// if the link is not absolute (e.g, /dev/stderr -> fd/2 ) we need to
		// resolve it relative to the directory in question (e.g. resolve to
		// /dev/fd/2)
		if r.base == "" {
			linkTarget = filepath.Join(filepath.Dir(p), linkTarget)
		} else {
			// if the base is set, then we first need to resolve the link,
			// before finding it's location in the base
			dir, err := filepath.Rel(r.base, filepath.Dir(p))
			if err != nil {
				return "", fmt.Errorf("unable to resolve relative path for path=%q: %w", p, err)
			}
			linkTarget = filepath.Join(r.base, filepath.Clean(filepath.Join("/", dir, linkTarget)))
		}
	}

	ref, err := r.tree.AddSymLink(file.Path(p), file.Path(linkTarget))
	if err != nil {
		return "", err
	}

	targetAbsPath := linkTarget
	if !filepath.IsAbs(targetAbsPath) {
		targetAbsPath = filepath.Clean(filepath.Join(path.Dir(p), linkTarget))
	}

	metadata := file.NewMetadataFromPath(p, info)
	metadata.LinkDestination = linkTarget
	r.index.Add(*ref, metadata)

	// if the target path does not exist, then do not report it as a new root, or try to send
	// syft parsing there.
	if _, err := os.Stat(targetAbsPath); err != nil && errors.Is(err, os.ErrNotExist) {
		log.Debugf("link %s points to unresolved path %s, ignoring target as new root", p, targetAbsPath)
		targetAbsPath = ""
	}

	return targetAbsPath, nil
}

func (r directoryIndexer) hasBeenIndexed(p string) (bool, *file.Metadata) {
	filePath := file.Path(p)
	if !r.tree.HasPath(filePath) {
		return false, nil
	}

	exists, ref, err := r.tree.File(filePath)
	if err != nil || !exists || !ref.HasReference() {
		return false, nil
	}

	// cases like "/" will be in the tree, but not been indexed yet (a special case). We want to capture
	// these cases as new paths to index.
	if !ref.HasReference() {
		return false, nil
	}

	entry, err := r.index.Get(*ref.Reference)
	if err != nil {
		return false, nil
	}

	return true, &entry.Metadata
}

func (r *directoryIndexer) disallowRevisitingVisitor(path string, _ os.FileInfo, _ error) error {
	// this prevents visiting:
	// - link destinations twice, once for the real file and another through the virtual path
	// - infinite link cycles
	if indexed, metadata := r.hasBeenIndexed(path); indexed {
		if metadata.IsDir() {
			// signal to walk() that we should skip this directory entirely
			return fs.SkipDir
		}
		return ErrSkipPath
	}
	return nil
}

func disallowUnixSystemRuntimePath(path string, _ os.FileInfo, _ error) error {
	if internal.HasAnyOfPrefixes(path, unixSystemRuntimePrefixes...) {
		return fs.SkipDir
	}
	return nil
}

func disallowByFileType(_ string, info os.FileInfo, _ error) error {
	if info == nil {
		// we can't filter out by filetype for non-existent files
		return nil
	}
	switch file.TypeFromMode(info.Mode()) {
	case file.TypeCharacterDevice, file.TypeSocket, file.TypeBlockDevice, file.TypeFIFO, file.TypeIrregular:
		return ErrSkipPath
		// note: symlinks that point to these files may still get by.
		// We handle this later in processing to help prevent against infinite links traversal.
	}

	return nil
}

func requireFileInfo(_ string, info os.FileInfo, _ error) error {
	if info == nil {
		return ErrSkipPath
	}
	return nil
}

func indexingProgress(path string) (*progress.Stage, *progress.Manual) {
	stage := &progress.Stage{}
	prog := progress.NewManual(-1)

	bus.Publish(partybus.Event{
		Type:   event.FileIndexingStarted,
		Source: path,
		Value: struct {
			progress.Stager
			progress.Progressable
		}{
			Stager:       progress.Stager(stage),
			Progressable: prog,
		},
	})

	return stage, prog
}