mirror of
https://github.com/anchore/syft.git
synced 2026-02-14 03:26:41 +01:00
remove inline-compare testing
Signed-off-by: Alex Goodman <alex.goodman@anchore.com>
This commit is contained in:
parent
efcd8a8b9a
commit
68d698e9f2
3
test/inline-compare/.gitignore
vendored
3
test/inline-compare/.gitignore
vendored
@ -1,3 +0,0 @@
|
|||||||
# generated comparison artifacts — never commit these
*.json
*.pyc
inline-reports
|
|
||||||
@ -1,49 +0,0 @@
|
|||||||
# Makefile for comparing syft output against anchore inline-scan output for a
# single container image. The image under test is supplied via COMPARE_IMAGE.

# allow the caller to override how syft is invoked (defaults to running from source)
ifndef SYFT_CMD
SYFT_CMD = go run ../../main.go
endif

# normalize the image reference into a filesystem-friendly name (":" -> "_")
IMAGE_CLEAN = $(shell basename $(COMPARE_IMAGE) | tr ":" "_" )
SYFT_DIR = syft-reports
SYFT_REPORT = $(SYFT_DIR)/$(IMAGE_CLEAN).json
INLINE_DIR = inline-reports
INLINE_REPORT = $(INLINE_DIR)/$(IMAGE_CLEAN)-content-os.json

# sanity guards: fail fast if the report locations are ever unset
ifndef SYFT_DIR
$(error SYFT_DIR is not set)
endif

ifndef INLINE_DIR
$(error INLINE_DIR is not set)
endif

.PHONY: all
.DEFAULT_GOAL :=
# run the full gather+compare cycle for every image (fresh syft reports each time)
all: clean-syft
	./compare-all.sh

.PHONY: compare-image
# compare the already-gathered reports for a single image
compare-image: $(SYFT_REPORT) $(INLINE_REPORT)
	./compare.py $(COMPARE_IMAGE)

.PHONY: gather-image
# produce both reports for a single image without comparing
gather-image: $(SYFT_REPORT) $(INLINE_REPORT)

# generate the inline-scan report by fetching and running the hosted scan script
$(INLINE_REPORT):
	echo "Creating $(INLINE_REPORT)..."
	mkdir -p $(INLINE_DIR)
	curl -s https://ci-tools.anchore.io/inline_scan-v0.7.0 | bash -s -- -p -r $(COMPARE_IMAGE)
	mv anchore-reports/* $(INLINE_DIR)/
	rmdir anchore-reports

# generate the syft JSON report
$(SYFT_REPORT):
	echo "Creating $(SYFT_REPORT)..."
	mkdir -p $(SYFT_DIR)
	$(SYFT_CMD) $(COMPARE_IMAGE) -o json > $(SYFT_REPORT)

.PHONY: clean
# remove all reports (inline + syft)
clean: clean-syft
	rm -f $(INLINE_DIR)/*

.PHONY: clean-syft
# remove only the syft reports (inline reports are expensive to regenerate)
clean-syft:
	rm -f $(SYFT_DIR)/*
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Gather syft + inline-scan reports for a fixed set of test images, then
# run the comparison (quality gate) for each one. Fails on the first error.
set -eu

images=("debian:10.5" "centos:8.2.2004" "rails:5.0.1" "alpine:3.12.0" "anchore/test_images:java" "anchore/test_images:py38" "anchore/anchore-engine:v0.8.2" "jenkins/jenkins:2.249.2-lts-jdk11" )

# gather all image analyses
for img in "${images[@]}"; do
    echo "Gathering facts for $img"
    COMPARE_IMAGE=${img} make gather-image
done

# compare all results
for img in "${images[@]}"; do
    echo "Comparing results for $img"
    COMPARE_IMAGE=${img} make compare-image
done
|
|
||||||
@ -1,234 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import difflib
|
|
||||||
import collections
|
|
||||||
|
|
||||||
import utils.package
|
|
||||||
from utils.format import Colors, print_rows
|
|
||||||
from utils.inline import InlineScan
|
|
||||||
from utils.syft import Syft
|
|
||||||
|
|
||||||
# fraction of packages/metadata that must match for an image to pass the gate
DEFAULT_QUALITY_GATE_THRESHOLD = 0.95

# indentation prefix for report rows
# NOTE(review): reconstructed from a whitespace-collapsing render — confirm width
INDENT = "    "

# per-image lower thresholds (fallback to the default for unknown images)
PACKAGE_QUALITY_GATE = collections.defaultdict(lambda: DEFAULT_QUALITY_GATE_THRESHOLD, **{})
METADATA_QUALITY_GATE = collections.defaultdict(lambda: DEFAULT_QUALITY_GATE_THRESHOLD, **{
    # syft is better at detecting package versions in specific cases, leading to a drop in matching metadata
    "anchore/test_images:java": 0.61,
    "jenkins/jenkins:2.249.2-lts-jdk11": 0.85,
})

# We additionally fail if an image is above a particular threshold. Why? We expect the lower threshold to be 90%,
# however additional functionality in grype is still being implemented, so this threshold may not be able to be met.
# In these cases the IMAGE_QUALITY_GATE is set to a lower value to allow the test to pass for known issues. Once these
# issues/enhancements are done we want to ensure that the lower threshold is bumped up to catch regression. The only way
# to do this is to select an upper threshold for images with known threshold values, so we have a failure that
# loudly indicates the lower threshold should be bumped.
PACKAGE_UPPER_THRESHOLD = collections.defaultdict(lambda: 1, **{})
METADATA_UPPER_THRESHOLD = collections.defaultdict(lambda: 1, **{
    # syft is better at detecting package versions in specific cases, leading to a drop in matching metadata
    "anchore/test_images:java": 0.65,
    "jenkins/jenkins:2.249.2-lts-jdk11": 0.9,
})
|
|
||||||
|
|
||||||
|
|
||||||
def report(image, analysis):
    """Print a human-readable comparison of syft vs inline-scan results for an image.

    Shows extra/missing packages, metadata diffs, probable fuzzy pairings, and a
    summary with match percentages. Purely informational — the quality gate is
    enforced separately by enforce_quality_gate().

    NOTE(review): reconstructed from a whitespace-collapsing render; the leading
    spacing inside the summary strings may differ from the original — confirm
    output alignment.
    """
    if analysis.extra_packages:
        print(
            Colors.bold + "Syft found extra packages:",
            Colors.reset,
            "Syft discovered packages that Inline did not",
        )
        print_rows([[INDENT, repr(pkg)] for pkg in sorted(analysis.extra_packages)])
        print()

    if analysis.missing_packages:
        print(
            Colors.bold + "Syft missed packages:",
            Colors.reset,
            "Inline discovered packages that Syft did not",
        )
        print_rows([[INDENT, repr(pkg)] for pkg in sorted(analysis.missing_packages)])
        print()

    if analysis.missing_metadata:
        print(
            Colors.bold + "Syft mismatched metadata:",
            Colors.reset,
            "the packages between Syft and Inline are the same, the metadata is not",
        )
        displayed_any = False
        for pkg, metadata in sorted(analysis.missing_metadata):
            # a metadata diff is only meaningful for packages found by BOTH tools
            if pkg not in analysis.syft_data.metadata[pkg.type]:
                continue
            displayed_any = True
            syft_metadata_item = analysis.syft_data.metadata[pkg.type][pkg]

            diffs = difflib.ndiff([repr(syft_metadata_item)], [repr(metadata)])

            print(INDENT + "for: " + repr(pkg), "(top is syft, bottom is inline)")
            print(INDENT + INDENT + ("\n" + INDENT + INDENT).join(list(diffs)))

        # FIX(review): the original guarded this note with
        # "if not analysis.missing_metadata", which can never be true within this
        # branch; gate on whether any diff was actually displayed, which matches
        # the message's intent.
        if not displayed_any:
            print(
                INDENT,
                "There are mismatches, but only due to packages Syft did not find (but inline did).\n",
            )

    if analysis.similar_missing_packages:
        print(
            Colors.bold + "Probably pairings of missing/extra packages:",
            Colors.reset,
            "to aid in troubleshooting missed/extra packages",
        )
        rows = [
            [INDENT, repr(similar.pkg), "--->", repr(similar.missed)]
            for similar in analysis.similar_missing_packages
        ]
        print_rows(rows)
        print()

    # only worth showing when some (but not all) missing packages were fuzzily paired
    show_probable_mismatches = (
        analysis.unmatched_missing_packages
        and analysis.extra_packages
        and len(analysis.unmatched_missing_packages) != len(analysis.missing_packages)
    )

    if show_probable_mismatches:
        print(
            Colors.bold + "Probably missed packages:",
            Colors.reset,
            "a probable pair was not found",
        )
        print_rows([[INDENT, repr(p)] for p in analysis.unmatched_missing_packages])
        print()

    print(Colors.bold + "Summary:", Colors.reset, image)
    print(" Inline Packages : %d" % len(analysis.inline_data.packages))
    print(" Syft Packages : %d" % len(analysis.syft_data.packages))
    print(
        " (extra) : %d (note: this is ignored by the quality gate!)"
        % len(analysis.extra_packages)
    )
    print(" (missing) : %d" % len(analysis.missing_packages))
    print()

    if show_probable_mismatches:
        print(
            " Probable Package Matches : %d (matches not made, but were probably found by both Inline and Syft)"
            % len(analysis.similar_missing_packages)
        )
        print(
            " Probable Packages Matched : %2.3f %% (%d/%d packages)"
            % (
                analysis.percent_probable_overlapping_packages,
                len(analysis.overlapping_packages)
                + len(analysis.similar_missing_packages),
                len(analysis.inline_data.packages),
            )
        )
        print(
            " Probable Packages Missing : %d "
            % len(analysis.unmatched_missing_packages)
        )
        print()
    print(
        " Baseline Packages Matched : %2.3f %% (%d/%d packages)"
        % (
            analysis.percent_overlapping_packages,
            len(analysis.overlapping_packages),
            len(analysis.inline_data.packages),
        )
    )
    print(
        " Baseline Metadata Matched : %2.3f %% (%d/%d metadata)"
        % (
            analysis.percent_overlapping_metadata,
            len(analysis.overlapping_metadata),
            len(analysis.inline_metadata),
        )
    )
|
|
||||||
|
|
||||||
|
|
||||||
def enforce_quality_gate(title, actual_value, lower_gate_value, upper_gate_value):
    """Evaluate one quality gate and print a colored pass/fail line.

    Args:
        title: gate name shown in the output (e.g. "Package", "Metadata").
        actual_value: measured match percentage.
        lower_gate_value: minimum acceptable percentage.
        upper_gate_value: percentage above which the gate also fails, signaling
            that an artificially-lowered lower threshold should be raised.

    Returns:
        True only when lower_gate_value <= actual_value <= upper_gate_value.
    """
    if actual_value < lower_gate_value:
        print(
            Colors.bold
            + " %s Quality Gate:\t" % title
            + Colors.FG.red
            + "FAIL (is not >= %d %%)" % lower_gate_value,
            Colors.reset,
        )
        return False

    if actual_value > upper_gate_value:
        print(
            Colors.bold
            + " %s Quality Gate:\t" % title
            + Colors.FG.orange
            + "FAIL (lower threshold is artificially low and should be updated)",
            Colors.reset,
        )
        return False

    print(
        Colors.bold
        + " %s Quality Gate:\t" % title
        + Colors.FG.green
        + "Pass (>= %d %%)" % lower_gate_value,
        Colors.reset,
    )

    return True
|
|
||||||
|
|
||||||
def main(image):
    """Compare syft and inline-scan reports for one image and apply the quality gates.

    Reads previously-generated reports from ./inline-reports and ./syft-reports
    (relative to this file), prints the comparison report, and returns a process
    exit code: 0 when both gates pass, 1 otherwise.
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    # parse the inline-scan and syft reports on disk
    inline = InlineScan(image=image, report_dir=os.path.join(cwd, "inline-reports"))
    syft = Syft(image=image, report_dir=os.path.join(cwd, "syft-reports"))

    # analyze the raw data to generate all derivative data for the report and quality gate
    analysis = utils.package.Analysis(
        syft_data=syft.packages(), inline_data=inline.packages()
    )

    # show some useful report data for debugging / warm fuzzies
    report(image, analysis)

    # enforce a quality gate based on the comparison of package values and metadata values
    success = True
    success &= enforce_quality_gate(
        title="Package",
        actual_value=analysis.percent_overlapping_packages,
        lower_gate_value=PACKAGE_QUALITY_GATE[image] * 100,
        upper_gate_value=PACKAGE_UPPER_THRESHOLD[image] * 100,
    )
    success &= enforce_quality_gate(
        title="Metadata",
        actual_value=analysis.percent_overlapping_metadata,
        lower_gate_value=METADATA_QUALITY_GATE[image] * 100,
        upper_gate_value=METADATA_UPPER_THRESHOLD[image] * 100,
    )

    if not success:
        return 1
    return 0
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # require exactly one argument: the image reference to compare
    if len(sys.argv) != 2:
        sys.exit("provide an image")

    rc = main(sys.argv[1])
    sys.exit(rc)
|
|
||||||
@ -1,46 +0,0 @@
|
|||||||
class Colors:
    """ANSI escape sequences for terminal text styling.

    Attribute access only (e.g. Colors.bold, Colors.FG.red); always terminate a
    styled span with Colors.reset.
    """
    reset = "\033[0m"
    bold = "\033[01m"
    disable = "\033[02m"
    underline = "\033[04m"
    reverse = "\033[07m"
    strikethrough = "\033[09m"
    invisible = "\033[08m"

    class FG:
        # foreground (text) colors
        black = "\033[30m"
        red = "\033[31m"
        green = "\033[32m"
        orange = "\033[33m"
        blue = "\033[34m"
        purple = "\033[35m"
        cyan = "\033[36m"
        lightgrey = "\033[37m"
        darkgrey = "\033[90m"
        lightred = "\033[91m"
        lightgreen = "\033[92m"
        yellow = "\033[93m"
        lightblue = "\033[94m"
        pink = "\033[95m"
        lightcyan = "\033[96m"

    class BG:
        # background colors
        black = "\033[40m"
        red = "\033[41m"
        green = "\033[42m"
        orange = "\033[43m"
        blue = "\033[44m"
        purple = "\033[45m"
        cyan = "\033[46m"
        lightgrey = "\033[47m"
|
|
||||||
|
|
||||||
|
|
||||||
def print_rows(rows):
    """Print a list of rows (lists of strings) as left-justified columns.

    Column widths are derived from the widest cell in each column plus two
    spaces of padding. Assumes every row has at least as many cells as the
    first row. Does nothing for an empty list.
    """
    if not rows:
        return
    # compute one width per column based on the widest cell (+2 for padding)
    widths = []
    for col, _ in enumerate(rows[0]):
        width = max(len(row[col]) for row in rows) + 2  # padding
        widths.append(width)
    for row in rows:
        print("".join(word.ljust(widths[col_idx]) for col_idx, word in enumerate(row)))
|
|
||||||
@ -1,5 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
def clean(image: str) -> str:
    """Return a filesystem-friendly name for an image reference.

    Replaces ":" with "_" and strips any registry/repository path prefix,
    e.g. "anchore/test_images:java" -> "test_images_java".
    """
    return os.path.basename(image.replace(":", "_"))
|
|
||||||
@ -1,142 +0,0 @@
|
|||||||
import os
|
|
||||||
import re
|
|
||||||
import json
|
|
||||||
import collections
|
|
||||||
|
|
||||||
import utils.package
|
|
||||||
import utils.image
|
|
||||||
|
|
||||||
|
|
||||||
class InlineScan:
    """
    Class for parsing inline-scan output files into a set of packages and package metadata.
    """

    # report files are named "<clean-image-name>-<report>.json"
    report_tmpl = "{image}-{report}.json"

    def __init__(self, image, report_dir):
        self.report_dir = report_dir
        self.image = image

    def packages(self):
        """Parse every per-ecosystem report and merge into a single Info tuple."""
        python_packages, python_metadata = self._python_packages()
        gem_packages, gem_metadata = self._gem_packages()
        java_packages, java_metadata = self._java_packages()
        npm_packages, npm_metadata = self._npm_packages()
        os_packages, os_metadata = self._os_packages()

        packages = (
            python_packages | os_packages | gem_packages | java_packages | npm_packages
        )
        # merged by package-type key; each ecosystem yields distinct type keys,
        # so later dicts are not expected to clobber earlier ones
        metadata = {
            **python_metadata,
            **os_metadata,
            **gem_metadata,
            **java_metadata,
            **npm_metadata,
        }

        return utils.package.Info(packages=frozenset(packages), metadata=metadata)

    def _report_path(self, report):
        """Absolute path to the given report file for this image."""
        return os.path.join(
            self.report_dir,
            self.report_tmpl.format(image=utils.image.clean(self.image), report=report),
        )

    def _enumerate_section(self, report, section):
        """Yield each entry of the given section in the given report file."""
        report_path = self._report_path(report=report)
        os_report_path = self._report_path(report="content-os")

        if os.path.exists(os_report_path) and not os.path.exists(report_path):
            # if the OS report is there but the target report is not, that is engine's way of saying "no findings"
            return

        with open(report_path) as json_file:
            data = json.load(json_file)
            for entry in data[section]:
                yield entry

    def _content_packages(self, report):
        """Shared parser for reports whose entries carry only package/type/version.

        (refactor: _python_packages/_gem_packages/_npm_packages/_os_packages were
        four identical copies of this loop.)
        """
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(report=report, section="content"):
            pkg = utils.package.Package(
                name=entry["package"],
                type=entry["type"].lower(),
            )
            packages.add(pkg)
            metadata[pkg.type][pkg] = utils.package.Metadata(version=entry["version"])

        return packages, metadata

    def _java_packages(self):
        """Parse the java content report (jars/wars/ears and jenkins plugins)."""
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(report="content-java", section="content"):
            # normalize to pseudo-inline
            pkg_type = entry["type"].lower()
            if pkg_type in ("java-jar", "java-war", "java-ear"):
                pkg_type = "java-?ar"
            elif pkg_type in ("java-jpi", "java-hpi"):
                pkg_type = "java-?pi"

            # this would usually be "package" but this would not be able to account for duplicate dependencies in
            # nested jars of the same name. Fallback to the package name if there is no given location
            name = entry["location"]

            # replace fields with "N/A" with None
            for k, v in dict(entry).items():
                if v in ("", "N/A"):
                    entry[k] = None

            pkg = utils.package.Package(
                name=name,
                type=pkg_type,
            )
            packages.add(pkg)

            metadata[pkg.type][pkg] = utils.package.Metadata(
                version=entry["maven-version"],
            )

        return packages, metadata

    def _npm_packages(self):
        return self._content_packages("content-npm")

    def _python_packages(self):
        return self._content_packages("content-python")

    def _gem_packages(self):
        return self._content_packages("content-gem")

    def _os_packages(self):
        return self._content_packages("content-os")
|
|
||||||
@ -1,146 +0,0 @@
|
|||||||
import difflib
|
|
||||||
import collections
|
|
||||||
import dataclasses
|
|
||||||
from typing import Set, FrozenSet, Tuple, Any, List
|
|
||||||
|
|
||||||
# version is the only metadata field compared between the two tools
Metadata = collections.namedtuple("Metadata", "version")
# a package is identified by its (normalized) name and ecosystem type
Package = collections.namedtuple("Package", "name type")
# packages: frozenset of Package; metadata: {type: {Package: Metadata}}
Info = collections.namedtuple("Info", "packages metadata")

# fuzzy pairing of one syft-only package with inline-only packages it probably matches
SimilarPackages = collections.namedtuple("SimilarPackages", "pkg missed")
# a candidate pairing along with its name-similarity ratio
ProbableMatch = collections.namedtuple("ProbableMatch", "pkg ratio")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass()
class Analysis:
    """
    A package metadata analysis class. When given the raw syft and inline data, all necessary derivative information
    needed to do a comparison of package and metadata is performed, allowing callers to interpret the results.
    """

    # all raw data from the inline scan and syft reports
    syft_data: Info
    inline_data: Info

    # all derivative information (derived from the raw data above)
    overlapping_packages: FrozenSet[Package] = dataclasses.field(init=False)
    extra_packages: FrozenSet[Package] = dataclasses.field(init=False)
    missing_packages: FrozenSet[Package] = dataclasses.field(init=False)

    inline_metadata: Set[Tuple[Any, Any]] = dataclasses.field(init=False)
    missing_metadata: Set[Tuple[Any, Any]] = dataclasses.field(init=False)
    overlapping_metadata: Set[Tuple[Any, Any]] = dataclasses.field(init=False)

    similar_missing_packages: List[Package] = dataclasses.field(init=False)
    unmatched_missing_packages: List[Package] = dataclasses.field(init=False)

    def __post_init__(self):
        """Derive all comparison sets from the raw syft/inline data."""
        if not self.valid():
            raise RuntimeError("invalid data given")

        # basic sets derived from package information
        self.overlapping_packages = self.syft_data.packages & self.inline_data.packages
        self.extra_packages = self.syft_data.packages - self.inline_data.packages
        self.missing_packages = self.inline_data.packages - self.syft_data.packages

        # basic sets derived from metadata information
        self.inline_metadata = self._inline_metadata()
        self.overlapping_metadata = self._overlapping_metadata()
        self.missing_metadata = self.inline_metadata - self.overlapping_metadata

        # try to account for potential false negatives by pairing extra packages discovered only by syft with missing
        # packages discovered only by inline scan.
        (
            similar_missing_packages,
            unmatched_missing_packages,
        ) = self._pair_similar_packages(self.extra_packages, self.missing_packages)
        self.similar_missing_packages = similar_missing_packages
        self.unmatched_missing_packages = unmatched_missing_packages

    def valid(self) -> bool:
        """True when the inline data contains at least one package."""
        # we are purposefully selecting test images that are guaranteed to have packages (this should never happen).
        # ... if it does, then this analysis is not valid!
        return bool(self.inline_data.packages)

    def _inline_metadata(self):
        """
        Returns the set of inline scan metadata paired with the corresponding package info.
        """
        inline_metadata_set = set()
        for package in self.inline_data.packages:
            metadata = self.inline_data.metadata[package.type][package]
            inline_metadata_set.add((package, metadata))
        return inline_metadata_set

    def _overlapping_metadata(self):
        """
        Returns the metadata which has been found similar between both syft and inline scan.
        """
        syft_overlap_metadata_set = set()
        for package in self.syft_data.packages:
            metadata = self.syft_data.metadata[package.type][package]
            # we only want to really count mismatched metadata for packages that are at least found by inline
            if package in self.inline_data.metadata.get(package.type, []):
                syft_overlap_metadata_set.add((package, metadata))

        return syft_overlap_metadata_set & self.inline_metadata

    @staticmethod
    def _pair_similar_packages(extra_packages, missing_packages, similar_threshold=0.7):
        """
        Try to account for potential false negatives by pairing extra packages discovered only by syft with missing
        packages discovered only by inline scan.

        Returns a tuple (similar, unmatched): SimilarPackages pairings sorted by
        package, and the missing packages that could not be paired, sorted by name.
        """
        matches = collections.defaultdict(set)
        # missing (inline-only) package -> (best ratio so far, extra package it is paired with)
        found = {}
        for extra in extra_packages:
            for missing in missing_packages:
                ratio = difflib.SequenceMatcher(None, extra.name, missing.name).ratio()
                if ratio < similar_threshold:
                    continue
                if missing in found:
                    best_ratio, paired_with = found[missing]
                    # only allow for an inline package to be paired once
                    if ratio < best_ratio:
                        continue
                    # FIX(review): the original discarded from the CURRENT extra
                    # package's set (matches[s].discard(i) — a no-op) instead of
                    # the previously-paired package's set, so a missing package
                    # could remain paired to multiple extras.
                    matches[paired_with].discard(missing)

                # persist the (best) result
                found[missing] = (ratio, extra)
                matches[extra].add(missing)

        results = []
        for extra, missing_set in matches.items():
            if not missing_set:
                # every pairing for this extra package was superseded by a better one
                continue
            missed = tuple(ProbableMatch(pkg=m, ratio=found[m][0]) for m in missing_set)
            results.append(SimilarPackages(pkg=extra, missed=missed))

        not_found = [m for m in missing_packages if m not in found]

        return sorted(results, key=lambda x: x.pkg), sorted(
            not_found, key=lambda x: x.name
        )

    @property
    def percent_overlapping_packages(self):
        """Returns a percentage representing how many packages that were found relative to the number of expected"""
        return (
            float(len(self.overlapping_packages))
            / float(len(self.inline_data.packages))
        ) * 100.0

    @property
    def percent_overlapping_metadata(self):
        """Returns a percentage representing how many matching metadata that were found relative to the number of expected"""
        return (
            float(len(self.overlapping_metadata)) / float(len(self.inline_metadata))
        ) * 100.0

    @property
    def percent_probable_overlapping_packages(self):
        """
        Returns a percentage representing how many packages that were found relative to the number of expected after
        considering pairing of missing packages with extra packages in a fuzzy match.
        """
        return (
            float(len(self.overlapping_packages) + len(self.similar_missing_packages))
            / float(len(self.inline_data.packages))
        ) * 100.0
|
|
||||||
@ -1,68 +0,0 @@
|
|||||||
import os
|
|
||||||
import json
|
|
||||||
import collections
|
|
||||||
|
|
||||||
import utils.package
|
|
||||||
import utils.image
|
|
||||||
|
|
||||||
|
|
||||||
class Syft:
    """
    Class for parsing syft output into a set of packages and package metadata.
    """

    # syft report files are named "<clean-image-name>.json"
    report_tmpl = "{image}.json"

    def __init__(self, image, report_dir):
        self.report_path = os.path.join(
            report_dir, self.report_tmpl.format(image=utils.image.clean(image))
        )

    def _enumerate_section(self, section):
        """Yield each entry in the given top-level section of the syft JSON report."""
        with open(self.report_path) as json_file:
            data = json.load(json_file)
            for entry in data[section]:
                yield entry

    def packages(self):
        """Parse all artifacts, normalizing types/names/versions to inline-scan conventions."""
        packages = set()
        metadata = collections.defaultdict(dict)
        for entry in self._enumerate_section(section="artifacts"):

            # normalize to inline
            pkg_type = entry["type"].lower()
            if pkg_type in ("wheel", "egg", "python"):
                pkg_type = "python"
            elif pkg_type in ("deb",):
                pkg_type = "dpkg"
            elif pkg_type in ("java-archive",):
                # normalize to pseudo-inline
                pkg_type = "java-?ar"
            elif pkg_type in ("jenkins-plugin",):
                # normalize to pseudo-inline
                pkg_type = "java-?pi"
            elif pkg_type in ("apk",):
                pkg_type = "apkg"

            name = entry["name"]
            version = entry["version"]

            if "java" in pkg_type:
                # we need to use the virtual path instead of the name to account for nested dependencies with the same
                # package name (but potentially different metadata)
                name = entry.get("metadata", {}).get("virtualPath")

            elif pkg_type == "apkg":
                # inline scan strips off the release from the version, which should be normalized here
                # NOTE(review): a version without "-" yields an empty string here —
                # presumably apk versions always carry a release suffix; confirm
                fields = entry["version"].split("-")
                version = "-".join(fields[:-1])

            pkg = utils.package.Package(
                name=name,
                type=pkg_type,
            )

            packages.add(pkg)

            metadata[pkg.type][pkg] = utils.package.Metadata(version=version)

        return utils.package.Info(packages=frozenset(packages), metadata=metadata)
|
|
||||||
Loading…
x
Reference in New Issue
Block a user