feat: 1944 - update purl generation to use a consistent groupID (#2033)

Separate the logic for CPE and PURL generation. 

PURL generation needs a single answer for groupID based on a priority of discovering the field. 
CPE generation still uses multiple potential groupID to populate the candidate cpe.

Improve GroupID detection. 

Currently syft does not use any hierarchy for GroupID detection and treats all sources as equal. 
It treats fields from the manifest file with priority. This change adds a hierarchy to the fields and returns a single answer based on that hierarchy.
---------
Signed-off-by: Christopher Phillips <christopher.phillips@anchore.com>
Signed-off-by: Keith Zantow <kzantow@gmail.com>
Co-authored-by: Keith Zantow <kzantow@gmail.com>
This commit is contained in:
Christopher Angelo Phillips 2023-08-22 10:47:07 -04:00 committed by GitHub
parent cf37b17869
commit ee121cff21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 247 additions and 77 deletions

View File

@ -21,16 +21,16 @@ var (
"be",
}
primaryJavaManifestGroupIDFields = []string{
PrimaryJavaManifestGroupIDFields = []string{
"Bundle-SymbolicName",
"Extension-Name",
"Specification-Vendor",
"Implementation-Vendor",
"Bundle-SymbolicName",
"Implementation-Vendor-Id",
"Implementation-Title",
"Bundle-Activator",
}
secondaryJavaManifestGroupIDFields = []string{
SecondaryJavaManifestGroupIDFields = []string{
"Automatic-Module-Name",
"Main-Class",
"Package",
@ -168,7 +168,7 @@ func artifactIDFromJavaPackage(p pkg.Package) string {
}
artifactID := strings.TrimSpace(metadata.PomProperties.ArtifactID)
if startsWithTopLevelDomain(artifactID) && len(strings.Split(artifactID, ".")) > 1 {
if looksLikeGroupID(artifactID) && len(strings.Split(artifactID, ".")) > 1 {
// there is a strong indication that the artifact ID is really a group ID, don't use it
return ""
}
@ -184,6 +184,9 @@ func GroupIDsFromJavaPackage(p pkg.Package) (groupIDs []string) {
return GroupIDsFromJavaMetadata(p.Name, metadata)
}
// GroupIDsFromJavaMetadata returns the possible group IDs for a Java package
// This function is similar to GroupIDFromJavaPackage, but returns all possible group IDs and is less strict
// It is used as a way to generate possible candidates for CPE matching.
func GroupIDsFromJavaMetadata(pkgName string, metadata pkg.JavaMetadata) (groupIDs []string) {
groupIDs = append(groupIDs, groupIDsFromPomProperties(metadata.PomProperties)...)
groupIDs = append(groupIDs, groupIDsFromPomProject(metadata.PomProject)...)
@ -242,7 +245,7 @@ func addGroupIDsFromGroupIDsAndArtifactID(groupID, artifactID string) (groupIDs
}
func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []string {
if groupID, ok := defaultArtifactIDToGroupID[pkgName]; ok {
if groupID, ok := DefaultArtifactIDToGroupID[pkgName]; ok {
return []string{groupID}
}
@ -251,7 +254,7 @@ func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []stri
}
// try the common manifest fields first for a set of candidates
groupIDs := getManifestFieldGroupIDs(manifest, primaryJavaManifestGroupIDFields)
groupIDs := GetManifestFieldGroupIDs(manifest, PrimaryJavaManifestGroupIDFields)
if len(groupIDs) != 0 {
return groupIDs
@ -262,10 +265,10 @@ func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []stri
// for more info see pkg:maven/commons-io/commons-io@2.8.0 within cloudbees/cloudbees-core-mm:2.263.4.2
// at /usr/share/jenkins/jenkins.war:WEB-INF/plugins/analysis-model-api.hpi:WEB-INF/lib/commons-io-2.8.0.jar
// as well as the ant package from cloudbees/cloudbees-core-mm:2.277.2.4-ra.
return getManifestFieldGroupIDs(manifest, secondaryJavaManifestGroupIDFields)
return GetManifestFieldGroupIDs(manifest, SecondaryJavaManifestGroupIDFields)
}
func getManifestFieldGroupIDs(manifest *pkg.JavaManifest, fields []string) (groupIDs []string) {
func GetManifestFieldGroupIDs(manifest *pkg.JavaManifest, fields []string) (groupIDs []string) {
if manifest == nil {
return nil
}
@ -302,3 +305,7 @@ func removeOSCIDirectives(groupID string) string {
func startsWithTopLevelDomain(value string) bool {
return internal.HasAnyOfPrefixes(value, domains...)
}
func looksLikeGroupID(value string) bool {
return strings.Contains(value, ".")
}

View File

@ -1,6 +1,6 @@
package cpe
var defaultArtifactIDToGroupID = map[string]string{
var DefaultArtifactIDToGroupID = map[string]string{
"ant": "org.apache.ant",
"ant-antlr": "org.apache.ant",
"ant-antunit": "org.apache.ant",

View File

@ -62,11 +62,11 @@ func parseJavaArchive(_ file.Resolver, _ *generic.Environment, reader file.Locat
}
// uniquePkgKey creates a unique string to identify the given package.
func uniquePkgKey(p *pkg.Package) string {
func uniquePkgKey(groupID string, p *pkg.Package) string {
if p == nil {
return ""
}
return fmt.Sprintf("%s|%s", p.Name, p.Version)
return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version)
}
// newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover
@ -371,13 +371,27 @@ func pomProjectByParentPath(archivePath string, location file.Location, extractP
return projectByParentPath, nil
}
// packagesFromPomProperties processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
// newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
// associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not.
func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.PomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package {
// keep the artifact name within the virtual path if this package does not match the parent package
vPathSuffix := ""
if !strings.HasPrefix(pomProperties.ArtifactID, parentPkg.Name) {
vPathSuffix += ":" + pomProperties.ArtifactID
groupID := ""
if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata); ok {
groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
}
parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version)
// Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package
// See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical
// packages and create equal virtual paths so they are de duped in the future
pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)
if parentKey != pomProjectKey {
// build a new virtual path suffix for the package that is different from the parent package
// we want to use the GroupID and ArtifactID here to preserve uniqueness
// Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core")
// https://github.com/anchore/syft/issues/1944
vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID
}
virtualPath := location.AccessPath() + vPathSuffix
@ -408,21 +422,26 @@ func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.Po
}
func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
// the name/version pair matches...
if uniquePkgKey(&p) == uniquePkgKey(parentPkg) {
return true
}
metadata, ok := p.Metadata.(pkg.JavaMetadata)
if !ok {
log.WithFields("package", p.String()).Warn("unable to extract java metadata to check for matching package identity")
return false
parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaMetadata)
if !ok || !parentOk {
switch {
case !ok:
log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name)
case !parentOk:
log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name)
}
// if we can't extract metadata, we can check for matching identities via the package name
// this is not ideal, but it's better than nothing - this should not be used if we have Metadata
return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg)
}
parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata)
if !ok {
log.WithFields("package", p.String()).Warn("unable to extract java metadata from parent for verifying virtual path")
return false
// try to determine identity with the metadata
groupID := groupIDFromJavaMetadata(p.Name, metadata)
parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) {
return true
}
// the virtual path matches...
@ -434,10 +453,14 @@ func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
// note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good
// example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but
// they do not indicate the same package.
// NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name
// from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core")
// we will use this check as a last resort
if metadata.PomProperties != nil {
if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
return true
}
}
return false
}

View File

@ -174,8 +174,8 @@ func TestParseJar(t *testing.T) {
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
// ensure that nested packages with different names than that of the parent are appended as
// a suffix on the virtual path
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-gradle-0.1.0.jar:joda-time",
// a suffix on the virtual path with a colon separator between group name and artifact name
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-gradle-0.1.0.jar:joda-time:joda-time",
PomProperties: &pkg.PomProperties{
Path: "META-INF/maven/joda-time/joda-time/pom.properties",
GroupID: "joda-time",
@ -240,7 +240,7 @@ func TestParseJar(t *testing.T) {
Metadata: pkg.JavaMetadata{
// ensure that nested packages with different names than that of the parent are appended as
// a suffix on the virtual path
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.jar:joda-time",
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.jar:joda-time:joda-time",
PomProperties: &pkg.PomProperties{
Path: "META-INF/maven/joda-time/joda-time/pom.properties",
GroupID: "joda-time",
@ -659,7 +659,7 @@ func Test_newPackageFromMavenData(t *testing.T) {
Type: pkg.JavaPkg,
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath + ":" + "some-artifact-id",
VirtualPath: virtualPath + ":" + "some-group-id" + ":" + "some-artifact-id",
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
@ -728,7 +728,7 @@ func Test_newPackageFromMavenData(t *testing.T) {
Type: pkg.JavaPkg,
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath + ":" + "some-artifact-id",
VirtualPath: virtualPath + ":" + "some-group-id" + ":" + "some-artifact-id",
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
@ -797,7 +797,7 @@ func Test_newPackageFromMavenData(t *testing.T) {
Type: pkg.JenkinsPluginPkg,
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath + ":" + "some-artifact-id",
VirtualPath: virtualPath + ":" + "com.cloudbees.jenkins.plugins" + ":" + "some-artifact-id",
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "com.cloudbees.jenkins.plugins",
@ -894,44 +894,6 @@ func Test_newPackageFromMavenData(t *testing.T) {
},
expectedPackage: nil,
},
{
name: "child matches parent by virtual path -- override name and version",
props: pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
ArtifactID: "some-parent-name", // note: DOES NOT match parent package
Version: "3.0", // note: DOES NOT match parent package
},
parent: &pkg.Package{
Name: "", // note: empty, so should not be matched on
Version: "", // note: empty, so should not be matched on
Type: pkg.JavaPkg,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath, // note: matching virtual path
Manifest: nil,
PomProperties: nil,
Parent: nil,
},
},
expectedParent: pkg.Package{
Name: "some-parent-name",
Version: "3.0",
Type: pkg.JavaPkg,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath,
Manifest: nil,
// note: we attach the discovered pom properties data
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
ArtifactID: "some-parent-name",
Version: "3.0",
},
Parent: nil,
},
},
expectedPackage: nil,
},
{
name: "child matches parent by artifact id",
props: pkg.PomProperties{

View File

@ -1,6 +1,8 @@
package java
import (
"strings"
"github.com/anchore/packageurl-go"
"github.com/anchore/syft/syft/pkg"
"github.com/anchore/syft/syft/pkg/cataloger/common/cpe"
@ -9,9 +11,9 @@ import (
// PackageURL returns the PURL for the specific java package (see https://github.com/package-url/purl-spec)
func packageURL(name, version string, metadata pkg.JavaMetadata) string {
var groupID = name
groupIDs := cpe.GroupIDsFromJavaMetadata(name, metadata)
if len(groupIDs) > 0 {
groupID = groupIDs[0]
if gID := groupIDFromJavaMetadata(name, metadata); gID != "" {
groupID = gID
}
pURL := packageurl.NewPackageURL(
@ -23,3 +25,121 @@ func packageURL(name, version string, metadata pkg.JavaMetadata) string {
"")
return pURL.ToString()
}
// GroupIDFromJavaPackage returns the authoritative group ID for a Java package.
// The order of precedence is:
// 1. The group ID from the POM properties
// 2. The group ID from the POM project
// 3. The group ID from a select map of known group IDs
// 4. The group ID from the Java manifest
func groupIDFromJavaMetadata(pkgName string, metadata pkg.JavaMetadata) (groupID string) {
if groupID = groupIDFromPomProperties(metadata.PomProperties); groupID != "" {
return groupID
}
if groupID = groupIDFromPomProject(metadata.PomProject); groupID != "" {
return groupID
}
if groupID = groupIDFromKnownPackageList(pkgName); groupID != "" {
return groupID
}
if groupID = groupIDFromJavaManifest(metadata.Manifest); groupID != "" {
return groupID
}
return groupID
}
func groupIDFromKnownPackageList(pkgName string) (groupID string) {
if groupID, ok := cpe.DefaultArtifactIDToGroupID[pkgName]; ok {
return groupID
}
return groupID
}
func groupIDFromJavaManifest(manifest *pkg.JavaManifest) (groupID string) {
if manifest == nil {
return groupID
}
groupIDS := cpe.GetManifestFieldGroupIDs(manifest, cpe.PrimaryJavaManifestGroupIDFields)
// assumes that primaryJavaManifestNameFields are ordered by priority
if len(groupIDS) != 0 {
return groupIDS[0]
}
groupIDS = cpe.GetManifestFieldGroupIDs(manifest, cpe.SecondaryJavaManifestGroupIDFields)
if len(groupIDS) != 0 {
return groupIDS[0]
}
return groupID
}
func groupIDFromPomProperties(properties *pkg.PomProperties) (groupID string) {
if properties == nil {
return groupID
}
if looksLikeGroupID(properties.GroupID) {
return cleanGroupID(properties.GroupID)
}
// sometimes the publisher puts the group ID in the artifact ID field unintentionally
if looksLikeGroupID(properties.ArtifactID) {
// there is a strong indication that the artifact ID is really a group ID
return cleanGroupID(properties.ArtifactID)
}
return groupID
}
func groupIDFromPomProject(project *pkg.PomProject) (groupID string) {
if project == nil {
return groupID
}
// check the project details
if looksLikeGroupID(project.GroupID) {
return cleanGroupID(project.GroupID)
}
// sometimes the publisher puts the group ID in the artifact ID field unintentionally
if looksLikeGroupID(project.ArtifactID) {
// there is a strong indication that the artifact ID is really a group ID
return cleanGroupID(project.ArtifactID)
}
// let's check the parent details
// if the current project does not have a group ID, but the parent does, we'll use the parent's group ID
if project.Parent != nil {
if looksLikeGroupID(project.Parent.GroupID) {
return cleanGroupID(project.Parent.GroupID)
}
// sometimes the publisher puts the group ID in the artifact ID field unintentionally
if looksLikeGroupID(project.Parent.ArtifactID) {
// there is a strong indication that the artifact ID is really a group ID
return cleanGroupID(project.Parent.ArtifactID)
}
}
return groupID
}
func looksLikeGroupID(value string) bool {
return strings.Contains(value, ".")
}
func cleanGroupID(groupID string) string {
return strings.TrimSpace(removeOSCIDirectives(groupID))
}
func removeOSCIDirectives(groupID string) string {
// for example:
// org.bar;uses:=“org.foo” -> org.bar
// more about OSGI directives see https://spring.io/blog/2008/10/20/understanding-the-osgi-uses-directive/
return strings.Split(groupID, ";")[0]
}

View File

@ -45,3 +45,59 @@ func Test_packageURL(t *testing.T) {
})
}
}
func Test_groupIDFromJavaMetadata(t *testing.T) {
tests := []struct {
name string
pkgName string
metadata pkg.JavaMetadata
expect string
}{
{
name: "pom properties",
metadata: pkg.JavaMetadata{
PomProperties: &pkg.PomProperties{
GroupID: "org.anchore",
},
},
expect: "org.anchore",
},
{
name: "pom project",
metadata: pkg.JavaMetadata{
PomProject: &pkg.PomProject{
GroupID: "org.anchore",
},
},
expect: "org.anchore",
},
{
name: "known package list",
pkgName: "ant-antlr",
metadata: pkg.JavaMetadata{},
expect: "org.apache.ant",
},
{
name: "java manifest",
metadata: pkg.JavaMetadata{
Manifest: &pkg.JavaManifest{
Main: map[string]string{
"Implementation-Vendor": "org.anchore",
},
},
},
expect: "org.anchore",
},
{
name: "no group id",
metadata: pkg.JavaMetadata{},
expect: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expect, groupIDFromJavaMetadata(tt.pkgName, tt.metadata))
})
}
}

View File

@ -178,8 +178,10 @@ func resolveProperty(pom gopom.Project, property *string) string {
pomValueType = pomValue.Type()
if pomValueType.Kind() == reflect.Ptr {
pomValue = pomValue.Elem()
if !pomValue.IsZero() {
pomValueType = pomValue.Type()
}
}
if partNum == numParts-1 {
return fmt.Sprintf("%v", pomValue.Interface())
}