Fix CPE encode/decode when it contains special chars (#714)

* Fix CPE generation when the generated CPE contains invalid characters Currently syft seems to generate invalid CPEs which do not conform with the official CPE spec. This is because the underlying nvdtools library is not a completely spec compliant implementation and has some interesting bugs/issues. The following are the list of issues I have encountered with nvdtools: 1. It parses strings which are not CPEs incorrectly as valid CPEs. This messes up our filter function which is supposed to filter out any incorrect CPEs we generate. In order to fix this, I have introduced a new regex in the NewCPE function which follows the upstream spec and filters out any incorrect CPEs. 2. Introduce wfn.WFNize for any cpe attributes we infer from packages. This ensures that we are escaping and quoting any special characters before putting them into CPEs. Note that nvdtools has yet another bug in the WFNize function, specifically the "addSlashesAt" part of the function which stops the loop as soon as it encounters ":" a valid character for a WFN attribute after quoting, but the way nvdtools handles it causes it to truncate strings that container ":". As a result strings like "prefix:1.2" which would have been quoted as "prefix\:1.2" end up becoming "prefix" instead causing loss of information and incorrect CPEs being generated. As a result in such cases, we remove out strings containing ":" in any part entirely for now. This is similar to the way we were handling CPE filtering in the past with http urls as vendor strings 3. Add special handling for version which contain ":" due to epochs in debian and rpm. In this case, we strip out the parts before ":" i.e. the epoch and only output the actual function. This ensures we are not discarding valid version strings due to pt #.2. In the future we should look at moving to a more spec compliant cpe parsing library to avoid such shenanigans. Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net> * Remove WFNize for input strings WFNize seems to not be part of the standard as per https://pkg.go.dev/github.com/facebookincubator/nvdtools@v0.1.4/wfn#WFNize and seems to have bugs/issues with encode/decode cycles, so I am just removing it at this point and relying on the CPE regex to filter out invalid CPEs for now. Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net> * Quote the string on decode to ensure consistent CPE string generation Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net> * Add test cases for round-tripping the CPE and fix strip slashes Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net> * Add comprehensive tests for cpe parsing Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net> * Use strings.Builder instead of byte buffer Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>
2025-11-17 16:33:21 +01:00 · 2022-01-06 14:56:53 +00:00 · 2022-01-06 14:56:53 +00:00 · 2a7325a965
commit 2a7325a965
parent d9aa54cd00
11 changed files with 17148 additions and 13 deletions
--- a/internal/formats/common/spdxhelpers/external_refs.go
+++ b/internal/formats/common/spdxhelpers/external_refs.go
@ -12,7 +12,7 @@ func ExternalRefs(p pkg.Package) (externalRefs []model.ExternalRef) {
 	for _, c := range p.CPEs {
 		externalRefs = append(externalRefs, model.ExternalRef{
 			ReferenceCategory: model.SecurityReferenceCategory,
-			ReferenceLocator:  c.BindToFmtString(),
+			ReferenceLocator:  pkg.CPEString(c),
 			ReferenceType:     model.Cpe23ExternalRefType,
 		})
 	}
--- a/internal/formats/common/spdxhelpers/external_refs_test.go
+++ b/internal/formats/common/spdxhelpers/external_refs_test.go
@ -26,7 +26,7 @@ func Test_ExternalRefs(t *testing.T) {
 			expected: []model.ExternalRef{
 				{
 					ReferenceCategory: model.SecurityReferenceCategory,
-					ReferenceLocator:  testCPE.BindToFmtString(),
+					ReferenceLocator:  pkg.CPEString(testCPE),
 					ReferenceType:     model.Cpe23ExternalRefType,
 				},
 				{
--- a/internal/formats/syftjson/to_format_model.go
+++ b/internal/formats/syftjson/to_format_model.go
@ -142,7 +142,7 @@ func toPackageModels(catalog *pkg.Catalog) []model.Package {
 func toPackageModel(p pkg.Package) model.Package {
 	var cpes = make([]string, len(p.CPEs))
 	for i, c := range p.CPEs {
-		cpes[i] = c.BindToFmtString()
+		cpes[i] = pkg.CPEString(c)
 	}

 	var licenses = make([]string, 0)
--- a/syft/pkg/cataloger/common/cpe/filter.go
+++ b/syft/pkg/cataloger/common/cpe/filter.go
@ -34,7 +34,7 @@ cpeLoop:
 }

 func disallowNonParseableCPEs(cpe pkg.CPE, _ pkg.Package) bool {
-	v := cpe.BindToFmtString()
+	v := pkg.CPEString(cpe)
 	_, err := pkg.NewCPE(v)

 	cannotParse := err != nil
--- a/syft/pkg/cataloger/common/cpe/generate.go
+++ b/syft/pkg/cataloger/common/cpe/generate.go
@ -19,7 +19,6 @@ func newCPE(product, vendor, version, targetSW string) wfn.Attributes {
 	cpe.Vendor = vendor
 	cpe.Version = version
 	cpe.TargetSW = targetSW
-
 	return cpe
 }

@ -29,7 +28,6 @@ func newCPE(product, vendor, version, targetSW string) wfn.Attributes {
 func Generate(p pkg.Package) []pkg.CPE {
 	vendors := candidateVendors(p)
 	products := candidateProducts(p)
-
 	if len(products) == 0 {
 		return nil
 	}
@ -44,7 +42,6 @@ func Generate(p pkg.Package) []pkg.CPE {
 				continue
 			}
 			keys.Add(key)
-
 			// add a new entry...
 			cpes = append(cpes, newCPE(product, vendor, p.Version, wfn.Any))
 		}
--- a/syft/pkg/cataloger/common/cpe/generate_test.go
+++ b/syft/pkg/cataloger/common/cpe/generate_test.go
@ -327,6 +327,38 @@ func TestGeneratePackageCPEs(t *testing.T) {
 				"cpe:2.3:a:some_vendor:name:3.2:*:*:*:*:*:*:*",
 			},
 		},
+		{
+			name: "rpm with epoch",
+			p: pkg.Package{
+				Name:         "name",
+				Version:      "1:3.2",
+				FoundBy:      "some-analyzer",
+				Type:         pkg.RpmPkg,
+				MetadataType: pkg.RpmdbMetadataType,
+				Metadata: pkg.RpmdbMetadata{
+					Vendor: "some-vendor",
+				},
+			},
+			expected: []string{
+				"cpe:2.3:a:name:name:1\\:3.2:*:*:*:*:*:*:*",
+				"cpe:2.3:a:some-vendor:name:1\\:3.2:*:*:*:*:*:*:*",
+				"cpe:2.3:a:some_vendor:name:1\\:3.2:*:*:*:*:*:*:*",
+			},
+		},
+		{
+			name: "deb with epoch",
+			p: pkg.Package{
+				Name:         "name",
+				Version:      "1:3.2",
+				FoundBy:      "some-analyzer",
+				Type:         pkg.DebPkg,
+				MetadataType: pkg.DpkgMetadataType,
+				Metadata:     pkg.DpkgMetadata{},
+			},
+			expected: []string{
+				"cpe:2.3:a:name:name:1\\:3.2:*:*:*:*:*:*:*",
+			},
+		},
 		{
 			name: "cloudbees jenkins package identified via groupId",
 			p: pkg.Package{
@ -522,7 +554,7 @@ func TestGeneratePackageCPEs(t *testing.T) {
 			expectedCpeSet := set.NewStringSet(test.expected...)
 			actualCpeSet := set.NewStringSet()
 			for _, a := range actual {
-				actualCpeSet.Add(a.BindToFmtString())
+				actualCpeSet.Add(pkg.CPEString(a))
 			}

 			extra := strset.Difference(actualCpeSet, expectedCpeSet).List()
--- a/syft/pkg/cataloger/common/cpe/sort_by_specificity.go
+++ b/syft/pkg/cataloger/common/cpe/sort_by_specificity.go
@ -29,6 +29,9 @@ func (c BySpecificity) Less(i, j int) bool {
 	}

 	// if score and length are equal then text sort
+	// note that we are not using CPEString from the syft pkg
+	// as we are not encoding/decoding this CPE string so we don't
+	// need the proper quoted version of the CPE.
 	return c[i].BindToFmtString() < c[j].BindToFmtString()
 }

--- a/syft/pkg/cpe.go
+++ b/syft/pkg/cpe.go
@ -2,6 +2,7 @@ package pkg

 import (
 	"fmt"
+	"regexp"
 	"strings"

 	"github.com/facebookincubator/nvdtools/wfn"
@ -9,7 +10,22 @@ import (

 type CPE = wfn.Attributes

+// This regex string is taken from
+// https://csrc.nist.gov/schema/cpe/2.3/cpe-naming_2.3.xsd which has the official cpe spec
+// This first part matches CPE urls and the second part matches binding strings
+const cpeRegexString = ((`^([c][pP][eE]:/[AHOaho]?(:[A-Za-z0-9\._\-~%]*){0,6})`) +
+	// Or match the CPE binding string
+	// Note that we had to replace '`' with '\x60' to escape the backticks
+	`|(cpe:2\.3:[aho\*\-](:(((\?*|\*?)([a-zA-Z0-9\-\._]|(\\[\\\*\?!"#$$%&'\(\)\+,/:;<=>@\[\]\^\x60\{\|}~]))+(\?*|\*?))|[\*\-])){5}(:(([a-zA-Z]{2,3}(-([a-zA-Z]{2}|[0-9]{3}))?)|[\*\-]))(:(((\?*|\*?)([a-zA-Z0-9\-\._]|(\\[\\\*\?!"#$$%&'\(\)\+,/:;<=>@\[\]\^\x60\{\|}~]))+(\?*|\*?))|[\*\-])){4})$`)
+
+var cpeRegex = regexp.MustCompile(cpeRegexString)
+
 func NewCPE(cpeStr string) (CPE, error) {
+	// We should filter out all CPEs that do not match the official CPE regex
+	// The facebook nvdtools parser can sometimes incorrectly parse invalid CPE strings
+	if !cpeRegex.MatchString(cpeStr) {
+		return CPE{}, fmt.Errorf("failed to parse CPE=%q as it doesn't match the regex=%s", cpeStr, cpeRegexString)
+	}
 	value, err := wfn.Parse(cpeStr)
 	if err != nil {
 		return CPE{}, fmt.Errorf("failed to parse CPE=%q: %w", cpeStr, err)
@ -48,5 +64,61 @@ func normalizeCpeField(field string) string {
 	if field == "*" {
 		return wfn.Any
 	}
-	return strings.ReplaceAll(wfn.StripSlashes(field), `\/`, "/")
+	return stripSlashes(field)
+}
+
+// stripSlashes is a reverse of the sanitize function below.
+// It correctly removes slashes that are followed by allowed puncts.
+// This is to allow for a correct round trip parsing of cpes with quoted characters.
+func stripSlashes(s string) string {
+	const allowedPunct = "-!\"#$%&'()+,./:;<=>@[]^`{|}!~"
+	sb := strings.Builder{}
+	for i, c := range s {
+		if c == '\\' && i+1 < len(s) && strings.ContainsRune(allowedPunct, rune(s[i+1])) {
+			continue
+		} else {
+			sb.WriteRune(c)
+		}
+	}
+	return sb.String()
+}
+
+func CPEString(c CPE) string {
+	output := CPE{}
+	output.Vendor = sanitize(c.Vendor)
+	output.Product = sanitize(c.Product)
+	output.Language = sanitize(c.Language)
+	output.Version = sanitize(c.Version)
+	output.TargetSW = sanitize(c.TargetSW)
+	output.Part = sanitize(c.Part)
+	output.Edition = sanitize(c.Edition)
+	output.Other = sanitize(c.Other)
+	output.SWEdition = sanitize(c.SWEdition)
+	output.TargetHW = sanitize(c.TargetHW)
+	output.Update = sanitize(c.Update)
+	return output.BindToFmtString()
+}
+
+// sanitize is a modified version of WFNize function from nvdtools
+// that quotes all the allowed punctation chars with a slash and replaces
+// spaces with underscores. It differs from the upstream implmentation as
+// it does not use the buggy nvdtools implementation, specifically the "addSlashesAt" part of the
+// function which stops the loop as soon as it encounters ":" a valid
+// character for a WFN attribute after quoting, but the way nvdtools
+// handles it causes it to truncate strings that container ":". As a result
+// strings like "prefix:1.2" which would have been quoted as "prefix\:1.2"
+// end up becoming "prefix" instead causing loss of information and
+// incorrect CPEs being generated.
+func sanitize(s string) string {
+	const allowedPunct = "-!\"#$%&'()+,./:;<=>@[]^`{|}!~"
+	// replace spaces with underscores
+	in := strings.ReplaceAll(s, " ", "_")
+	sb := strings.Builder{}
+	for _, c := range in {
+		if strings.ContainsRune(allowedPunct, c) {
+			sb.WriteRune('\\')
+		}
+		sb.WriteRune(c)
+	}
+	return sb.String()
 }
--- a/syft/pkg/cpe_test.go
+++ b/syft/pkg/cpe_test.go
@ -1,6 +1,9 @@
 package pkg

 import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
 	"testing"

 	"github.com/stretchr/testify/assert"
@ -32,7 +35,7 @@ func TestNewCPE(t *testing.T) {
 		{
 			name:     "URL escape characters",
 			input:    `cpe:/a:%240.99_kindle_books_project:%240.99_kindle_books:6::~~~android~~`,
-			expected: must(NewCPE(`cpe:2.3:a:$0.99_kindle_books_project:$0.99_kindle_books:6:*:*:*:*:android:*:*`)),
+			expected: must(NewCPE(`cpe:2.3:a:\$0.99_kindle_books_project:\$0.99_kindle_books:6:*:*:*:*:android:*:*`)),
 		},
 	}

@ -43,8 +46,8 @@ func TestNewCPE(t *testing.T) {
 				t.Fatalf("got an error while creating CPE: %+v", err)
 			}

-			if actual.BindToFmtString() != test.expected.BindToFmtString() {
-				t.Errorf("mismatched entries:\n\texpected:%+v\n\t  actual:%+v\n", test.expected.BindToFmtString(), actual.BindToFmtString())
+			if CPEString(actual) != CPEString(test.expected) {
+				t.Errorf("mismatched entries:\n\texpected:%+v\n\t  actual:%+v\n", CPEString(test.expected), CPEString(actual))
 			}

 		})
@ -80,3 +83,98 @@ func Test_normalizeCpeField(t *testing.T) {
 		})
 	}
 }
+
+func Test_CPEParser(t *testing.T) {
+	testCases := []struct {
+		CPEString string `json:"cpe-string"`
+		CPEUrl    string `json:"cpe-url"`
+		WFN       CPE    `json:"wfn"`
+	}{}
+	out, err := ioutil.ReadFile("test-fixtures/cpe-data.json")
+	if err != nil {
+		t.Fatal("Unable to read test-fixtures/cpe-data.json: ", err)
+	}
+	json.Unmarshal(out, &testCases)
+	for _, test := range testCases {
+		t.Run(test.CPEString, func(t *testing.T) {
+			c1, err := NewCPE(test.CPEString)
+			assert.NoError(t, err)
+			c2, err := NewCPE(test.CPEUrl)
+			assert.NoError(t, err)
+			assert.Equal(t, c1, c2)
+			assert.Equal(t, c1, test.WFN)
+			assert.Equal(t, c2, test.WFN)
+			assert.Equal(t, CPEString(test.WFN), test.CPEString)
+		})
+	}
+}
+
+func Test_InvalidCPE(t *testing.T) {
+
+	testCases := []string{
+		"cpe:2.3:a:some-vendor:name:1:3.2:*:*:*:*:*:*:*",
+		"cpe:2.3:a:some-vendor:name:1^:*:*:*:*:*:*:*",
+		"cpe:2.3:a:some-vendor:name:**:*:*:*:*:*:*:*",
+		"cpe:2.3:a:some-vendor:name:*\\:*:*:*:*:*:*:*",
+	}
+
+	for _, test := range testCases {
+		t.Run(test, func(t *testing.T) {
+			_, err := NewCPE(test)
+			assert.Error(t, err)
+			assert.Contains(t, fmt.Sprint(err), "regex")
+		})
+	}
+}
+
+func Test_RoundTrip(t *testing.T) {
+	tests := []struct {
+		name      string
+		cpe       string
+		parsedCPE CPE
+	}{
+		{
+			name: "normal",
+			cpe:  "cpe:2.3:a:some-vendor:name:3.2:*:*:*:*:*:*:*",
+			parsedCPE: CPE{
+				Part:    "a",
+				Vendor:  "some-vendor",
+				Product: "name",
+				Version: "3.2",
+			},
+		},
+		{
+			name: "escaped colon",
+			cpe:  "cpe:2.3:a:some-vendor:name:1\\:3.2:*:*:*:*:*:*:*",
+			parsedCPE: CPE{
+				Part:    "a",
+				Vendor:  "some-vendor",
+				Product: "name",
+				Version: "1:3.2",
+			},
+		},
+		{
+			name: "escaped forward slash",
+			cpe:  "cpe:2.3:a:test\\/some-vendor:name:3.2:*:*:*:*:*:*:*",
+			parsedCPE: CPE{
+				Part:    "a",
+				Vendor:  "test/some-vendor",
+				Product: "name",
+				Version: "3.2",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// CPE string must be preserved through a round trip
+			assert.Equal(t, test.cpe, CPEString(MustCPE(test.cpe)))
+			// The parsed CPE must be the same after a round trip
+			assert.Equal(t, MustCPE(test.cpe), MustCPE(CPEString(MustCPE(test.cpe))))
+			// The test case parsed CPE must be the same after parsing the input string
+			assert.Equal(t, test.parsedCPE, MustCPE(test.cpe))
+			// The test case parsed CPE must produce the same string as the input cpe
+			assert.Equal(t, CPEString(test.parsedCPE), test.cpe)
+		})
+	}
+}
--- a/syft/pkg/test-fixtures/cpe-data.json
+++ b/syft/pkg/test-fixtures/cpe-data.json
--- a/test/integration/encode_decode_cycle_test.go
+++ b/test/integration/encode_decode_cycle_test.go
@ -33,7 +33,6 @@ func TestEncodeDecodeEncodeCycleComparison(t *testing.T) {

 			by1, err := syft.Encode(originalSBOM, test.format)
 			assert.NoError(t, err)
-
 			newSBOM, newFormat, err := syft.Decode(bytes.NewReader(by1))
 			assert.NoError(t, err)
 			assert.Equal(t, test.format, newFormat)