Sambhav Kothari 2a7325a965
Fix CPE encode/decode when it contains special chars (#714)
* Fix CPE generation when the generated CPE contains invalid characters

Currently syft seems to generate invalid CPEs which do not
conform with the official CPE spec. This is because the underlying
nvdtools library is not a completely spec compliant implementation
and has some interesting bugs/issues.

The following are the list of issues I have encountered with nvdtools:

1. It parses strings which are not CPEs incorrectly as valid CPEs. This
messes up our filter function which is supposed to filter out any
incorrect CPEs we generate. In order to fix this, I have introduced
a new regex in the NewCPE function which follows the upstream spec and
filters out any incorrect CPEs.

2. Introduce wfn.WFNize for any cpe attributes we infer from packages.
This ensures that we are escaping and quoting any special characters
before putting them into CPEs. Note that nvdtools has yet another bug
in the WFNize function, specifically the "addSlashesAt" part of the
function which stops the loop as soon as it encounters ":" a valid
character for a WFN attribute after quoting, but the way nvdtools
handles it causes it to truncate strings that contain ":". As a result
strings like "prefix:1.2" which would have been quoted as "prefix\:1.2"
end up becoming "prefix" instead causing loss of information and
incorrect CPEs being generated. As a result, in such cases we remove
strings containing ":" in any part entirely for now. This is similar
to the way we were handling CPE filtering in the past with http urls as
vendor strings

3. Add special handling for versions which contain ":" due to epochs in
debian and rpm. In this case, we strip out the part before ":", i.e.
the epoch, and only output the actual version. This ensures we are not
discarding valid version strings due to point 2.

In the future we should look at moving to a more spec compliant cpe
parsing library to avoid such shenanigans.

Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>

* Remove WFNize for input strings

WFNize seems to not be part of the standard as per
https://pkg.go.dev/github.com/facebookincubator/nvdtools@v0.1.4/wfn#WFNize
and seems to have bugs/issues with encode/decode cycles, so I am
just removing it at this point and relying on the CPE regex to filter
out invalid CPEs for now.

Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>

* Quote the string on decode to ensure consistent CPE string generation

Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>

* Add test cases for round-tripping the CPE and fix strip slashes

Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>

* Add comprehensive tests for cpe parsing

Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>

* Use strings.Builder instead of byte buffer

Signed-off-by: Sambhav Kothari <skothari44@bloomberg.net>
2022-01-06 09:56:53 -05:00

235 lines
7.1 KiB
Go

package cpe
import (
"bufio"
"bytes"
"fmt"
"sort"
"strings"
"github.com/anchore/syft/internal"
"github.com/anchore/syft/syft/pkg"
"github.com/facebookincubator/nvdtools/wfn"
)
// newCPE builds a CPE whose part is "a" from the given product, vendor, version and target software.
// It starts from a copy of wfn.NewAttributesWithAny(), so every attribute not assigned here keeps
// whatever value that constructor gave it.
func newCPE(product, vendor, version, targetSW string) wfn.Attributes {
	attrs := *wfn.NewAttributesWithAny()

	attrs.Part = "a"
	attrs.Vendor = vendor
	attrs.Product = product
	attrs.Version = version
	attrs.TargetSW = targetSW

	return attrs
}
// Generate creates a list of CPEs for the given package by guessing (vendor, product) tuples. The aim is the
// minimal set of representative CPEs, which implies that optional fields (such as target SW) are left unset.
//
// It returns nil when no product candidates can be derived. Otherwise every product/vendor combination is
// turned into a CPE carrying the package version, the result is passed through cpeFilters, and the surviving
// CPEs are sorted with BySpecificity.
func Generate(p pkg.Package) []pkg.CPE {
	vendors := candidateVendors(p)
	products := candidateProducts(p)
	if len(products) == 0 {
		return nil
	}

	seen := internal.NewStringSet()
	candidates := make([]pkg.CPE, 0)
	for _, product := range products {
		for _, vendor := range vendors {
			// only emit each product/vendor/version combination once
			id := fmt.Sprintf("%s|%s|%s", product, vendor, p.Version)
			if !seen.Contains(id) {
				seen.Add(id)
				candidates = append(candidates, newCPE(product, vendor, p.Version, wfn.Any))
			}
		}
	}

	// drop any known combinations that don't accurately represent this package
	candidates = filter(candidates, p, cpeFilters...)

	sort.Sort(BySpecificity(candidates))

	return candidates
}
// candidateVendors returns the unique vendor candidates for the given package. The candidates are seeded from
// the product candidates, adjusted per language and per metadata type, and then expanded with delimiter
// variations, sub-selections and any configured additional vendors.
func candidateVendors(p pkg.Package) []string {
	// in ecosystems where the packaging metadata does not have a clear field to indicate a vendor (or a field that
	// could be interpreted indirectly as such) the project name tends to be a common stand in. Examples of this
	// are the elasticsearch gem, xstream jar, and rack gem... all of these cases you can find vulnerabilities
	// with CPEs where the vendor is the product name and doesn't appear to be derived from any available package
	// metadata.
	candidates := newFieldCandidateSet(candidateProducts(p)...)

	// language-specific adjustments. Some ecosystems do not have enough metadata to determine the vendor
	// accurately, in which case we selectively allow * as a candidate. Note: do NOT allow Java packages to
	// have * vendors.
	switch p.Language {
	case pkg.Ruby:
		candidates.addValue("ruby-lang")
		candidates.addValue(wfn.Any)
	case pkg.JavaScript:
		candidates.addValue(wfn.Any)
	case pkg.Go:
		// replace all candidates with only the golang-specific helper
		candidates.clear()
		if goVendor := candidateVendorForGo(p.Name); goVendor != "" {
			candidates.addValue(goVendor)
		}
	}

	// merge in the candidates derived from the package's metadata, when a helper exists for that metadata type
	switch p.MetadataType {
	case pkg.RpmdbMetadataType:
		candidates.union(candidateVendorsForRPM(p))
	case pkg.GemMetadataType:
		candidates.union(candidateVendorsForRuby(p))
	case pkg.PythonPackageMetadataType:
		candidates.union(candidateVendorsForPython(p))
	case pkg.JavaMetadataType:
		candidates.union(candidateVendorsForJava(p))
	}

	// try swapping hyphens for underscores, vice versa, and removing separators altogether
	addDelimiterVariations(candidates)

	// generate sub-selections of each candidate based on separators (e.g. jenkins-ci -> [jenkins, jenkins-ci])
	addAllSubSelections(candidates)

	// add more candidates based on the package info for each vendor candidate; the values being iterated are
	// captured once, before any of the additions below are made
	for _, known := range candidates.uniqueValues() {
		extra := findAdditionalVendors(defaultCandidateAdditions, p.Type, p.Name, known)
		candidates.addValue(extra...)
	}

	return candidates.uniqueValues()
}
// candidateProducts returns the unique product candidates for the given package. The package name is the
// starting candidate; language-specific rules may add to or replace it before delimiter variations and any
// configured additional products are appended.
func candidateProducts(p pkg.Package) []string {
	candidates := newFieldCandidateSet(p.Name)

	switch {
	case p.Language == pkg.Python:
		// also offer a "python-" prefixed name unless the name already starts with "python"
		if !strings.HasPrefix(p.Name, "python") {
			candidates.addValue("python-" + p.Name)
		}
	case p.Language == pkg.Java || p.MetadataType == pkg.JavaMetadataType:
		candidates.addValue(candidateProductsForJava(p)...)
	case p.Language == pkg.Go:
		// replace all candidates with only the golang-specific helper
		candidates.clear()
		if goProduct := candidateProductForGo(p.Name); goProduct != "" {
			candidates.addValue(goProduct)
		}
	}

	// it is never OK to have candidates with these values ["" and "*"] (since CPEs will match any other value)
	for _, forbidden := range []string{"", "*"} {
		candidates.removeByValue(forbidden)
	}

	// try swapping hyphens for underscores, vice versa, and removing separators altogether
	addDelimiterVariations(candidates)

	// add known candidate additions
	extra := findAdditionalProducts(defaultCandidateAdditions, p.Type, p.Name)
	candidates.addValue(extra...)

	return candidates.uniqueValues()
}
// addAllSubSelections adds to fields every sub-selection (see generateSubSelections) of each existing
// candidate value, skipping candidates for which subSelectionsDisallowed holds. The candidates are taken
// from a copy of the set, so values added during the loop are not themselves expanded.
func addAllSubSelections(fields fieldCandidateSet) {
	eligible := fields.copy()
	eligible.removeWhere(subSelectionsDisallowed)

	for _, value := range eligible.values() {
		subSelections := generateSubSelections(value)
		fields.addValue(subSelections...)
	}
}
// generateSubSelections splits a field on hyphens and underscores and returns the cumulative selections that can
// be used as product or vendor candidates, shortest first. E.g. jenkins-ci-tools -> [jenkins, jenkins-ci,
// jenkins-ci-tools]. Leading and trailing separators are dropped, and consecutive separators are joined using
// only the last separator seen before the next non-empty piece.
func generateSubSelections(field string) (results []string) {
	tokens := bufio.NewScanner(strings.NewReader(field))
	tokens.Split(scanByHyphenOrUnderscore)

	// final byte of the previously scanned token; it is only read once results is non-empty
	var sep uint8

	for tokens.Scan() {
		raw := tokens.Text()
		if raw == "" {
			break
		}

		// scanByHyphenOrUnderscore keeps the separator attached to each token (and a token may consist solely of
		// a separator), so strip hyphens/underscores from both ends before using the piece.
		if piece := strings.TrimFunc(raw, trimHyphenOrUnderscore); piece != "" {
			selection := piece
			if n := len(results); n > 0 {
				// extend the previous selection with the separator that preceded this piece
				selection = results[n-1] + string(sep) + piece
			}
			results = append(results, selection)
		}

		// remember how this token ended so the next piece can be joined with the same separator
		sep = raw[len(raw)-1]
	}

	return results
}
// trimHyphenOrUnderscore reports whether r is a hyphen or an underscore. It is intended as the predicate for
// strings.TrimFunc when stripping those separators from the ends of a string.
func trimHyphenOrUnderscore(r rune) bool {
	return r == '-' || r == '_'
}
// scanByHyphenOrUnderscore is a bufio.SplitFunc that splits on hyphen or underscore and includes the separator
// at the end of each token. When no separator is present, the remaining data is returned as the final token
// once atEOF is set; otherwise no token is returned and nothing is consumed.
func scanByHyphenOrUnderscore(data []byte, atEOF bool) (advance int, token []byte, err error) {
	sepAt := bytes.IndexAny(data, "-_")

	switch {
	case sepAt >= 0:
		// consume up to and including the separator
		end := sepAt + 1
		return end, data[:end], nil
	case atEOF && len(data) > 0:
		// no separator left: whatever remains is the last token
		return len(data), data, nil
	default:
		// nothing to emit (either no data at all, or more data is needed to find a separator)
		return 0, nil, nil
	}
}
// addDelimiterVariations adds hyphen/underscore variants of the existing candidates to fields: a candidate whose
// value contains hyphens gains a variant with every hyphen replaced by an underscore, and a candidate whose value
// contains underscores gains a variant with every underscore replaced by a hyphen. Candidates for which
// delimiterVariationsDisallowed holds are skipped. Each variant is the source candidate with only its value
// changed.
func addDelimiterVariations(fields fieldCandidateSet) {
	eligible := fields.copy()
	eligible.removeWhere(delimiterVariationsDisallowed)

	for _, source := range eligible.list() {
		// capture the value once so both checks below see the untouched original
		value := source.value

		if strings.Contains(value, "-") {
			// hyphenated candidate -> underscore variation
			variant := source
			variant.value = strings.ReplaceAll(value, "-", "_")
			fields.add(variant)
		}

		if strings.Contains(value, "_") {
			// underscored candidate -> hyphen variation
			variant := source
			variant.value = strings.ReplaceAll(value, "_", "-")
			fields.add(variant)
		}
	}
}