// MatchNamedCaptureGroups takes a regular expression and string and returns the named capture group results of a
// single match as a map keyed by group name. Unnamed groups are never included.
//
// The match that is used is the first one (in FindAllStringSubmatch order) in which at least one named group
// captured a non-empty value. If no match captured anything non-empty, the named groups of the last match are
// returned (every value is then the empty string). If the expression does not match at all, or it has no named
// groups, nil is returned.
func MatchNamedCaptureGroups(regEx *regexp.Regexp, content string) map[string]string {
	// note: we are looking across all matches and stopping on the first non-empty match. Why? Take the following example:
	// input: "cool something to match against" pattern: `((?P<name>match) (?P<version>against))?`. Since the pattern
	// is encapsulated in an optional capture group, there will be results for each character, but the results will
	// match on nothing. The only "true" match will be at the end ("match against").
	names := regEx.SubexpNames() // names[i] is the name of group i; names[0] (the whole match) is always empty
	allMatches := regEx.FindAllStringSubmatch(content, -1)
	lastIdx := len(allMatches) - 1

	for matchIdx, match := range allMatches {
		// fill a candidate results map with named capture group results, accepting empty values, but not groups
		// with no names.
		results := make(map[string]string)
		for nameIdx, name := range names {
			// the bound must be strict: len(match) itself is not a valid index into match.
			if nameIdx < len(match) && name != "" {
				results[name] = match[nameIdx]
			}
		}
		if len(results) == 0 {
			// no named groups in this candidate, nothing to report
			continue
		}

		// since we are looking for the first best potential match we stop at the first candidate that holds a
		// non-empty value.
		foundNonEmptyValue := false
		for _, value := range results {
			if value != "" {
				foundNonEmptyValue = true
				break
			}
		}

		// return the first non-empty result, or if this is the last match, the results that were found.
		if foundNonEmptyValue || matchIdx == lastIdx {
			return results
		}
	}
	return nil
}
match against", + pattern: `((?Pmatch) (?Pagainst))?`, + expected: map[string]string{ + "name": "match", + "version": "against", + }, + }, + { + name: "nested optional capture groups with larger match", + input: "cool something to match against match never", + pattern: `.*?((?Pmatch) (?P(against|never)))?`, + expected: map[string]string{ + "name": "match", + "version": "against", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actual := MatchNamedCaptureGroups(regexp.MustCompile(test.pattern), test.input) + assert.Equal(t, test.expected, actual) + }) + } +} diff --git a/syft/pkg/cataloger/deb/parse_copyright.go b/syft/pkg/cataloger/deb/parse_copyright.go index ceb1dd613..95ee16a8f 100644 --- a/syft/pkg/cataloger/deb/parse_copyright.go +++ b/syft/pkg/cataloger/deb/parse_copyright.go @@ -21,7 +21,7 @@ func parseLicensesFromCopyright(reader io.Reader) []string { for scanner.Scan() { line := scanner.Text() - matchesByGroup := internal.MatchCaptureGroups(licensePattern, line) + matchesByGroup := internal.MatchNamedCaptureGroups(licensePattern, line) if len(matchesByGroup) > 0 { candidate, ok := matchesByGroup["license"] if !ok { diff --git a/syft/pkg/cataloger/deb/parse_dpkg_status.go b/syft/pkg/cataloger/deb/parse_dpkg_status.go index 8992c1836..ea25584dd 100644 --- a/syft/pkg/cataloger/deb/parse_dpkg_status.go +++ b/syft/pkg/cataloger/deb/parse_dpkg_status.go @@ -145,7 +145,7 @@ func extractAllFields(reader *bufio.Reader) (map[string]interface{}, error) { // of the "" form, then return name and nil func extractSourceVersion(source string) (string, string) { // special handling for the Source field since it has formatted data - match := internal.MatchCaptureGroups(sourceRegexp, source) + match := internal.MatchNamedCaptureGroups(sourceRegexp, source) return match["name"], match["version"] } diff --git a/syft/pkg/cataloger/javascript/parse_package_json.go b/syft/pkg/cataloger/javascript/parse_package_json.go index 
d5ff50142..9cc2f5953 100644 --- a/syft/pkg/cataloger/javascript/parse_package_json.go +++ b/syft/pkg/cataloger/javascript/parse_package_json.go @@ -63,7 +63,7 @@ func (a *Author) UnmarshalJSON(b []byte) error { } } else { // parse out "name (url)" into an Author struct - fields = internal.MatchCaptureGroups(authorPattern, authorStr) + fields = internal.MatchNamedCaptureGroups(authorPattern, authorStr) } // translate the map into a structure diff --git a/syft/pkg/cataloger/ruby/parse_gemspec.go b/syft/pkg/cataloger/ruby/parse_gemspec.go index fc99f6710..f743a2f5c 100644 --- a/syft/pkg/cataloger/ruby/parse_gemspec.go +++ b/syft/pkg/cataloger/ruby/parse_gemspec.go @@ -77,7 +77,7 @@ func parseGemSpecEntries(_ string, reader io.Reader) ([]pkg.Package, error) { } for field, pattern := range patterns { - matchMap := internal.MatchCaptureGroups(pattern, sanitizedLine) + matchMap := internal.MatchNamedCaptureGroups(pattern, sanitizedLine) if value := matchMap[field]; value != "" { if postProcessor := postProcessors[field]; postProcessor != nil { fields[field] = postProcessor(value)