diff --git a/syft/pkg/cataloger/debian/parse_copyright.go b/syft/pkg/cataloger/debian/parse_copyright.go index f4b879842..e6936ce2d 100644 --- a/syft/pkg/cataloger/debian/parse_copyright.go +++ b/syft/pkg/cataloger/debian/parse_copyright.go @@ -18,71 +18,60 @@ var ( licensePattern = regexp.MustCompile(`^License: (?P\S*)`) commonLicensePathPattern = regexp.MustCompile(`/usr/share/common-licenses/(?P[0-9A-Za-z_.\-]+)`) licenseAgreementHeadingPattern = regexp.MustCompile(`(?i)^\s*(?PLICENSE AGREEMENT(?: FOR .+?)?)\s*$`) + formatHeaderPattern = regexp.MustCompile(`^Format:\s*https?://www\.debian\.org/doc/packaging-manuals/copyright-format/`) +) + +// heading-detection states. Replaces licenseFirstSentenceAfterHeadingPattern, +// which only matched at the start of the file: a non-empty heading, a line +// of dashes, blank lines, then text up to the first period. +const ( + expectHeading = iota + expectDashes + skipBlanks + captureLicense + headingDone // matched or impossible — stop checking ) func parseLicensesFromCopyright(reader io.Reader) []string { findings := strset.New() scanner := bufio.NewScanner(reader) - // State machine replacing licenseFirstSentenceAfterHeadingPattern. - // That regex only matched at the start of the file: a non-empty heading, - // a line of dashes, blank lines, then text up to the first period. - const ( - expectHeading = iota - expectDashes - skipBlanks - captureLicense - headingDone // matched or impossible — stop checking - ) + // per the DEP-5 spec, machine-readable copyright files MUST have a + // Format field whose value is a URI for the specification. Only files + // with this header should be parsed as machine-readable. + // See: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + formatVerified := false headingState := expectHeading var licenseText strings.Builder for scanner.Scan() { line := scanner.Text() - // per-line regex checks (applied to every line) - if value := findLicenseClause(licensePattern, line); value != "" { - findings.Add(value) - } - if value := findLicenseClause(commonLicensePathPattern, line); value != "" { - findings.Add(value) - } - if value := findLicenseClause(licenseAgreementHeadingPattern, line); value != "" { - findings.Add(value) + if !formatVerified { + if strings.TrimSpace(line) == "" { + continue + } + if !formatHeaderPattern.MatchString(line) { + return nil + } + formatVerified = true } - // multi-line heading detection (only at start of file) - switch headingState { - case expectHeading: - if strings.TrimSpace(line) != "" { - headingState = expectDashes - } else { - headingState = headingDone - } - case expectDashes: - trimmed := strings.TrimSpace(line) - if len(trimmed) > 0 && strings.Trim(trimmed, "-") == "" { - headingState = skipBlanks - } else { - headingState = headingDone - } - case skipBlanks: - if strings.TrimSpace(line) != "" { - headingState = captureLicense - licenseText.WriteString(line) - if value := extractUpToFirstPeriod(licenseText.String()); value != "" { - findings.Add(value) - headingState = headingDone - } - } - case captureLicense: - licenseText.WriteString(" ") - licenseText.WriteString(line) - if value := extractUpToFirstPeriod(licenseText.String()); value != "" { + for _, p := range []*regexp.Regexp{licensePattern, commonLicensePathPattern, licenseAgreementHeadingPattern} { + if value := findLicenseClause(p, line); value != "" { findings.Add(value) - headingState = headingDone } } + + var found string + headingState, found = advanceHeadingState(headingState, line, &licenseText) + if found != "" { + findings.Add(found) + } + } + + if !formatVerified { + return nil } results := findings.List() @@ -91,6 +80,40 @@ func parseLicensesFromCopyright(reader io.Reader) []string { return results } +func advanceHeadingState(state int, line string, licenseText *strings.Builder) (int, string) { + switch state { + case expectHeading: + if strings.TrimSpace(line) != "" { + return expectDashes, "" + } + return headingDone, "" + case expectDashes: + trimmed := strings.TrimSpace(line) + if len(trimmed) > 0 && strings.Trim(trimmed, "-") == "" { + return skipBlanks, "" + } + return headingDone, "" + case skipBlanks: + if strings.TrimSpace(line) == "" { + return state, "" + } + licenseText.WriteString(line) + if value := extractUpToFirstPeriod(licenseText.String()); value != "" { + return headingDone, value + } + return captureLicense, "" + case captureLicense: + licenseText.WriteString(" ") + licenseText.WriteString(line) + if value := extractUpToFirstPeriod(licenseText.String()); value != "" { + return headingDone, value + } + return state, "" + case headingDone: + } + return state, "" +} + // extractUpToFirstPeriod returns the license text up to the first period, // processed through ensureIsSingleLicense, or "" if no period found yet. func extractUpToFirstPeriod(s string) string { diff --git a/syft/pkg/cataloger/debian/parse_copyright_test.go b/syft/pkg/cataloger/debian/parse_copyright_test.go index dca0188ca..f25dabca7 100644 --- a/syft/pkg/cataloger/debian/parse_copyright_test.go +++ b/syft/pkg/cataloger/debian/parse_copyright_test.go @@ -2,6 +2,7 @@ package debian import ( "os" + "strings" "testing" "github.com/google/go-cmp/cmp" @@ -14,13 +15,14 @@ func TestParseLicensesFromCopyright(t *testing.T) { expected []string }{ { - fixture: "testdata/copyright/libc6", - // note: there are other licenses in this file that are not matched --we don't do full text license identification yet - expected: []string{"GPL-2", "LGPL-2.1"}, + // no Format header; not machine-readable, returns nil + fixture: "testdata/copyright/libc6", + expected: nil, }, { + // no Format header; not machine-readable, returns nil fixture: "testdata/copyright/trilicense", - expected: []string{"GPL-2", "LGPL-2.1", "MPL-1.1"}, + expected: nil, }, { fixture: "testdata/copyright/liblzma5", @@ -31,21 +33,25 @@ func TestParseLicensesFromCopyright(t *testing.T) { expected: []string{"GPL-1", "GPL-2", "LGPL-2.1"}, }, { - fixture: "testdata/copyright/python", - // note: this should not capture #, Permission, This, see ... however it's not clear how to fix this (this is probably good enough) - expected: []string{"#", "Apache", "Apache-2", "Apache-2.0", "Expat", "GPL-2", "ISC", "LGPL-2.1+", "PSF-2", "Permission", "Python", "This", "see"}, + // no Format header; not machine-readable, returns nil + // previously this captured nonsensical values like "#", "Permission", "This", "see" + fixture: "testdata/copyright/python", + expected: nil, }, { + // no Format header; not machine-readable, returns nil fixture: "testdata/copyright/cuda", - expected: []string{"NVIDIA Software License Agreement and CUDA Supplement to Software License Agreement"}, + expected: nil, }, { + // no Format header; not machine-readable, returns nil fixture: "testdata/copyright/dev-kit", - expected: []string{"LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS"}, + expected: nil, }, { + // no Format header; not machine-readable, returns nil fixture: "testdata/copyright/microsoft", - expected: []string{"LICENSE AGREEMENT FOR MICROSOFT PRODUCTS"}, + expected: nil, }, } @@ -63,3 +69,71 @@ func TestParseLicensesFromCopyright(t *testing.T) { }) } } + +func TestParseLicensesFromCopyright_FormatHeader(t *testing.T) { + tests := []struct { + name string + content string + machineReadable bool + }{ + { + name: "valid http Format header", + content: "Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n", + machineReadable: true, + }, + { + name: "valid https Format header", + content: "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n", + machineReadable: true, + }, + { + name: "blank lines before Format header", + content: "\n\nFormat: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n", + machineReadable: true, + }, + { + name: "no Format header", + content: "This is the Debian prepackaged version of foo.\n", + machineReadable: false, + }, + { + name: "Format header is not first non-blank line", + content: "Some-Field: value\nFormat: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n", + machineReadable: false, + }, + { + name: "empty content", + content: "", + machineReadable: false, + }, + { + name: "only blank lines", + content: "\n\n\n", + machineReadable: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + actual := parseLicensesFromCopyright(strings.NewReader(test.content)) + // parseLicensesFromCopyright returns nil for non-machine-readable + // files and a (possibly empty) slice otherwise. + if test.machineReadable { + require.NotNil(t, actual) + } else { + require.Nil(t, actual) + } + }) + } +} + +func TestParseLicensesFromCopyrightInline(t *testing.T) { + // verify that a file with License: fields but no Format header returns nil + content := `License: GPL-2 +License: LGPL-2.1 +` + actual := parseLicensesFromCopyright(strings.NewReader(content)) + if actual != nil { + t.Errorf("expected nil for non-machine-readable file, got %v", actual) + } +}