mirror of
https://github.com/anchore/syft.git
synced 2026-05-20 04:05:24 +02:00
fix(debian): only parse machine-readable copyright files with Format header (#4754)
* fix(debian): only parse machine-readable copyright files with Format header

  Only parse debian/copyright files as machine-readable DEP-5 format when they contain the mandatory Format header field pointing to the copyright specification URI. Files without this header are free-form text and should not have License: regex patterns applied to them, which previously produced nonsensical results like "#", "Permission", "This", "see" for non-machine-readable files.

  The fallback license classifier in the debian cataloger will handle non-machine-readable files by doing full-text license identification.

  Closes #4708

  Signed-off-by: Bahtya <bahtya@users.noreply.github.com>
  Signed-off-by: Bahtya <bahtayr@gmail.com>
  Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* decompose parseLicensesFromCopyright to address linting issues

  Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Bahtya <bahtya@users.noreply.github.com>
Signed-off-by: Bahtya <bahtayr@gmail.com>
Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
Co-authored-by: Bahtya <bahtayr@gmail.com>
Co-authored-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
parent
47cda2b5ef
commit
d81df67493
@ -18,71 +18,60 @@ var (
|
||||
licensePattern = regexp.MustCompile(`^License: (?P<license>\S*)`)
|
||||
commonLicensePathPattern = regexp.MustCompile(`/usr/share/common-licenses/(?P<license>[0-9A-Za-z_.\-]+)`)
|
||||
licenseAgreementHeadingPattern = regexp.MustCompile(`(?i)^\s*(?P<license>LICENSE AGREEMENT(?: FOR .+?)?)\s*$`)
|
||||
formatHeaderPattern = regexp.MustCompile(`^Format:\s*https?://www\.debian\.org/doc/packaging-manuals/copyright-format/`)
|
||||
)
|
||||
|
||||
// Heading-detection states. Together they replace
// licenseFirstSentenceAfterHeadingPattern, which only matched at the start of
// the file: a non-empty heading, a line of dashes, blank lines, then text up
// to the first period.
const (
	// expectHeading: waiting for the first line, which must be non-blank.
	expectHeading = iota
	// expectDashes: the line after the heading must consist solely of dashes.
	expectDashes
	// skipBlanks: blank lines after the dashes are ignored until text appears.
	skipBlanks
	// captureLicense: text is accumulated until a first period is reached.
	captureLicense
	// headingDone: matched or impossible — stop checking.
	headingDone
)
|
||||
|
||||
func parseLicensesFromCopyright(reader io.Reader) []string {
|
||||
findings := strset.New()
|
||||
scanner := bufio.NewScanner(reader)
|
||||
|
||||
// State machine replacing licenseFirstSentenceAfterHeadingPattern.
|
||||
// That regex only matched at the start of the file: a non-empty heading,
|
||||
// a line of dashes, blank lines, then text up to the first period.
|
||||
const (
|
||||
expectHeading = iota
|
||||
expectDashes
|
||||
skipBlanks
|
||||
captureLicense
|
||||
headingDone // matched or impossible — stop checking
|
||||
)
|
||||
// per the DEP-5 spec, machine-readable copyright files MUST have a
|
||||
// Format field whose value is a URI for the specification. Only files
|
||||
// with this header should be parsed as machine-readable.
|
||||
// See: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
formatVerified := false
|
||||
headingState := expectHeading
|
||||
var licenseText strings.Builder
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
// per-line regex checks (applied to every line)
|
||||
if value := findLicenseClause(licensePattern, line); value != "" {
|
||||
findings.Add(value)
|
||||
if !formatVerified {
|
||||
if strings.TrimSpace(line) == "" {
|
||||
continue
|
||||
}
|
||||
if value := findLicenseClause(commonLicensePathPattern, line); value != "" {
|
||||
findings.Add(value)
|
||||
if !formatHeaderPattern.MatchString(line) {
|
||||
return nil
|
||||
}
|
||||
if value := findLicenseClause(licenseAgreementHeadingPattern, line); value != "" {
|
||||
findings.Add(value)
|
||||
formatVerified = true
|
||||
}
|
||||
|
||||
// multi-line heading detection (only at start of file)
|
||||
switch headingState {
|
||||
case expectHeading:
|
||||
if strings.TrimSpace(line) != "" {
|
||||
headingState = expectDashes
|
||||
} else {
|
||||
headingState = headingDone
|
||||
}
|
||||
case expectDashes:
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if len(trimmed) > 0 && strings.Trim(trimmed, "-") == "" {
|
||||
headingState = skipBlanks
|
||||
} else {
|
||||
headingState = headingDone
|
||||
}
|
||||
case skipBlanks:
|
||||
if strings.TrimSpace(line) != "" {
|
||||
headingState = captureLicense
|
||||
licenseText.WriteString(line)
|
||||
if value := extractUpToFirstPeriod(licenseText.String()); value != "" {
|
||||
for _, p := range []*regexp.Regexp{licensePattern, commonLicensePathPattern, licenseAgreementHeadingPattern} {
|
||||
if value := findLicenseClause(p, line); value != "" {
|
||||
findings.Add(value)
|
||||
headingState = headingDone
|
||||
}
|
||||
}
|
||||
case captureLicense:
|
||||
licenseText.WriteString(" ")
|
||||
licenseText.WriteString(line)
|
||||
if value := extractUpToFirstPeriod(licenseText.String()); value != "" {
|
||||
findings.Add(value)
|
||||
headingState = headingDone
|
||||
|
||||
var found string
|
||||
headingState, found = advanceHeadingState(headingState, line, &licenseText)
|
||||
if found != "" {
|
||||
findings.Add(found)
|
||||
}
|
||||
}
|
||||
|
||||
if !formatVerified {
|
||||
return nil
|
||||
}
|
||||
|
||||
results := findings.List()
|
||||
@ -91,6 +80,40 @@ func parseLicensesFromCopyright(reader io.Reader) []string {
|
||||
return results
|
||||
}
|
||||
|
||||
func advanceHeadingState(state int, line string, licenseText *strings.Builder) (int, string) {
|
||||
switch state {
|
||||
case expectHeading:
|
||||
if strings.TrimSpace(line) != "" {
|
||||
return expectDashes, ""
|
||||
}
|
||||
return headingDone, ""
|
||||
case expectDashes:
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if len(trimmed) > 0 && strings.Trim(trimmed, "-") == "" {
|
||||
return skipBlanks, ""
|
||||
}
|
||||
return headingDone, ""
|
||||
case skipBlanks:
|
||||
if strings.TrimSpace(line) == "" {
|
||||
return state, ""
|
||||
}
|
||||
licenseText.WriteString(line)
|
||||
if value := extractUpToFirstPeriod(licenseText.String()); value != "" {
|
||||
return headingDone, value
|
||||
}
|
||||
return captureLicense, ""
|
||||
case captureLicense:
|
||||
licenseText.WriteString(" ")
|
||||
licenseText.WriteString(line)
|
||||
if value := extractUpToFirstPeriod(licenseText.String()); value != "" {
|
||||
return headingDone, value
|
||||
}
|
||||
return state, ""
|
||||
case headingDone:
|
||||
}
|
||||
return state, ""
|
||||
}
|
||||
|
||||
// extractUpToFirstPeriod returns the license text up to the first period,
|
||||
// processed through ensureIsSingleLicense, or "" if no period found yet.
|
||||
func extractUpToFirstPeriod(s string) string {
|
||||
|
||||
@ -2,6 +2,7 @@ package debian
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
@ -14,13 +15,14 @@ func TestParseLicensesFromCopyright(t *testing.T) {
|
||||
expected []string
|
||||
}{
|
||||
{
|
||||
// no Format header; not machine-readable, returns nil
|
||||
fixture: "testdata/copyright/libc6",
|
||||
// note: there are other licenses in this file that are not matched --we don't do full text license identification yet
|
||||
expected: []string{"GPL-2", "LGPL-2.1"},
|
||||
expected: nil,
|
||||
},
|
||||
{
|
||||
// no Format header; not machine-readable, returns nil
|
||||
fixture: "testdata/copyright/trilicense",
|
||||
expected: []string{"GPL-2", "LGPL-2.1", "MPL-1.1"},
|
||||
expected: nil,
|
||||
},
|
||||
{
|
||||
fixture: "testdata/copyright/liblzma5",
|
||||
@ -31,21 +33,25 @@ func TestParseLicensesFromCopyright(t *testing.T) {
|
||||
expected: []string{"GPL-1", "GPL-2", "LGPL-2.1"},
|
||||
},
|
||||
{
|
||||
// no Format header; not machine-readable, returns nil
|
||||
// previously this captured nonsensical values like "#", "Permission", "This", "see"
|
||||
fixture: "testdata/copyright/python",
|
||||
// note: this should not capture #, Permission, This, see ... however it's not clear how to fix this (this is probably good enough)
|
||||
expected: []string{"#", "Apache", "Apache-2", "Apache-2.0", "Expat", "GPL-2", "ISC", "LGPL-2.1+", "PSF-2", "Permission", "Python", "This", "see"},
|
||||
expected: nil,
|
||||
},
|
||||
{
|
||||
// no Format header; not machine-readable, returns nil
|
||||
fixture: "testdata/copyright/cuda",
|
||||
expected: []string{"NVIDIA Software License Agreement and CUDA Supplement to Software License Agreement"},
|
||||
expected: nil,
|
||||
},
|
||||
{
|
||||
// no Format header; not machine-readable, returns nil
|
||||
fixture: "testdata/copyright/dev-kit",
|
||||
expected: []string{"LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS"},
|
||||
expected: nil,
|
||||
},
|
||||
{
|
||||
// no Format header; not machine-readable, returns nil
|
||||
fixture: "testdata/copyright/microsoft",
|
||||
expected: []string{"LICENSE AGREEMENT FOR MICROSOFT PRODUCTS"},
|
||||
expected: nil,
|
||||
},
|
||||
}
|
||||
|
||||
@ -63,3 +69,71 @@ func TestParseLicensesFromCopyright(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLicensesFromCopyright_FormatHeader(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
machineReadable bool
|
||||
}{
|
||||
{
|
||||
name: "valid http Format header",
|
||||
content: "Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
|
||||
machineReadable: true,
|
||||
},
|
||||
{
|
||||
name: "valid https Format header",
|
||||
content: "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
|
||||
machineReadable: true,
|
||||
},
|
||||
{
|
||||
name: "blank lines before Format header",
|
||||
content: "\n\nFormat: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
|
||||
machineReadable: true,
|
||||
},
|
||||
{
|
||||
name: "no Format header",
|
||||
content: "This is the Debian prepackaged version of foo.\n",
|
||||
machineReadable: false,
|
||||
},
|
||||
{
|
||||
name: "Format header is not first non-blank line",
|
||||
content: "Some-Field: value\nFormat: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
|
||||
machineReadable: false,
|
||||
},
|
||||
{
|
||||
name: "empty content",
|
||||
content: "",
|
||||
machineReadable: false,
|
||||
},
|
||||
{
|
||||
name: "only blank lines",
|
||||
content: "\n\n\n",
|
||||
machineReadable: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
actual := parseLicensesFromCopyright(strings.NewReader(test.content))
|
||||
// parseLicensesFromCopyright returns nil for non-machine-readable
|
||||
// files and a (possibly empty) slice otherwise.
|
||||
if test.machineReadable {
|
||||
require.NotNil(t, actual)
|
||||
} else {
|
||||
require.Nil(t, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLicensesFromCopyrightInline(t *testing.T) {
|
||||
// verify that a file with License: fields but no Format header returns nil
|
||||
content := `License: GPL-2
|
||||
License: LGPL-2.1
|
||||
`
|
||||
actual := parseLicensesFromCopyright(strings.NewReader(content))
|
||||
if actual != nil {
|
||||
t.Errorf("expected nil for non-machine-readable file, got %v", actual)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user