fix(debian): only parse machine-readable copyright files with Format header (#4754)

* fix(debian): only parse machine-readable copyright files with Format header

Only parse debian/copyright files as machine-readable DEP-5 format when
they contain the mandatory Format header field pointing to the copyright
specification URI. Files without this header are free-form text and
should not have License: regex patterns applied to them, which previously
produced nonsensical results like "#", "Permission", "This", "see" for
non-machine-readable files.

The fallback license classifier in the debian cataloger will handle
non-machine-readable files by doing full-text license identification.

Closes #4708

Signed-off-by: Bahtya <bahtya@users.noreply.github.com>
Signed-off-by: Bahtya <bahtayr@gmail.com>
Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* decompose parseLicensesFromCopyright to address linting issues

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Bahtya <bahtya@users.noreply.github.com>
Signed-off-by: Bahtya <bahtayr@gmail.com>
Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
Co-authored-by: Bahtya <bahtayr@gmail.com>
Co-authored-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
bahtyar 2026-05-06 21:02:27 +08:00 committed by GitHub
parent 47cda2b5ef
commit d81df67493
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 155 additions and 58 deletions

View File

@ -18,71 +18,60 @@ var (
licensePattern = regexp.MustCompile(`^License: (?P<license>\S*)`)
commonLicensePathPattern = regexp.MustCompile(`/usr/share/common-licenses/(?P<license>[0-9A-Za-z_.\-]+)`)
licenseAgreementHeadingPattern = regexp.MustCompile(`(?i)^\s*(?P<license>LICENSE AGREEMENT(?: FOR .+?)?)\s*$`)
formatHeaderPattern = regexp.MustCompile(`^Format:\s*https?://www\.debian\.org/doc/packaging-manuals/copyright-format/`)
)
// heading-detection states. Replaces licenseFirstSentenceAfterHeadingPattern,
// which only matched at the start of the file: a non-empty heading, a line
// of dashes, blank lines, then text up to the first period.
//
// The states are consumed by advanceHeadingState, which is fed one line at a
// time and returns the next state.
const (
// expectHeading: waiting for the heading line; a non-blank line moves to
// expectDashes, a blank line ends detection.
expectHeading = iota
// expectDashes: the line must consist solely of '-' characters (ignoring
// surrounding whitespace); anything else ends detection.
expectDashes
// skipBlanks: blank lines are ignored; the first non-blank line starts the
// candidate license text.
skipBlanks
// captureLicense: each further line is appended (space-separated) to the
// candidate text until a value can be extracted from it.
captureLicense
headingDone // matched or impossible — stop checking
)
func parseLicensesFromCopyright(reader io.Reader) []string {
findings := strset.New()
scanner := bufio.NewScanner(reader)
// State machine replacing licenseFirstSentenceAfterHeadingPattern.
// That regex only matched at the start of the file: a non-empty heading,
// a line of dashes, blank lines, then text up to the first period.
const (
expectHeading = iota
expectDashes
skipBlanks
captureLicense
headingDone // matched or impossible — stop checking
)
// per the DEP-5 spec, machine-readable copyright files MUST have a
// Format field whose value is a URI for the specification. Only files
// with this header should be parsed as machine-readable.
// See: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
formatVerified := false
headingState := expectHeading
var licenseText strings.Builder
for scanner.Scan() {
line := scanner.Text()
// per-line regex checks (applied to every line)
if value := findLicenseClause(licensePattern, line); value != "" {
findings.Add(value)
if !formatVerified {
if strings.TrimSpace(line) == "" {
continue
}
if value := findLicenseClause(commonLicensePathPattern, line); value != "" {
findings.Add(value)
if !formatHeaderPattern.MatchString(line) {
return nil
}
if value := findLicenseClause(licenseAgreementHeadingPattern, line); value != "" {
findings.Add(value)
formatVerified = true
}
// multi-line heading detection (only at start of file)
switch headingState {
case expectHeading:
if strings.TrimSpace(line) != "" {
headingState = expectDashes
} else {
headingState = headingDone
}
case expectDashes:
trimmed := strings.TrimSpace(line)
if len(trimmed) > 0 && strings.Trim(trimmed, "-") == "" {
headingState = skipBlanks
} else {
headingState = headingDone
}
case skipBlanks:
if strings.TrimSpace(line) != "" {
headingState = captureLicense
licenseText.WriteString(line)
if value := extractUpToFirstPeriod(licenseText.String()); value != "" {
for _, p := range []*regexp.Regexp{licensePattern, commonLicensePathPattern, licenseAgreementHeadingPattern} {
if value := findLicenseClause(p, line); value != "" {
findings.Add(value)
headingState = headingDone
}
}
case captureLicense:
licenseText.WriteString(" ")
licenseText.WriteString(line)
if value := extractUpToFirstPeriod(licenseText.String()); value != "" {
findings.Add(value)
headingState = headingDone
var found string
headingState, found = advanceHeadingState(headingState, line, &licenseText)
if found != "" {
findings.Add(found)
}
}
if !formatVerified {
return nil
}
results := findings.List()
@ -91,6 +80,40 @@ func parseLicensesFromCopyright(reader io.Reader) []string {
return results
}
// advanceHeadingState feeds a single line into the heading-detection state
// machine and reports where the machine ends up.
//
// Parameters:
//   - state: the current heading-detection state.
//   - line: the raw line just read from the copyright file.
//   - licenseText: accumulator for the candidate license text; it is appended
//     to while in the skipBlanks and captureLicense states.
//
// It returns the next state together with the extracted license value. The
// value is non-empty only on the call where extractUpToFirstPeriod yields a
// result for the accumulated text, at which point the state becomes
// headingDone. States that are not handled (including headingDone) are
// returned unchanged with an empty value.
func advanceHeadingState(state int, line string, licenseText *strings.Builder) (int, string) {
	content := strings.TrimSpace(line)
	isBlank := content == ""

	switch state {
	case expectHeading:
		// any non-blank first line counts as the heading
		if isBlank {
			return headingDone, ""
		}
		return expectDashes, ""

	case expectDashes:
		// the underline must be non-empty and made up of dashes only
		if isBlank || strings.Trim(content, "-") != "" {
			return headingDone, ""
		}
		return skipBlanks, ""

	case skipBlanks, captureLicense:
		if state == skipBlanks {
			if isBlank {
				// still between the underline and the license text
				return skipBlanks, ""
			}
		} else {
			// continuation lines are joined to the previous text with a space
			licenseText.WriteString(" ")
		}
		licenseText.WriteString(line)

		if license := extractUpToFirstPeriod(licenseText.String()); license != "" {
			return headingDone, license
		}
		return captureLicense, ""
	}

	return state, ""
}
// extractUpToFirstPeriod returns the license text up to the first period,
// processed through ensureIsSingleLicense, or "" if no period found yet.
func extractUpToFirstPeriod(s string) string {

View File

@ -2,6 +2,7 @@ package debian
import (
"os"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
@ -14,13 +15,14 @@ func TestParseLicensesFromCopyright(t *testing.T) {
expected []string
}{
{
// no Format header; not machine-readable, returns nil
fixture: "testdata/copyright/libc6",
// note: there are other licenses in this file that are not matched --we don't do full text license identification yet
expected: []string{"GPL-2", "LGPL-2.1"},
expected: nil,
},
{
// no Format header; not machine-readable, returns nil
fixture: "testdata/copyright/trilicense",
expected: []string{"GPL-2", "LGPL-2.1", "MPL-1.1"},
expected: nil,
},
{
fixture: "testdata/copyright/liblzma5",
@ -31,21 +33,25 @@ func TestParseLicensesFromCopyright(t *testing.T) {
expected: []string{"GPL-1", "GPL-2", "LGPL-2.1"},
},
{
// no Format header; not machine-readable, returns nil
// previously this captured nonsensical values like "#", "Permission", "This", "see"
fixture: "testdata/copyright/python",
// note: this should not capture #, Permission, This, see ... however it's not clear how to fix this (this is probably good enough)
expected: []string{"#", "Apache", "Apache-2", "Apache-2.0", "Expat", "GPL-2", "ISC", "LGPL-2.1+", "PSF-2", "Permission", "Python", "This", "see"},
expected: nil,
},
{
// no Format header; not machine-readable, returns nil
fixture: "testdata/copyright/cuda",
expected: []string{"NVIDIA Software License Agreement and CUDA Supplement to Software License Agreement"},
expected: nil,
},
{
// no Format header; not machine-readable, returns nil
fixture: "testdata/copyright/dev-kit",
expected: []string{"LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS"},
expected: nil,
},
{
// no Format header; not machine-readable, returns nil
fixture: "testdata/copyright/microsoft",
expected: []string{"LICENSE AGREEMENT FOR MICROSOFT PRODUCTS"},
expected: nil,
},
}
@ -63,3 +69,71 @@ func TestParseLicensesFromCopyright(t *testing.T) {
})
}
}
// TestParseLicensesFromCopyright_FormatHeader exercises the Format header
// gate: only content whose first non-blank line is a Format field pointing at
// the copyright-format specification is treated as machine-readable.
func TestParseLicensesFromCopyright_FormatHeader(t *testing.T) {
	type formatCase struct {
		name       string
		input      string
		wantParsed bool
	}

	cases := []formatCase{
		{
			name:       "valid http Format header",
			input:      "Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
			wantParsed: true,
		},
		{
			name:       "valid https Format header",
			input:      "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
			wantParsed: true,
		},
		{
			name:       "blank lines before Format header",
			input:      "\n\nFormat: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
			wantParsed: true,
		},
		{
			name:       "no Format header",
			input:      "This is the Debian prepackaged version of foo.\n",
			wantParsed: false,
		},
		{
			name:       "Format header is not first non-blank line",
			input:      "Some-Field: value\nFormat: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/\n",
			wantParsed: false,
		},
		{
			name:       "empty content",
			input:      "",
			wantParsed: false,
		},
		{
			name:       "only blank lines",
			input:      "\n\n\n",
			wantParsed: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := parseLicensesFromCopyright(strings.NewReader(tc.input))

			// nil signals "not machine-readable"; a machine-readable file
			// yields a non-nil (possibly empty) slice.
			if !tc.wantParsed {
				require.Nil(t, got)
				return
			}
			require.NotNil(t, got)
		})
	}
}
// TestParseLicensesFromCopyrightInline confirms that License: fields on their
// own are not sufficient: without a Format header the parser returns nil.
func TestParseLicensesFromCopyrightInline(t *testing.T) {
	const input = "License: GPL-2\nLicense: LGPL-2.1\n"

	if got := parseLicensesFromCopyright(strings.NewReader(input)); got != nil {
		t.Errorf("expected nil for non-machine-readable file, got %v", got)
	}
}