mirror of
https://github.com/anchore/syft.git
synced 2026-07-05 02:28:25 +02:00
fix: license frontmatter
Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
parent
15bd509e15
commit
9644340981
@ -306,6 +306,39 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
|
|||||||
assert.Equal(t, md1.MetadataHash, md2.MetadataHash, "rollup hash must not depend on input order")
|
assert.Equal(t, md1.MetadataHash, md2.MetadataHash, "rollup hash must not depend on input order")
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("OCI: license layer SPDX comes from choosealicense frontmatter", func(t *testing.T) {
|
||||||
|
// The license layer's content carries a choosealicense.com-style YAML
|
||||||
|
// frontmatter block. The processor should prefer the cheap spdx-id read
|
||||||
|
// over invoking the full license scanner.
|
||||||
|
dir := t.TempDir()
|
||||||
|
licensePath := filepath.Join(dir, "LICENSE")
|
||||||
|
require.NoError(t, os.WriteFile(licensePath, []byte(`---
|
||||||
|
title: Apache License 2.0
|
||||||
|
spdx-id: Apache-2.0
|
||||||
|
---
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
`), 0o644))
|
||||||
|
hfConfigPath := filepath.Join(dir, "config.json")
|
||||||
|
require.NoError(t, os.WriteFile(hfConfigPath,
|
||||||
|
[]byte(`{"_name_or_path":"org/with-license-fm"}`), 0o644))
|
||||||
|
resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{
|
||||||
|
dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)},
|
||||||
|
dockerAILicenseMediaType: {file.NewLocation(licensePath)},
|
||||||
|
})
|
||||||
|
|
||||||
|
configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1}
|
||||||
|
out, _, err := safeTensorsMergeProcessor(
|
||||||
|
context.Background(), resolver,
|
||||||
|
[]pkg.Package{ociPkg(configMd)}, nil, nil,
|
||||||
|
)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, out, 1)
|
||||||
|
assert.Equal(t, "with-license-fm", out[0].Name)
|
||||||
|
assertHasLicense(t, out[0], "Apache-2.0")
|
||||||
|
})
|
||||||
|
|
||||||
t.Run("passes through upstream error", func(t *testing.T) {
|
t.Run("passes through upstream error", func(t *testing.T) {
|
||||||
sentinel := assert.AnError
|
sentinel := assert.AnError
|
||||||
p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"})
|
p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"})
|
||||||
@ -731,6 +764,75 @@ func TestParseFrontmatter(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestParseLicenseFrontmatter exercises parseLicenseFrontmatter with the
|
||||||
|
// choosealicense.com-style YAML frontmatter Docker Model Runner uses for its
|
||||||
|
// license layers. Only spdx-id is consumed; everything else in the block is
// ignored. The subtests check that a declared spdx-id is returned (including
// behind a UTF-8 BOM) and that an empty string comes back when the field is
// missing, when there is no frontmatter at all, or when the block never closes.
|
||||||
|
func TestParseLicenseFrontmatter(t *testing.T) {
|
||||||
|
t.Run("Apache-2.0 (the canonical choosealicense.com shape)", func(t *testing.T) {
|
||||||
|
// This is the exact frontmatter shape from
|
||||||
|
// https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt
|
||||||
|
// Docker AI license layers ship a near-identical block.
|
||||||
|
buf := []byte(`---
|
||||||
|
title: Apache License 2.0
|
||||||
|
spdx-id: Apache-2.0
|
||||||
|
redirect_from: /licenses/apache/
|
||||||
|
featured: true
|
||||||
|
hidden: false
|
||||||
|
|
||||||
|
description: A permissive license whose main conditions require preservation of copyright and license notices.
|
||||||
|
|
||||||
|
how: Create a text file (typically named LICENSE or LICENSE.txt) in the root of your source code and copy the text of the license into the file.
|
||||||
|
|
||||||
|
using:
|
||||||
|
Kubernetes: https://github.com/kubernetes/kubernetes/blob/master/LICENSE
|
||||||
|
PDF.js: https://github.com/mozilla/pdf.js/blob/master/LICENSE
|
||||||
|
Swift: https://github.com/apple/swift/blob/main/LICENSE.txt
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
- commercial-use
|
||||||
|
- modifications
|
||||||
|
- distribution
|
||||||
|
- patent-use
|
||||||
|
- private-use
|
||||||
|
|
||||||
|
conditions:
|
||||||
|
- include-copyright
|
||||||
|
- document-changes
|
||||||
|
|
||||||
|
limitations:
|
||||||
|
- trademark-use
|
||||||
|
- liability
|
||||||
|
- warranty
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
`)
|
||||||
|
assert.Equal(t, "Apache-2.0", parseLicenseFrontmatter(buf))
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("MIT with BOM prefix", func(t *testing.T) {
|
||||||
|
// The leading "\xef\xbb\xbf" bytes are a UTF-8 byte order mark placed
// directly in front of the opening "---" delimiter.
buf := []byte("\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n")
|
||||||
|
assert.Equal(t, "MIT", parseLicenseFrontmatter(buf))
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("frontmatter without spdx-id falls through (returns empty)", func(t *testing.T) {
|
||||||
|
buf := []byte("---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n")
|
||||||
|
assert.Empty(t, parseLicenseFrontmatter(buf))
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("plain license text without any frontmatter", func(t *testing.T) {
|
||||||
|
buf := []byte(" Apache License\n Version 2.0, January 2004\n")
|
||||||
|
assert.Empty(t, parseLicenseFrontmatter(buf))
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("unterminated frontmatter block", func(t *testing.T) {
|
||||||
|
// An opening "---" is present but there is no closing delimiter, so the
// spdx-id line must not be picked up.
buf := []byte("---\nspdx-id: MIT\n(never closes)\n")
|
||||||
|
assert.Empty(t, parseLicenseFrontmatter(buf))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func TestDockerAIModelConfigMediaTypes(t *testing.T) {
|
func TestDockerAIModelConfigMediaTypes(t *testing.T) {
|
||||||
// supported mirrors how the resolver matches: filepath.Match each registered
|
// supported mirrors how the resolver matches: filepath.Match each registered
|
||||||
// media type against a layer's media type.
|
// media type against a layer's media type.
|
||||||
|
|||||||
@ -411,9 +411,9 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
|
|||||||
nameOrPath = readmeName
|
nameOrPath = readmeName
|
||||||
}
|
}
|
||||||
|
|
||||||
// README license takes precedence; fall back to the license layer via the
|
// README license takes precedence; fall back to the license layer. For each
|
||||||
// shared scanner (which understands SPDX text far better than a hand-rolled
|
// license layer we first try a cheap YAML-frontmatter spdx-id read; layers
|
||||||
// substring match).
|
// without frontmatter fall through to the shared license scanner.
|
||||||
switch {
|
switch {
|
||||||
case readmeLicense != "":
|
case readmeLicense != "":
|
||||||
lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
|
lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
|
||||||
@ -423,13 +423,54 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
|
|||||||
log.Debugf("failed to list docker AI license layers: %v", lErr)
|
log.Debugf("failed to list docker AI license layers: %v", lErr)
|
||||||
}
|
}
|
||||||
if len(licLocs) > 0 {
|
if len(licLocs) > 0 {
|
||||||
lics = licenses.FindAtLocations(ctx, resolver, licLocs...)
|
lics = identifyLicenseLayers(ctx, resolver, licLocs)
|
||||||
supporting = append(supporting, licLocs...)
|
supporting = append(supporting, licLocs...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nameOrPath, lics, supporting
|
return nameOrPath, lics, supporting
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// identifyLicenseLayers turns Docker AI license-layer locations into
|
||||||
|
// pkg.License values. It first attempts a cheap, exact SPDX-id read from the
|
||||||
|
// layer's YAML frontmatter (the choosealicense.com shape Docker Model Runner
|
||||||
|
// publishes for its AI artifacts); layers without frontmatter fall through to
|
||||||
|
// the shared license scanner. Each returned license is tagged with the layer
|
||||||
|
// location it came from so the SBOM cites its source.
|
||||||
|
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
|
||||||
|
var out []pkg.License
|
||||||
|
var scanFallback []file.Location
|
||||||
|
for i := range locs {
|
||||||
|
loc := locs[i]
|
||||||
|
if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" {
|
||||||
|
out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
scanFallback = append(scanFallback, loc)
|
||||||
|
}
|
||||||
|
if len(scanFallback) > 0 {
|
||||||
|
out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
|
||||||
|
// blob and returns the spdx-id declared in its YAML frontmatter, if any. The
|
||||||
|
// 64 KiB cap is well above any real choosealicense.com frontmatter block while
|
||||||
|
// still bounding memory if the layer turns out to be huge.
|
||||||
|
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
|
||||||
|
rc, err := resolver.FileContentsByLocation(loc)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer internal.CloseAndLogError(rc, loc.RealPath)
|
||||||
|
|
||||||
|
buf, err := io.ReadAll(io.LimitReader(rc, 64*1024))
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return parseLicenseFrontmatter(buf)
|
||||||
|
}
|
||||||
|
|
||||||
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
|
||||||
// classifies it as README frontmatter or HF config.json based on its leading
|
// classifies it as README frontmatter or HF config.json based on its leading
|
||||||
// bytes. Side-effects: applies HF config fields onto md, accumulates name and
|
// bytes. Side-effects: applies HF config fields onto md, accumulates name and
|
||||||
@ -581,11 +622,11 @@ func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location,
|
|||||||
return &locations[0], fm
|
return &locations[0], fm
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseFrontmatter pulls the YAML block between the first and second "---"
|
// extractFrontmatterBlock returns the YAML bytes between the first and second
|
||||||
// lines of a file (if present) and decodes the fields we care about. base_model
|
// "---" delimiters of a file (stripping a leading BOM and any leading
|
||||||
// is decoded via yaml.Node so a scalar value ("org/model") doesn't fail the
|
// whitespace), or nil when no closed frontmatter block exists. Shared by every
|
||||||
// whole block.
|
// YAML-frontmatter parser the cataloger needs.
|
||||||
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
func extractFrontmatterBlock(buf []byte) []byte {
|
||||||
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
|
||||||
if !bytes.HasPrefix(trimmed, []byte("---")) {
|
if !bytes.HasPrefix(trimmed, []byte("---")) {
|
||||||
return nil
|
return nil
|
||||||
@ -598,12 +639,23 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
|||||||
if end < 0 {
|
if end < 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
return rest[:end]
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
|
||||||
|
// and returns the license and base_model fields. base_model is decoded via
|
||||||
|
// yaml.Node so a scalar value ("org/model") doesn't fail the whole block.
|
||||||
|
func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
||||||
|
block := extractFrontmatterBlock(buf)
|
||||||
|
if block == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
var raw struct {
|
var raw struct {
|
||||||
License string `yaml:"license"`
|
License string `yaml:"license"`
|
||||||
BaseModel yaml.Node `yaml:"base_model"`
|
BaseModel yaml.Node `yaml:"base_model"`
|
||||||
}
|
}
|
||||||
if err := yaml.Unmarshal(rest[:end], &raw); err != nil {
|
if err := yaml.Unmarshal(block, &raw); err != nil {
|
||||||
log.Debugf("failed to parse README frontmatter: %v", err)
|
log.Debugf("failed to parse README frontmatter: %v", err)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -620,6 +672,30 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter {
|
|||||||
return &fm
|
return &fm
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// licenseFrontmatter is the decode target for the YAML frontmatter block found
// at the top of a choosealicense.com-style license file (the shape the LICENSE
// blobs Docker Model Runner publishes for AI artifacts use). Keys in the block
// that have no matching field here are not captured.
type licenseFrontmatter struct {
	// SPDXID is the value of the block's "spdx-id" key.
	SPDXID string `yaml:"spdx-id"`
}
|
||||||
|
|
||||||
|
// parseLicenseFrontmatter returns the producer-declared SPDX identifier from a
|
||||||
|
// choosealicense.com-style YAML frontmatter block, or "" if the buffer has no
|
||||||
|
// frontmatter or no spdx-id field — caller should fall back to a full license
|
||||||
|
// scan in that case.
|
||||||
|
func parseLicenseFrontmatter(buf []byte) string {
|
||||||
|
block := extractFrontmatterBlock(buf)
|
||||||
|
if block == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var fm licenseFrontmatter
|
||||||
|
if err := yaml.Unmarshal(block, &fm); err != nil {
|
||||||
|
log.Debugf("failed to parse license frontmatter: %v", err)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return fm.SPDXID
|
||||||
|
}
|
||||||
|
|
||||||
// hasPrefix reports whether the byte slice b begins with the string s. An
// empty s is a prefix of every b, including a nil one.
func hasPrefix(b []byte, s string) bool {
	n := len(s)
	if len(b) < n {
		return false
	}
	return string(b[:n]) == s
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user