fix: license frontmatter

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
This commit is contained in:
Christopher Phillips 2026-05-29 03:46:33 -04:00
parent 15bd509e15
commit 9644340981
No known key found for this signature in database
2 changed files with 188 additions and 10 deletions

View File

@ -306,6 +306,39 @@ func TestSafeTensorsMergeProcessor(t *testing.T) {
assert.Equal(t, md1.MetadataHash, md2.MetadataHash, "rollup hash must not depend on input order")
})
t.Run("OCI: license layer SPDX comes from choosealicense frontmatter", func(t *testing.T) {
// The license layer's content carries a choosealicense.com-style YAML
// frontmatter block. The processor should prefer the cheap spdx-id read
// over invoking the full license scanner.
dir := t.TempDir()
licensePath := filepath.Join(dir, "LICENSE")
require.NoError(t, os.WriteFile(licensePath, []byte(`---
title: Apache License 2.0
spdx-id: Apache-2.0
---
Apache License
Version 2.0, January 2004
`), 0o644))
hfConfigPath := filepath.Join(dir, "config.json")
require.NoError(t, os.WriteFile(hfConfigPath,
[]byte(`{"_name_or_path":"org/with-license-fm"}`), 0o644))
resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{
dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)},
dockerAILicenseMediaType: {file.NewLocation(licensePath)},
})
configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1}
out, _, err := safeTensorsMergeProcessor(
context.Background(), resolver,
[]pkg.Package{ociPkg(configMd)}, nil, nil,
)
require.NoError(t, err)
require.Len(t, out, 1)
assert.Equal(t, "with-license-fm", out[0].Name)
assertHasLicense(t, out[0], "Apache-2.0")
})
t.Run("passes through upstream error", func(t *testing.T) {
sentinel := assert.AnError
p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"})
@ -731,6 +764,75 @@ func TestParseFrontmatter(t *testing.T) {
})
}
// TestParseLicenseFrontmatter covers the choosealicense.com-style YAML
// frontmatter Docker Model Runner uses for its license layers. Only spdx-id
// is consumed; everything else in the block is ignored.
//
// Each subtest feeds a raw byte buffer to parseLicenseFrontmatter and checks
// the returned SPDX identifier; an empty result means "no usable frontmatter".
func TestParseLicenseFrontmatter(t *testing.T) {
t.Run("Apache-2.0 (the canonical choosealicense.com shape)", func(t *testing.T) {
// This is the exact frontmatter shape from
// https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt
// Docker AI license layers ship a near-identical block.
buf := []byte(`---
title: Apache License 2.0
spdx-id: Apache-2.0
redirect_from: /licenses/apache/
featured: true
hidden: false
description: A permissive license whose main conditions require preservation of copyright and license notices.
how: Create a text file (typically named LICENSE or LICENSE.txt) in the root of your source code and copy the text of the license into the file.
using:
Kubernetes: https://github.com/kubernetes/kubernetes/blob/master/LICENSE
PDF.js: https://github.com/mozilla/pdf.js/blob/master/LICENSE
Swift: https://github.com/apple/swift/blob/main/LICENSE.txt
permissions:
- commercial-use
- modifications
- distribution
- patent-use
- private-use
conditions:
- include-copyright
- document-changes
limitations:
- trademark-use
- liability
- warranty
---
Apache License
Version 2.0, January 2004
`)
assert.Equal(t, "Apache-2.0", parseLicenseFrontmatter(buf))
})
t.Run("MIT with BOM prefix", func(t *testing.T) {
// "\xef\xbb\xbf" is the UTF-8 byte order mark; it must be skipped before
// the opening "---" delimiter is looked for.
buf := []byte("\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n")
assert.Equal(t, "MIT", parseLicenseFrontmatter(buf))
})
t.Run("frontmatter without spdx-id falls through (returns empty)", func(t *testing.T) {
// A well-formed, closed block that simply lacks the spdx-id key.
buf := []byte("---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n")
assert.Empty(t, parseLicenseFrontmatter(buf))
})
t.Run("plain license text without any frontmatter", func(t *testing.T) {
// The buffer does not begin with "---" once leading whitespace is dropped.
buf := []byte(" Apache License\n Version 2.0, January 2004\n")
assert.Empty(t, parseLicenseFrontmatter(buf))
})
t.Run("unterminated frontmatter block", func(t *testing.T) {
// An opening delimiter with no closing one is not treated as frontmatter,
// even though an spdx-id line is present.
buf := []byte("---\nspdx-id: MIT\n(never closes)\n")
assert.Empty(t, parseLicenseFrontmatter(buf))
})
}
func TestDockerAIModelConfigMediaTypes(t *testing.T) {
// supported mirrors how the resolver matches: filepath.Match each registered
// media type against a layer's media type.

View File

@ -411,9 +411,9 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
nameOrPath = readmeName
}
// README license takes precedence; fall back to the license layer via the
// shared scanner (which understands SPDX text far better than a hand-rolled
// substring match).
// README license takes precedence; fall back to the license layer. For each
// license layer we first try a cheap YAML-frontmatter spdx-id read; layers
// without frontmatter fall through to the shared license scanner.
switch {
case readmeLicense != "":
lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
@ -423,13 +423,54 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S
log.Debugf("failed to list docker AI license layers: %v", lErr)
}
if len(licLocs) > 0 {
lics = licenses.FindAtLocations(ctx, resolver, licLocs...)
lics = identifyLicenseLayers(ctx, resolver, licLocs)
supporting = append(supporting, licLocs...)
}
}
return nameOrPath, lics, supporting
}
// identifyLicenseLayers resolves the licenses carried by the given Docker AI
// license-layer locations.
//
// Every layer is first probed for a choosealicense.com-style YAML frontmatter
// block (the shape Docker Model Runner publishes for its AI artifacts). When
// that block declares an spdx-id, the license is built directly from it and
// created with a pointer to the layer location it was read from. Layers that
// yield no spdx-id are collected and handed to the shared license scanner in a
// single call; the scanner's results are appended after the frontmatter-derived
// ones.
func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License {
	var (
		found     []pkg.License
		needsScan []file.Location
	)

	for _, l := range locs {
		// Take a per-iteration copy so the pointer stored in each license
		// refers to its own location value rather than a shared variable.
		layer := l

		id := readLicenseSPDXIDFromFrontmatter(resolver, layer)
		if id == "" {
			needsScan = append(needsScan, layer)
			continue
		}
		found = append(found, pkg.NewLicenseFromFieldsWithContext(ctx, id, "", &layer))
	}

	if len(needsScan) == 0 {
		return found
	}
	return append(found, licenses.FindAtLocations(ctx, resolver, needsScan...)...)
}
// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer
// blob and returns the spdx-id declared in its YAML frontmatter, if any.
//
// It returns "" when the layer cannot be opened or read, or when the prefix
// holds no closed frontmatter block with an spdx-id; callers treat "" as
// "fall back to the full license scanner". Open and read failures are logged
// at debug level so a skipped layer is traceable rather than silently ignored.
func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string {
	// Well above any real choosealicense.com frontmatter block while still
	// bounding memory if the layer turns out to be huge.
	const maxFrontmatterBytes = 64 * 1024

	rc, err := resolver.FileContentsByLocation(loc)
	if err != nil {
		log.Debugf("failed to open docker AI license layer %q: %v", loc.RealPath, err)
		return ""
	}
	defer internal.CloseAndLogError(rc, loc.RealPath)

	buf, err := io.ReadAll(io.LimitReader(rc, maxFrontmatterBytes))
	if err != nil {
		log.Debugf("failed to read docker AI license layer %q: %v", loc.RealPath, err)
		return ""
	}
	return parseLicenseFrontmatter(buf)
}
// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
// classifies it as README frontmatter or HF config.json based on its leading
// bytes. Side-effects: applies HF config fields onto md, accumulates name and
@ -581,11 +622,11 @@ func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location,
return &locations[0], fm
}
// parseFrontmatter pulls the YAML block between the first and second "---"
// lines of a file (if present) and decodes the fields we care about. base_model
// is decoded via yaml.Node so a scalar value ("org/model") doesn't fail the
// whole block.
func parseFrontmatter(buf []byte) *readmeFrontmatter {
// extractFrontmatterBlock returns the YAML bytes between the first and second
// "---" delimiters of a file (stripping a leading BOM and any leading
// whitespace), or nil when no closed frontmatter block exists. Shared by every
// YAML-frontmatter parser the cataloger needs.
func extractFrontmatterBlock(buf []byte) []byte {
trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
if !bytes.HasPrefix(trimmed, []byte("---")) {
return nil
@ -598,12 +639,23 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter {
if end < 0 {
return nil
}
return rest[:end]
}
// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block
// and returns the license and base_model fields. base_model is decoded via
// yaml.Node so a scalar value ("org/model") doesn't fail the whole block.
func parseFrontmatter(buf []byte) *readmeFrontmatter {
block := extractFrontmatterBlock(buf)
if block == nil {
return nil
}
var raw struct {
License string `yaml:"license"`
BaseModel yaml.Node `yaml:"base_model"`
}
if err := yaml.Unmarshal(rest[:end], &raw); err != nil {
if err := yaml.Unmarshal(block, &raw); err != nil {
log.Debugf("failed to parse README frontmatter: %v", err)
return nil
}
@ -620,6 +672,30 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter {
return &fm
}
// licenseFrontmatter holds the fields we lift from a choosealicense.com-style
// YAML frontmatter block at the top of a license file (the LICENSE blobs Docker
// Model Runner publishes for AI artifacts use this shape).
type licenseFrontmatter struct {
// SPDXID is decoded from the block's "spdx-id" key; it stays empty when the
// key is absent.
SPDXID string `yaml:"spdx-id"`
}
// parseLicenseFrontmatter extracts the producer-declared SPDX identifier from
// the choosealicense.com-style YAML frontmatter at the start of buf.
//
// The result is "" when buf has no closed frontmatter block, when the block is
// not valid YAML for licenseFrontmatter (logged at debug level), or when it
// has no spdx-id field. An empty result is the caller's cue to fall back to a
// full license scan.
func parseLicenseFrontmatter(buf []byte) string {
	yamlBytes := extractFrontmatterBlock(buf)
	if yamlBytes == nil {
		return ""
	}

	decoded := licenseFrontmatter{}
	err := yaml.Unmarshal(yamlBytes, &decoded)
	if err != nil {
		log.Debugf("failed to parse license frontmatter: %v", err)
		return ""
	}

	return decoded.SPDXID
}
// hasPrefix reports whether the byte slice b begins with the string s.
// An empty s is a prefix of every slice, including a nil one.
func hasPrefix(b []byte, s string) bool {
	n := len(s)
	if len(b) < n {
		// Too short to contain s; also guards the slice expression below.
		return false
	}
	return string(b[:n]) == s
}