diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go index 7efe86d38..f58c4cf61 100644 --- a/syft/pkg/cataloger/ai/parse_safetensors_test.go +++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go @@ -306,6 +306,39 @@ func TestSafeTensorsMergeProcessor(t *testing.T) { assert.Equal(t, md1.MetadataHash, md2.MetadataHash, "rollup hash must not depend on input order") }) + t.Run("OCI: license layer SPDX comes from choosealicense frontmatter", func(t *testing.T) { + // The license layer's content carries a choosealicense.com-style YAML + // frontmatter block. The processor should prefer the cheap spdx-id read + // over invoking the full license scanner. + dir := t.TempDir() + licensePath := filepath.Join(dir, "LICENSE") + require.NoError(t, os.WriteFile(licensePath, []byte(`--- +title: Apache License 2.0 +spdx-id: Apache-2.0 +--- + + Apache License + Version 2.0, January 2004 +`), 0o644)) + hfConfigPath := filepath.Join(dir, "config.json") + require.NoError(t, os.WriteFile(hfConfigPath, + []byte(`{"_name_or_path":"org/with-license-fm"}`), 0o644)) + resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{ + dockerAIModelFileMediaType: {file.NewLocation(hfConfigPath)}, + dockerAILicenseMediaType: {file.NewLocation(licensePath)}, + }) + + configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1} + out, _, err := safeTensorsMergeProcessor( + context.Background(), resolver, + []pkg.Package{ociPkg(configMd)}, nil, nil, + ) + require.NoError(t, err) + require.Len(t, out, 1) + assert.Equal(t, "with-license-fm", out[0].Name) + assertHasLicense(t, out[0], "Apache-2.0") + }) + t.Run("passes through upstream error", func(t *testing.T) { sentinel := assert.AnError p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"}) @@ -731,6 +764,75 @@ func TestParseFrontmatter(t *testing.T) { }) } +// TestParseLicenseFrontmatter covers the choosealicense.com-style YAML +// frontmatter Docker Model Runner uses for its license layers. Only spdx-id +// is consumed; everything else in the block is ignored. +func TestParseLicenseFrontmatter(t *testing.T) { + t.Run("Apache-2.0 (the canonical choosealicense.com shape)", func(t *testing.T) { + // This is the exact frontmatter shape from + // https://github.com/github/choosealicense.com/blob/gh-pages/_licenses/apache-2.0.txt + // Docker AI license layers ship a near-identical block. + buf := []byte(`--- +title: Apache License 2.0 +spdx-id: Apache-2.0 +redirect_from: /licenses/apache/ +featured: true +hidden: false + +description: A permissive license whose main conditions require preservation of copyright and license notices. + +how: Create a text file (typically named LICENSE or LICENSE.txt) in the root of your source code and copy the text of the license into the file. + +using: + Kubernetes: https://github.com/kubernetes/kubernetes/blob/master/LICENSE + PDF.js: https://github.com/mozilla/pdf.js/blob/master/LICENSE + Swift: https://github.com/apple/swift/blob/main/LICENSE.txt + +permissions: + - commercial-use + - modifications + - distribution + - patent-use + - private-use + +conditions: + - include-copyright + - document-changes + +limitations: + - trademark-use + - liability + - warranty + +--- + + Apache License + Version 2.0, January 2004 +`) + assert.Equal(t, "Apache-2.0", parseLicenseFrontmatter(buf)) + }) + + t.Run("MIT with BOM prefix", func(t *testing.T) { + buf := []byte("\xef\xbb\xbf---\ntitle: MIT License\nspdx-id: MIT\n---\nThe MIT License...\n") + assert.Equal(t, "MIT", parseLicenseFrontmatter(buf)) + }) + + t.Run("frontmatter without spdx-id falls through (returns empty)", func(t *testing.T) { + buf := []byte("---\ntitle: Something\ndescription: no spdx-id here\n---\nbody\n") + assert.Empty(t, parseLicenseFrontmatter(buf)) + }) + + t.Run("plain license text without any frontmatter", func(t *testing.T) { + buf := []byte(" Apache License\n Version 2.0, January 2004\n") + assert.Empty(t, parseLicenseFrontmatter(buf)) + }) + + t.Run("unterminated frontmatter block", func(t *testing.T) { + buf := []byte("---\nspdx-id: MIT\n(never closes)\n") + assert.Empty(t, parseLicenseFrontmatter(buf)) + }) +} + func TestDockerAIModelConfigMediaTypes(t *testing.T) { // supported mirrors how the resolver matches: filepath.Match each registered // media type against a layer's media type. diff --git a/syft/pkg/cataloger/ai/processor.go b/syft/pkg/cataloger/ai/processor.go index 9ad14f8c0..a6a706456 100644 --- a/syft/pkg/cataloger/ai/processor.go +++ b/syft/pkg/cataloger/ai/processor.go @@ -411,9 +411,9 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S nameOrPath = readmeName } - // README license takes precedence; fall back to the license layer via the - // shared scanner (which understands SPDX text far better than a hand-rolled - // substring match). + // README license takes precedence; fall back to the license layer. For each + // license layer we first try a cheap YAML-frontmatter spdx-id read; layers + // without frontmatter fall through to the shared license scanner. switch { case readmeLicense != "": lics = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense) @@ -423,13 +423,54 @@ func enrichSafeTensorsOCI(ctx context.Context, resolver file.Resolver, md *pkg.S log.Debugf("failed to list docker AI license layers: %v", lErr) } if len(licLocs) > 0 { - lics = licenses.FindAtLocations(ctx, resolver, licLocs...) + lics = identifyLicenseLayers(ctx, resolver, licLocs) supporting = append(supporting, licLocs...) } } return nameOrPath, lics, supporting } +// identifyLicenseLayers turns Docker AI license-layer locations into +// pkg.License values. It first attempts a cheap, exact SPDX-id read from the +// layer's YAML frontmatter (the choosealicense.com shape Docker Model Runner +// publishes for its AI artifacts); layers without frontmatter fall through to +// the shared license scanner. Each returned license is tagged with the layer +// location it came from so the SBOM cites its source. +func identifyLicenseLayers(ctx context.Context, resolver file.Resolver, locs []file.Location) []pkg.License { + var out []pkg.License + var scanFallback []file.Location + for i := range locs { + loc := locs[i] + if spdx := readLicenseSPDXIDFromFrontmatter(resolver, loc); spdx != "" { + out = append(out, pkg.NewLicenseFromFieldsWithContext(ctx, spdx, "", &loc)) + continue + } + scanFallback = append(scanFallback, loc) + } + if len(scanFallback) > 0 { + out = append(out, licenses.FindAtLocations(ctx, resolver, scanFallback...)...) + } + return out +} + +// readLicenseSPDXIDFromFrontmatter reads a bounded prefix of a license-layer +// blob and returns the spdx-id declared in its YAML frontmatter, if any. The +// 64 KiB cap is well above any real choosealicense.com frontmatter block while +// still bounding memory if the layer turns out to be huge. +func readLicenseSPDXIDFromFrontmatter(resolver file.Resolver, loc file.Location) string { + rc, err := resolver.FileContentsByLocation(loc) + if err != nil { + return "" + } + defer internal.CloseAndLogError(rc, loc.RealPath) + + buf, err := io.ReadAll(io.LimitReader(rc, 64*1024)) + if err != nil { + return "" + } + return parseLicenseFrontmatter(buf) +} + // classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and // classifies it as README frontmatter or HF config.json based on its leading // bytes. Side-effects: applies HF config fields onto md, accumulates name and @@ -581,11 +622,11 @@ func readDirReadmeFrontmatter(resolver file.Resolver, p string) (*file.Location, return &locations[0], fm } -// parseFrontmatter pulls the YAML block between the first and second "---" -// lines of a file (if present) and decodes the fields we care about. base_model -// is decoded via yaml.Node so a scalar value ("org/model") doesn't fail the -// whole block. -func parseFrontmatter(buf []byte) *readmeFrontmatter { +// extractFrontmatterBlock returns the YAML bytes between the first and second +// "---" delimiters of a file (stripping a leading BOM and any leading +// whitespace), or nil when no closed frontmatter block exists. Shared by every +// YAML-frontmatter parser the cataloger needs. +func extractFrontmatterBlock(buf []byte) []byte { trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n") if !bytes.HasPrefix(trimmed, []byte("---")) { return nil @@ -598,12 +639,23 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter { if end < 0 { return nil } + return rest[:end] +} + +// parseFrontmatter decodes a Hugging Face model card YAML frontmatter block +// and returns the license and base_model fields. base_model is decoded via +// yaml.Node so a scalar value ("org/model") doesn't fail the whole block. +func parseFrontmatter(buf []byte) *readmeFrontmatter { + block := extractFrontmatterBlock(buf) + if block == nil { + return nil + } var raw struct { License string `yaml:"license"` BaseModel yaml.Node `yaml:"base_model"` } - if err := yaml.Unmarshal(rest[:end], &raw); err != nil { + if err := yaml.Unmarshal(block, &raw); err != nil { log.Debugf("failed to parse README frontmatter: %v", err) return nil } @@ -620,6 +672,30 @@ func parseFrontmatter(buf []byte) *readmeFrontmatter { return &fm } +// licenseFrontmatter holds the fields we lift from a choosealicense.com-style +// YAML frontmatter block at the top of a license file (the LICENSE blobs Docker +// Model Runner publishes for AI artifacts use this shape). +type licenseFrontmatter struct { + SPDXID string `yaml:"spdx-id"` +} + +// parseLicenseFrontmatter returns the producer-declared SPDX identifier from a +// choosealicense.com-style YAML frontmatter block, or "" if the buffer has no +// frontmatter or no spdx-id field — caller should fall back to a full license +// scan in that case. +func parseLicenseFrontmatter(buf []byte) string { + block := extractFrontmatterBlock(buf) + if block == nil { + return "" + } + var fm licenseFrontmatter + if err := yaml.Unmarshal(block, &fm); err != nil { + log.Debugf("failed to parse license frontmatter: %v", err) + return "" + } + return fm.SPDXID +} + func hasPrefix(b []byte, s string) bool { return len(b) >= len(s) && string(b[:len(s)]) == s }