From 0dc4abc0e11697f2dc3a3ec43f53c6bf74ad4c54 Mon Sep 17 00:00:00 2001
From: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
Date: Fri, 5 Jun 2026 03:16:02 -0400
Subject: [PATCH] pr: refactor

Signed-off-by: Christopher Phillips <32073428+spiffcs@users.noreply.github.com>
---
 .../fileresolver/container_image_model.go     |  6 +-
 syft/pkg/cataloger/ai/identity_oci.go         | 73 +++++++--------
 syft/pkg/cataloger/ai/merge.go                |  5 +-
 syft/pkg/cataloger/ai/parse_safetensors.go    |  6 +-
 .../cataloger/ai/parse_safetensors_model.go   |  3 +-
 .../cataloger/ai/parse_safetensors_test.go    | 89 +++++++++++++++++++
 syft/pkg/cataloger/ai/processor.go            | 12 ++-
 syft/pkg/safetensors.go                       |  2 +-
 .../source/ocimodelsource/oci_model_source.go |  3 +-
 syft/source/ocimodelsource/registry_client.go |  5 +-
 10 files changed, 150 insertions(+), 54 deletions(-)

diff --git a/syft/internal/fileresolver/container_image_model.go b/syft/internal/fileresolver/container_image_model.go
index cb640a7ad..3cc4a7422 100644
--- a/syft/internal/fileresolver/container_image_model.go
+++ b/syft/internal/fileresolver/container_image_model.go
@@ -20,8 +20,10 @@ type LayerInfo struct {
 }
 
 // ContainerImageModel is a file.Resolver implementation that provides access to
-// GGUF header data fetched from OCI model artifacts via range-GET requests.
-// This does not fetch the entire model from the registry, only a sliver of it.
+// model header and metadata data (GGUF and SafeTensors headers, the model config
+// blob, and companion layers) fetched from OCI model artifacts via range-GET
+// requests. This does not fetch the entire model from the registry, only a
+// sliver of it.
 type ContainerImageModel struct {
 	tempDir    string                   // temp directory containing all layer files
 	layerFiles map[string]LayerInfo     // digest -> layer info (temp path + media type)
diff --git a/syft/pkg/cataloger/ai/identity_oci.go b/syft/pkg/cataloger/ai/identity_oci.go
index 63c43e469..1b168d7e0 100644
--- a/syft/pkg/cataloger/ai/identity_oci.go
+++ b/syft/pkg/cataloger/ai/identity_oci.go
@@ -36,7 +36,21 @@ func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver,
 	var configName, readmeName, readmeLicense string
 	var supporting []file.Location
 	for _, loc := range modelFileLocs {
-		if classifyOCIModelFileLayer(resolver, loc, md, &configName, &readmeName, &readmeLicense) {
+		cfg, fm := classifyOCIModelFileLayer(resolver, loc)
+		switch {
+		case cfg != nil:
+			applyHFConfig(md, cfg)
+			if configName == "" {
+				configName = cfg.NameOrPath
+			}
+			supporting = append(supporting, loc)
+		case fm != nil:
+			if readmeLicense == "" {
+				readmeLicense = fm.License
+			}
+			if readmeName == "" && len(fm.BaseModel) > 0 {
+				readmeName = fm.BaseModel[0]
+			}
 			supporting = append(supporting, loc)
 		}
 	}
@@ -53,21 +67,19 @@ func resolveSafeTensorsOCIIdentity(ctx context.Context, resolver file.Resolver,
 		supporting:   supporting,
 	}
 
-	// License precedence: a README model-card license wins over dedicated
-	// license layers (mirrors the dir-scan path, where README frontmatter is the
-	// license source).
+	// License precedence: a dedicated vnd.docker.ai.license layer is a
+	// producer-curated signal and outranks the free-text license field in a model
+	// card's README frontmatter.
+	licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
+	if err != nil {
+		log.Debugf("failed to list docker AI license layers: %v", err)
+	}
 	switch {
+	case len(licLocs) > 0:
+		id.licenses = identifyLicenseLayers(ctx, resolver, licLocs)
+		id.supporting = append(id.supporting, licLocs...)
 	case readmeLicense != "":
 		id.licenses = pkg.NewLicensesFromValuesWithContext(ctx, readmeLicense)
-	default:
-		licLocs, err := ociResolver.FilesByMediaType(dockerAILicenseMediaType)
-		if err != nil {
-			log.Debugf("failed to list docker AI license layers: %v", err)
-		}
-		if len(licLocs) > 0 {
-			id.licenses = identifyLicenseLayers(ctx, resolver, licLocs)
-			id.supporting = append(id.supporting, licLocs...)
-		}
 	}
 
 	return id
@@ -100,45 +112,34 @@ func ociImageRefBasename(resolver file.Resolver) string {
 	return path.Base(parsed.Context().RepositoryStr())
 }
 
-// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and
-// classifies it as README frontmatter or HF config.json based on its leading bytes.
-func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location, md *pkg.SafeTensorsModelInfo, configName, readmeName, license *string) bool {
+// classifyOCIModelFileLayer reads up to 4 MiB of a model.file layer and decodes
+// it as either an HF config.json or a README model card's YAML frontmatter,
+// based on its leading bytes. It returns whichever it recognized; both are nil
+// when the layer is neither (or fails to decode). The caller owns precedence and
+// metadata enrichment.
+func classifyOCIModelFileLayer(resolver file.Resolver, loc file.Location) (*hfConfig, *readmeFrontmatter) {
 	rc, err := resolver.FileContentsByLocation(loc)
 	if err != nil {
-		return false
+		return nil, nil
 	}
 	defer internal.CloseAndLogError(rc, loc.RealPath)
 
 	buf, err := io.ReadAll(io.LimitReader(rc, 4*1024*1024))
 	if err != nil {
-		return false
+		return nil, nil
 	}
 	trimmed := bytes.TrimLeft(buf, "\xef\xbb\xbf \t\r\n")
 	switch {
 	case bytes.HasPrefix(trimmed, []byte("---")):
-		fm := parseFrontmatter(buf)
-		if fm == nil {
-			return false
-		}
-		if *license == "" {
-			*license = fm.License
-		}
-		if *readmeName == "" && len(fm.BaseModel) > 0 {
-			*readmeName = fm.BaseModel[0]
-		}
-		return true
+		return nil, parseFrontmatter(buf)
 	case bytes.HasPrefix(trimmed, []byte("{")):
 		var cfg hfConfig
 		if err := json.Unmarshal(buf, &cfg); err != nil {
-			return false
+			return nil, nil
 		}
-		applyHFConfig(md, &cfg)
-		if *configName == "" && cfg.NameOrPath != "" {
-			*configName = cfg.NameOrPath
-		}
-		return true
+		return &cfg, nil
 	}
-	return false
+	return nil, nil
 }
 
 // identifyLicenseLayers turns Docker AI license-layer locations into
diff --git a/syft/pkg/cataloger/ai/merge.go b/syft/pkg/cataloger/ai/merge.go
index 8c0b19167..7351c2576 100644
--- a/syft/pkg/cataloger/ai/merge.go
+++ b/syft/pkg/cataloger/ai/merge.go
@@ -59,9 +59,6 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 		if merged.TensorCount == 0 {
 			merged.TensorCount = a.TensorCount
 		}
-		if merged.ShardCount == 0 {
-			merged.ShardCount = a.ShardCount
-		}
 		firstNonEmpty(&merged.Parameters, a.Parameters)
 		firstNonEmpty(&merged.TotalSize, a.TotalSize)
 		firstNonEmpty(&merged.Quantization, a.Quantization)
@@ -71,7 +68,7 @@ func mergeAggregatesInto(merged *pkg.SafeTensorsModelInfo, aggregates []pkg.Safe
 // mergeShardsInto folds the per-shard header metadata into merged, returning
 // the summed shard TensorCount and the list of non-empty per-shard hashes for
 // the rollup. Shards carry only the content-derived fields (Quantization,
-// Parameters, UserMetadata);
+// Parameters, UserMetadata), so those are the only fields folded in here.
 func mergeShardsInto(merged *pkg.SafeTensorsModelInfo, shards []pkg.SafeTensorsModelInfo) (shardTensorTotal uint64, hashes []string) {
 	seenKV := map[string]bool{}
 	for _, s := range shards {
diff --git a/syft/pkg/cataloger/ai/parse_safetensors.go b/syft/pkg/cataloger/ai/parse_safetensors.go
index 52e48a19e..f88e73d40 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors.go
@@ -148,10 +148,8 @@ func (h *safeTensorsHeader) metadataHash() string {
 }
 
 // userMetadataKeyValues converts the safetensors __metadata__ map into a
-// KeyValues slice sorted by key. We do not use the convention of returning a
-// nil slice for an empty input — instead, an empty input maps to an empty
-// (length-0, non-nil) KeyValues — so downstream JSON serialization remains
-// stable: `omitempty` drops the field either way.
+// KeyValues slice sorted by key, so SBOM output is stable across runs. Returns
+// nil for empty input (omitempty then drops the field).
 func userMetadataKeyValues(m map[string]string) pkg.KeyValues {
 	if len(m) == 0 {
 		return nil
diff --git a/syft/pkg/cataloger/ai/parse_safetensors_model.go b/syft/pkg/cataloger/ai/parse_safetensors_model.go
index d7288f1ad..134ef71b1 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors_model.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_model.go
@@ -26,11 +26,12 @@ func parseSafeTensorsFile(_ context.Context, _ file.Resolver, _ *generic.Environ
 		return nil, nil, fmt.Errorf("failed to read safetensors header: %w", err)
 	}
 
+	// ShardCount is intentionally not set here: the merge processor is the single
+	// owner of ShardCount and derives it from the number of shards in the group.
 	md := pkg.SafeTensorsModelInfo{
 		Format:       "safetensors",
 		TensorCount:  uint64(len(header.tensors)),
 		Quantization: normalizeDType(header.dominantDType()),
-		ShardCount:   1,
 		UserMetadata: userMetadataKeyValues(header.metadata),
 		MetadataHash: header.metadataHash(),
 	}
diff --git a/syft/pkg/cataloger/ai/parse_safetensors_test.go b/syft/pkg/cataloger/ai/parse_safetensors_test.go
index 560659d39..dfd586b33 100644
--- a/syft/pkg/cataloger/ai/parse_safetensors_test.go
+++ b/syft/pkg/cataloger/ai/parse_safetensors_test.go
@@ -122,6 +122,27 @@ func TestSafeTensorsCataloger(t *testing.T) {
 				},
 			},
 		},
+		{
+			// rung 1 via README: with no config.json, the README model card's
+			// base_model names the model (path.Base applied), still beating the
+			// directory fallback ("readme-named").
+			name: "README base_model names the model when there is no config.json",
+			setup: func(t *testing.T) string {
+				dir := t.TempDir()
+				modelDir := filepath.Join(dir, "readme-named")
+				model(t, modelDir)
+				writeFile(t, filepath.Join(modelDir, "README.md"),
+					"---\nbase_model:\n  - org/base-model-name\n---\n# Card\n")
+				return dir
+			},
+			expectedPackages: []pkg.Package{
+				{
+					Name:     "base-model-name",
+					Type:     pkg.ModelPkg,
+					Metadata: wantMetadata(""),
+				},
+			},
+		},
 		{
 			// rung 2: no config.json at all, so the model is named after its
 			// immediate parent directory.
@@ -265,6 +286,46 @@ func TestSafeTensorsCataloger(t *testing.T) {
 	}
 }
 
+// TestSafeTensorsCataloger_shardedDirectory covers the primary multi-shard shape:
+// several `model-0000N-of-0000M.safetensors` files in one directory. The
+// cataloger must group the shards into a single package, sum their tensor counts,
+// record the shard count, and roll each shard up into Parts. (The OCI multi-shard
+// path is covered separately in TestSafeTensorsMergeProcessor.)
+func TestSafeTensorsCataloger_shardedDirectory(t *testing.T) {
+	userMeta := map[string]string{"format": "pt"}
+	// Two shards with distinct tensors → distinct per-shard metadata hashes, so
+	// the merge treats them as separate shards (3 tensors total across 2 shards).
+	shardA := map[string]safeTensorsEntry{
+		"layers.0.weight": {DType: "BF16", Shape: []int64{10, 10}, DataOffsets: []int64{0, 200}},
+	}
+	shardB := map[string]safeTensorsEntry{
+		"layers.1.weight": {DType: "BF16", Shape: []int64{10, 10}, DataOffsets: []int64{0, 200}},
+		"layers.2.weight": {DType: "BF16", Shape: []int64{10, 10}, DataOffsets: []int64{200, 400}},
+	}
+
+	dir := t.TempDir()
+	modelDir := filepath.Join(dir, "llama-sharded")
+	require.NoError(t, os.MkdirAll(modelDir, 0o755))
+	require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model-00001-of-00002.safetensors"),
+		buildSafeTensorsFile(t, userMeta, shardA), 0o644))
+	require.NoError(t, os.WriteFile(filepath.Join(modelDir, "model-00002-of-00002.safetensors"),
+		buildSafeTensorsFile(t, userMeta, shardB), 0o644))
+
+	pkgtest.NewCatalogTester().
+		FromDirectory(t, dir).
+		ExpectsAssertion(func(t *testing.T, pkgs []pkg.Package, _ []artifact.Relationship) {
+			require.Len(t, pkgs, 1)
+			got := pkgs[0]
+			assert.Equal(t, "llama-sharded", got.Name, "a sharded model with no config.json is named by its directory")
+			md := got.Metadata.(pkg.SafeTensorsModelInfo)
+			assert.Equal(t, 2, md.ShardCount)
+			assert.Equal(t, uint64(3), md.TensorCount, "tensor counts are summed across shards")
+			assert.Len(t, md.Parts, 2, "each shard is rolled up into Parts")
+			assert.Equal(t, "BF16", md.Quantization)
+		}).
+		TestCataloger(t, NewSafeTensorsCataloger())
+}
+
 // TestParseSafeTensorsOCIConfig covers the parser in isolation: it should emit
 // a nameless package mirroring the config blob's producer-declared fields, and
 // emit nothing for non-safetensors formats so the GGUF cataloger can claim the
@@ -510,6 +571,34 @@ spdx-id: Apache-2.0
 		assertHasLicense(t, out[0], "Apache-2.0")
 	})
 
+	t.Run("OCI: license layer wins over a README model-card license", func(t *testing.T) {
+		// When both a dedicated license layer and a README model-card license are
+		// present, the producer-curated license layer is authoritative. (If the
+		// README won, the resolved license would be MIT and this assertion fails.)
+		dir := t.TempDir()
+		licensePath := filepath.Join(dir, "LICENSE")
+		require.NoError(t, os.WriteFile(licensePath, []byte("---\nspdx-id: Apache-2.0\n---\n"), 0o644))
+		readmePath := filepath.Join(dir, "README.md")
+		require.NoError(t, os.WriteFile(readmePath,
+			[]byte("---\nlicense: MIT\nbase_model:\n  - org/base\n---\n# Card\n"), 0o644))
+		configPath := filepath.Join(dir, "config.json")
+		require.NoError(t, os.WriteFile(configPath, []byte(`{"_name_or_path":"org/precedence-model"}`), 0o644))
+		resolver := file.NewMockResolverForMediaTypes(map[string][]file.Location{
+			dockerAIModelFileMediaType: {file.NewLocation(configPath), file.NewLocation(readmePath)},
+			dockerAILicenseMediaType:   {file.NewLocation(licensePath)},
+		})
+
+		configMd := pkg.SafeTensorsModelInfo{Format: "safetensors", TensorCount: 1}
+		out, _, err := safeTensorsMergeProcessor(
+			context.Background(), resolver,
+			[]pkg.Package{ociPkg(configMd)}, nil, nil,
+		)
+		require.NoError(t, err)
+		require.Len(t, out, 1)
+		assert.Equal(t, "precedence-model", out[0].Name)
+		assertHasLicense(t, out[0], "Apache-2.0")
+	})
+
 	t.Run("passes through upstream error", func(t *testing.T) {
 		sentinel := assert.AnError
 		p := dirPkg("/models/x/y.safetensors", pkg.SafeTensorsModelInfo{Format: "safetensors", MetadataHash: "h"})
diff --git a/syft/pkg/cataloger/ai/processor.go b/syft/pkg/cataloger/ai/processor.go
index 7cc5018e2..c685aeb6e 100644
--- a/syft/pkg/cataloger/ai/processor.go
+++ b/syft/pkg/cataloger/ai/processor.go
@@ -52,9 +52,15 @@ func partitionSafeTensorsPackages(pkgs []pkg.Package) (safeTensors, other []pkg.
 }
 
 // fromOCIArtifact reports whether the packages came from an OCI model artifact.
-// That source presents every layer at the virtual path "/", whereas a filesystem
-// scan always carries a real file path. A single scan is one source, so the
-// first package is representative of the rest.
+// That source (the ContainerImageModel resolver) presents every layer at the
+// virtual path "/", whereas a filesystem scan always carries a real file path. A
+// single scan is one source, so the first package is representative of the rest.
+//
+// This deliberately keys off the path signal rather than type-asserting the
+// resolver to file.OCIMediaTypeResolver: the test harness wraps resolvers in an
+// ObservingResolver that implements that interface unconditionally, so an
+// interface check would misclassify directory scans as OCI. The "/" path is the
+// genuine, testable signal the OCI model source produces.
 func fromOCIArtifact(pkgs []pkg.Package) bool {
 	loc := primaryEvidenceLocation(pkgs[0])
 	return loc != nil && loc.RealPath == "/"
diff --git a/syft/pkg/safetensors.go b/syft/pkg/safetensors.go
index ea752bd91..d0ca8c2dc 100644
--- a/syft/pkg/safetensors.go
+++ b/syft/pkg/safetensors.go
@@ -34,7 +34,7 @@ type SafeTensorsModelInfo struct {
 	TensorCount uint64 `json:"tensorCount,omitempty" cyclonedx:"tensorCount"`
 
 	// TotalSize is the total byte size of tensor data across all shards when known
-	// (from the Docker AI model config "size" field or the sharded index "total_size").
+	// (from the Docker AI model config "size" field).
 	TotalSize string `json:"totalSize,omitempty" cyclonedx:"totalSize"`
 
 	// ShardCount is the number of .safetensors shards for a sharded model (1 for a
diff --git a/syft/source/ocimodelsource/oci_model_source.go b/syft/source/ocimodelsource/oci_model_source.go
index 8fd5e3849..0fc697396 100644
--- a/syft/source/ocimodelsource/oci_model_source.go
+++ b/syft/source/ocimodelsource/oci_model_source.go
@@ -336,7 +336,8 @@ func (s *ociModelSource) Describe() source.Description {
 	}
 }
 
-// FileResolver returns a file resolver for accessing header of GGUF files.
+// FileResolver returns a file resolver for accessing model headers and companion
+// metadata (GGUF/SafeTensors headers, the model config blob, and companion layers).
 func (s *ociModelSource) FileResolver(_ source.Scope) (file.Resolver, error) {
 	return s.resolver, nil
 }
diff --git a/syft/source/ocimodelsource/registry_client.go b/syft/source/ocimodelsource/registry_client.go
index d6c274c42..43adfd5e2 100644
--- a/syft/source/ocimodelsource/registry_client.go
+++ b/syft/source/ocimodelsource/registry_client.go
@@ -286,8 +286,9 @@ func (c *registryClient) fetchBlobRange(ctx context.Context, ref name.Reference,
 	// https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#file-structure
 	data := make([]byte, maxBytes)
 	n, err := io.ReadFull(reader, data)
-	if err != nil && err != io.ErrUnexpectedEOF {
-		// ErrUnexpectedEOF is okay - it means the file is smaller than maxBytes
+	// ErrUnexpectedEOF means the layer is smaller than maxBytes; EOF means it is
+	// empty. Both mean we read everything there was, not a failure.
+	if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) && !errors.Is(err, io.EOF) {
 		return nil, fmt.Errorf("failed to read layer data: %w", err)
 	}