Decompress UPX-packed binaries to extract golang build info (ELF-formatted binaries with the LZMA method only) (#4480)

* decompress upx packed binaries

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* fix linting and remove dead code

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
Alex Goodman 2025-12-22 09:17:38 -05:00 committed by GitHub
parent 7ef4703454
commit 0ea920ba6d
11 changed files with 804 additions and 7 deletions

go.mod
View File

@@ -239,7 +239,7 @@ require (
 	github.com/tidwall/match v1.1.1 // indirect
 	github.com/tidwall/pretty v1.2.1 // indirect
 	github.com/tidwall/sjson v1.2.5 // indirect
-	github.com/ulikunitz/xz v0.5.15 // indirect
+	github.com/ulikunitz/xz v0.5.15
 	github.com/vbatts/tar-split v0.12.2 // indirect
 	github.com/xanzy/ssh-agent v0.3.3 // indirect
 	github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect

View File

@@ -47,6 +47,37 @@ func Test_PackageCataloger_Binary(t *testing.T) {
 				"stdlib @ go1.23.2 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
 			},
 		},
+		{
+			name:    "upx compressed binary",
+			fixture: "image-small-upx",
+			expectedPkgs: []string{
+				"anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/andybalholm/brotli @ v1.1.1 (/run-me)",
+				"github.com/dsnet/compress @ v0.0.2-0.20210315054119-f66993602bf5 (/run-me)",
+				"github.com/golang/snappy @ v0.0.4 (/run-me)",
+				"github.com/klauspost/compress @ v1.17.11 (/run-me)",
+				"github.com/klauspost/pgzip @ v1.2.6 (/run-me)",
+				"github.com/nwaples/rardecode @ v1.1.3 (/run-me)",
+				"github.com/pierrec/lz4/v4 @ v4.1.21 (/run-me)",
+				"github.com/ulikunitz/xz @ v0.5.12 (/run-me)",
+				"github.com/xi2/xz @ v0.0.0-20171230120015-48954b6210f8 (/run-me)",
+				"stdlib @ go1.23.2 (/run-me)",
+				"github.com/anchore/archiver/v3 @ v3.5.3-0.20241210171143-5b1d8d1c7c51 (/run-me)",
+			},
+			expectedRels: []string{
+				"github.com/andybalholm/brotli @ v1.1.1 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/dsnet/compress @ v0.0.2-0.20210315054119-f66993602bf5 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/golang/snappy @ v0.0.4 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/klauspost/compress @ v1.17.11 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/klauspost/pgzip @ v1.2.6 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/anchore/archiver/v3 @ v3.5.3-0.20241210171143-5b1d8d1c7c51 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/nwaples/rardecode @ v1.1.3 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/pierrec/lz4/v4 @ v4.1.21 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/ulikunitz/xz @ v0.5.12 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"github.com/xi2/xz @ v0.0.0-20171230120015-48954b6210f8 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+				"stdlib @ go1.23.2 (/run-me) [dependency-of] anchore.io/not/real @ v1.0.0 (/run-me)",
+			},
+		},
 		{
 			name: "partially built binary",
 			// the difference is the build flags used to build the binary... they will not reference the module directly

View File

@@ -32,7 +32,7 @@ func scanFile(location file.Location, reader unionreader.UnionReader) ([]*extend
 	var builds []*extendedBuildInfo
 	for _, r := range readers {
-		bi, err := getBuildInfo(r)
+		bi, err := getBuildInfo(r, location)
 		if err != nil {
 			log.WithFields("file", location.RealPath, "error", err).Trace("unable to read golang buildinfo")
@@ -89,7 +89,7 @@ func getCryptoSettingsFromVersion(v version.Version) []string {
 	return cryptoSettings
 }
 
-func getBuildInfo(r io.ReaderAt) (bi *debug.BuildInfo, err error) {
+func getBuildInfo(r io.ReaderAt, location file.Location) (bi *debug.BuildInfo, err error) {
 	defer func() {
 		if r := recover(); r != nil {
 			// this can happen in cases where a malformed binary is passed in can be initially parsed, but not
@@ -98,7 +98,25 @@ func getBuildInfo(r io.ReaderAt) (bi *debug.BuildInfo, err error) {
 			err = fmt.Errorf("recovered from panic: %v", r)
 		}
 	}()
 
+	// try to read buildinfo from the binary directly
 	bi, err = buildinfo.Read(r)
+	if err == nil {
+		return bi, nil
+	}
+
+	// if direct read fails and this looks like a UPX-compressed binary,
+	// try to decompress and read the buildinfo from the decompressed data
+	if isUPXCompressed(r) {
+		log.WithFields("path", location.RealPath).Trace("detected UPX-compressed Go binary, attempting decompression to read the build info")
+		decompressed, decompErr := decompressUPX(r)
+		if decompErr == nil {
+			bi, err = buildinfo.Read(decompressed)
+			if err == nil {
+				return bi, nil
+			}
+		}
+	}
+
 	// note: the stdlib does not export the error we need to check for
 	if err != nil {
@@ -106,11 +124,11 @@ func getBuildInfo(r io.ReaderAt) (bi *debug.BuildInfo, err error) {
 		// since the cataloger can only select executables and not distinguish if they are a go-compiled
 		// binary, we should not show warnings/logs in this case. For this reason we nil-out err here.
 		err = nil
-		return
+		return bi, err
 	}
 	// in this case we could not read the or parse the file, but not explicitly because it is not a
 	// go-compiled binary (though it still might be).
-		return
+		return bi, err
 	}
-	return
+	return bi, err
 }

View File

@@ -8,6 +8,8 @@ import (
 	"github.com/kastenhq/goversion/version"
 	"github.com/stretchr/testify/assert"
+
+	"github.com/anchore/syft/syft/file"
 )
 
 func Test_getBuildInfo(t *testing.T) {
@@ -31,7 +33,7 @@ func Test_getBuildInfo(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			gotBi, err := getBuildInfo(tt.args.r)
+			gotBi, err := getBuildInfo(tt.args.r, file.Location{})
 			if !tt.wantErr(t, err, fmt.Sprintf("getBuildInfo(%v)", tt.args.r)) {
 				return
 			}

View File

@@ -0,0 +1 @@
/run-me

View File

@@ -0,0 +1,18 @@
FROM --platform=linux/amd64 golang:1.23.2-alpine AS builder
RUN apk add --no-cache upx
RUN mkdir /app
WORKDIR /app
COPY go.mod go.sum ./
RUN go mod download
COPY main.go main.go
RUN CGO_ENABLED=0 GOOS=linux go build -ldflags "-X main.Version=1.0.0" -o run-me .
RUN upx --best --lzma --exact run-me
FROM scratch
COPY --from=builder /app/run-me /run-me
ENTRYPOINT ["/run-me"]

View File

@@ -0,0 +1,19 @@
module anchore.io/not/real
go 1.23
toolchain go1.23.2
require github.com/anchore/archiver/v3 v3.5.3-0.20241210171143-5b1d8d1c7c51
require (
github.com/andybalholm/brotli v1.1.1 // indirect
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/klauspost/pgzip v1.2.6 // indirect
github.com/nwaples/rardecode v1.1.3 // indirect
github.com/pierrec/lz4/v4 v4.1.21 // indirect
github.com/ulikunitz/xz v0.5.12 // indirect
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
)

View File

@@ -0,0 +1,28 @@
github.com/anchore/archiver/v3 v3.5.3-0.20241210171143-5b1d8d1c7c51 h1:yhk+P8lF3ZiROjmaVRao9WGTRo4b/wYjoKEiAHWrKwc=
github.com/anchore/archiver/v3 v3.5.3-0.20241210171143-5b1d8d1c7c51/go.mod h1:nwuGSd7aZp0rtYt79YggCGafz1RYsclE7pi3fhLwvuw=
github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY=
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s=
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/nwaples/rardecode v1.1.3 h1:cWCaZwfM5H7nAD6PyEdcVnczzV8i/JtotnyW/dD9lEc=
github.com/nwaples/rardecode v1.1.3/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/ulikunitz/xz v0.5.8/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc=
github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos=
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

View File

@@ -0,0 +1,19 @@
package main
import "github.com/anchore/archiver/v3"
func main() {
z := archiver.Zip{
MkdirAll: true,
SelectiveCompression: true,
ContinueOnError: false,
OverwriteExisting: false,
ImplicitTopLevelFolder: false,
}
err := z.Archive([]string{"main.go"}, "test.zip")
if err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,533 @@
package golang
// UPX Decompression Support
//
// this file implements decompression of UPX-compressed ELF binaries to enable
// extraction of Go build information (.go.buildinfo) from packed executables.
//
// UPX (Ultimate Packer for eXecutables) is a popular executable packer that
// compresses binaries to reduce file size. When a Go binary is compressed with
// UPX, the standard debug/buildinfo.Read() fails because the .go.buildinfo
// section is compressed. This code decompresses the binary in-memory to allow
// buildinfo extraction.
//
// # Supported Compression Methods
//
// Currently only LZMA (method 14) is supported, which is used by:
//
// upx --best --lzma <binary>
//
// Other UPX methods (NRV2B, NRV2D, NRV2E, etc.) are not yet implemented but
// could be added via the upxDecompressors dispatch map.
//
// # Key Functions
//
// - isUPXCompressed: detects UPX magic bytes ("UPX!") in the binary
// - decompressUPX: main entry point; decompresses all blocks and reconstructs the ELF
// - decompressLZMA: handles UPX's custom 2-byte LZMA header format
// - unfilter49: reverses the CTO (call trick optimization) filter for x86/x64 code
// - parseELFPTLoadOffsets: extracts PT_LOAD segment offsets for proper block placement
//
// # UPX Binary Format
//
// UPX-compressed binaries contain several header structures followed by compressed blocks:
//
// l_info (at "UPX!" magic):
// - l_checksum (4 bytes before magic)
// - l_magic "UPX!" (4 bytes)
// - l_lsize (2 bytes) - loader size
// - l_version (1 byte)
// - l_format (1 byte)
//
// p_info (12 bytes, follows l_info):
// - p_progid (4 bytes)
// - p_filesize (4 bytes) - original uncompressed file size
// - p_blocksize (4 bytes)
//
// b_info (12 bytes each, one per compressed block):
// - sz_unc (4 bytes) - uncompressed size
// - sz_cpr (4 bytes) - compressed size
// - b_method (1 byte) - compression method (14 = LZMA)
// - b_ftid (1 byte) - filter ID (0x49 = CTO filter)
// - b_cto8 (1 byte) - filter parameter
// - unused (1 byte)
//
// # LZMA Header Format
//
// UPX uses a 2-byte custom header, NOT the standard 13-byte LZMA format:
//
// Byte 0: (t << 3) | pb, where t = lc + lp
// Byte 1: (lp << 4) | lc
// Byte 2+: raw LZMA stream
//
// This is converted to standard LZMA props: props = lc + lp*9 + pb*9*5
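//
// A worked example with the common LZMA defaults lc=3, lp=0, pb=2 (illustrative values only):
//
//	Byte 0 = ((3+0) << 3) | 2 = 0x1A
//	Byte 1 = (0 << 4) | 3 = 0x03
//	props = 3 + 0*9 + 2*45 = 93 (0x5D)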
//
// # ELF Segment Placement
//
// Decompressed blocks must be placed at specific file offsets according to the
// ELF PT_LOAD segments parsed from the first decompressed block. Simply
// concatenating blocks produces invalid output.
//
// # References
//
// - UPX source: https://github.com/upx/upx
// - LZMA format: https://github.com/upx/upx/blob/devel/src/compress/compress_lzma.cpp
// - CTO filter: https://github.com/upx/upx/blob/master/src/filter/cto.h
//
// note: no code was copied from the UPX repo; this is an independent implementation based on the format description.
//
// # Anti-Unpacking / Obfuscation (Not Currently Supported)
//
// Malware commonly modifies UPX binaries to evade analysis. This implementation
// does not currently handle obfuscated binaries, but these techniques could be
// addressed in the future:
//
// - Magic modification: "UPX!" replaced with custom bytes (e.g., "YTS!", "MOZI").
// Recovery: scan for decompression stub code patterns instead of magic bytes.
//
// - Zeroed p_info fields: p_filesize and p_blocksize set to 0.
// Recovery: read original size from PackHeader at EOF (last 36 bytes, offset 0x18).
//
// - Header corruption: checksums or version fields modified.
// Recovery: ignore validation and use PackHeader values as authoritative source.
//
// Recovery in these cases would require parsing the PackHeader: located in the final 36 bytes
// of the file, it contains metadata recoverable even if p_info is corrupted (not parsed today):
//
// Offset Size Field Description
// ──────────────────────────────────────────────────────────
// 0x00 4 UPX magic "UPX!" (0x21585055)
// 0x04 1 version UPX version
// 0x05 1 format Executable format
// 0x06 1 method Compression method
// 0x07 1 level Compression level (1-10)
// 0x08 4 u_adler Uncompressed data checksum
// 0x0C 4 c_adler Compressed data checksum
// 0x10 4 u_len Uncompressed length
// 0x14 4 c_len Compressed length
// 0x18 4 u_file_size Original file size ← Recovery point
// 0x1C 1 filter Filter ID
// 0x1D 1 filter_cto Filter CTO parameter
// 0x1E 1 n_mru MRU parameter
// 0x1F 1 header_checksum Header checksum
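//
// A minimal sketch of that recovery path (hypothetical helper, not part of this
// file; the offsets are those from the table above):
//
//	func packHeaderFileSize(r io.ReaderAt, fileSize int64) (uint32, error) {
//		buf := make([]byte, 36)
//		if _, err := r.ReadAt(buf, fileSize-36); err != nil {
//			return 0, err
//		}
//		if !bytes.Equal(buf[0:4], upxMagic) { // the magic itself may be tampered with
//			return 0, errNotUPX
//		}
//		return binary.LittleEndian.Uint32(buf[0x18:0x1C]), nil // u_file_size
//	}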
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"github.com/ulikunitz/xz/lzma"
)
// UPX compression method constants
const (
upxMethodLZMA uint8 = 14 // M_LZMA in UPX source
)
// UPX filter constants
const (
upxFilterCTO uint8 = 0x49 // CTO (call trick optimization) filter for x86/x64
)
var (
// upxMagic is the magic bytes that identify a UPX-packed binary
upxMagic = []byte("UPX!")
errNotUPX = errors.New("not a UPX-compressed binary")
errUnsupportedUPXMethod = errors.New("unsupported UPX compression method")
)
// upxInfo contains parsed UPX header information
type upxInfo struct {
magicOffset int64
version uint8
format uint8
originalSize uint32 // p_filesize - original uncompressed file size
blockSize uint32 // p_blocksize - size of each compression block
firstBlockOff int64 // offset to first b_info structure
}
// blockInfo contains information about a single compressed block
type blockInfo struct {
uncompressedSize uint32
compressedSize uint32
method uint8
filterID uint8
filterCTO uint8
dataOffset int64
}
// upxDecompressor is a function that decompresses data using a specific method
type upxDecompressor func(compressedData []byte, uncompressedSize uint32) ([]byte, error)
// upxDecompressors maps compression methods to their decompressor functions
var upxDecompressors = map[uint8]upxDecompressor{
upxMethodLZMA: decompressLZMA,
// note: the NRV algorithms are from the UCL library, an open-source implementation based on the NRV (Not Really Vanished) algorithm.
// TODO: future methods can be added here
// upxMethodNRV2B: decompressNRV2B,
// upxMethodNRV2D: decompressNRV2D,
// upxMethodNRV2E: decompressNRV2E,
}
// unfilter49 reverses UPX filter 0x49 (CTO / call trick optimization).
// The filter transforms CALL (0xE8) and JMP (0xE9) instruction addresses in x86/x64 code to improve compression.
// The filtered format stores addresses as big-endian with cto8 as the high byte marker (the `cto8` parameter,
// stored in `b_cto8`, marks transformed instructions):
//
// original: E8 xx xx xx xx (CALL rel32, little-endian offset)
// filtered: E8 CC yy yy yy (big-endian, CC = cto8 marker)
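//
// Worked example (hypothetical values): with cto8=0x0F and a filtered CALL at
// pos=0x10 whose operand decodes big-endian to jc=0x0F000123, the restored
// little-endian rel32 is jc - (pos+1) - (0x0F<<24) = 0x123 - 0x11 = 0x112.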
func unfilter49(data []byte, cto8 byte) {
cto := uint32(cto8) << 24
for pos := uint32(0); pos+5 <= uint32(len(data)); pos++ {
opcode := data[pos]
// check for E8 (CALL) or E9 (JMP)
if opcode == 0xE8 || opcode == 0xE9 {
// check if first byte after opcode matches cto8 marker
if data[pos+1] == cto8 {
// read operand as big-endian
jc := binary.BigEndian.Uint32(data[pos+1 : pos+5])
// subtract cto and position to get original relative address
result := jc - (pos + 1) - cto
// write back as little-endian
binary.LittleEndian.PutUint32(data[pos+1:pos+5], result)
}
}
// check for conditional jumps (0F 80-8F)
if opcode == 0x0F && pos+6 <= uint32(len(data)) {
opcode2 := data[pos+1]
if opcode2 >= 0x80 && opcode2 <= 0x8F && data[pos+2] == cto8 {
jc := binary.BigEndian.Uint32(data[pos+2 : pos+6])
result := jc - (pos + 2) - cto
binary.LittleEndian.PutUint32(data[pos+2:pos+6], result)
}
}
}
}
// isUPXCompressed checks if the reader contains a UPX-compressed binary
func isUPXCompressed(r io.ReaderAt) bool {
// UPX magic can be at various offsets depending on the binary format
// scan the first 4KB for the magic bytes
buf := make([]byte, 4096)
n, err := r.ReadAt(buf, 0)
if err != nil && !errors.Is(err, io.EOF) {
return false
}
return bytes.Contains(buf[:n], upxMagic)
}
// decompressUPX attempts to decompress a UPX-compressed ELF binary.
// It reads blocks and places them at correct file offsets based on ELF PT_LOAD segments.
//
// The first decompressed block contains the original ELF headers. Parse them to get PT_LOAD segment
// file offsets for proper block placement:
//
// - After decompressing block 1, parse its ELF headers:
// ptLoadOffsets := parseELFPTLoadOffsets(block1Data)
//
// - Block 1: placed at offset 0 (contains ELF header + program headers)
// - Block 2: placed at offset 0 (overwrites/extends)
// - Block 3+: placed at ptLoadOffsets[blockNum-2]
//
// Why this matters: Simply concatenating decompressed blocks produces invalid output.
// Each block corresponds to a PT_LOAD segment and must be placed at its correct file offset.
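//
// Worked example (hypothetical offsets): if ptLoadOffsets = [0x0, 0xa4000, 0xd0000],
// then block 3 (blockNum=3) is copied to ptLoadOffsets[1] = 0xa4000 and block 4
// to ptLoadOffsets[2] = 0xd0000.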
//
// Returns the decompressed binary as a bytes.Reader (implements io.ReaderAt).
func decompressUPX(r io.ReaderAt) (io.ReaderAt, error) {
info, err := parseUPXInfo(r)
if err != nil {
return nil, err
}
// allocate buffer for the full decompressed output
output := make([]byte, info.originalSize)
currentOffset := info.firstBlockOff
outputOffset := uint64(0)
blockNum := 0
// track PT_LOAD segment offsets for proper block placement
var ptLoadOffsets []uint64
for {
block, err := readBlockInfo(r, currentOffset)
if err != nil {
return nil, fmt.Errorf("failed to read block info at offset %d: %w", currentOffset, err)
}
// check for end marker (sz_unc == 0)
if block.uncompressedSize == 0 {
break
}
// non-LZMA method on first block is an error; on subsequent blocks it indicates end of data
if block.method != upxMethodLZMA {
if blockNum == 0 {
return nil, fmt.Errorf("%w: method %d", errUnsupportedUPXMethod, block.method)
}
break
}
blockNum++
decompressor, ok := upxDecompressors[block.method]
if !ok {
return nil, fmt.Errorf("%w: method %d", errUnsupportedUPXMethod, block.method)
}
// read compressed data for this block
compressedData := make([]byte, block.compressedSize)
_, err = r.ReadAt(compressedData, block.dataOffset)
if err != nil {
return nil, fmt.Errorf("failed to read compressed data: %w", err)
}
// decompress this block
blockData, err := decompressor(compressedData, block.uncompressedSize)
if err != nil {
return nil, fmt.Errorf("failed to decompress block: %w", err)
}
// apply CTO filter reversal if needed
if block.filterID == upxFilterCTO {
unfilter49(blockData, block.filterCTO)
}
// first block contains ELF headers - parse PT_LOAD segments for subsequent blocks
if blockNum == 1 {
ptLoadOffsets = parseELFPTLoadOffsets(blockData)
}
// determine where to place this block in the output
destOffset := outputOffset
if blockNum > 2 && len(ptLoadOffsets) > blockNum-2 {
// blocks 3+ go to their respective PT_LOAD segment offsets
destOffset = ptLoadOffsets[blockNum-2]
}
// copy block data to output at correct offset
if destOffset+uint64(len(blockData)) <= uint64(len(output)) {
copy(output[destOffset:], blockData)
}
outputOffset = destOffset + uint64(block.uncompressedSize)
currentOffset = block.dataOffset + int64(block.compressedSize)
}
return bytes.NewReader(output), nil
}
// parseELFPTLoadOffsets extracts PT_LOAD segment file offsets from ELF headers.
// These offsets determine where each decompressed block should be placed.
func parseELFPTLoadOffsets(elfHeader []byte) []uint64 {
if len(elfHeader) < 64 {
return nil
}
// verify ELF magic
if !bytes.HasPrefix(elfHeader, []byte{0x7f, 'E', 'L', 'F'}) {
return nil
}
// only support 64-bit ELF
if elfHeader[4] != 2 {
return nil
}
// parse ELF64 header fields
phoff := binary.LittleEndian.Uint64(elfHeader[0x20:0x28])
phentsize := binary.LittleEndian.Uint16(elfHeader[0x36:0x38])
phnum := binary.LittleEndian.Uint16(elfHeader[0x38:0x3a])
var offsets []uint64
for i := uint16(0); i < phnum; i++ {
phStart := phoff + uint64(i)*uint64(phentsize)
if phStart+uint64(phentsize) > uint64(len(elfHeader)) {
break
}
ph := elfHeader[phStart:]
ptype := binary.LittleEndian.Uint32(ph[0:4])
// PT_LOAD = 1
if ptype == 1 {
poffset := binary.LittleEndian.Uint64(ph[8:16])
offsets = append(offsets, poffset)
}
}
return offsets
}
// parseUPXInfo locates and parses the UPX header information
func parseUPXInfo(r io.ReaderAt) (*upxInfo, error) {
// scan for the UPX! magic in the first 8KB
buf := make([]byte, 8192)
n, err := r.ReadAt(buf, 0)
if err != nil && !errors.Is(err, io.EOF) {
return nil, fmt.Errorf("failed to read header: %w", err)
}
magicIdx := bytes.Index(buf[:n], upxMagic)
if magicIdx == -1 {
return nil, errNotUPX
}
// UPX header structure (after finding "UPX!" magic):
// l_info structure (magic is at offset 4 within l_info):
// offset -4: l_checksum (4 bytes) - checksum of following data
// offset 0: l_magic "UPX!" (4 bytes)
// offset 4: l_lsize (2 bytes) - loader size
// offset 6: l_version (1 byte)
// offset 7: l_format (1 byte)
//
// p_info structure (12 bytes, starts at magic+8):
// offset 0: p_progid (4 bytes)
// offset 4: p_filesize (4 bytes) - original file size
// offset 8: p_blocksize (4 bytes)
//
// b_info structures follow (12 bytes each):
// offset 0: sz_unc (4 bytes) - uncompressed size of this block
// offset 4: sz_cpr (4 bytes) - compressed size (may have filter bits)
// offset 8: b_method (1 byte)
// offset 9: b_ftid (1 byte) - filter id
// offset 10: b_cto8 (1 byte) - filter parameter
// offset 11: unused (1 byte)
if magicIdx+32 > n {
return nil, fmt.Errorf("UPX header truncated")
}
lInfoBase := buf[magicIdx:]
pInfoBase := buf[magicIdx+8:] // p_info starts 8 bytes after magic
info := &upxInfo{
magicOffset: int64(magicIdx),
version: lInfoBase[6],
format: lInfoBase[7],
originalSize: binary.LittleEndian.Uint32(pInfoBase[4:8]),
blockSize: binary.LittleEndian.Uint32(pInfoBase[8:12]),
firstBlockOff: int64(magicIdx + 8 + 12), // magic + l_info remainder + p_info
}
// sanity check
if info.originalSize == 0 || info.originalSize > 500*1024*1024 {
return nil, fmt.Errorf("invalid original size: %d", info.originalSize)
}
return info, nil
}
// readBlockInfo reads a b_info structure at the given offset
func readBlockInfo(r io.ReaderAt, offset int64) (*blockInfo, error) {
buf := make([]byte, 12)
_, err := r.ReadAt(buf, offset)
if err != nil {
return nil, err
}
szUnc := binary.LittleEndian.Uint32(buf[0:4])
szCpr := binary.LittleEndian.Uint32(buf[4:8])
// the compressed size may have filter info in the high bits
// for some formats, but for LZMA it's typically clean
block := &blockInfo{
uncompressedSize: szUnc,
compressedSize: szCpr & 0x00ffffff, // lower 24 bits
method: buf[8],
filterID: buf[9],
filterCTO: buf[10],
dataOffset: offset + 12, // data starts right after b_info
}
return block, nil
}
// nextPowerOf2 returns the smallest power of 2 >= n
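// e.g. nextPowerOf2(0) == 1, nextPowerOf2(3) == 4, nextPowerOf2(64) == 64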
func nextPowerOf2(n uint32) uint32 {
if n == 0 {
return 1
}
// if already a power of 2, return it
if n&(n-1) == 0 {
return n
}
// find the highest set bit and shift left by 1
n--
n |= n >> 1
n |= n >> 2
n |= n >> 4
n |= n >> 8
n |= n >> 16
return n + 1
}
// decompressLZMA decompresses LZMA-compressed data as used by UPX.
// UPX uses a 2-byte custom header format, not the standard 13-byte LZMA format.
//
// UPX 2-byte header encoding:
// - Byte 0: (t << 3) | pb, where t = lc + lp
// - Byte 1: (lp << 4) | lc
// - Byte 2+: raw LZMA stream (starts with 0x00 for range decoder init)
//
// Standard LZMA props encoding: props = lc + lp*9 + pb*9*5
func decompressLZMA(compressedData []byte, uncompressedSize uint32) ([]byte, error) {
if len(compressedData) < 3 {
return nil, fmt.Errorf("compressed data too short")
}
// parse UPX's 2-byte LZMA header
pb := compressedData[0] & 0x07
lp := compressedData[1] >> 4
lc := compressedData[1] & 0x0f
// convert to standard LZMA properties byte
props := lc + lp*9 + pb*9*5
// raw LZMA stream starts at byte 2 (includes 0x00 init byte)
lzmaStream := compressedData[2:]
// compute dictionary size: must be at least as large as uncompressed size
// use next power of 2 for efficiency, with reasonable min/max bounds.
// note: if small binaries decompress correctly but large ones fail, an incorrectly
// computed dictionary size here is a likely culprit.
const minDictSize = 64 * 1024 // 64KB minimum
const maxDictSize = 128 * 1024 * 1024 // 128MB maximum
dictSize := nextPowerOf2(uncompressedSize)
if dictSize < minDictSize {
dictSize = minDictSize
}
if dictSize > maxDictSize {
dictSize = maxDictSize
}
// construct standard 13-byte LZMA header
header := make([]byte, 13)
header[0] = props //nolint:gosec
binary.LittleEndian.PutUint32(header[1:5], dictSize)
binary.LittleEndian.PutUint64(header[5:13], uint64(uncompressedSize))
// combine header + raw stream
var fullStream []byte
fullStream = append(fullStream, header...)
fullStream = append(fullStream, lzmaStream...)
reader, err := lzma.NewReader(bytes.NewReader(fullStream))
if err != nil {
return nil, fmt.Errorf("failed to create LZMA reader: %w", err)
}
decompressed := make([]byte, uncompressedSize)
_, err = io.ReadFull(reader, decompressed)
if err != nil {
return nil, fmt.Errorf("failed to decompress LZMA data: %w", err)
}
return decompressed, nil
}

View File

@@ -0,0 +1,128 @@
package golang
import (
"bytes"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestIsUPXCompressed(t *testing.T) {
tests := []struct {
name string
data []byte
expected bool
}{
{
name: "contains UPX magic at start",
data: append([]byte("UPX!"), make([]byte, 100)...),
expected: true,
},
{
name: "contains UPX magic with offset",
data: append(append(make([]byte, 500), []byte("UPX!")...), make([]byte, 100)...),
expected: true,
},
{
name: "no UPX magic",
data: []byte("\x7FELF" + string(make([]byte, 100))),
expected: false,
},
{
name: "empty data",
data: []byte{},
expected: false,
},
{
name: "partial UPX magic",
data: []byte("UPX"),
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
reader := bytes.NewReader(tt.data)
result := isUPXCompressed(reader)
assert.Equal(t, tt.expected, result)
})
}
}
func TestParseUPXInfo_NotUPX(t *testing.T) {
data := []byte("\x7FELF" + string(make([]byte, 100)))
reader := bytes.NewReader(data)
_, err := parseUPXInfo(reader)
require.Error(t, err)
assert.ErrorIs(t, err, errNotUPX)
}
func TestParseUPXInfo_ValidHeader(t *testing.T) {
// construct a minimal valid UPX header matching actual format
// l_info: checksum (4) + magic (4) + lsize (2) + version (1) + format (1)
lInfo := []byte{
0, 0, 0, 0, // l_checksum (before magic)
'U', 'P', 'X', '!', // magic
0, 0, // l_lsize
14, // l_version
22, // l_format (ELF)
}
// p_info (12 bytes): progid + filesize + blocksize
pInfo := []byte{
0, 0, 0, 0, // p_progid
0, 0, 0x10, 0, // p_filesize = 0x100000 (1MB) little-endian
0, 0, 0x10, 0, // p_blocksize
}
// b_info (12 bytes): sz_unc + sz_cpr + method + filter info
bInfo := []byte{
0, 0, 0x10, 0, // sz_unc = 1MB
0, 0, 0x08, 0, // sz_cpr = 512KB (compressed)
14, 0, 0, 0, // method=LZMA, filter info
}
data := append(append(lInfo, pInfo...), bInfo...)
data = append(data, make([]byte, 100)...) // padding
reader := bytes.NewReader(data)
info, err := parseUPXInfo(reader)
require.NoError(t, err)
assert.Equal(t, uint8(14), info.version)
assert.Equal(t, uint8(22), info.format)
assert.Equal(t, uint32(0x100000), info.originalSize)
}
func TestDecompressUPX_UnsupportedMethod(t *testing.T) {
// construct a header with an unsupported compression method
lInfo := []byte{
0, 0, 0, 0, // l_checksum
'U', 'P', 'X', '!',
0, 0, // l_lsize
14, 22, // version, format
}
pInfo := []byte{
0, 0, 0, 0, // p_progid
0x00, 0x01, 0x00, 0x00, // p_filesize = 256 bytes (small for test)
0, 0, 0x10, 0, // p_blocksize
}
bInfo := []byte{
0x00, 0x01, 0x00, 0x00, // sz_unc = 256
0x80, 0x00, 0x00, 0x00, // sz_cpr = 128
99, 0, 0, 0, // unsupported method
}
data := append(append(lInfo, pInfo...), bInfo...)
data = append(data, make([]byte, 1000)...)
reader := bytes.NewReader(data)
_, err := decompressUPX(reader)
require.Error(t, err)
assert.ErrorIs(t, err, errUnsupportedUPXMethod)
}