very WIP: lazy union reader

Signed-off-by: Will Murphy <will.murphy@anchore.com>
This commit is contained in:
Will Murphy 2024-04-24 10:29:53 -04:00
parent 6440f26b5a
commit 20a26a0dfe
3 changed files with 444 additions and 23 deletions

View File

@ -0,0 +1,142 @@
package unionreader
import (
"bytes"
"errors"
"fmt"
"io"
"math"
"sync"
)
// readSize is 1 MiB (1024 * 1024) expressed as an int64.
// NOTE(review): nothing in the visible code references this constant; presumably
// it is meant as a chunk size for reads from the underlying reader — confirm or remove.
const readSize int64 = 1024 * 1024
// Compile-time check that *lazyUnionReader satisfies the UnionReader interface.
var _ UnionReader = (*lazyUnionReader)(nil)
// lazyUnionReader wraps an io.ReadCloser so that it can be used as a logical
// ReadSeeker / ReaderAt. Every byte pulled from the underlying reader is kept
// in buf, so earlier positions can be revisited without re-reading the source.
// The alternative for callers needing a seeker is to copy the entire reader
// into a buffer up front; this type only buffers as far as it has been asked
// to read.
type lazyUnionReader struct {
buf []byte // the bytes that have been read so far
cursor int64 // the current position where Read() will take place
done bool // whether we have seen EOF from rc
rc io.ReadCloser // the underlying reader
mu sync.Mutex // exported methods must acquire this lock before changing any field. Unexported methods assume their caller acquired the lock
}
// Read implements io.Reader. It copies up to len(p) bytes starting at the
// current cursor into p and advances the cursor by the number of bytes copied.
//
// The underlying reader is consumed only as far as needed to satisfy the
// request. io.EOF is returned (possibly together with n > 0) when the
// underlying reader ran out before len(p) bytes were available. Any other
// error from the underlying reader is returned with n == 0 and the cursor
// left untouched.
func (c *lazyUnionReader) Read(p []byte) (n int, err error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	err = c.ensureReadUntil(c.cursor + int64(len(p)))
	if err != nil && !errors.Is(err, io.EOF) {
		return 0, err
	}

	// Seek may have moved the cursor beyond the end of the data. Slicing the
	// buffer there would panic, so report EOF instead.
	if c.cursor > int64(len(c.buf)) {
		return 0, io.EOF
	}

	// copy stops at whichever is shorter: p or the buffered bytes after the cursor.
	n = copy(p, c.buf[c.cursor:])
	c.cursor += int64(n)
	return n, err
}
// ReadAt implements io.ReaderAt. It copies up to len(p) bytes starting at
// absolute offset off into p, without moving the Read/Seek cursor.
//
// The underlying reader is consumed as far as off+len(p) if it has not been
// read that far yet. When fewer than len(p) bytes are available the error
// reported while filling the buffer (io.EOF at end of input) is returned along
// with the bytes that could be copied. A negative offset is rejected.
func (c *lazyUnionReader) ReadAt(p []byte, off int64) (n int, err error) {
	if off < 0 {
		return 0, errors.New("unable to read at negative offset")
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	err = c.ensureReadUntil(off + int64(len(p)))

	// Nothing is buffered at or after off (the offset is at/past the end of
	// the data, or filling the buffer failed early). Slicing would panic for
	// off > len(buf), so return what ensureReadUntil reported.
	if off >= int64(len(c.buf)) {
		return 0, err
	}

	// copy stops at whichever is shorter: p or the buffered bytes after off.
	n = copy(p, c.buf[off:])
	return n, err
}
// Seek implements io.Seeker. It moves the cursor used by Read to the position
// described by offset and whence and returns the new absolute position.
//
// io.SeekStart and io.SeekCurrent never touch the underlying reader.
// io.SeekEnd has to know the total length, so it reads the remainder of the
// underlying reader into the buffer first. An unknown whence or a resulting
// negative position is an error and leaves the cursor unchanged.
func (c *lazyUnionReader) Seek(offset int64, whence int) (int64, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	var target int64
	switch whence {
	case io.SeekStart:
		target = offset
	case io.SeekCurrent:
		target = c.cursor + offset
	case io.SeekEnd:
		if err := c.readAll(); err != nil {
			return 0, err
		}
		target = c.maxRead() + offset
	default:
		// Previously an unknown whence fell through and silently rewound
		// the cursor to 0.
		return 0, fmt.Errorf("invalid whence %d", whence)
	}

	if target < 0 {
		return 0, fmt.Errorf("request to read negative offset impossible %v", target)
	}

	c.cursor = target
	return c.cursor, nil
}
// Close implements io.Closer by closing the underlying reader while holding
// the lock. The result of the underlying Close call is returned as-is.
func (c *lazyUnionReader) Close() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	closeErr := c.rc.Close()
	return closeErr
}
// readAll drains whatever is left in the underlying reader into the buffer.
// The caller must hold c.mu.
//
// Bytes returned by io.ReadAll are appended even when it also reports an
// error: they have already been consumed from the underlying reader, so
// dropping them would leave the buffer out of step with the stream. An error
// that is (or wraps) io.EOF is treated as a normal end of input.
func (c *lazyUnionReader) readAll() error {
	rest, err := io.ReadAll(c.rc)
	c.buf = append(c.buf, rest...)
	if err != nil && !errors.Is(err, io.EOF) {
		return err
	}
	// The underlying reader has been read to the end.
	c.done = true
	return nil
}
// ensureReadUntil makes sure the buffer holds at least offset bytes, reading
// the missing amount from the underlying reader when necessary. The caller
// must hold c.mu.
//
// It returns nil when the buffer already reaches offset or could be extended
// that far, io.EOF when the underlying reader ended first, and any other read
// error unchanged. Bytes obtained before an error are always kept: they have
// already been consumed from the underlying reader and would otherwise be
// lost, shifting every later offset.
func (c *lazyUnionReader) ensureReadUntil(offset int64) error {
	readN := offset - c.maxRead()
	if readN <= 0 {
		return nil
	}
	if c.done {
		// The underlying reader is already exhausted; don't read it again.
		return io.EOF
	}

	var buf bytes.Buffer
	_, err := io.CopyN(&buf, c.rc, readN)
	c.buf = append(c.buf, buf.Bytes()...)
	if errors.Is(err, io.EOF) {
		c.done = true
	}
	return err
}
// maxRead reports how many bytes have been buffered so far, i.e. the length
// of c.buf as an int64.
func (c *lazyUnionReader) maxRead() int64 {
	buffered := len(c.buf)
	return int64(buffered)
}
// max returns the largest of the given values, or 0 when called with no
// arguments.
//
// The running maximum is seeded from the first argument rather than from 0,
// so inputs that are all negative yield the largest (least negative) value
// instead of 0.
func max(ints ...int64) int64 {
	if len(ints) == 0 {
		return 0
	}
	maxSeen := ints[0]
	for _, in := range ints[1:] {
		if in > maxSeen {
			maxSeen = in
		}
	}
	return maxSeen
}
// min returns the smallest of the given values. With no arguments it returns
// math.MaxInt64, the identity element for the minimum over int64.
func min(ints ...int64) int64 {
	// math.MaxInt64 is an untyped constant; declare the type explicitly so
	// the accumulator is an int64 rather than the default int.
	var smallest int64 = math.MaxInt64
	for i := 0; i < len(ints); i++ {
		if candidate := ints[i]; candidate < smallest {
			smallest = candidate
		}
	}
	return smallest
}
// newLazyUnionReader wraps readCloser in a lazyUnionReader with an empty
// buffer and the cursor at position 0. Nothing is read from readCloser here.
// The returned error is always nil.
func newLazyUnionReader(readCloser io.ReadCloser) (UnionReader, error) {
	// The zero value of the remaining fields (including the mutex) is ready to use.
	reader := &lazyUnionReader{rc: readCloser}
	return reader, nil
}

View File

@ -0,0 +1,281 @@
package unionreader
import (
"bytes"
"fmt"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"io"
"strings"
"sync"
"testing"
)
// spyingCloser is a test double that turns an io.Reader into an io.ReadCloser
// and remembers whether Close was called on it.
type spyingCloser struct {
	closed bool // flipped to true by Close
	io.Reader
}

// Close records that it was called and always reports success.
func (sc *spyingCloser) Close() error {
	sc.closed = true
	return nil
}
// Test_lazyUnionReader_Close verifies that closing the union reader closes the
// reader it wraps.
func Test_lazyUnionReader_Close(t *testing.T) {
	spy := &spyingCloser{Reader: strings.NewReader("some string")}

	subject, err := newLazyUnionReader(spy)
	require.NoError(t, err)

	closeErr := subject.Close()
	require.NoError(t, closeErr)
	assert.True(t, spy.closed)
}
// Test_lazyUnionReader_ReadAll verifies that draining the union reader with
// io.ReadAll yields exactly the wrapped reader's contents.
func Test_lazyUnionReader_ReadAll(t *testing.T) {
	const want = "some data"

	subject, err := newLazyUnionReader(io.NopCloser(strings.NewReader(want)))
	require.NoError(t, err)

	// Named "got" rather than "bytes" so the local does not shadow the bytes package.
	got, err := io.ReadAll(subject)
	require.NoError(t, err)
	assert.Equal(t, want, string(got))
}
// Test_lazyUnionReader_RepeatedlyRead reads the subject two bytes at a time
// until Read reports an error, then checks that the concatenated chunks equal
// the input and that the terminating error is io.EOF.
func Test_lazyUnionReader_RepeatedlyRead(t *testing.T) {
	data := "some data for our reader that we need to read!"
	subject, err := newLazyUnionReader(io.NopCloser(strings.NewReader(data)))
	require.NoError(t, err)

	var (
		collected []byte
		lastErr   error
	)
	chunk := make([]byte, 2)
	for {
		n, readErr := subject.Read(chunk)
		// Keep whatever was delivered, even alongside an error.
		collected = append(collected, chunk[:n]...)
		if readErr != nil {
			lastErr = readErr
			break
		}
	}

	assert.Equal(t, data, string(collected))
	assert.ErrorIs(t, lastErr, io.EOF)
}
// Test_lazyUnionReader_ReadAt exercises ReadAt on a fresh reader over the
// 16-byte input "0123456789abcdef" at several offsets. Each case checks the
// byte count, the bytes delivered, and the error: io.EOF when the request
// runs past the end of the input, no error otherwise.
func Test_lazyUnionReader_ReadAt(t *testing.T) {
	tests := []struct {
		name      string
		dst       []byte
		off       int64
		wantN     int
		wantBytes []byte
		wantEOF   bool
	}{
		{
			name:      "read first 4 bytes",
			dst:       make([]byte, 4),
			off:       0,
			wantN:     4,
			wantBytes: []byte("0123"),
		},
		{
			name:      "read 4 bytes from middle",
			dst:       make([]byte, 4),
			off:       4,
			wantN:     4,
			wantBytes: []byte("4567"),
		},
		{
			name:      "read last 4 bytes",
			dst:       make([]byte, 4),
			off:       12,
			wantN:     4,
			wantBytes: []byte("cdef"),
		},
		{
			name:      "read past end",
			dst:       make([]byte, 4),
			off:       14,
			wantN:     2,
			wantBytes: []byte("ef"),
			wantEOF:   true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			rc := io.NopCloser(strings.NewReader("0123456789abcdef"))
			subject, err := newLazyUnionReader(rc)
			require.NoError(t, err)

			n, err := subject.ReadAt(tt.dst, tt.off)
			if tt.wantEOF {
				assert.ErrorIs(t, err, io.EOF)
			} else {
				// A fully satisfied read must not report an error.
				assert.NoError(t, err)
			}
			// Stop here on a wrong count so the slice below stays in range.
			require.Equal(t, tt.wantN, n)
			assert.Equal(t, string(tt.wantBytes), string(tt.dst[:n]))
		})
	}
}
// Test_lazyUnionReader_Seek runs scripted sequences of Seek and Read calls
// against a fresh reader and checks the concatenation of everything read.
//
// A command with a non-empty readDst is a Read into that buffer; any other
// command is a Seek using seekOffset/seekWhence. Unless a case sets wantEOF,
// every command must succeed; with wantEOF the error of the final command
// must be io.EOF.
func Test_lazyUnionReader_Seek(t *testing.T) {
	type command struct {
		seekOffset int64
		seekWhence int
		readDst    []byte
	}
	data := []byte("this is a string of data that I'm very excited to share")
	tests := []struct {
		name      string
		commands  []command
		wantBytes []byte
		wantEOF   bool
	}{
		{
			name: "read the first 4 bytes twice",
			commands: []command{
				{
					readDst: make([]byte, 4),
				},
				{
					seekOffset: 0,
					seekWhence: io.SeekStart,
				},
				{
					readDst: make([]byte, 4),
				},
			},
			wantBytes: []byte("thisthis"),
		},
		{
			name: "read the last 4 bytes twice",
			commands: []command{
				{
					seekWhence: io.SeekEnd,
					seekOffset: -4,
				},
				{
					readDst: make([]byte, 4),
				},
				{
					seekWhence: io.SeekEnd,
					seekOffset: -4,
				},
				{
					readDst: make([]byte, 4),
				},
			},
			wantBytes: []byte("harehare"),
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			rc := io.NopCloser(bytes.NewReader(data))
			subject, err := newLazyUnionReader(rc)
			require.NoError(t, err)

			var lastErr error
			var readResult []byte
			for _, c := range tt.commands {
				if len(c.readDst) > 0 {
					var n int
					n, lastErr = subject.Read(c.readDst)
					readResult = append(readResult, c.readDst[:n]...)
				} else {
					_, lastErr = subject.Seek(c.seekOffset, c.seekWhence)
				}
				// Check each step so an early failure is not masked by a
				// later successful command overwriting lastErr.
				if !tt.wantEOF {
					require.NoError(t, lastErr)
				}
			}
			if tt.wantEOF {
				assert.ErrorIs(t, lastErr, io.EOF)
			}
			assert.Equal(t, string(tt.wantBytes), string(readResult))
		})
	}
}
// Test_lazyUnionReader_ensureReadUntil checks that ensureReadUntil grows the
// buffer exactly as far as requested, leaves it alone when it is already long
// enough, and reports io.EOF when the underlying reader ends first.
//
// Each case starts from a reader whose buffer already holds buf and whose
// underlying reader still has rest left to deliver.
func Test_lazyUnionReader_ensureReadUntil(t *testing.T) {
	wantEOF := func(t assert.TestingT, err error, msgAndArgs ...interface{}) bool {
		return assert.ErrorIs(t, err, io.EOF, msgAndArgs...)
	}
	tests := []struct {
		name    string
		buf     []byte
		rest    string
		offset  int64
		wantBuf string
		wantErr assert.ErrorAssertionFunc
	}{
		{
			name:    "offset already buffered reads nothing",
			buf:     []byte("0123"),
			rest:    "456789",
			offset:  2,
			wantBuf: "0123",
			wantErr: assert.NoError,
		},
		{
			name:    "fills an empty buffer up to the offset",
			rest:    "0123456789",
			offset:  4,
			wantBuf: "0123",
			wantErr: assert.NoError,
		},
		{
			name:    "extends a partially filled buffer",
			buf:     []byte("0123"),
			rest:    "456789",
			offset:  6,
			wantBuf: "012345",
			wantErr: assert.NoError,
		},
		{
			name:    "offset exactly at the end is not an error",
			rest:    "0123456789",
			offset:  10,
			wantBuf: "0123456789",
			wantErr: assert.NoError,
		},
		{
			name:    "offset past the end buffers everything and reports EOF",
			rest:    "0123456789",
			offset:  20,
			wantBuf: "0123456789",
			wantErr: wantEOF,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			c := &lazyUnionReader{
				buf: tt.buf,
				rc:  io.NopCloser(strings.NewReader(tt.rest)),
				// A fresh lock per case; a mutex must not be copied out of the test table.
				mu: sync.Mutex{},
			}
			tt.wantErr(t, c.ensureReadUntil(tt.offset), fmt.Sprintf("ensureReadUntil(%v)", tt.offset))
			assert.Equal(t, tt.wantBuf, string(c.buf))
		})
	}
}
// Test_lazyUnionReader_readAll checks that readAll appends everything left in
// the underlying reader to the bytes that were already buffered.
//
// Each case starts from a reader whose buffer already holds buf and whose
// underlying reader still has rest left to deliver.
func Test_lazyUnionReader_readAll(t *testing.T) {
	tests := []struct {
		name    string
		buf     []byte
		rest    string
		wantBuf string
		wantErr assert.ErrorAssertionFunc
	}{
		{
			name:    "reads everything from a fresh reader",
			rest:    "some data",
			wantBuf: "some data",
			wantErr: assert.NoError,
		},
		{
			name:    "appends the remainder to already buffered bytes",
			buf:     []byte("some "),
			rest:    "data",
			wantBuf: "some data",
			wantErr: assert.NoError,
		},
		{
			name:    "exhausted reader leaves the buffer unchanged",
			buf:     []byte("some data"),
			rest:    "",
			wantBuf: "some data",
			wantErr: assert.NoError,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			c := &lazyUnionReader{
				buf: tt.buf,
				rc:  io.NopCloser(strings.NewReader(tt.rest)),
				// A fresh lock per case; a mutex must not be copied out of the test table.
				mu: sync.Mutex{},
			}
			tt.wantErr(t, c.readAll(), "readAll()")
			assert.Equal(t, tt.wantBuf, string(c.buf))
		})
	}
}
// Test_newLazyUnionReader checks that the constructor succeeds and returns a
// lazyUnionReader that wraps the given reader and has not buffered anything.
func Test_newLazyUnionReader(t *testing.T) {
	type args struct {
		readCloser io.ReadCloser
	}
	// Shared between args and want so both refer to the same underlying reader.
	rc := io.NopCloser(strings.NewReader("some data"))
	tests := []struct {
		name    string
		args    args
		want    UnionReader
		wantErr assert.ErrorAssertionFunc
	}{
		{
			name:    "wraps the reader without reading from it",
			args:    args{readCloser: rc},
			want:    &lazyUnionReader{rc: rc},
			wantErr: assert.NoError,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := newLazyUnionReader(tt.args.readCloser)
			if !tt.wantErr(t, err, fmt.Sprintf("newLazyUnionReader(%v)", tt.args.readCloser)) {
				return
			}
			assert.Equalf(t, tt.want, got, "newLazyUnionReader(%v)", tt.args.readCloser)
		})
	}
}

View File

@ -1,12 +1,9 @@
package unionreader
import (
"bytes"
"fmt"
"io"
macho "github.com/anchore/go-macholibre"
"github.com/anchore/syft/internal/log"
"io"
)
// UnionReader is a single interface with all reading functions needed by multi-arch binary catalogers
@ -43,23 +40,24 @@ func GetUnionReader(readerCloser io.ReadCloser) (UnionReader, error) {
if ok {
return reader, nil
}
b, err := io.ReadAll(readerCloser)
if err != nil {
return nil, fmt.Errorf("unable to read contents from binary: %w", err)
}
bytesReader := bytes.NewReader(b)
reader = struct {
io.ReadCloser
io.ReaderAt
io.Seeker
}{
ReadCloser: io.NopCloser(bytesReader),
ReaderAt: bytesReader,
Seeker: bytesReader,
}
return reader, nil
return newLazyUnionReader(readerCloser)
//
//b, err := io.ReadAll(readerCloser)
//if err != nil {
// return nil, fmt.Errorf("unable to read contents from binary: %w", err)
//}
//
//bytesReader := bytes.NewReader(b)
//
//reader = struct {
// io.ReadCloser
// io.ReaderAt
// io.Seeker
//}{
// ReadCloser: io.NopCloser(bytesReader),
// ReaderAt: bytesReader,
// Seeker: bytesReader,
//}
//
//return reader, nil
}