mirror of
https://github.com/anchore/syft.git
synced 2025-11-17 16:33:21 +01:00
Enable reading non-utf-8 encodings for java pom.xml files (#2047)
* fix reading non utf8 encodings Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> * in cases where we cant tell the encoding use the UTF8 replacement char Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> * decompose the xml decoding func to get a valid utf8 reader first and test unknown encoding Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com> --------- Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
parent
ee121cff21
commit
17d4203bbb
1
go.mod
1
go.mod
@ -72,6 +72,7 @@ require (
|
|||||||
github.com/knqyf263/go-rpmdb v0.0.0-20230301153543-ba94b245509b
|
github.com/knqyf263/go-rpmdb v0.0.0-20230301153543-ba94b245509b
|
||||||
github.com/opencontainers/go-digest v1.0.0
|
github.com/opencontainers/go-digest v1.0.0
|
||||||
github.com/saferwall/pe v1.4.4
|
github.com/saferwall/pe v1.4.4
|
||||||
|
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d
|
||||||
github.com/sassoftware/go-rpmutils v0.2.0
|
github.com/sassoftware/go-rpmutils v0.2.0
|
||||||
github.com/vbatts/go-mtree v0.5.3
|
github.com/vbatts/go-mtree v0.5.3
|
||||||
github.com/zyedidia/generic v1.2.2-0.20230320175451-4410d2372cb1
|
github.com/zyedidia/generic v1.2.2-0.20230320175451-4410d2372cb1
|
||||||
|
|||||||
2
go.sum
2
go.sum
@ -596,6 +596,8 @@ github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb
|
|||||||
github.com/saferwall/pe v1.4.4 h1:Ml++7/2/Z1iKwV4zCsd1nIqTEAdUQKAetwbbcCarhOg=
|
github.com/saferwall/pe v1.4.4 h1:Ml++7/2/Z1iKwV4zCsd1nIqTEAdUQKAetwbbcCarhOg=
|
||||||
github.com/saferwall/pe v1.4.4/go.mod h1:SNzv3cdgk8SBI0UwHfyTcdjawfdnN+nbydnEL7GZ25s=
|
github.com/saferwall/pe v1.4.4/go.mod h1:SNzv3cdgk8SBI0UwHfyTcdjawfdnN+nbydnEL7GZ25s=
|
||||||
github.com/sagikazarmark/crypt v0.3.0/go.mod h1:uD/D+6UF4SrIR1uGEv7bBNkNqLGqUr43MRiaGWX1Nig=
|
github.com/sagikazarmark/crypt v0.3.0/go.mod h1:uD/D+6UF4SrIR1uGEv7bBNkNqLGqUr43MRiaGWX1Nig=
|
||||||
|
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
|
||||||
|
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
|
||||||
github.com/sassoftware/go-rpmutils v0.2.0 h1:pKW0HDYMFWQ5b4JQPiI3WI12hGsVoW0V8+GMoZiI/JE=
|
github.com/sassoftware/go-rpmutils v0.2.0 h1:pKW0HDYMFWQ5b4JQPiI3WI12hGsVoW0V8+GMoZiI/JE=
|
||||||
github.com/sassoftware/go-rpmutils v0.2.0/go.mod h1:TJJQYtLe/BeEmEjelI3b7xNZjzAukEkeWKmoakvaOoI=
|
github.com/sassoftware/go-rpmutils v0.2.0/go.mod h1:TJJQYtLe/BeEmEjelI3b7xNZjzAukEkeWKmoakvaOoI=
|
||||||
github.com/scylladb/go-set v1.0.3-0.20200225121959-cc7b2070d91e h1:7q6NSFZDeGfvvtIRwBrU/aegEYJYmvev0cHAwo17zZQ=
|
github.com/scylladb/go-set v1.0.3-0.20200225121959-cc7b2070d91e h1:7q6NSFZDeGfvvtIRwBrU/aegEYJYmvev0cHAwo17zZQ=
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package java
|
package java
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"encoding/xml"
|
"encoding/xml"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
@ -8,6 +9,7 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/saintfish/chardet"
|
||||||
"github.com/vifraa/gopom"
|
"github.com/vifraa/gopom"
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset"
|
||||||
|
|
||||||
@ -99,9 +101,15 @@ func newPackageFromPom(pom gopom.Project, dep gopom.Dependency, locations ...fil
|
|||||||
}
|
}
|
||||||
|
|
||||||
func decodePomXML(content io.Reader) (project gopom.Project, err error) {
|
func decodePomXML(content io.Reader) (project gopom.Project, err error) {
|
||||||
decoder := xml.NewDecoder(content)
|
inputReader, err := getUtf8Reader(content)
|
||||||
// prevent against warnings for "xml: encoding "iso-8859-1" declared but Decoder.CharsetReader is nil"
|
if err != nil {
|
||||||
|
return project, fmt.Errorf("unable to read pom.xml: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
decoder := xml.NewDecoder(inputReader)
|
||||||
|
// when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder
|
||||||
decoder.CharsetReader = charset.NewReaderLabel
|
decoder.CharsetReader = charset.NewReaderLabel
|
||||||
|
|
||||||
if err := decoder.Decode(&project); err != nil {
|
if err := decoder.Decode(&project); err != nil {
|
||||||
return project, fmt.Errorf("unable to unmarshal pom.xml: %w", err)
|
return project, fmt.Errorf("unable to unmarshal pom.xml: %w", err)
|
||||||
}
|
}
|
||||||
@ -109,6 +117,33 @@ func decodePomXML(content io.Reader) (project gopom.Project, err error) {
|
|||||||
return project, nil
|
return project, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getUtf8Reader(content io.Reader) (io.Reader, error) {
|
||||||
|
pomContents, err := io.ReadAll(content)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
detector := chardet.NewTextDetector()
|
||||||
|
detection, err := detector.DetectBest(pomContents)
|
||||||
|
|
||||||
|
var inputReader io.Reader
|
||||||
|
if err == nil && detection != nil {
|
||||||
|
if detection.Charset == "UTF-8" {
|
||||||
|
inputReader = bytes.NewReader(pomContents)
|
||||||
|
} else {
|
||||||
|
inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to get encoding: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// we could not detect the encoding, but we want a valid file to read. Replace unreadable
|
||||||
|
// characters with the UTF-8 replacement character.
|
||||||
|
inputReader = strings.NewReader(strings.ToValidUTF8(string(pomContents), "<22>"))
|
||||||
|
}
|
||||||
|
return inputReader, nil
|
||||||
|
}
|
||||||
|
|
||||||
func pomParent(pom gopom.Project, parent *gopom.Parent) (result *pkg.PomParent) {
|
func pomParent(pom gopom.Project, parent *gopom.Parent) (result *pkg.PomParent) {
|
||||||
if parent == nil {
|
if parent == nil {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@ -1,10 +1,14 @@
|
|||||||
package java
|
package java
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/base64"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
"github.com/vifraa/gopom"
|
"github.com/vifraa/gopom"
|
||||||
|
|
||||||
"github.com/anchore/syft/syft/file"
|
"github.com/anchore/syft/syft/file"
|
||||||
@ -63,6 +67,60 @@ func Test_parserPomXML(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Test_decodePomXML_surviveNonUtf8Encoding(t *testing.T) {
|
||||||
|
// regression for https://github.com/anchore/syft/issues/2044
|
||||||
|
|
||||||
|
// we are storing the base64 contents of the pom.xml file. We are doing this to prevent accidental changes to the
|
||||||
|
// file, which is extremely important for this test.
|
||||||
|
|
||||||
|
// for instance, even changing a single character in the file and saving in an IntelliJ IDE will automatically
|
||||||
|
// convert the file to UTF-8, which will break this test:
|
||||||
|
|
||||||
|
// xxd with the original pom.xml
|
||||||
|
// 00000780: 6964 3e0d 0a20 2020 2020 2020 2020 2020 id>..
|
||||||
|
// 00000790: 203c 6e61 6d65 3e4a e972 f46d 6520 4d69 <name>J.r.me Mi
|
||||||
|
// 000007a0: 7263 3c2f 6e61 6d65 3e0d 0a20 2020 2020 rc</name>..
|
||||||
|
|
||||||
|
// xxd with the pom.xml converted to UTF-8 (from a simple change with IntelliJ)
|
||||||
|
// 00000780: 6964 3e0d 0a20 2020 2020 2020 2020 2020 id>..
|
||||||
|
// 00000790: 203c 6e61 6d65 3e4a efbf bd72 efbf bd6d <name>J...r...m
|
||||||
|
// 000007a0: 6520 4d69 7263 3c2f 6e61 6d65 3e0d 0a20 e Mirc</name>..
|
||||||
|
|
||||||
|
// Note that the name "Jérôme Mirc" was originally interpreted as "J.r.me Mi" and after the save
|
||||||
|
// is now encoded as "J...r...m" which is not what we want (note the extra bytes for each non UTF-8 character.
|
||||||
|
// The original 0xe9 byte (é) was converted to 0xefbfbd (<28>) which is the UTF-8 replacement character.
|
||||||
|
// This is quite silly on the part of IntelliJ, but it is what it is.
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
fixture string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "undeclared encoding",
|
||||||
|
fixture: "test-fixtures/pom/undeclared-iso-8859-encoded-pom.xml.base64",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "declared encoding",
|
||||||
|
fixture: "test-fixtures/pom/declared-iso-8859-encoded-pom.xml.base64",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
fh, err := os.Open(c.fixture)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
decoder := base64.NewDecoder(base64.StdEncoding, fh)
|
||||||
|
|
||||||
|
proj, err := decodePomXML(decoder)
|
||||||
|
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotEmpty(t, proj.Developers)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func Test_parseCommonsTextPomXMLProject(t *testing.T) {
|
func Test_parseCommonsTextPomXMLProject(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
input string
|
input string
|
||||||
@ -401,3 +459,28 @@ func Test_resolveProperty(t *testing.T) {
|
|||||||
// stringPointer returns a pointer to a freshly allocated string holding the value of s.
func stringPointer(s string) *string {
	p := new(string)
	*p = s
	return p
}
|
||||||
|
|
||||||
|
func Test_getUtf8Reader(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
contents string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "unknown encoding",
|
||||||
|
// random binary contents
|
||||||
|
contents: "BkiJz02JyEWE0nXR6TH///9NicpJweEETIucJIgAAABJicxPjQwhTY1JCE05WQh0BU2J0eunTYshTIusJIAAAAAPHwBNOeV1BUUx2+tWTIlUJDhMiUwkSEyJRCQgSIl8JFBMiQ==",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
|
decoder := base64.NewDecoder(base64.StdEncoding, strings.NewReader(tt.contents))
|
||||||
|
|
||||||
|
got, err := getUtf8Reader(decoder)
|
||||||
|
require.NoError(t, err)
|
||||||
|
gotBytes, err := io.ReadAll(got)
|
||||||
|
require.NoError(t, err)
|
||||||
|
// if we couldn't decode the section as UTF-8, we should get a replacement character
|
||||||
|
assert.Contains(t, string(gotBytes), "<22>")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user