Skip to content

Commit e8292cc

Browse files
committed
Default CharsetReader improvements
When nil, the ReadSettings struct's CharsetReader field now causes the XML decoder to use a "pass-through" charset converter, passing the reader's data through without modification. This was already the default behavior when creating a new etree document with the NewDocument function, but now a default-constructed ReadSettings struct will result in the same default CharsetReader behavior.
1 parent 7113fd9 commit e8292cc

File tree

2 files changed

+31
-36
lines changed

2 files changed

+31
-36
lines changed

etree.go

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,14 @@ var ErrXML = errors.New("etree: invalid XML format")
3131
var cdataPrefix = []byte("<![CDATA[")
3232

3333
// ReadSettings determine the default behavior of the Document's ReadFrom*
34-
// methods.
34+
// functions.
3535
type ReadSettings struct {
36-
// CharsetReader to be passed to standard xml.Decoder. Default: nil.
36+
// CharsetReader, if non-nil, defines a function to generate
37+
// charset-conversion readers, converting from the provided non-UTF-8
38+
// charset into UTF-8. If nil, the ReadFrom* functions will use a
39+
// "pass-through" CharsetReader that performs no conversion on the reader's
40+
// data regardless of the value of the "charset" encoding string. Default:
41+
// nil.
3742
CharsetReader func(charset string, input io.Reader) (io.Reader, error)
3843

3944
// Permissive allows input containing common mistakes such as missing tags
@@ -72,13 +77,11 @@ type ReadSettings struct {
7277
AutoClose []string
7378
}
7479

75-
// newReadSettings creates a default ReadSettings record.
76-
func newReadSettings() ReadSettings {
77-
return ReadSettings{
78-
CharsetReader: func(label string, input io.Reader) (io.Reader, error) {
79-
return input, nil
80-
},
81-
}
80+
// defaultCharsetReader is used by the xml decoder when the ReadSettings
81+
// CharsetReader value is nil. It behaves as a "pass-through", ignoring
82+
// the requested charset parameter and skipping conversion altogether.
83+
func defaultCharsetReader(charset string, input io.Reader) (io.Reader, error) {
84+
return input, nil
8285
}
8386

8487
// dup creates a duplicate of the ReadSettings object.
@@ -97,7 +100,7 @@ func (s *ReadSettings) dup() ReadSettings {
97100
}
98101
}
99102

100-
// WriteSettings determine the behavior of the Document's WriteTo* methods.
103+
// WriteSettings determine the behavior of the Document's WriteTo* functions.
101104
type WriteSettings struct {
102105
// CanonicalEndTags forces the production of XML end tags, even for
103106
// elements that have no child elements. Default: false.
@@ -118,31 +121,20 @@ type WriteSettings struct {
118121
// false.
119122
AttrSingleQuote bool
120123

121-
// UseCRLF causes the document's Indent* methods to use a carriage return
124+
// UseCRLF causes the document's Indent* functions to use a carriage return
122125
// followed by a linefeed ("\r\n") when outputting a newline. If false,
123126
// only a linefeed is used ("\n"). Default: false.
124127
//
125128
// Deprecated: UseCRLF is deprecated. Use IndentSettings.UseCRLF instead.
126129
UseCRLF bool
127130
}
128131

129-
// newWriteSettings creates a default WriteSettings record.
130-
func newWriteSettings() WriteSettings {
131-
return WriteSettings{
132-
CanonicalEndTags: false,
133-
CanonicalText: false,
134-
CanonicalAttrVal: false,
135-
AttrSingleQuote: false,
136-
UseCRLF: false,
137-
}
138-
}
139-
140132
// dup creates a duplicate of the WriteSettings object.
141133
func (s *WriteSettings) dup() WriteSettings {
142134
return *s
143135
}
144136

145-
// IndentSettings determine the behavior of the Document's Indent* methods.
137+
// IndentSettings determine the behavior of the Document's Indent* functions.
146138
type IndentSettings struct {
147139
// Spaces indicates the number of spaces to insert for each level of
148140
// indentation. Set to etree.NoIndent to remove all indentation. Ignored
@@ -158,7 +150,7 @@ type IndentSettings struct {
158150
// for a newline ("\n"). Default: false.
159151
UseCRLF bool
160152

161-
// PreserveLeafWhitespace causes indent methods to preserve whitespace
153+
// PreserveLeafWhitespace causes indent functions to preserve whitespace
162154
// within XML elements containing only non-CDATA character data. Default:
163155
// false.
164156
PreserveLeafWhitespace bool
@@ -200,7 +192,7 @@ func getIndentFunc(s *IndentSettings) indentFunc {
200192
}
201193
}
202194

203-
// Writer is the interface that wraps the Write* methods called by each token
195+
// Writer is the interface that wraps the Write* functions called by each token
204196
// type's WriteTo function.
205197
type Writer interface {
206198
io.StringWriter
@@ -265,7 +257,7 @@ const (
265257

266258
// CharData may be used to represent simple text data or a CDATA section
267259
// within an XML document. The Data property should never be modified
268-
// directly; use the SetData method instead.
260+
// directly; use the SetData function instead.
269261
type CharData struct {
270262
Data string // the simple text or CDATA section content
271263
parent *Element
@@ -298,9 +290,7 @@ type ProcInst struct {
298290
// NewDocument creates an XML document without a root element.
299291
func NewDocument() *Document {
300292
return &Document{
301-
Element: Element{Child: make([]Token, 0)},
302-
ReadSettings: newReadSettings(),
303-
WriteSettings: newWriteSettings(),
293+
Element: Element{Child: make([]Token, 0)},
304294
}
305295
}
306296

@@ -433,6 +423,9 @@ func validateXML(r io.Reader, settings ReadSettings) error {
433423
func newDecoder(r io.Reader, settings ReadSettings) *xml.Decoder {
434424
d := xml.NewDecoder(r)
435425
d.CharsetReader = settings.CharsetReader
426+
if d.CharsetReader == nil {
427+
d.CharsetReader = defaultCharsetReader
428+
}
436429
d.Strict = !settings.Permissive
437430
d.Entity = settings.Entity
438431
d.AutoClose = settings.AutoClose

etree_test.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -257,16 +257,14 @@ func TestDocumentCharsetReader(t *testing.T) {
257257
</Book>
258258
</Store>`
259259

260-
charsetLabel := ""
261260
doc := newDocumentFromString2(t, s, ReadSettings{
262261
CharsetReader: func(label string, input io.Reader) (io.Reader, error) {
263-
charsetLabel = label
264-
return &lowercaseCharsetReader{input}, nil
262+
if label == "lowercase" {
263+
return &lowercaseCharsetReader{input}, nil
264+
}
265+
return nil, errors.New("unknown charset")
265266
},
266267
})
267-
if charsetLabel != "lowercase" {
268-
t.Fatalf("etree: incorrect charset encoding, expected lowercase, got %s", charsetLabel)
269-
}
270268

271269
cases := []struct {
272270
path string
@@ -772,9 +770,13 @@ func TestSortAttrs(t *testing.T) {
772770
checkStrEq(t, out, `<el AAA="1" Foo="2" a01="3" aaa="4" foo="5" z="6" สวัสดี="7" a:AAA="8" a:ZZZ="9"/>`+"\n")
773771
}
774772

775-
func TestCharsetReaderEncoding(t *testing.T) {
773+
func TestCharsetReaderDefaultSetting(t *testing.T) {
774+
// Test encodings where the default pass-through charset conversion
775+
// should work for common single-byte character encodings.
776776
cases := []string{
777+
`<?xml version="1.0"?><foo></foo>`,
777778
`<?xml version="1.0" encoding="ISO-8859-1"?><foo></foo>`,
779+
`<?xml version="1.0" encoding="Windows-1252"?><foo></foo>`,
778780
`<?xml version="1.0" encoding="UTF-8"?><foo></foo>`,
779781
`<?xml version="1.0" encoding="US-ASCII"?><foo></foo>`,
780782
}

0 commit comments

Comments
 (0)