Skip to content

Commit da33d26

Browse files
implement OrigData, OrigKey and Offset with basic tests
1 parent 118aa28 commit da33d26

File tree

5 files changed

+484
-40
lines changed

5 files changed

+484
-40
lines changed

case-and-offset_test.go

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
package html
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
"io"
7+
"strconv"
8+
"testing"
9+
10+
atom "github.com/vugu/html/atom"
11+
)
12+
13+
// TODO: figure out where this goes in the test suite
14+
15+
func TestTokenizerOffset(t *testing.T) {
16+
17+
var buf bytes.Buffer
18+
linelen := 34 // includes newline
19+
fmt.Fprintf(&buf, "<html> \n")
20+
fmt.Fprintf(&buf, "<body> \n")
21+
for i := 0; i < 20000; i++ { // make this high enough to force reallocation and reading in blocks
22+
fmt.Fprintf(&buf, "<div id=\"%08d\">%08d</div>\n", i, i)
23+
}
24+
fmt.Fprintf(&buf, "</body> \n")
25+
fmt.Fprintf(&buf, "</html> \n")
26+
fmt.Fprintf(&buf, " \n")
27+
fmt.Fprintf(&buf, " \n")
28+
29+
// log.Printf("DATA:\n%s", buf.Bytes())
30+
31+
divnum := 0
32+
33+
z := NewTokenizer(bytes.NewReader(buf.Bytes()))
34+
loop:
35+
for {
36+
tt := z.Next()
37+
switch tt {
38+
39+
case ErrorToken:
40+
if z.Err() == io.EOF {
41+
break loop
42+
}
43+
t.Error(z.Err())
44+
t.FailNow()
45+
46+
case TextToken:
47+
48+
zText := z.Text()
49+
if zText[0] == '0' {
50+
51+
vi := -1
52+
fmt.Sscanf(string(zText), "%d", &vi)
53+
54+
zoff := z.Offset()
55+
if (float64(zoff-19)/float64(linelen))-2 != float64(vi) {
56+
t.Logf("BAD TEXT OFFSET: zoff = %d, vi = %d", zoff, vi)
57+
t.Fail()
58+
}
59+
60+
}
61+
62+
case StartTagToken:
63+
tn, _ := z.TagName()
64+
65+
if bytes.Compare(tn, []byte("div")) == 0 {
66+
67+
k, v, _ := z.TagAttr()
68+
if bytes.Compare(k, []byte("id")) != 0 {
69+
t.Errorf("unknown k: %s", k)
70+
}
71+
72+
vi := -1
73+
fmt.Sscanf(string(v), "%d", &vi)
74+
75+
zoff := z.Offset()
76+
if (float64(zoff)/float64(linelen))-2 != float64(vi) {
77+
t.Logf("BAD DIV OFFSET: zoff = %d, vi = %d", zoff, vi)
78+
t.Fail()
79+
}
80+
81+
divnum++
82+
}
83+
84+
case EndTagToken:
85+
86+
tn, _ := z.TagName()
87+
if bytes.Compare(tn, []byte("div")) == 0 {
88+
zoff := z.Offset()
89+
if (float64(zoff-27)/float64(linelen))-2 != float64(divnum-1) {
90+
b := buf.Bytes()[zoff : zoff+24]
91+
t.Logf("BAD DIV CLOSE OFFSET: zoff = %d, divnum = %d (bytes at offset: %q)", zoff, divnum, b)
92+
t.Fail()
93+
}
94+
}
95+
96+
}
97+
}
98+
99+
}
100+
101+
func TestParserOffset(t *testing.T) {
102+
103+
var buf bytes.Buffer
104+
// linelen := 34 // includes newline
105+
fmt.Fprintf(&buf, "<html> \n")
106+
fmt.Fprintf(&buf, "<body> \n")
107+
for i := 0; i < 20; i++ { // make this high enough to force reallocation and reading in blocks
108+
fmt.Fprintf(&buf, "<div id=\"%08d\">%08d</div>\n", i, i)
109+
}
110+
fmt.Fprintf(&buf, "</body> \n")
111+
fmt.Fprintf(&buf, "</html> \n")
112+
fmt.Fprintf(&buf, " \n")
113+
fmt.Fprintf(&buf, " \n")
114+
115+
n, err := Parse(bytes.NewReader(buf.Bytes()))
116+
if err != nil {
117+
t.Error(err)
118+
t.FailNow()
119+
}
120+
121+
var visit func(n *Node)
122+
visit = func(n *Node) {
123+
124+
// t.Logf("Node Type=%v, Data=%q, DataAtom=%v", n.Type, n.Data, n.DataAtom)
125+
// for _, a := range n.Attr {
126+
// t.Logf(" %s=%q", a.Key, a.Val)
127+
// }
128+
129+
if n.DataAtom == atom.Div {
130+
var idv int
131+
for _, a := range n.Attr {
132+
if a.Key == "id" {
133+
idv, _ = strconv.Atoi(a.Val)
134+
break
135+
136+
}
137+
}
138+
t.Logf("n.Offset = %d, idv = %d", n.Offset, idv)
139+
// n.Offset
140+
}
141+
142+
if n.FirstChild != nil {
143+
visit(n.FirstChild)
144+
}
145+
if n.NextSibling != nil {
146+
visit(n.NextSibling)
147+
}
148+
}
149+
visit(n)
150+
151+
}
152+
153+
// func TestTokenizerPreserveCase(t *testing.T) {
154+
155+
// inHTML := `<!doctype html>
156+
// <html>
157+
// <body>
158+
// <Div id="test1" Class="something"></Div> <!-- tag that matches an Atom -->
159+
// <some-Other id="test2" othER-attr="blah"></some-Other> <!-- random tag not an Atom -->
160+
// Some other random text here.
161+
// </body>
162+
// </html>`
163+
// inHTMLB := []byte(inHTML)
164+
// // defer func() {
165+
// // t.Logf("inHTMLB after: %s", inHTMLB)
166+
// // }()
167+
168+
// z := NewTokenizer(bytes.NewReader(inHTMLB))
169+
// // z.PreserveCase(true)
170+
171+
// loop:
172+
// for {
173+
// tt := z.Next()
174+
// t.Logf("Offset: %d", z.Offset())
175+
// switch tt {
176+
// case ErrorToken:
177+
// if z.Err() == io.EOF {
178+
// break loop
179+
// }
180+
// t.Error(z.Err())
181+
// t.FailNow()
182+
// case TextToken:
183+
// t.Logf("TextToken: %s", z.Text())
184+
// case StartTagToken:
185+
// tn, tno, ha := z.TagNameAndOrig()
186+
// t.Logf("StartTagToken, tag=%s origTagName=%s, hasAttr=%v", tn, tno, ha)
187+
// tns := string(tn)
188+
189+
// if tns == "some-other" || tns == "div" {
190+
// t.Logf("Should not have gotten lower case %q as element name", tns)
191+
// t.Fail()
192+
// }
193+
194+
// var k, ko, v []byte
195+
// for ha {
196+
// k, ko, v, ha = z.TagAttrAndOrig()
197+
// t.Logf(" attr: %s (orig=%s) = %q", k, ko, v)
198+
// if string(k) == "class" {
199+
// t.Logf("Should not have gotten lower case %q as attribute name", string(k))
200+
// t.Fail()
201+
// }
202+
// }
203+
204+
// case EndTagToken:
205+
// tn, tno, _ := z.TagNameAndOrig()
206+
// t.Logf("EndTagToken, tag=%s (origTagName=%s)", tn, tno)
207+
// default:
208+
// t.Logf("Other Token: %v", tt)
209+
// }
210+
// }
211+
212+
// }
213+
214+
func TestParserPreserveCase(t *testing.T) {
215+
216+
if s := atom.String([]byte("Div")); s != "Div" {
217+
t.Logf("atom.String() returned %q instead of Div", s)
218+
t.Fail()
219+
}
220+
221+
inHTML := `<!doctype html>
222+
<html>
223+
<body>
224+
<Div id="test1" Class="something"></Div> <!-- tag that matches an Atom -->
225+
<some-Other id="test2" othER-attr="blah"></some-Other> <!-- random tag not an Atom -->
226+
Some other random text here.
227+
</body>
228+
</html>`
229+
inHTMLB := []byte(inHTML)
230+
231+
node, err := ParseWithOptions(bytes.NewReader(inHTMLB))
232+
if err != nil {
233+
t.Error(err)
234+
t.FailNow()
235+
}
236+
237+
checked := 0
238+
239+
var visit func(n *Node)
240+
visit = func(n *Node) {
241+
242+
if n.DataAtom == atom.Div {
243+
if n.OrigData != "Div" {
244+
t.Logf("Expected Div got: %q", n.OrigData)
245+
t.Fail()
246+
}
247+
checked++
248+
for _, a := range n.Attr {
249+
if a.Key == "class" {
250+
if a.OrigKey != "Class" {
251+
t.Logf("Expected Class got: %q", a.OrigKey)
252+
t.Fail()
253+
}
254+
checked++
255+
}
256+
}
257+
}
258+
259+
if n.Data == "some-other" {
260+
if n.OrigData != "some-Other" {
261+
t.Logf("Expected some-Other got: %q", n.OrigData)
262+
t.Fail()
263+
}
264+
checked++
265+
for _, a := range n.Attr {
266+
if a.Key == "other-attr" {
267+
if a.OrigKey != "othER-attr" {
268+
t.Logf("Expected othER-attr got: %q", a.OrigKey)
269+
t.Fail()
270+
}
271+
checked++
272+
}
273+
}
274+
}
275+
276+
if n.FirstChild != nil {
277+
visit(n.FirstChild)
278+
}
279+
if n.NextSibling != nil {
280+
visit(n.NextSibling)
281+
}
282+
}
283+
visit(node)
284+
285+
if checked != 4 {
286+
t.Errorf("expected 4 checks but did %v", checked)
287+
t.Fail()
288+
}
289+
290+
}
291+
292+
// func TestParserPreserveCase(t *testing.T) {
293+
294+
// inHTML := `<!doctype html>
295+
// <html>
296+
// <body>
297+
// <Div id="test1" Class="something"></Div> <!-- tag that matches an Atom -->
298+
// <some-Other id="test2" othER-attr="blah"></some-Other> <!-- random tag not an Atom -->
299+
// Some other random text here.
300+
// </body>
301+
// </html>`
302+
// inHTMLB := []byte(inHTML)
303+
304+
// // node, err := ParseWithOptions(bytes.NewReader(inHTMLB), ParseOptionPreserveCase(true))
305+
// node, err := ParseWithOptions(bytes.NewReader(inHTMLB))
306+
// if err != nil {
307+
// t.Error(err)
308+
// t.FailNow()
309+
// }
310+
311+
// var visit func(n *Node)
312+
// visit = func(n *Node) {
313+
// t.Logf("Node Type=%v, Data=%q, DataAtom=%v", n.Type, n.Data, n.DataAtom)
314+
// for _, a := range n.Attr {
315+
// t.Logf(" %s=%q", a.Key, a.Val)
316+
// }
317+
// if n.FirstChild != nil {
318+
// visit(n.FirstChild)
319+
// }
320+
// if n.NextSibling != nil {
321+
// visit(n.NextSibling)
322+
// }
323+
// }
324+
// visit(node)
325+
326+
// }

escape.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,26 @@ func lower(b []byte) []byte {
193193
return b
194194
}
195195

196+
// lowerCopy lower-cases the ASCII letters of b in place and also hands back
// the bytes as they were before the change.
//
// When b contains at least one upper-case ASCII letter, original is a fresh
// copy of b taken before any byte is modified and lowered is b itself, now
// lower-cased. When b has nothing to lower-case, no copy is made and both
// results are b.
func lowerCopy(b []byte) (lowered, original []byte) {
	// Locate the first byte that needs changing; everything before it is
	// already in its final form.
	firstUpper := -1
	for i := 0; i < len(b); i++ {
		if b[i] >= 'A' && b[i] <= 'Z' {
			firstUpper = i
			break
		}
	}
	if firstUpper < 0 {
		return b, b
	}

	// Snapshot the input before mutating it.
	snapshot := make([]byte, len(b))
	copy(snapshot, b)

	for i := firstUpper; i < len(b); i++ {
		if ch := b[i]; ch >= 'A' && ch <= 'Z' {
			b[i] = ch + ('a' - 'A')
		}
	}
	return b, snapshot
}
215+
196216
const escapedChars = "&'<>\"\r"
197217

198218
func escape(w writer, s string) error {

node.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ type Node struct {
4444
Data string
4545
Namespace string
4646
Attr []Attribute
47+
48+
OrigData string // OrigData is original case
49+
Offset int // offset is the starting byte offset into the original input
4750
}
4851

4952
// InsertBefore inserts newChild as a child of n, immediately before oldChild
@@ -139,6 +142,8 @@ func (n *Node) clone() *Node {
139142
DataAtom: n.DataAtom,
140143
Data: n.Data,
141144
Attr: make([]Attribute, len(n.Attr)),
145+
OrigData: n.OrigData,
146+
Offset: n.Offset,
142147
}
143148
copy(m.Attr, n.Attr)
144149
return m

0 commit comments

Comments
 (0)