Skip to content

Commit 89f608f

Browse files
committed
improve parser to use string instead of []rune
1 parent 10e4040 commit 89f608f

File tree

3 files changed: +37 additions, -54 deletions (lines changed)

3 files changed: +37 additions, -54 deletions (lines changed)

parser/lexer/lexer.go

Lines changed: 28 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,18 @@ package lexer
33
import (
44
"fmt"
55
"strings"
6+
"unicode/utf8"
67

78
"github.com/expr-lang/expr/file"
89
)
910

11+
const minTokens = 10
12+
1013
func Lex(source file.Source) ([]Token, error) {
1114
raw := source.String()
1215
l := &lexer{
1316
raw: raw,
14-
runes: []rune(raw),
15-
tokens: make([]Token, 0),
17+
tokens: make([]Token, 0, minTokens),
1618
}
1719

1820
for state := root; state != nil; {
@@ -28,7 +30,6 @@ func Lex(source file.Source) ([]Token, error) {
2830

2931
type lexer struct {
3032
raw string
31-
runes []rune
3233
tokens []Token
3334
err *file.Error
3435
start, end pos
@@ -46,25 +47,37 @@ func (l *lexer) commit() {
4647
}
4748

4849
func (l *lexer) next() rune {
49-
if l.end.rune >= len(l.runes) {
50+
if l.end.byte >= len(l.raw) {
5051
l.eof = true
5152
return eof
5253
}
53-
r := l.runes[l.end.rune]
54+
r, sz := utf8.DecodeRuneInString(l.raw[l.end.byte:])
5455
l.end.rune++
56+
l.end.byte += sz
5557
return r
5658
}
5759

5860
func (l *lexer) peek() rune {
59-
r := l.next()
60-
l.backup()
61-
return r
61+
if l.end.byte < len(l.raw) {
62+
r, _ := utf8.DecodeRuneInString(l.raw[l.end.byte:])
63+
return r
64+
}
65+
return eof
66+
}
67+
68+
func (l *lexer) peekByte() (byte, bool) {
69+
if l.end.byte >= 0 && l.end.byte < len(l.raw) {
70+
return l.raw[l.end.byte], true
71+
}
72+
return 0, false
6273
}
6374

6475
func (l *lexer) backup() {
6576
if l.eof {
6677
l.eof = false
67-
} else {
78+
} else if l.end.rune > 0 {
79+
_, sz := utf8.DecodeLastRuneInString(l.raw[:l.end.byte])
80+
l.end.byte -= sz
6881
l.end.rune--
6982
}
7083
}
@@ -103,54 +116,27 @@ func (l *lexer) skip() {
103116
}
104117

105118
func (l *lexer) word() string {
106-
// TODO: boundary check is NOT needed here, but for some reason CI fuzz tests are failing.
107-
if l.start.rune > len(l.runes) || l.end.rune > len(l.runes) {
108-
return "__invalid__"
109-
}
110-
return string(l.runes[l.start.rune:l.end.rune])
119+
return l.raw[l.start.byte:l.end.byte]
111120
}
112121

113122
func (l *lexer) accept(valid string) bool {
114-
if strings.ContainsRune(valid, l.next()) {
123+
if strings.ContainsRune(valid, l.peek()) {
124+
l.next()
115125
return true
116126
}
117-
l.backup()
118127
return false
119128
}
120129

121130
func (l *lexer) acceptRun(valid string) {
122-
for strings.ContainsRune(valid, l.next()) {
131+
for l.accept(valid) {
123132
}
124-
l.backup()
125133
}
126134

127135
func (l *lexer) skipSpaces() {
128-
r := l.peek()
129-
for ; r == ' '; r = l.peek() {
130-
l.next()
131-
}
136+
l.acceptRun(" ")
132137
l.skip()
133138
}
134139

135-
func (l *lexer) acceptWord(word string) bool {
136-
pos := l.end
137-
138-
l.skipSpaces()
139-
140-
for _, ch := range word {
141-
if l.next() != ch {
142-
l.end = pos
143-
return false
144-
}
145-
}
146-
if r := l.peek(); r != ' ' && r != eof {
147-
l.end = pos
148-
return false
149-
}
150-
151-
return true
152-
}
153-
154140
func (l *lexer) error(format string, args ...any) stateFn {
155141
if l.err == nil { // show first error
156142
end := l.end.rune
@@ -238,6 +224,6 @@ func (l *lexer) scanRawString(quote rune) (n int) {
238224
ch = l.next()
239225
n++
240226
}
241-
l.emitValue(String, string(l.runes[l.start.rune+1:l.end.rune-1]))
227+
l.emitValue(String, l.raw[l.start.byte+1:l.end.byte-1])
242228
return
243229
}

parser/lexer/token.go

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -31,17 +31,13 @@ func (t Token) String() string {
3131
}
3232

3333
func (t Token) Is(kind Kind, values ...string) bool {
34-
if len(values) == 0 {
35-
return kind == t.Kind
34+
if kind != t.Kind {
35+
return false
3636
}
37-
3837
for _, v := range values {
3938
if v == t.Value {
40-
goto found
39+
return true
4140
}
4241
}
43-
return false
44-
45-
found:
46-
return kind == t.Kind
42+
return len(values) == 0
4743
}

parser/lexer/utils.go

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -36,21 +36,22 @@ func unescape(value string) (string, error) {
3636
if size >= math.MaxInt {
3737
return "", fmt.Errorf("too large string")
3838
}
39-
buf := make([]byte, 0, size)
39+
buf := new(strings.Builder)
40+
buf.Grow(int(size))
4041
for len(value) > 0 {
4142
c, multibyte, rest, err := unescapeChar(value)
4243
if err != nil {
4344
return "", err
4445
}
4546
value = rest
4647
if c < utf8.RuneSelf || !multibyte {
47-
buf = append(buf, byte(c))
48+
buf.WriteByte(byte(c))
4849
} else {
4950
n := utf8.EncodeRune(runeTmp[:], c)
50-
buf = append(buf, runeTmp[:n]...)
51+
buf.Write(runeTmp[:n])
5152
}
5253
}
53-
return string(buf), nil
54+
return buf.String(), nil
5455
}
5556

5657
// unescapeChar takes a string input and returns the following info:

0 commit comments

Comments
 (0)