@@ -3,16 +3,18 @@ package lexer
 import (
 	"fmt"
 	"strings"
+	"unicode/utf8"
 
 	"github.com/expr-lang/expr/file"
 )
 
+const minTokens = 10
+
 func Lex(source file.Source) ([]Token, error) {
 	raw := source.String()
 	l := &lexer{
 		raw:    raw,
-		runes:  []rune(raw),
-		tokens: make([]Token, 0),
+		tokens: make([]Token, 0, minTokens),
 	}
 
 	for state := root; state != nil; {
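The deleted runes field means Lex no longer copies the whole input into a []rune slice before scanning; runes are now decoded lazily from raw, and the token slice is pre-sized with minTokens. A minimal standalone sketch of the two approaches (the sample input string is made up for illustration, not taken from the package's tests):

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	src := `upper(name) == "ПРИВЕТ"` // input with multi-byte runes

	// Eager approach: converting to []rune copies every code point up front.
	runes := []rune(src)
	fmt.Println(len(runes), "runes copied")

	// Lazy approach: decode one rune at a time straight from the string,
	// tracking a byte offset instead of a rune index.
	for i := 0; i < len(src); {
		_, size := utf8.DecodeRuneInString(src[i:])
		i += size
	}
	fmt.Println(len(src), "bytes scanned without an extra copy")
}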
@@ -28,7 +30,6 @@ func Lex(source file.Source) ([]Token, error) {
 
 type lexer struct {
 	raw        string
-	runes      []rune
 	tokens     []Token
 	err        *file.Error
 	start, end pos
@@ -46,25 +47,37 @@ func (l *lexer) commit() {
 }
 
 func (l *lexer) next() rune {
-	if l.end.rune >= len(l.runes) {
+	if l.end.byte >= len(l.raw) {
 		l.eof = true
 		return eof
 	}
-	r := l.runes[l.end.rune]
+	r, sz := utf8.DecodeRuneInString(l.raw[l.end.byte:])
 	l.end.rune++
+	l.end.byte += sz
 	return r
 }
 
 func (l *lexer) peek() rune {
-	r := l.next()
-	l.backup()
-	return r
+	if l.end.byte < len(l.raw) {
+		r, _ := utf8.DecodeRuneInString(l.raw[l.end.byte:])
+		return r
+	}
+	return eof
+}
+
+func (l *lexer) peekByte() (byte, bool) {
+	if l.end.byte >= 0 && l.end.byte < len(l.raw) {
+		return l.raw[l.end.byte], true
+	}
+	return 0, false
 }
 
 func (l *lexer) backup() {
 	if l.eof {
 		l.eof = false
-	} else {
+	} else if l.end.rune > 0 {
+		_, sz := utf8.DecodeLastRuneInString(l.raw[:l.end.byte])
+		l.end.byte -= sz
 		l.end.rune--
 	}
 }
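next and backup now keep a byte offset and a rune count in sync: utf8.DecodeRuneInString moves the byte offset forward by the size of the decoded rune, and utf8.DecodeLastRuneInString moves it back. A standalone sketch of that pattern; the cursor type and its field names are illustrative stand-ins for the lexer's pos bookkeeping, not part of the package:

package main

import (
	"fmt"
	"unicode/utf8"
)

// cursor mirrors the lexer's position bookkeeping: a byte offset for
// slicing the source and a rune count for reporting error positions.
type cursor struct {
	byteOff, runeOff int
}

func (c *cursor) next(s string) rune {
	if c.byteOff >= len(s) {
		return -1 // eof
	}
	r, size := utf8.DecodeRuneInString(s[c.byteOff:])
	c.byteOff += size
	c.runeOff++
	return r
}

func (c *cursor) backup(s string) {
	if c.runeOff > 0 {
		_, size := utf8.DecodeLastRuneInString(s[:c.byteOff])
		c.byteOff -= size
		c.runeOff--
	}
}

func main() {
	s := "a€b" // '€' is 3 bytes but 1 rune
	var c cursor
	c.next(s)
	c.next(s)
	fmt.Println(c.byteOff, c.runeOff) // 4 2
	c.backup(s)
	fmt.Println(c.byteOff, c.runeOff) // 1 1
}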
@@ -103,54 +116,27 @@ func (l *lexer) skip() {
 }
 
 func (l *lexer) word() string {
-	// TODO: boundary check is NOT needed here, but for some reason CI fuzz tests are failing.
-	if l.start.rune > len(l.runes) || l.end.rune > len(l.runes) {
-		return "__invalid__"
-	}
-	return string(l.runes[l.start.rune:l.end.rune])
+	return l.raw[l.start.byte:l.end.byte]
 }
 
 func (l *lexer) accept(valid string) bool {
-	if strings.ContainsRune(valid, l.next()) {
+	if strings.ContainsRune(valid, l.peek()) {
+		l.next()
 		return true
 	}
-	l.backup()
 	return false
 }
 
 func (l *lexer) acceptRun(valid string) {
-	for strings.ContainsRune(valid, l.next()) {
+	for l.accept(valid) {
 	}
-	l.backup()
 }
 
 func (l *lexer) skipSpaces() {
-	r := l.peek()
-	for ; r == ' '; r = l.peek() {
-		l.next()
-	}
+	l.acceptRun(" ")
 	l.skip()
 }
 
-func (l *lexer) acceptWord(word string) bool {
-	pos := l.end
-
-	l.skipSpaces()
-
-	for _, ch := range word {
-		if l.next() != ch {
-			l.end = pos
-			return false
-		}
-	}
-	if r := l.peek(); r != ' ' && r != eof {
-		l.end = pos
-		return false
-	}
-
-	return true
-}
-
 func (l *lexer) error(format string, args ...any) stateFn {
 	if l.err == nil { // show first error
 		end := l.end.rune
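word now returns a plain substring of raw sliced by byte offsets, which yields exactly the same text as the old rune-slice conversion while avoiding the copy, since token boundaries are always recorded at rune boundaries. A small sketch of that equivalence; the byte and rune offsets are hard-coded for illustration only:

package main

import "fmt"

func main() {
	raw := `x == "héllo"` // the 'é' occupies two bytes
	runes := []rune(raw)

	// Old word(): build a new string from a rune sub-slice (allocates).
	old := string(runes[5:12])

	// New word(): slice the source by byte offsets (no copy). The byte
	// offsets diverge from the rune indexes after a multi-byte rune.
	now := raw[5:13]

	fmt.Println(old, now, old == now) // "héllo" "héllo" true
}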
@@ -238,6 +224,6 @@ func (l *lexer) scanRawString(quote rune) (n int) {
 		ch = l.next()
 		n++
 	}
-	l.emitValue(String, string(l.runes[l.start.rune+1:l.end.rune-1]))
+	l.emitValue(String, l.raw[l.start.byte+1:l.end.byte-1])
 	return
 }