Skip to content

Commit 1076a53

Browse files
committed
compiler: use converted strings internally
* tokenizer parses and converts the source strings * use `Token.text_idx` only for strings and comments * `Token.text_len` no longer counts the null terminator, as the name implies * add `Token.name_idx` for consistency for other uses * simplify multi-string concatenation * accept Unicode escape sequences in strings and character constants * report invalid UTF-8 sequences in strings and character constants * add `StringLiteral.getTextIndex()` * add `string_buffer.Buf.encodeBytes()` to encode string fragments with backslash escapes * add `string_buffer.Buf.add_utf8()` to encode Unicode code points in UTF-8 * add **utf8** module for encoding and decoding Unicode code points * allow up to 255 bytes in string `case` labels * detect and report duplicate string `case` labels * detect and report string `case` labels with embedded null bytes * encode plugins/shell_cmd_plugin.c2 process output * add tests
1 parent 269810a commit 1076a53

18 files changed

+491
-239
lines changed

analyser/module_analyser_switch.c2

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import init_checker;
2121
import src_loc local;
2222
import scope;
2323
import string_buffer;
24+
import string;
2425

2526
fn void Analyser.analyseSwitchStmt(Analyser* ma, Stmt* s) {
2627
SwitchStmt* sw = cast<SwitchStmt*>(s);
@@ -58,6 +59,7 @@ fn void Analyser.analyseSwitchStmt(Analyser* ma, Stmt* s) {
5859

5960
init_checker.Checker checker = init_checker.Checker.create(numCases);
6061

62+
bool ok = true;
6163
for (u32 i=0; i<numCases; i++) {
6264
SwitchCase* c = cases[i];
6365
bool is_last = (i+1 == numCases);
@@ -79,11 +81,12 @@ fn void Analyser.analyseSwitchStmt(Analyser* ma, Stmt* s) {
7981
}
8082
}
8183

82-
bool ok = ma.analyseCase(c, &checker, etd, is_string);
84+
ok &= ma.analyseCase(c, &checker, etd, is_string);
8385
ma.scope.exit(ma.has_error);
84-
if (!ok) return;
8586
}
8687

88+
if (!ok) return;
89+
8790
ma.scope.exit(ma.has_error);
8891

8992
if (etd) {
@@ -202,25 +205,44 @@ fn bool Analyser.analyseCaseCondition(Analyser* ma,
202205
} else {
203206
Expr* orig = c.getCond();
204207
QualType qt = ma.analyseExpr(c.getCond2(), true, RHS);
208+
205209
if (qt.isInvalid()) return false;
206210
cond.setType(qt);
207211

208212
if (is_string) {
213+
u32 index;
209214
if (orig.isNil()) {
210-
// TODO: check for duplicate nil
215+
index = 0;
216+
SrcLoc duplicate = checker.find(index);
217+
if (duplicate) {
218+
ma.errorRange(cond.getLoc(), cond.getRange(), "duplicate case value nil");
219+
ma.note(duplicate, "previous case is here");
220+
return false;
221+
}
211222
} else
212223
if (orig.isStringLiteral()) {
213224
StringLiteral* lit = cast<StringLiteral*>(orig);
214-
if (lit.getSize() > 255) {
215-
ma.error(cond.getLoc(), "string switch case string is loo long (max 254 bytes)");
225+
u32 len = lit.getSize() - 1;
226+
if (len > 255) {
227+
ma.error(cond.getLoc(), "string switch case string is loo long (max 255 bytes)");
228+
return false;
229+
}
230+
if (string.memchr(lit.getText(), 0, len)) {
231+
ma.error(cond.getLoc(), "case string value has embedded null byte");
232+
return false;
233+
}
234+
index = lit.getTextIndex();
235+
SrcLoc duplicate = checker.find(index);
236+
if (duplicate) {
237+
ma.errorRange(cond.getLoc(), cond.getRange(), "duplicate case string");
238+
ma.note(duplicate, "previous case is here");
216239
return false;
217240
}
218-
// TODO: check for embedded null bytes
219-
// TODO: check for duplicate string
220241
} else {
221242
ma.error(cond.getLoc(), "string switch case can only have a string literal or nil as condition");
222243
return false;
223244
}
245+
checker.add(index, cond.getLoc());
224246
} else {
225247
if (!cond.isCtv()) {
226248
ma.error(cond.getLoc(), "case condition is not compile-time constant");

ast/string_literal.c2

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,30 @@ public fn StringLiteral* StringLiteral.create(ast_context.Context* c, SrcLoc loc
2929
StringLiteral* e = c.alloc(sizeof(StringLiteral));
3030
e.base.init(ExprKind.StringLiteral, loc, 0, 1, 0, ValType.LValue);
3131
e.value = value;
32-
e.size = len; // len includes the null terminator
32+
e.size = len + 1; // size includes the null terminator
3333
#if AstStatistics
3434
Stats.addExpr(ExprKind.StringLiteral, sizeof(StringLiteral));
3535
#endif
36-
e.base.setType(getStringType(len));
36+
e.base.setType(getStringType(len + 1));
3737
return e;
3838
}
3939

4040
public fn const char* StringLiteral.getText(const StringLiteral* e) {
4141
return idx2name(e.value);
4242
}
4343

44+
public fn u32 StringLiteral.getTextIndex(const StringLiteral* e) {
45+
return e.value;
46+
}
47+
4448
public fn u32 StringLiteral.getSize(const StringLiteral* e) {
4549
return e.size;
4650
}
4751

4852
public fn void StringLiteral.printLiteral(const StringLiteral* e, string_buffer.Buf* out) {
49-
out.print("\"%s\"", idx2name(e.value));
53+
out.add1('"');
54+
out.encodeBytes(idx2name(e.value), e.size - 1, '"');
55+
out.add1('"');
5056
}
5157

5258
fn void StringLiteral.print(const StringLiteral* e, string_buffer.Buf* out, u32 indent) {

ast_utils/string_buffer.c2

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import stdio local;
1919
import stdarg local;
2020
import stdlib local;
2121
import string local;
22+
import utf8;
2223

2324
public type Buf struct @(opaque) {
2425
u32 capacity;
@@ -78,10 +79,7 @@ public fn void Buf.clear(Buf* buf) {
7879
}
7980

8081
public fn void Buf.color(Buf* buf, const char* color) {
81-
if (!buf.colors) return;
82-
83-
u32 len = cast<u32>(strlen(color));
84-
buf.add2(color, len);
82+
if (buf.colors) buf.add(color);
8583
}
8684

8785
public fn void Buf.add1(Buf* buf, char c) {
@@ -191,3 +189,59 @@ public fn void Buf.stripNewline(Buf* buf) {
191189
}
192190
}
193191

192+
public fn u32 Buf.add_utf8(Buf* buf, u32 cc) {
193+
char[4] tab;
194+
u32 clen = utf8.encode(tab, elemsof(tab), cc);
195+
buf.add2(tab, clen);
196+
return clen;
197+
}
198+
199+
public fn u32 Buf.encodeBytes(Buf* buf, const char *p, u32 len, char sep) {
200+
u32 size = buf.size_;
201+
u32 copy = 0;
202+
const char* end = p + len;
203+
while (p < end) {
204+
u8 c = *p++;
205+
switch (c) {
206+
case '\a': c = 'a'; goto add_char;
207+
case '\b': c = 'b'; goto add_char;
208+
case '\f': c = 'f'; goto add_char;
209+
case '\n': c = 'n'; goto add_char;
210+
case '\r': c = 'r'; goto add_char;
211+
case '\t': c = 't'; goto add_char;
212+
case '\v': c = 'v'; goto add_char;
213+
case '"':
214+
case '\'':
215+
if (sep && sep != c) goto normal;
216+
fallthrough;
217+
case '\\':
218+
add_char:
219+
if (copy) buf.add2(p - copy - 1, copy);
220+
buf.add1('\\');
221+
buf.add1(c);
222+
copy = 0;
223+
break;
224+
default:
225+
if (c < ' ' || c >= 0x7F) {
226+
char[4] arr;
227+
if (copy) buf.add2(p - copy - 1, copy);
228+
arr[0] = '\\';
229+
arr[1] = '0' + ((c >> 6) & 7);
230+
arr[2] = '0' + ((c >> 3) & 7);
231+
arr[3] = '0' + (c & 7);
232+
u32 esc_len = 4;
233+
// special case \0 not followed by another digit
234+
if (c == 0 && (p == end || !(*p >= '0' && *p <= '9')))
235+
esc_len = 2;
236+
buf.add2(arr, esc_len);
237+
copy = 0;
238+
break;
239+
}
240+
normal:
241+
copy++;
242+
break;
243+
}
244+
}
245+
if (copy) buf.add2(p - copy, copy);
246+
return buf.size_ - size;
247+
}

common/utf8.c2

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
/* Copyright 2022-2025 Charlie Gordon
2+
*
3+
* Licensed under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License.
5+
* You may obtain a copy of the License at
6+
*
7+
* http://www.apache.org/licenses/LICENSE-2.0
8+
*
9+
* Unless required by applicable law or agreed to in writing, software
10+
* distributed under the License is distributed on an "AS IS" BASIS,
11+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
* See the License for the specific language governing permissions and
13+
* limitations under the License.
14+
*/
15+
16+
module utf8;
17+
18+
public const u32 MB_CUR_MAX = 6; // UTF-8 uses just 4
19+
20+
public fn u32 encode(char *dest, u32 max_len, u32 cc) {
21+
if (cc < 0x80) {
22+
if (max_len >= 1) {
23+
dest[0] = cast<char>(cc);
24+
return 1;
25+
}
26+
} else
27+
if (cc < 0x800) {
28+
if (max_len >= 2) {
29+
dest[0] = cast<char>(0xC0 + (cc >> 6));
30+
dest[1] = cast<char>(0x80 + (cc & 0x3F));
31+
return 2;
32+
}
33+
} else
34+
if (cc < 0x10000) {
35+
if (max_len >= 3) {
36+
dest[0] = cast<char>(0xE0 + (cc >> 12));
37+
dest[1] = cast<char>(0x80 + ((cc >> 6) & 0x3F));
38+
dest[2] = cast<char>(0x80 + (cc & 0x3F));
39+
return 3;
40+
}
41+
} else
42+
if (cc < 0x110000) {
43+
if (max_len >= 4) {
44+
dest[0] = cast<char>(0xF0 + (cc >> 18));
45+
dest[1] = cast<char>(0x80 + ((cc >> 12) & 0x3F));
46+
dest[2] = cast<char>(0x80 + ((cc >> 6) & 0x3F));
47+
dest[3] = cast<char>(0x80 + (cc & 0x3F));
48+
return 4;
49+
}
50+
}
51+
return 0;
52+
}
53+
54+
public fn u32 decode(const char *p, u32 max_len, u32* pc) {
55+
if (!max_len)
56+
return 0;
57+
58+
u32 c = cast<u8>(*p++);
59+
if (c < 0x80) {
60+
*pc = c;
61+
return 1;
62+
} else
63+
if (c < 0xC2) {
64+
// invalid prefix byte or naked trailing byte
65+
} else
66+
if (c < 0xE0) {
67+
if (max_len >= 2 && p[0] >= 0x80 && p[0] <= 0xBF) {
68+
*pc = ((c - 0xC0) << 6) + (p[0] - 0x80);
69+
return 2;
70+
}
71+
} else
72+
if (c < 0xF0) {
73+
if (max_len >= 3
74+
&& p[0] >= 0x80 && p[0] <= 0xBF
75+
&& p[1] >= 0x80 && p[1] <= 0xBF) {
76+
c = ((c - 0xE0) << 12) + ((p[0] - 0x80) << 6) + (p[1] - 0x80);
77+
if (c >= 0x800) {
78+
*pc = c;
79+
return 3;
80+
}
81+
}
82+
} else
83+
if (c <= 0xF4) {
84+
if (max_len >= 4
85+
&& p[0] >= 0x80 && p[0] <= 0xBF
86+
&& p[1] >= 0x80 && p[1] <= 0xBF
87+
&& p[2] >= 0x80 && p[2] <= 0xBF) {
88+
c = ((c - 0xF0) << 18) + ((p[0] - 0x80) << 12) +
89+
((p[1] - 0x80) << 6) + (p[2] - 0x80);
90+
if (c >= 0x10000 && c < 0x110000) {
91+
*pc = c;
92+
return 4;
93+
}
94+
}
95+
}
96+
return 0;
97+
}

generator/c_generator_call.c2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import printf_utils;
2020
import source_mgr;
2121
import src_loc local;
2222
import string_buffer;
23+
import string;
2324

2425
fn void Generator.emitCall(Generator* gen, string_buffer.Buf* out, Expr* e) {
2526
CallExpr* call = cast<CallExpr*>(e);
@@ -135,7 +136,8 @@ fn void Generator.emitCall(Generator* gen, string_buffer.Buf* out, Expr* e) {
135136
FormatChanger fc = { format_text, &args[call_index+1], 0, 0, out }
136137
out.add1('"');
137138
printf_utils.parseFormat(format_text, on_format_specifier, &fc);
138-
out.add(format_text + fc.last_offset);
139+
out.encodeBytes(format_text + fc.last_offset,
140+
cast<u32>(string.strlen(format_text + fc.last_offset)), '"');
139141
out.add1('"');
140142
} else {
141143
gen.emitExpr(out, args[call_index]);

generator/c_generator_expr.c2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ fn bool on_format_specifier(void* context, printf_utils.Specifier specifier, u32
376376
FormatChanger* fc = context;
377377

378378
/* copy optional flags, width and precision */
379-
fc.out.add2(fc.format + fc.last_offset, offset - fc.last_offset);
379+
fc.out.encodeBytes(fc.format + fc.last_offset, offset - fc.last_offset, '"');
380380

381381
fc.idx += stars;
382382
QualType qt = fc.args[fc.idx].getType();

0 commit comments

Comments
 (0)