1
+ /**
2
+ * The scanner is an extension to the built-in lexer that handles cases that are hard or impossible to express with the
3
+ * high-level grammar rules.
4
+ * Since Haskell is indentation sensitive and uses parse errors to end layouts, this component has many
5
+ * responsibilities.
6
+ *
7
+ * tree-sitter runs the scanner at every position repeatedly until it fails, after which the built-in lexer consumes one
8
+ * token.
9
+ * When the scanner succeeds, it returns the index of a symbol in the `externals` array in `grammar.js`, which is then
10
+ * processed like other grammar symbols, except that it terminates any conflict branches in which the symbol isn't
11
+ * valid.
12
+ * The scanner's state is persisted and passed into the next run, but it is discarded when the scanner fails, i.e. when
13
+ * it yields control back to the built-in lexer.
14
+ *
15
+ * The high-level workflow of the scanner consists of three distinct modes.
16
+ * When the first character after whitespace is a newline, the scanner starts newline lookahead; otherwise it processes
17
+ * an interior position.
18
+ * If the state indicates that the previous run performed newline lookahead, it enters newline processing mode.
19
+ *
20
+ * In interior mode, a single lexing pass is performed.
21
+ *
22
+ * Such a pass consists of two steps:
23
+ *
24
+ * In the first step, the scanner identifies the immediate token by branching on the first character after whitespace
25
+ * and examining different conditions to select one of the variants of the enum `Lexed`, which enumerates all known
26
+ * situations of interest.
27
+ * The position of the lexer may be advanced in the process to look at subsequent characters.
28
+ * To avoid having to arrange different parts of the logic according to how many characters have been consumed,
29
+ * lookahead is written to an array in the transient state on demand, so that each component can specify the index
30
+ * relative to the position at the beginning of the run (modulo whitespace).
31
+ * The entry point for this step is the function `lex`.
32
+ *
33
+ * The second step is different for each mode.
34
+ * In interior mode, the `Lexed` token determines which symbol to return to the grammar based on the current state, like
35
+ * layout contexts and valid symbols.
36
+ * Most symbols do not contain any text and only act as conditions in the grammar; however, for symbolic operators, CPP,
37
+ * comments, pragmas, and quasiquotes, the lexer is advanced to the end of the token and `mark_end` is called to
38
+ * communicate the range to tree-sitter.
39
+ *
40
+ * In newline lookahead mode, the scanner performs repeated lexing passes until it encounters a `Lexed` token that is
41
+ * not CPP or a comment.
42
+ * In the second step of each pass, the token determines whether to terminate and/or which flags to set in the state to
43
+ * guide processing in the next run.
44
+ * If the lookahead loop has only made a single lexing pass that did not consume any characters of the following token
45
+ * (because the first character did not match any of the conditions for lexing that require more lookahead), the scanner
46
+ * switches to newline processing mode directly; otherwise it terminates the run after storing the newline information
47
+ * in the persistent state.
48
+ * This is possible by succeeding with the symbol `UPDATE`, which is mapped to newline in `externals`.
49
+ * tree-sitter does not create a node in the parse tree for this symbol if `mark_end` wasn't called after consuming
50
+ * lookahead, and immediately calls the scanner again at the same position.
51
+ *
52
+ * In either case, the scanner ends up in newline processing mode, in which it performs a series of highly
53
+ * order-sensitive steps based on the data collected in lookahead mode, potentially returning multiple symbols in
54
+ * successive runs until none of the newline-related conditions match.
55
+ * This procedure ensures that nested layouts are terminated at the earliest position instead of extending over all
56
+ * subsequent (top-level) whitespace, comments and CPP up to the next layout element.
57
+ * Only when all layouts are terminated will the scanner process the final `Lexed` token that it stored in the state in
58
+ * lookahead mode, using the same logic as in interior mode, and update the state to disable newline processing for the
59
+ * next run.
60
+ */
1
61
#define DEBUG 0
2
62
3
63
#include "tree_sitter/parser.h"
69
129
// Symbols
70
130
// --------------------------------------------------------------------------------------------------------
71
131
132
+ /**
133
+ * This enum mirrors the symbols in `externals` in `grammar.js`.
134
+ * tree-sitter passes an array of booleans to the scanner whose entries are `true` if the symbol at the corresponding
135
+ * index is valid at the current parser position.
136
+ */
72
137
typedef enum {
73
138
FAIL ,
74
139
SEMICOLON ,
@@ -162,32 +227,26 @@ static const char *sym_names[] = {
162
227
// --------------------------------------------------------------------------------------------------------
163
228
164
229
#if DEBUG
165
- static char const * context_names [] = {
166
-
167
- "decls" ,
168
- "do" ,
169
- "case" ,
170
- "let" ,
171
- "multi_way_if" ,
172
- "quote" ,
173
- "braces" ,
174
- "texp" ,
175
- "module_header" ,
176
- "none" ,
177
- };
178
230
179
231
/**
 * One element of the `ParseLines` vector; only compiled under `#if DEBUG`.
 * NOTE(review): presumably `data` holds the characters of a single source line, with `len` entries in use out of `cap`
 * allocated -- confirm against the code that fills it.
 */
typedef struct {
180
232
unsigned len ;
181
233
unsigned cap ;
182
234
int32_t * data ;
183
235
} ParseLine ;
184
236
237
+ /**
238
+ * A vector of lines, persisted across runs, for visualizing the current lexer position and scanner lookahead.
239
+ */
185
240
typedef struct {
186
241
// NOTE(review): presumably the number of `ParseLine` entries currently stored in `data` -- confirm.
unsigned len ;
187
242
// NOTE(review): presumably the number of entries `data` has room for -- confirm.
unsigned cap ;
188
243
// Array of `ParseLine` entries.
ParseLine * data ;
189
244
} ParseLines ;
190
245
246
+ /**
247
+ * Info about calls to `mark_end` and how far the lexer has progressed in a run.
248
+ * Discarded after each run.
249
+ */
191
250
typedef struct {
192
251
int marked ;
193
252
unsigned marked_line ;
@@ -210,6 +269,9 @@ Debug debug_new(TSLexer *l) {
210
269
211
270
#endif
212
271
272
+ /**
273
+ * Different sorts of layout contexts that require special treatment.
274
+ */
213
275
typedef enum {
214
276
DeclLayout ,
215
277
DoLayout ,
@@ -223,11 +285,36 @@ typedef enum {
223
285
NoContext ,
224
286
} ContextSort ;
225
287
288
+ #if DEBUG
289
+
290
/**
 * Display names for layout context sorts; only compiled under `#if DEBUG`.
 * NOTE(review): presumably indexed by `ContextSort`, so the entries must stay in the same order as the enum variants
 * (the final entry "none" lines up with `NoContext` being the final variant) -- confirm at the use sites.
 */
+ static char const * context_names [] = {
291
+ "decls" ,
292
+ "do" ,
293
+ "case" ,
294
+ "let" ,
295
+ "multi_way_if" ,
296
+ "quote" ,
297
+ "braces" ,
298
+ "texp" ,
299
+ "module_header" ,
300
+ "none" ,
301
+ };
302
+
303
+ #endif
304
+
305
+ /**
306
+ * The persistent state maintains a stack of layout contexts.
307
+ * New entries are created when a layout symbol is valid at the current position, and they are removed when the indent
308
+ * of a line satisfies conditions that depend on the current context sort, or when certain tokens (like `else`) occur.
309
+ */
226
310
typedef struct {
227
311
// The kind of layout context this stack entry represents.
ContextSort sort ;
228
312
// NOTE(review): presumably the indentation of the layout, which later line indents are compared against -- confirm.
uint32_t indent ;
229
313
} Context ;
230
314
315
+ /**
316
+ * This enumerates the lookahead tokens that have special meaning in the scanner.
317
+ */
231
318
typedef enum {
232
319
LNothing ,
233
320
LEof ,
@@ -306,36 +393,82 @@ static const char *token_names[] = {
306
393
307
394
#endif
308
395
396
+ /**
397
+ * The current newline mode.
398
+ * `NInit` is set during newline lookahead, and `NProcess` when lookahead has finished.
399
+ * After processing is complete, the state is reset to `NInactive`.
400
+ * `NResume` is a special variant that forces newline lookahead mode when a run starts without requiring a newline.
401
+ * This is used for the beginning of the file and after pragmas (see `pragma`).
402
+ */
309
403
typedef enum {
310
404
// No newline handling pending; the state is reset to this after processing is complete.
NInactive ,
311
405
// Set during newline lookahead.
NInit ,
312
406
// Set when lookahead has finished.
NProcess ,
313
407
// Forces newline lookahead mode when a run starts, without requiring a newline.
// Used for the beginning of the file and after pragmas.
NResume ,
314
408
} NewlineState ;
315
409
410
+ /**
411
+ * The two newline modes need to operate across multiple scanner runs and adapt their behavior to the context
412
+ * established by previous runs, encoded by this persistent state.
413
+ */
316
414
typedef struct {
317
415
// The current newline mode; see `NewlineState`.
NewlineState state ;
318
- uint32_t indent ;
416
+ // The final token encountered after skipping comments and CPP.
319
417
Lexed end ;
418
+ // The indent of `end`, used to decide layout actions before parsing intermediate extras.
419
+ uint32_t indent ;
420
+ // When there is no token after extras, we shouldn't start layouts.
320
421
bool eof ;
422
+ // Prohibit layout semicolons in future runs.
321
423
bool no_semi ;
424
+ // Prohibit layout semicolons in future runs, but can be relaxed by some actions.
425
+ // See `explicit_semicolon`.
322
426
bool skip_semi ;
427
+ // Lookahead has advanced into `end`, so the scanner has to be restarted before processing the newline result.
323
428
bool unsafe ;
324
429
} Newline ;
325
430
431
+ /**
432
+ * The vector for the layout context stack.
433
+ */
326
434
typedef struct {
327
435
// NOTE(review): presumably the number of contexts currently on the stack -- confirm.
uint32_t len ;
328
436
// NOTE(review): presumably the number of `Context` entries `data` has room for -- confirm.
uint32_t cap ;
329
437
// Array of `Context` entries backing the layout context stack.
Context * data ;
330
438
} Contexts ;
331
439
440
+ /**
441
+ * Whenever the lexer is advanced over non-whitespace, the consumed character is appended to this vector.
442
+ * This avoids having to ensure that different components that need to examine multiple lookahead characters have to be
443
+ * run in the correct order.
444
+ * Instead, we refer to lookahead by the character's index using the interface described in the section 'Lookahead'.
445
+ *
446
+ * For example, the functions `peek0`, `char0`, `char1` operate on the first/second character relative to the start of
447
+ * the scanner run, and the implementation advances the lexer position when it is necessary.
448
+ *
449
+ * The field `offset` can be used to reset relative indexing to the current lexer position.
450
+ * This is used, for example, in `newline_lookahead`, to perform repeated lexing passes, since `lex` uses the lookahead
451
+ * interface.
452
+ * After processing a `Lexed` token, `newline_lookahead` continues seeking ahead after comments and CPP, and when it
453
+ * encounters the next token, it calls `reset_lookahead` to set `offset` to the current position, ensuring that `lex`
454
+ * can use `char0` to test the following character.
455
+ */
332
456
typedef struct {
333
457
// Number of stored characters; reset to 0 when a new run starts so that the memory behind `data` can be reused.
uint32_t len ;
334
458
// NOTE(review): presumably the number of characters `data` has room for -- confirm.
uint32_t cap ;
335
459
// The characters consumed so far; one is appended whenever the lexer is advanced over non-whitespace.
int32_t * data ;
336
460
// Base for relative lookahead indexing; can be reset to the current lexer position (see `reset_lookahead`).
uint32_t offset ;
337
461
} Lookahead ;
338
462
463
+ /**
464
+ * The state that is persisted across scanner runs.
465
+ *
466
+ * Although 'Lookahead' is always reset when starting a new run, storing it in the state avoids having to allocate and
467
+ * free the array repeatedly.
468
+ * Instead we just reset the `len` attribute to 0 and reuse the previous memory.
469
+ *
470
+ * REVIEW: Can tree-sitter run the scanner concurrently on multiple nodes in the same file in some situations?
471
+ */
339
472
typedef struct {
340
473
Contexts contexts ;
341
474
Newline newline ;
@@ -346,12 +479,7 @@ typedef struct {
346
479
} State ;
347
480
348
481
/**
349
- * This structure contains the external and internal state.
350
- *
351
- * The parser provides the lexer interface and the list of valid symbols.
352
- *
353
- * The internal state consists of a stack of indentation widths that is manipulated whenever a layout is started or
354
- * terminated.
482
+ * Transient state and the data provided by tree-sitter, such as the lexer interface.
355
483
*/
356
484
typedef struct {
357
485
TSLexer * lexer ;
@@ -1852,6 +1980,15 @@ static bool consume_pragma() {
1852
1980
return false;
1853
1981
}
1854
1982
1983
+ /**
1984
+ * Pragmas can occur anywhere, like comments, but unlike comments they determine indentation when they occur at the
1985
+ * beginning of a line in layouts, so this sets `NResume` to continue newline processing with the indent of the pragma.
1986
+ *
1987
+ * If the pragma is followed by newline, this only ensures that no semicolon is emitted (since this rule is run before
1988
+ * `semicolon` and `NResume` restarts lookahead).
1989
+ *
1990
+ * Otherwise it ensures that the following token is treated as a layout element with the correct indent.
1991
+ */
1855
1992
static Symbol pragma () {
1856
1993
if (consume_pragma ()) {
1857
1994
if (newline -> state != NInactive ) newline -> state = NResume ;
0 commit comments