Skip to content

Commit cbdd079

Browse files
committed
add some scanner docs
1 parent 02d49f8 commit cbdd079

File tree

1 file changed

+157
-20
lines changed

1 file changed

+157
-20
lines changed

src/scanner.c

+157-20
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,63 @@
1+
/**
2+
* The scanner is an extension to the built-in lexer that handles cases that are hard or impossible to express with the
3+
* high-level grammar rules.
4+
* Since Haskell is indentation sensitive and uses parse errors to end layouts, this component has many
5+
* responsibilities.
6+
*
7+
* tree-sitter runs the scanner at every position repeatedly until it fails, after which the built-in lexer consumes one
8+
* token.
9+
* When the scanner succeeds, it returns the index of a symbol in the `externals` array in `grammar.js`, which is then
10+
* processed like other grammar symbols, except that it terminates any conflict branches in which the symbol isn't
11+
* valid.
12+
* The scanner's state is persisted and passed into the next run, but it is discarded when the scanner fails, i.e. when
13+
* it yields control back to the built-in lexer.
14+
*
15+
* The high-level workflow of the scanner consists of three distinct modes.
16+
* When the first character after whitespace is a newline, the scanner starts newline lookahead, otherwise it processes
17+
* an interior position.
18+
* If the state indicates that the previous run performed newline lookahead, it enters newline processing mode.
19+
*
20+
* In interior mode, a single lexing pass is performed.
21+
*
22+
* Such a pass consists of two steps:
23+
*
24+
* In the first step, the scanner identifies the immediate token by branching on the first character after whitespace
25+
* and examining different conditions to select one of the variants of the enum `Lexed`, which enumerates all known,
26+
* interesting situations.
27+
* The position of the lexer may be advanced in the process to look at subsequent characters.
28+
* To avoid having to arrange different parts of the logic according to how many characters have been consumed,
29+
* lookahead is written to an array in the transient state on demand, so that each component can specify the index
30+
* relative to the position at the beginning of the run (modulo whitespace).
31+
* The entry point for this step is the function `lex`.
32+
*
33+
* The second step is different for each mode.
34+
* In interior mode, the `Lexed` token determines which symbol to return to the grammar based on the current state, like
35+
* layout contexts and valid symbols.
36+
* Most symbols do not contain any text and only act as conditions in the grammar; however, for symbolic operators, CPP,
37+
* comments, pragmas, and quasiquotes, the lexer is advanced to the end of the token and `mark_end` is called to
38+
* communicate the range to tree-sitter.
39+
*
40+
* In newline lookahead mode, the scanner performs repeated lexing passes until it encounters a `Lexed` token that is
41+
* not CPP or a comment.
42+
* In the second step of each pass, the token determines whether to terminate and/or which flags to set in the state to
43+
* guide processing in the next run.
44+
* If the lookahead loop has only made a single lexing pass that did not consume any characters of the following token
45+
* (because the first character did not match any of the conditions for lexing that require more lookahead), the scanner
46+
* switches to newline processing mode directly; otherwise it terminates the run after storing the newline information
47+
* in the persistent state.
48+
* This is made possible by succeeding with the symbol `UPDATE`, which is mapped to newline in `externals`.
49+
* tree-sitter does not create a node in the parse tree for this symbol if `mark_end` wasn't called after consuming
50+
* lookahead, and immediately calls the scanner again at the same position.
51+
*
52+
* In either case, the scanner ends up in newline processing mode, in which it performs a series of highly
53+
* order-sensitive steps based on the data collected in lookahead mode, potentially returning multiple symbols in
54+
* successive runs until none of the newline-related conditions match.
55+
* This procedure ensures that nested layouts are terminated at the earliest position instead of extending over all
56+
* subsequent (top-level) whitespace, comments and CPP up to the next layout element.
57+
* Only when all layouts are terminated will the scanner process the final `Lexed` token that it stored in the state in
58+
* lookahead mode, using the same logic as in interior mode, and update the state to disable newline processing for the
59+
* next run.
60+
*/
161
#define DEBUG 0
262

363
#include "tree_sitter/parser.h"
@@ -69,6 +129,11 @@
69129
// Symbols
70130
// --------------------------------------------------------------------------------------------------------
71131

132+
/**
133+
* This enum mirrors the symbols in `externals` in `grammar.js`.
134+
* tree-sitter passes an array of booleans to the scanner whose entries are `true` if the symbol at the corresponding
135+
* index is valid at the current parser position.
136+
*/
72137
typedef enum {
73138
FAIL,
74139
SEMICOLON,
@@ -162,32 +227,26 @@ static const char *sym_names[] = {
162227
// --------------------------------------------------------------------------------------------------------
163228

164229
#if DEBUG
165-
static char const *context_names[] = {
166-
167-
"decls",
168-
"do",
169-
"case",
170-
"let",
171-
"multi_way_if",
172-
"quote",
173-
"braces",
174-
"texp",
175-
"module_header",
176-
"none",
177-
};
178230

179231
typedef struct {
180232
unsigned len;
181233
unsigned cap;
182234
int32_t *data;
183235
} ParseLine;
184236

237+
/**
238+
* A vector of lines, persisted across runs, for visualizing the current lexer position and scanner lookahead.
239+
*/
185240
typedef struct {
186241
unsigned len;
187242
unsigned cap;
188243
ParseLine *data;
189244
} ParseLines;
190245

246+
/**
247+
* Info about calls to `mark_end` and how far the lexer has progressed in a run.
248+
* Discarded after each run.
249+
*/
191250
typedef struct {
192251
int marked;
193252
unsigned marked_line;
@@ -210,6 +269,9 @@ Debug debug_new(TSLexer *l) {
210269

211270
#endif
212271

272+
/**
273+
* Different sorts of layout contexts that require special treatment.
274+
*/
213275
typedef enum {
214276
DeclLayout,
215277
DoLayout,
@@ -223,11 +285,36 @@ typedef enum {
223285
NoContext,
224286
} ContextSort;
225287

288+
#if DEBUG
289+
290+
static char const *context_names[] = {
291+
"decls",
292+
"do",
293+
"case",
294+
"let",
295+
"multi_way_if",
296+
"quote",
297+
"braces",
298+
"texp",
299+
"module_header",
300+
"none",
301+
};
302+
303+
#endif
304+
305+
/**
306+
* The persistent state maintains a stack of layout contexts.
307+
* New entries are created when a layout symbol is valid at the current position, and they are removed when the indent
308+
* of a line satisfies conditions that depend on the current context sort, or when certain tokens (like `else`) occur.
309+
*/
226310
typedef struct {
227311
ContextSort sort;
228312
uint32_t indent;
229313
} Context;
230314

315+
/**
316+
* This enumerates the lookahead tokens that have special meaning in the scanner.
317+
*/
231318
typedef enum {
232319
LNothing,
233320
LEof,
@@ -306,36 +393,82 @@ static const char *token_names[] = {
306393

307394
#endif
308395

396+
/**
397+
* The current newline mode.
398+
* `NInit` is set during newline lookahead, and `NProcess` when lookahead has finished.
399+
* After processing is complete, the state is reset to `NInactive`.
400+
* `NResume` is a special variant that forces newline lookahead mode when a run starts without requiring a newline.
401+
* This is used for the beginning of the file and after pragmas (see `pragma`).
402+
*/
309403
typedef enum {
310404
NInactive,
311405
NInit,
312406
NProcess,
313407
NResume,
314408
} NewlineState;
315409

410+
/**
411+
* The two newline modes need to operate across multiple scanner runs and adapt their behavior to the context
412+
* established by previous runs, encoded by this persistent state.
413+
*/
316414
typedef struct {
317415
NewlineState state;
318-
uint32_t indent;
416+
// The final token encountered after skipping comments and CPP.
319417
Lexed end;
418+
// The indent of `end`, used to decide layout actions before parsing intermediate extras.
419+
uint32_t indent;
420+
// When there is no token after extras, we shouldn't start layouts.
320421
bool eof;
422+
// Prohibit layout semicolons in future runs.
321423
bool no_semi;
424+
// Prohibit layout semicolons in future runs, but can be relaxed by some actions.
425+
// See `explicit_semicolon`.
322426
bool skip_semi;
427+
// Lookahead has advanced into `end`, so the scanner has to be restarted before processing the newline result.
323428
bool unsafe;
324429
} Newline;
325430

431+
/**
432+
* The vector for the layout context stack.
433+
*/
326434
typedef struct {
327435
uint32_t len;
328436
uint32_t cap;
329437
Context *data;
330438
} Contexts;
331439

440+
/**
441+
* Whenever the lexer is advanced over non-whitespace, the consumed character is appended to this vector.
442+
* This avoids having to ensure that different components that need to examine multiple lookahead characters are
443+
* run in the correct order.
444+
* Instead, we refer to lookahead by the character's index using the interface described in the section 'Lookahead'.
445+
*
446+
* For example, the functions `peek0`, `char0`, `char1` operate on the first/second character relative to the start of
447+
* the scanner run, and the implementation advances the lexer position when it is necessary.
448+
*
449+
* The field `offset` can be used to reset relative indexing to the current lexer position.
450+
* This is used, for example, in `newline_lookahead`, to perform repeated lexing passes, since `lex` uses the lookahead
451+
* interface.
452+
* After processing a `Lexed` token, `newline_lookahead` continues seeking ahead after comments and CPP, and when it
453+
* encounters the next token, it calls `reset_lookahead` to set `offset` to the current position, ensuring that `lex`
454+
* can use `char0` to test the following character.
455+
*/
332456
typedef struct {
333457
uint32_t len;
334458
uint32_t cap;
335459
int32_t *data;
336460
uint32_t offset;
337461
} Lookahead;
338462

463+
/**
464+
* The state that is persisted across scanner runs.
465+
*
466+
* Although `Lookahead` is always reset when starting a new run, storing it in the state avoids having to allocate and
467+
* free the array repeatedly.
468+
* Instead we just reset the `len` attribute to 0 and reuse the previous memory.
469+
*
470+
* REVIEW: Can tree-sitter run the scanner concurrently on multiple nodes in the same file in some situations?
471+
*/
339472
typedef struct {
340473
Contexts contexts;
341474
Newline newline;
@@ -346,12 +479,7 @@ typedef struct {
346479
} State;
347480

348481
/**
349-
* This structure contains the external and internal state.
350-
*
351-
* The parser provides the lexer interface and the list of valid symbols.
352-
*
353-
* The internal state consists of a stack of indentation widths that is manipulated whenever a layout is started or
354-
* terminated.
482+
* Transient state and the data provided by tree-sitter (the lexer interface and the list of valid symbols).
355483
*/
356484
typedef struct {
357485
TSLexer *lexer;
@@ -1852,6 +1980,15 @@ static bool consume_pragma() {
18521980
return false;
18531981
}
18541982

1983+
/**
1984+
* Since pragmas can occur anywhere, like comments, but, unlike comments, determine indentation when occurring at the beginning
1985+
* of a line in layouts, this sets `NResume` to continue newline processing with the indent of the pragma.
1986+
*
1987+
* If the pragma is followed by newline, this only ensures that no semicolon is emitted (since this rule is run before
1988+
* `semicolon` and `NResume` restarts lookahead).
1989+
*
1990+
* Otherwise it ensures that the following token is treated as a layout element with the correct indent.
1991+
*/
18551992
static Symbol pragma() {
18561993
if (consume_pragma()) {
18571994
if (newline->state != NInactive) newline->state = NResume;

0 commit comments

Comments
 (0)