Skip to content

Commit 83af0a6

Browse files
committed
small scanner tweaks and more docs
1 parent 83d0112 commit 83af0a6

File tree

1 file changed

+103
-16
lines changed

1 file changed

+103
-16
lines changed

src/scanner.c

+103-16
Original file line numberDiff line numberDiff line change
@@ -1565,14 +1565,17 @@ static Symbol start_layout_newline() {
15651565
return result;
15661566
}
15671567

1568-
static Symbol tuple_context() {
1568+
/**
1569+
* See `token_end_layout_texp`.
1570+
*/
1571+
static Symbol texp_context() {
15691572
if (valid(START_TEXP)) {
15701573
push_context(TExp, 0);
1571-
return finish(START_TEXP, "tuple_context");
1574+
return finish(START_TEXP, "texp_context");
15721575
}
15731576
else if (valid(END_TEXP) && current_context() == TExp) {
15741577
pop();
1575-
return finish(END_TEXP, "tuple_context");
1578+
return finish(END_TEXP, "texp_context");
15761579
}
15771580
else return FAIL;
15781581
}
@@ -1597,9 +1600,13 @@ static Symbol end_layout(const char *restrict desc) {
15971600
else return FAIL;
15981601
}
15991602

1603+
/**
1604+
* Explicit brace layouts need a dedicated symbol; see `_cmd_layout_start_explicit` for an explanation.
1605+
* Includes the brace in the range.
1606+
*/
16001607
static Symbol end_layout_brace() {
16011608
if (valid(END_EXPLICIT) && current_context() == Braces) {
1602-
peek1();
1609+
advance_over(0);
16031610
MARK("end_layout_brace");
16041611
pop();
16051612
return finish(END_EXPLICIT, "brace");
@@ -1715,18 +1722,26 @@ static bool layouts_in_texp() {
17151722
}
17161723

17171724
/**
1718-
* Expression layouts can be closed by commas, vertical bars and closing brackets and parens when they are elements in a
1719-
* list or tuple-like construct:
1725+
* Tuple expressions are constructs that syntactically delimit their contents in an unambiguous way that makes parsing
1726+
* a lot easier.
1727+
* In GHC, this concept is used to classify productions like view patterns and annotated expressions.
1728+
* For us, unfortunately, it also means that there are significantly more circumstances in which layouts can be ended by
1729+
* parse errors.
1730+
*
1731+
* In practice, it means that expression layouts can be closed by commas, vertical bars and closing brackets and parens
1732+
* when they are elements in a list or tuple-like construct:
17201733
*
17211734
* (case a of a -> a, do a; a, if | a -> a | a -> a)
17221735
* [case a of a -> a | a <- a]
17231736
*
1724-
* This pattern also extends to some other this like guards:
1737+
* We encode this as a special context sort, `TExp`, that is pushed and popped at opening and closing brackets.
1738+
*
1739+
* Some other constructs, like guards, have similar characteristics, so we use the same mechanism for them:
17251740
*
17261741
* > a = case a of
17271742
* > a | let a = a -> a
17281743
*
1729-
* Here the arrow ends the let layout.
1744+
* Here the let layout must be ended by parse error, so we start a tuple expression at the bar and end it at the arrow.
17301745
*/
17311746
static Symbol token_end_layout_texp() {
17321747
return (valid(END) && layouts_in_texp()) ? end_layout("texp") : FAIL;
@@ -1746,6 +1761,10 @@ static Symbol force_end_context() {
17461761
// Operators
17471762
// --------------------------------------------------------------------------------------------------------
17481763

1764+
/**
1765+
* Opening tokens are a class of characters that may immediately follow prefix operators like bang pattern `!` or type
1766+
* application `@`.
1767+
*/
17491768
static bool opening_token(uint32_t i) {
17501769
int32_t c = peek(i);
17511770
switch (c) {
@@ -1758,6 +1777,7 @@ static bool opening_token(uint32_t i) {
17581777
case '{':
17591778
return peek(i + 1) != '-';
17601779
default:
1780+
// Includes single quote
17611781
return is_id_char(c);
17621782
}
17631783
}
@@ -1767,27 +1787,31 @@ static bool opening_token(uint32_t i) {
17671787
*/
17681788
static bool valid_symop_two_chars(int32_t first_char, int32_t second_char) {
17691789
switch (first_char) {
1770-
case '-':
1771-
return second_char != '-' && second_char != '>';
17721790
case '=':
17731791
return second_char != '>';
17741792
case '<':
17751793
return second_char != '-';
1776-
case '.':
1777-
return second_char != '.';
17781794
case ':':
17791795
return second_char != ':';
17801796
case '#':
1797+
// Unboxed unit `(##)` and unboxed sum with missing space `(#| Int #)`
17811798
return second_char != '#' && second_char != '|';
17821799
default:
17831800
return true;
17841801
}
17851802
}
17861803

1804+
/**
1805+
* If a prefix operator is not followed by an opening token, it may still be a valid varsym.
1806+
*/
17871807
static Lexed lex_prefix(Lexed t) {
17881808
return opening_token(1) ? t : LSymop;
17891809
}
17901810

1811+
/**
1812+
* If a splice operator is not followed by an opening token, it may still be a valid varsym.
1813+
* We only allow variables and parenthesized expressions for performance reasons, though.
1814+
*/
17911815
static Lexed lex_splice(int32_t c) {
17921816
return varid_start_char(c) || c == '(' ? LDollar : LSymop;
17931817
}
@@ -1817,6 +1841,7 @@ static Lexed lex_symop() {
18171841
if (len == 1) {
18181842
switch (c1) {
18191843
case '?':
1844+
// A `?` can be the head of an implicit parameter, if followed by a varid.
18201845
return varid_start_char(peek1()) ? LNothing : LSymop;
18211846
case '#':
18221847
return char1(')') ? LUnboxedClose : LHash;
@@ -1889,29 +1914,60 @@ static Lexed lex_symop() {
18891914
return LSymop;
18901915
}
18911916

1917+
/**
1918+
* This calls `symop_lookahead` to ensure that the position has advanced beyond the end of the symop, which is necessary
1919+
* because newline lookahead may have validated the symop in a previous run.
1920+
* This marks the range to emit a terminal.
1921+
*/
18921922
static Symbol finish_symop(Symbol s) {
18931923
if (valid(s)) {
1894-
advance_before(symop_lookahead());
1924+
symop_lookahead();
18951925
return finish_marked(s, "symop");
18961926
}
18971927
return FAIL;
18981928
}
18991929

1930+
/**
1931+
* Tight ops like `dot.syntax` require that no initial whitespace was skipped.
1932+
*/
19001933
static Symbol tight_op(bool whitespace, Symbol s) {
19011934
if (!whitespace) return finish_if_valid(s, "tight_op");
19021935
else return FAIL;
19031936
}
19041937

1938+
/**
1939+
* Used for situations where the operator is followed by an opening token, and so can be a prefix op if it is preceded
1940+
* by whitespace; but is not a valid tight op and therefore becomes a regular operator if not preceded by whitespace or the
1941+
* symbol is not valid.
1942+
*
1943+
* Only used for `%` (modifier).
1944+
*/
19051945
static Symbol prefix_or_varsym(bool whitespace, Symbol s) {
19061946
if (whitespace) SEQ(finish_if_valid(s, "prefix_or_varsym"));
19071947
return finish_symop(VARSYM);
19081948
}
19091949

1950+
/**
1951+
* Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
1952+
* preceded by whitespace; but is not a valid prefix op and therefore becomes a regular operator if preceded by whitespace
1953+
* or the symbol is not valid.
1954+
*
1955+
* Only used for `.`, when a projection selector `(.fieldname)` is not valid at this position, so the dot becomes the
1956+
* composition operator.
1957+
*/
19101958
static Symbol tight_or_varsym(bool whitespace, Symbol s) {
19111959
SEQ(tight_op(whitespace, s));
19121960
return finish_symop(VARSYM);
19131961
}
19141962

1963+
/**
1964+
* Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
1965+
* preceded by whitespace or a prefix op if it is.
1966+
*
1967+
* If neither of those symbols is valid, fall back to a regular operator.
1968+
*
1969+
* Used for `!`, `~` and `@`.
1970+
*/
19151971
static Symbol infix_or_varsym(bool whitespace, Symbol prefix, Symbol tight) {
19161972
SEQ(finish_if_valid(whitespace ? prefix : tight, "infix_or_varsym"));
19171973
return finish_symop(VARSYM);
@@ -1930,17 +1986,26 @@ static bool is_qq_start() {
19301986
return char_at(end, '|');
19311987
}
19321988

1989+
/**
1990+
* An end token is a keyword like `else` or `deriving` that can end a layout without newline or indent.
1991+
*/
19331992
static Lexed try_end_token(const char * restrict target, Lexed match) {
19341993
if (token(target)) return match;
19351994
else return LNothing;
19361995
}
19371996

1997+
/**
1998+
* Check that a symop consists only of minuses after the second character.
1999+
*/
19382000
static bool only_minus() {
19392001
uint32_t i = 2;
19402002
while (peek(i) == '-') i++;
19412003
return !symop_char(peek(i));
19422004
}
19432005

2006+
/**
2007+
* Check that a symop consists only of minuses, making it a comment herald.
2008+
*/
19442009
static bool line_comment_herald() {
19452010
return seq("--") && only_minus();
19462011
}
@@ -1956,6 +2021,9 @@ static Lexed lex_cpp() {
19562021
}
19572022
}
19582023

2024+
/**
2025+
* Lex pragmas, comments and CPP.
2026+
*/
19592027
static Lexed lex_extras(bool bol) {
19602028
switch (peek0()) {
19612029
case '{':
@@ -1973,6 +2041,10 @@ static Lexed lex_extras(bool bol) {
19732041
return LNothing;
19742042
}
19752043

2044+
/**
2045+
* The main lexing entry point, branching on the first character, then advancing as far as necessary to identify all
2046+
* interesting tokens.
2047+
*/
19762048
static Lexed lex(bool bol) {
19772049
SEQT(lex_extras(bol));
19782050
if (symop_char(peek0())) SEQT(lex_symop());
@@ -2047,6 +2119,9 @@ static Symbol cpp_line() {
20472119
return finish_marked(CPP, "cpp");
20482120
}
20492121

2122+
/**
2123+
* Distinguish between haddocks and plain comments by matching on the first non-whitespace character.
2124+
*/
20502125
static Symbol comment_type() {
20512126
uint32_t i = 2;
20522127
while (peek(i) == '-') i++;
@@ -2058,6 +2133,10 @@ static Symbol comment_type() {
20582133
return COMMENT;
20592134
}
20602135

2136+
/**
2137+
* Inline comments extend over all consecutive lines that start with comments.
2138+
* Could be improved by requiring equal indent.
2139+
*/
20612140
static Symbol inline_comment() {
20622141
Symbol sym = comment_type();
20632142
do {
@@ -2104,7 +2183,7 @@ static uint32_t consume_block_comment(uint32_t col) {
21042183
}
21052184

21062185
/**
2107-
* Since {- -} comments can be nested arbitrarily, this has to keep track of how many have been openend, so that the
2186+
* Since {- -} comments can be nested arbitrarily, this has to keep track of how many have been opened, so that the
21082187
* outermost comment isn't closed prematurely.
21092188
*/
21102189
static Symbol block_comment() {
@@ -2240,7 +2319,15 @@ static Symbol resolve_semicolon(Lexed next) {
22402319
}
22412320

22422321
/**
2243-
* Multi-way if layouts are exempt from automatic semicolon generation in GHC.
2322+
* Generate a layout semicolon after a newline if the indent is less than or equal to the current layout's indent, unless:
2323+
*
2324+
* - The current context doesn't use layout semicolons, which is the case for explicit brace layouts, tuple expressions,
2325+
* the module header and multi-way if layouts.
2326+
*
2327+
* - `no_semi` was set because newline lookahead found an explicit semicolon in the next line, or this function was
2328+
* executed before for the same newline.
2329+
*
2330+
* - `skip_semi` was set because the previous line ended with an explicit semicolon.
22442331
*/
22452332
static Symbol semicolon() {
22462333
if (
@@ -2519,7 +2606,7 @@ static Symbol newline_resume() {
25192606
* These are conditioned only on symbols and don't advance, except for `qq_body`, which cannot fail.
25202607
*/
25212608
static Symbol pre_ws_commands() {
2522-
SEQ(tuple_context());
2609+
SEQ(texp_context());
25232610
SEQ(start_brace());
25242611
SEQ(end_brace());
25252612
// Leading whitespace must be included in the node.

0 commit comments

Comments
 (0)