@@ -1565,14 +1565,17 @@ static Symbol start_layout_newline() {
1565
1565
return result ;
1566
1566
}
1567
1567
1568
- static Symbol tuple_context () {
1568
+ /**
1569
+ * See `token_end_layout_texp`.
1570
+ */
1571
+ static Symbol texp_context () {
1569
1572
if (valid (START_TEXP )) {
1570
1573
push_context (TExp , 0 );
1571
- return finish (START_TEXP , "tuple_context " );
1574
+ return finish (START_TEXP , "texp_context " );
1572
1575
}
1573
1576
else if (valid (END_TEXP ) && current_context () == TExp ) {
1574
1577
pop ();
1575
- return finish (END_TEXP , "tuple_context " );
1578
+ return finish (END_TEXP , "texp_context " );
1576
1579
}
1577
1580
else return FAIL ;
1578
1581
}
@@ -1597,9 +1600,13 @@ static Symbol end_layout(const char *restrict desc) {
1597
1600
else return FAIL ;
1598
1601
}
1599
1602
1603
+ /**
1604
+ * Explicit brace layouts need a dedicated symbol, see `_cmd_layout_start_explicit` for an explanation.
1605
+ * Includes the brace in the range.
1606
+ */
1600
1607
static Symbol end_layout_brace () {
1601
1608
if (valid (END_EXPLICIT ) && current_context () == Braces ) {
1602
- peek1 ( );
1609
+ advance_over ( 0 );
1603
1610
MARK ("end_layout_brace" );
1604
1611
pop ();
1605
1612
return finish (END_EXPLICIT , "brace" );
@@ -1715,18 +1722,26 @@ static bool layouts_in_texp() {
1715
1722
}
1716
1723
1717
1724
/**
1718
- * Expression layouts can be closed by commas, vertical bars and closing brackets and parens when they are elements in a
1719
- * list or tuple-like construct:
1725
+ * Tuple expressions are constructs that syntactically delimit their contents in an unambiguous way that makes parsing
1726
+ * a lot easier.
1727
+ * In GHC, this concept is used to classify productions like view patterns and annotated expressions.
1728
+ * For us, unfortunately, it also means that there are significantly more circumstances in which layouts can be ended by
1729
+ * parse errors.
1730
+ *
1731
+ * In practice, it means that expression layouts can be closed by commas, vertical bars and closing brackets and parens
1732
+ * when they are elements in a list or tuple-like construct:
1720
1733
*
1721
1734
* (case a of a -> a, do a; a, if | a -> a | a -> a)
1722
1735
* [case a of a -> a | a <- a]
1723
1736
*
1724
- * This pattern also extends to some other this like guards:
1737
+ * We encode this as a special context sort, `TExp`, that is pushed and popped at opening and closing brackets.
1738
+ *
1739
+ * Some other constructs, like guards, have similar characteristics, so we use the same mechanism for them:
1725
1740
*
1726
1741
* > a = case a of
1727
1742
* > a | let a = a -> a
1728
1743
*
1729
- * Here the arrow ends the let layout .
1744
+ * Here the let layout must be ended by a parse error, so we start a tuple expression at the bar and end it at the arrow.
1730
1745
*/
1731
1746
static Symbol token_end_layout_texp () {
1732
1747
return (valid (END ) && layouts_in_texp ()) ? end_layout ("texp" ) : FAIL ;
@@ -1746,6 +1761,10 @@ static Symbol force_end_context() {
1746
1761
// Operators
1747
1762
// --------------------------------------------------------------------------------------------------------
1748
1763
1764
+ /**
1765
+ * Opening tokens are a class of characters that may immediately follow prefix operators like bang pattern `!` or type
1766
+ * application `@`.
1767
+ */
1749
1768
static bool opening_token (uint32_t i ) {
1750
1769
int32_t c = peek (i );
1751
1770
switch (c ) {
@@ -1758,6 +1777,7 @@ static bool opening_token(uint32_t i) {
1758
1777
case '{' :
1759
1778
return peek (i + 1 ) != '-' ;
1760
1779
default :
1780
+ // Includes single quote
1761
1781
return is_id_char (c );
1762
1782
}
1763
1783
}
@@ -1767,27 +1787,31 @@ static bool opening_token(uint32_t i) {
1767
1787
*/
1768
1788
static bool valid_symop_two_chars (int32_t first_char , int32_t second_char ) {
1769
1789
switch (first_char ) {
1770
- case '-' :
1771
- return second_char != '-' && second_char != '>' ;
1772
1790
case '=' :
1773
1791
return second_char != '>' ;
1774
1792
case '<' :
1775
1793
return second_char != '-' ;
1776
- case '.' :
1777
- return second_char != '.' ;
1778
1794
case ':' :
1779
1795
return second_char != ':' ;
1780
1796
case '#' :
1797
+ // Unboxed unit `(##)` and unboxed sum with missing space `(#| Int #)`
1781
1798
return second_char != '#' && second_char != '|' ;
1782
1799
default :
1783
1800
return true;
1784
1801
}
1785
1802
}
1786
1803
1804
+ /**
1805
+ * If a prefix operator is not followed by an opening token, it may still be a valid varsym.
1806
+ */
1787
1807
static Lexed lex_prefix (Lexed t ) {
1788
1808
return opening_token (1 ) ? t : LSymop ;
1789
1809
}
1790
1810
1811
+ /**
1812
+ * If a splice operator is not followed by an opening token, it may still be a valid varsym.
1813
+ * We only allow variables and parenthesized expressions for performance reasons, though.
1814
+ */
1791
1815
static Lexed lex_splice (int32_t c ) {
1792
1816
return varid_start_char (c ) || c == '(' ? LDollar : LSymop ;
1793
1817
}
@@ -1817,6 +1841,7 @@ static Lexed lex_symop() {
1817
1841
if (len == 1 ) {
1818
1842
switch (c1 ) {
1819
1843
case '?' :
1844
+ // A `?` can be the head of an implicit parameter, if followed by a varid.
1820
1845
return varid_start_char (peek1 ()) ? LNothing : LSymop ;
1821
1846
case '#' :
1822
1847
return char1 (')' ) ? LUnboxedClose : LHash ;
@@ -1889,29 +1914,60 @@ static Lexed lex_symop() {
1889
1914
return LSymop ;
1890
1915
}
1891
1916
1917
+ /**
1918
+ * This calls `symop_lookahead` to ensure that the position has advanced beyond the end of the symop, which is necessary
1919
+ * because newline lookahead may have validated the symop in a previous run.
1920
+ * This marks the range to emit a terminal.
1921
+ */
1892
1922
static Symbol finish_symop (Symbol s ) {
1893
1923
if (valid (s )) {
1894
- advance_before ( symop_lookahead () );
1924
+ symop_lookahead ();
1895
1925
return finish_marked (s , "symop" );
1896
1926
}
1897
1927
return FAIL ;
1898
1928
}
1899
1929
1930
+ /**
1931
+ * Tight ops like `dot.syntax` require that no initial whitespace was skipped.
1932
+ */
1900
1933
static Symbol tight_op (bool whitespace , Symbol s ) {
1901
1934
if (!whitespace ) return finish_if_valid (s , "tight_op" );
1902
1935
else return FAIL ;
1903
1936
}
1904
1937
1938
+ /**
1939
+ * Used for situations where the operator is followed by an opening token, and so can be a prefix op if it is preceded
1940
+ * by whitespace; but is not a valid tight op and therefore becomes a regular operator if not preceded by whitespace or the
1941
+ * symbol is not valid.
1942
+ *
1943
+ * Only used for `%` (modifier).
1944
+ */
1905
1945
static Symbol prefix_or_varsym (bool whitespace , Symbol s ) {
1906
1946
if (whitespace ) SEQ (finish_if_valid (s , "prefix_or_varsym" ));
1907
1947
return finish_symop (VARSYM );
1908
1948
}
1909
1949
1950
+ /**
1951
+ * Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
1952
+ * preceded by whitespace; but is not a valid prefix op and therefore becomes a regular operator if preceded by whitespace
1953
+ * or the symbol is not valid.
1954
+ *
1955
+ * Only used for `.`, when a projection selector `(.fieldname)` is not valid at this position, so the dot becomes the
1956
+ * composition operator.
1957
+ */
1910
1958
static Symbol tight_or_varsym (bool whitespace , Symbol s ) {
1911
1959
SEQ (tight_op (whitespace , s ));
1912
1960
return finish_symop (VARSYM );
1913
1961
}
1914
1962
1963
+ /**
1964
+ * Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
1965
+ * preceded by whitespace or a prefix op if it is.
1966
+ *
1967
+ * If neither of those symbols is valid, fall back to a regular operator.
1968
+ *
1969
+ * Used for `!`, `~` and `@`.
1970
+ */
1915
1971
static Symbol infix_or_varsym (bool whitespace , Symbol prefix , Symbol tight ) {
1916
1972
SEQ (finish_if_valid (whitespace ? prefix : tight , "infix_or_varsym" ));
1917
1973
return finish_symop (VARSYM );
@@ -1930,17 +1986,26 @@ static bool is_qq_start() {
1930
1986
return char_at (end , '|' );
1931
1987
}
1932
1988
1989
+ /**
1990
+ * An end token is a keyword like `else` or `deriving` that can end a layout without newline or indent.
1991
+ */
1933
1992
static Lexed try_end_token (const char * restrict target , Lexed match ) {
1934
1993
if (token (target )) return match ;
1935
1994
else return LNothing ;
1936
1995
}
1937
1996
1997
+ /**
1998
+ * Check that a symop consists only of minuses after the second character.
1999
+ */
1938
2000
static bool only_minus () {
1939
2001
uint32_t i = 2 ;
1940
2002
while (peek (i ) == '-' ) i ++ ;
1941
2003
return !symop_char (peek (i ));
1942
2004
}
1943
2005
2006
+ /**
2007
+ * Check that a symop consists only of minuses, making it a comment herald.
2008
+ */
1944
2009
static bool line_comment_herald () {
1945
2010
return seq ("--" ) && only_minus ();
1946
2011
}
@@ -1956,6 +2021,9 @@ static Lexed lex_cpp() {
1956
2021
}
1957
2022
}
1958
2023
2024
+ /**
2025
+ * Lex pragmas, comments and CPP.
2026
+ */
1959
2027
static Lexed lex_extras (bool bol ) {
1960
2028
switch (peek0 ()) {
1961
2029
case '{' :
@@ -1973,6 +2041,10 @@ static Lexed lex_extras(bool bol) {
1973
2041
return LNothing ;
1974
2042
}
1975
2043
2044
+ /**
2045
+ * The main lexing entry point, branching on the first character, then advancing as far as necessary to identify all
2046
+ * interesting tokens.
2047
+ */
1976
2048
static Lexed lex (bool bol ) {
1977
2049
SEQT (lex_extras (bol ));
1978
2050
if (symop_char (peek0 ())) SEQT (lex_symop ());
@@ -2047,6 +2119,9 @@ static Symbol cpp_line() {
2047
2119
return finish_marked (CPP , "cpp" );
2048
2120
}
2049
2121
2122
+ /**
2123
+ * Distinguish between haddocks and plain comments by matching on the first non-whitespace character.
2124
+ */
2050
2125
static Symbol comment_type () {
2051
2126
uint32_t i = 2 ;
2052
2127
while (peek (i ) == '-' ) i ++ ;
@@ -2058,6 +2133,10 @@ static Symbol comment_type() {
2058
2133
return COMMENT ;
2059
2134
}
2060
2135
2136
+ /**
2137
+ * Inline comments extend over all consecutive lines that start with comments.
2138
+ * Could be improved by requiring equal indent.
2139
+ */
2061
2140
static Symbol inline_comment () {
2062
2141
Symbol sym = comment_type ();
2063
2142
do {
@@ -2104,7 +2183,7 @@ static uint32_t consume_block_comment(uint32_t col) {
2104
2183
}
2105
2184
2106
2185
/**
2107
- * Since {- -} comments can be nested arbitrarily, this has to keep track of how many have been openend , so that the
2186
+ * Since {- -} comments can be nested arbitrarily, this has to keep track of how many have been opened , so that the
2108
2187
* outermost comment isn't closed prematurely.
2109
2188
*/
2110
2189
static Symbol block_comment () {
@@ -2240,7 +2319,15 @@ static Symbol resolve_semicolon(Lexed next) {
2240
2319
}
2241
2320
2242
2321
/**
2243
- * Multi-way if layouts are exempt from automatic semicolon generation in GHC.
2322
+ * Generate a layout semicolon after a newline if the indent is less than or equal to the current layout's indent, unless:
2323
+ *
2324
+ * - The current context doesn't use layout semicolons, which is the case for explicit brace layouts, tuple expressions,
2325
+ * the module header and multi-way if layouts.
2326
+ *
2327
+ * - `no_semi` was set because newline lookahead found an explicit semicolon in the next line, or this function was
2328
+ * executed before for the same newline.
2329
+ *
2330
+ * - `skip_semi` was set because the previous line ended with an explicit semicolon.
2244
2331
*/
2245
2332
static Symbol semicolon () {
2246
2333
if (
@@ -2519,7 +2606,7 @@ static Symbol newline_resume() {
2519
2606
* These are conditioned only on symbols and don't advance, except for `qq_body`, which cannot fail.
2520
2607
*/
2521
2608
static Symbol pre_ws_commands () {
2522
- SEQ (tuple_context ());
2609
+ SEQ (texp_context ());
2523
2610
SEQ (start_brace ());
2524
2611
SEQ (end_brace ());
2525
2612
// Leading whitespace must be included in the node.
0 commit comments