Lex multiline strings as multiple tokens.
This is because the structure pass needs to be able to tell where new lines start, and if the line number changes in the middle of a token then it'll get confused.
This commit is contained in:
parent
3fb0cf088f
commit
1f490e9f7f
22
LexOccam.x
22
LexOccam.x
|
@ -68,11 +68,15 @@ $vertSpace = [\r\n]
|
|||
|
||||
@identifier = [a-z A-Z] [a-z A-Z 0-9 \.]*
|
||||
|
||||
$escapeChar = [cnrts \" \' \* \n]
|
||||
@escape = \* ( $escapeChar | \# $hexDigit $hexDigit )
|
||||
@hexEscape = \# $hexDigit $hexDigit
|
||||
@escape = \* ( @hexEscape | [^\#\n] )
|
||||
|
||||
@stringLiteral = \" ( @escape | [^\"\*] )* \"
|
||||
@charLiteral = \' ( @escape | [^\'\*] ) \'
|
||||
@stringBody = ( @escape | [^\"\*] )*
|
||||
@fullString = \" @stringBody \"
|
||||
@startString = \" @stringBody \* \n
|
||||
@contString = \* @stringBody \* \n
|
||||
@endString = \* @stringBody \"
|
||||
|
||||
-- Note that occam number literals don't include their signs -- if you say
|
||||
-- "-3", then that's the operator "-" applied to the literal "3".
|
||||
|
@ -84,10 +88,10 @@ $escapeChar = [cnrts \" \' \* \n]
|
|||
|
||||
occam :-
|
||||
|
||||
-- This would all be very simple if it weren't for preprocessor instructions!
|
||||
-- In state 0, we're consuming the horizontal space at the start of a line.
|
||||
-- In state one, we're reading the first thing on a line.
|
||||
-- In state two, we're reading the rest of the line.
|
||||
-- In state three, we're in the middle of a multi-line string.
|
||||
|
||||
<0> $horizSpace* { mkState one }
|
||||
|
||||
|
@ -98,8 +102,13 @@ occam :-
|
|||
<one, two> @reserved { mkToken TokReserved two }
|
||||
<one, two> @identifier { mkToken TokIdentifier two }
|
||||
|
||||
<one, two> @stringLiteral { mkToken TokStringLiteral two }
|
||||
<one, two> @charLiteral { mkToken TokCharLiteral two }
|
||||
<one, two> @fullString { mkToken TokStringLiteral two }
|
||||
<one, two> @startString { mkToken TokStringCont three }
|
||||
|
||||
<three> $horizSpace+ { mkState three }
|
||||
<three> @contString { mkToken TokStringCont three }
|
||||
<three> @endString { mkToken TokStringLiteral two }
|
||||
|
||||
<one, two> @intLiteral { mkToken TokIntLiteral two }
|
||||
<one, two> @hexLiteral { mkToken TokHexLiteral two }
|
||||
|
@ -117,7 +126,8 @@ type Token = (Meta, TokenType)
|
|||
data TokenType =
|
||||
TokReserved String -- ^ A reserved word or symbol
|
||||
| TokIdentifier String
|
||||
| TokStringLiteral String
|
||||
| TokStringCont String -- ^ A continued string literal.
|
||||
| TokStringLiteral String -- ^ (The end of) a string literal.
|
||||
| TokCharLiteral String
|
||||
| TokIntLiteral String
|
||||
| TokHexLiteral String
|
||||
|
|
|
@ -62,6 +62,7 @@ structureOccam ts = analyse 1 firstLine ts (emptyMeta, EndOfLine)
|
|||
|
||||
isContinuation = case prevTok of
|
||||
(_, TokReserved s) -> s `elem` continuationWords
|
||||
(_, TokStringCont _) -> True
|
||||
_ -> False
|
||||
|
||||
-- A new line -- look to see what's going on with the indentation.
|
||||
|
|
|
@ -7,6 +7,9 @@ PROC P ()
|
|||
VAL BYTE cc IS '"':
|
||||
VAL BYTE ccx IS '*"':
|
||||
VAL BYTE ccc IS '*'':
|
||||
VAL []BYTE mls IS "first*
|
||||
*second*
|
||||
*third":
|
||||
VAL [5][5]BYTE square IS ["sator", "arepo", "tenas", "opera", "rotas"]:
|
||||
SKIP
|
||||
:
|
||||
|
|
Loading…
Reference in New Issue
Block a user