From 1f490e9f7f30b795b53e737187621281ae689904 Mon Sep 17 00:00:00 2001
From: Adam Sampson <ats@offog.org>
Date: Tue, 21 Aug 2007 20:01:46 +0000
Subject: [PATCH] Lex multiline strings as multiple tokens. This is because the
 structure pass needs to be able to tell where new lines start, and if the
 line number changes in the middle of a token then it'll get confused.

---
 LexOccam.x              | 22 ++++++++++++++++------
 StructureOccam.hs       |  1 +
 testcases/stringlit.occ |  3 +++
 3 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/LexOccam.x b/LexOccam.x
index 012704b..0a2b2c9 100644
--- a/LexOccam.x
+++ b/LexOccam.x
@@ -68,11 +68,15 @@ $vertSpace = [\r\n]
 
 @identifier = [a-z A-Z] [a-z A-Z 0-9 \.]*
 
-$escapeChar = [cnrts \" \' \* \n]
-@escape = \* ( $escapeChar | \# $hexDigit $hexDigit )
+@hexEscape = \# $hexDigit $hexDigit
+@escape = \* ( @hexEscape | [^\#\n] )
 
-@stringLiteral = \" ( @escape | [^\"\*] )* \"
 @charLiteral = \' ( @escape | [^\'\*] ) \'
+@stringBody = ( @escape | [^\"\*] )*
+@fullString = \" @stringBody \"
+@startString = \" @stringBody \* \n
+@contString = \* @stringBody \* \n
+@endString = \* @stringBody \"
 
 -- Note that occam number literals don't include their signs -- if you say
 -- "-3", then that's the operator "-" applied to the literal "3".
@@ -84,10 +88,10 @@ $escapeChar = [cnrts \" \' \* \n]
 
 occam :-
 
--- This would all be very simple if it weren't for preprocessor instructions!
 -- In state 0, we're consuming the horizontal space at the start of a line.
 -- In state one, we're reading the first thing on a line.
 -- In state two, we're reading the rest of the line.
+-- In state three, we're in the middle of a multi-line string.
 
 <0>           $horizSpace*   { mkState one }
 
@@ -98,8 +102,13 @@ occam :-
 <one, two>    @reserved      { mkToken TokReserved two }
 <one, two>    @identifier    { mkToken TokIdentifier two }
 
-<one, two>    @stringLiteral { mkToken TokStringLiteral two }
 <one, two>    @charLiteral   { mkToken TokCharLiteral two }
+<one, two>    @fullString    { mkToken TokStringLiteral two }
+<one, two>    @startString   { mkToken TokStringCont three }
+
+<three>       $horizSpace+   { mkState three }
+<three>       @contString    { mkToken TokStringCont three }
+<three>       @endString     { mkToken TokStringLiteral two }
 
 <one, two>    @intLiteral    { mkToken TokIntLiteral two }
 <one, two>    @hexLiteral    { mkToken TokHexLiteral two }
@@ -117,7 +126,8 @@ type Token = (Meta, TokenType)
 data TokenType =
   TokReserved String                   -- ^ A reserved word or symbol
   | TokIdentifier String
-  | TokStringLiteral String
+  | TokStringCont String               -- ^ Part of a string literal that continues on the next line.
+  | TokStringLiteral String            -- ^ A complete string literal, or the final part of a multi-line one.
   | TokCharLiteral String
   | TokIntLiteral String
   | TokHexLiteral String
diff --git a/StructureOccam.hs b/StructureOccam.hs
index 7e36167..881ebf4 100644
--- a/StructureOccam.hs
+++ b/StructureOccam.hs
@@ -62,6 +62,7 @@ structureOccam ts = analyse 1 firstLine ts (emptyMeta, EndOfLine)
 
         isContinuation = case prevTok of
                            (_, TokReserved s) -> s `elem` continuationWords
+                           (_, TokStringCont _) -> True
                            _ -> False
 
         -- A new line -- look to see what's going on with the indentation.
diff --git a/testcases/stringlit.occ b/testcases/stringlit.occ
index 431c0b8..a222bd6 100644
--- a/testcases/stringlit.occ
+++ b/testcases/stringlit.occ
@@ -7,6 +7,9 @@ PROC P ()
   VAL BYTE cc IS '"':
   VAL BYTE ccx IS '*"':
   VAL BYTE ccc IS '*'':
+  VAL []BYTE mls IS "first*
+                    *second*
+                    *third":
   VAL [5][5]BYTE square IS ["sator", "arepo", "tenas", "opera", "rotas"]:
   SKIP
 :