From 1bac142a532b33d4b0877b28fe29a578284db690 Mon Sep 17 00:00:00 2001
From: Adam Sampson <ats@offog.org>
Date: Tue, 21 Aug 2007 20:44:15 +0000
Subject: [PATCH] Rework the parser to use the new lexer. The occam parser is
 now a GenParser Token OccState, rather than a GenParser Char OccState, and a
 lot of now-redundant code has been removed. The parser is also somewhat
 faster, which wasn't intended but is nice anyway.

I've also modified the Rain parser to not rely on the old preprocessing code;
it wasn't appropriate for Rain's syntax anyway, so I assume Neil will be
replacing it eventually.
---
 CompState.hs            |  11 +-
 Indentation.hs          | 191 ---------------
 Main.hs                 |   9 +-
 Makefile                |   5 +-
 Parse.hs                | 530 ++++++++++++----------------------------
 PreprocessOccam.hs      |  61 +++--
 RainParse.hs            |  27 +-
 Utils.hs                |   4 +
 testcases/stringlit.occ |   2 +-
 9 files changed, 204 insertions(+), 636 deletions(-)
 delete mode 100644 Indentation.hs

diff --git a/CompState.hs b/CompState.hs
index d2d49b6..7e6d2ce 100644
--- a/CompState.hs
+++ b/CompState.hs
@@ -50,15 +50,10 @@ data CompState = CompState {
     csVerboseLevel :: Int,
     csOutputFile :: String,
 
-    -- Set by (new) preprocessor
+    -- Set by preprocessor
     csCurrentFile :: String,
     csUsedFiles :: Set.Set String,
 
-    -- Set by (old) preprocessor
-    csSourceFiles :: Map String String,
-    csIndentLinesIn :: [String],
-    csIndentLinesOut :: [String],
-
     -- Set by Parse
     csLocalNames :: [(String, A.Name)],
     csMainLocals :: [(String, A.Name)],
@@ -93,10 +88,6 @@ emptyState = CompState {
     csCurrentFile = "none",
     csUsedFiles = Set.empty,
 
-    csSourceFiles = Map.empty,
-    csIndentLinesIn = [],
-    csIndentLinesOut = [],
-
     csLocalNames = [],
     csMainLocals = [],
     csNames = Map.empty,
diff --git a/Indentation.hs b/Indentation.hs
deleted file mode 100644
index 770f612..0000000
--- a/Indentation.hs
+++ /dev/null
@@ -1,191 +0,0 @@
-{-
-Tock: a compiler for parallel languages
-Copyright (C) 2007  University of Kent
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the
-Free Software Foundation, either version 2 of the License, or (at your
-option) any later version.
-
-This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program.  If not, see <http://www.gnu.org/licenses/>.
--}
-
--- | Parse indentation in occam source.
-module Indentation (removeIndentation, indentMarker, outdentMarker, eolMarker) where
-
-import Control.Monad
-import Control.Monad.Error
-import Control.Monad.State
-import Data.List
-import Text.Regex
-
-import CompState
-import Errors
-import Pass
-
--- FIXME When this joins continuation lines, it should stash the details of
--- what it joined into CompState so that error reporting later on can
--- reconstruct the original position.
-
-indentMarker = "__indent"
-outdentMarker = "__outdent"
-eolMarker = "__eol"
-
--- FIXME: There's probably a nicer way of doing this.
--- (Well, trivially, use a WriterT...)
-
--- | Preprocess occam source code to remove comments and turn indentation into
--- explicit markers.
-removeIndentation :: String -> String -> PassM String
-removeIndentation filename orig
-    =  do modify $ (\ps -> ps { csIndentLinesIn = origLines,
-                                csIndentLinesOut = [] })
-          catchError (nextLine 0) reportError
-          ps <- get
-          let out = concat $ intersperse "\n" $ reverse $ csIndentLinesOut ps
-          modify $ (\ps -> ps { csIndentLinesIn = [],
-                                csIndentLinesOut = [] })
-          return out
-  where
-    origLines = lines orig
-
-    -- | When something goes wrong, figure out how far through the file we'd got.
-    reportError :: String -> PassM ()
-    reportError error
-        =  do ps <- get
-              let lineNumber = length origLines - length (csIndentLinesIn ps)
-              die $ filename ++ ":" ++ show lineNumber ++ ": " ++ error
-
-    -- | Get the next raw line from the input.
-    getLine :: PassM (Maybe String)
-    getLine
-        =  do ps <- get
-              case csIndentLinesIn ps of
-                [] -> return Nothing
-                (line:rest) ->
-                  do put $ ps { csIndentLinesIn = rest }
-                     return $ Just line
-
-    -- | Add a line to the output.
-    putLine :: String -> PassM ()
-    putLine line
-        = modify $ (\ps -> ps { csIndentLinesOut = line : csIndentLinesOut ps })
-
-    -- | Append to the *previous* line added.
-    addToLine :: String -> PassM ()
-    addToLine s
-        = modify $ (\ps -> ps { csIndentLinesOut =
-                                  case csIndentLinesOut ps of (l:ls) -> ((l ++ s):ls) })
-
-    -- | Given a line, read the rest of it, then return the complete thing.
-    finishLine :: String -> String -> Bool -> Bool -> String -> PassM String
-    finishLine left soFar inStr isChar afterStr
-        = case (left, inStr, isChar) of
-            ([], False, _) -> plainEOL
-            ('-':'-':cs, False, _) -> plainEOL
-            ([], True, _) -> die "end of line in string without continuation"
-            (['*'], True, _) -> stringEOL
-            ('\'':cs, False, _) -> finishLine cs (afterStr ++ ('\'':soFar)) True True ""
-            ('\'':cs, True, True) -> finishLine cs (afterStr ++ ('\'':soFar)) False False ""
-            ('"':cs, False, _) -> finishLine cs (afterStr ++ ('"':soFar)) True False ""
-            ('"':cs, True, False) -> finishLine cs (afterStr ++ ('"':soFar)) False False ""
-            ('*':'*':cs, True, _) -> finishLine cs ('*':'*':soFar) True isChar afterStr
-            ('*':'"':cs, True, _) -> finishLine cs ('"':'*':soFar) True isChar afterStr
-            ('*':'\'':cs, True, _) -> finishLine cs ('\'':'*':soFar) True isChar afterStr
-            (c:cs, _, _) -> finishLine cs (c:soFar) inStr isChar afterStr
-      where
-        -- | Finish a regular line.
-        plainEOL :: PassM String
-        plainEOL
-            =  do let s = reverse soFar
-                  if hasContinuation s
-                    then do l <- getLine >>= checkJust "no continuation line"
-                            finishLine l ('\n':soFar) False False ""
-                    else return s
-
-        -- | Finish a line where we're in the middle of a string.
-        stringEOL :: PassM String
-        stringEOL
-            =  do l <- getLine >>= checkJust "no string continuation line"
-                  l' <- contStringStart l
-                  -- When we hit the end of the string, add a \n after it to
-                  -- make the line numbers match up again.
-                  finishLine l' soFar True isChar ('\n':afterStr)
-
-    -- | Does a line have a continuation line following it?
-    hasContinuation :: String -> Bool
-    hasContinuation s
-        = case matchRegex contRE s of
-            Just _ -> True
-            Nothing -> False
-      where
-        -- FIXME This should probably be based on the list of operators and
-        -- reserved words that the parser already has; for now this is the
-        -- regexp that occamdoc uses.
-        contRE = mkRegexWithOpts "(-|~|\\+|-|\\*|/|\\\\|/\\\\|\\\\/|><|=|<>|<|>|>=|<=|,|;|:=|<<|>>|([[:space:]](MINUS|BITNOT|NOT|SIZE|REM|PLUS|MINUS|TIMES|BITAND|BITOR|AND|OR|AFTER|FROM|FOR|IS|RETYPES|RESHAPES)))[[:space:]]*$" False True
-
-    -- | Strip the spaces-then-star beginning off a string continuation line.
-    contStringStart :: String -> PassM String
-    contStringStart (' ':cs) = contStringStart cs
-    contStringStart ('*':cs) = return cs
-    contStringStart _ = die "string continuation line doesn't start with *"
-
-    -- | Get the next *complete* line from the input, resolving continuations.
-    readLine :: PassM (Maybe String)
-    readLine
-        =  do line <- getLine
-              case line of
-                Just s ->
-                  do r <- finishLine s "" False False ""
-                     return $ Just r
-                Nothing -> return Nothing
-
-    -- | Compute the indentation level of a line, and return it without the indentation.
-    countIndent :: String -> Int -> PassM (Int, String)
-    -- Tabs are 8 spaces.
-    countIndent ('\t':cs) soFar = countIndent cs (soFar + 4)
-    countIndent (' ':' ':cs) soFar = countIndent cs (soFar + 1)
-    countIndent [' '] soFar = return (soFar, [])
-    countIndent (' ':_) soFar
-        = die "bad indentation (odd number of spaces)"
-    countIndent cs soFar = return (soFar, cs)
-
-    -- | Repeat a string N times.
-    rep :: Int -> String -> String
-    rep n s = concat $ take n (repeat s)
-
-    -- | Process the next line from the input.
-    nextLine :: Int -> PassM ()
-    nextLine level
-        =  do l <- readLine
-              case l of
-                Nothing -> return ()
-                Just line ->
-                  do (newLevel, stripped) <- countIndent line 0
-                     addLine level newLevel line stripped
-
-    -- | Once a line's been retrieved, add it to the output along with the
-    -- appropriate markers, then go and process the next one.
-    addLine :: Int -> Int -> String -> String -> PassM ()
-    addLine level newLevel line stripped
-      | stripped == "" =
-        do putLine ""
-           nextLine level
-      | newLevel > level =
-        do addToLine $ rep (newLevel - level) (" " ++ indentMarker)
-           putLine $ line ++ " " ++ eolMarker
-           nextLine newLevel
-      | newLevel < level =
-        do addToLine $ rep (level - newLevel) (" " ++ outdentMarker)
-           putLine $ line ++ " " ++ eolMarker
-           nextLine newLevel
-      | otherwise =
-        do putLine $ line ++ " " ++ eolMarker
-           nextLine level
-
diff --git a/Main.hs b/Main.hs
index 7a718f4..2263402 100644
--- a/Main.hs
+++ b/Main.hs
@@ -17,7 +17,7 @@ with this program.  If not, see <http://www.gnu.org/licenses/>.
 -}
 
 -- | Driver for the compiler.
-module Main where
+module Main (main) where
 
 import Control.Monad
 import Control.Monad.Error
@@ -34,6 +34,7 @@ import GenerateC
 import GenerateCPPCSP
 import Parse
 import Pass
+import PreprocessOccam
 import PrettyShow
 import RainParse
 import RainPasses
@@ -141,14 +142,10 @@ compile :: String -> PassM ()
 compile fn
   =  do optsPS <- get
 
-        debug "{{{ Preprocess"
-        loadSource fn
-        debug "}}}"
-
         debug "{{{ Parse"
         progress "Parse"
         ast1 <- case csFrontend optsPS of
-          FrontendOccam -> parseProgram fn
+          FrontendOccam -> preprocessOccamProgram fn >>= parseOccamProgram
           FrontendRain -> parseRainProgram fn
         debugAST ast1
         debug "}}}"
diff --git a/Makefile b/Makefile
index 7657a4f..c16e7b2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-targets = tock tocktest lextest
+targets = tock tocktest
 
 all: $(targets)
 
@@ -20,9 +20,6 @@ tock: $(sources)
 tocktest: $(sources)
 	ghc $(ghc_opts) -o tocktest -main-is TestMain --make TestMain
 
-lextest: $(sources)
-	ghc $(ghc_opts) -o lextest -main-is PreprocessOccam --make PreprocessOccam
-
 CFLAGS = \
 	-O2 \
 	-g -Wall \
diff --git a/Parse.hs b/Parse.hs
index c05e945..f14daf3 100644
--- a/Parse.hs
+++ b/Parse.hs
@@ -17,7 +17,7 @@ with this program.  If not, see <http://www.gnu.org/licenses/>.
 -}
 
 -- | Parse occam code into an AST.
-module Parse where
+module Parse (parseOccamProgram) where
 
 import Control.Monad (liftM, when)
 import Control.Monad.Error (runErrorT)
@@ -26,26 +26,23 @@ import Data.List
 import qualified Data.Map as Map
 import Data.Maybe
 import Debug.Trace
-import qualified IO
 import Text.ParserCombinators.Parsec
-import Text.ParserCombinators.Parsec.Language (emptyDef)
-import qualified Text.ParserCombinators.Parsec.Token as P
-import Text.Regex
+import Text.ParserCombinators.Parsec.Pos (newPos)
 
 import qualified AST as A
 import CompState
 import Errors
 import EvalConstants
 import EvalLiterals
-import Indentation
 import Intrinsics
+import LexOccam
 import Metadata
 import Pass
 import Types
 import Utils
 
---{{{ setup stuff for Parsec
-type OccParser = GenParser Char CompState
+--{{{ the parser monad
+type OccParser = GenParser Token CompState
 
 -- | Make MonadState functions work in the parser monad.
 -- This came from <http://hackage.haskell.org/trac/ghc/ticket/1274> -- which means
@@ -56,145 +53,48 @@ instance MonadState st (GenParser tok st) where
 
 instance Die (GenParser tok st) where
   die = fail
-
-occamStyle
-    = emptyDef
-    { P.commentLine    = "--"
-    , P.nestedComments = False
-    , P.identStart     = letter
-    , P.identLetter    = alphaNum <|> char '.'
-    , P.opStart        = oneOf "+-*/\\>=<~"
-    , P.opLetter       = oneOf "/\\>=<"
-    , P.reservedOpNames= [
-                          "+",
-                          "-",
-                          "*",
-                          "/",
-                          "\\",
-                          "/\\",
-                          "\\/",
-                          "><",
-                          "<<",
-                          ">>",
-                          "=",
-                          "<>",
-                          "<",
-                          ">",
-                          ">=",
-                          "<=",
-                          "-",
-                          "~"
-                         ]
-    , P.reservedNames  = [
-                          "AFTER",
-                          "ALT",
-                          "AND",
-                          "ANY",
-                          "AT",
-                          "BITAND",
-                          "BITNOT",
-                          "BITOR",
-                          "BOOL",
-                          "BYTE",
-                          "BYTESIN",
-                          "CASE",
-                          "CHAN",
-                          "DATA",
-                          "ELSE",
-                          "FALSE",
-                          "FOR",
-                          "FROM",
-                          "FUNCTION",
-                          "IF",
-                          "IN",
-                          "INLINE",
-                          "INT",
-                          "INT16",
-                          "INT32",
-                          "INT64",
-                          "IS",
-                          "MINUS",
-                          "MOSTNEG",
-                          "MOSTPOS",
-                          "NOT",
-                          "OF",
-                          "OFFSETOF",
-                          "OR",
-                          "PACKED",
-                          "PAR",
-                          "PLACE",
-                          "PLACED",
-                          "PLUS",
-                          "PORT",
-                          "PRI",
-                          "PROC",
-                          "PROCESSOR",
-                          "PROTOCOL",
-                          "REAL32",
-                          "REAL64",
-                          "RECORD",
-                          "REM",
-                          "RESHAPES",
-                          "RESULT",
-                          "RETYPES",
-                          "ROUND",
-                          "SEQ",
-                          "SIZE",
-                          "SKIP",
-                          "STOP",
-                          "TIMER",
-                          "TIMES",
-                          "TRUE",
-                          "TRUNC",
-                          "TYPE",
-                          "VAL",
-                          "VALOF",
-                          "WHILE",
-                          "WORKSPACE",
-                          "VECSPACE",
-                          "#INCLUDE",
-                          "#USE",
-                          indentMarker,
-                          outdentMarker,
-                          eolMarker,
-                          mainMarker
-                         ]
-    , P.caseSensitive  = True
-    }
-
-lexer :: P.TokenParser CompState
-lexer  = P.makeTokenParser occamStyle
-
--- XXX replace whitespace with something that doesn't eat \ns
-
-whiteSpace = P.whiteSpace lexer
-lexeme = P.lexeme lexer
-symbol = P.symbol lexer
-natural = P.natural lexer
-parens = P.parens lexer
-semi = P.semi lexer
-identifier = P.identifier lexer
-reserved = P.reserved lexer
-reservedOp = P.reservedOp lexer
 --}}}
 
+--{{{ matching rules for raw tokens
+-- | Extract source position from a `Token`.
+tokenPos :: Token -> SourcePos
+tokenPos (m, _) = metaToSourcePos m
+
+genToken :: (Token -> Maybe a) -> OccParser a
+genToken test = token show tokenPos test
+
+reserved :: String -> OccParser ()
+reserved name = genToken test
+  where
+    test (_, TokReserved name')
+        = if name' == name then Just () else Nothing
+    test _ = Nothing
+
+identifier :: OccParser String
+identifier = genToken test
+  where
+    test (_, TokIdentifier s) = Just s
+    test _ = Nothing
+
+plainToken :: TokenType -> OccParser ()
+plainToken t = genToken test
+  where
+    test (_, t') = if t == t' then Just () else Nothing
+--}}}
 --{{{ symbols
-sLeft = try $ symbol "["
-sRight = try $ symbol "]"
-sLeftR = try $ symbol "("
-sRightR = try $ symbol ")"
-sAssign = try $ symbol ":="
-sColon = try $ symbol ":"
-sColons = try $ symbol "::"
-sComma = try $ symbol ","
-sSemi = try $ symbol ";"
-sAmp = try $ symbol "&"
-sQuest = try $ symbol "?"
-sBang = try $ symbol "!"
-sEq = try $ symbol "="
-sApos = try $ symbol "'"
-sQuote = try $ symbol "\""
-sHash = try $ symbol "#"
+sAmp = reserved "&"
+sAssign = reserved ":="
+sBang = reserved "!"
+sColon = reserved ":"
+sColons = reserved "::"
+sComma = reserved ","
+sEq = reserved "="
+sLeft = reserved "["
+sLeftR = reserved "("
+sQuest = reserved "?"
+sRight = reserved "]"
+sRightR = reserved ")"
+sSemi = reserved ";"
 --}}}
 --{{{ keywords
 sAFTER = reserved "AFTER"
@@ -263,29 +163,40 @@ sVALOF = reserved "VALOF"
 sWHILE = reserved "WHILE"
 sWORKSPACE = reserved "WORKSPACE"
 sVECSPACE = reserved "VECSPACE"
-sppINCLUDE = reserved "#INCLUDE"
-sppUSE = reserved "#USE"
 --}}}
 --{{{ markers inserted by the preprocessor
--- XXX could handle VALOF by translating each step to one { and matching multiple ones?
-mainMarker = "__main"
-
-sMainMarker = do { whiteSpace; reserved mainMarker } <?> "end of input (top-level process)"
-
-indent = do { whiteSpace; reserved indentMarker } <?> "indentation increase"
-outdent = do { whiteSpace; reserved outdentMarker } <?> "indentation decrease"
-eol = do { whiteSpace; reserved eolMarker } <?> "end of line"
+indent = do { plainToken Indent } <?> "indentation increase"
+outdent = do { plainToken Outdent } <?> "indentation decrease"
+eol = do { plainToken EndOfLine } <?> "end of line"
 --}}}
 
 --{{{ helper functions
 md :: OccParser Meta
 md
   =  do pos <- getPosition
-        return Meta {
-                 metaFile = Just $ sourceName pos,
-                 metaLine = sourceLine pos,
-                 metaColumn = sourceColumn pos
-               }
+        return $ sourcePosToMeta pos
+
+--{{{ Meta to/from SourcePos
+-- | Convert source position into Parsec's format.
+metaToSourcePos :: Meta -> SourcePos
+metaToSourcePos meta
+    = newPos filename (metaLine meta) (metaColumn meta)
+  where
+    filename = case metaFile meta of
+                 Just s -> s
+                 Nothing -> ""
+
+-- | Convert source position out of Parsec's format.
+sourcePosToMeta :: SourcePos -> Meta
+sourcePosToMeta pos
+    = emptyMeta {
+        metaFile = case sourceName pos of
+                     "" -> Nothing
+                     s -> Just s,
+        metaLine = sourceLine pos,
+        metaColumn = sourceColumn pos
+      }
+--}}}
 
 --{{{ try*
 -- These functions let you try a sequence of productions and only retrieve the
@@ -335,12 +246,6 @@ tryXVVX a b c d = try (do { a; bv <- b; cv <- c; d; return (bv, cv) })
 tryVXXV :: OccParser a -> OccParser b -> OccParser c -> OccParser d -> OccParser (a, d)
 tryVXXV a b c d = try (do { av <- a; b; c; dv <- d; return (av, dv) })
 
-tryVXVX :: OccParser a -> OccParser b -> OccParser c -> OccParser d -> OccParser (a, c)
-tryVXVX a b c d = try (do { av <- a; b; cv <- c; d; return (av, cv) })
-
-tryVVXX :: OccParser a -> OccParser b -> OccParser c -> OccParser d -> OccParser (a, b)
-tryVVXX a b c d = try (do { av <- a; bv <- b; c; d; return (av, bv) })
-
 tryVVXV :: OccParser a -> OccParser b -> OccParser c -> OccParser d -> OccParser (a, b, d)
 tryVVXV a b c d = try (do { av <- a; bv <- b; c; dv <- d; return (av, bv, dv) })
 
@@ -431,15 +336,6 @@ handleSpecs specs inner specMarker
            mapM scopeOutSpec ss'
            return $ foldl (\e s -> specMarker m s e) v ss'
 
--- | Like sepBy1, but not eager: it won't consume the separator unless it finds
--- another item after it.
-sepBy1NE :: OccParser a -> OccParser b -> OccParser [a]
-sepBy1NE item sep
-    =  do i <- item
-          rest <- option [] $ try (do sep
-                                      sepBy1NE item sep)
-          return $ i : rest
-
 -- | Run several different parsers with a separator between them.
 -- If you give it [a, b, c] and s, it'll parse [a, s, b, s, c] then
 -- give you back the results from [a, b, c].
@@ -803,42 +699,31 @@ untypedLiteral
 real :: OccParser A.LiteralRepr
 real
     =   do m <- md
-           (l, r) <- tryVXVX digits (char '.') digits (char 'E')
-           e <- lexeme occamExponent
-           return $ A.RealLiteral m (l ++ "." ++ r ++ "E" ++ e)
-    <|> do m <- md
-           l <- tryVX digits (char '.')
-           r <- lexeme digits
-           return $ A.RealLiteral m (l ++ "." ++ r)
+           genToken (test m)
     <?> "real literal"
-
-occamExponent :: OccParser String
-occamExponent
-    =  do c <- oneOf "+-"
-          d <- digits
-          return $ c : d
-    <?> "exponent"
+  where
+    test m (_, TokRealLiteral s) = Just $ A.RealLiteral m s
+    test _ _ = Nothing
 
 integer :: OccParser A.LiteralRepr
 integer
     =  do m <- md
-          do { d <- lexeme digits; return $ A.IntLiteral m d }
-            <|> do { d <- lexeme (sHash >> many1 hexDigit); return $ A.HexLiteral m d }
+          genToken (test m)
     <?> "integer literal"
-
-digits :: OccParser String
-digits
-    =   many1 digit
-    <?> "decimal digits"
+  where
+    test m (_, TokIntLiteral s) = Just $ A.IntLiteral m s
+    test m (_, TokHexLiteral s) = Just $ A.HexLiteral m (drop 1 s)
+    test _ _ = Nothing
 
 byte :: OccParser A.LiteralRepr
 byte
     =  do m <- md
-          char '\''
-          c <- literalCharacter
-          sApos
-          return c
+          genToken (test m)
     <?> "byte literal"
+  where
+    test m (_, TokCharLiteral s)
+        = case splitStringLiteral m (chop 1 1 s) of [lr] -> Just lr
+    test _ _ = Nothing
 
 -- | Parse a table -- an array literal which might be subscripted or sliced.
 -- (The implication of this is that the type of the expression this parses
@@ -884,29 +769,45 @@ tableElems
 stringLiteral :: OccParser (A.LiteralRepr, A.Dimension)
 stringLiteral
     =  do m <- md
-          char '"'
-          cs <- manyTill literalCharacter sQuote
-          let aes = [A.ArrayElemExpr $ A.Literal m A.Byte c | c <- cs]
+          cs <- stringCont <|> stringLit
+          let aes = [A.ArrayElemExpr $ A.Literal m' A.Byte c
+                     | c@(A.ByteLiteral m' _) <- cs]
           return (A.ArrayLiteral m aes, A.Dimension $ length cs)
     <?> "string literal"
+  where
+    stringCont :: OccParser [A.LiteralRepr]
+    stringCont
+        =  do m <- md
+              s <- genToken test
+              rest <- stringCont <|> stringLit
+              return $ (splitStringLiteral m s) ++ rest
+      where
+        test (_, TokStringCont s) = Just (chop 1 2 s)
+        test _ = Nothing
 
-character :: OccParser String
-character
-    =   do char '*'
-           do char '#'
-              a <- hexDigit
-              b <- hexDigit
-              return $ ['*', '#', a, b]
-            <|> do { c <- anyChar; return ['*', c] }
-    <|> do c <- anyChar
-           return [c]
-    <?> "character"
+    stringLit :: OccParser [A.LiteralRepr]
+    stringLit
+        =  do m <- md
+              s <- genToken test
+              return $ splitStringLiteral m s
+      where
+        test (_, TokStringLiteral s) = Just (chop 1 1 s)
+        test _ = Nothing
 
-literalCharacter :: OccParser A.LiteralRepr
-literalCharacter
-    =   do m <- md
-           c <- character
-           return $ A.ByteLiteral m c
+-- | Parse a string literal.
+-- FIXME: This should decode the occam escapes.
+splitStringLiteral :: Meta -> String -> [A.LiteralRepr]
+splitStringLiteral m cs = ssl cs
+  where
+    ssl [] = []
+    ssl ('*':'#':a:b:cs)
+        = (A.ByteLiteral m ['*', '#', a, b]) : ssl cs
+    ssl ('*':'\n':cs)
+        = (A.ByteLiteral m $ tail $ dropWhile (/= '*') cs) : ssl cs
+    ssl ('*':c:cs)
+        = (A.ByteLiteral m ['*', c]) : ssl cs
+    ssl (c:cs)
+        = (A.ByteLiteral m [c]) : ssl cs
 --}}}
 --{{{ expressions
 expressionList :: [A.Type] -> OccParser A.ExpressionList
@@ -1071,42 +972,42 @@ intrinsicFunctionSingle
 
 monadicOperator :: OccParser A.MonadicOp
 monadicOperator
-    =   do { reservedOp "-" <|> sMINUS; return A.MonadicSubtr }
-    <|> do { reservedOp "~" <|> sBITNOT; return A.MonadicBitNot }
+    =   do { reserved "-" <|> sMINUS; return A.MonadicSubtr }
+    <|> do { reserved "~" <|> sBITNOT; return A.MonadicBitNot }
     <|> do { sNOT; return A.MonadicNot }
     <?> "monadic operator"
 
 dyadicOperator :: OccParser A.DyadicOp
 dyadicOperator
-    =   do { reservedOp "+"; return A.Add }
-    <|> do { reservedOp "-"; return A.Subtr }
-    <|> do { reservedOp "*"; return A.Mul }
-    <|> do { reservedOp "/"; return A.Div }
-    <|> do { reservedOp "\\"; return A.Rem }
+    =   do { reserved "+"; return A.Add }
+    <|> do { reserved "-"; return A.Subtr }
+    <|> do { reserved "*"; return A.Mul }
+    <|> do { reserved "/"; return A.Div }
+    <|> do { reserved "\\"; return A.Rem }
     <|> do { sREM; return A.Rem }
     <|> do { sMINUS; return A.Minus }
-    <|> do { reservedOp "/\\" <|> sBITAND; return A.BitAnd }
-    <|> do { reservedOp "\\/" <|> sBITOR; return A.BitOr }
-    <|> do { reservedOp "><"; return A.BitXor }
+    <|> do { reserved "/\\" <|> sBITAND; return A.BitAnd }
+    <|> do { reserved "\\/" <|> sBITOR; return A.BitOr }
+    <|> do { reserved "><"; return A.BitXor }
     <?> "dyadic operator"
 
 -- These always need an INT on their right-hand side.
 shiftOperator :: OccParser A.DyadicOp
 shiftOperator
-    =   do { reservedOp "<<"; return A.LeftShift }
-    <|> do { reservedOp ">>"; return A.RightShift }
+    =   do { reserved "<<"; return A.LeftShift }
+    <|> do { reserved ">>"; return A.RightShift }
     <?> "shift operator"
 
 -- These always return a BOOL, so we have to deal with them specially for type
 -- context.
 comparisonOperator :: OccParser A.DyadicOp
 comparisonOperator
-    =   do { reservedOp "="; return A.Eq }
-    <|> do { reservedOp "<>"; return A.NotEq }
-    <|> do { reservedOp "<"; return A.Less }
-    <|> do { reservedOp ">"; return A.More }
-    <|> do { reservedOp "<="; return A.LessEq }
-    <|> do { reservedOp ">="; return A.MoreEq }
+    =   do { reserved "="; return A.Eq }
+    <|> do { reserved "<>"; return A.NotEq }
+    <|> do { reserved "<"; return A.Less }
+    <|> do { reserved ">"; return A.More }
+    <|> do { reserved "<="; return A.LessEq }
+    <|> do { reserved ">="; return A.MoreEq }
     <|> do { sAFTER; return A.After }
     <?> "comparison operator"
 
@@ -1617,7 +1518,6 @@ process
     <|> mainProcess
     <|> handleSpecs (allocation <|> specification) process
                     (\m s p -> A.Seq m (A.Spec m s (A.OnlyP m p)))
-    <|> preprocessorDirective
     <?> "process"
 
 --{{{ assignment (:=)
@@ -2021,67 +1921,11 @@ intrinsicProc
           return $ A.IntrinsicProcCall m s as
     <?> "intrinsic PROC instance"
 --}}}
---{{{ preprocessor directives
-preprocessorDirective :: OccParser A.Process
-preprocessorDirective
-    =   ppInclude
-    <|> ppUse
-    <|> unknownPP
-    <?> "preprocessor directive"
-
-ppInclude :: OccParser A.Process
-ppInclude
-    =  do sppINCLUDE
-          char '"'
-          file <- manyTill character sQuote
-          eol
-          includeFile $ concat file
-    <?> "#INCLUDE directive"
-
-ppUse :: OccParser A.Process
-ppUse
-    =  do sppUSE
-          char '"'
-          mod <- manyTill character sQuote
-          eol
-          let file = mangleModName $ concat mod
-
-          -- Check whether it's been included already.
-          ps <- getState
-          if file `elem` csLoadedFiles ps
-            then process
-            else includeFile file
-    <?> "#USE directive"
-
--- | Invoke the parser recursively to handle an included file.
-includeFile :: String -> OccParser A.Process
-includeFile file
-    =  do ps <- getState
-          (r, ps') <- parseFile file includedFile ps
-          case r of
-            Left p ->
-              do setState ps'
-                 return p
-            Right f ->
-              do setState ps' { csLocalNames = csMainLocals ps' }
-                 p <- process
-                 return $ f p
-
-unknownPP :: OccParser A.Process
-unknownPP
-    =  do m <- md
-          char '#'
-          rest <- manyTill anyChar (try eol)
-          addWarning m $ "unknown preprocessor directive ignored: " ++ rest
-          process
-    <?> "unknown preprocessor directive"
---}}}
 --{{{ main process
 mainProcess :: OccParser A.Process
 mainProcess
     =  do m <- md
-          sMainMarker
-          eol
+          eof
           -- Stash the current locals so that we can either restore them
           -- when we get back to the file we included this one from, or
           -- pull the TLP name from them at the end.
@@ -2096,100 +1940,26 @@ mainProcess
 -- have the earlier ones in scope, so we can't parse them separately.
 sourceFile :: OccParser (A.Process, CompState)
 sourceFile
-    =   do whiteSpace
-           p <- process
+    =   do p <- process
            s <- getState
            return (p, s)
-
--- | An included file is either a process, or a bunch of specs that can be
--- applied to a process (which we return as a function). This is likewise a bit
--- of a cheat, in that included files should really be *textually* included,
--- but it's good enough for most reasonable uses.
-includedFile :: OccParser (Either A.Process (A.Process -> A.Process), CompState)
-includedFile
-    =  do whiteSpace
-          p <- process
-          s <- getState
-          do eof
-             return $ (Right $ replaceMain p, s)
-           <|> do sMainMarker
-                  eol
-                  return $ (Left p, s)
-  where
-    replaceMain :: A.Process -> A.Process -> A.Process
-    replaceMain (A.Seq m (A.Spec m' s (A.OnlyP m'' p))) np
-        = A.Seq m (A.Spec m' s (A.OnlyP m'' (replaceMain p np)))
-    replaceMain (A.Main _) np = np
 --}}}
 --}}}
 
---{{{  preprocessor
--- XXX Doesn't handle conditionals.
-
--- | Find (via a nasty regex search) all the files that this source file includes.
-preFindIncludes :: String -> [String]
-preFindIncludes source
-    = concat [case matchRegex incRE l of
-                Just [_, fn] -> [fn]
-                Nothing -> []
-              | l <- lines source]
-  where
-    incRE = mkRegex "^ *#(INCLUDE|USE) +\"([^\"]*)\""
-
--- | If a module name doesn't already have a suffix, add one.
-mangleModName :: String -> String
-mangleModName mod
-    = if ".occ" `isSuffixOf` mod || ".inc" `isSuffixOf` mod
-        then mod
-        else mod ++ ".occ"
-
--- | Load all the source files necessary for a program.
--- We have to do this now, before entering the parser, because the parser
--- doesn't run in the IO monad. If there were a monad transformer version of
--- Parsec then we could just open files as we need them.
-loadSource :: String -> PassM ()
-loadSource file = load file file
-  where
-    load :: String -> String -> PassM ()
-    load file realName
-        =  do ps <- get
-              case Map.lookup file (csSourceFiles ps) of
-                Just _ -> return ()
-                Nothing ->
-                  do progress $ "Loading source file " ++ realName
-                     rawSource <- liftIO $ readFile realName
-                     source <- removeIndentation realName (rawSource ++ "\n" ++ mainMarker ++ "\n")
-                     debug $ "Preprocessed source:"
-                     debug $ numberLines source
-                     modify $ (\ps -> ps { csSourceFiles = Map.insert file source (csSourceFiles ps) })
-                     let deps = map mangleModName $ preFindIncludes source
-                     sequence_ [load dep (joinPath realName dep) | dep <- deps]
---}}}
-
 --{{{  entry points for the parser itself
--- | Test a parser production (for use from ghci while debugging the parser).
-testParse :: Show a => OccParser a -> String -> IO ()
-testParse prod text
-    = do let r = runParser prod emptyState "" text
-         putStrLn $ "Result: " ++ show r
-
--- | Parse a file with the given production.
-parseFile :: Monad m => String -> OccParser t -> CompState -> m t
-parseFile file prod ps
-    =  do let source = case Map.lookup file (csSourceFiles ps) of
-                         Just s -> s
-                         Nothing -> dieIO $ "Failed to preload file: " ++ show file
-          let ps' = ps { csLoadedFiles = file : csLoadedFiles ps }
-          case runParser prod ps' file source of
-            Left err -> dieIO $ "Parse error: " ++ show err
+-- | Parse a token stream with the given production.
+runTockParser :: [Token] -> OccParser t -> CompState -> PassM t
+runTockParser toks prod cs
+    =  do case runParser prod cs "irrelevant filename" toks of
+            Left err -> die $ "Parse error: " ++ show err
             Right r -> return r
 
--- | Parse the top level source file in a program.
-parseProgram :: String -> PassM A.Process
-parseProgram file
-    =  do ps <- get
-          (p, ps') <- parseFile file sourceFile ps
-          put ps'
+-- | Parse an occam program.
+parseOccamProgram :: [Token] -> PassM A.Process
+parseOccamProgram toks
+    =  do cs <- get
+          (p, cs') <- runTockParser toks sourceFile cs
+          put cs'
           return p
 --}}}
 
diff --git a/PreprocessOccam.hs b/PreprocessOccam.hs
index 7342f33..02e5e21 100644
--- a/PreprocessOccam.hs
+++ b/PreprocessOccam.hs
@@ -17,26 +17,23 @@ with this program.  If not, see <http://www.gnu.org/licenses/>.
 -}
 
 -- | Preprocess occam code.
-module PreprocessOccam where
+module PreprocessOccam (preprocessOccamProgram) where
 
+import Control.Monad.State
 import Data.List
 import qualified Data.Set as Set
+import System.IO
 import Text.Regex
 
+import CompState
 import Errors
 import LexOccam
 import Metadata
 import Pass
+import PrettyShow
 import StructureOccam
 import Utils
 
-import CompState
-import Control.Monad.Error
-import Control.Monad.State
-import System
-import System.IO
-import PrettyShow
-
 -- | Open an included file, looking for it in the search path.
 -- Return the open filehandle and the location of the file.
 -- FIXME: This doesn't actually look at the search path yet.
@@ -59,21 +56,38 @@ searchFile m filename
 preprocessFile :: Meta -> String -> PassM [Token]
 preprocessFile m filename
     =  do (handle, realFilename) <- searchFile m filename
-          liftIO $ putStrLn $ "Loading " ++ realFilename
+          progress $ "Loading source file " ++ realFilename
           origCS <- get
           modify (\cs -> cs { csCurrentFile = realFilename })
           s <- liftIO $ hGetContents handle
-          toks <- runLexer realFilename s >>= structureOccam >>= preprocessOccam
+          toks <- runLexer realFilename s
+          veryDebug $ "{{{ lexer tokens"
+          veryDebug $ pshow toks
+          veryDebug $ "}}}"
+          toks' <- structureOccam toks
+          veryDebug $ "{{{ structured tokens"
+          veryDebug $ pshow toks'
+          veryDebug $ "}}}"
+          toks'' <- preprocessOccam toks'
+          veryDebug $ "{{{ preprocessed tokens"
+          veryDebug $ pshow toks''
+          veryDebug $ "}}}"
           modify (\cs -> cs { csCurrentFile = csCurrentFile origCS })
-          return toks
+          return toks''
 
 -- | Preprocess a token stream.
 preprocessOccam :: [Token] -> PassM [Token]
 preprocessOccam [] = return []
-preprocessOccam ((m, TokPreprocessor ('#':s)):(_, EndOfLine):ts)
-    =  do func <- handleDirective m s
+preprocessOccam ((m, TokPreprocessor s):(_, EndOfLine):ts)
+    =  do func <- handleDirective m (stripPrefix s)
           rest <- preprocessOccam ts
           return $ func rest
+  where
+    stripPrefix :: String -> String
+    stripPrefix (' ':cs) = stripPrefix cs
+    stripPrefix ('\t':cs) = stripPrefix cs
+    stripPrefix ('#':cs) = cs
+    stripPrefix _ = error "bad TokPreprocessor prefix"
 -- Check the above case didn't miss something.
 preprocessOccam ((_, TokPreprocessor _):_)
     = error "bad TokPreprocessor token"
@@ -134,17 +148,12 @@ handleUse m [modName]
             else mod ++ ".occ"
 --}}}
 
--- | Main function for testing.
-main :: IO ()
-main
-    =  do (arg:_) <- getArgs
-          v <- evalStateT (runErrorT (test arg)) emptyState
-          case v of
-            Left e -> dieIO e
-            Right r -> return ()
-  where
-    test :: String -> PassM ()
-    test fn
-        = do tokens <- preprocessFile emptyMeta fn
-             liftIO $ putStrLn $ pshow tokens
+-- | Load and preprocess an occam program.
+preprocessOccamProgram :: String -> PassM [Token]
+preprocessOccamProgram filename
+    =  do toks <- preprocessFile emptyMeta filename
+          veryDebug $ "{{{ tokenised source"
+          veryDebug $ pshow toks
+          veryDebug $ "}}}"
+          return toks
 
diff --git a/RainParse.hs b/RainParse.hs
index bd841ce..7fa2002 100644
--- a/RainParse.hs
+++ b/RainParse.hs
@@ -43,7 +43,6 @@ import CompState
 import Errors
 import EvalConstants
 import EvalLiterals
-import Indentation
 import Intrinsics
 import Metadata
 import Pass
@@ -296,21 +295,13 @@ rainSourceFile
            s <- getState
            return (A.Seq emptyMeta p, s)
 
--- | Parse a file with the given production.
--- This is copied from Parse.hs (because OccParser is about to be changed to not be the same as RainParser):
-parseFile :: Monad m => String -> RainParser t -> CompState -> m t
-parseFile file prod ps
-    =  do let source = case Map.lookup file (csSourceFiles ps) of
-                         Just s -> s
-                         Nothing -> dieIO $ "Failed to preload file: " ++ show file
-          let ps' = ps { csLoadedFiles = file : csLoadedFiles ps }
-          case runParser prod ps' file source of
-            Left err -> dieIO $ "Parse error: " ++ show err
-            Right r -> return r
-
+-- | Load and parse a Rain source file.
 parseRainProgram :: String -> PassM A.Process
-parseRainProgram file
-    =  do ps <- get
-          (p, ps') <- parseFile file rainSourceFile ps
-          put ps'
-          return p
+parseRainProgram filename
+    =  do source <- liftIO $ readFile filename
+          cs <- get
+          case runParser rainSourceFile cs filename source of
+            Left err -> dieIO $ "Parse error: " ++ show err
+            Right (p, cs') ->
+              do put cs'
+                 return p
diff --git a/Utils.hs b/Utils.hs
index dc81930..2d45835 100644
--- a/Utils.hs
+++ b/Utils.hs
@@ -66,3 +66,7 @@ transformEither funcLeft funcRight x = case x of
 maybeIO :: IO a -> IO (Maybe a)
 maybeIO op = catch (op >>= (return . Just)) (\e -> return Nothing)
 
+-- | Remove a number of items from the start and end of a list.
+chop :: Int -> Int -> [a] -> [a]
+chop start end s = drop start (take (length s - end) s)
+
diff --git a/testcases/stringlit.occ b/testcases/stringlit.occ
index a222bd6..fdd994a 100644
--- a/testcases/stringlit.occ
+++ b/testcases/stringlit.occ
@@ -10,6 +10,6 @@ PROC P ()
   VAL []BYTE mls IS "first*
                     *second*
                     *third":
-  VAL [5][5]BYTE square IS ["sator", "arepo", "tenas", "opera", "rotas"]:
+  VAL [5][5]BYTE square IS ["sator", "arepo", "tenat", "opera", "rotas"]:
   SKIP
 :