Add a preprocessor that works with the new lexer.

The parser now gets a stream of tokens, rather than needing to worry about
loading files itself.

This also reworks the lexer's idea of what constitutes a Token -- it's now a
pair (Meta, TokenType), so it's always easy to pull out/rewrite the metadata --
and adds proper support for lexing preprocessor directives, rather than just
treating them as reserved words.
This commit is contained in:
Adam Sampson 2007-08-20 23:46:57 +00:00
parent a640dabc04
commit 223a14d767
6 changed files with 196 additions and 44 deletions

View File

@ -22,6 +22,7 @@ module CompState where
import Data.Generics
import Data.Map (Map)
import qualified Data.Map as Map
import qualified Data.Set as Set
import Control.Monad.State
import qualified AST as A
@ -49,7 +50,11 @@ data CompState = CompState {
csVerboseLevel :: Int,
csOutputFile :: String,
-- Set by preprocessor
-- Set by (new) preprocessor
csCurrentFile :: String,
csUsedFiles :: Set.Set String,
-- Set by (old) preprocessor
csSourceFiles :: Map String String,
csIndentLinesIn :: [String],
csIndentLinesOut :: [String],
@ -85,6 +90,9 @@ emptyState = CompState {
csVerboseLevel = 0,
csOutputFile = "-",
csCurrentFile = "none",
csUsedFiles = Set.empty,
csSourceFiles = Map.empty,
csIndentLinesIn = [],
csIndentLinesOut = [],

View File

@ -31,6 +31,8 @@ import Pass
$decimalDigit = [0-9]
$hexDigit = [0-9 a-f A-F]
@preprocessor = "#" [^\n]*
@reserved = "[" | "]" | "(" | ")"
| "::" | ":=" | ":" | "," | ";" | "&"
| "?" | "!" | "="
@ -60,7 +62,6 @@ $hexDigit = [0-9 a-f A-F]
| "VAL" | "VALOF"
| "WHILE" | "WORKSPACE"
| "VECSPACE"
| "#INCLUDE" | "#USE"
@identifier = [a-z A-Z] [a-z A-Z 0-9 \.]*
@ -80,6 +81,8 @@ $escapeChar = [cnrts \" \' \* \n]
occam :-
@preprocessor { mkToken TokPreprocessor }
-- Ignore whitespace and comments.
$white+ ;
"--" [^\n]* ;
@ -95,28 +98,31 @@ $white+ ;
@realLiteral { mkToken TokRealLiteral }
{
-- | An occam source token and its position.
type Token = (Meta, TokenType)
-- | An occam source token.
-- Only `Token` is generated by the lexer itself; the others are added later
-- once the indentation has been analysed.
data Token =
Token TokenType Meta String -- ^ A real token read from the source
data TokenType =
TokReserved String -- ^ A reserved word or symbol
| TokIdentifier String
| TokStringLiteral String
| TokCharLiteral String
| TokDecimalLiteral String
| TokHexLiteral String
| TokRealLiteral String
| TokPreprocessor String
| Indent -- ^ Indentation increase
| Outdent -- ^ Indentation decrease
| EndOfLine -- ^ End of line
deriving (Show, Eq, Typeable, Data)
-- | The type of a source token.
data TokenType =
TokReserved | TokIdentifier
| TokStringLiteral | TokCharLiteral
| TokDecimalLiteral | TokHexLiteral | TokRealLiteral
deriving (Show, Eq, Typeable, Data)
-- | Build a lexer rule for a token.
mkToken :: TokenType -> AlexPosn -> String -> Token
mkToken tt (AlexPn _ line col) s = Token tt emptyMeta s
mkToken :: (String -> TokenType) -> AlexPosn -> String -> Token
mkToken cons _ s = (emptyMeta, cons s)
-- | Run the lexer, returning either an error position or a list of tokens.
-- | Run the lexer, returning a list of tokens.
-- (This is based on the `alexScanTokens` function that Alex provides.)
runLexer :: String -> String -> PassM [Token]
runLexer filename str = go (alexStartPos, '\n', str)
@ -128,8 +134,8 @@ runLexer filename str = go (alexStartPos, '\n', str)
AlexSkip inp' len -> go inp'
AlexToken inp' len act ->
do ts <- go inp'
let (Token tt _ s) = act pos (take len str)
return $ (Token tt meta s) : ts
let t = act pos (take len str)
return $ (meta, snd t) : ts
where
meta = emptyMeta {

View File

@ -21,7 +21,7 @@ tocktest: $(sources)
ghc $(ghc_opts) -o tocktest -main-is TestMain --make TestMain
lextest: $(sources)
ghc $(ghc_opts) -o lextest -main-is StructureOccam --make StructureOccam
ghc $(ghc_opts) -o lextest -main-is PreprocessOccam --make PreprocessOccam
CFLAGS = \
-O2 \

145
PreprocessOccam.hs Normal file
View File

@ -0,0 +1,145 @@
{-
Tock: a compiler for parallel languages
Copyright (C) 2007 University of Kent
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 2 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
-}
-- | Preprocess occam code.
module PreprocessOccam where
import Data.List
import qualified Data.Set as Set
import Text.Regex
import Errors
import LexOccam
import Metadata
import Pass
import StructureOccam
import Utils
import CompState
import Control.Monad.Error
import Control.Monad.State
import System
import System.IO
import PrettyShow
-- | Open an included file, looking for it in the search path.
-- Return the open filehandle and the location of the file.
-- FIXME: This doesn't actually look at the search path yet.
searchFile :: Meta -> String -> PassM (Handle, String)
searchFile m filename
= do cs <- get
let currentFile = csCurrentFile cs
let possibilities = [joinPath currentFile filename]
openOneOf possibilities
where
openOneOf :: [String] -> PassM (Handle, String)
openOneOf [] = dieP m $ "Unable to find " ++ filename
openOneOf (fn:fns)
= do r <- liftIO $ maybeIO $ openFile fn ReadMode
case r of
Just h -> return (h, fn)
Nothing -> openOneOf fns
-- | Preprocess a file and return its tokenised form ready for parsing.
preprocessFile :: Meta -> String -> PassM [Token]
preprocessFile m filename
= do (handle, realFilename) <- searchFile m filename
liftIO $ putStrLn $ "Loading " ++ realFilename
origCS <- get
modify (\cs -> cs { csCurrentFile = realFilename })
s <- liftIO $ hGetContents handle
toks <- runLexer realFilename s >>= structureOccam >>= preprocessOccam
modify (\cs -> cs { csCurrentFile = csCurrentFile origCS })
return toks
-- | Preprocess a token stream.
preprocessOccam :: [Token] -> PassM [Token]
preprocessOccam [] = return []
preprocessOccam ((m, TokPreprocessor ('#':s)):(_, EndOfLine):ts)
= do func <- handleDirective m s
rest <- preprocessOccam ts
return $ func rest
-- Check the above case didn't miss something.
preprocessOccam ((_, TokPreprocessor _):_)
= error "bad TokPreprocessor token"
preprocessOccam (t:ts)
= do rest <- preprocessOccam ts
return $ t : rest
--{{{ preprocessor directive handlers
type DirectiveFunc = Meta -> [String] -> PassM ([Token] -> [Token])
-- | Call the handler for a preprocessor directive.
handleDirective :: Meta -> String -> PassM ([Token] -> [Token])
handleDirective m s = lookup s directives
where
lookup s [] = dieP m "Unknown preprocessor directive"
lookup s ((re, func):ds)
= case matchRegex re s of
Just fields -> func m fields
Nothing -> lookup s ds
-- | List of handlers for preprocessor directives.
-- `handleDirective` walks down the regexps in this list until it finds one
-- that matches, then uses the corresponding function.
directives :: [(Regex, DirectiveFunc)]
directives =
[ (mkRegex "^INCLUDE \"(.*)\"$", handleInclude)
, (mkRegex "^USE \"(.*)\"$", handleUse)
]
-- | Handle the @#INCLUDE@ directive.
handleInclude :: DirectiveFunc
handleInclude m [incName]
= do toks <- preprocessFile m incName
return (\ts -> toks ++ ts)
-- | Handle the @#USE@ directive.
-- This is a bit of a hack at the moment, since it just includes the file
-- textually.
handleUse :: DirectiveFunc
handleUse m [modName]
= do let incName = mangleModName modName
cs <- get
put $ cs { csUsedFiles = Set.insert incName (csUsedFiles cs) }
if Set.member incName (csUsedFiles cs)
then return id
else handleInclude m [incName]
where
-- | If a module name doesn't already have a suffix, add one.
mangleModName :: String -> String
mangleModName mod
= if ".occ" `isSuffixOf` mod || ".inc" `isSuffixOf` mod
then mod
else mod ++ ".occ"
--}}}
-- | Main function for testing.
main :: IO ()
main
= do (arg:_) <- getArgs
v <- evalStateT (runErrorT (test arg)) emptyState
case v of
Left e -> dieIO e
Right r -> return ()
where
test :: String -> PassM ()
test fn
= do tokens <- preprocessFile emptyMeta fn
liftIO $ putStrLn $ pshow tokens

View File

@ -19,17 +19,12 @@ with this program. If not, see <http://www.gnu.org/licenses/>.
-- | Analyse syntactic structure of occam code.
module StructureOccam where
import Control.Monad.Error
import Control.Monad.State
import Data.Generics
import System
import CompState
import Errors
import LexOccam
import Metadata
import Pass
import PrettyShow
-- | Given the output of the lexer for a single file, add `Indent`, `Outdent`
-- and `EndOfLine` markers.
@ -39,12 +34,12 @@ structureOccam ts = analyse 1 firstLine ts
where
-- Find the first line that's actually got something on it.
firstLine
= case ts of (Token _ m _:_) -> metaLine m
= case ts of ((m, _):_) -> metaLine m
analyse :: Int -> Int -> [Token] -> PassM [Token]
-- Add extra EndOfLine at the end of the file.
analyse _ _ [] = return [EndOfLine]
analyse prevCol prevLine (t@(Token _ m _):ts)
analyse _ _ [] = return [(emptyMeta, EndOfLine)]
analyse prevCol prevLine (t@(m, tokType):ts)
= if line /= prevLine
then do rest <- analyse col line ts
newLine $ t : rest
@ -56,29 +51,19 @@ structureOccam ts = analyse 1 firstLine ts
-- A new line -- look to see what's going on with the indentation.
newLine rest
| col == prevCol + 2 = return $ EndOfLine : Indent : rest
| col == prevCol + 2 = withEOL $ (m, Indent) : rest
-- FIXME: If col > prevCol, then look to see if there's a VALOF
-- coming up before the next column change...
| col < prevCol
= if (prevCol - col) `mod` 2 == 0
then return $ EndOfLine : (replicate steps Outdent ++ rest)
else dieP m "Invalid indentation"
| col == prevCol = return $ EndOfLine : rest
| otherwise = dieP m "Invalid indentation"
then withEOL $ replicate steps (m, Outdent) ++ rest
else bad
| col == prevCol = withEOL rest
| otherwise = bad
where
steps = (prevCol - col) `div` 2
-- | Main function for testing.
main :: IO ()
main
= do (arg:_) <- getArgs
s <- readFile arg
e <- evalStateT (runErrorT (test arg s)) emptyState
return ()
where
test :: String -> String -> PassM ()
test arg s
= do tokens <- runLexer arg s
tokens' <- structureOccam tokens
liftIO $ putStrLn $ pshow tokens'
bad = dieP m "Invalid indentation"
-- This is actually the position at which the new line starts
-- rather than the end of the previous line.
withEOL ts = return $ (m, EndOfLine) : ts

View File

@ -20,6 +20,9 @@ with this program. If not, see <http://www.gnu.org/licenses/>.
-- that could be put into the standard library.
module Utils where
import Control.Monad
import System.IO
import System.IO.Error
import Text.Regex
-- | Split the directory and file components of a path.
@ -58,3 +61,8 @@ transformEither :: (a -> c) -> (b -> d) -> Either a b -> Either c d
transformEither funcLeft funcRight x = case x of
Left l -> Left (funcLeft l)
Right r -> Right (funcRight r)
-- | Try an IO operation, returning `Nothing` if it fails.
maybeIO :: IO a -> IO (Maybe a)
maybeIO op = catch (op >>= (return . Just)) (\e -> return Nothing)