Add a preprocessor that works with the new lexer.

The parser now gets a stream of tokens, rather than needing to worry about
loading files itself.

This also reworks the lexer's idea of what constitutes a Token -- it's now a
pair (Meta, TokenType), so it's always easy to pull out/rewrite the metadata --
and adds proper support for lexing preprocessor directives, rather than just
treating them as reserved words.
This commit is contained in:
Adam Sampson 2007-08-20 23:46:57 +00:00
parent a640dabc04
commit 223a14d767
6 changed files with 196 additions and 44 deletions

View File

@ -22,6 +22,7 @@ module CompState where
import Data.Generics import Data.Generics
import Data.Map (Map) import Data.Map (Map)
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.Set as Set
import Control.Monad.State import Control.Monad.State
import qualified AST as A import qualified AST as A
@ -49,7 +50,11 @@ data CompState = CompState {
csVerboseLevel :: Int, csVerboseLevel :: Int,
csOutputFile :: String, csOutputFile :: String,
-- Set by preprocessor -- Set by (new) preprocessor
csCurrentFile :: String,
csUsedFiles :: Set.Set String,
-- Set by (old) preprocessor
csSourceFiles :: Map String String, csSourceFiles :: Map String String,
csIndentLinesIn :: [String], csIndentLinesIn :: [String],
csIndentLinesOut :: [String], csIndentLinesOut :: [String],
@ -85,6 +90,9 @@ emptyState = CompState {
csVerboseLevel = 0, csVerboseLevel = 0,
csOutputFile = "-", csOutputFile = "-",
csCurrentFile = "none",
csUsedFiles = Set.empty,
csSourceFiles = Map.empty, csSourceFiles = Map.empty,
csIndentLinesIn = [], csIndentLinesIn = [],
csIndentLinesOut = [], csIndentLinesOut = [],

View File

@ -31,6 +31,8 @@ import Pass
$decimalDigit = [0-9] $decimalDigit = [0-9]
$hexDigit = [0-9 a-f A-F] $hexDigit = [0-9 a-f A-F]
@preprocessor = "#" [^\n]*
@reserved = "[" | "]" | "(" | ")" @reserved = "[" | "]" | "(" | ")"
| "::" | ":=" | ":" | "," | ";" | "&" | "::" | ":=" | ":" | "," | ";" | "&"
| "?" | "!" | "=" | "?" | "!" | "="
@ -60,7 +62,6 @@ $hexDigit = [0-9 a-f A-F]
| "VAL" | "VALOF" | "VAL" | "VALOF"
| "WHILE" | "WORKSPACE" | "WHILE" | "WORKSPACE"
| "VECSPACE" | "VECSPACE"
| "#INCLUDE" | "#USE"
@identifier = [a-z A-Z] [a-z A-Z 0-9 \.]* @identifier = [a-z A-Z] [a-z A-Z 0-9 \.]*
@ -80,6 +81,8 @@ $escapeChar = [cnrts \" \' \* \n]
occam :- occam :-
@preprocessor { mkToken TokPreprocessor }
-- Ignore whitespace and comments. -- Ignore whitespace and comments.
$white+ ; $white+ ;
"--" [^\n]* ; "--" [^\n]* ;
@ -95,28 +98,31 @@ $white+ ;
@realLiteral { mkToken TokRealLiteral } @realLiteral { mkToken TokRealLiteral }
{ {
-- | An occam source token and its position.
type Token = (Meta, TokenType)
-- | An occam source token. -- | An occam source token.
-- Only `Token` is generated by the lexer itself; the others are added later -- Only `Token` is generated by the lexer itself; the others are added later
-- once the indentation has been analysed. -- once the indentation has been analysed.
data Token = data TokenType =
Token TokenType Meta String -- ^ A real token read from the source TokReserved String -- ^ A reserved word or symbol
| TokIdentifier String
| TokStringLiteral String
| TokCharLiteral String
| TokDecimalLiteral String
| TokHexLiteral String
| TokRealLiteral String
| TokPreprocessor String
| Indent -- ^ Indentation increase | Indent -- ^ Indentation increase
| Outdent -- ^ Indentation decrease | Outdent -- ^ Indentation decrease
| EndOfLine -- ^ End of line | EndOfLine -- ^ End of line
deriving (Show, Eq, Typeable, Data) deriving (Show, Eq, Typeable, Data)
-- | The type of a source token.
data TokenType =
TokReserved | TokIdentifier
| TokStringLiteral | TokCharLiteral
| TokDecimalLiteral | TokHexLiteral | TokRealLiteral
deriving (Show, Eq, Typeable, Data)
-- | Build a lexer rule for a token. -- | Build a lexer rule for a token.
mkToken :: TokenType -> AlexPosn -> String -> Token mkToken :: (String -> TokenType) -> AlexPosn -> String -> Token
mkToken tt (AlexPn _ line col) s = Token tt emptyMeta s mkToken cons _ s = (emptyMeta, cons s)
-- | Run the lexer, returning either an error position or a list of tokens. -- | Run the lexer, returning a list of tokens.
-- (This is based on the `alexScanTokens` function that Alex provides.) -- (This is based on the `alexScanTokens` function that Alex provides.)
runLexer :: String -> String -> PassM [Token] runLexer :: String -> String -> PassM [Token]
runLexer filename str = go (alexStartPos, '\n', str) runLexer filename str = go (alexStartPos, '\n', str)
@ -128,8 +134,8 @@ runLexer filename str = go (alexStartPos, '\n', str)
AlexSkip inp' len -> go inp' AlexSkip inp' len -> go inp'
AlexToken inp' len act -> AlexToken inp' len act ->
do ts <- go inp' do ts <- go inp'
let (Token tt _ s) = act pos (take len str) let t = act pos (take len str)
return $ (Token tt meta s) : ts return $ (meta, snd t) : ts
where where
meta = emptyMeta { meta = emptyMeta {

View File

@ -21,7 +21,7 @@ tocktest: $(sources)
ghc $(ghc_opts) -o tocktest -main-is TestMain --make TestMain ghc $(ghc_opts) -o tocktest -main-is TestMain --make TestMain
lextest: $(sources) lextest: $(sources)
ghc $(ghc_opts) -o lextest -main-is StructureOccam --make StructureOccam ghc $(ghc_opts) -o lextest -main-is PreprocessOccam --make PreprocessOccam
CFLAGS = \ CFLAGS = \
-O2 \ -O2 \

145
PreprocessOccam.hs Normal file
View File

@ -0,0 +1,145 @@
{-
Tock: a compiler for parallel languages
Copyright (C) 2007 University of Kent
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 2 of the License, or (at your
option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
-}
-- | Preprocess occam code.
module PreprocessOccam where
import Data.List
import qualified Data.Set as Set
import Text.Regex
import Errors
import LexOccam
import Metadata
import Pass
import StructureOccam
import Utils
import CompState
import Control.Monad.Error
import Control.Monad.State
import System
import System.IO
import PrettyShow
-- | Open an included file, looking for it in the search path.
-- Return the open filehandle and the location of the file.
-- FIXME: This doesn't actually look at the search path yet.
searchFile :: Meta -> String -> PassM (Handle, String)
searchFile m filename
= do cs <- get
let currentFile = csCurrentFile cs
let possibilities = [joinPath currentFile filename]
openOneOf possibilities
where
openOneOf :: [String] -> PassM (Handle, String)
openOneOf [] = dieP m $ "Unable to find " ++ filename
openOneOf (fn:fns)
= do r <- liftIO $ maybeIO $ openFile fn ReadMode
case r of
Just h -> return (h, fn)
Nothing -> openOneOf fns
-- | Preprocess a file and return its tokenised form ready for parsing.
preprocessFile :: Meta -> String -> PassM [Token]
preprocessFile m filename
= do (handle, realFilename) <- searchFile m filename
liftIO $ putStrLn $ "Loading " ++ realFilename
origCS <- get
modify (\cs -> cs { csCurrentFile = realFilename })
s <- liftIO $ hGetContents handle
toks <- runLexer realFilename s >>= structureOccam >>= preprocessOccam
modify (\cs -> cs { csCurrentFile = csCurrentFile origCS })
return toks
-- | Preprocess a token stream.
preprocessOccam :: [Token] -> PassM [Token]
preprocessOccam [] = return []
preprocessOccam ((m, TokPreprocessor ('#':s)):(_, EndOfLine):ts)
= do func <- handleDirective m s
rest <- preprocessOccam ts
return $ func rest
-- Check the above case didn't miss something.
preprocessOccam ((_, TokPreprocessor _):_)
= error "bad TokPreprocessor token"
preprocessOccam (t:ts)
= do rest <- preprocessOccam ts
return $ t : rest
--{{{ preprocessor directive handlers
type DirectiveFunc = Meta -> [String] -> PassM ([Token] -> [Token])
-- | Call the handler for a preprocessor directive.
handleDirective :: Meta -> String -> PassM ([Token] -> [Token])
handleDirective m s = lookup s directives
where
lookup s [] = dieP m "Unknown preprocessor directive"
lookup s ((re, func):ds)
= case matchRegex re s of
Just fields -> func m fields
Nothing -> lookup s ds
-- | List of handlers for preprocessor directives.
-- `handleDirective` walks down the regexps in this list until it finds one
-- that matches, then uses the corresponding function.
directives :: [(Regex, DirectiveFunc)]
directives =
[ (mkRegex "^INCLUDE \"(.*)\"$", handleInclude)
, (mkRegex "^USE \"(.*)\"$", handleUse)
]
-- | Handle the @#INCLUDE@ directive.
handleInclude :: DirectiveFunc
handleInclude m [incName]
= do toks <- preprocessFile m incName
return (\ts -> toks ++ ts)
-- | Handle the @#USE@ directive.
-- This is a bit of a hack at the moment, since it just includes the file
-- textually.
handleUse :: DirectiveFunc
handleUse m [modName]
= do let incName = mangleModName modName
cs <- get
put $ cs { csUsedFiles = Set.insert incName (csUsedFiles cs) }
if Set.member incName (csUsedFiles cs)
then return id
else handleInclude m [incName]
where
-- | If a module name doesn't already have a suffix, add one.
mangleModName :: String -> String
mangleModName mod
= if ".occ" `isSuffixOf` mod || ".inc" `isSuffixOf` mod
then mod
else mod ++ ".occ"
--}}}
-- | Main function for testing.
main :: IO ()
main
= do (arg:_) <- getArgs
v <- evalStateT (runErrorT (test arg)) emptyState
case v of
Left e -> dieIO e
Right r -> return ()
where
test :: String -> PassM ()
test fn
= do tokens <- preprocessFile emptyMeta fn
liftIO $ putStrLn $ pshow tokens

View File

@ -19,17 +19,12 @@ with this program. If not, see <http://www.gnu.org/licenses/>.
-- | Analyse syntactic structure of occam code. -- | Analyse syntactic structure of occam code.
module StructureOccam where module StructureOccam where
import Control.Monad.Error
import Control.Monad.State
import Data.Generics import Data.Generics
import System
import CompState
import Errors import Errors
import LexOccam import LexOccam
import Metadata import Metadata
import Pass import Pass
import PrettyShow
-- | Given the output of the lexer for a single file, add `Indent`, `Outdent` -- | Given the output of the lexer for a single file, add `Indent`, `Outdent`
-- and `EndOfLine` markers. -- and `EndOfLine` markers.
@ -39,12 +34,12 @@ structureOccam ts = analyse 1 firstLine ts
where where
-- Find the first line that's actually got something on it. -- Find the first line that's actually got something on it.
firstLine firstLine
= case ts of (Token _ m _:_) -> metaLine m = case ts of ((m, _):_) -> metaLine m
analyse :: Int -> Int -> [Token] -> PassM [Token] analyse :: Int -> Int -> [Token] -> PassM [Token]
-- Add extra EndOfLine at the end of the file. -- Add extra EndOfLine at the end of the file.
analyse _ _ [] = return [EndOfLine] analyse _ _ [] = return [(emptyMeta, EndOfLine)]
analyse prevCol prevLine (t@(Token _ m _):ts) analyse prevCol prevLine (t@(m, tokType):ts)
= if line /= prevLine = if line /= prevLine
then do rest <- analyse col line ts then do rest <- analyse col line ts
newLine $ t : rest newLine $ t : rest
@ -56,29 +51,19 @@ structureOccam ts = analyse 1 firstLine ts
-- A new line -- look to see what's going on with the indentation. -- A new line -- look to see what's going on with the indentation.
newLine rest newLine rest
| col == prevCol + 2 = return $ EndOfLine : Indent : rest | col == prevCol + 2 = withEOL $ (m, Indent) : rest
-- FIXME: If col > prevCol, then look to see if there's a VALOF -- FIXME: If col > prevCol, then look to see if there's a VALOF
-- coming up before the next column change... -- coming up before the next column change...
| col < prevCol | col < prevCol
= if (prevCol - col) `mod` 2 == 0 = if (prevCol - col) `mod` 2 == 0
then return $ EndOfLine : (replicate steps Outdent ++ rest) then withEOL $ replicate steps (m, Outdent) ++ rest
else dieP m "Invalid indentation" else bad
| col == prevCol = return $ EndOfLine : rest | col == prevCol = withEOL rest
| otherwise = dieP m "Invalid indentation" | otherwise = bad
where where
steps = (prevCol - col) `div` 2 steps = (prevCol - col) `div` 2
bad = dieP m "Invalid indentation"
-- | Main function for testing. -- This is actually the position at which the new line starts
main :: IO () -- rather than the end of the previous line.
main withEOL ts = return $ (m, EndOfLine) : ts
= do (arg:_) <- getArgs
s <- readFile arg
e <- evalStateT (runErrorT (test arg s)) emptyState
return ()
where
test :: String -> String -> PassM ()
test arg s
= do tokens <- runLexer arg s
tokens' <- structureOccam tokens
liftIO $ putStrLn $ pshow tokens'

View File

@ -20,6 +20,9 @@ with this program. If not, see <http://www.gnu.org/licenses/>.
-- that could be put into the standard library. -- that could be put into the standard library.
module Utils where module Utils where
import Control.Monad
import System.IO
import System.IO.Error
import Text.Regex import Text.Regex
-- | Split the directory and file components of a path. -- | Split the directory and file components of a path.
@ -58,3 +61,8 @@ transformEither :: (a -> c) -> (b -> d) -> Either a b -> Either c d
transformEither funcLeft funcRight x = case x of transformEither funcLeft funcRight x = case x of
Left l -> Left (funcLeft l) Left l -> Left (funcLeft l)
Right r -> Right (funcRight r) Right r -> Right (funcRight r)
-- | Try an IO operation, returning `Nothing` if it fails.
maybeIO :: IO a -> IO (Maybe a)
maybeIO op = catch (op >>= (return . Just)) (\e -> return Nothing)