parse tokens into a tree

This commit is contained in:
Jon Rafkind 2011-07-26 12:36:41 -06:00
parent 783ee2cf88
commit 1f9b9b4c51

View File

@ -3,22 +3,27 @@
(require rackunit) (require rackunit)
(require parser-tools/lex (require parser-tools/lex
(prefix-in : parser-tools/lex-sre)) (prefix-in : parser-tools/lex-sre))
(require racket/match) (require racket/match
(for-syntax racket/base
syntax/parse))
(define-tokens honu-tokens (number identifier)) (define-tokens honu-tokens (number identifier))
(define-empty-tokens honu-empty-tokens (eof fail whitespace (define-empty-tokens honu-empty-tokens
[eof fail whitespace
left-parens right-parens left-parens right-parens
left-bracket right-bracket left-bracket right-bracket
left-brace right-brace left-brace right-brace
end-of-line-comment)) end-of-line-comment])
(define-lex-abbrev digit (:/ #\0 #\9))
(define-lex-abbrev identifier-first-character (:or (:/ #\a #\z) (define-lex-abbrev identifier-first-character (:or (:/ #\a #\z)
(:/ #\A #\Z))) (:/ #\A #\Z)))
(define-lex-abbrev identifier-character identifier-first-character) (define-lex-abbrev identifier-character (:or identifier-first-character
digit))
(define-lex-abbrev identifier (:: identifier-first-character (define-lex-abbrev identifier (:: identifier-first-character
(:+ identifier-character))) (:* identifier-character)))
(define-lex-abbrev number (:+ (:/ #\0 #\9))) (define-lex-abbrev number (:+ digit))
(define honu-lexer (define honu-lexer
(lexer-src-pos (lexer-src-pos
@ -26,27 +31,33 @@
[(:or "#" "//") (token-end-of-line-comment)] [(:or "#" "//") (token-end-of-line-comment)]
[number (token-number (string->number lexeme))] [number (token-number (string->number lexeme))]
["." (token-identifier '|.|)] ["." (token-identifier '|.|)]
["(" (token-left-parens)] ["(" (token-left-parens)] [")" (token-right-parens)]
[")" (token-right-parens)] ["[" (token-left-bracket)] ["]" (token-right-bracket)]
["[" (token-left-bracket)] ["{" (token-left-brace)] ["}" (token-right-brace)]
["]" (token-right-bracket)]
["{" (token-left-brace)]
["}" (token-right-brace)]
[identifier (token-identifier (string->symbol lexeme))] [identifier (token-identifier (string->symbol lexeme))]
[(union " " "\t") (token-whitespace)] [(union " " "\t") (token-whitespace)]
)) ))
(define (token-eof? token) (define-syntax (define-token? stx)
(equal? 'eof (token-name token))) (syntax-parse stx
[(_ name)
(define name? (datum->syntax #'name (string->symbol
(format "token-~a?"
(symbol->string
(syntax->datum #'name))))
#'name))
(with-syntax ([name? name?])
#'(define (name? token)
(equal? 'name (token-name token))))]))
(define (token-whitespace? token) (define-syntax-rule (define-tokens? name ...)
(equal? 'whitespace (token-name token))) (begin
(define-token? name) ...))
(define (token-end-of-line-comment? token) (define-tokens? eof whitespace end-of-line-comment number
(equal? 'end-of-line-comment (token-name token))) identifier left-parens right-parens)
(define (read-until-end-of-line input) (define (read-until-end-of-line input) (define (finish? what)
(define (finish? what)
(or (eof-object? what) (or (eof-object? what)
(= (char->integer #\newline) what))) (= (char->integer #\newline) what)))
(let loop () (let loop ()
@ -54,8 +65,7 @@
(when (not (finish? what)) (when (not (finish? what))
(loop)))) (loop))))
(define (lex-string input) (define (read-tokens port)
(define port (open-input-string input))
(let loop ([tokens '()]) (let loop ([tokens '()])
(define next (honu-lexer port)) (define next (honu-lexer port))
;; (printf "next ~a\n" next) ;; (printf "next ~a\n" next)
@ -70,32 +80,88 @@
(loop tokens)] (loop tokens)]
[(struct* position-token ([token (? token-whitespace?)] [start-pos start] [end-pos end])) [(struct* position-token ([token (? token-whitespace?)] [start-pos start] [end-pos end]))
(loop tokens)] (loop tokens)]
[else (loop (cons next tokens))]
#;
[(position-token token start end) [(position-token token start end)
;; (printf "next is ~a eof? ~a\n" token (token-eof? token)) ;; (printf "next is ~a eof? ~a\n" token (token-eof? token))
(loop (cons token tokens))]))) (loop (cons token tokens))])))
;; convert a string to a stream of tokens
(define (lex-string input)
(read-tokens (open-input-string input)))
;; converts a stream of tokens to a tree
(define (parse tokens)
(let loop ([current '()]
[tokens tokens]
[stop '()])
(if (null? tokens)
(reverse current)
(match (car tokens)
[(position-token token start end)
(cond
[(or (token-number? token)
(token-identifier? token))
(loop (cons token current)
(cdr tokens)
stop)]
[(token-right-parens? token)
(match stop
[(list 'right-parens rest ...)
(values (reverse current)
rest)]
[else (error 'parse "expected a left parentheses before the right parentheses")])]
[(token-left-parens? token)
(define-values (parsed more-tokens)
(loop '(#%parens)
(cdr tokens)
(cons 'right-parens stop)))
(loop (cons parsed current)
more-tokens
stop)]
[else (error 'parse "cannot parse ~a" token)])]))))
;; strip the source location from the position tokens
(define (strip tokens)
(for/list ([token tokens])
(match token
[(position-token token start end) token])))
(define (honu-read-syntax [port (current-input-port)])
(read-tokens (parse port)))
(define (honu-read [port (current-input-port)])
(syntax->datum (honu-read-syntax port)))
(test-case (test-case
"Basic tests" "Basic tests"
(check-equal? (lex-string "5") (check-equal? (strip (lex-string "5"))
(list (token-number 5))) (list (token-number 5)))
(check-equal? (lex-string "5 8") (check-equal? (strip (lex-string "5 8"))
(list (token-number 5) (token-number 8))) (list (token-number 5) (token-number 8)))
(check-equal? (lex-string "hello") (check-equal? (strip (lex-string "hello"))
(list (token-identifier 'hello))) (list (token-identifier 'hello)))
(check-equal? (lex-string "()") (check-equal? (strip (lex-string "()"))
(list (token-left-parens) (list (token-left-parens)
(token-right-parens))) (token-right-parens)))
(check-equal? (lex-string "()[]{}") (check-equal? (strip (lex-string "()[]{}"))
(list (token-left-parens) (list (token-left-parens)
(token-right-parens) (token-right-parens)
(token-left-bracket) (token-left-bracket)
(token-right-bracket) (token-right-bracket)
(token-left-brace) (token-left-brace)
(token-right-brace))) (token-right-brace)))
(check-equal? (lex-string "foo // 5") (check-equal? (strip (lex-string "foo // 5"))
(list (token-identifier 'foo))) (list (token-identifier 'foo)))
(check-equal? (lex-string "foo // 5 (check-equal? (strip (lex-string "foo // 5
bar") bar"))
(list (token-identifier 'foo) (list (token-identifier 'foo)
(token-identifier 'bar))) (token-identifier 'bar)))
(check-equal? (strip (lex-string "f(2)"))
(list (token-identifier 'f)
(token-left-parens)
(token-number 2)
(token-right-parens)))
) )
(parse (lex-string "f(5)"))