510 lines
19 KiB
Racket
510 lines
19 KiB
Racket
#lang racket/base
|
|
|
|
;; This module implements read related functions such as `read', `read-syntax',
|
|
;; and a colored lexer for drracket
|
|
|
|
(require rackunit)
|
|
(require parser-tools/lex
|
|
(prefix-in : parser-tools/lex-sre))
|
|
(require racket/match
|
|
(for-syntax racket/base
|
|
syntax/parse))
|
|
|
|
(require "private/debug.rkt")
|
|
|
|
(define-tokens honu-tokens (number identifier string))
|
|
|
|
(define-empty-tokens honu-empty-tokens
|
|
[eof fail whitespace
|
|
left-parens right-parens
|
|
parse-error
|
|
left-bracket right-bracket
|
|
left-brace right-brace
|
|
semicolon
|
|
block-comment
|
|
end-of-line-comment])
|
|
|
|
(define-lex-abbrev digit (:/ #\0 #\9))
|
|
(define-lex-abbrev identifier-first-character (:or (:/ #\a #\z)
|
|
(:/ #\A #\Z)
|
|
"_" "?"))
|
|
(define-lex-abbrev identifier-character (:or identifier-first-character
|
|
digit))
|
|
(define-lex-abbrev identifier (:: identifier-first-character
|
|
(:* identifier-character)))
|
|
(define-lex-abbrev number (:: (:+ digit) (:? (:: "." (:+ digit)))))
|
|
(define-lex-abbrev string-character (:or (:: #\\ any-char)
|
|
(:~ #\")))
|
|
(define-lex-abbrev string (:: #\" (:* string-character) #\"))
|
|
(define-lex-abbrev operator (:or "+=" "-=" "*=" "/="
|
|
"+" "!=" "=>" "=" "==" "*" "/" "-" "^" "||" "|" "&&" "<="
|
|
">=" "<-" "<" ">" "!" "::" ":=" "%"))
|
|
(define-lex-abbrev block-comment (:: "/*"
|
|
(complement (:: any-string "*/" any-string))
|
|
"*/"))
|
|
|
|
(define-lex-abbrev line-comment (:: (:or "#" "//")
|
|
(:* (:~ "\n"))
|
|
;; we might hit eof before a \n
|
|
(:? "\n" "\r")))
|
|
|
|
(define (replace-escapes string)
|
|
(define replacements '([#px"\\\\n" "\n"]
|
|
[#px"\\\\t" "\t"]))
|
|
(for/fold ([string string])
|
|
([replace replacements])
|
|
(define pattern (car replace))
|
|
(define with (cadr replace))
|
|
(regexp-replace* pattern string with)))
|
|
|
|
(provide honu-lexer)
|
|
(define honu-lexer
|
|
(lexer-src-pos
|
|
[(eof) (token-eof)]
|
|
#;
|
|
[line-comment (token-whitespace)]
|
|
[(:or "#" "//") (token-end-of-line-comment)]
|
|
[(:? "\n" "\r") (token-whitespace)]
|
|
[number (token-number (string->number lexeme))]
|
|
#;
|
|
[block-comment (token-whitespace)]
|
|
["/*" (token-block-comment)]
|
|
["..." (token-identifier '...)]
|
|
["." (token-identifier '%dot)]
|
|
["$" (token-identifier 'honu-$)]
|
|
["," (token-identifier 'honu-comma)]
|
|
[":" (token-identifier '%colon)]
|
|
["'" (token-identifier 'quote)]
|
|
["`" (token-identifier 'quasiquote)]
|
|
["->" (token-identifier '%arrow)]
|
|
[operator (token-identifier (string->symbol lexeme))]
|
|
[";" (token-semicolon)]
|
|
;; strip the quotes from the resulting string
|
|
;; TODO: find a more optimal way
|
|
[string (let ()
|
|
(define raw (substring (substring lexeme 1)
|
|
0 (- (string-length lexeme) 2)))
|
|
(token-string (replace-escapes raw)))]
|
|
["(" (token-left-parens)] [")" (token-right-parens)]
|
|
["[" (token-left-bracket)] ["]" (token-right-bracket)]
|
|
["{" (token-left-brace)] ["}" (token-right-brace)]
|
|
[identifier (token-identifier (string->symbol lexeme))]
|
|
[(union " " "\t") (token-whitespace)]
|
|
[any-char (token-parse-error)]))
|
|
|
|
(define-syntax (define-token? stx)
|
|
(syntax-parse stx
|
|
[(_ name)
|
|
(define name? (datum->syntax #'name (string->symbol
|
|
(format "token-~a?"
|
|
(symbol->string
|
|
(syntax->datum #'name))))
|
|
#'name))
|
|
(with-syntax ([name? name?])
|
|
#'(begin
|
|
(provide name?)
|
|
(define (name? token)
|
|
(equal? 'name (token-name token)))))]))
|
|
|
|
(define-syntax-rule (define-tokens? name ...)
|
|
(begin
|
|
(define-token? name) ...))
|
|
|
|
(define-tokens? eof whitespace end-of-line-comment number string
|
|
block-comment parse-error
|
|
identifier left-parens right-parens
|
|
left-bracket right-bracket
|
|
left-brace right-brace
|
|
semicolon)
|
|
|
|
;; returns #t if an entire comment was read (with an ending newline)
|
|
(define (read-until-end-of-line input)
|
|
(define (finish? what)
|
|
(or (eof-object? what)
|
|
(= (char->integer #\newline) what)))
|
|
;; #t if read a #\newline, otherwise #f
|
|
(define (clean-end? what)
|
|
(if (eof-object? what)
|
|
#f
|
|
(= (char->integer #\newline) what)))
|
|
(let loop ()
|
|
(define what (read-byte input))
|
|
(if (not (finish? what))
|
|
(loop)
|
|
(clean-end? what))))
|
|
|
|
;; returns #t if an entire block comment was read
|
|
(define (read-block-comment port)
|
|
(define comment-lexer
|
|
(lexer
|
|
["/*" 'nest]
|
|
["*/" 'done]
|
|
[(eof) eof]
|
|
[any-char 'continue]))
|
|
|
|
(define (finish? what)
|
|
(or (eq? 'done what)
|
|
(eof-object? what)))
|
|
|
|
(let loop ([nesting 1])
|
|
(define what (comment-lexer port))
|
|
(cond
|
|
[(eq? what 'nest) (loop (add1 nesting))]
|
|
[(eq? what 'done) (if (> nesting 1)
|
|
(loop (sub1 nesting))
|
|
#t)]
|
|
[(eof-object? what) #f]
|
|
[else (loop nesting)])))
|
|
|
|
;; read characters from a port and return a stream of tokens
|
|
(define (read-tokens port)
|
|
(let loop ([tokens '()])
|
|
(define next (honu-lexer port))
|
|
;; (debug "next token ~a\n" next)
|
|
(match next
|
|
[(struct* position-token ([token (? token-eof?)] [start-pos start] [end-pos end]))
|
|
; (printf "done lexing ~a\n" tokens)
|
|
(reverse tokens)]
|
|
[(struct* position-token ([token (? token-end-of-line-comment?)]
|
|
[start-pos start]
|
|
[end-pos end]))
|
|
(read-until-end-of-line port)
|
|
(loop tokens)]
|
|
[(struct* position-token ([token (? token-block-comment?)]
|
|
[start-pos start]
|
|
[end-pos end]))
|
|
(read-block-comment port)
|
|
(loop tokens)]
|
|
[(struct* position-token ([token (? token-whitespace?)] [start-pos start] [end-pos end]))
|
|
(loop tokens)]
|
|
[else (loop (cons next tokens))]
|
|
#;
|
|
[(position-token token start end)
|
|
;; (printf "next is ~a eof? ~a\n" token (token-eof? token))
|
|
(loop (cons token tokens))])))
|
|
|
|
;; symbols that can be used for colors
|
|
;; symbol
|
|
;; keyword
|
|
;; comment
|
|
;; string
|
|
;; constant
|
|
;; parenthesis
|
|
;; error
|
|
;; other
|
|
(define (honu-name->color token-name)
|
|
;; (printf "Get honu color for ~a\n" token-name)
|
|
(case token-name
|
|
[(number) 'constant]
|
|
[(string) 'string]
|
|
[(parens) 'parenthesis]
|
|
[(identifier) 'symbol]
|
|
[else 'other]))
|
|
|
|
;; implements a lexer that the colorer in drracket expects
|
|
;; FIXME: color comments
|
|
(provide color-lexer)
|
|
(define (color-lexer port offset mode)
|
|
;; (printf "Parse at token ~a mode is ~a\n" offset mode)
|
|
(define lexeme (honu-lexer port))
|
|
(define need-backup (if mode mode 0))
|
|
(match lexeme
|
|
[(position-token token start end)
|
|
;; (printf "Lexed ~a\n" token)
|
|
(define (encloser kind)
|
|
(values token 'parenthesis kind
|
|
(position-offset start)
|
|
(position-offset end)
|
|
need-backup mode))
|
|
|
|
;; (printf "Get token for at ~a\n" (position-offset start))
|
|
(cond
|
|
[(token-eof? token)
|
|
(values eof 'eof #f
|
|
(position-offset start)
|
|
(position-offset end)
|
|
need-backup mode)]
|
|
[(token-parse-error? token)
|
|
(define backup (if mode (add1 mode) 1))
|
|
(values token
|
|
'error
|
|
#f
|
|
(position-offset start)
|
|
(position-offset end)
|
|
backup backup)]
|
|
[(token-end-of-line-comment? token)
|
|
(read-until-end-of-line port)
|
|
(define-values (line column position) (port-next-location port))
|
|
(values #f 'comment #f
|
|
(position-offset start)
|
|
position
|
|
need-backup need-backup)]
|
|
[(token-block-comment? token)
|
|
(read-block-comment port)
|
|
(define-values (line column position) (port-next-location port))
|
|
(values #f 'comment #f
|
|
(position-offset start)
|
|
position
|
|
need-backup need-backup)]
|
|
[(token-left-parens? token) (encloser '|(|)]
|
|
[(token-right-parens? token) (encloser '|)|)]
|
|
[(token-left-bracket? token) (encloser '|[|)]
|
|
[(token-right-bracket? token)(encloser '|]|)]
|
|
[(token-left-brace? token) (encloser '|{|)]
|
|
[(token-right-brace? token) (encloser '|}|)]
|
|
[else (values (format "~a" (token-value token))
|
|
(honu-name->color (token-name token))
|
|
#f
|
|
(position-offset start)
|
|
(position-offset end)
|
|
need-backup mode)])]))
|
|
|
|
;; convert a string to a stream of tokens
|
|
(define (lex-string input)
|
|
(read-tokens (open-input-string input)))
|
|
|
|
;; make a syntax object out of some symbol and a position-token
|
|
(define (make-syntax datum token source)
|
|
(match token
|
|
[(position-token token start end)
|
|
(datum->syntax #f datum
|
|
(list source (position-line start)
|
|
(position-col start)
|
|
(position-offset start)
|
|
(- (position-offset end)
|
|
(position-offset start))))]))
|
|
|
|
(define (make-syntax-from-token token source)
|
|
(match token
|
|
[(position-token datum start end)
|
|
(make-syntax (token-value datum) token source)]))
|
|
|
|
;; converts a stream of tokens to a tree
|
|
(define (parse source tokens)
|
|
(define (is-first-token what? tokens)
|
|
(match tokens
|
|
[(list (position-token token start end) rest ...)
|
|
(what? token)]
|
|
[else #f]))
|
|
|
|
(define (do-atom current tokens table)
|
|
(do-parse (cons (make-syntax-from-token (car tokens) source) current)
|
|
(cdr tokens)
|
|
table))
|
|
(define (atom? tokens)
|
|
(is-first-token (lambda (token)
|
|
(or (token-identifier? token)
|
|
(token-string? token)
|
|
(token-number? token)))
|
|
tokens))
|
|
|
|
(define (do-empty current tokens table)
|
|
(reverse current))
|
|
|
|
(define (contains-semicolon? syntax)
|
|
(syntax-case syntax (%semicolon #%braces #%parens)
|
|
[(%semicolon x ...) #t]
|
|
[#%braces #t]
|
|
[(#%braces x ...) #t]
|
|
#;
|
|
[(#%parens x ...) #t]
|
|
[else #f]))
|
|
|
|
;; wraps syntax objects with (%semicolon ...)
|
|
;; it will search backwards through the list of already created syntax objects
|
|
;; until it either runs out of syntax objects or finds one already wrapped with
|
|
;; (%semicolon ...)
|
|
;;
|
|
;; if the original input is '1 + 2; 3 + 4;' this will get read as
|
|
;; (1 + 2 %semicolon 3 + 4 %semicolon)
|
|
;; then parsing will start from the front. when %semicolon is reached we will have
|
|
;; the following (current is always backwards).
|
|
;; current: (2 + 1)
|
|
;; do-semicolon will search this for a syntax object containing (%semicolon ...) but
|
|
;; since one doesn't appear the entire expression will be reversed and wrapped thus
|
|
;; resulting in
|
|
;; (%semicolon 1 + 2)
|
|
;;
|
|
;; Now parsing continues with (3 + 4 %semicolon). When the next %semicolon is hit we
|
|
;; will have
|
|
;; current: (4 + 3 (%semicolon 1 + 2))
|
|
;;
|
|
;; So do-semicolon will find (%semicolon 1 + 2) and leave stop processing there,
|
|
;; resulting in
|
|
;; ((%semicolon 3 + 4) (%semicolon 1 + 2))
|
|
;;
|
|
;; The entire list will be reversed at the end of parsing.
|
|
(define (do-semicolon current tokens table)
|
|
;; (debug "Do semicolon on ~a\n" current)
|
|
(define-values (wrap ok)
|
|
(let loop ([found '()]
|
|
[rest current])
|
|
(match rest
|
|
[(list) (values found rest)]
|
|
[(list (and (? contains-semicolon?) head) rest* ...)
|
|
(values found rest)]
|
|
[(list head rest ...)
|
|
(loop (cons head found) rest)])))
|
|
(define semicolon (make-syntax `(%semicolon ,@wrap)
|
|
;; FIXME: this is probably the wrong token
|
|
;; to get source location from
|
|
(car tokens)
|
|
source))
|
|
|
|
|
|
(do-parse (cons semicolon ok)
|
|
(cdr tokens)
|
|
table))
|
|
|
|
(define (do-semicolon2 current tokens table)
|
|
(do-parse (cons (make-syntax '%semicolon (car tokens) source) current)
|
|
(cdr tokens)
|
|
table))
|
|
|
|
(define (do-continue current tokens table)
|
|
(do-parse current (cdr tokens) table))
|
|
|
|
(define (parse-error? tokens)
|
|
(is-first-token token-parse-error? tokens))
|
|
|
|
(define (semicolon? tokens)
|
|
(is-first-token token-semicolon? tokens))
|
|
|
|
(define (left-parens? tokens)
|
|
(is-first-token token-left-parens? tokens))
|
|
(define (right-parens? tokens)
|
|
(is-first-token token-right-parens? tokens))
|
|
(define (left-bracket? tokens)
|
|
(is-first-token token-left-bracket? tokens))
|
|
(define (right-bracket? tokens)
|
|
(is-first-token token-right-bracket? tokens))
|
|
(define (left-brace? tokens)
|
|
(is-first-token token-left-brace? tokens))
|
|
(define (right-brace? tokens)
|
|
(is-first-token token-right-brace? tokens))
|
|
|
|
(define (do-end-encloser current tokens table)
|
|
(values (reverse current) (cdr tokens)))
|
|
|
|
(define (add-dispatch-rule table rule)
|
|
(cons rule table))
|
|
(define ((do-fail kind) current tokens table)
|
|
(define line (syntax-line (car current)))
|
|
(define column (add1 (+ (syntax-span (car current))
|
|
(syntax-column (car current)))))
|
|
(error 'parse "expected a ~a character at line ~a column ~a" kind line column))
|
|
;; add a rule to the dispatch table to expect an ending token then
|
|
;; parse the sub-tree and continue
|
|
(define (make-encloser head failure-name next)
|
|
(lambda (current tokens table)
|
|
(define added (add-dispatch-rule
|
|
(add-dispatch-rule dispatch-table [list next do-end-encloser])
|
|
[list null? (do-fail failure-name)]))
|
|
(define-values (sub-tree unparsed)
|
|
(do-parse (list (make-syntax head (car tokens) source))
|
|
(cdr tokens) added))
|
|
(do-parse (cons sub-tree current) unparsed table)))
|
|
|
|
(define do-left-parens (make-encloser '#%parens ")" right-parens?))
|
|
(define do-left-bracket (make-encloser '#%brackets "}" right-bracket?))
|
|
(define do-left-brace (make-encloser '#%braces "]" right-brace?))
|
|
|
|
(define dispatch-table (list [list semicolon? do-semicolon]
|
|
[list atom? do-atom]
|
|
[list left-parens? do-left-parens]
|
|
[list left-bracket? do-left-bracket]
|
|
[list left-brace? do-left-brace]
|
|
[list parse-error? do-continue]
|
|
[list null? do-empty]))
|
|
|
|
(define (do-parse current tokens table)
|
|
(define (fail tokens)
|
|
(if (null? tokens)
|
|
(error 'read "error while reading")
|
|
(let ([first (car tokens)])
|
|
;; hack to get the current failure behavior
|
|
(do-parse current '() table)
|
|
(define line (position-line (position-token-start-pos first)))
|
|
(define column (position-col (position-token-start-pos first)))
|
|
(error 'parse "error while parsing on line ~a column ~a" line column))))
|
|
;; (printf "do parse ~a [tokens] ~a table ~a\n" (strip current) (strip tokens) table)
|
|
(let loop ([use table])
|
|
;; (printf "Check ~a on ~a null? ~a\n" use (map position-token-token tokens) (null? tokens))
|
|
(cond
|
|
[(null? use) (fail tokens)]
|
|
[(let ([dispatcher (caar use)])
|
|
(dispatcher tokens))
|
|
(define action (cadar use))
|
|
(action current tokens table)]
|
|
[else (loop (cdr use))])))
|
|
|
|
(debug 3 "Parsing tokens ~a\n" (map position-token-token tokens))
|
|
(if (null? tokens)
|
|
(datum->syntax #f '() #f)
|
|
(datum->syntax #f (do-parse '() tokens dispatch-table)
|
|
#f)))
|
|
|
|
;; strip the source location from the position tokens
|
|
(define (strip tokens)
|
|
(for/list ([token tokens])
|
|
(match token
|
|
[(position-token token start end) token]
|
|
[else token])))
|
|
|
|
(provide honu-read-syntax)
|
|
(define (honu-read-syntax [name #f] [port (current-input-port)])
|
|
(parse name (read-tokens port)))
|
|
|
|
(provide honu-read)
|
|
(define (honu-read [port (current-input-port)])
|
|
(syntax->datum (honu-read-syntax #f port)))
|
|
|
|
(define (count-lines port)
|
|
(port-count-lines! port)
|
|
port)
|
|
|
|
#;
|
|
(test-case
|
|
"Basic tests"
|
|
(check-equal? (strip (lex-string "5"))
|
|
(list (token-number 5)))
|
|
(check-equal? (strip (lex-string "5 8"))
|
|
(list (token-number 5) (token-number 8)))
|
|
(check-equal? (strip (lex-string "hello"))
|
|
(list (token-identifier 'hello)))
|
|
(check-equal? (strip (lex-string "()"))
|
|
(list (token-left-parens)
|
|
(token-right-parens)))
|
|
(check-equal? (strip (lex-string "()[]{}"))
|
|
(list (token-left-parens)
|
|
(token-right-parens)
|
|
(token-left-bracket)
|
|
(token-right-bracket)
|
|
(token-left-brace)
|
|
(token-right-brace)))
|
|
(check-equal? (strip (lex-string "foo // 5"))
|
|
(list (token-identifier 'foo)))
|
|
(check-equal? (strip (lex-string "foo // 5
|
|
bar"))
|
|
(list (token-identifier 'foo)
|
|
(token-identifier 'bar)))
|
|
(check-equal? (strip (lex-string "f(2)"))
|
|
(list (token-identifier 'f)
|
|
(token-left-parens)
|
|
(token-number 2)
|
|
(token-right-parens)))
|
|
(check-equal? (strip (lex-string "8 /* aosidfjasdf329023 */ 5"))
|
|
(list (token-number 8)
|
|
(token-number 5)))
|
|
(check-equal? (strip (lex-string "\"hello\""))
|
|
(list (token-string "hello")))
|
|
;; FIXME: how can we write a string with an escaped character in it?
|
|
#;
|
|
(check-equal? (strip (lex-string "\"hel\\\\\"lo\""))
|
|
(list (token-string "hel\"lo")))
|
|
(check-equal? (honu-read (open-input-string "f(5)"))
|
|
'(f (#%parens 5)))
|
|
(check-exn exn:fail? (lambda ()
|
|
(honu-read (count-lines (open-input-string "({)}")))))
|
|
)
|