#lang racket/base
;; This module implements read-related functions such as `read', `read-syntax',
;; and a color lexer for DrRacket.
(require rackunit)
(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre))
(require racket/match
         (for-syntax racket/base
                     syntax/parse))
(require "private/debug.rkt")

(define-tokens honu-tokens (number identifier string))

(define-empty-tokens honu-empty-tokens
  [eof fail whitespace
   left-parens right-parens
   parse-error
   left-bracket right-bracket
   left-brace right-brace
   semicolon
   block-comment
   end-of-line-comment])

(define-lex-abbrev digit (:/ #\0 #\9))
(define-lex-abbrev identifier-first-character (:or (:/ #\a #\z)
                                                   (:/ #\A #\Z)
                                                   "_" "?"))
(define-lex-abbrev identifier-character (:or identifier-first-character
                                             digit))
(define-lex-abbrev identifier (:: identifier-first-character
                                  (:* identifier-character)))
(define-lex-abbrev number (:: (:+ digit) (:? (:: "." (:+ digit)))))
(define-lex-abbrev string-character (:or (:: #\\ any-char)
                                         (:~ #\")))
(define-lex-abbrev string (:: #\" (:* string-character) #\"))
(define-lex-abbrev operator (:or "+=" "-=" "*=" "/="
                                 "+" "!=" "=>" "=" "==" "*" "/" "-" "^" "||" "|" "&&" "<="
                                 ">=" "<-" "<" ">" "!" "::" ":=" "%"))
(define-lex-abbrev block-comment (:: "/*"
                                     (complement (:: any-string "*/" any-string))
                                     "*/"))
(define-lex-abbrev line-comment (:: (:or "#" "//")
                                    (:* (:~ "\n"))
                                    ;; we might hit eof before a \n, so the
                                    ;; terminating newline is optional
                                    (:? (:or "\n" "\r"))))

(define (replace-escapes string)
  (define replacements '([#px"\\\\n" "\n"]
                         [#px"\\\\t" "\t"]))
  (for/fold ([string string])
            ([replace replacements])
    (define pattern (car replace))
    (define with (cadr replace))
    (regexp-replace* pattern string with)))
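
;; Editor-added sketch of the intended behavior: the literal two-character
;; sequences \n and \t in a lexed string are replaced with real newline and
;; tab characters. Kept disabled with #; like the tests at the bottom of this
;; file; the expected value is illustrative.
#;
(check-equal? (replace-escapes "a\\nb\\tc") "a\nb\tc")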

(provide honu-lexer)
(define honu-lexer
  (lexer-src-pos
    [(eof) (token-eof)]
    #;
    [line-comment (token-whitespace)]
    [(:or "#" "//") (token-end-of-line-comment)]
    [(:or "\n" "\r") (token-whitespace)]
    [number (token-number (string->number lexeme))]
    #;
    [block-comment (token-whitespace)]
    ["/*" (token-block-comment)]
    ["..." (token-identifier '...)]
    ["." (token-identifier '%dot)]
    ["$" (token-identifier 'honu-$)]
    ["," (token-identifier 'honu-comma)]
    [":" (token-identifier '%colon)]
    ["'" (token-identifier 'quote)]
    ["`" (token-identifier 'quasiquote)]
    ["->" (token-identifier '%arrow)]
    [operator (token-identifier (string->symbol lexeme))]
    [";" (token-semicolon)]
    ;; strip the quotes from the resulting string
    ;; TODO: find a more optimal way
    [string (let ()
              (define raw (substring (substring lexeme 1)
                                     0 (- (string-length lexeme) 2)))
              (token-string (replace-escapes raw)))]
    ["(" (token-left-parens)]  [")" (token-right-parens)]
    ["[" (token-left-bracket)] ["]" (token-right-bracket)]
    ["{" (token-left-brace)]   ["}" (token-right-brace)]
    [identifier (token-identifier (string->symbol lexeme))]
    [(union " " "\t") (token-whitespace)]
    [any-char (token-parse-error)]))
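
;; Editor-added illustration of the tokens the lexer produces; disabled, and
;; the expected list mirrors the style of the disabled "Basic tests" at the
;; end of this file rather than anything verified here:
#;
(check-equal? (strip (lex-string "x = \"hi\"; // trailing comment"))
              (list (token-identifier 'x)
                    (token-identifier '=)
                    (token-string "hi")
                    (token-semicolon)))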

(define-syntax (define-token? stx)
  (syntax-parse stx
    [(_ name)
     (define name? (datum->syntax #'name (string->symbol
                                           (format "token-~a?"
                                                   (symbol->string
                                                     (syntax->datum #'name))))
                                  #'name))
     (with-syntax ([name? name?])
       #'(begin
           (provide name?)
           (define (name? token)
             (equal? 'name (token-name token)))))]))

(define-syntax-rule (define-tokens? name ...)
  (begin
    (define-token? name) ...))

(define-tokens? eof whitespace end-of-line-comment number string
                block-comment parse-error
                identifier left-parens right-parens
                left-bracket right-bracket
                left-brace right-brace
                semicolon)
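
;; Editor-added note: `define-tokens?' above generates and provides one
;; predicate per token name, e.g. `token-number?' and `token-semicolon?'.
;; A disabled, illustrative use:
#;
(check-true (token-number? (token-number 5)))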

;; returns #t if an entire comment was read (with an ending newline)
(define (read-until-end-of-line input)
  (define (finish? what)
    (or (eof-object? what)
        (= (char->integer #\newline) what)))
  ;; #t if read a #\newline, otherwise #f
  (define (clean-end? what)
    (if (eof-object? what)
        #f
        (= (char->integer #\newline) what)))
  (let loop ()
    (define what (read-byte input))
    (if (not (finish? what))
        (loop)
        (clean-end? what))))
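
;; Editor-added sketch: the rest of the current line is consumed, and the
;; result says whether a newline (rather than eof) ended it. Disabled and
;; illustrative only.
#;
(check-true (read-until-end-of-line (open-input-string "rest of comment\nnext line")))
#;
(check-false (read-until-end-of-line (open-input-string "comment with no newline")))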

;; returns #t if an entire block comment was read
(define (read-block-comment port)
  (define comment-lexer
    (lexer
      ["/*" 'nest]
      ["*/" 'done]
      [(eof) eof]
      [any-char 'continue]))
  (define (finish? what)
    (or (eq? 'done what)
        (eof-object? what)))
  (let loop ([nesting 1])
    (define what (comment-lexer port))
    (cond
      [(eq? what 'nest) (loop (add1 nesting))]
      [(eq? what 'done) (if (> nesting 1)
                            (loop (sub1 nesting))
                            #t)]
      [(eof-object? what) #f]
      [else (loop nesting)])))
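
;; Editor-added sketch: the opening "/*" has already been consumed by
;; `honu-lexer', so the port starts inside the comment; nested /* */ pairs
;; are tracked by the `nesting' counter. Disabled and illustrative only.
#;
(check-true (read-block-comment (open-input-string "outer /* inner */ more */ after")))
#;
(check-false (read-block-comment (open-input-string "never closed")))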

;; read characters from a port and return a stream of tokens
(define (read-tokens port)
  (let loop ([tokens '()])
    (define next (honu-lexer port))
    ;; (debug "next token ~a\n" next)
    (match next
      [(struct* position-token ([token (? token-eof?)] [start-pos start] [end-pos end]))
       ;; (printf "done lexing ~a\n" tokens)
       (reverse tokens)]
      [(struct* position-token ([token (? token-end-of-line-comment?)]
                                [start-pos start]
                                [end-pos end]))
       (read-until-end-of-line port)
       (loop tokens)]
      [(struct* position-token ([token (? token-block-comment?)]
                                [start-pos start]
                                [end-pos end]))
       (read-block-comment port)
       (loop tokens)]
      [(struct* position-token ([token (? token-whitespace?)] [start-pos start] [end-pos end]))
       (loop tokens)]
      [else (loop (cons next tokens))]
      #;
      [(position-token token start end)
       ;; (printf "next is ~a eof? ~a\n" token (token-eof? token))
       (loop (cons token tokens))])))
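
;; Editor-added note: `read-tokens' returns position-token values; `strip'
;; (defined near the bottom of this file) drops the source locations, which
;; is how the disabled tests compare results. Illustrative, disabled example:
#;
(check-equal? (strip (read-tokens (open-input-string "x /* skipped */ 1")))
              (list (token-identifier 'x) (token-number 1)))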

;; symbols that can be used for colors
;;   symbol
;;   keyword
;;   comment
;;   string
;;   constant
;;   parenthesis
;;   error
;;   other
(define (honu-name->color token-name)
  ;; (printf "Get honu color for ~a\n" token-name)
  (case token-name
    [(number) 'constant]
    [(string) 'string]
    [(parens) 'parenthesis]
    [(identifier) 'symbol]
    [else 'other]))
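
;; Editor-added, disabled illustration of the token-name to color mapping:
#;
(check-equal? (map honu-name->color '(number string identifier semicolon))
              '(constant string symbol other))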

;; implements a lexer that the colorer in DrRacket expects
;; FIXME: color comments
(provide color-lexer)
(define (color-lexer port offset mode)
  ;; (printf "Parse at token ~a mode is ~a\n" offset mode)
  (define lexeme (honu-lexer port))
  (define need-backup (if mode mode 0))
  (match lexeme
    [(position-token token start end)
     ;; (printf "Lexed ~a\n" token)
     (define (encloser kind)
       (values token 'parenthesis kind
               (position-offset start)
               (position-offset end)
               need-backup mode))
     ;; (printf "Get token for at ~a\n" (position-offset start))
     (cond
       [(token-eof? token)
        (values eof 'eof #f
                (position-offset start)
                (position-offset end)
                need-backup mode)]
       [(token-parse-error? token)
        (define backup (if mode (add1 mode) 1))
        (values token
                'error
                #f
                (position-offset start)
                (position-offset end)
                backup backup)]
       [(token-end-of-line-comment? token)
        (read-until-end-of-line port)
        (define-values (line column position) (port-next-location port))
        (values #f 'comment #f
                (position-offset start)
                position
                need-backup need-backup)]
       [(token-block-comment? token)
        (read-block-comment port)
        (define-values (line column position) (port-next-location port))
        (values #f 'comment #f
                (position-offset start)
                position
                need-backup need-backup)]
       [(token-left-parens? token)   (encloser '|(|)]
       [(token-right-parens? token)  (encloser '|)|)]
       [(token-left-bracket? token)  (encloser '|[|)]
       [(token-right-bracket? token) (encloser '|]|)]
       [(token-left-brace? token)    (encloser '|{|)]
       [(token-right-brace? token)   (encloser '|}|)]
       [else (values (format "~a" (token-value token))
                     (honu-name->color (token-name token))
                     #f
                     (position-offset start)
                     (position-offset end)
                     need-backup mode)])]))
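
;; Editor-added sketch of the 7-value shape `color-lexer' returns for an
;; ordinary token (text, color symbol, paren info, start, end, backup, mode).
;; The concrete numbers assume a fresh string port and are illustrative:
#;
(let-values ([(text color paren start end backup new-mode)
              (color-lexer (open-input-string "42") 0 #f)])
  (list text color paren start end))
;; expected to be something like: (list "42" 'constant #f 1 3)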

;; convert a string to a stream of tokens
(define (lex-string input)
  (read-tokens (open-input-string input)))

;; make a syntax object out of some symbol and a position-token
(define (make-syntax datum token source)
  (match token
    [(position-token token start end)
     (datum->syntax #f datum
                    (list source (position-line start)
                          (position-col start)
                          (position-offset start)
                          (- (position-offset end)
                             (position-offset start))))]))

(define (make-syntax-from-token token source)
  (match token
    [(position-token datum start end)
     (make-syntax (token-value datum) token source)]))
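
;; Editor-added, disabled illustration: the datum survives, and the source
;; location is taken from the position-token. Line/column information is only
;; present when the port counts lines (see `count-lines' below).
#;
(let ([tokens (read-tokens (count-lines (open-input-string "hello")))])
  (syntax->datum (make-syntax-from-token (car tokens) 'example)))
;; expected to evaluate to the symbol: hello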

;; converts a stream of tokens to a tree
(define (parse source tokens)
  (define (is-first-token what? tokens)
    (match tokens
      [(list (position-token token start end) rest ...)
       (what? token)]
      [else #f]))

  (define (do-atom current tokens table)
    (do-parse (cons (make-syntax-from-token (car tokens) source) current)
              (cdr tokens)
              table))

  (define (atom? tokens)
    (is-first-token (lambda (token)
                      (or (token-identifier? token)
                          (token-string? token)
                          (token-number? token)))
                    tokens))

  (define (do-empty current tokens table)
    (reverse current))

  (define (contains-semicolon? syntax)
    (syntax-case syntax (%semicolon #%braces #%parens)
      [(%semicolon x ...) #t]
      [#%braces #t]
      [(#%braces x ...) #t]
      #;
      [(#%parens x ...) #t]
      [else #f]))

  ;; wraps syntax objects with (%semicolon ...)
  ;; it searches backwards through the list of already created syntax objects
  ;; until it either runs out of syntax objects or finds one already wrapped
  ;; with (%semicolon ...)
  ;;
  ;; if the original input is '1 + 2; 3 + 4;' it will get read as
  ;;   (1 + 2 %semicolon 3 + 4 %semicolon)
  ;; then parsing starts from the front. when the first %semicolon is reached
  ;; we will have the following (current is always backwards):
  ;;   current: (2 + 1)
  ;; do-semicolon searches this for a syntax object containing (%semicolon ...)
  ;; but since one doesn't appear the entire expression is reversed and
  ;; wrapped, resulting in
  ;;   (%semicolon 1 + 2)
  ;;
  ;; Now parsing continues with (3 + 4 %semicolon). When the next %semicolon
  ;; is hit we will have
  ;;   current: (4 + 3 (%semicolon 1 + 2))
  ;;
  ;; So do-semicolon finds (%semicolon 1 + 2) and stops searching there,
  ;; resulting in
  ;;   ((%semicolon 3 + 4) (%semicolon 1 + 2))
  ;;
  ;; The entire list is reversed at the end of parsing (see the disabled
  ;; example right after `parse' below).
  (define (do-semicolon current tokens table)
    ;; (debug "Do semicolon on ~a\n" current)
    (define-values (wrap ok)
      (let loop ([found '()]
                 [rest current])
        (match rest
          [(list) (values found rest)]
          [(list (and (? contains-semicolon?) head) rest* ...)
           (values found rest)]
          [(list head rest ...)
           (loop (cons head found) rest)])))
    (define semicolon (make-syntax `(%semicolon ,@wrap)
                                   ;; FIXME: this is probably the wrong token
                                   ;; to get source location from
                                   (car tokens)
                                   source))
    (do-parse (cons semicolon ok)
              (cdr tokens)
              table))

  (define (do-semicolon2 current tokens table)
    (do-parse (cons (make-syntax '%semicolon (car tokens) source) current)
              (cdr tokens)
              table))

  (define (do-continue current tokens table)
    (do-parse current (cdr tokens) table))

  (define (parse-error? tokens)
    (is-first-token token-parse-error? tokens))
  (define (semicolon? tokens)
    (is-first-token token-semicolon? tokens))
  (define (left-parens? tokens)
    (is-first-token token-left-parens? tokens))
  (define (right-parens? tokens)
    (is-first-token token-right-parens? tokens))
  (define (left-bracket? tokens)
    (is-first-token token-left-bracket? tokens))
  (define (right-bracket? tokens)
    (is-first-token token-right-bracket? tokens))
  (define (left-brace? tokens)
    (is-first-token token-left-brace? tokens))
  (define (right-brace? tokens)
    (is-first-token token-right-brace? tokens))

  (define (do-end-encloser current tokens table)
    (values (reverse current) (cdr tokens)))

  (define (add-dispatch-rule table rule)
    (cons rule table))

  (define ((do-fail kind) current tokens table)
    (define line (syntax-line (car current)))
    (define column (add1 (+ (syntax-span (car current))
                            (syntax-column (car current)))))
    (error 'parse "expected a ~a character at line ~a column ~a" kind line column))

  ;; add a rule to the dispatch table to expect an ending token then
  ;; parse the sub-tree and continue
  (define (make-encloser head failure-name next)
    (lambda (current tokens table)
      (define added (add-dispatch-rule
                      (add-dispatch-rule dispatch-table [list next do-end-encloser])
                      [list null? (do-fail failure-name)]))
      (define-values (sub-tree unparsed)
        (do-parse (list (make-syntax head (car tokens) source))
                  (cdr tokens) added))
      (do-parse (cons sub-tree current) unparsed table)))

  (define do-left-parens  (make-encloser '#%parens   ")" right-parens?))
  (define do-left-bracket (make-encloser '#%brackets "]" right-bracket?))
  (define do-left-brace   (make-encloser '#%braces   "}" right-brace?))

  (define dispatch-table (list [list semicolon? do-semicolon]
                               [list atom? do-atom]
                               [list left-parens? do-left-parens]
                               [list left-bracket? do-left-bracket]
                               [list left-brace? do-left-brace]
                               [list parse-error? do-continue]
                               [list null? do-empty]))

  (define (do-parse current tokens table)
    (define (fail tokens)
      (if (null? tokens)
          (error 'read "error while reading")
          (let ([first (car tokens)])
            ;; hack to get the current failure behavior
            (do-parse current '() table)
            (define line (position-line (position-token-start-pos first)))
            (define column (position-col (position-token-start-pos first)))
            (error 'parse "error while parsing on line ~a column ~a" line column))))
    ;; (printf "do parse ~a [tokens] ~a table ~a\n" (strip current) (strip tokens) table)
    (let loop ([use table])
      ;; (printf "Check ~a on ~a null? ~a\n" use (map position-token-token tokens) (null? tokens))
      (cond
        [(null? use) (fail tokens)]
        [(let ([dispatcher (caar use)])
           (dispatcher tokens))
         (define action (cadar use))
         (action current tokens table)]
        [else (loop (cdr use))])))

  (debug 3 "Parsing tokens ~a\n" (map position-token-token tokens))
  (if (null? tokens)
      (datum->syntax #f '() #f)
      (datum->syntax #f (do-parse '() tokens dispatch-table)
                     #f)))
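
;; Editor-added, disabled illustration of the semicolon wrapping described in
;; the comment above `do-semicolon'; the expected datum follows that comment
;; and is not verified here:
#;
(check-equal? (honu-read (open-input-string "1 + 2; 3 + 4;"))
              '((%semicolon 1 + 2) (%semicolon 3 + 4)))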

;; strip the source location from the position tokens
(define (strip tokens)
  (for/list ([token tokens])
    (match token
      [(position-token token start end) token]
      [else token])))

(provide honu-read-syntax)
(define (honu-read-syntax [name #f] [port (current-input-port)])
  (parse name (read-tokens port)))

(provide honu-read)
(define (honu-read [port (current-input-port)])
  (syntax->datum (honu-read-syntax #f port)))
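
;; Editor-added, disabled usage sketch: `honu-read-syntax' produces a syntax
;; object whose datum is what `honu-read' returns (compare the test on "f(5)"
;; at the bottom of this file):
#;
(check-equal? (syntax->datum (honu-read-syntax 'example (open-input-string "f(5)")))
              '(f (#%parens 5)))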

(define (count-lines port)
  (port-count-lines! port)
  port)

#;
(test-case
  "Basic tests"
  (check-equal? (strip (lex-string "5"))
                (list (token-number 5)))
  (check-equal? (strip (lex-string "5 8"))
                (list (token-number 5) (token-number 8)))
  (check-equal? (strip (lex-string "hello"))
                (list (token-identifier 'hello)))
  (check-equal? (strip (lex-string "()"))
                (list (token-left-parens)
                      (token-right-parens)))
  (check-equal? (strip (lex-string "()[]{}"))
                (list (token-left-parens)
                      (token-right-parens)
                      (token-left-bracket)
                      (token-right-bracket)
                      (token-left-brace)
                      (token-right-brace)))
  (check-equal? (strip (lex-string "foo // 5"))
                (list (token-identifier 'foo)))
  (check-equal? (strip (lex-string "foo // 5
bar"))
                (list (token-identifier 'foo)
                      (token-identifier 'bar)))
  (check-equal? (strip (lex-string "f(2)"))
                (list (token-identifier 'f)
                      (token-left-parens)
                      (token-number 2)
                      (token-right-parens)))
  (check-equal? (strip (lex-string "8 /* aosidfjasdf329023 */ 5"))
                (list (token-number 8)
                      (token-number 5)))
  (check-equal? (strip (lex-string "\"hello\""))
                (list (token-string "hello")))
  ;; FIXME: how can we write a string with an escaped character in it?
  #;
  (check-equal? (strip (lex-string "\"hel\\\\\"lo\""))
                (list (token-string "hel\"lo")))
  (check-equal? (honu-read (open-input-string "f(5)"))
                '(f (#%parens 5)))
  (check-exn exn:fail? (lambda ()
                         (honu-read (count-lines (open-input-string "({)}")))))
  )