#lang racket/base
;; This module implements read-related functions such as `read', `read-syntax',
;; and a color lexer for DrRacket.
(require rackunit)
(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre))
(require racket/match
         (for-syntax racket/base
                     syntax/parse))
(require "private/debug.rkt")

(define-tokens honu-tokens (number identifier string))

(define-empty-tokens honu-empty-tokens
  [eof fail whitespace
   left-parens right-parens
   parse-error
   left-bracket right-bracket
   left-brace right-brace
   semicolon
   block-comment
   end-of-line-comment])

(define-lex-abbrev digit (:/ #\0 #\9))
(define-lex-abbrev identifier-first-character (:or (:/ #\a #\z)
                                                   (:/ #\A #\Z)
                                                   "_" "?"))
(define-lex-abbrev identifier-character (:or identifier-first-character
                                             digit))
(define-lex-abbrev identifier (:: identifier-first-character
                                  (:* identifier-character)))
(define-lex-abbrev number (:: (:+ digit) (:? (:: "." (:+ digit)))))
(define-lex-abbrev string-character (:or (:: #\\ any-char)
                                         (:~ #\")))
(define-lex-abbrev string (:: #\" (:* string-character) #\"))
(define-lex-abbrev operator (:or "+=" "-=" "*=" "/="
                                 "+" "!=" "=>" "=" "==" "*" "/" "-" "^" "||" "|" "&&" "<="
                                 ">=" "<-" "<" ">" "!" "::" ":=" "%"))
(define-lex-abbrev block-comment (:: "/*"
                                     (complement (:: any-string "*/" any-string))
                                     "*/"))
(define-lex-abbrev line-comment (:: (:or "#" "//")
                                    (:* (:~ "\n"))
                                    ;; we might hit eof before a \n, so the
                                    ;; terminating newline is optional
                                    (:? (:or "\n" "\r"))))

(define (replace-escapes string)
  (define replacements '([#px"\\\\n" "\n"]
                         [#px"\\\\t" "\t"]))
  (for/fold ([string string])
            ([replace replacements])
    (define pattern (car replace))
    (define with (cadr replace))
    (regexp-replace* pattern string with)))
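
;; Editor-added sketch of the intended behavior: the literal two-character
;; sequences \n and \t in a lexed string are replaced with real newline and
;; tab characters. Kept disabled with #; like the tests at the bottom of this
;; file; the expected value is illustrative.
#;
(check-equal? (replace-escapes "a\\nb\\tc") "a\nb\tc")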

(provide honu-lexer)
(define honu-lexer
  (lexer-src-pos
    [(eof) (token-eof)]
    #;
    [line-comment (token-whitespace)]
    [(:or "#" "//") (token-end-of-line-comment)]
    [(:or "\n" "\r") (token-whitespace)]
    [number (token-number (string->number lexeme))]
    #;
    [block-comment (token-whitespace)]
    ["/*" (token-block-comment)]
    ["..." (token-identifier '...)]
    ["." (token-identifier '%dot)]
    ["$" (token-identifier 'honu-$)]
    ["," (token-identifier 'honu-comma)]
    [":" (token-identifier '%colon)]
    ["'" (token-identifier 'quote)]
    ["`" (token-identifier 'quasiquote)]
    ["->" (token-identifier '%arrow)]
    [operator (token-identifier (string->symbol lexeme))]
    [";" (token-semicolon)]
    ;; strip the quotes from the resulting string
    ;; TODO: find a more optimal way
    [string (let ()
              (define raw (substring (substring lexeme 1)
                                     0 (- (string-length lexeme) 2)))
              (token-string (replace-escapes raw)))]
    ["(" (token-left-parens)]  [")" (token-right-parens)]
    ["[" (token-left-bracket)] ["]" (token-right-bracket)]
    ["{" (token-left-brace)]   ["}" (token-right-brace)]
    [identifier (token-identifier (string->symbol lexeme))]
    [(union " " "\t") (token-whitespace)]
    [any-char (token-parse-error)]))
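
;; Editor-added illustration of the tokens the lexer produces; disabled, and
;; the expected list mirrors the style of the disabled "Basic tests" at the
;; end of this file rather than anything verified here:
#;
(check-equal? (strip (lex-string "x = \"hi\"; // trailing comment"))
              (list (token-identifier 'x)
                    (token-identifier '=)
                    (token-string "hi")
                    (token-semicolon)))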

(define-syntax (define-token? stx)
  (syntax-parse stx
    [(_ name)
     (define name? (datum->syntax #'name (string->symbol
                                           (format "token-~a?"
                                                   (symbol->string
                                                     (syntax->datum #'name))))
                                  #'name))
     (with-syntax ([name? name?])
       #'(begin
           (provide name?)
           (define (name? token)
             (equal? 'name (token-name token)))))]))

(define-syntax-rule (define-tokens? name ...)
  (begin
    (define-token? name) ...))

(define-tokens? eof whitespace end-of-line-comment number string
                block-comment parse-error
                identifier left-parens right-parens
                left-bracket right-bracket
                left-brace right-brace
                semicolon)
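
;; Editor-added note: `define-tokens?' above generates and provides one
;; predicate per token name, e.g. `token-number?' and `token-semicolon?'.
;; A disabled, illustrative use:
#;
(check-true (token-number? (token-number 5)))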

;; returns #t if an entire comment was read (with an ending newline)
(define (read-until-end-of-line input)
  (define (finish? what)
    (or (eof-object? what)
        (= (char->integer #\newline) what)))
  ;; #t if read a #\newline, otherwise #f
  (define (clean-end? what)
    (if (eof-object? what)
        #f
        (= (char->integer #\newline) what)))
  (let loop ()
    (define what (read-byte input))
    (if (not (finish? what))
        (loop)
        (clean-end? what))))
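
;; Editor-added sketch: the rest of the current line is consumed, and the
;; result says whether a newline (rather than eof) ended it. Disabled and
;; illustrative only.
#;
(check-true (read-until-end-of-line (open-input-string "rest of comment\nnext line")))
#;
(check-false (read-until-end-of-line (open-input-string "comment with no newline")))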

;; returns #t if an entire block comment was read
(define (read-block-comment port)
  (define comment-lexer
    (lexer
      ["/*" 'nest]
      ["*/" 'done]
      [(eof) eof]
      [any-char 'continue]))
  (define (finish? what)
    (or (eq? 'done what)
        (eof-object? what)))
  (let loop ([nesting 1])
    (define what (comment-lexer port))
    (cond
      [(eq? what 'nest) (loop (add1 nesting))]
      [(eq? what 'done) (if (> nesting 1)
                            (loop (sub1 nesting))
                            #t)]
      [(eof-object? what) #f]
      [else (loop nesting)])))
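
;; Editor-added sketch: the opening "/*" has already been consumed by
;; `honu-lexer', so the port starts inside the comment; nested /* */ pairs
;; are tracked by the `nesting' counter. Disabled and illustrative only.
#;
(check-true (read-block-comment (open-input-string "outer /* inner */ more */ after")))
#;
(check-false (read-block-comment (open-input-string "never closed")))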

;; read characters from a port and return a stream of tokens
(define (read-tokens port)
  (let loop ([tokens '()])
    (define next (honu-lexer port))
    ;; (debug "next token ~a\n" next)
    (match next
      [(struct* position-token ([token (? token-eof?)] [start-pos start] [end-pos end]))
       ;; (printf "done lexing ~a\n" tokens)
       (reverse tokens)]
      [(struct* position-token ([token (? token-end-of-line-comment?)]
                                [start-pos start]
                                [end-pos end]))
       (read-until-end-of-line port)
       (loop tokens)]
      [(struct* position-token ([token (? token-block-comment?)]
                                [start-pos start]
                                [end-pos end]))
       (read-block-comment port)
       (loop tokens)]
      [(struct* position-token ([token (? token-whitespace?)] [start-pos start] [end-pos end]))
       (loop tokens)]
      [else (loop (cons next tokens))]
      #;
      [(position-token token start end)
       ;; (printf "next is ~a eof? ~a\n" token (token-eof? token))
       (loop (cons token tokens))])))
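
;; Editor-added note: `read-tokens' returns position-token values; `strip'
;; (defined near the bottom of this file) drops the source locations, which
;; is how the disabled tests compare results. Illustrative, disabled example:
#;
(check-equal? (strip (read-tokens (open-input-string "x /* skipped */ 1")))
              (list (token-identifier 'x) (token-number 1)))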

;; symbols that can be used for colors
;;   symbol
;;   keyword
;;   comment
;;   string
;;   constant
;;   parenthesis
;;   error
;;   other
(define (honu-name->color token-name)
  ;; (printf "Get honu color for ~a\n" token-name)
  (case token-name
    [(number) 'constant]
    [(string) 'string]
    [(parens) 'parenthesis]
    [(identifier) 'symbol]
    [else 'other]))
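
;; Editor-added, disabled illustration of the token-name to color mapping:
#;
(check-equal? (map honu-name->color '(number string identifier semicolon))
              '(constant string symbol other))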

;; implements a lexer that the colorer in DrRacket expects
;; FIXME: color comments
(provide color-lexer)
(define (color-lexer port offset mode)
  ;; (printf "Parse at token ~a mode is ~a\n" offset mode)
  (define lexeme (honu-lexer port))
  (define need-backup (if mode mode 0))
  (match lexeme
    [(position-token token start end)
     ;; (printf "Lexed ~a\n" token)
     (define (encloser kind)
       (values token 'parenthesis kind
               (position-offset start)
               (position-offset end)
               need-backup mode))
     ;; (printf "Get token for at ~a\n" (position-offset start))
     (cond
       [(token-eof? token)
        (values eof 'eof #f
                (position-offset start)
                (position-offset end)
                need-backup mode)]
       [(token-parse-error? token)
        (define backup (if mode (add1 mode) 1))
        (values token
                'error
                #f
                (position-offset start)
                (position-offset end)
                backup backup)]
       [(token-end-of-line-comment? token)
        (read-until-end-of-line port)
        (define-values (line column position) (port-next-location port))
        (values #f 'comment #f
                (position-offset start)
                position
                need-backup need-backup)]
       [(token-block-comment? token)
        (read-block-comment port)
        (define-values (line column position) (port-next-location port))
        (values #f 'comment #f
                (position-offset start)
                position
                need-backup need-backup)]
       [(token-left-parens? token)   (encloser '|(|)]
       [(token-right-parens? token)  (encloser '|)|)]
       [(token-left-bracket? token)  (encloser '|[|)]
       [(token-right-bracket? token) (encloser '|]|)]
       [(token-left-brace? token)    (encloser '|{|)]
       [(token-right-brace? token)   (encloser '|}|)]
       [else (values (format "~a" (token-value token))
                     (honu-name->color (token-name token))
                     #f
                     (position-offset start)
                     (position-offset end)
                     need-backup mode)])]))
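
;; Editor-added sketch of the 7-value shape `color-lexer' returns for an
;; ordinary token (text, color symbol, paren info, start, end, backup, mode).
;; The concrete numbers assume a fresh string port and are illustrative:
#;
(let-values ([(text color paren start end backup new-mode)
              (color-lexer (open-input-string "42") 0 #f)])
  (list text color paren start end))
;; expected to be something like: (list "42" 'constant #f 1 3)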

;; convert a string to a stream of tokens
(define (lex-string input)
  (read-tokens (open-input-string input)))

;; make a syntax object out of some symbol and a position-token
(define (make-syntax datum token source)
  (match token
    [(position-token token start end)
     (datum->syntax #f datum
                    (list source (position-line start)
                          (position-col start)
                          (position-offset start)
                          (- (position-offset end)
                             (position-offset start))))]))

(define (make-syntax-from-token token source)
  (match token
    [(position-token datum start end)
     (make-syntax (token-value datum) token source)]))
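
;; Editor-added, disabled illustration: the datum survives, and the source
;; location is taken from the position-token. Line/column information is only
;; present when the port counts lines (see `count-lines' below).
#;
(let ([tokens (read-tokens (count-lines (open-input-string "hello")))])
  (syntax->datum (make-syntax-from-token (car tokens) 'example)))
;; expected to evaluate to the symbol: hello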

;; converts a stream of tokens to a tree
(define (parse source tokens)
  (define (is-first-token what? tokens)
    (match tokens
      [(list (position-token token start end) rest ...)
       (what? token)]
      [else #f]))

  (define (do-atom current tokens table)
    (do-parse (cons (make-syntax-from-token (car tokens) source) current)
              (cdr tokens)
              table))

  (define (atom? tokens)
    (is-first-token (lambda (token)
                      (or (token-identifier? token)
                          (token-string? token)
                          (token-number? token)))
                    tokens))

  (define (do-empty current tokens table)
    (reverse current))

  (define (contains-semicolon? syntax)
    (syntax-case syntax (%semicolon #%braces #%parens)
      [(%semicolon x ...) #t]
      [#%braces #t]
      [(#%braces x ...) #t]
      #;
      [(#%parens x ...) #t]
      [else #f]))

  ;; wraps syntax objects with (%semicolon ...)
  ;; it searches backwards through the list of already created syntax objects
  ;; until it either runs out of syntax objects or finds one already wrapped
  ;; with (%semicolon ...)
  ;;
  ;; if the original input is '1 + 2; 3 + 4;' it will get read as
  ;;   (1 + 2 %semicolon 3 + 4 %semicolon)
  ;; then parsing starts from the front. when the first %semicolon is reached
  ;; we will have the following (current is always backwards):
  ;;   current: (2 + 1)
  ;; do-semicolon searches this for a syntax object containing (%semicolon ...)
  ;; but since one doesn't appear the entire expression is reversed and
  ;; wrapped, resulting in
  ;;   (%semicolon 1 + 2)
  ;;
  ;; Now parsing continues with (3 + 4 %semicolon). When the next %semicolon
  ;; is hit we will have
  ;;   current: (4 + 3 (%semicolon 1 + 2))
  ;;
  ;; So do-semicolon finds (%semicolon 1 + 2) and stops searching there,
  ;; resulting in
  ;;   ((%semicolon 3 + 4) (%semicolon 1 + 2))
  ;;
  ;; The entire list is reversed at the end of parsing (see the disabled
  ;; example right after `parse' below).
  (define (do-semicolon current tokens table)
    ;; (debug "Do semicolon on ~a\n" current)
    (define-values (wrap ok)
      (let loop ([found '()]
                 [rest current])
        (match rest
          [(list) (values found rest)]
          [(list (and (? contains-semicolon?) head) rest* ...)
           (values found rest)]
          [(list head rest ...)
           (loop (cons head found) rest)])))
    (define semicolon (make-syntax `(%semicolon ,@wrap)
                                   ;; FIXME: this is probably the wrong token
                                   ;; to get source location from
                                   (car tokens)
                                   source))
    (do-parse (cons semicolon ok)
              (cdr tokens)
              table))

  (define (do-semicolon2 current tokens table)
    (do-parse (cons (make-syntax '%semicolon (car tokens) source) current)
              (cdr tokens)
              table))

  (define (do-continue current tokens table)
    (do-parse current (cdr tokens) table))

  (define (parse-error? tokens)
    (is-first-token token-parse-error? tokens))
  (define (semicolon? tokens)
    (is-first-token token-semicolon? tokens))
  (define (left-parens? tokens)
    (is-first-token token-left-parens? tokens))
  (define (right-parens? tokens)
    (is-first-token token-right-parens? tokens))
  (define (left-bracket? tokens)
    (is-first-token token-left-bracket? tokens))
  (define (right-bracket? tokens)
    (is-first-token token-right-bracket? tokens))
  (define (left-brace? tokens)
    (is-first-token token-left-brace? tokens))
  (define (right-brace? tokens)
    (is-first-token token-right-brace? tokens))

  (define (do-end-encloser current tokens table)
    (values (reverse current) (cdr tokens)))

  (define (add-dispatch-rule table rule)
    (cons rule table))

  (define ((do-fail kind) current tokens table)
    (define line (syntax-line (car current)))
    (define column (add1 (+ (syntax-span (car current))
                            (syntax-column (car current)))))
    (error 'parse "expected a ~a character at line ~a column ~a" kind line column))

  ;; add a rule to the dispatch table to expect an ending token then
  ;; parse the sub-tree and continue
  (define (make-encloser head failure-name next)
    (lambda (current tokens table)
      (define added (add-dispatch-rule
                      (add-dispatch-rule dispatch-table [list next do-end-encloser])
                      [list null? (do-fail failure-name)]))
      (define-values (sub-tree unparsed)
        (do-parse (list (make-syntax head (car tokens) source))
                  (cdr tokens) added))
      (do-parse (cons sub-tree current) unparsed table)))

  (define do-left-parens  (make-encloser '#%parens   ")" right-parens?))
  (define do-left-bracket (make-encloser '#%brackets "]" right-bracket?))
  (define do-left-brace   (make-encloser '#%braces   "}" right-brace?))

  (define dispatch-table (list [list semicolon? do-semicolon]
                               [list atom? do-atom]
                               [list left-parens? do-left-parens]
                               [list left-bracket? do-left-bracket]
                               [list left-brace? do-left-brace]
                               [list parse-error? do-continue]
                               [list null? do-empty]))

  (define (do-parse current tokens table)
    (define (fail tokens)
      (if (null? tokens)
          (error 'read "error while reading")
          (let ([first (car tokens)])
            ;; hack to get the current failure behavior
            (do-parse current '() table)
            (define line (position-line (position-token-start-pos first)))
            (define column (position-col (position-token-start-pos first)))
            (error 'parse "error while parsing on line ~a column ~a" line column))))
    ;; (printf "do parse ~a [tokens] ~a table ~a\n" (strip current) (strip tokens) table)
    (let loop ([use table])
      ;; (printf "Check ~a on ~a null? ~a\n" use (map position-token-token tokens) (null? tokens))
      (cond
        [(null? use) (fail tokens)]
        [(let ([dispatcher (caar use)])
           (dispatcher tokens))
         (define action (cadar use))
         (action current tokens table)]
        [else (loop (cdr use))])))

  (debug 3 "Parsing tokens ~a\n" (map position-token-token tokens))
  (if (null? tokens)
      (datum->syntax #f '() #f)
      (datum->syntax #f (do-parse '() tokens dispatch-table)
                     #f)))
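
;; Editor-added, disabled illustration of the semicolon wrapping described in
;; the comment above `do-semicolon'; the expected datum follows that comment
;; and is not verified here:
#;
(check-equal? (honu-read (open-input-string "1 + 2; 3 + 4;"))
              '((%semicolon 1 + 2) (%semicolon 3 + 4)))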

;; strip the source location from the position tokens
(define (strip tokens)
  (for/list ([token tokens])
    (match token
      [(position-token token start end) token]
      [else token])))

(provide honu-read-syntax)
(define (honu-read-syntax [name #f] [port (current-input-port)])
  (parse name (read-tokens port)))

(provide honu-read)
(define (honu-read [port (current-input-port)])
  (syntax->datum (honu-read-syntax #f port)))
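
;; Editor-added, disabled usage sketch: `honu-read-syntax' produces a syntax
;; object whose datum is what `honu-read' returns (compare the test on "f(5)"
;; at the bottom of this file):
#;
(check-equal? (syntax->datum (honu-read-syntax 'example (open-input-string "f(5)")))
              '(f (#%parens 5)))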

(define (count-lines port)
  (port-count-lines! port)
  port)

#;
(test-case
  "Basic tests"
  (check-equal? (strip (lex-string "5"))
                (list (token-number 5)))
  (check-equal? (strip (lex-string "5 8"))
                (list (token-number 5) (token-number 8)))
  (check-equal? (strip (lex-string "hello"))
                (list (token-identifier 'hello)))
  (check-equal? (strip (lex-string "()"))
                (list (token-left-parens)
                      (token-right-parens)))
  (check-equal? (strip (lex-string "()[]{}"))
                (list (token-left-parens)
                      (token-right-parens)
                      (token-left-bracket)
                      (token-right-bracket)
                      (token-left-brace)
                      (token-right-brace)))
  (check-equal? (strip (lex-string "foo // 5"))
                (list (token-identifier 'foo)))
  (check-equal? (strip (lex-string "foo // 5
bar"))
                (list (token-identifier 'foo)
                      (token-identifier 'bar)))
  (check-equal? (strip (lex-string "f(2)"))
                (list (token-identifier 'f)
                      (token-left-parens)
                      (token-number 2)
                      (token-right-parens)))
  (check-equal? (strip (lex-string "8 /* aosidfjasdf329023 */ 5"))
                (list (token-number 8)
                      (token-number 5)))
  (check-equal? (strip (lex-string "\"hello\""))
                (list (token-string "hello")))
  ;; FIXME: how can we write a string with an escaped character in it?
  #;
  (check-equal? (strip (lex-string "\"hel\\\\\"lo\""))
                (list (token-string "hel\"lo")))
  (check-equal? (honu-read (open-input-string "f(5)"))
                '(f (#%parens 5)))
  (check-exn exn:fail? (lambda ()
                         (honu-read (count-lines (open-input-string "({)}")))))
  )