From 1f9b9b4c5112ccdeb58517cf5ea78ca57e021b63 Mon Sep 17 00:00:00 2001
From: Jon Rafkind <rafkind@racket-lang.org>
Date: Tue, 26 Jul 2011 12:36:41 -0600
Subject: [PATCH] parse tokens into a tree

---
 collects/honu/core/read.rkt | 164 +++++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 49 deletions(-)

diff --git a/collects/honu/core/read.rkt b/collects/honu/core/read.rkt
index c8385ab0b8..df5f378a82 100644
--- a/collects/honu/core/read.rkt
+++ b/collects/honu/core/read.rkt
@@ -2,23 +2,28 @@
 
 (require rackunit)
 (require parser-tools/lex
-	 (prefix-in : parser-tools/lex-sre))
-(require racket/match)
+         (prefix-in : parser-tools/lex-sre))
+(require racket/match
+         (for-syntax racket/base
+                     syntax/parse))
 
 (define-tokens honu-tokens (number identifier))
 
-(define-empty-tokens honu-empty-tokens (eof fail whitespace
-					left-parens right-parens
-					left-bracket right-bracket
-					left-brace right-brace
-					end-of-line-comment))
+(define-empty-tokens honu-empty-tokens
+                     [eof fail whitespace
+                      left-parens right-parens
+                      left-bracket right-bracket
+                      left-brace right-brace
+                      end-of-line-comment])
 
+(define-lex-abbrev digit (:/ #\0 #\9))
 (define-lex-abbrev identifier-first-character (:or (:/ #\a #\z)
-						   (:/ #\A #\Z)))
-(define-lex-abbrev identifier-character identifier-first-character)
+                                                   (:/ #\A #\Z)))
+(define-lex-abbrev identifier-character (:or identifier-first-character
+                                             digit))
 (define-lex-abbrev identifier (:: identifier-first-character
-				  (:+ identifier-character)))
-(define-lex-abbrev number (:+ (:/ #\0 #\9)))
+                                  (:* identifier-character)))
+(define-lex-abbrev number (:+ digit))
 
 (define honu-lexer
   (lexer-src-pos
@@ -26,27 +31,33 @@
     [(:or "#" "//") (token-end-of-line-comment)]
     [number (token-number (string->number lexeme))]
     ["." (token-identifier '|.|)]
-    ["(" (token-left-parens)]
-    [")" (token-right-parens)]
-    ["[" (token-left-bracket)]
-    ["]" (token-right-bracket)]
-    ["{" (token-left-brace)]
-    ["}" (token-right-brace)]
+    ["(" (token-left-parens)]  [")" (token-right-parens)]
+    ["[" (token-left-bracket)] ["]" (token-right-bracket)]
+    ["{" (token-left-brace)]   ["}" (token-right-brace)]
     [identifier (token-identifier (string->symbol lexeme))]
     [(union " " "\t") (token-whitespace)]
     ))
 
-(define (token-eof? token)
-  (equal? 'eof (token-name token)))
+(define-syntax (define-token? stx)
+  (syntax-parse stx
+    [(_ name)
+     (define name? (datum->syntax #'name (string->symbol
+                                           (format "token-~a?"
+                                                 (symbol->string
+                                                   (syntax->datum #'name))))
+                                  #'name))
+     (with-syntax ([name? name?])
+       #'(define (name? token)
+           (equal? 'name (token-name token))))]))
 
-(define (token-whitespace? token)
-  (equal? 'whitespace (token-name token)))
+(define-syntax-rule (define-tokens? name ...)
+                    (begin
+                      (define-token? name) ...))
 
-(define (token-end-of-line-comment? token)
-  (equal? 'end-of-line-comment (token-name token)))
+(define-tokens? eof whitespace end-of-line-comment number
+                identifier left-parens right-parens)
 
-(define (read-until-end-of-line input)
-  (define (finish? what)
+(define (read-until-end-of-line input) (define (finish? what)
     (or (eof-object? what)
 	(= (char->integer #\newline) what)))
   (let loop ()
@@ -54,8 +65,7 @@
     (when (not (finish? what))
       (loop))))
 
-(define (lex-string input)
-  (define port (open-input-string input))
+(define (read-tokens port)
   (let loop ([tokens '()])
     (define next (honu-lexer port))
     ;; (printf "next ~a\n" next)
@@ -70,32 +80,88 @@
 	    (loop tokens)]
 	   [(struct* position-token ([token (? token-whitespace?)] [start-pos start] [end-pos end]))
 	    (loop tokens)]
+       [else (loop (cons next tokens))]
+       #;
 	   [(position-token token start end)
 	    ;; (printf "next is ~a eof? ~a\n" token (token-eof? token))
 	    (loop (cons token tokens))])))
 
+;; convert a string to a stream of tokens
+(define (lex-string input)
+  (read-tokens (open-input-string input)))
+
+;; converts a stream of tokens to a tree
+(define (parse tokens)
+  (let loop ([current '()]
+             [tokens tokens]
+             [stop '()])
+    (if (null? tokens)
+      (reverse current)
+      (match (car tokens)
+        [(position-token token start end)
+         (cond
+           [(or (token-number? token)
+                (token-identifier? token))
+            (loop (cons token current)
+                  (cdr tokens)
+                  stop)]
+           [(token-right-parens? token)
+            (match stop
+              [(list 'right-parens rest ...)
+               (values (reverse current)
+                       rest)]
+              [else (error 'parse "expected a left parentheses before the right parentheses")])]
+           [(token-left-parens? token)
+            (define-values (parsed more-tokens)
+                           (loop '(#%parens)
+                                 (cdr tokens)
+                                 (cons 'right-parens stop)))
+            (loop (cons parsed current)
+                  more-tokens
+                  stop)]
+           [else (error 'parse "cannot parse ~a" token)])]))))
+
+;; strip the source location from the position tokens
+(define (strip tokens)
+  (for/list ([token tokens])
+    (match token
+      [(position-token token start end) token])))
+
+(define (honu-read-syntax [port (current-input-port)])
+  (read-tokens (parse port)))
+
+(define (honu-read [port (current-input-port)])
+  (syntax->datum (honu-read-syntax port)))
+
 (test-case
   "Basic tests"
-  (check-equal? (lex-string "5")
-		(list (token-number 5)))
-  (check-equal? (lex-string "5 8")
-		(list (token-number 5) (token-number 8)))
-  (check-equal? (lex-string "hello")
-		(list (token-identifier 'hello)))
-  (check-equal? (lex-string "()")
-		(list (token-left-parens)
-		      (token-right-parens)))
-  (check-equal? (lex-string "()[]{}")
-		(list (token-left-parens)
-		      (token-right-parens)
-		      (token-left-bracket)
-		      (token-right-bracket)
-		      (token-left-brace)
-		      (token-right-brace)))
-  (check-equal? (lex-string "foo // 5")
-		(list (token-identifier 'foo)))
-  (check-equal? (lex-string "foo // 5
-			     bar")
-		(list (token-identifier 'foo)
-		      (token-identifier 'bar)))
+  (check-equal? (strip (lex-string "5"))
+                (list (token-number 5)))
+  (check-equal? (strip (lex-string "5 8"))
+                (list (token-number 5) (token-number 8)))
+  (check-equal? (strip (lex-string "hello"))
+                (list (token-identifier 'hello)))
+  (check-equal? (strip (lex-string "()"))
+                (list (token-left-parens)
+                      (token-right-parens)))
+  (check-equal? (strip (lex-string "()[]{}"))
+                (list (token-left-parens)
+                      (token-right-parens)
+                      (token-left-bracket)
+                      (token-right-bracket)
+                      (token-left-brace)
+                      (token-right-brace)))
+  (check-equal? (strip (lex-string "foo // 5"))
+                (list (token-identifier 'foo)))
+  (check-equal? (strip (lex-string "foo // 5
+                            bar"))
+                (list (token-identifier 'foo)
+                      (token-identifier 'bar)))
+  (check-equal? (strip (lex-string "f(2)"))
+                (list (token-identifier 'f)
+                      (token-left-parens)
+                      (token-number 2)
+                      (token-right-parens)))
   )
+
+(parse (lex-string "f(5)"))