#lang racket (require "parameter.rkt") (require syntax/srcloc) ;;; ;;; NOTES: ;;; Changes from planet version: ]] (CDB) Bug fixed ;;; Strings are supported. ;;; Assignments changed from set! to define. ;;; Assignment of functions. ;;; Bug fix: identifiers with more than one _ wasn't converted correctly ; = ; | ; | variable reference ; | [ ] application ; | { } list construction ; | + addition ; | - subtraction ; | * multiplication ; | / division ; | ^ exponentiation ; | - negation ; | ( ) grouping ; | ( ) := function definition ; An identifier begins with a letter, ; and is optionally followed by series of letters, digits or underscores. ; An underscore is converted to a -. Thus list_ref will refer to list-ref. ; A number is an non-empty series of digits, ; optionally followed by a period followed by a series of digits. ; A number is a " followed by a series of non-" characters followed by a " . (provide parse-expression color-lexer) (require parser-tools/yacc parser-tools/lex (prefix-in : parser-tools/lex-sre) syntax/readerr) (define-tokens value-tokens (NUMBER STRING IDENTIFIER IDENTIFIEROP IDENTIFIER:=)) (define-empty-tokens op-tokens (newline := OP CP ; ( ) CP:= ; ):= OB CB ; [ ] OC CC ; { } ODB ; [[ ]] COMMA ; , SEMI ; ; PERIOD ; . LAMBDA ; lambda or λ SQRT ; √ NEG ; ¬ (logical negation) LESS-EQUAL ; <= or ≤ GREATER-EQUAL ; >= or ≥ NOT-EQUAL ; <> or ≠ = < > + - * / ^ DEFINE EOF)) (define-lex-abbrevs [letter (:or (:/ "a" "z") (:/ #\A #\Z) )] [digit (:/ #\0 #\9)] [string (:: #\" (:* (:~ #\")) #\")] [identifier (:: letter (:* (:or letter digit #\_ #\?)))] [identifier:= (:: letter (:* (:or letter digit #\_ #\?)) ":=")] [identifierOP (:: letter (:* (:or letter digit #\_ #\?)) "(")] [comment (:: "%" (complement (:: any-string #\newline any-string)) #\newline)]) (define (string-drop-right n s) (substring s 0 (- (string-length s) n))) (define expression-lexer (lexer-src-pos [(eof) 'EOF] [(:or #\tab #\space #\newline) ; this skips whitespace (return-without-pos (expression-lexer input-port))] [#\newline (token-newline)] ; (token-newline) returns 'newline [(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"") (string->symbol lexeme)] ["(" 'OP] [")" 'CP] ["[" 'OB] ["]" 'CB] ["{" 'OC] ["}" 'CC] ["[[" 'ODB] ; ["]]" 'CDB] ["," 'COMMA] [";" 'SEMI] ["." 'PERIOD] [#\λ 'LAMBDA] ["lambda" 'LAMBDA] ["√" 'SQRT] ["¬" 'NEG] ["≤" 'LESS-EQUAL] ["<=" 'LESS-EQUAL] ["≥" 'GREATER-EQUAL] [">=" 'GREATER-EQUAL] ["<>" 'NOT-EQUAL] ["≠" 'NOT-EQUAL] ["define" 'DEFINE] [comment (return-without-pos (expression-lexer input-port))] [string (token-STRING (substring lexeme 1 (- (string-length lexeme) 1)))] ; The parser can only look ahead 1 token, so we have 3 ; different identifiers to see whether := or ( comes after the identfier. ; This is enough to prevent shift/reduce conflicts between atom, definition, ; and application. [identifier:= (token-IDENTIFIER:= (string->symbol (string-drop-right 2 (regexp-replace* #rx"_" lexeme "-"))))] [identifierOP (token-IDENTIFIEROP (string->symbol (string-drop-right 1 (regexp-replace* #rx"_" lexeme "-"))))] [identifier (token-IDENTIFIER (string->symbol (regexp-replace* #rx"_" lexeme "-")))] [(:+ digit) (token-NUMBER (string->number lexeme))] [(:: (:+ digit) #\. (:* digit)) (token-NUMBER (string->number lexeme))])) ; The color-expression-lexer is for syntax coloring. ; The function returns 5 values: ; - Either a string containing the matching text or the eof object. ; Block comments and specials currently return an empty string. ; This may change in the future to other string or non-string data. ; - A symbol in '(error comment sexp-comment white-space constant ; string no-color parenthesis other symbol eof). ; - A symbol in '(|(| |)| |[| |]| |{| |}|) or #f. ; - A number representing the starting position of the match (or #f if eof). ; - A number representing the ending position of the match (or #f if eof). (define (syn-val a b c d e) (values a b c (position-offset d) #;(position-offset e) (max (position-offset e) (+ (position-offset d) 1)))) (define color-lexer (lexer [(eof) (syn-val lexeme 'eof #f start-pos end-pos)] [(:or #\tab #\space #\newline) (syn-val lexeme 'white-space #f start-pos end-pos)] [(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"") (syn-val lexeme 'symbol #f start-pos end-pos)] [(:or "(" ")" "[" "]" "{" "}") (syn-val lexeme 'parenthesis #f start-pos end-pos)] [(:or "[[" "," ";" "." "λ" "lambda" "√" "¬" "≤" "<=" "≥" ">=" "<>" "≠") (syn-val lexeme 'no-color #f start-pos end-pos)] ["define" (syn-val lexeme 'constant #f start-pos end-pos)] [comment (syn-val lexeme 'comment #f start-pos end-pos)] [string (syn-val lexeme 'string #f start-pos end-pos)] ; The parser can only look ahead 1 token, so we have 3 ; different identifiers to see whether := or ( comes after the identfier. ; This is enough to prevent shift/reduce conflicts between atom, definition, ; and application. [(:or identifier:= identifierOP identifier) (syn-val lexeme 'symbol #f start-pos end-pos)] [(:+ digit) (syn-val lexeme 'constant #f start-pos end-pos)] [(:: (:+ digit) #\. (:* digit)) (syn-val lexeme 'constant #f start-pos end-pos)] ; catch anything else [any-char (syn-val lexeme 'error #f start-pos end-pos)])) ;; A macro to build the syntax object (define-syntax (b stx) (syntax-case stx () [(_ o value start end) #'(b o value start end 0 0)] [(_ o value start end start-adjust end-adjust) (with-syntax ((start-pos (datum->syntax #'start (string->symbol (format "$~a-start-pos" (syntax->datum #'start))))) (end-pos (datum->syntax #'end (string->symbol (format "$~a-end-pos" (syntax->datum #'end)))))) #`(datum->syntax o value (list (if (syntax? o) (syntax-source o) 'missing-in-action--sorry) (if (and o (syntax-line o)) (+ (syntax-line o) (position-line start-pos) start-adjust -1) #f) (if (and o (syntax-column o)) (+ (syntax-column o) (position-offset start-pos) start-adjust) #f) (if (and o (syntax-position o)) (+ (syntax-position o) (- (position-offset start-pos) start-adjust 1)) #f) (- (+ (position-offset end-pos) end-adjust) (+ (position-offset start-pos) start-adjust))) o o))])) ; for testing: builds lists instead of syntax objects #;(define-syntax (b stx) (syntax-case stx () [(_ _ val _ _) #'val])) (define (display-position-token pos) (display (position->list pos))) (define (position->list pos) (list (position-offset pos) (position-line pos) (position-col pos))) (define source-name #f) (define orig-stx #f) (define o #f) (define (expression-parser src-name orig) (λ (gen) (set! source-name src-name) (set! orig-stx orig) (set! o orig) (the-parser gen))) (define the-parser (parser (src-pos) ;(debug "parser-dump.txt") ;(yacc-output "parser-dump.yacc") (start start) (end newline EOF) (tokens value-tokens op-tokens) (error (lambda (token-ok? name val start end) ; The first argument will be #f if and only if the error is that an invalid token was received. ; The second and third arguments will be the name and the value of the token at which the error was detected. ; The fourth and fifth arguments, if present, provide the source positions of that token. #;(unless #f #; (string? (syntax->datum o)) (display "DEBUGXXX: ") (display (list o token-ok? name val start end)) (display-position-token start) (newline) (display-position-token end) (newline) (displayln source-name) (newline)) (error-print-source-location #t) (displayln "Syntax error") (raise-read-error "Syntax error" source-name (position-line start) (position-col start) (position-offset start) (+ (- (position-offset end) (position-offset start)))))) (precs ; (left :=) ; (right OP) (left - +) (left * /) (right OB) (right ^) (left =) ; comparisons (right NEG) (left SEMI) ; (right IDENTIFIER) ) (grammar (start [(exp) (b o `(#%infix ,$1) 1 1)] [() #f]) ;; If there is an error, ignore everything before the error ;; and try to start over right after the error (args [(exp) (b o (list $1) 1 1)] [(exp COMMA args) (b o (cons $1 $3) 1 3)] [() '()]) (ids [() '()] [(IDENTIFIER ids) (b o (cons $1 $2) 1 2)]) (parenthensis-exp [(OP exp CP) $2]) (atom [(NUMBER) (b o $1 1 1)] [(IDENTIFIER) (prec IDENTIFIER) (b o $1 1 1)] [(STRING) (b o $1 1 1)] [(parenthensis-exp) $1]) (construction-exp [(OC args CC) (b o `(,(b o 'list 1 3) ,@$2) 1 3)] [(OP LAMBDA ids PERIOD exp CP) (b o `(,(b o 'lambda 2 2) ,$3 ,$5) 1 6)] [(atom) $1]) (application-exp ;[(application-exp OB args CB) (b o `(,$1 ,@$3) 1 4)] ; function application ; Due the extra ( in IDENTIFIEROP we need to adjust the end with -1. [(IDENTIFIEROP args CP) (b o `(,(b o $1 1 1 0 -1) ,@$2) 1 3)] ; function application [(application-exp OP args CP) (prec OP) (b o `(,$1 ,@$3) 1 4 )] ; function application [(application-exp ODB exp CB CB) (b o `(,(b o 'list-ref 1 4) ,$1 ,$3) 1 4)] ; list ref [(construction-exp) $1]) #;(implicit-exp [(application-exp application-exp) (prec *) (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)] ; implicit [(application-exp) $1]) (power-exp [(application-exp ^ power-exp) (prec ^) (b o `(expt ,$1 ,$3) 1 3)] [(application-exp) $1]) (sqrt-exp [(SQRT sqrt-exp) (b o `(,(b o 'sqrt 1 1) ,$2) 1 2)] [(power-exp) $1]) (negation-exp [(- negation-exp) (b o `(,(b o '- 1 1) ,$2) 1 2)] [(sqrt-exp) $1]) (multiplication-exp [(multiplication-exp * negation-exp) (prec *) (b o `(,(b o '* 2 2) ,$1 ,$3) 1 3)] [(multiplication-exp / negation-exp) (prec /) (b o `(,(b o '/ 2 2) ,$1 ,$3) 1 3)] ;[(multiplication-exp negation-exp) (prec *) (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)] [(negation-exp) $1]) (addition-exp [(addition-exp - multiplication-exp) (prec -) (b o `(,(b o '- 2 2) ,$1 ,$3) 1 3)] [(addition-exp + multiplication-exp) (prec +) (b o `(,(b o '+ 2 2) ,$1 ,$3) 1 3)] [(multiplication-exp) $1]) (order-exp [(addition-exp LESS-EQUAL addition-exp) (prec =) (b o `(,(b o '<= 2 2) ,$1 ,$3) 1 3)] [(addition-exp < addition-exp) (prec =) (b o `(,(b o '< 2 2) ,$1 ,$3) 1 3)] [(addition-exp GREATER-EQUAL addition-exp) (prec =) (b o `(,(b o '>= 2 2) ,$1 ,$3) 1 3)] [(addition-exp > addition-exp) (prec =) (b o `(,(b o '> 2 2) ,$1 ,$3) 1 3)] [(addition-exp NOT-EQUAL addition-exp) (prec =) (b o `(not (,(b o '= 2 2) ,$1 ,$3)) 1 3)] [(addition-exp = addition-exp) (prec =) (b o `(,(b o '= 2 2) ,$1 ,$3) 1 3)] [(addition-exp) $1]) (logical-negation-exp [(NEG logical-negation-exp) (prec NEG) (b o `(,(b o 'not 1 1) ,$2) 1 2)] [(order-exp) $1]) ; The no DEFINE version conflicts with application. ; Solution? Move definition with := into rule for application. (assignment-exp ;[(DEFINE IDENTIFIER := assignment-exp) (b o `(,(b o 'define 3 3) ,$2 ,$4) 2 4)] ;[(DEFINE IDENTIFIER OP args CP := assignment-exp) (b o `(,(b o 'define 4 4) (,$2 ,@$4) ,$7) 2 6)] [(IDENTIFIER:= assignment-exp) (b o `(,(b o 'define 1 1) ,(b o $1 1 1 ; adjust end with -2 due to the chars in := 0 -2) ,(b o $2 2 2)) 1 2)] [(IDENTIFIEROP args CP := assignment-exp) (b o `(,(b o 'define 2 2) (,$1 ,@$2) ,$5) 1 5)] [(logical-negation-exp) $1]) (compound-exp [(compound-exp SEMI assignment-exp) (b o `(,(b o 'begin 2 2) ,$1 ,$3) 1 3)] [(assignment-exp) $1]) (exp [(compound-exp) $1])))) (define (parse-expression src stx ip) (port-count-lines! ip) (define out ((expression-parser (if src src (object-name ip)) ; If you change any of these values, then ; you must change the builder b which uses ; the the values in here to add to newly read ; objects. If the lexer/parser was only to ; read from entire files, there would be no problem, ; but sometimes one must read from strings ; entered in a repl. There the syntax-objects must be ; adjusted. (datum->syntax #f 'here (list (if src src (object-name ip)) 1 ; line 1 0 ; column 0 1 ; offset/position 1 (contract for datum->syntax requires a positive offset) #f) ; span )) ; no properties ; The following Tom Foolery is to needed to turn ; SEMI EOF into EOF ; This allows one to have an optional semi colon in the end of the file. (let ([peek (expression-lexer ip)] [peek1 (expression-lexer ip)]) (define (next) (cond [(eq? (position-token-token peek) 'EOF) (void)] [else (set! peek peek1) (set! peek1 (expression-lexer ip))])) (λ () (if (and (eq? (position-token-token peek) 'SEMI) (eq? (position-token-token peek1) 'EOF)) (begin0 peek1 (next)) (begin0 peek (next))))))) (displayln out) out)