450 lines
17 KiB
Racket
450 lines
17 KiB
Racket
#lang racket
|
||
(require "parameter.rkt")
|
||
(require syntax/srcloc)
|
||
|
||
;;;
|
||
;;; TODO: Handle special values.
|
||
;;; Pasting an image into the REPL breaks the parser at the moment.
|
||
|
||
;;;
|
||
;;; NOTES:
|
||
;;; Changes from planet version: ]] (CDB) Bug fixed
|
||
;;; Strings are supported.
|
||
;;; Assignments changed from set! to define.
|
||
;;; Assignment of functions.
|
||
;;; Bug fix: identifiers with more than one _ wasn't converted correctly
|
||
|
||
; <e> = <num>
|
||
; | <string>
|
||
; | <id> variable reference
|
||
; | <e> [ <args> ] application
|
||
; | { <args> } list construction
|
||
; | <e> + <e> addition
|
||
; | <e> - <e> subtraction
|
||
; | <e> * <e> multiplication
|
||
; | <e> / <e> division
|
||
; | <e> ^ <e> exponentiation
|
||
; | - <e> negation
|
||
; | ( <e> ) grouping
|
||
; | <id> ( <args> ) := <e> function definition
|
||
|
||
; <id> An identifier begins with a letter,
|
||
; and is optionally followed by series of letters, digits or underscores.
|
||
; An underscore is converted to a -. Thus list_ref will refer to list-ref.
|
||
|
||
; <num> A number is an non-empty series of digits,
|
||
; optionally followed by a period followed by a series of digits.
|
||
|
||
; <string> A number is a " followed by a series of non-" characters followed by a " .
|
||
|
||
(provide parse-expression
|
||
color-lexer)
|
||
|
||
(require parser-tools/yacc
|
||
parser-tools/lex
|
||
(prefix-in : parser-tools/lex-sre)
|
||
syntax/readerr)
|
||
|
||
(define-tokens value-tokens (NUMBER STRING IDENTIFIER IDENTIFIEROP IDENTIFIER:= SPECIAL))
|
||
(define-empty-tokens
|
||
op-tokens
|
||
(newline
|
||
:=
|
||
OP CP ; ( )
|
||
CP:= ; ):=
|
||
OB CB ; [ ]
|
||
OC CC ; { }
|
||
ODB ; [[ ]]
|
||
COMMA ; ,
|
||
SEMI ; ;
|
||
PERIOD ; .
|
||
LAMBDA ; lambda or λ
|
||
SQRT ; √
|
||
NEG ; ¬ (logical negation)
|
||
LESS-EQUAL ; <= or ≤
|
||
GREATER-EQUAL ; >= or ≥
|
||
NOT-EQUAL ; <> or ≠
|
||
= < >
|
||
+ - * / ^
|
||
DEFINE
|
||
EOF))
|
||
|
||
(define-lex-abbrevs
|
||
[greek (:or (char-range #\α #\ω) (char-range #\Γ #\Ω))]
|
||
[roman (:or (:/ "a" "z") (:/ #\A #\Z) )]
|
||
[letter (:or roman greek)]
|
||
[digit (:/ #\0 #\9)]
|
||
[string (:: #\" (:* (:~ #\")) #\")]
|
||
[identifier (:: letter (:* (:or letter digit #\_ #\?)))]
|
||
[identifier:= (:: letter (:* (:or letter digit #\_ #\?)) ":=")]
|
||
[identifierOP (:: letter (:* (:or letter digit #\_ #\?)) "(")]
|
||
[comment (:: "%" (complement (:: any-string #\newline any-string)) #\newline)])
|
||
|
||
(define (string-drop-right n s)
|
||
(substring s 0 (- (string-length s) n)))
|
||
|
||
|
||
(define expression-lexer
|
||
(lexer-src-pos
|
||
[(eof) (token-EOF)]
|
||
[(special) (token-SPECIAL lexeme)]
|
||
[(:or #\tab #\space #\newline) ; this skips whitespace
|
||
(return-without-pos (expression-lexer input-port))]
|
||
[#\newline (token-newline)] ; (token-newline) returns 'newline
|
||
[(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"") (string->symbol lexeme)]
|
||
["(" 'OP]
|
||
[")" 'CP]
|
||
["[" 'OB]
|
||
["]" 'CB]
|
||
["{" 'OC]
|
||
["}" 'CC]
|
||
["[[" 'ODB]
|
||
; ["]]" 'CDB]
|
||
["," 'COMMA]
|
||
[";" 'SEMI]
|
||
["." 'PERIOD]
|
||
[#\λ 'LAMBDA]
|
||
["lambda" 'LAMBDA]
|
||
["√" 'SQRT]
|
||
["¬" 'NEG]
|
||
["≤" 'LESS-EQUAL]
|
||
["<=" 'LESS-EQUAL]
|
||
["≥" 'GREATER-EQUAL]
|
||
[">=" 'GREATER-EQUAL]
|
||
["<>" 'NOT-EQUAL]
|
||
["≠" 'NOT-EQUAL]
|
||
["define" 'DEFINE]
|
||
[comment
|
||
(return-without-pos (expression-lexer input-port))]
|
||
[string
|
||
(token-STRING (substring lexeme 1 (- (string-length lexeme) 1)))]
|
||
; The parser can only look ahead 1 token, so we have 3
|
||
; different identifiers to see whether := or ( comes after the identfier.
|
||
; This is enough to prevent shift/reduce conflicts between atom, definition,
|
||
; and application.
|
||
[identifier:=
|
||
(token-IDENTIFIER:=
|
||
(string->symbol (string-drop-right 2 (regexp-replace* #rx"_" lexeme "-"))))]
|
||
[identifierOP
|
||
(token-IDENTIFIEROP
|
||
(string->symbol (string-drop-right 1 (regexp-replace* #rx"_" lexeme "-"))))]
|
||
[identifier
|
||
(token-IDENTIFIER (string->symbol (regexp-replace* #rx"_" lexeme "-")))]
|
||
[(:+ digit) (token-NUMBER (string->number lexeme))]
|
||
[(:: (:+ digit) #\. (:* digit)) (token-NUMBER (string->number lexeme))]))
|
||
|
||
; The color-expression-lexer is for syntax coloring.
|
||
; The function returns 5 values:
|
||
; - Either a string containing the matching text or the eof object.
|
||
; Block comments and specials currently return an empty string.
|
||
; This may change in the future to other string or non-string data.
|
||
; - A symbol in '(error comment sexp-comment white-space constant
|
||
; string no-color parenthesis other symbol eof).
|
||
; - A symbol in '(|(| |)| |[| |]| |{| |}|) or #f.
|
||
; - A number representing the starting position of the match (or #f if eof).
|
||
; - A number representing the ending position of the match (or #f if eof).
|
||
|
||
(define (syn-val a b c d e)
|
||
(values a ; string with mathching text
|
||
b ; symbol in '(comment white-space no-color eof)
|
||
c ; symbol in '(|(| |)| |[| |]| |{| |}|) or #f.
|
||
(position-offset d) ; start pos
|
||
#;(position-offset e)
|
||
(max ; end pos
|
||
(position-offset e)
|
||
(+ (position-offset d) 1))))
|
||
|
||
(define color-lexer
|
||
; REMEMBER to restart DrScheme to test any changes in the color-lexer.
|
||
; The lexer is only imported into DrRacket at startup.
|
||
(lexer
|
||
[(eof)
|
||
(syn-val lexeme 'eof #f start-pos end-pos)]
|
||
[(:or #\tab #\space #\newline)
|
||
(syn-val lexeme 'white-space #f start-pos end-pos)]
|
||
[(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"")
|
||
(syn-val lexeme 'symbol #f start-pos end-pos)]
|
||
[(:or "(" ")" "[" "]" "{" "}")
|
||
(syn-val lexeme 'parenthesis #f start-pos end-pos)]
|
||
[(:or "[[" "," ";" "." "λ" "lambda" "√" "¬" "≤" "<=" "≥" ">=" "<>" "≠")
|
||
(syn-val lexeme 'no-color #f start-pos end-pos)]
|
||
["define"
|
||
(syn-val lexeme 'constant #f start-pos end-pos)]
|
||
[comment
|
||
(syn-val lexeme 'comment #f start-pos end-pos)]
|
||
[string
|
||
(syn-val lexeme 'string #f start-pos end-pos)]
|
||
; The parser can only look ahead 1 token, so we have 3
|
||
; different identifiers to see whether := or ( comes after the identfier.
|
||
; This is enough to prevent shift/reduce conflicts between atom, definition,
|
||
; and application.
|
||
[":="
|
||
(syn-val lexeme 'symbol #f start-pos end-pos)]
|
||
[identifier
|
||
(syn-val lexeme 'symbol #f start-pos end-pos)]
|
||
[(:+ digit)
|
||
(syn-val lexeme 'constant #f start-pos end-pos)]
|
||
[(:: (:+ digit) #\. (:* digit))
|
||
(syn-val lexeme 'constant #f start-pos end-pos)]
|
||
; catch anything else
|
||
[any-char
|
||
(syn-val lexeme 'error #f start-pos end-pos)]))
|
||
|
||
; read-syntax puts on the syntax-object read s.t.
|
||
; DrRacket knows it is original syntax.
|
||
; And then DrRacket draws arrows after check syntax.
|
||
(define (sym->original-syntax sym srcloc)
|
||
(define p (open-input-string (symbol->string sym)))
|
||
(port-count-lines! p)
|
||
(match-define (list source-name line column position span) srcloc)
|
||
(set-port-next-location! p line column position)
|
||
(read-syntax source-name p))
|
||
|
||
;; A macro to build the syntax object
|
||
(define-for-syntax (build stx original?)
|
||
(syntax-case stx ()
|
||
[(_ o value start end)
|
||
(if original?
|
||
#'(borg o value start end 0 0)
|
||
#'(b o value start end 0 0))]
|
||
[(_ o value start end start-adjust end-adjust)
|
||
(with-syntax
|
||
([start-pos (datum->syntax #'start
|
||
(string->symbol
|
||
(format "$~a-start-pos"
|
||
(syntax->datum #'start))))]
|
||
[end-pos (datum->syntax #'end
|
||
(string->symbol
|
||
(format "$~a-end-pos"
|
||
(syntax->datum #'end))))]
|
||
[org (if original? #t #f)])
|
||
#`(datum->syntax
|
||
o
|
||
(if (and org (or (symbol? value) (identifier? value)))
|
||
(sym->original-syntax
|
||
value (list (if (syntax? o) (syntax-source o) 'missing-in-action--sorry)
|
||
(if (and o (syntax-line o))
|
||
(+ (syntax-line o) (position-line start-pos) start-adjust -1) #f)
|
||
(if (and o (syntax-column o))
|
||
(+ (syntax-column o) (position-offset start-pos) start-adjust) #f)
|
||
(if (and o (syntax-position o))
|
||
(+ (syntax-position o) (- (position-offset start-pos) start-adjust 1)) #f)
|
||
(- (+ (position-offset end-pos) end-adjust)
|
||
(+ (position-offset start-pos) start-adjust))))
|
||
value)
|
||
(list (if (syntax? o) (syntax-source o) 'missing-in-action--sorry)
|
||
(if (and o (syntax-line o))
|
||
(+ (syntax-line o) (position-line start-pos) start-adjust -1) #f)
|
||
(if (and o (syntax-column o))
|
||
(+ (syntax-column o) (position-offset start-pos) start-adjust) #f)
|
||
(if (and o (syntax-position o))
|
||
(+ (syntax-position o) (- (position-offset start-pos) start-adjust 1)) #f)
|
||
(- (+ (position-offset end-pos) end-adjust)
|
||
(+ (position-offset start-pos) start-adjust)))
|
||
o o))]))
|
||
|
||
(define-syntax (b stx)
|
||
(build stx #f))
|
||
|
||
(define-syntax (borg stx)
|
||
(define out (build stx #t))
|
||
;(displayln (list 'borg out))
|
||
out)
|
||
|
||
|
||
; for testing: builds lists instead of syntax objects
|
||
#;(define-syntax (b stx)
|
||
(syntax-case stx ()
|
||
[(_ _ val _ _)
|
||
#'val]))
|
||
|
||
(define (display-position-token pos)
|
||
(display (position->list pos)))
|
||
|
||
(define (position->list pos)
|
||
(list (position-offset pos)
|
||
(position-line pos)
|
||
(position-col pos)))
|
||
|
||
(define source-name #f)
|
||
(define orig-stx #f)
|
||
(define o #f)
|
||
|
||
(define (expression-parser src-name orig)
|
||
(λ (gen)
|
||
(set! source-name src-name)
|
||
(set! orig-stx orig)
|
||
(set! o orig)
|
||
(the-parser gen)))
|
||
|
||
|
||
(define the-parser
|
||
(parser
|
||
(src-pos)
|
||
;(debug "parser-dump.txt")
|
||
;(yacc-output "parser-dump.yacc")
|
||
(start start)
|
||
(end newline EOF)
|
||
(tokens value-tokens op-tokens)
|
||
(error (lambda (token-ok? name val start end)
|
||
; The first argument will be #f if and only if the error is that an invalid token was received.
|
||
; The second and third arguments will be the name and the value of the token at which the error was detected.
|
||
; The fourth and fifth arguments, if present, provide the source positions of that token.
|
||
#;(unless #f #; (string? (syntax->datum o))
|
||
(display "DEBUGXXX: ")
|
||
(display (list o token-ok? name val start end))
|
||
(display-position-token start) (newline)
|
||
(display-position-token end) (newline)
|
||
(displayln source-name)
|
||
(newline))
|
||
(error-print-source-location #t)
|
||
(displayln "Syntax error")
|
||
(raise-read-error
|
||
"Syntax error"
|
||
source-name
|
||
(position-line start)
|
||
(position-col start)
|
||
(position-offset start)
|
||
(+ (- (position-offset end) (position-offset start))))))
|
||
|
||
(precs ; (left :=)
|
||
; (right OP)
|
||
(left - +)
|
||
(left * /)
|
||
(right OB)
|
||
(right ^)
|
||
(left =) ; comparisons
|
||
(right NEG)
|
||
(left SEMI)
|
||
; (right IDENTIFIER)
|
||
)
|
||
|
||
(grammar
|
||
(start [(exp) (b o `(#%infix ,$1) 1 1)]
|
||
[() #f])
|
||
;; If there is an error, ignore everything before the error
|
||
;; and try to start over right after the error
|
||
|
||
(args [(exp) (b o (list $1) 1 1)]
|
||
[(exp COMMA args) (b o (cons $1 $3) 1 3)]
|
||
[() '()])
|
||
|
||
(ids [() '()]
|
||
[(IDENTIFIER ids) (b o (cons $1 $2) 1 2)])
|
||
|
||
(parenthensis-exp
|
||
[(OP exp CP) $2])
|
||
|
||
(atom
|
||
[(NUMBER) (b o $1 1 1)]
|
||
[(IDENTIFIER) (prec IDENTIFIER) (borg o $1 1 1)]
|
||
[(STRING) (b o $1 1 1)]
|
||
[(SPECIAL) (b o $1 1 1)]
|
||
[(parenthensis-exp) $1])
|
||
|
||
(construction-exp
|
||
[(OC args CC) (b o `(,(b o 'list 1 3) ,@$2) 1 3)]
|
||
[(OP LAMBDA ids PERIOD exp CP) (b o `(,(b o 'lambda 2 2) ,$3 ,$5) 1 6)]
|
||
[(atom) $1])
|
||
|
||
(application-exp
|
||
;[(application-exp OB args CB) (b o `(,$1 ,@$3) 1 4)] ; function application
|
||
; Due the extra ( in IDENTIFIEROP we need to adjust the end with -1.
|
||
[(IDENTIFIEROP args CP) (b o `(,(borg o $1 1 1 0 -1) ,@$2) 1 3)] ; function application
|
||
[(application-exp OP args CP) (prec OP) (b o `(,$1 ,@$3) 1 4 )] ; function application
|
||
[(application-exp ODB exp CB CB) (b o `(,(b o 'list-ref 1 4) ,$1 ,$3) 1 4)] ; list ref
|
||
[(construction-exp) $1])
|
||
|
||
|
||
#;(implicit-exp
|
||
[(application-exp application-exp) (prec *) (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)] ; implicit
|
||
[(application-exp) $1])
|
||
|
||
(power-exp
|
||
[(application-exp ^ power-exp) (prec ^) (b o `(expt ,$1 ,$3) 1 3)]
|
||
[(application-exp) $1])
|
||
|
||
(sqrt-exp
|
||
[(SQRT sqrt-exp) (b o `(,(b o 'sqrt 1 1) ,$2) 1 2)]
|
||
[(power-exp) $1])
|
||
|
||
(negation-exp
|
||
[(- negation-exp) (b o `(,(b o '- 1 1) ,$2) 1 2)]
|
||
[(sqrt-exp) $1])
|
||
|
||
(multiplication-exp
|
||
[(multiplication-exp * negation-exp) (prec *) (b o `(,(b o '* 2 2) ,$1 ,$3) 1 3)]
|
||
[(multiplication-exp / negation-exp) (prec /) (b o `(,(b o '/ 2 2) ,$1 ,$3) 1 3)]
|
||
;[(multiplication-exp negation-exp) (prec *) (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)]
|
||
[(negation-exp) $1])
|
||
|
||
(addition-exp
|
||
[(addition-exp - multiplication-exp) (prec -) (b o `(,(b o '- 2 2) ,$1 ,$3) 1 3)]
|
||
[(addition-exp + multiplication-exp) (prec +) (b o `(,(b o '+ 2 2) ,$1 ,$3) 1 3)]
|
||
[(multiplication-exp) $1])
|
||
|
||
(order-exp
|
||
[(addition-exp LESS-EQUAL addition-exp) (prec =) (b o `(,(b o '<= 2 2) ,$1 ,$3) 1 3)]
|
||
[(addition-exp < addition-exp) (prec =) (b o `(,(b o '< 2 2) ,$1 ,$3) 1 3)]
|
||
[(addition-exp GREATER-EQUAL addition-exp) (prec =) (b o `(,(b o '>= 2 2) ,$1 ,$3) 1 3)]
|
||
[(addition-exp > addition-exp) (prec =) (b o `(,(b o '> 2 2) ,$1 ,$3) 1 3)]
|
||
[(addition-exp NOT-EQUAL addition-exp) (prec =) (b o `(not (,(b o '= 2 2) ,$1 ,$3)) 1 3)]
|
||
[(addition-exp = addition-exp) (prec =) (b o `(,(b o '= 2 2) ,$1 ,$3) 1 3)]
|
||
[(addition-exp) $1])
|
||
|
||
(logical-negation-exp
|
||
[(NEG logical-negation-exp) (prec NEG) (b o `(,(b o 'not 1 1) ,$2) 1 2)]
|
||
[(order-exp) $1])
|
||
|
||
; The no DEFINE version conflicts with application.
|
||
; Solution? Move definition with := into rule for application.
|
||
(assignment-exp
|
||
;[(DEFINE IDENTIFIER := assignment-exp) (b o `(,(b o 'define 3 3) ,$2 ,$4) 2 4)]
|
||
;[(DEFINE IDENTIFIER OP args CP := assignment-exp) (b o `(,(b o 'define 4 4) (,$2 ,@$4) ,$7) 2 6)]
|
||
[(IDENTIFIER:= assignment-exp) (borg o `(,(b o 'define 1 1)
|
||
,(borg o $1 1 1
|
||
; adjust end with -2 due to the chars in :=
|
||
0 -2)
|
||
,(b o $2 2 2)) 1 2)]
|
||
[(IDENTIFIEROP args CP := assignment-exp) (b o `(,(b o 'define 2 2) (,(borg o $1 1 1) ,@(borg o $2 2 2)) ,$5) 1 5)]
|
||
[(logical-negation-exp) $1])
|
||
|
||
(compound-exp
|
||
[(compound-exp SEMI assignment-exp) (b o `(,(b o 'begin 2 2) ,$1 ,$3) 1 3)]
|
||
[(assignment-exp) $1])
|
||
|
||
(exp
|
||
[(compound-exp) $1]
|
||
[(compound-exp SEMI) $1]) ; the last SEMI is optional
|
||
)))
|
||
|
||
(define (parse-expression src stx ip)
|
||
(port-count-lines! ip)
|
||
(define out
|
||
((expression-parser
|
||
(if src src (object-name ip))
|
||
; If you change any of these values, then
|
||
; you must change the builder b which uses
|
||
; the the values in here to add to newly read
|
||
; objects. If the lexer/parser was only to
|
||
; read from entire files, there would be no problem,
|
||
; but sometimes one must read from strings
|
||
; entered in a repl. There the syntax-objects must be
|
||
; adjusted.
|
||
(datum->syntax #f
|
||
'here
|
||
(list (if src src (object-name ip))
|
||
1 ; line 1
|
||
0 ; column 0
|
||
1 ; offset/position 1 (contract for datum->syntax requires a positive offset)
|
||
#f) ; span
|
||
)) ; no properties
|
||
; The following Tom Foolery is to needed to turn
|
||
; SEMI EOF into EOF
|
||
; This allows one to have an optional semi colon in the end of the file.
|
||
(λ () (expression-lexer ip))))
|
||
;(displayln out)
|
||
(if (eq? out #f)
|
||
eof
|
||
out))
|