scribble-math/bracket/lang/parser.rkt
2012-06-22 15:29:19 +02:00

408 lines
16 KiB
Racket

#lang racket
(require "parameter.rkt")
(require syntax/srcloc)
;;;
;;; NOTES:
;;; Changes from planet version: ]] (CDB) Bug fixed
;;; Strings are supported.
;;; Assignments changed from set! to define.
;;; Assignment of functions.
;;; Bug fix: identifiers with more than one _ wasn't converted correctly
; <e> = <num>
; | <string>
; | <id> variable reference
; | <e> [ <args> ] application
; | { <args> } list construction
; | <e> + <e> addition
; | <e> - <e> subtraction
; | <e> * <e> multiplication
; | <e> / <e> division
; | <e> ^ <e> exponentiation
; | - <e> negation
; | ( <e> ) grouping
; | <id> ( <args> ) := <e> function definition
; <id> An identifier begins with a letter,
; and is optionally followed by series of letters, digits or underscores.
; An underscore is converted to a -. Thus list_ref will refer to list-ref.
; <num> A number is an non-empty series of digits,
; optionally followed by a period followed by a series of digits.
; <string> A number is a " followed by a series of non-" characters followed by a " .
(provide parse-expression
color-lexer)
(require parser-tools/yacc
parser-tools/lex
(prefix-in : parser-tools/lex-sre)
syntax/readerr)
(define-tokens value-tokens (NUMBER STRING IDENTIFIER IDENTIFIEROP IDENTIFIER:=))
(define-empty-tokens
op-tokens
(newline
:=
OP CP ; ( )
CP:= ; ):=
OB CB ; [ ]
OC CC ; { }
ODB ; [[ ]]
COMMA ; ,
SEMI ; ;
PERIOD ; .
LAMBDA ; lambda or λ
SQRT ; √
NEG ; ¬ (logical negation)
LESS-EQUAL ; <= or ≤
GREATER-EQUAL ; >= or ≥
NOT-EQUAL ; <> or ≠
= < >
+ - * / ^
DEFINE
EOF))
(define-lex-abbrevs
[letter (:or (:/ "a" "z") (:/ #\A #\Z) )]
[digit (:/ #\0 #\9)]
[string (:: #\" (:* (:~ #\")) #\")]
[identifier (:: letter (:* (:or letter digit #\_ #\?)))]
[identifier:= (:: letter (:* (:or letter digit #\_ #\?)) ":=")]
[identifierOP (:: letter (:* (:or letter digit #\_ #\?)) "(")])
(define (string-drop-right n s)
(substring s 0 (- (string-length s) n)))
(define expression-lexer
(lexer-src-pos
[(eof) 'EOF]
[(:or #\tab #\space #\newline) ; this skips whitespace
(return-without-pos (expression-lexer input-port))]
[#\newline (token-newline)] ; (token-newline) returns 'newline
[(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"") (string->symbol lexeme)]
["(" 'OP]
[")" 'CP]
["[" 'OB]
["]" 'CB]
["{" 'OC]
["}" 'CC]
["[[" 'ODB]
; ["]]" 'CDB]
["," 'COMMA]
[";" 'SEMI]
["." 'PERIOD]
[#\λ 'LAMBDA]
["lambda" 'LAMBDA]
["" 'SQRT]
["¬" 'NEG]
["" 'LESS-EQUAL]
["<=" 'LESS-EQUAL]
["" 'GREATER-EQUAL]
[">=" 'GREATER-EQUAL]
["<>" 'NOT-EQUAL]
["" 'NOT-EQUAL]
["define" 'DEFINE]
[string
(token-STRING (substring lexeme 1 (- (string-length lexeme) 1)))]
; The parser can only look ahead 1 token, so we have 3
; different identifiers to see whether := or ( comes after the identfier.
; This is enough to prevent shift/reduce conflicts between atom, definition,
; and application.
[identifier:=
(token-IDENTIFIER:=
(string->symbol (string-drop-right 2 (regexp-replace* #rx"_" lexeme "-"))))]
[identifierOP
(token-IDENTIFIEROP
(string->symbol (string-drop-right 1 (regexp-replace* #rx"_" lexeme "-"))))]
[identifier
(token-IDENTIFIER (string->symbol (regexp-replace* #rx"_" lexeme "-")))]
[(:+ digit) (token-NUMBER (string->number lexeme))]
[(:: (:+ digit) #\. (:* digit)) (token-NUMBER (string->number lexeme))]))
; The color-expression-lexer is for syntax coloring.
; The function returns 5 values:
; - Either a string containing the matching text or the eof object.
; Block comments and specials currently return an empty string.
; This may change in the future to other string or non-string data.
; - A symbol in '(error comment sexp-comment white-space constant
; string no-color parenthesis other symbol eof).
; - A symbol in '(|(| |)| |[| |]| |{| |}|) or #f.
; - A number representing the starting position of the match (or #f if eof).
; - A number representing the ending position of the match (or #f if eof).
(define color-lexer
(lexer-src-pos
[(eof)
(values eof #f #f start-pos end-pos)]
[(:or #\tab #\space #\newline)
(values lexeme 'white-space #f start-pos end-pos)]
[#\newline
(begin
(/ 0 0)
(values lexeme 'white-space #f start-pos end-pos))]
[(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"")
(values lexeme 'symbol #f start-pos end-pos)]
[(:or "(" ")" "[" "]" "{" "}")
(values lexeme 'parenthesis #f start-pos end-pos)]
[(:or "[[" "," ";" "." "λ" "lambda" "" "¬" "" "<=" "" ">=" "<>" "")
(values lexeme 'no-color #f start-pos end-pos)]
["define"
(values lexeme 'constant (/ 0 0) start-pos end-pos)]
[string
(values lexeme 'string #f start-pos end-pos)]
; The parser can only look ahead 1 token, so we have 3
; different identifiers to see whether := or ( comes after the identfier.
; This is enough to prevent shift/reduce conflicts between atom, definition,
; and application.
[(:or identifier:= identifierOP identifier)
(values lexeme 'symbol #f start-pos end-pos)]
[(:+ digit)
(values lexeme 'constant #f start-pos end-pos)]
[(:: (:+ digit) #\. (:* digit))
(values lexeme 'constant #f start-pos end-pos)]))
;; A macro to build the syntax object
(define-syntax (b stx)
(syntax-case stx ()
[(_ o value start end)
#'(b o value start end 0 0)]
[(_ o value start end start-adjust end-adjust)
(with-syntax
((start-pos (datum->syntax #'start
(string->symbol
(format "$~a-start-pos"
(syntax->datum #'start)))))
(end-pos (datum->syntax #'end
(string->symbol
(format "$~a-end-pos"
(syntax->datum #'end))))))
#`(datum->syntax
o
value
(list (if (syntax? o) (syntax-source o) 'missing-in-action--sorry)
(if (and o (syntax-line o))
(+ (syntax-line o) (position-line start-pos) start-adjust -1) #f)
(if (and o (syntax-column o))
(+ (syntax-column o) (position-offset start-pos) start-adjust) #f)
(if (and o (syntax-position o))
(+ (syntax-position o) (- (position-offset start-pos) start-adjust 1)) #f)
(- (+ (position-offset end-pos) end-adjust)
(+ (position-offset start-pos) start-adjust)))
o o))]))
; for testing: builds lists instead of syntax objects
#;(define-syntax (b stx)
(syntax-case stx ()
[(_ _ val _ _)
#'val]))
(define (display-position-token pos)
(display (position->list pos)))
(define (position->list pos)
(list (position-offset pos)
(position-line pos)
(position-col pos)))
(define source-name #f)
(define orig-stx #f)
(define o #f)
(define (expression-parser src-name orig)
(λ (gen)
(set! source-name src-name)
(set! orig-stx orig)
(set! o orig)
(the-parser gen)))
(define the-parser
(parser
(src-pos)
; (suppress) ; hmm...
;(debug "parser-dump.txt")
;(yacc-output "parser-dump.yacc")
(start start)
(end newline EOF)
(tokens value-tokens op-tokens)
(error (lambda (token-ok? name val start end)
; The first argument will be #f if and only if the error is that an invalid token was received.
; The second and third arguments will be the name and the value of the token at which the error was detected.
; The fourth and fifth arguments, if present, provide the source positions of that token.
#;(unless #f #; (string? (syntax->datum o))
(display "DEBUGXXX: ")
(display (list o token-ok? name val start end))
(display-position-token start) (newline)
(display-position-token end) (newline)
(displayln source-name)
(newline))
(error-print-source-location #t)
(displayln "Syntax error")
(raise-read-error
"Syntax error"
source-name
(position-line start)
(position-col start)
(position-offset start)
(+ (- (position-offset end) (position-offset start))))
#;(raise-syntax-error
#f
"syntax error"
(datum->syntax
#'here 'here
(list
source-name
(position-line start)
(position-col start)
(position-offset start)
(+ (- (position-offset end) (position-offset start))))))))
(precs ; (left :=)
; (right OP)
(left - +)
(left * /)
(right OB)
(right ^)
(left =) ; comparisons
(right NEG)
(left SEMI)
; (right IDENTIFIER)
)
(grammar
(start [(exp) (b o `(#%infix ,$1) 1 1)]
[() #f])
;; If there is an error, ignore everything before the error
;; and try to start over right after the error
(args [(exp) (b o (list $1) 1 1)]
[(exp COMMA args) (b o (cons $1 $3) 1 3)]
[() '()])
(ids [() '()]
[(IDENTIFIER ids) (b o (cons $1 $2) 1 2)])
(parenthensis-exp
[(OP exp CP) $2])
(atom
[(NUMBER) (b o $1 1 1)]
[(IDENTIFIER) (prec IDENTIFIER) (b o $1 1 1)]
[(STRING) (b o $1 1 1)]
[(parenthensis-exp) $1])
(construction-exp
[(OC args CC) (b o `(,(b o 'list 1 3) ,@$2) 1 3)]
[(OP LAMBDA ids PERIOD exp CP) (b o `(,(b o 'lambda 2 2) ,$3 ,$5) 1 6)]
[(atom) $1])
(application-exp
;[(application-exp OB args CB) (b o `(,$1 ,@$3) 1 4)] ; function application
; Due the extra ( in IDENTIFIEROP we need to adjust the end with -1.
[(IDENTIFIEROP args CP) (b o `(,(b o $1 1 1 0 -1) ,@$2) 1 3)] ; function application
[(application-exp OP args CP) (prec OP) (b o `(,$1 ,@$3) 1 4 )] ; function application
[(application-exp ODB exp CB CB) (b o `(,(b o 'list-ref 1 4) ,$1 ,$3) 1 4)] ; list ref
[(construction-exp) $1])
#;(implicit-exp
[(application-exp application-exp) (prec *) (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)] ; implicit
[(application-exp) $1])
(power-exp
[(application-exp ^ power-exp) (prec ^) (b o `(expt ,$1 ,$3) 1 3)]
[(application-exp) $1])
(sqrt-exp
[(SQRT sqrt-exp) (b o `(,(b o 'sqrt 1 1) ,$2) 1 2)]
[(power-exp) $1])
(negation-exp
[(- negation-exp) (b o `(,(b o '- 1 1) ,$2) 1 2)]
[(sqrt-exp) $1])
(multiplication-exp
[(multiplication-exp * negation-exp) (prec *) (b o `(,(b o '* 2 2) ,$1 ,$3) 1 3)]
[(multiplication-exp / negation-exp) (prec /) (b o `(,(b o '/ 2 2) ,$1 ,$3) 1 3)]
;[(multiplication-exp negation-exp) (prec *) (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)]
[(negation-exp) $1])
(addition-exp
[(addition-exp - multiplication-exp) (prec -) (b o `(,(b o '- 2 2) ,$1 ,$3) 1 3)]
[(addition-exp + multiplication-exp) (prec +) (b o `(,(b o '+ 2 2) ,$1 ,$3) 1 3)]
[(multiplication-exp) $1])
(order-exp
[(addition-exp LESS-EQUAL addition-exp) (prec =) (b o `(,(b o '<= 2 2) ,$1 ,$3) 1 3)]
[(addition-exp < addition-exp) (prec =) (b o `(,(b o '< 2 2) ,$1 ,$3) 1 3)]
[(addition-exp GREATER-EQUAL addition-exp) (prec =) (b o `(,(b o '>= 2 2) ,$1 ,$3) 1 3)]
[(addition-exp > addition-exp) (prec =) (b o `(,(b o '> 2 2) ,$1 ,$3) 1 3)]
[(addition-exp NOT-EQUAL addition-exp) (prec =) (b o `(not (,(b o '= 2 2) ,$1 ,$3)) 1 3)]
[(addition-exp = addition-exp) (prec =) (b o `(,(b o '= 2 2) ,$1 ,$3) 1 3)]
[(addition-exp) $1])
(logical-negation-exp
[(NEG logical-negation-exp) (prec NEG) (b o `(,(b o 'not 1 1) ,$2) 1 2)]
[(order-exp) $1])
; The no DEFINE version conflicts with application.
; Solution? Move definition with := into rule for application.
(assignment-exp
;[(DEFINE IDENTIFIER := assignment-exp) (b o `(,(b o 'define 3 3) ,$2 ,$4) 2 4)]
;[(DEFINE IDENTIFIER OP args CP := assignment-exp) (b o `(,(b o 'define 4 4) (,$2 ,@$4) ,$7) 2 6)]
[(IDENTIFIER:= assignment-exp) (b o `(,(b o 'define 1 1)
,(b o $1 1 1
; adjust end with -2 due to the chars in :=
0 -2)
,(b o $2 2 2)) 1 2)]
[(IDENTIFIEROP args CP := assignment-exp) (b o `(,(b o 'define 2 2) (,$1 ,@$2) ,$5) 1 5)]
[(logical-negation-exp) $1])
(compound-exp
[(compound-exp SEMI assignment-exp) (b o `(,(b o 'begin 2 2) ,$1 ,$3) 1 3)]
[(assignment-exp) $1])
(exp
[(compound-exp) $1]))))
(define (parse-expression src stx ip)
(port-count-lines! ip)
((expression-parser
(if src src (object-name ip))
; If you change any of these values, then
; you must change the builder b which uses
; the the values in here to add to newly read
; objects. If the lexer/parser was only to
; read from entire files, there would be no problem,
; but sometimes one must read from strings
; entered in a repl. There the syntax-objects must be
; adjusted.
(datum->syntax #f
'here
(list (if src src (object-name ip))
1 ; line 1
0 ; column 0
1 ; offset/position 1 (contract for datum->syntax requires a positive offset)
#f) ; span
)) ; no properties
; The following Tom Foolery is to needed to turn
; SEMI EOF into EOF
; This allows one to have an optional semi colon in the end of the file.
(let ([peek (expression-lexer ip)]
[peek1 (expression-lexer ip)])
(define (next)
(cond
[(eq? (position-token-token peek) 'EOF) (void)]
[else (set! peek peek1)
(set! peek1 (expression-lexer ip))]))
(λ ()
(if (and (eq? (position-token-token peek) 'SEMI)
(eq? (position-token-token peek1) 'EOF))
(begin0 peek1 (next))
(begin0 peek (next)))))))