scribble-math/bracket/lang/parser.rkt

#lang racket
(require "parameter.rkt")
(require syntax/srcloc)

;;;
;;; NOTES:
;;;   Changes from planet version:  ]] (CDB) Bug fixed
;;;   Strings are supported.
;;;   Assignments changed from set! to define.
;;;   Assignment of functions.
;;;   Bug fix:  identifiers with more than one _ wasn't converted correctly

; <e> =   <num>
;      |  <string>
;      |  <id>                   variable reference
;      |  <e> [ <args> ]         application
;      |  { <args> }             list construction
;      |  <e> + <e>              addition
;      |  <e> - <e>              subtraction
;      |  <e> * <e>              multiplication
;      |  <e> / <e>              division
;      |  <e> ^ <e>              exponentiation
;      |  - <e>                  negation
;      | ( <e> )                 grouping
;      | <id> ( <args> ) := <e>  function definition


; <id>   An identifier begins with a letter,
;        and is optionally followed by series of letters, digits or underscores.
;        An underscore is converted to a -. Thus list_ref will refer to list-ref.

; <num>  A number is an non-empty series of digits,
;        optionally followed by a period followed by a series of digits.

; <string> A number is a " followed by a series of non-" characters followed by a " .

(provide parse-expression
         color-lexer)

(require parser-tools/yacc
         parser-tools/lex
         (prefix-in : parser-tools/lex-sre)
         syntax/readerr)

(define-tokens value-tokens (NUMBER STRING IDENTIFIER IDENTIFIEROP IDENTIFIER:=))
(define-empty-tokens
  op-tokens
  (newline
   :=
   OP CP    ; ( )
   CP:=     ; ):=
   OB CB    ; [ ]
   OC CC    ; { }
   ODB      ; [[ ]]
   COMMA    ; ,
   SEMI     ; ;
   PERIOD   ; .
   LAMBDA   ; lambda or λ
   SQRT     ; √
   NEG      ; ¬  (logical negation)
   LESS-EQUAL    ; <= or ≤
   GREATER-EQUAL ; >= or ≥
   NOT-EQUAL     ; <> or ≠
   = < >
   + - * / ^
   DEFINE
   EOF))

(define-lex-abbrevs
  [letter       (:or (:/ "a" "z") (:/ #\A #\Z) )]
  [digit        (:/ #\0 #\9)]
  [string       (:: #\" (:* (:~ #\")) #\")]
  [identifier   (:: letter (:* (:or letter digit #\_ #\?)))]
  [identifier:= (:: letter (:* (:or letter digit #\_ #\?)) ":=")]
  [identifierOP (:: letter (:* (:or letter digit #\_ #\?)) "(")])

(define (string-drop-right n s)
  (substring s 0 (- (string-length s) n)))

(define expression-lexer
  (lexer-src-pos
   [(eof) 'EOF]
   [(:or #\tab #\space #\newline)    ; this skips whitespace
    (return-without-pos (expression-lexer input-port))]
   [#\newline (token-newline)]  ; (token-newline) returns 'newline
   [(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"") (string->symbol lexeme)]
   ["(" 'OP]
   [")" 'CP]
   ["[" 'OB]
   ["]" 'CB]
   ["{" 'OC]
   ["}" 'CC]
   ["[[" 'ODB]
   ; ["]]" 'CDB]
   ["," 'COMMA]
   [";" 'SEMI]
   ["." 'PERIOD]
   [#\λ 'LAMBDA]
   ["lambda" 'LAMBDA]
   ["√" 'SQRT]
   ["¬" 'NEG]
   ["≤" 'LESS-EQUAL]
   ["<=" 'LESS-EQUAL]
   ["≥" 'GREATER-EQUAL]
   [">=" 'GREATER-EQUAL]
   ["<>" 'NOT-EQUAL]
   ["≠" 'NOT-EQUAL]
   ["define" 'DEFINE]
   [string
    (token-STRING (substring lexeme 1 (- (string-length lexeme) 1)))]
   ; The parser can only look ahead 1 token, so we have 3
   ; different identifiers to see whether := or ( comes after the identfier.
   ; This is enough to prevent shift/reduce conflicts between atom, definition,
   ; and application.
   [identifier:=
    (token-IDENTIFIER:=
     (string->symbol (string-drop-right 2 (regexp-replace* #rx"_" lexeme "-"))))]
   [identifierOP
    (token-IDENTIFIEROP
     (string->symbol (string-drop-right 1 (regexp-replace* #rx"_" lexeme "-"))))]
   [identifier
    (token-IDENTIFIER (string->symbol (regexp-replace* #rx"_" lexeme "-")))]
   [(:+ digit) (token-NUMBER (string->number lexeme))]
   [(:: (:+ digit) #\. (:* digit)) (token-NUMBER (string->number lexeme))]))

; The color-expression-lexer is for syntax coloring.
; The function returns 5 values:
; - Either a string containing the matching text or the eof object.
;   Block comments and specials currently return an empty string.
;   This may change in the future to other string or non-string data.
; - A symbol in '(error comment sexp-comment white-space constant
;                 string no-color parenthesis other symbol eof).
; - A symbol in '(|(| |)| |[| |]| |{| |}|) or #f.
; - A number representing the starting position of the match (or #f if eof).
; - A number representing the ending position of the match (or #f if eof).

(define color-lexer
  (lexer-src-pos
   [(eof)
    (values eof #f #f start-pos end-pos)]
   [(:or #\tab #\space #\newline)
    (values lexeme 'white-space #f start-pos end-pos)]
   [#\newline
    (begin
      (/ 0 0)
      (values lexeme 'white-space #f start-pos end-pos))]
   [(:or ":=" "+" "-" "*" "/" "^" "<" ">" "=" "\"")
    (values lexeme 'symbol #f start-pos end-pos)]
   [(:or "(" ")" "[" "]" "{" "}")
    (values lexeme 'parenthesis #f start-pos end-pos)]
   [(:or "[[" "," ";" "." "λ" "lambda" "√" "¬" "≤" "<=" "≥" ">=" "<>" "≠")
    (values lexeme 'no-color #f start-pos end-pos)]
   ["define"
    (values lexeme 'constant (/ 0 0) start-pos end-pos)]
   [string
    (values lexeme 'string #f start-pos end-pos)]
   ; The parser can only look ahead 1 token, so we have 3
   ; different identifiers to see whether := or ( comes after the identfier.
   ; This is enough to prevent shift/reduce conflicts between atom, definition,
   ; and application.
   [(:or identifier:= identifierOP identifier)
    (values lexeme 'symbol #f start-pos end-pos)]
   [(:+ digit)
    (values lexeme 'constant #f start-pos end-pos)]
   [(:: (:+ digit) #\. (:* digit))
    (values lexeme 'constant #f start-pos end-pos)]))


;; A macro to build the syntax object
(define-syntax (b stx)
  (syntax-case stx ()
    [(_ o value start end)
     #'(b o value start end 0 0)]
    [(_ o value start end start-adjust end-adjust)
     (with-syntax
         ((start-pos (datum->syntax #'start
                                    (string->symbol
                                     (format "$~a-start-pos"
                                             (syntax->datum #'start)))))
          (end-pos (datum->syntax #'end
                                  (string->symbol
                                   (format "$~a-end-pos"
                                           (syntax->datum #'end))))))
       #`(datum->syntax
          o
          value
          (list (if (syntax? o) (syntax-source o) 'missing-in-action--sorry)
                (if (and o (syntax-line o))
                    (+ (syntax-line o) (position-line start-pos) start-adjust -1) #f)
                (if (and o (syntax-column o))
                    (+ (syntax-column o) (position-offset start-pos) start-adjust) #f)
                (if (and o (syntax-position o))
                    (+ (syntax-position o) (- (position-offset start-pos) start-adjust 1)) #f)
                (- (+ (position-offset end-pos) end-adjust)
                   (+ (position-offset start-pos) start-adjust)))
          o o))]))

; for testing: builds lists instead of syntax objects
#;(define-syntax (b stx)
    (syntax-case stx ()
      [(_ _ val _ _)
       #'val]))

(define (display-position-token pos)
  (display (position->list pos)))

(define (position->list pos)
  (list (position-offset pos)
        (position-line pos)
        (position-col pos)))

(define source-name #f)
(define orig-stx #f)
(define o #f)

(define (expression-parser src-name orig)
  (λ (gen)
    (set! source-name src-name)
    (set! orig-stx orig)
    (set! o orig)
    (the-parser gen)))


(define the-parser
  (parser
   (src-pos)
   ; (suppress)  ; hmm...
   ;(debug "parser-dump.txt")
   ;(yacc-output "parser-dump.yacc")
   (start start)
   (end newline EOF)
   (tokens value-tokens op-tokens)
   (error (lambda (token-ok? name val start end)
            ; The first argument will be #f if and only if the error is that an invalid token was received.
            ; The second and third arguments will be the name and the value of the token at which the error was detected.
            ; The fourth and fifth arguments, if present, provide the source positions of that token.
            #;(unless #f #; (string? (syntax->datum o))
                (display "DEBUGXXX: ")
                (display (list o token-ok? name val start end))
                (display-position-token start) (newline)
                (display-position-token end) (newline)
                (displayln source-name)
                (newline))
            (error-print-source-location #t)
            (displayln "Syntax error")
            (raise-read-error
             "Syntax error"
             source-name
             (position-line start)
             (position-col start)
             (position-offset start)
             (+ (- (position-offset end) (position-offset start))))
            #;(raise-syntax-error
               #f
               "syntax error"
               (datum->syntax
                #'here 'here
                (list
                 source-name
                 (position-line start)
                 (position-col start)
                 (position-offset start)
                 (+ (- (position-offset end) (position-offset start))))))))

   (precs ; (left :=)
          ; (right OP)
          (left - +)
          (left * /)
          (right OB)
          (right ^)
          (left =)  ; comparisons
          (right NEG)
          (left SEMI)
          ; (right IDENTIFIER)
          )

   (grammar
    (start [(exp) (b o `(#%infix ,$1) 1 1)]
           [() #f])
    ;; If there is an error, ignore everything before the error
    ;; and try to start over right after the error

    (args [(exp)            (b o (list $1) 1 1)]
          [(exp COMMA args) (b o (cons $1 $3) 1 3)]
          [() '()])

    (ids [()               '()]
         [(IDENTIFIER ids) (b o (cons $1 $2) 1 2)])

    (parenthensis-exp
     [(OP exp CP)                                   $2])

    (atom
     [(NUMBER)                                      (b o $1 1 1)]
     [(IDENTIFIER)                (prec IDENTIFIER) (b o $1 1 1)]
     [(STRING)                                      (b o $1 1 1)]
     [(parenthensis-exp)                            $1])

    (construction-exp
     [(OC args CC)                                  (b o `(,(b o 'list 1 3) ,@$2) 1 3)]
     [(OP LAMBDA ids PERIOD exp CP)                 (b o `(,(b o 'lambda 2 2) ,$3 ,$5) 1 6)]
     [(atom)                                        $1])

    (application-exp
     ;[(application-exp OB args CB)                 (b o `(,$1 ,@$3) 1 4)]                     ; function application
     ; Due the extra ( in IDENTIFIEROP we need to adjust the end with -1.
     [(IDENTIFIEROP args CP)                        (b o `(,(b o $1 1 1 0 -1) ,@$2) 1 3)]      ; function application
     [(application-exp OP args CP)      (prec OP)   (b o `(,$1 ,@$3) 1 4 )]                    ; function application
     [(application-exp ODB exp CB CB)               (b o `(,(b o 'list-ref 1 4) ,$1 ,$3) 1 4)] ; list ref
     [(construction-exp)                            $1])


    #;(implicit-exp
       [(application-exp application-exp) (prec *)  (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)]    ; implicit
       [(application-exp)                           $1])

    (power-exp
     [(application-exp ^ power-exp) (prec ^)        (b o `(expt ,$1 ,$3) 1 3)]
     [(application-exp)                             $1])

    (sqrt-exp
     [(SQRT sqrt-exp)                               (b o `(,(b o 'sqrt 1 1) ,$2) 1 2)]
     [(power-exp)                                   $1])

    (negation-exp
     [(- negation-exp)                              (b o `(,(b o '- 1 1) ,$2) 1 2)]
     [(sqrt-exp)                                    $1])

    (multiplication-exp
     [(multiplication-exp * negation-exp) (prec *)  (b o `(,(b o '* 2 2) ,$1 ,$3) 1 3)]
     [(multiplication-exp / negation-exp) (prec /)  (b o `(,(b o '/ 2 2) ,$1 ,$3) 1 3)]
     ;[(multiplication-exp  negation-exp) (prec *)  (b o `(,(b o '* 1 2) ,$1 ,$2) 1 2)]
     [(negation-exp)                                $1])

    (addition-exp
     [(addition-exp - multiplication-exp) (prec -)  (b o `(,(b o '- 2 2) ,$1 ,$3) 1 3)]
     [(addition-exp + multiplication-exp) (prec +)  (b o `(,(b o '+ 2 2) ,$1 ,$3) 1 3)]
     [(multiplication-exp)                          $1])

    (order-exp
     [(addition-exp LESS-EQUAL addition-exp)    (prec =)  (b o `(,(b o '<= 2 2) ,$1 ,$3) 1 3)]
     [(addition-exp < addition-exp)             (prec =)  (b o `(,(b o '< 2 2) ,$1 ,$3) 1 3)]
     [(addition-exp GREATER-EQUAL addition-exp) (prec =)  (b o `(,(b o '>= 2 2) ,$1 ,$3) 1 3)]
     [(addition-exp > addition-exp)             (prec =)  (b o `(,(b o '> 2 2) ,$1 ,$3) 1 3)]
     [(addition-exp NOT-EQUAL addition-exp)     (prec =)  (b o `(not (,(b o '= 2 2) ,$1 ,$3)) 1 3)]
     [(addition-exp = addition-exp)             (prec =)  (b o `(,(b o '= 2 2) ,$1 ,$3) 1 3)]
     [(addition-exp)                            $1])

    (logical-negation-exp
     [(NEG logical-negation-exp)   (prec NEG)      (b o `(,(b o 'not 1 1) ,$2) 1 2)]
     [(order-exp)                                  $1])

    ; The no DEFINE version conflicts with application.
    ; Solution? Move definition with := into rule for application.
    (assignment-exp
     ;[(DEFINE IDENTIFIER := assignment-exp)               (b o `(,(b o 'define 3 3) ,$2 ,$4) 2 4)]
     ;[(DEFINE IDENTIFIER OP args CP := assignment-exp)    (b o `(,(b o 'define 4 4) (,$2 ,@$4) ,$7) 2 6)]
     [(IDENTIFIER:= assignment-exp)                  (b o `(,(b o 'define 1 1)
                                                            ,(b o $1 1 1
                                                                ; adjust end with -2 due to the chars in :=
                                                                0 -2)
                                                            ,(b o $2 2 2)) 1 2)]
     [(IDENTIFIEROP args CP := assignment-exp)       (b o `(,(b o 'define 2 2) (,$1 ,@$2) ,$5) 1 5)]
     [(logical-negation-exp)                          $1])

    (compound-exp
     [(compound-exp SEMI assignment-exp)            (b o `(,(b o 'begin 2 2) ,$1 ,$3) 1 3)]
     [(assignment-exp)                              $1])

    (exp
     [(compound-exp)                                $1]))))

(define (parse-expression src stx ip)
  (port-count-lines! ip)
  ((expression-parser
    (if src src (object-name ip))
    ; If you change any of these values, then
    ; you must change the builder b which uses
    ; the the values in here to add to newly read
    ; objects. If the lexer/parser was only to
    ; read from entire files, there would be no problem,
    ; but sometimes one must read from strings
    ; entered in a repl. There the syntax-objects must be
    ; adjusted.
    (datum->syntax #f
                   'here
                   (list (if src src (object-name ip))
                         1   ; line 1
                         0   ; column 0
                         1   ; offset/position 1 (contract for datum->syntax requires a positive offset)
                         #f) ; span
                   )) ; no properties
   ; The following Tom Foolery is to needed to turn
   ;   SEMI EOF into EOF
   ; This allows one to have an optional semi colon in the end of the file.
   (let ([peek  (expression-lexer ip)]
         [peek1 (expression-lexer ip)])
     (define (next)
       (cond
         [(eq? (position-token-token peek) 'EOF) (void)]
         [else (set! peek peek1)
               (set! peek1 (expression-lexer ip))]))
     (λ ()
       (if (and (eq? (position-token-token peek) 'SEMI)
                (eq? (position-token-token peek1) 'EOF))
           (begin0 peek1 (next))
           (begin0 peek  (next)))))))