racket/collects/scribble/reader.ss
2007-07-07 06:27:23 +00:00

526 lines
24 KiB
Scheme

;; ============================================================================
;; Implements the @-reader macro for embedding text in Scheme code.
(module reader mzscheme
(require (lib "kw.ss") (lib "string.ss") (lib "readerr.ss" "syntax"))
;; --------------------------------------------------------------------------
;; utilities for syntax specifications below
;; regexps
(define (px . args)
(let* ([args (let loop ([xs args])
(if (list? xs) (apply append (map loop xs)) (list xs)))]
[args (map (lambda (x)
(cond
[(bytes? x) x]
[(string? x) (string->bytes/utf-8 x)]
[(char? x) (regexp-quote (bytes (char->integer x)))]
[(not x) #""]
[else (error 'reader "internal error [px]")]))
args)])
(byte-pregexp (apply bytes-append args))))
(define (^px . args) (px #"^" args))
;; reverses a byte string visually
(define reverse-bytes
(let ([pairs (let ([xs (bytes->list #"([{<")]
[ys (bytes->list #")]}>")])
(append (map cons xs ys) (map cons ys xs)))])
(define (rev-byte b)
(cond [(assq b pairs) => cdr]
[else b]))
(lambda (bs) (list->bytes (map rev-byte (reverse! (bytes->list bs)))))))
;; --------------------------------------------------------------------------
;; syntax
;; basic customization
(define ch:command #\@)
(define ch:comment #\;)
(define ch:expr-escape #\|)
(define ch:attrs-begin #\[)
(define ch:attrs-end #\])
(define ch:lines-begin #\{)
(define ch:lines-end #\})
(define str:lines-begin* #"(\\|[^a-zA-Z0-9 \t\r\n\f@\\\177-\377{]*)\\{")
(define re:command (^px ch:command
;; the following identifies string and
;; expression escapes, see how it is used below
"(?:(\")|("ch:expr-escape"))?"))
(define re:whitespaces (^px "\\s+"))
(define re:comment-start (^px ch:comment))
(define re:comment-line (^px "[^\n]*\n[ \t]*")) ; like tex's `%'
(define re:expr-escape (^px ch:expr-escape))
(define re:attrs-begin (^px ch:attrs-begin))
(define re:attrs-end (^px ch:attrs-end))
(define re:lines-begin (^px ch:lines-begin))
(define re:lines-begin* (^px str:lines-begin*))
(define re:lines-end (^px ch:lines-end))
(define str:end-of-line "[ \t]*\r?\n[ \t]*") ; eat spaces on the next line
(define re:end-of-line (^px str:end-of-line))
(define (re:line-item* bgn end cmd-prefix)
(^px "(.+?)(?:" (and bgn `(,bgn"|")) (and end `(,end"|"))
cmd-prefix ch:command"|"str:end-of-line"|$)"))
(define re:line-item (re:line-item* ch:lines-begin ch:lines-end #f))
(define re:line-item-no-nests (re:line-item* #f #f #f))
;; --------------------------------------------------------------------------
;; utilities
;; like `regexp-match/fail-without-reading', without extras; the regexp that
;; is used must be anchored -- nothing is dropped
(define (*regexp-match-peek-positions pattern input-port)
#; ; sanity checks, not needed unless this file is edited
(unless (and (byte-regexp? pattern)
(regexp-match? #rx#"^\\^" (object-name pattern)))
(error 'reader "internal error [invalid bregexp] ~e" pattern))
(regexp-match-peek-positions pattern input-port))
;; the following doesn't work -- must peek first
;; (define (*regexp-match-positions pattern input-port)
;; (unless (and (byte-regexp? pattern)
;; (regexp-match? #rx#"^\\^" (object-name pattern)))
;; (error 'reader "internal error [invalid bregexp] ~e" pattern))
;; (regexp-match-peek-positions pattern input-port))
(define (*regexp-match pattern input-port)
(let ([m (*regexp-match-peek-positions pattern input-port)])
(and m (let ([s (read-bytes (cdar m) input-port)])
(cons s (map (lambda (p) (and p (subbytes s (car p) (cdr p))))
(cdr m)))))))
;; like regexp-match, but returns the whole match
(define (*regexp-match1 pattern input-port)
(let ([m (*regexp-match-peek-positions pattern input-port)])
(and m (read-bytes (cdar m) input-port))))
;; Skips whitespace characters, sensitive to the current readtable's
;; definition of whitespace; optimizes common spaces when possible
(define skip-whitespace
(let* ([plain-readtables (make-hash-table 'weak)]
[plain-spaces " \t\n\r\f"]
[plain-spaces-list (string->list " \t\n\r\f")]
[plain-spaces-re (^px "[" plain-spaces "]*")])
(define (skip-plain-spaces port)
;; hack: according to the specs, this might consume more characters
;; than needed, but it works fine with a simple <ch>* regexp (because
;; it can always match an empty string)
(*regexp-match-peek-positions plain-spaces-re port))
(define (whitespace? ch rt)
(if rt
(let-values ([(like-ch/sym _1 _2) (readtable-mapping rt ch)])
;; if like-ch/sym is whitespace, then ch is whitespace
(and (char? like-ch/sym) (char-whitespace? like-ch/sym)))
;; `char-whitespace?' is fine for the default readtable
(char-whitespace? ch)))
(define (plain-readtable? rt)
(hash-table-get plain-readtables rt
(lambda ()
(let ([plain? (andmap (lambda (ch) (whitespace? ch rt))
plain-spaces-list)])
(hash-table-put! plain-readtables rt #t)
rt))))
(lambda (port)
(let* ([rt (current-readtable)] [plain? (plain-readtable? rt)])
(let loop ()
(when plain? (skip-plain-spaces port))
(let ([ch (peek-char port)])
(unless (eof-object? ch)
(when (whitespace? ch rt) (read-char port) (loop)))))))))
;; make n spaces, cached for n
(define make-spaces
(let ([t (make-hash-table)])
(lambda (n)
(hash-table-get t n
(lambda ()
(let ([s (make-string n #\space)])
(hash-table-put! t n s) s))))))
(define (bytes-width bs start)
(let ([len (bytes-length bs)])
(if (regexp-match? #rx"^ *$" bs start)
(- (bytes-length bs) start)
(let loop ([i start] [w 0])
(if (= i len)
w
(loop (add1 i) (+ w (if (eq? 9 (bytes-ref bs i))
(- 8 (modulo w 8))
1))))))))
;; a unique eol string
(define eol-token "\n")
(define (eol-syntax? x) (and (syntax? x) (eq? eol-token (syntax-e x))))
;; sanity check, in case this property gets violated in the future
(unless (eol-syntax? (datum->syntax-object #f eol-token))
(error 'reader "internal error [invalid assumption]"))
;; --------------------------------------------------------------------------
;; main reader function for @ constructs
(define ((dispatcher start-inside?)
char inp source-name line-num col-num position)
(define (read-error* line col pos span msg . xs)
(let* ([eof? (and (eq? 'eof msg) (pair? xs))]
[msg (apply format (if eof? xs (cons msg xs)))]
[loc (cond [(and line col) (format "at ~a:~a" line col)]
[pos (format "at #~a" pos)]
[else #f])]
[loc (cond [(and source-name loc)
(format "when reading ~a ~a" source-name loc)]
[source-name (format "when reading ~a" source-name)]
[else loc])]
[msg (if loc (format "~a (~a)" msg loc) msg)])
((if eof? raise-read-error raise-read-eof-error)
msg source-name line col pos span)))
(define (read-error msg . xs)
(let-values ([(line col pos) (port-next-location inp)])
(apply read-error* line col pos #f msg xs)))
(define (read-stx) (read-syntax/recursive source-name inp))
(define (read-stx/rt rt) (read-syntax/recursive source-name inp #f rt))
;; use this to avoid placeholders so we have source location information
(define (read-stx*)
;; (read-syntax/recursive source-name inp #f (current-readtable) #f)
(read-syntax source-name inp))
(define (*match rx) (*regexp-match rx inp))
(define (*match1 rx) (*regexp-match1 rx inp))
;; (define (*skip rx) (*regexp-match-positions rx inp)) <- see above
(define (*skip rx) (*regexp-match1 rx inp))
(define (*peek rx) (*regexp-match-peek-positions rx inp))
(define (span-from start)
(and start (let-values ([(line col pos) (port-next-location inp)])
(- pos start))))
(define (read-delimited-list begin-re end-re end-ch)
(and (*skip begin-re)
(let loop ([r '()])
(skip-whitespace inp)
(if (*skip end-re)
(reverse! r)
(let ([x (read-stx)])
(if (eof-object? x)
(read-error 'eof "expected a '~a'" end-ch)
(loop (if (special-comment? x) r (cons x r)))))))))
;; gets an accumulated (reversed) list of syntaxes and column markers, and
;; sorts things out (remove prefix and suffix newlines, adds indentation if
;; needed)
(define (done-items xs)
;; a column marker is either a non-negative integer N (saying the the
;; following code came from at column N), or a negative integer -N
;; (saying that the following code came from column N but no need to add
;; indentation at this point because it is at the openning of a {...});
;; `get-lines*' is careful not to include column markers before a newline
;; or the end of the text, and a -N marker can only come from the
;; beginning of the text (and it's never there if the text began with a
;; newline)
(if (andmap eol-syntax? xs)
;; nothing to do
(reverse! xs)
(let ([mincol (let loop ([xs xs] [m #f])
(if (null? xs)
m
(let ([x (car xs)])
(loop (cdr xs)
(if (integer? x)
(let ([x (abs x)]) (if (and m (< m x)) m x))
m)))))])
(let loop ([xs (if (and (not start-inside?) (eol-syntax? (car xs)))
(cdr xs) ; trim last eol
xs)]
[r '()])
(if (or (null? xs)
(and (not start-inside?)
;; trim first eol
(null? (cdr xs)) (eol-syntax? (car xs))))
r
(loop
(cdr xs)
(let ([x (car xs)])
(cond [(integer? x)
(if (or (< x 0) (= x mincol))
r ; no indentation marker, or zero indentation
(let ([eol (cadr xs)]
[spaces (make-spaces (- x mincol))])
;; markers always follow end-of-lines
(unless (eol-syntax? eol)
(error 'reader "internal error [done-items]"))
(cons (syntax-property
(datum->syntax-object eol spaces eol)
'scribble 'indentation)
r)))]
;; can have special comment values from "@||"
[(special-comment? x) r]
[else (cons x r)]))))))))
;; cons stx (new syntax) to the list of stxs, merging it if both are
;; strings, except for newline markers
(define (maybe-merge stx stxs)
(let* ([2nd (and (syntax? stx) (syntax-e stx))]
[stx0 (and (pair? stxs) (car stxs))]
[1st (and (syntax? stx0) (syntax-e stx0))])
(if (and (string? 1st) (not (eq? eol-token 1st))
(string? 2nd) (not (eq? eol-token 2nd)))
(cons (datum->syntax-object stx0
(string-append 1st 2nd)
(list (syntax-source stx0)
(syntax-line stx0)
(syntax-column stx0)
(syntax-position stx0)
;; this is called right after reading stx
(span-from (syntax-position stx0))))
(cdr stxs))
(cons stx stxs))))
;; helper for `get-lines*' drop a column marker if the previous item was
;; also a newline (or the beginning)
(define (maybe-drop-marker r)
(if (and (pair? r) (integer? (car r))
(or (null? (cdr r)) (eol-syntax? (cadr r))))
(cdr r)
r))
(define (get-lines* re:begin re:end re:cmd-pfx re:item end-token)
;; re:begin, re:end, end-token can be false if start-inside? is #t;
;; re:cmd-pfx is a regexp when we do sub-@-reads only after a prefix
(let loop ([lvl 0]
[r (let-values ([(l c p) (port-next-location inp)])
;; marker for the beginning of the text
(if c (list (- c)) '()))])
;; this loop collects lines etc for the body, and also puts in column
;; markers (integers) after newlines -- the result is handed off to
;; `done-items' to finish the job
(define make-stx
(let-values ([(line col pos) (port-next-location inp)])
(lambda (sexpr)
(datum->syntax-object #f
(if (bytes? sexpr) (bytes->string/utf-8 sexpr) sexpr)
(list source-name line col pos (span-from pos))))))
(cond
[(and re:begin (*match1 re:begin))
=> (lambda (m) (loop (add1 lvl) (maybe-merge (make-stx m) r)))]
[(and re:end (*match1 re:end))
=> (lambda (m)
(if (and (zero? lvl) (not start-inside?))
;; drop a marker if it's after a last eol item
(done-items (maybe-drop-marker r))
(loop (sub1 lvl) (maybe-merge (make-stx m) r))))]
[(*match1 re:end-of-line)
=> (lambda (m)
(let ([n (car (regexp-match-positions #rx#"\n" m))])
(loop lvl (list* ; no merge needed
(bytes-width m (cdr n))
(syntax-property
(make-stx eol-token)
'scribble `(newline ,(bytes->string/utf-8 m)))
(maybe-drop-marker r)))))]
[(if re:cmd-pfx
(and (*skip re:cmd-pfx) (*peek re:command))
(*peek re:command))
;; read the next value
=> (lambda (m)
(let ([x (cond
[(cadr m)
;; the command is a string escape, use `read-stx*'
;; to not get a placeholder, so we can merge the
;; string to others, and adjust source location to
;; avoid bogus indentation
(read-stx*)]
[(caddr m)
;; it's an expression escape, get multiple
;; expressions and put them all here
(read-bytes (caaddr m) inp)
(get-escape-expr #f)]
[else (read-stx)])]) ; otherwise: a plain sub-read
(loop lvl (cond [(eof-object? x)
(read-error 'eof "missing command")]
;; throw away comments
[(special-comment? x) r]
;; escaped expressions: no merge
[(pair? x) (append! (reverse x) r)]
[(null? x) (cons (make-special-comment #f) r)]
[else (maybe-merge x r)]))))]
;; must be last, since it will always succeed with 1 char
[(*peek re:item) ; don't read: regexp grabs the following text
=> (lambda (m)
(loop lvl
(maybe-merge (make-stx (read-bytes (cdadr m) inp)) r)))]
[(*peek #rx#"^$")
(if end-token
(read-error 'eof "missing closing `~a'~a" end-token
(if (and line-num col-num)
(format " for command at ~a:~a" line-num col-num)
""))
(done-items r))]
[else (read-error "internal error [get-lines*]")])))
(define (get-lines)
(cond [(*skip re:lines-begin) (get-lines* re:lines-begin re:lines-end #f
re:line-item ch:lines-end)]
[(*match re:lines-begin*)
=> (lambda (m)
(let* ([bgn (car m)]
[end (reverse-bytes bgn)]
[bgn* (regexp-quote bgn)]
[end* (regexp-quote end)]
[cmd-pfx* (regexp-quote (cadr m))])
(get-lines* (^px bgn*) (^px end*)
(^px cmd-pfx* "(?=" ch:command ")")
(re:line-item* bgn* end* cmd-pfx*)
end)))]
[else #f]))
(define (get-attrs)
(read-delimited-list re:attrs-begin re:attrs-end ch:attrs-end))
(define (get-escape-expr single?)
;; single? means expect just one expression (or none, which is returned
;; as a special-comment)
(let ([get (lambda ()
(parameterize ([current-readtable command-readtable])
;; tweak source information to avoid bad indentation
(read-delimited-list re:expr-escape re:expr-escape
ch:expr-escape)))])
(if single?
(let*-values ([(line col pos) (port-next-location inp)]
[(xs) (get)])
(cond [(not xs) xs]
[(null? xs) (make-special-comment #f)]
[(null? (cdr xs)) (car xs)]
[else (read-error* line col pos (span-from pos)
"too many escape expressions")]))
(get))))
;; called only when we must see a command in the input
(define (get-command)
(define-values (line col pos) (port-next-location inp))
(let ([cmd (read-stx/rt command-readtable)])
(cond [(special-comment? cmd)
(read-error* line col pos (span-from pos)
"expecting a command expression, got a comment")]
[(eof-object? cmd) (read-error 'eof "missing command")]
[else cmd])))
(define (get-rprefixes) ; return punctuation prefixes in reverse
(let loop ([r '()])
(let-values ([(line col pos) (port-next-location inp)])
(cond
[(*match1 #rx#"^(?:'|`|,@?)")
=> (lambda (m)
(let ([sym (cond
[(assoc m '([#"'" quote]
[#"`" quasiquote]
[#"," unquote]
[#",@" unquote-splicing]))
=> cadr]
[else (read-error "internal error [rpfxs]")])])
(loop (cons (datum->syntax-object #f sym
(list source-name line col pos
(span-from pos)))
r))))]
[(*peek re:whitespaces)
(read-error "unexpected whitespace after ~a" ch:command)]
[else r]))))
(cond
[start-inside?
(datum->syntax-object #f (get-lines* #f #f #f re:line-item-no-nests #f)
(list source-name line-num col-num position (span-from position)))]
[(*peek re:whitespaces)
(read-error "unexpected whitespace after ~a" ch:command)]
[(*skip re:comment-start)
(unless (get-lines) (*skip re:comment-line))
(make-special-comment #f)]
[else
(let*-values
([(rpfxs) (get-rprefixes)]
[(cmd attrs lines)
(cond
;; try get-lines first -- so @|{...}| is not used as a
;; simple expression escape, same for get-attrs
[(get-lines) => (lambda (lines) (values #f #f lines))]
[(get-attrs) => (lambda (attrs) (values #f attrs (get-lines)))]
[(get-escape-expr #t) => (lambda (expr) (values expr #f #f))]
[else (values (get-command) (get-attrs) (get-lines))])]
[(stx) (and (or attrs lines)
(append (or attrs '()) (or lines '())))]
[(stx) (or (and cmd stx (cons cmd stx)) ; all parts
stx ; no cmd part => just a parenthesized expression
cmd ; no attrs/lines => simple expression (no parens)
;; impossible: either we saw []s or {}s, or we read a
;; scheme expression
(read-error "internal error [dispatcher]"))]
[(stx)
;; wrap the prefixes around the result
(let loop ([rpfxs rpfxs] [stx stx])
(if (null? rpfxs)
stx
(loop (cdr rpfxs) (list (car rpfxs) stx))))])
(datum->syntax-object #f stx
(list source-name line-num col-num position
(span-from position))))]))
;; --------------------------------------------------------------------------
;; readtables
(define at-readtable
(make-readtable #f ch:command 'non-terminating-macro (dispatcher #f)))
(provide use-at-readtable)
(define (use-at-readtable)
(port-count-lines! (current-input-port))
(current-readtable at-readtable))
;; similar to plain Scheme (scribble, actually), but with `@' and `|' as
;; terminating macro characters (otherwise it behaves the same; the only
;; difference is that `a|b|c' is three symbols and `@foo@bar' are two
;; @-forms)
(define command-readtable
(make-readtable at-readtable
ch:command 'terminating-macro (dispatcher #f)
#\| 'terminating-macro
(lambda (char inp source-name line-num col-num position)
(let ([m (*regexp-match #rx#"^([^|]*)\\|" inp)])
(unless m
(raise-read-error
"unbalanced `|'" source-name line-num col-num position #f))
(datum->syntax-object
#f (string->symbol (bytes->string/utf-8 (cadr m)))
(list source-name line-num col-num position
(add1 (bytes-length (car m)))))))))
(define default-src (gensym 'scribble-reader))
(define (src-name src port)
(if (eq? src default-src) (object-name port) src))
(define/kw (*read #:optional [inp (current-input-port)])
(parameterize ([current-readtable at-readtable])
(read inp)))
(define/kw (*read-syntax #:optional [src default-src]
[inp (current-input-port)])
(parameterize ([current-readtable at-readtable])
(read-syntax (src-name src inp) inp)))
(define/kw (read-inside #:optional [inp (current-input-port)])
(let-values ([(line col pos) (port-next-location inp)])
(parameterize ([current-readtable at-readtable])
(syntax-object->datum
((dispatcher #t) #f inp (object-name inp) line col pos)))))
(define/kw (read-inside-syntax #:optional [src default-src]
[inp (current-input-port)])
(let-values ([(line col pos) (port-next-location inp)])
(parameterize ([current-readtable at-readtable])
((dispatcher #t) #f inp (src-name src inp) line col pos))))
(provide (rename *read read) (rename *read-syntax read-syntax)
read-inside read-inside-syntax)
)