fix the module reader for the case when there are non-ASCII unicode characters

in comments before the #lang line

also add an #:init-position argument to peeking-input-port
This commit is contained in:
Robby Findler 2011-11-03 17:12:07 -05:00
parent 8ec968fe8f
commit d00aed6f1b
5 changed files with 71 additions and 33 deletions

View File

@ -428,17 +428,22 @@
(set! buffering? (eq? mode 'block))
(buffer-mode-proc mode)])))))
(define peeking-input-port
(lambda (orig-in [name (object-name orig-in)] [delta 0])
(make-input-port/read-to-peek
name
(lambda (s)
(let ([r (peek-bytes-avail!* s delta #f orig-in)])
(set! delta (+ delta (if (number? r) r 1)))
(if (eq? r 0) (handle-evt orig-in (lambda (v) 0)) r)))
(lambda (s skip default)
(peek-bytes-avail!* s (+ delta skip) #f orig-in))
void)))
(define (peeking-input-port orig-in
[name (object-name orig-in)]
[delta 0]
#:init-position [init-position 1])
(make-input-port/read-to-peek
name
(lambda (s)
(let ([r (peek-bytes-avail!* s delta #f orig-in)])
(set! delta (+ delta (if (number? r) r 1)))
(if (eq? r 0) (handle-evt orig-in (lambda (v) 0)) r)))
(lambda (s skip default)
(peek-bytes-avail!* s (+ delta skip) #f orig-in))
void
#f
void
init-position))
(define relocate-input-port
(lambda (p line col pos [close? #t])

View File

@ -5,7 +5,7 @@
@mzlib[#:mode title port]
The @racketmodname[mzlib/port] library mostly re-provides
@racketmodname[scheme/port].
@racketmodname[racket/port].
@defproc[(strip-shell-command-start [in input-port?]) void?]{

View File

@ -310,7 +310,8 @@ resulting port supports @racket[write-special], otherwise it does not.}
@defproc[(peeking-input-port [in input-port?]
[name any/c (object-name in)]
[skip exact-nonnegative-integer? 0])
[skip exact-nonnegative-integer? 0]
[#:init-position init-position exact-positive-integer? 1])
input-port]{
Returns an input port whose content is determined by peeking into
@ -323,8 +324,8 @@ The optional @racket[name] argument is the name of the resulting
port. The @racket[skip] argument is the port initial skip count, and
it defaults to @racket[0].
The resulting port's initial position is @racket[0], no matter the
position of @racket[in].
The resulting port's initial position (as reported by @racket[file-position])
is @racket[(- init-position 1)], no matter the position of @racket[in].
For example, when you read from a peeking port, you
see the same answers as when you read from the original port.

View File

@ -25,36 +25,37 @@ to delegate to the scheme-lexer (in the 'no-lang-line mode).
(define (module-lexer in offset mode)
(cond
[(or (not mode) (eq? mode 'before-lang-line))
(define lexer-port (peeking-input-port in))
(define lexer-port (peeking-input-port in #:init-position (+ 1 (file-position in))))
(port-count-lines! lexer-port)
(define-values (lexeme type data raw-new-token-start raw-new-token-end) (scheme-lexer lexer-port))
(define new-token-start (and raw-new-token-start (+ raw-new-token-start (file-position in))))
(define new-token-end (and raw-new-token-end (+ raw-new-token-end (file-position in))))
(set-port-next-location-from in lexer-port)
(define-values (lexeme type data new-token-start new-token-end) (scheme-lexer lexer-port))
(cond
[(or (eq? type 'comment) (eq? type 'white-space))
(define lexer-end (file-position lexer-port))
;; sync ports
(for ([i (in-range 0 lexer-end)])
(read-char-or-special in))
(for/list ([i (in-range (file-position in) (file-position lexer-port))])
(read-byte-or-special in))
(values lexeme type data new-token-start new-token-end 0 'before-lang-line)]
[else
;; look for #lang:
(define p (peeking-input-port in))
(define p (peeking-input-port in #:init-position (+ 1 (file-position in))))
(port-count-lines! p)
(set-port-next-location-from in p)
(define-values (_1 _2 start-pos) (port-next-location p))
(define get-info (with-handlers ([exn:fail? values]) (read-language p (λ () 'fail))))
(define end-pos (file-position p))
(define-values (_3 _4 end-pos) (port-next-location p))
(cond
[(procedure? get-info)
;; sync ports
(for ([i (in-range 0 end-pos)])
(read-char-or-special in))
(for ([i (in-range (file-position in) (file-position p))])
(read-byte-or-special in))
;; Produce language as first token:
(values
"#lang"
'other
#f
1 ;; start-pos
(+ end-pos 1)
start-pos
end-pos
0
(or (let ([v (get-info 'color-lexer #f)])
(and v
@ -68,12 +69,12 @@ to delegate to the scheme-lexer (in the 'no-lang-line mode).
;; the read-language docs say that this is all it takes to commit to a #lang
(regexp-match #rx"^#[!l]" lexeme))
;; sync ports
(for ([i (in-range 0 end-pos)])
(read-char-or-special in))
(for ([i (in-range (file-position in) (file-position p))])
(read-byte-or-special in))
(values lexeme 'error data 1 (+ end-pos 1) 0 'no-lang-line)]
[else
(for ([i (in-range 0 (file-position lexer-port))])
(read-char-or-special in))
(for ([i (in-range (file-position in) (file-position lexer-port))])
(read-byte-or-special in))
(values lexeme type data new-token-start new-token-end 0 'no-lang-line)])])]
[(eq? mode 'no-lang-line)
(let-values ([(lexeme type data new-token-start new-token-end)
@ -89,3 +90,7 @@ to delegate to the scheme-lexer (in the 'no-lang-line mode).
(let-values ([(lexeme type data new-token-start new-token-end)
(mode in)])
(values lexeme type data new-token-start new-token-end 0 mode))]))
(define (set-port-next-location-from src dest)
(define-values (line col pos) (port-next-location src))
(set-port-next-location! dest line col pos))

View File

@ -4,6 +4,7 @@
(define (lex str)
(define p (open-input-string str))
(port-count-lines! p)
(let loop ([mode #f]
[n 0])
(define-values (lexeme type data token-start token-end backup new-mode)
@ -25,6 +26,15 @@
[(= n 1000) '()] ;; watch out for loops
[else (cons one (loop new-mode (+ n 1)))])))
(define (same? a b)
(cond
[(eq? a 'dont-care) #t]
[(eq? b 'dont-care) #t]
[(and (pair? a) (pair? b))
(and (same? (car a) (car b))
(same? (cdr a) (cdr b)))]
[else (equal? a b)]))
(check-equal? (lex "#lang racket/base")
`(("#lang" other 1 18 #f)
(,eof eof #f #f (proc scheme-lexer))))
@ -33,14 +43,31 @@
("\n" white-space 18 19 (proc scheme-lexer))
("1" constant 19 20 (proc scheme-lexer))
(,eof eof #f #f (proc scheme-lexer))))
(check-equal? (lex ";; αα\n")
`(("; αα" comment 1 6 #f)
("\n" white-space 6 7 before-lang-line)
(,eof eof #f #f before-lang-line)))
(check-equal? (lex ";; ααα\n;; aaa\n")
`(("; ααα" comment 1 7 #f)
("\n" white-space 7 8 before-lang-line)
("; aaa" comment 8 14 before-lang-line)
("\n" white-space 14 15 before-lang-line)
(,eof eof #f #f before-lang-line)))
(check-equal? (lex ";; a\n#lang racket/base")
`(("; a" comment 1 5 #f)
("\n" white-space 5 6 before-lang-line)
("#lang" other 1 18 before-lang-line)
("#lang" other 6 23 before-lang-line)
(,eof eof #f #f (proc scheme-lexer))))
(check-equal? (lex "#lang at-exp racket/base")
`(("#lang" other 1 25 #f)
(,eof eof 25 25 ((proc scribble-lexer) . #f))))
(check-equal? (lex "#lang at-exp racket/baseBOGUS")
`(("#lang at-exp" error 1 30 #f)
`(("#lang at-exp" error 1 31 #f)
(,eof eof #f #f no-lang-line)))
(check same?
(lex "#lang at-exp racket/base\n1\n")
`(("#lang" other 1 25 #f)
("\n" white-space 25 26 ((proc scribble-lexer) . #f))
("1" constant 26 27 ((proc scribble-lexer) . dont-care))
("\n" white-space 27 28 ((proc scribble-lexer) . dont-care))
(,eof eof 28 28 ((proc scribble-lexer) . dont-care))))