From d00aed6f1b2934f591204ea9b97199fae698ceae Mon Sep 17 00:00:00 2001 From: Robby Findler Date: Thu, 3 Nov 2011 17:12:07 -0500 Subject: [PATCH] fix the module reader for the case when there are non-ASCII unicode characters in comments before the #lang line also add an #:init-position argument to peeking-input-port --- collects/mzlib/port.rkt | 27 ++++++++------ collects/mzlib/scribblings/port.scrbl | 2 +- collects/scribblings/reference/port-lib.scrbl | 7 ++-- collects/syntax-color/module-lexer.rkt | 37 +++++++++++-------- collects/tests/syntax-color/module-lexer.rkt | 31 +++++++++++++++- 5 files changed, 71 insertions(+), 33 deletions(-) diff --git a/collects/mzlib/port.rkt b/collects/mzlib/port.rkt index 1c32172120..06dbc01b6d 100644 --- a/collects/mzlib/port.rkt +++ b/collects/mzlib/port.rkt @@ -428,17 +428,22 @@ (set! buffering? (eq? mode 'block)) (buffer-mode-proc mode)]))))) -(define peeking-input-port - (lambda (orig-in [name (object-name orig-in)] [delta 0]) - (make-input-port/read-to-peek - name - (lambda (s) - (let ([r (peek-bytes-avail!* s delta #f orig-in)]) - (set! delta (+ delta (if (number? r) r 1))) - (if (eq? r 0) (handle-evt orig-in (lambda (v) 0)) r))) - (lambda (s skip default) - (peek-bytes-avail!* s (+ delta skip) #f orig-in)) - void))) +(define (peeking-input-port orig-in + [name (object-name orig-in)] + [delta 0] + #:init-position [init-position 1]) + (make-input-port/read-to-peek + name + (lambda (s) + (let ([r (peek-bytes-avail!* s delta #f orig-in)]) + (set! delta (+ delta (if (number? r) r 1))) + (if (eq? r 0) (handle-evt orig-in (lambda (v) 0)) r))) + (lambda (s skip default) + (peek-bytes-avail!* s (+ delta skip) #f orig-in)) + void + #f + void + init-position)) (define relocate-input-port (lambda (p line col pos [close? #t]) diff --git a/collects/mzlib/scribblings/port.scrbl b/collects/mzlib/scribblings/port.scrbl index aba02f4559..d98d43ebe6 100644 --- a/collects/mzlib/scribblings/port.scrbl +++ b/collects/mzlib/scribblings/port.scrbl @@ -5,7 +5,7 @@ @mzlib[#:mode title port] The @racketmodname[mzlib/port] library mostly re-provides -@racketmodname[scheme/port]. +@racketmodname[racket/port]. @defproc[(strip-shell-command-start [in input-port?]) void?]{ diff --git a/collects/scribblings/reference/port-lib.scrbl b/collects/scribblings/reference/port-lib.scrbl index 9f2c510d26..74c4f9ca9a 100644 --- a/collects/scribblings/reference/port-lib.scrbl +++ b/collects/scribblings/reference/port-lib.scrbl @@ -310,7 +310,8 @@ resulting port supports @racket[write-special], otherwise it does not.} @defproc[(peeking-input-port [in input-port?] [name any/c (object-name in)] - [skip exact-nonnegative-integer? 0]) + [skip exact-nonnegative-integer? 0] + [#:init-position init-position exact-positive-integer? 1]) input-port]{ Returns an input port whose content is determined by peeking into @@ -323,8 +324,8 @@ The optional @racket[name] argument is the name of the resulting port. The @racket[skip] argument is the port initial skip count, and it defaults to @racket[0]. -The resulting port's initial position is @racket[0], no matter the -position of @racket[in]. +The resulting port's initial position (as reported by @racket[file-position]) +is @racket[(- init-position 1)], no matter the position of @racket[in]. For example, when you read from a peeking port, you see the same answers as when you read from the original port. diff --git a/collects/syntax-color/module-lexer.rkt b/collects/syntax-color/module-lexer.rkt index 98962ef0da..0ebfbc61dd 100644 --- a/collects/syntax-color/module-lexer.rkt +++ b/collects/syntax-color/module-lexer.rkt @@ -25,36 +25,37 @@ to delegate to the scheme-lexer (in the 'no-lang-line mode). (define (module-lexer in offset mode) (cond [(or (not mode) (eq? mode 'before-lang-line)) - (define lexer-port (peeking-input-port in)) + (define lexer-port (peeking-input-port in #:init-position (+ 1 (file-position in)))) (port-count-lines! lexer-port) - (define-values (lexeme type data raw-new-token-start raw-new-token-end) (scheme-lexer lexer-port)) - (define new-token-start (and raw-new-token-start (+ raw-new-token-start (file-position in)))) - (define new-token-end (and raw-new-token-end (+ raw-new-token-end (file-position in)))) + (set-port-next-location-from in lexer-port) + (define-values (lexeme type data new-token-start new-token-end) (scheme-lexer lexer-port)) (cond [(or (eq? type 'comment) (eq? type 'white-space)) (define lexer-end (file-position lexer-port)) ;; sync ports - (for ([i (in-range 0 lexer-end)]) - (read-char-or-special in)) + (for/list ([i (in-range (file-position in) (file-position lexer-port))]) + (read-byte-or-special in)) (values lexeme type data new-token-start new-token-end 0 'before-lang-line)] [else ;; look for #lang: - (define p (peeking-input-port in)) + (define p (peeking-input-port in #:init-position (+ 1 (file-position in)))) (port-count-lines! p) + (set-port-next-location-from in p) + (define-values (_1 _2 start-pos) (port-next-location p)) (define get-info (with-handlers ([exn:fail? values]) (read-language p (λ () 'fail)))) - (define end-pos (file-position p)) + (define-values (_3 _4 end-pos) (port-next-location p)) (cond [(procedure? get-info) ;; sync ports - (for ([i (in-range 0 end-pos)]) - (read-char-or-special in)) + (for ([i (in-range (file-position in) (file-position p))]) + (read-byte-or-special in)) ;; Produce language as first token: (values "#lang" 'other #f - 1 ;; start-pos - (+ end-pos 1) + start-pos + end-pos 0 (or (let ([v (get-info 'color-lexer #f)]) (and v @@ -68,12 +69,12 @@ to delegate to the scheme-lexer (in the 'no-lang-line mode). ;; the read-language docs say that this is all it takes to commit to a #lang (regexp-match #rx"^#[!l]" lexeme)) ;; sync ports - (for ([i (in-range 0 end-pos)]) - (read-char-or-special in)) + (for ([i (in-range (file-position in) (file-position p))]) + (read-byte-or-special in)) (values lexeme 'error data 1 (+ end-pos 1) 0 'no-lang-line)] [else - (for ([i (in-range 0 (file-position lexer-port))]) - (read-char-or-special in)) + (for ([i (in-range (file-position in) (file-position lexer-port))]) + (read-byte-or-special in)) (values lexeme type data new-token-start new-token-end 0 'no-lang-line)])])] [(eq? mode 'no-lang-line) (let-values ([(lexeme type data new-token-start new-token-end) @@ -89,3 +90,7 @@ to delegate to the scheme-lexer (in the 'no-lang-line mode). (let-values ([(lexeme type data new-token-start new-token-end) (mode in)]) (values lexeme type data new-token-start new-token-end 0 mode))])) + +(define (set-port-next-location-from src dest) + (define-values (line col pos) (port-next-location src)) + (set-port-next-location! dest line col pos)) \ No newline at end of file diff --git a/collects/tests/syntax-color/module-lexer.rkt b/collects/tests/syntax-color/module-lexer.rkt index 5be0e6df76..b48f5ad1cc 100644 --- a/collects/tests/syntax-color/module-lexer.rkt +++ b/collects/tests/syntax-color/module-lexer.rkt @@ -4,6 +4,7 @@ (define (lex str) (define p (open-input-string str)) + (port-count-lines! p) (let loop ([mode #f] [n 0]) (define-values (lexeme type data token-start token-end backup new-mode) @@ -25,6 +26,15 @@ [(= n 1000) '()] ;; watch out for loops [else (cons one (loop new-mode (+ n 1)))]))) +(define (same? a b) + (cond + [(eq? a 'dont-care) #t] + [(eq? b 'dont-care) #t] + [(and (pair? a) (pair? b)) + (and (same? (car a) (car b)) + (same? (cdr a) (cdr b)))] + [else (equal? a b)])) + (check-equal? (lex "#lang racket/base") `(("#lang" other 1 18 #f) (,eof eof #f #f (proc scheme-lexer)))) @@ -33,14 +43,31 @@ ("\n" white-space 18 19 (proc scheme-lexer)) ("1" constant 19 20 (proc scheme-lexer)) (,eof eof #f #f (proc scheme-lexer)))) +(check-equal? (lex ";; αα\n") + `(("; αα" comment 1 6 #f) + ("\n" white-space 6 7 before-lang-line) + (,eof eof #f #f before-lang-line))) +(check-equal? (lex ";; ααα\n;; aaa\n") + `(("; ααα" comment 1 7 #f) + ("\n" white-space 7 8 before-lang-line) + ("; aaa" comment 8 14 before-lang-line) + ("\n" white-space 14 15 before-lang-line) + (,eof eof #f #f before-lang-line))) (check-equal? (lex ";; a\n#lang racket/base") `(("; a" comment 1 5 #f) ("\n" white-space 5 6 before-lang-line) - ("#lang" other 1 18 before-lang-line) + ("#lang" other 6 23 before-lang-line) (,eof eof #f #f (proc scheme-lexer)))) (check-equal? (lex "#lang at-exp racket/base") `(("#lang" other 1 25 #f) (,eof eof 25 25 ((proc scribble-lexer) . #f)))) (check-equal? (lex "#lang at-exp racket/baseBOGUS") - `(("#lang at-exp" error 1 30 #f) + `(("#lang at-exp" error 1 31 #f) (,eof eof #f #f no-lang-line))) +(check same? + (lex "#lang at-exp racket/base\n1\n") + `(("#lang" other 1 25 #f) + ("\n" white-space 25 26 ((proc scribble-lexer) . #f)) + ("1" constant 26 27 ((proc scribble-lexer) . dont-care)) + ("\n" white-space 27 28 ((proc scribble-lexer) . dont-care)) + (,eof eof 28 28 ((proc scribble-lexer) . dont-care))))