reader: treat BOM like whitespace in places where comments are allowed
Some text-editing tools on Windows include a BOM character (encoded as UTF-8) at the start of a file that is intended as UTF-8. The general recommendation for UTF-8 is to *not* include a BOM --- but, well, Windows. When a BOM is present, meanwhile, the recommendation is to preserve it in the stream, so always discarding an initial BOM at the file-port level is not a good idea. A new file mode would make sense, but distinctions like 'text and 'binary mode have turned out to be best avoided. Although I'm not sure it's really a good idea, treating a BOM character as whitespace in the reader (at least in positions where comments are allowed) is an easy way around the problem for text files that are intended as programs. Closes #1114
This commit is contained in:
parent
ace1c6a128
commit
d280462250
|
@ -53,8 +53,8 @@ necessarily produce an @tech{interned} value at the receiving
|
||||||
@;------------------------------------------------------------------------
|
@;------------------------------------------------------------------------
|
||||||
@section[#:tag "default-readtable-dispatch"]{Delimiters and Dispatch}
|
@section[#:tag "default-readtable-dispatch"]{Delimiters and Dispatch}
|
||||||
|
|
||||||
Along with @racketlink[char-whitespace?]{whitespace}, the following
|
Along with @racketlink[char-whitespace?]{whitespace} and a BOM
|
||||||
characters are @defterm{delimiters}:
|
character, the following characters are @defterm{delimiters}:
|
||||||
|
|
||||||
@t{
|
@t{
|
||||||
@hspace[2] @ilitchar{(} @ilitchar{)} @ilitchar{[} @ilitchar{]}
|
@hspace[2] @ilitchar{(} @ilitchar{)} @ilitchar{[} @ilitchar{]}
|
||||||
|
@ -86,8 +86,9 @@ characters play special roles:
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
More precisely, after skipping whitespace, the reader dispatches based
|
More precisely, after skipping whitespace and @racket[#\uFEFF] BOM
|
||||||
on the next character or characters in the input stream as follows:
|
characters, the reader dispatches based on the next character or
|
||||||
|
characters in the input stream as follows:
|
||||||
|
|
||||||
@dispatch-table[
|
@dispatch-table[
|
||||||
|
|
||||||
|
@ -189,6 +190,11 @@ on the next character or characters in the input stream as follows:
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@history[#:changed "7.8.0.9" @elem{Changed treatment of the BOM
|
||||||
|
character so that it is treated
|
||||||
|
like whitespace in the same places
|
||||||
|
that comments are allowed.}]
|
||||||
|
|
||||||
|
|
||||||
@section[#:tag "parse-symbol"]{Reading Symbols}
|
@section[#:tag "parse-symbol"]{Reading Symbols}
|
||||||
|
|
||||||
|
@ -978,7 +984,7 @@ numbers are followed by a @litchar{.} intended to be read as a C-style
|
||||||
infix dot, then a delimiter must precede the @litchar{.}.
|
infix dot, then a delimiter must precede the @litchar{.}.
|
||||||
|
|
||||||
Finally, after reading any datum @racket[_x], the reader will seek
|
Finally, after reading any datum @racket[_x], the reader will seek
|
||||||
through whitespace and comments and look for zero or more sequences of a
|
through whitespace, BOM characters, and comments and look for zero or more sequences of a
|
||||||
@litchar{.} followed by another datum @racket[_y]. It will then group
|
@litchar{.} followed by another datum @racket[_y]. It will then group
|
||||||
@racket[_x] and @racket[_y] together in a @racket[#%dot] form so that
|
@racket[_x] and @racket[_y] together in a @racket[#%dot] form so that
|
||||||
@racket[_x.y] reads equal to @racket[(#%dot _x _y)].
|
@racket[_x.y] reads equal to @racket[(#%dot _x _y)].
|
||||||
|
|
|
@ -655,6 +655,26 @@
|
||||||
(test-write-sym (cadar l) (cadar l) (cadar l))
|
(test-write-sym (cadar l) (cadar l) (cadar l))
|
||||||
(loop (cdr l))]))
|
(loop (cdr l))]))
|
||||||
|
|
||||||
|
(let ()
|
||||||
|
(define BOM-utf8 (bytes #xEF #xBB #xBF))
|
||||||
|
|
||||||
|
(test "it-works" symbol->string
|
||||||
|
(read (open-input-bytes
|
||||||
|
(bytes-append BOM-utf8 #"it-works"))))
|
||||||
|
|
||||||
|
(test '(1 2 3) read (open-input-bytes
|
||||||
|
(bytes-append BOM-utf8
|
||||||
|
#"(" BOM-utf8 BOM-utf8
|
||||||
|
#"1" BOM-utf8
|
||||||
|
#"2" BOM-utf8
|
||||||
|
#"3" BOM-utf8 BOM-utf8 #")"
|
||||||
|
BOM-utf8)))
|
||||||
|
|
||||||
|
(test #t procedure?
|
||||||
|
(parameterize ([read-accept-reader #t])
|
||||||
|
(read-language (open-input-bytes
|
||||||
|
(bytes-append BOM-utf8 #"#lang racket/base"))))))
|
||||||
|
|
||||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; Test mid-stream EOF
|
;; Test mid-stream EOF
|
||||||
|
|
||||||
|
|
|
@ -54246,7 +54246,8 @@ static const char *startup_source =
|
||||||
" #f)"
|
" #f)"
|
||||||
"(let-values()(skip-loop_0 #f))"
|
"(let-values()(skip-loop_0 #f))"
|
||||||
"(let-values() c_0))))"
|
"(let-values() c_0))))"
|
||||||
"(if(char-whitespace? ec_0)"
|
"(if(let-values(((or-part_0)(char-whitespace? ec_0)))"
|
||||||
|
"(if or-part_0 or-part_0(eqv? '#\\uFEFF ec_0)))"
|
||||||
"(let-values()(skip-loop_0 #f))"
|
"(let-values()(skip-loop_0 #f))"
|
||||||
"(if(char=? '#\\; ec_0)"
|
"(if(char=? '#\\; ec_0)"
|
||||||
"(let-values()"
|
"(let-values()"
|
||||||
|
@ -54491,9 +54492,12 @@ static const char *startup_source =
|
||||||
" (let-values (((or-part_11) (char=? dc_0 '#\\\")))"
|
" (let-values (((or-part_11) (char=? dc_0 '#\\\")))"
|
||||||
"(if or-part_11"
|
"(if or-part_11"
|
||||||
" or-part_11"
|
" or-part_11"
|
||||||
|
"(let-values(((or-part_12)(char=? dc_0 '#\\uFEFF)))"
|
||||||
|
"(if or-part_12"
|
||||||
|
" or-part_12"
|
||||||
"(if(char=? dc_0 '#\\.)"
|
"(if(char=? dc_0 '#\\.)"
|
||||||
"(check-parameter 1/read-cdot config_0)"
|
"(check-parameter 1/read-cdot config_0)"
|
||||||
" #f))))))))))))))))))))))))))))))))"
|
" #f))))))))))))))))))))))))))))))))))"
|
||||||
"(define-values"
|
"(define-values"
|
||||||
"(char-delimiter?)"
|
"(char-delimiter?)"
|
||||||
"(lambda(c_0 config_0)(begin(readtable-char-delimiter?(read-config-readtable config_0) c_0 config_0))))"
|
"(lambda(c_0 config_0)(begin(readtable-char-delimiter?(read-config-readtable config_0) c_0 config_0))))"
|
||||||
|
|
|
@ -64072,7 +64072,8 @@
|
||||||
#f)
|
#f)
|
||||||
(skip-loop_0 #f)
|
(skip-loop_0 #f)
|
||||||
c_0))
|
c_0))
|
||||||
(if (char-whitespace? ec_0)
|
(if (let ((or-part_0 (char-whitespace? ec_0)))
|
||||||
|
(if or-part_0 or-part_0 (eqv? '#\xfeff ec_0)))
|
||||||
(skip-loop_0 #f)
|
(skip-loop_0 #f)
|
||||||
(if (char=? '#\x3b ec_0)
|
(if (char=? '#\x3b ec_0)
|
||||||
(begin
|
(begin
|
||||||
|
@ -64322,13 +64323,19 @@
|
||||||
'#\x22)))
|
'#\x22)))
|
||||||
(if or-part_11
|
(if or-part_11
|
||||||
or-part_11
|
or-part_11
|
||||||
|
(let ((or-part_12
|
||||||
|
(char=?
|
||||||
|
dc_0
|
||||||
|
'#\xfeff)))
|
||||||
|
(if or-part_12
|
||||||
|
or-part_12
|
||||||
(if (char=?
|
(if (char=?
|
||||||
dc_0
|
dc_0
|
||||||
'#\x2e)
|
'#\x2e)
|
||||||
(check-parameter
|
(check-parameter
|
||||||
1/read-cdot
|
1/read-cdot
|
||||||
config_0)
|
config_0)
|
||||||
#f))))))))))))))))))))))))))))))
|
#f))))))))))))))))))))))))))))))))
|
||||||
(define char-delimiter?
|
(define char-delimiter?
|
||||||
(lambda (c_0 config_0)
|
(lambda (c_0 config_0)
|
||||||
(readtable-char-delimiter?
|
(readtable-char-delimiter?
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
(char=? dc #\,)
|
(char=? dc #\,)
|
||||||
(char=? dc #\;)
|
(char=? dc #\;)
|
||||||
(char=? dc #\")
|
(char=? dc #\")
|
||||||
|
(char=? dc #\uFEFF) ; treat BOM as comment-like whitespace
|
||||||
(and (char=? dc #\.)
|
(and (char=? dc #\.)
|
||||||
(check-parameter read-cdot config)))]))
|
(check-parameter read-cdot config)))]))
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,9 @@
|
||||||
(not (read-config-keep-comment? config)))
|
(not (read-config-keep-comment? config)))
|
||||||
(skip-loop #f)]
|
(skip-loop #f)]
|
||||||
[else c])]
|
[else c])]
|
||||||
[(char-whitespace? ec)
|
[(or (char-whitespace? ec)
|
||||||
|
;; treat BOM as whitespace in the same sense as a comment:
|
||||||
|
(eqv? #\uFEFF ec))
|
||||||
(skip-loop #f)]
|
(skip-loop #f)]
|
||||||
[(char=? #\; ec)
|
[(char=? #\; ec)
|
||||||
(let loop ()
|
(let loop ()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user