reader: treat BOM like whitespace in places where comments are allowed
Some text-editing tools on Windows include a BOM (byte-order mark) character, encoded as UTF-8, at the start of a file that is intended as UTF-8. The general recommendation for UTF-8 is to *not* include a BOM --- but, well, Windows. When a BOM is there, meanwhile, the recommendation is to preserve it in the stream, so always discarding an initial BOM at the file-port level is not a good idea. A new file mode would make sense, but distinctions like 'text and 'binary mode have turned out to be best avoided. Although I'm not sure it's really a good idea, treating a BOM character as whitespace in the reader (at least in positions where comments are allowed) is an easy way around the problem for text files that are intended as programs. Closes #1114
This commit is contained in:
parent
ace1c6a128
commit
d280462250
|
@ -53,8 +53,8 @@ necessarily produce an @tech{interned} value at the receiving
|
|||
@;------------------------------------------------------------------------
|
||||
@section[#:tag "default-readtable-dispatch"]{Delimiters and Dispatch}
|
||||
|
||||
Along with @racketlink[char-whitespace?]{whitespace}, the following
|
||||
characters are @defterm{delimiters}:
|
||||
Along with @racketlink[char-whitespace?]{whitespace} and a BOM
|
||||
character, the following characters are @defterm{delimiters}:
|
||||
|
||||
@t{
|
||||
@hspace[2] @ilitchar{(} @ilitchar{)} @ilitchar{[} @ilitchar{]}
|
||||
|
@ -86,8 +86,9 @@ characters play special roles:
|
|||
|
||||
]
|
||||
|
||||
More precisely, after skipping whitespace, the reader dispatches based
|
||||
on the next character or characters in the input stream as follows:
|
||||
More precisely, after skipping whitespace and @racket[#\uFEFF] BOM
|
||||
characters, the reader dispatches based on the next character or
|
||||
characters in the input stream as follows:
|
||||
|
||||
@dispatch-table[
|
||||
|
||||
|
@ -189,6 +190,11 @@ on the next character or characters in the input stream as follows:
|
|||
|
||||
]
|
||||
|
||||
@history[#:changed "7.8.0.9" @elem{Changed treatment of the BOM
|
||||
character so that it is treated
|
||||
like whitespace in the same places
|
||||
that comments are allowed.}]
|
||||
|
||||
|
||||
@section[#:tag "parse-symbol"]{Reading Symbols}
|
||||
|
||||
|
@ -978,7 +984,7 @@ numbers are followed by a @litchar{.} intended to be read as a C-style
|
|||
infix dot, then a delimiter must precede the @litchar{.}.
|
||||
|
||||
Finally, after reading any datum @racket[_x], the reader will seek
|
||||
through whitespace and comments and look for zero or more sequences of a
|
||||
through whitespace, BOM characters, and comments and look for zero or more sequences of a
|
||||
@litchar{.} followed by another datum @racket[_y]. It will then group
|
||||
@racket[_x] and @racket[_y] together in a @racket[#%dot] form so that
|
||||
@racket[_x.y] reads equal to @racket[(#%dot _x _y)].
|
||||
|
|
|
@ -655,6 +655,26 @@
|
|||
(test-write-sym (cadar l) (cadar l) (cadar l))
|
||||
(loop (cdr l))]))
|
||||
|
||||
(let ()
|
||||
(define BOM-utf8 (bytes #xEF #xBB #xBF))
|
||||
|
||||
(test "it-works" symbol->string
|
||||
(read (open-input-bytes
|
||||
(bytes-append BOM-utf8 #"it-works"))))
|
||||
|
||||
(test '(1 2 3) read (open-input-bytes
|
||||
(bytes-append BOM-utf8
|
||||
#"(" BOM-utf8 BOM-utf8
|
||||
#"1" BOM-utf8
|
||||
#"2" BOM-utf8
|
||||
#"3" BOM-utf8 BOM-utf8 #")"
|
||||
BOM-utf8)))
|
||||
|
||||
(test #t procedure?
|
||||
(parameterize ([read-accept-reader #t])
|
||||
(read-language (open-input-bytes
|
||||
(bytes-append BOM-utf8 #"#lang racket/base"))))))
|
||||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Test mid-stream EOF
|
||||
|
||||
|
|
|
@ -54246,7 +54246,8 @@ static const char *startup_source =
|
|||
" #f)"
|
||||
"(let-values()(skip-loop_0 #f))"
|
||||
"(let-values() c_0))))"
|
||||
"(if(char-whitespace? ec_0)"
|
||||
"(if(let-values(((or-part_0)(char-whitespace? ec_0)))"
|
||||
"(if or-part_0 or-part_0(eqv? '#\\uFEFF ec_0)))"
|
||||
"(let-values()(skip-loop_0 #f))"
|
||||
"(if(char=? '#\\; ec_0)"
|
||||
"(let-values()"
|
||||
|
@ -54491,9 +54492,12 @@ static const char *startup_source =
|
|||
" (let-values (((or-part_11) (char=? dc_0 '#\\\")))"
|
||||
"(if or-part_11"
|
||||
" or-part_11"
|
||||
"(let-values(((or-part_12)(char=? dc_0 '#\\uFEFF)))"
|
||||
"(if or-part_12"
|
||||
" or-part_12"
|
||||
"(if(char=? dc_0 '#\\.)"
|
||||
"(check-parameter 1/read-cdot config_0)"
|
||||
" #f))))))))))))))))))))))))))))))))"
|
||||
" #f))))))))))))))))))))))))))))))))))"
|
||||
"(define-values"
|
||||
"(char-delimiter?)"
|
||||
"(lambda(c_0 config_0)(begin(readtable-char-delimiter?(read-config-readtable config_0) c_0 config_0))))"
|
||||
|
|
|
@ -64072,7 +64072,8 @@
|
|||
#f)
|
||||
(skip-loop_0 #f)
|
||||
c_0))
|
||||
(if (char-whitespace? ec_0)
|
||||
(if (let ((or-part_0 (char-whitespace? ec_0)))
|
||||
(if or-part_0 or-part_0 (eqv? '#\xfeff ec_0)))
|
||||
(skip-loop_0 #f)
|
||||
(if (char=? '#\x3b ec_0)
|
||||
(begin
|
||||
|
@ -64322,13 +64323,19 @@
|
|||
'#\x22)))
|
||||
(if or-part_11
|
||||
or-part_11
|
||||
(if (char=?
|
||||
dc_0
|
||||
'#\x2e)
|
||||
(check-parameter
|
||||
1/read-cdot
|
||||
config_0)
|
||||
#f))))))))))))))))))))))))))))))
|
||||
(let ((or-part_12
|
||||
(char=?
|
||||
dc_0
|
||||
'#\xfeff)))
|
||||
(if or-part_12
|
||||
or-part_12
|
||||
(if (char=?
|
||||
dc_0
|
||||
'#\x2e)
|
||||
(check-parameter
|
||||
1/read-cdot
|
||||
config_0)
|
||||
#f))))))))))))))))))))))))))))))))
|
||||
(define char-delimiter?
|
||||
(lambda (c_0 config_0)
|
||||
(readtable-char-delimiter?
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
(char=? dc #\,)
|
||||
(char=? dc #\;)
|
||||
(char=? dc #\")
|
||||
(char=? dc #\uFEFF) ; treat BOM as comment-like whitespace
|
||||
(and (char=? dc #\.)
|
||||
(check-parameter read-cdot config)))]))
|
||||
|
||||
|
|
|
@ -31,7 +31,9 @@
|
|||
(not (read-config-keep-comment? config)))
|
||||
(skip-loop #f)]
|
||||
[else c])]
|
||||
[(char-whitespace? ec)
|
||||
[(or (char-whitespace? ec)
|
||||
;; treat BOM as whitespace in the same sense as a comment:
|
||||
(eqv? #\uFEFF ec))
|
||||
(skip-loop #f)]
|
||||
[(char=? #\; ec)
|
||||
(let loop ()
|
||||
|
|
Loading…
Reference in New Issue
Block a user