read: reject non-Latin-1 characters in byte-string literals

This is a backward-incompatible change, but the old behavior (truncating
each character's value to 8 bits) was never intended and seems clearly bad.
This commit is contained in:
Matthew Flatt 2014-11-13 08:33:33 -07:00
parent 7ed82a5f15
commit 80a7ff831f
5 changed files with 33 additions and 20 deletions

View File

@ -521,7 +521,10 @@ literal.) See @secref["bytestrings"] for information on byte
strings. The resulting byte string is @tech{interned} in strings. The resulting byte string is @tech{interned} in
@racket[read-syntax] mode. @racket[read-syntax] mode.
Byte-string constants support the same escape sequences as Byte-string constants support the same escape sequences as
character strings, except @litchar{\u} and @litchar{\U}. character strings, except @litchar{\u} and @litchar{\U}. Otherwise, each
character within the byte-string quotes must have a Unicode code-point number
in the range 0 to 255, which is used as the corresponding byte's value; if
a character is not in that range, the @exnraise[exn:fail:read].
When the reader encounters @as-index{@litchar{#<<}}, it starts parsing a When the reader encounters @as-index{@litchar{#<<}}, it starts parsing a
@pidefterm{here string}. The characters following @litchar{#<<} until @pidefterm{here string}. The characters following @litchar{#<<} until

View File

@ -196,6 +196,8 @@
(err/rt-test (readstr "#\"\\c\"") exn:fail:read?) (err/rt-test (readstr "#\"\\c\"") exn:fail:read?)
(err/rt-test (readstr "#\"\\777\"") exn:fail:read?) (err/rt-test (readstr "#\"\\777\"") exn:fail:read?)
(err/rt-test (readstr "#\"\\u0040\"") exn:fail:read?) (err/rt-test (readstr "#\"\\u0040\"") exn:fail:read?)
(err/rt-test (readstr "#\"\u0100\"") exn:fail:read?)
(err/rt-test (readstr "#\"\u03BB\"") exn:fail:read?)
(load-relative "numstrs.rktl") (load-relative "numstrs.rktl")
(let loop ([l number-table]) (let loop ([l number-table])

View File

@ -82,22 +82,25 @@
;; What about byte string regexp strings ;; What about byte string regexp strings
[str (:or (:: (:? (:or "#px" "#rx")) "\"" (:* string-element (:: "\\" unicode)) "\"") [str (:or (:: (:? (:or "#px" "#rx")) "\"" (:* string-element (:: "\\" unicode)) "\"")
byte-str)] byte-str)]
[byte-str (:: (:? (:or "#px" "#rx")) "#\"" (:* string-element) "\"")] [byte-str (:: (:? (:or "#px" "#rx")) "#\"" (:* byte-string-element) "\"")]
[string-element (:or (:~ "\"" "\\") [string-element (:or (:~ "\"" "\\")
"\\\"" string-escape)]
"\\\\" [byte-string-element (:or (:- (:/ "\x00" "\xFF") "\"" "\\")
"\\a" string-escape)]
"\\b" [string-escape (:or "\\\""
"\\t" "\\\\"
"\\n" "\\a"
"\\v" "\\b"
"\\f" "\\t"
"\\r" "\\n"
"\\e" "\\v"
"\\'" "\\f"
(:: "\\" (:** 1 3 digit8)) "\\r"
(:: "\\x" (:** 1 2 digit16)) "\\e"
(:: "\\" #\newline))] "\\'"
(:: "\\" (:** 1 3 digit8))
(:: "\\x" (:** 1 2 digit16))
(:: "\\" #\newline))]
[bad-str (:: (:? (:or "#px" "#rx")) (:? "#") "\"" [bad-str (:: (:? (:or "#px" "#rx")) (:? "#") "\""
(:* (:~ "\"" "\\") (:* (:~ "\"" "\\")

View File

@ -391,8 +391,8 @@ static intptr_t sch_vsprintf(char *s, intptr_t maxlen, const char *msg, va_list
tlen = 1; tlen = 1;
} else { } else {
mzchar mc; mzchar mc;
mc = c;
tlen = scheme_utf8_encode_all(&mc, 1, (unsigned char *)buf); tlen = scheme_utf8_encode_all(&mc, 1, (unsigned char *)buf);
c = (int)mc;
} }
t = buf; t = buf;
} }

View File

@ -3283,14 +3283,19 @@ read_string(int is_byte, Scheme_Object *port,
} }
} }
} }
} else if (is_byte && (ch > 255)) {
if (err_ok)
scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation,
"read: out-of-range character in byte string: %c",
ch);
return NULL;
} }
if (ch < 0) { if (ch < 0) {
if (err_ok) if (err_ok)
scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation, scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation,
"read: out-of-range character in %s%s", "read: out-of-range character in %sstring",
is_byte ? "byte " : "", is_byte ? "byte " : "");
"string");
return NULL; return NULL;
} }