read: reject non-Latin-1 characters in byte-string literals

This is a backward-incompatible change, but the old behavior (truncating
the character value to 8 bits) was never intended and seems clearly bad.
This commit is contained in:
Matthew Flatt 2014-11-13 08:33:33 -07:00
parent 7ed82a5f15
commit 80a7ff831f
5 changed files with 33 additions and 20 deletions

View File

@ -521,7 +521,10 @@ literal.) See @secref["bytestrings"] for information on byte
strings. The resulting byte string is @tech{interned} in
@racket[read-syntax] mode.
Byte-string constants support the same escape sequences as
character strings, except @litchar{\u} and @litchar{\U}.
character strings, except @litchar{\u} and @litchar{\U}. Otherwise, each
character within the byte-string quotes must have a Unicode code-point number
in the range 0 to 255, which is used as the corresponding byte's value; if
a character is not in that range, the @exnraise[exn:fail:read].
When the reader encounters @as-index{@litchar{#<<}}, it starts parsing a
@pidefterm{here string}. The characters following @litchar{#<<} until

View File

@ -196,6 +196,8 @@
(err/rt-test (readstr "#\"\\c\"") exn:fail:read?)
(err/rt-test (readstr "#\"\\777\"") exn:fail:read?)
(err/rt-test (readstr "#\"\\u0040\"") exn:fail:read?)
(err/rt-test (readstr "#\"\u0100\"") exn:fail:read?)
(err/rt-test (readstr "#\"\u03BB\"") exn:fail:read?)
(load-relative "numstrs.rktl")
(let loop ([l number-table])

View File

@ -82,22 +82,25 @@
;; What about byte string regexp strings?
[str (:or (:: (:? (:or "#px" "#rx")) "\"" (:* string-element (:: "\\" unicode)) "\"")
byte-str)]
[byte-str (:: (:? (:or "#px" "#rx")) "#\"" (:* string-element) "\"")]
[byte-str (:: (:? (:or "#px" "#rx")) "#\"" (:* byte-string-element) "\"")]
[string-element (:or (:~ "\"" "\\")
"\\\""
"\\\\"
"\\a"
"\\b"
"\\t"
"\\n"
"\\v"
"\\f"
"\\r"
"\\e"
"\\'"
(:: "\\" (:** 1 3 digit8))
(:: "\\x" (:** 1 2 digit16))
(:: "\\" #\newline))]
string-escape)]
[byte-string-element (:or (:- (:/ "\x00" "\xFF") "\"" "\\")
string-escape)]
[string-escape (:or "\\\""
"\\\\"
"\\a"
"\\b"
"\\t"
"\\n"
"\\v"
"\\f"
"\\r"
"\\e"
"\\'"
(:: "\\" (:** 1 3 digit8))
(:: "\\x" (:** 1 2 digit16))
(:: "\\" #\newline))]
[bad-str (:: (:? (:or "#px" "#rx")) (:? "#") "\""
(:* (:~ "\"" "\\")

View File

@ -391,8 +391,8 @@ static intptr_t sch_vsprintf(char *s, intptr_t maxlen, const char *msg, va_list
tlen = 1;
} else {
mzchar mc;
mc = c;
tlen = scheme_utf8_encode_all(&mc, 1, (unsigned char *)buf);
c = (int)mc;
}
t = buf;
}

View File

@ -3283,14 +3283,19 @@ read_string(int is_byte, Scheme_Object *port,
}
}
}
} else if (is_byte && (ch > 255)) {
if (err_ok)
scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation,
"read: out-of-range character in byte string: %c",
ch);
return NULL;
}
if (ch < 0) {
if (err_ok)
scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation,
"read: out-of-range character in %s%s",
is_byte ? "byte " : "",
"string");
"read: out-of-range character in %sstring",
is_byte ? "byte " : "");
return NULL;
}