From 80a7ff831f16d522fcdd87909ec1a4a90f7882ee Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Thu, 13 Nov 2014 08:33:33 -0700 Subject: [PATCH] read: reject non-Latin-1 characters in byte-string literals This is a backward-incompatible change, but the old behavior (truncate the character value to 8 bits) was never intended and seems clearly bad. --- .../scribblings/reference/reader.scrbl | 5 ++- .../racket-test/tests/racket/read.rktl | 2 ++ .../syntax-color/racket-lexer.rkt | 33 ++++++++++--------- racket/src/racket/src/error.c | 2 +- racket/src/racket/src/read.c | 11 +++++-- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/reference/reader.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/reference/reader.scrbl index 529e2e41fc..485ad75260 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/reference/reader.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/reference/reader.scrbl @@ -521,7 +521,10 @@ literal.) See @secref["bytestrings"] for information on byte strings. The resulting byte string is @tech{interned} in @racket[read-syntax] mode. Byte-string constants support the same escape sequences as -character strings, except @litchar{\u} and @litchar{\U}. +character strings, except @litchar{\u} and @litchar{\U}. Otherwise, each +character within the byte-string quotes must have a Unicode code-point number +in the range 0 to 255, which is used as the corresponding byte's value; if +a character is not in that range, the @exnraise[exn:fail:read]. When the reader encounters @as-index{@litchar{#<<}}, it starts parsing a @pidefterm{here string}. The characters following @litchar{#<<} until diff --git a/pkgs/racket-pkgs/racket-test/tests/racket/read.rktl b/pkgs/racket-pkgs/racket-test/tests/racket/read.rktl index bb6bca14d7..8e0d4690af 100644 --- a/pkgs/racket-pkgs/racket-test/tests/racket/read.rktl +++ b/pkgs/racket-pkgs/racket-test/tests/racket/read.rktl @@ -196,6 +196,8 @@ (err/rt-test (readstr "#\"\\c\"") exn:fail:read?) (err/rt-test (readstr "#\"\\777\"") exn:fail:read?) (err/rt-test (readstr "#\"\\u0040\"") exn:fail:read?) +(err/rt-test (readstr "#\"\u0100\"") exn:fail:read?) +(err/rt-test (readstr "#\"\u03BB\"") exn:fail:read?) (load-relative "numstrs.rktl") (let loop ([l number-table]) diff --git a/pkgs/syntax-color-pkgs/syntax-color-lib/syntax-color/racket-lexer.rkt b/pkgs/syntax-color-pkgs/syntax-color-lib/syntax-color/racket-lexer.rkt index d338612c7b..50759a817b 100644 --- a/pkgs/syntax-color-pkgs/syntax-color-lib/syntax-color/racket-lexer.rkt +++ b/pkgs/syntax-color-pkgs/syntax-color-lib/syntax-color/racket-lexer.rkt @@ -82,22 +82,25 @@ ;; What about byte string regexp strings [str (:or (:: (:? (:or "#px" "#rx")) "\"" (:* string-element (:: "\\" unicode)) "\"") byte-str)] - [byte-str (:: (:? (:or "#px" "#rx")) "#\"" (:* string-element) "\"")] + [byte-str (:: (:? (:or "#px" "#rx")) "#\"" (:* byte-string-element) "\"")] [string-element (:or (:~ "\"" "\\") - "\\\"" - "\\\\" - "\\a" - "\\b" - "\\t" - "\\n" - "\\v" - "\\f" - "\\r" - "\\e" - "\\'" - (:: "\\" (:** 1 3 digit8)) - (:: "\\x" (:** 1 2 digit16)) - (:: "\\" #\newline))] + string-escape)] + [byte-string-element (:or (:- (:/ "\x00" "\xFF") "\"" "\\") + string-escape)] + [string-escape (:or "\\\"" + "\\\\" + "\\a" + "\\b" + "\\t" + "\\n" + "\\v" + "\\f" + "\\r" + "\\e" + "\\'" + (:: "\\" (:** 1 3 digit8)) + (:: "\\x" (:** 1 2 digit16)) + (:: "\\" #\newline))] [bad-str (:: (:? (:or "#px" "#rx")) (:? "#") "\"" (:* (:~ "\"" "\\") diff --git a/racket/src/racket/src/error.c b/racket/src/racket/src/error.c index c9771c1c15..716801dfc8 100644 --- a/racket/src/racket/src/error.c +++ b/racket/src/racket/src/error.c @@ -391,8 +391,8 @@ static intptr_t sch_vsprintf(char *s, intptr_t maxlen, const char *msg, va_list tlen = 1; } else { mzchar mc; + mc = c; tlen = scheme_utf8_encode_all(&mc, 1, (unsigned char *)buf); - c = (int)mc; } t = buf; } diff --git a/racket/src/racket/src/read.c b/racket/src/racket/src/read.c index e549adccf4..378dd36868 100644 --- a/racket/src/racket/src/read.c +++ b/racket/src/racket/src/read.c @@ -3283,14 +3283,19 @@ read_string(int is_byte, Scheme_Object *port, } } } + } else if (is_byte && (ch > 255)) { + if (err_ok) + scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation, + "read: out-of-range character in byte string: %c", + ch); + return NULL; } if (ch < 0) { if (err_ok) scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), 0, indentation, - "read: out-of-range character in %s%s", - is_byte ? "byte " : "", - "string"); + "read: out-of-range character in %sstring", + is_byte ? "byte " : ""); return NULL; }