From 060515b9cd3ca0752e90420c5dfefa5a9106ed0e Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Mon, 12 Mar 2012 17:02:27 -0600 Subject: [PATCH] extend string-literal syntax to allow surrogate-style \u pairs For example, "\uD834\uDD1E" is another way to write "\U01D11E", while "\uD834", "\uDD1E" or "\uDD1E\uD834" are still errors. --- collects/scribblings/reference/reader.scrbl | 16 ++++-- collects/tests/racket/read.rktl | 7 +++ src/racket/src/read.c | 55 +++++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/collects/scribblings/reference/reader.scrbl b/collects/scribblings/reference/reader.scrbl index e178321088..d8fccf2c2d 100644 --- a/collects/scribblings/reference/reader.scrbl +++ b/collects/scribblings/reference/reader.scrbl @@ -438,11 +438,21 @@ Within a string sequence, the following escape sequences are takes precedence over the shorter form.} @item{@as-index{@litchar{\u}@kleenerange[1 - 4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up - to four hexadecimal digits (longer sequences take precedence). + 4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up to + four hexadecimal digits (longer sequences take precedence). The resulting hexadecimal number must be a valid argument to @racket[integer->char], otherwise the - @exnraise[exn:fail:read].} + @exnraise[exn:fail:read]---unless the encoding continues with + another @litchar{\u} to form a surrogate-style encoding.} + + @item{@as-index{@litchar{\u}@kleenerange[4 + 4]{@nonterm{digit@sub{16}}}@litchar{\u}@kleenerange[4 + 4]{@nonterm{digit@sub{16}}}}: like @litchar{\u}, but for two + hexadecimal numbers, where the first is in the range + @code{#xD800} to @code{#xDBFF} and the second is in the + range @code{#xDC00} to @code{#xDFFF}; the resulting + character is the one represented by the numbers as a UTF-16 + surrogate pair.} @item{@as-index{@litchar{\U}@kleenerange[1 8]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up diff --git a/collects/tests/racket/read.rktl b/collects/tests/racket/read.rktl index 58b83d611f..776a5141b2 100644 --- a/collects/tests/racket/read.rktl +++ b/collects/tests/racket/read.rktl @@ -162,12 +162,19 @@ (test (string #\u0001) readstr "\"\\u0001\"") (test (string #\u0001 #\space) readstr "\"\\u0001 \"") (test (string #\u0001 #\1) readstr "\"\\u00011\"") +(test (string #\U10000) readstr "\"\\uD800\\uDC00\"") +(test (string #\U1D11E) readstr "\"\\uD834\\uDD1E\"") (err/rt-test (readstr "\"\\c\"") exn:fail:read?) (err/rt-test (readstr "\"\\777\"") exn:fail:read?) (err/rt-test (readstr "\"\\uD800\"") exn:fail:read?) (err/rt-test (readstr "\"\\UB0000000\"") exn:fail:read?) (err/rt-test (readstr "\"\\UFFFFFFFF\"") exn:fail:read?) +(err/rt-test (readstr "\"\\uD800\\u\"") exn:fail:read?) +(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?) +(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?) +(err/rt-test (readstr "\"\\uD800\\uDD\"") exn:fail:read?) +(err/rt-test (readstr "\"\\uD800\\uDD1\"") exn:fail:read?) (test (bytes 7) readstr "#\"\\a\"") (test (bytes 8) readstr "#\"\\b\"") diff --git a/src/racket/src/read.c b/src/racket/src/read.c index 7fe20d8aed..75711477e3 100644 --- a/src/racket/src/read.c +++ b/src/racket/src/read.c @@ -2903,6 +2903,7 @@ read_string(int is_byte, Scheme_Object *port, case 'U': if (!is_byte) { int maxc = ((ch == 'u') ? 4 : 8); + char initial[8]; ch = scheme_getc_special_ok(port); if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) { int count = 1; @@ -2910,12 +2911,66 @@ read_string(int is_byte, Scheme_Object *port, while (count < maxc) { ch = scheme_peekc_special_ok(port); if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) { + initial[count] = ch; n = n*16 + (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10)); scheme_getc(port); /* must be ch */ count++; } else break; } + if ((maxc == 4) && ((n >= 0xD800) && (n <= 0xDBFF))) { + /* Allow a surrogate-pair-like encoding, as long as + the next part is "\uD..." */ + int n2 = -1, sndp = 0; + mzchar snd[7]; + ch = scheme_getc_special_ok(port); + if (ch == '\\') { + snd[sndp++] = ch; + ch = scheme_getc_special_ok(port); + if (ch == 'u') { + snd[sndp++] = ch; + ch = scheme_getc_special_ok(port); + if ((ch == 'd') || (ch == 'D')) { + snd[sndp++] = ch; + ch = scheme_getc_special_ok(port); + if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) { + snd[sndp++] = ch; + n2 = (scheme_toupper(ch)-'A'+10); + if ((n2 >= 12) && (n2 <= 15)) { + n2 = 0xD000 | (n2 << 8); + ch = scheme_getc_special_ok(port); + if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) { + snd[sndp++] = ch; + n2 |= ((ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10)) << 4); + ch = scheme_getc_special_ok(port); + if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) { + n2 |= (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10)); + n = (((n - 0xD800) << 10) + (n2 - 0xDC00)) + 0x10000; + } else + n2 = -1; + } else + n2 = -1; + } else + n2 = -1; + } + } + } + } + if (n2 < 0) { + if (ch == SCHEME_SPECIAL) + scheme_get_ready_read_special(port, stxsrc, ht); + else if (NOT_EOF_OR_SPECIAL(ch)) + snd[sndp++] = ch; + snd[sndp] = 0; + initial[4] = 0; + if (err_ok) + scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), ch, indentation, + "read: bad or incomplete surrogate-style encoding at `\\u%s%5'", + initial, + snd); + return NULL; + } + } /* disallow surrogate points, etc */ if (((n >= 0xD800) && (n <= 0xDFFF)) || (n > 0x10FFFF)) {