extend string-literal syntax to allow surrogate-style \u pairs

For example, "\uD834\uDD1E" is another way to write "\U01D11E",
while "\uD834", "\uDD1E" or "\uDD1E\uD834" are still errors.
This commit is contained in:
Matthew Flatt 2012-03-12 17:02:27 -06:00
parent abfc3de9ba
commit 060515b9cd
3 changed files with 75 additions and 3 deletions

View File

@ -438,11 +438,21 @@ Within a string sequence, the following escape sequences are
takes precedence over the shorter form.}
@item{@as-index{@litchar{\u}@kleenerange[1
4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up
to four hexadecimal digits (longer sequences take precedence).
4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up to
four hexadecimal digits (longer sequences take precedence).
The resulting hexadecimal number must be a valid argument to
@racket[integer->char], otherwise the
@exnraise[exn:fail:read].}
@exnraise[exn:fail:read]---unless the encoding continues with
another @litchar{\u} to form a surrogate-style encoding.}
@item{@as-index{@litchar{\u}@kleenerange[4
4]{@nonterm{digit@sub{16}}}@litchar{\u}@kleenerange[4
4]{@nonterm{digit@sub{16}}}}: like @litchar{\u}, but for two
hexadecimal numbers, where the first is in the range
@code{#xD800} to @code{#xDBFF} and the second is in the
range @code{#xDC00} to @code{#xDFFF}; the resulting
character is the one represented by the numbers as a UTF-16
surrogate pair.}
@item{@as-index{@litchar{\U}@kleenerange[1
8]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up

View File

@ -162,12 +162,19 @@
(test (string #\u0001) readstr "\"\\u0001\"")
(test (string #\u0001 #\space) readstr "\"\\u0001 \"")
(test (string #\u0001 #\1) readstr "\"\\u00011\"")
(test (string #\U10000) readstr "\"\\uD800\\uDC00\"")
(test (string #\U1D11E) readstr "\"\\uD834\\uDD1E\"")
(err/rt-test (readstr "\"\\c\"") exn:fail:read?)
(err/rt-test (readstr "\"\\777\"") exn:fail:read?)
(err/rt-test (readstr "\"\\uD800\"") exn:fail:read?)
(err/rt-test (readstr "\"\\UB0000000\"") exn:fail:read?)
(err/rt-test (readstr "\"\\UFFFFFFFF\"") exn:fail:read?)
(err/rt-test (readstr "\"\\uD800\\u\"") exn:fail:read?)
(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?)
(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?)
(err/rt-test (readstr "\"\\uD800\\uDD\"") exn:fail:read?)
(err/rt-test (readstr "\"\\uD800\\uDD1\"") exn:fail:read?)
(test (bytes 7) readstr "#\"\\a\"")
(test (bytes 8) readstr "#\"\\b\"")

View File

@ -2903,6 +2903,7 @@ read_string(int is_byte, Scheme_Object *port,
case 'U':
if (!is_byte) {
int maxc = ((ch == 'u') ? 4 : 8);
char initial[8];
ch = scheme_getc_special_ok(port);
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
int count = 1;
@ -2910,12 +2911,66 @@ read_string(int is_byte, Scheme_Object *port,
while (count < maxc) {
ch = scheme_peekc_special_ok(port);
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
initial[count] = ch;
n = n*16 + (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10));
scheme_getc(port); /* must be ch */
count++;
} else
break;
}
if ((maxc == 4) && ((n >= 0xD800) && (n <= 0xDBFF))) {
/* Allow a surrogate-pair-like encoding, as long as
the next part is "\uD..." */
int n2 = -1, sndp = 0;
mzchar snd[7];
ch = scheme_getc_special_ok(port);
if (ch == '\\') {
snd[sndp++] = ch;
ch = scheme_getc_special_ok(port);
if (ch == 'u') {
snd[sndp++] = ch;
ch = scheme_getc_special_ok(port);
if ((ch == 'd') || (ch == 'D')) {
snd[sndp++] = ch;
ch = scheme_getc_special_ok(port);
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
snd[sndp++] = ch;
n2 = (scheme_toupper(ch)-'A'+10);
if ((n2 >= 12) && (n2 <= 15)) {
n2 = 0xD000 | (n2 << 8);
ch = scheme_getc_special_ok(port);
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
snd[sndp++] = ch;
n2 |= ((ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10)) << 4);
ch = scheme_getc_special_ok(port);
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
n2 |= (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10));
n = (((n - 0xD800) << 10) + (n2 - 0xDC00)) + 0x10000;
} else
n2 = -1;
} else
n2 = -1;
} else
n2 = -1;
}
}
}
}
if (n2 < 0) {
if (ch == SCHEME_SPECIAL)
scheme_get_ready_read_special(port, stxsrc, ht);
else if (NOT_EOF_OR_SPECIAL(ch))
snd[sndp++] = ch;
snd[sndp] = 0;
initial[4] = 0;
if (err_ok)
scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), ch, indentation,
"read: bad or incomplete surrogate-style encoding at `\\u%s%5'",
initial,
snd);
return NULL;
}
}
/* disallow surrogate points, etc */
if (((n >= 0xD800) && (n <= 0xDFFF))
|| (n > 0x10FFFF)) {