extend string-literal syntax to allow surrogate-style \u pairs
For example, "\uD834\uDD1E" is another way to write "\U01D11E", while "\uD834", "\uDD1E" or "\uDD1E\uD834" are still errors.
This commit is contained in:
parent
abfc3de9ba
commit
060515b9cd
|
@ -438,11 +438,21 @@ Within a string sequence, the following escape sequences are
|
|||
takes precedence over the shorter form.}
|
||||
|
||||
@item{@as-index{@litchar{\u}@kleenerange[1
|
||||
4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up
|
||||
to four hexadecimal digits (longer sequences take precedence).
|
||||
4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up to
|
||||
four hexadecimal digits (longer sequences take precedence).
|
||||
The resulting hexadecimal number must be a valid argument to
|
||||
@racket[integer->char], otherwise the
|
||||
@exnraise[exn:fail:read].}
|
||||
@exnraise[exn:fail:read]---unless the encoding continues with
|
||||
another @litchar{\u} to form a surrogate-style encoding.}
|
||||
|
||||
@item{@as-index{@litchar{\u}@kleenerange[4
|
||||
4]{@nonterm{digit@sub{16}}}@litchar{\u}@kleenerange[4
|
||||
4]{@nonterm{digit@sub{16}}}}: like @litchar{\u}, but for two
|
||||
hexadecimal numbers, where the first is in the range
|
||||
@code{#xD800} to @code{#xDBFF} and the second is in the
|
||||
range @code{#xDC00} to @code{#xDFFF}; the resulting
|
||||
character is the one represented by the numbers as a UTF-16
|
||||
surrogate pair.}
|
||||
|
||||
@item{@as-index{@litchar{\U}@kleenerange[1
|
||||
8]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up
|
||||
|
|
|
@ -162,12 +162,19 @@
|
|||
(test (string #\u0001) readstr "\"\\u0001\"")
|
||||
(test (string #\u0001 #\space) readstr "\"\\u0001 \"")
|
||||
(test (string #\u0001 #\1) readstr "\"\\u00011\"")
|
||||
(test (string #\U10000) readstr "\"\\uD800\\uDC00\"")
|
||||
(test (string #\U1D11E) readstr "\"\\uD834\\uDD1E\"")
|
||||
|
||||
(err/rt-test (readstr "\"\\c\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\777\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\uD800\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\UB0000000\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\UFFFFFFFF\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\uD800\\u\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\uD800\\uDD\"") exn:fail:read?)
|
||||
(err/rt-test (readstr "\"\\uD800\\uDD1\"") exn:fail:read?)
|
||||
|
||||
(test (bytes 7) readstr "#\"\\a\"")
|
||||
(test (bytes 8) readstr "#\"\\b\"")
|
||||
|
|
|
@ -2903,6 +2903,7 @@ read_string(int is_byte, Scheme_Object *port,
|
|||
case 'U':
|
||||
if (!is_byte) {
|
||||
int maxc = ((ch == 'u') ? 4 : 8);
|
||||
char initial[8];
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
|
||||
int count = 1;
|
||||
|
@ -2910,12 +2911,66 @@ read_string(int is_byte, Scheme_Object *port,
|
|||
while (count < maxc) {
|
||||
ch = scheme_peekc_special_ok(port);
|
||||
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
|
||||
initial[count] = ch;
|
||||
n = n*16 + (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10));
|
||||
scheme_getc(port); /* must be ch */
|
||||
count++;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
if ((maxc == 4) && ((n >= 0xD800) && (n <= 0xDBFF))) {
|
||||
/* Allow a surrogate-pair-like encoding, as long as
|
||||
the next part is "\uD..." */
|
||||
int n2 = -1, sndp = 0;
|
||||
mzchar snd[7];
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if (ch == '\\') {
|
||||
snd[sndp++] = ch;
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if (ch == 'u') {
|
||||
snd[sndp++] = ch;
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if ((ch == 'd') || (ch == 'D')) {
|
||||
snd[sndp++] = ch;
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
|
||||
snd[sndp++] = ch;
|
||||
n2 = (scheme_toupper(ch)-'A'+10);
|
||||
if ((n2 >= 12) && (n2 <= 15)) {
|
||||
n2 = 0xD000 | (n2 << 8);
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
|
||||
snd[sndp++] = ch;
|
||||
n2 |= ((ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10)) << 4);
|
||||
ch = scheme_getc_special_ok(port);
|
||||
if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
|
||||
n2 |= (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10));
|
||||
n = (((n - 0xD800) << 10) + (n2 - 0xDC00)) + 0x10000;
|
||||
} else
|
||||
n2 = -1;
|
||||
} else
|
||||
n2 = -1;
|
||||
} else
|
||||
n2 = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (n2 < 0) {
|
||||
if (ch == SCHEME_SPECIAL)
|
||||
scheme_get_ready_read_special(port, stxsrc, ht);
|
||||
else if (NOT_EOF_OR_SPECIAL(ch))
|
||||
snd[sndp++] = ch;
|
||||
snd[sndp] = 0;
|
||||
initial[4] = 0;
|
||||
if (err_ok)
|
||||
scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), ch, indentation,
|
||||
"read: bad or incomplete surrogate-style encoding at `\\u%s%5'",
|
||||
initial,
|
||||
snd);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
/* disallow surrogate points, etc */
|
||||
if (((n >= 0xD800) && (n <= 0xDFFF))
|
||||
|| (n > 0x10FFFF)) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user