extend string-literal syntax to allow surrogate-style \u pairs

For example, "\uD834\uDD1E" is another way to write "\U01D11E", while "\uD834", "\uDD1E" or "\uDD1E\uD834" are still errors.
2012-03-12 17:02:27 -06:00 · 2012-03-12 17:02:27 -06:00 · 060515b9cd
commit 060515b9cd
parent abfc3de9ba
3 changed files with 75 additions and 3 deletions
--- a/collects/scribblings/reference/reader.scrbl
+++ b/collects/scribblings/reference/reader.scrbl
@ -438,11 +438,21 @@ Within a string sequence, the following escape sequences are
       takes precedence over the shorter form.}

 @item{@as-index{@litchar{\u}@kleenerange[1
-       4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up
-       to four hexadecimal digits (longer sequences take precedence).
+       4]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up to
+       four hexadecimal digits (longer sequences take precedence).
       The resulting hexadecimal number must be a valid argument to
       @racket[integer->char], otherwise the
-       @exnraise[exn:fail:read].}
+       @exnraise[exn:fail:read]---unless the encoding continues with
+       another @litchar{\u} to form a surrogate-style encoding.}
+
+ @item{@as-index{@litchar{\u}@kleenerange[4
+       4]{@nonterm{digit@sub{16}}}@litchar{\u}@kleenerange[4
+       4]{@nonterm{digit@sub{16}}}}: like @litchar{\u}, but for two
+       hexadecimal numbers, where the first is in the range
+       @code{#xD800} to @code{#xDBFF} and the second is in the
+       range @code{#xDC00} to @code{#xDFFF}; the resulting
+       character is the one represented by the numbers as a UTF-16
+       surrogate pair.}

 @item{@as-index{@litchar{\U}@kleenerange[1
       8]{@nonterm{digit@sub{16}}}}: like @litchar{\x}, but with up
--- a/collects/tests/racket/read.rktl
+++ b/collects/tests/racket/read.rktl
@ -162,12 +162,19 @@
 (test (string #\u0001) readstr "\"\\u0001\"")
 (test (string #\u0001 #\space) readstr "\"\\u0001 \"")
 (test (string #\u0001 #\1) readstr "\"\\u00011\"")
+(test (string #\U10000) readstr "\"\\uD800\\uDC00\"")
+(test (string #\U1D11E) readstr "\"\\uD834\\uDD1E\"")

 (err/rt-test (readstr "\"\\c\"") exn:fail:read?)
 (err/rt-test (readstr "\"\\777\"") exn:fail:read?)
 (err/rt-test (readstr "\"\\uD800\"") exn:fail:read?)
 (err/rt-test (readstr "\"\\UB0000000\"") exn:fail:read?)
 (err/rt-test (readstr "\"\\UFFFFFFFF\"") exn:fail:read?)
+(err/rt-test (readstr "\"\\uD800\\u\"") exn:fail:read?)
+(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?)
+(err/rt-test (readstr "\"\\uD800\\uD\"") exn:fail:read?)
+(err/rt-test (readstr "\"\\uD800\\uDD\"") exn:fail:read?)
+(err/rt-test (readstr "\"\\uD800\\uDD1\"") exn:fail:read?)

 (test (bytes 7) readstr "#\"\\a\"")
 (test (bytes 8) readstr "#\"\\b\"")
--- a/src/racket/src/read.c
+++ b/src/racket/src/read.c
@ -2903,6 +2903,7 @@ read_string(int is_byte, Scheme_Object *port,
      case 'U':
 	if (!is_byte) {
 	  int maxc = ((ch == 'u') ? 4 : 8);
+          char initial[8];
 	  ch = scheme_getc_special_ok(port);
 	  if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
 	    int count = 1;
@ -2910,12 +2911,66 @@ read_string(int is_byte, Scheme_Object *port,
 	    while (count < maxc) {
 	      ch = scheme_peekc_special_ok(port);
 	      if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
+                initial[count] = ch;
 		n = n*16 + (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10));
 		scheme_getc(port); /* must be ch */
 		count++;
 	      } else
 		break;
 	    }
+            if ((maxc == 4) && ((n >= 0xD800) && (n <= 0xDBFF))) {
+              /* Allow a surrogate-pair-like encoding, as long as
+                 the next part is "\uD..." */
+              int n2 = -1, sndp = 0;
+              mzchar snd[7];
+              ch = scheme_getc_special_ok(port);
+              if (ch == '\\') {
+                snd[sndp++] = ch;
+                ch = scheme_getc_special_ok(port);
+                if (ch == 'u') {
+                  snd[sndp++] = ch;
+                  ch = scheme_getc_special_ok(port);
+                  if ((ch == 'd') || (ch == 'D')) {
+                    snd[sndp++] = ch;
+                    ch = scheme_getc_special_ok(port);
+                    if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
+                      snd[sndp++] = ch;
+                      n2 = (scheme_toupper(ch)-'A'+10);
+                      if ((n2 >= 12) && (n2 <= 15)) {
+                        n2 = 0xD000 | (n2 << 8);
+                        ch = scheme_getc_special_ok(port);
+                        if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
+                          snd[sndp++] = ch;
+                          n2 |= ((ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10)) << 4);
+                          ch = scheme_getc_special_ok(port);
+                          if (NOT_EOF_OR_SPECIAL(ch) && scheme_isxdigit(ch)) {
+                            n2 |= (ch<='9' ? ch-'0' : (scheme_toupper(ch)-'A'+10));
+                            n = (((n - 0xD800) << 10) + (n2 - 0xDC00)) + 0x10000;
+                          } else
+                            n2 = -1;
+                        } else
+                          n2 = -1;
+                      } else
+                        n2 = -1;
+                    }
+                  }
+                }
+              }
+              if (n2 < 0) {
+                if (ch == SCHEME_SPECIAL)
+                  scheme_get_ready_read_special(port, stxsrc, ht);
+                else if (NOT_EOF_OR_SPECIAL(ch))
+                  snd[sndp++] = ch;
+                snd[sndp] = 0;
+                initial[4] = 0;
+                if (err_ok)
+                  scheme_read_err(port, stxsrc, line, col, pos, SPAN(port, pos), ch, indentation,
+                                  "read: bad or incomplete surrogate-style encoding at `\\u%s%5'",
+                                  initial,
+                                  snd);
+                return NULL;
+              }
+            }
 	    /* disallow surrogate points, etc */
 	    if (((n >= 0xD800) && (n <= 0xDFFF))
 		|| (n > 0x10FFFF)) {