From 7b85b853110b095c6cf067a4902f333c891538dd Mon Sep 17 00:00:00 2001
From: Matthew Flatt <mflatt@racket-lang.org>
Date: Wed, 10 May 2006 14:55:25 +0000
Subject: [PATCH] add built-in UTF-16 bytes converter

svn: r2899
---
 src/mzscheme/src/string.c | 194 ++++++++++++++++++++++++++++++++++----
 1 file changed, 176 insertions(+), 18 deletions(-)

diff --git a/src/mzscheme/src/string.c b/src/mzscheme/src/string.c
index 85fa7c8dac..6e8fd0265b 100644
--- a/src/mzscheme/src/string.c
+++ b/src/mzscheme/src/string.c
@@ -148,6 +148,8 @@ static void init_iconv() { }
 
 #define mzICONV_KIND 0
 #define mzUTF8_KIND 1
+#define mzUTF8_TO_UTF16_KIND 2
+#define mzUTF16_TO_UTF8_KIND 3
 
 typedef struct Scheme_Converter {
   Scheme_Object so;
@@ -273,6 +275,9 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
 			 long *ipos, long *jpos,
 			 char compact, char utf16,
 			 int *state, int might_continue, int permissive);
+static int utf8_encode_x(const unsigned int *us, int start, int end,
+			 unsigned char *s, int dstart, int dend,
+			 long *_ipos, long *_opos, char utf16);
 
 static char *string_to_from_locale(int to_bytes,
 				   char *in, int delta, int len,
@@ -4040,6 +4045,22 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
       permissive = 0;
     cd = (iconv_t)-1;
     need_regis = (*to_e && *from_e);
+  } else if ((!strcmp(from_e, "platform-UTF-8")
+	      || !strcmp(from_e, "platform-UTF-8-permissive"))
+	     && !strcmp(to_e, "platform-UTF-16")) {
+    kind = mzUTF8_TO_UTF16_KIND;
+    if (!strcmp(from_e, "platform-UTF-8-permissive"))
+      permissive = '?';
+    else
+      permissive = 0;
+    cd = (iconv_t)-1;
+    need_regis = 0;
+  } else if (!strcmp(from_e, "platform-UTF-16")
+	     && !strcmp(to_e, "platform-UTF-8")) {
+    kind = mzUTF16_TO_UTF8_KIND;
+    permissive = 0;
+    cd = (iconv_t)-1;
+    need_regis = 0;
   } else {
     if (!iconv_ready) init_iconv();
 
@@ -4183,15 +4204,95 @@ static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Ob
 
   instr = ((opos > 1) ? SCHEME_BYTE_STR_VAL(argv[1]) : NULL);
 
-  if (c->kind == mzUTF8_KIND) {
-    /* UTF-8 -> UTF-8 "identity" converter, but maybe permissive */
+  if (c->kind == mzUTF16_TO_UTF8_KIND) {
+    if (istart & 0x1) {
+      /* Copy to word-align */
+      char *c;
+      c = (char *)scheme_malloc_atomic(ifinish - istart);
+      memcpy(c, instr XFORM_OK_PLUS istart, ifinish - istart);
+      ifinish = ifinish - istart;
+      istart = 0;
+      instr = c;
+    }
+
+    status = utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
+			   (unsigned char *)r, ostart, ofinish,
+			   &amt_read, &amt_wrote, 1);
+    
+    amt_read -= (istart >> 1);
+
+    if (amt_read) {
+      if (!r) {
+	/* Need to allocate, then do it again: */
+	r = (char *)scheme_malloc_atomic(amt_wrote + 1);
+	utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
+		      (unsigned char *)r, ostart, ofinish,
+		      NULL, NULL, 1);
+	r[amt_wrote] = 0;
+      }
+      amt_read <<= 1;
+    }
+
+    /* We might get a -1 result because the input has an odd number of
+       bytes, and 2nd+next-to-last bytes form an unpaired
+       surrogate. In that case, the transformer normally needs one
+       more byte: Windows is little-endian, so we need the byte to
+       tell whether the surrogate is paired, and for all other
+       platforms (where we assume that surrogates are paired), we need
+       the byte to generate output. Technically, on a big-endian
+       non-Windows machine, we could generate the first byte of UTF-8
+       output and keep the byte as state, but we don't. */
+
+    if (status != -1) {
+      if (amt_read < ((ifinish - istart) & ~0x1)) {
+	/* Must have run out of output space */
+	status = 1;
+      } else {
+	/* Read all of input --- but it wasn't really all if there
+	   was an odd number of bytes. */
+	if ((ifinish - istart) & 0x1)
+	  status = -1;
+	else
+	  status = 0;
+      }
+    }
+  } else if (c->kind != mzICONV_KIND) {
+    /* UTF-8 -> UTF-{8,16} "identity" converter, but maybe permissive */
     if (instr) {
+      long _ostart, _ofinish;
+      int utf16;
+
+      if (c->kind == mzUTF8_TO_UTF16_KIND) {
+	_ostart = ostart;
+	_ofinish = ofinish;
+	if (_ostart & 0x1)
+	  _ostart++;
+	_ostart >>= 1;
+	if (_ofinish > 0)
+	  _ofinish >>= 1;
+	utf16 = 1;
+      } else {
+	_ostart = ostart;
+	_ofinish = ofinish;
+	utf16 = 0;
+      }
+
       status = utf8_decode_x((unsigned char *)instr, istart, ifinish,
-			     (unsigned int *)r, ostart, ofinish,
+			     (unsigned int *)r, _ostart, _ofinish,
 			     &amt_read, &amt_wrote,
-			     1, 0, NULL, 1, c->permissive);
+			     1, utf16, NULL, 1, c->permissive);
+      
+      if (utf16) {
+	_ostart <<= 1;
+	amt_wrote <<= 1;
+	if ((ostart & 0x1) && (amt_wrote > _ostart)) {
+	  /* Shift down one byte: */
+	  memmove(r XFORM_OK_PLUS ostart, r XFORM_OK_PLUS _ostart, amt_wrote - _ostart);
+	}
+      }
+
       amt_read -= istart;
-      amt_wrote -= ostart;
+      amt_wrote -= _ostart;
       if (status == -3) {
 	/* r is not NULL; ran out of room */
 	status = 1;
@@ -4201,9 +4302,9 @@ static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Ob
 	    /* Need to allocate, then do it again: */
 	    r = (char *)scheme_malloc_atomic(amt_wrote + 1);
 	    utf8_decode_x((unsigned char *)instr, istart, ifinish,
-			  (unsigned int *)r, ostart, ofinish,
+			  (unsigned int *)r, ostart, _ofinish,
 			  NULL, NULL,
-			  1, 0, NULL, 1, c->permissive);
+			  1, utf16, NULL, 1, c->permissive);
 	    r[amt_wrote] = 0;
 	  }
 	} else if (!r)
@@ -4311,7 +4412,7 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
 	and [d]end) before return, unless they are NULL.
 
 	compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16
-	!compact && utf16 => decode extended UTF-8 that allows surrogates
+	for Windows for utf16, decode extended UTF-8 that allows surrogates
 
 	_state provides initial state and is filled with ending state;
 	when it's not NULL, the us must be NULL
@@ -4478,7 +4579,7 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
 	  if (v > 0xFFFF) {
 	    if (us) {
 	      v -= 0x10000;
-	      if (j + 1 >= dstart)
+	      if (j + 1 >= dend)
 		break;
 	      ((unsigned short *)us)[j] = 0xD800 | ((v >> 10) & 0x3FF);
 	      ((unsigned short *)us)[j+1] = 0xDC00 | (v & 0x3FF);
@@ -4669,11 +4770,17 @@ int scheme_utf8_decode_count(const unsigned char *s, int start, int end,
   return pos;
 }
 
-int scheme_utf8_encode(const unsigned int *us, int start, int end,
-		       unsigned char *s, int dstart,
-		       char utf16)
+static int utf8_encode_x(const unsigned int *us, int start, int end,
+			 unsigned char *s, int dstart, int dend,
+			 long *_ipos, long *_opos, char utf16)
+  /* Results:
+        -1 => input ended in the middle of an encoding - only when utf16 and _opos
+	non-negative => reports number of bytes/code-units produced */
 {
-  int i, j;
+  int i, j, done = start;
+
+  if (dend < 0)
+    dend = 0x7FFFFFFF;
 
   if (!s) {
     unsigned int wc;
@@ -4683,7 +4790,20 @@ int scheme_utf8_encode(const unsigned int *us, int start, int end,
 	wc = ((unsigned short *)us)[i];
 	if ((wc & 0xF800) == 0xD800) {
 	  /* Unparse surrogates. We assume that the surrogates are
-	     well formed, unless this is Windows. */
+	     well formed, unless this is Windows or if we're at the
+             end and _opos is 0. */
+# ifdef WINDOWS_UNICODE_SUPPORT
+#  define UNPAIRED_MASK 0xFC00
+# else
+#  define UNPAIRED_MASK 0xF800
+# endif
+	  if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) {
+	    /* Ended in the middle of a surrogate pair */
+	    *_opos = j;
+	    if (_ipos)
+	      *_ipos = i;
+	    return -1;
+	  }
 # ifdef WINDOWS_UNICODE_SUPPORT
 	  if ((wc & 0xFC00) != 0xD800) {
 	    /* Count as one */
@@ -4714,6 +4834,10 @@ int scheme_utf8_encode(const unsigned int *us, int start, int end,
 	j += 6;
       }
     }
+    if (_ipos)
+      *_ipos = i;
+    if (_opos)
+      *_opos = j + dstart;
     return j;
   } else {
     unsigned int wc;
@@ -4723,7 +4847,15 @@ int scheme_utf8_encode(const unsigned int *us, int start, int end,
 	wc = ((unsigned short *)us)[i];
 	if ((wc & 0xF800) == 0xD800) {
 	  /* Unparse surrogates. We assume that the surrogates are
-	     well formed on non-Windows platforms. */
+	     well formed on non-Windows platforms, but when _opos,
+	     we detect ending in the middle of an surrogate pair. */
+	  if (((i + 1) == end) && ((wc & UNPAIRED_MASK) == 0xD800) && _opos) {
+	    /* Ended in the middle of a surrogate pair */
+	    *_opos = j;
+	    if (_ipos)
+	      *_ipos = i;
+	    return -1;
+	  }
 # ifdef WINDOWS_UNICODE_SUPPORT
 	  if ((wc & 0xFC00) != 0xD800) {
 	    /* Let the misplaced surrogate through */
@@ -4743,26 +4875,38 @@ int scheme_utf8_encode(const unsigned int *us, int start, int end,
       }
 
       if (wc < 0x80) {
+	if (j + 1 > dend)
+	  break;
 	s[j++] = wc;
       } else if (wc < 0x800) {
+	if (j + 2 > dend)
+	  break;
 	s[j++] = 0xC0 | ((wc & 0x7C0) >> 6);
 	s[j++] = 0x80 | (wc & 0x3F);
       } else if (wc < 0x10000) {
+	if (j + 3 > dend)
+	  break;
 	s[j++] = 0xE0 | ((wc & 0xF000) >> 12);
 	s[j++] = 0x80 | ((wc & 0x0FC0) >> 6);
 	s[j++] = 0x80 | (wc & 0x3F);
       } else if (wc < 0x200000) {
+	if (j + 4 > dend)
+	  break;
 	s[j++] = 0xF0 | ((wc & 0x1C0000) >> 18);
 	s[j++] = 0x80 | ((wc & 0x03F000) >> 12);
 	s[j++] = 0x80 | ((wc & 0x000FC0) >> 6);
 	s[j++] = 0x80 | (wc & 0x3F);
       } else if (wc < 0x4000000) {
+	if (j + 5 > dend)
+	  break;
 	s[j++] = 0xF8 | ((wc & 0x3000000) >> 24);
 	s[j++] = 0x80 | ((wc & 0x0FC0000) >> 18);
 	s[j++] = 0x80 | ((wc & 0x003F000) >> 12);
 	s[j++] = 0x80 | ((wc & 0x0000FC0) >> 6);
 	s[j++] = 0x80 | (wc & 0x3F);
       } else {
+	if (j + 6 > dend)
+	  break;
 	s[j++] = 0xFC | ((wc & 0x40000000) >> 30);
 	s[j++] = 0x80 | ((wc & 0x3F000000) >> 24);
 	s[j++] = 0x80 | ((wc & 0x00FC0000) >> 18);
@@ -4770,14 +4914,28 @@ int scheme_utf8_encode(const unsigned int *us, int start, int end,
 	s[j++] = 0x80 | ((wc & 0x00000FC0) >> 6);
 	s[j++] = 0x80 | (wc & 0x3F);
       }
+      done = i;
     }
+    if (_ipos)
+      *_ipos = done;
+    if (_opos)
+      *_opos = j;
     return j - dstart;
   }
 }
 
+int scheme_utf8_encode(const unsigned int *us, int start, int end,
+		       unsigned char *s, int dstart,
+		       char utf16)
+{
+  return utf8_encode_x(us, start, end,
+		       s, dstart, -1,
+		       NULL, NULL, utf16);
+}
+
 int scheme_utf8_encode_all(const unsigned int *us, int len, unsigned char *s)
 {
-  return scheme_utf8_encode(us, 0, len, s, 0, 0 /* utf16 */);
+  return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */);
 }
 
 char *scheme_utf8_encode_to_buffer_len(const mzchar *s, int len,
@@ -4785,11 +4943,11 @@ char *scheme_utf8_encode_to_buffer_len(const mzchar *s, int len,
 				       long *_slen)
 {
   int slen;
-  slen = scheme_utf8_encode(s, 0, len, NULL, 0, 0);
+  slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0);
   if (slen + 1 > blen) {
     buf = (char *)scheme_malloc_atomic(slen + 1);
   }
-  scheme_utf8_encode(s, 0, len, (unsigned char *)buf, 0, 0);
+  utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0);
   buf[slen] = 0;
   *_slen = slen;
   return buf;