From 4271d987cdd8d515bb824ec6649e321d55872b88 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Mon, 18 Feb 2013 08:38:18 -0700 Subject: [PATCH] ffi/unsafe: fix a bug in conversion to UTF-16 Characters outside of the BMP were translated incorrectly. --- collects/tests/racket/foreign-test.rktl | 3 +++ collects/tests/racket/unicode.rktl | 8 ++++++++ src/racket/src/string.c | 1 + 3 files changed, 12 insertions(+) diff --git a/collects/tests/racket/foreign-test.rktl b/collects/tests/racket/foreign-test.rktl index 4beb061338..e0345d8a5b 100644 --- a/collects/tests/racket/foreign-test.rktl +++ b/collects/tests/racket/foreign-test.rktl @@ -494,6 +494,9 @@ (test 4.4t0 extflvector-ref v 2) (test 2.2t0 ptr-ref (ptr-add (extflvector->cpointer v) (ctype-sizeof _longdouble)) _longdouble)) +;; Check a corner of UTF-16 conversion: +(test "\U171D3" cast (cast "\U171D3" _string/utf-16 _pointer) _pointer _string/utf-16) + (report-errs) #| --- ignore everything below --- diff --git a/collects/tests/racket/unicode.rktl b/collects/tests/racket/unicode.rktl index 874feda7c9..6a3314304f 100644 --- a/collects/tests/racket/unicode.rktl +++ b/collects/tests/racket/unicode.rktl @@ -988,6 +988,14 @@ (bytes-convert c (bytes-append (integer->integer-bytes #xDC00 2 #f) (integer->integer-bytes #x1000 2 #f)))))))) + +;; Check a corner of UTF-16 conversion: +(let ([c (bytes-open-converter "platform-UTF-8" "platform-UTF-16")]) + (let-values ([(s n status) (bytes-convert c (string->bytes/utf-8 "\U171D3"))]) + (let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")]) + (let-values ([(s2 n2 status2) (bytes-convert c2 s)]) + (bytes->string/utf-8 s2))))) + (when (eq? (system-type) 'windows) (let ([c (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")]) ;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none diff --git a/src/racket/src/string.c b/src/racket/src/string.c index 5e909388bc..f8e10ed32a 100644 --- a/src/racket/src/string.c +++ b/src/racket/src/string.c @@ -5389,6 +5389,7 @@ unsigned short *scheme_ucs4_to_utf16(const mzchar *text, intptr_t start, intptr_ for (i = start, j = 0; i < end; i++) { v = text[i]; if (v > 0xFFFF) { + v -= 0x10000; utf16[j++] = 0xD800 | ((v >> 10) & 0x3FF); utf16[j++] = 0xDC00 | (v & 0x3FF); } else