io: fix some error corners of encoding conversion

2019-01-20 15:35:36 -07:00 · 2019-01-20 15:35:36 -07:00 · 725af574fd
commit 725af574fd
parent 25a4f67912
3 changed files with 20 additions and 11 deletions
--- a/racket/src/io/converter/main.rkt
+++ b/racket/src/io/converter/main.rkt
@ -22,7 +22,7 @@
 (define windows? (eq? 'windows (system-type)))
 (define platform-utf-8 (if windows? 'utf-8-ish 'utf-8))
 (define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive))
-(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16))
+(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume))

 (define/who (bytes-open-converter from-str to-str)
  (check who string? from-str)
@ -115,7 +115,8 @@
  (check who bytes? src-bstr)
  (check who exact-nonnegative-integer? src-start-pos)
  (check who exact-nonnegative-integer? src-end-pos)
-  (check who #:or-false bytes? dest-bstr)
+  (check who (lambda (d) (or (not d) (and (bytes? d) (not (immutable? d)))))
+         #:contract "(or/c (and/c bytes? (not/c immutable?)) #f)" dest-bstr)
  (check who exact-nonnegative-integer? dest-start-pos)
  (check who #:or-false exact-nonnegative-integer? dest-end-pos)
  (check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr)
--- a/racket/src/io/converter/utf-8.rkt
+++ b/racket/src/io/converter/utf-8.rkt
@ -19,10 +19,12 @@
  (define to (utf-8-converter-to c))
  (define-values (in-consumed out-produced status)
    (if (or (eq? from 'utf-16)
-            (eq? from 'utf-16-ish))
+            (eq? from 'utf-16-ish)
+            (eq? from 'utf-16-assume))
        (utf-16-ish-reencode! src src-start src-end
                              dest dest-start dest-end
-                              #:from-utf-16-ish? (eq? from 'utf-16-ish))
+                              #:from-utf-16-ish? (eq? from 'utf-16-ish)
+                              #:assume-paired-surrogates? (eq? from 'utf-16-assume))
        (utf-8-ish-reencode! src src-start src-end
                             dest dest-start dest-end
                             #:permissive? (or (eq? from 'utf-8-permissive)
@ -30,7 +32,8 @@
                             #:from-utf-8-ish? (or (eq? from 'utf-8-ish)
                                                   (eq? from 'utf-8-ish-permissive))
                             #:to-utf-16? (or (eq? to 'utf-16)
-                                              (eq? to 'utf-16-ish)))))
+                                              (eq? to 'utf-16-ish)
+                                              (eq? to 'utf-16-assume)))))
  (values in-consumed
          out-produced
          (case status
@ -246,7 +249,8 @@
 ;; Converts UTF-16 into UTF-8
 (define (utf-16-ish-reencode! in-bstr in-start in-end
                              out-bstr out-start out-end
-                              #:from-utf-16-ish? from-utf-16-ish?)
+                              #:from-utf-16-ish? from-utf-16-ish?
+                              #:assume-paired-surrogates? assume-paired-surrogates?)
  (let loop ([i in-start] [j out-start])
    (define (done status)
      (values (- i in-start)
@ -274,7 +278,8 @@
         [(and (v . >= . #xD800)
               (v . <= . #xDFFF))
          (cond
-            [(v . <= . #xDBFF)
+            [(or assume-paired-surrogates?
+                 (v . <= . #xDBFF))
             ;; Look for surrogate pair
             (cond
               [((+ i 4) . > . in-end)
@ -286,8 +291,9 @@
                               (+ (arithmetic-shift a 8) b)
                               (+ (arithmetic-shift b 8) a)))
                (cond
-                  [(and (v2 . >= . #xDC00)
-                        (v2 . <= . #xDFFF))
+                  [(or assume-paired-surrogates?
+                       (and (v2 . >= . #xDC00)
+                            (v2 . <= . #xDFFF)))
                   (define v3 (+ #x10000
                                 (bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10)
                                              (bitwise-and v2 #x3FF))))
@ -303,5 +309,5 @@
               [from-utf-16-ish?
                ;; continue anyway
                (continue v (+ i 2))]
-               [else (done 'aborts)])])]
+               [else (done 'error)])])]
         [else (continue v (+ i 2))])])))
--- a/racket/src/racket/src/string.c
+++ b/racket/src/racket/src/string.c
@ -5071,7 +5071,9 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e
 	if ((wc & 0xF800) == 0xD800) {
 	  /* Unparse surrogates. We assume that the surrogates are
 	     well formed, unless this is Windows or if we're at the
-             end and _opos is 0. */
+             end and _opos is 0. The well-formedness assumption was
+             probably not a good idea, but note that it's explicitly
+             documented to behave that way. */
 # ifdef WINDOWS_UNICODE_SUPPORT
 #  define UNPAIRED_MASK 0xFC00
 # else