From 725af574fd3499ffac42d5d77d25caed2ae20b34 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Sun, 20 Jan 2019 15:35:36 -0700 Subject: [PATCH] io: fix some error corners of encoding conversion --- racket/src/io/converter/main.rkt | 5 +++-- racket/src/io/converter/utf-8.rkt | 22 ++++++++++++++-------- racket/src/racket/src/string.c | 4 +++- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/racket/src/io/converter/main.rkt b/racket/src/io/converter/main.rkt index b1390bbd53..0c82e6034e 100644 --- a/racket/src/io/converter/main.rkt +++ b/racket/src/io/converter/main.rkt @@ -22,7 +22,7 @@ (define windows? (eq? 'windows (system-type))) (define platform-utf-8 (if windows? 'utf-8-ish 'utf-8)) (define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive)) -(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16)) +(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume)) (define/who (bytes-open-converter from-str to-str) (check who string? from-str) @@ -115,7 +115,8 @@ (check who bytes? src-bstr) (check who exact-nonnegative-integer? src-start-pos) (check who exact-nonnegative-integer? src-end-pos) - (check who #:or-false bytes? dest-bstr) + (check who (lambda (d) (or (not d) (and (bytes? d) (not (immutable? d))))) + #:contract "(or/c (and/c bytes? (not/c immutable?)) #f)" dest-bstr) (check who exact-nonnegative-integer? dest-start-pos) (check who #:or-false exact-nonnegative-integer? dest-end-pos) (check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr) diff --git a/racket/src/io/converter/utf-8.rkt b/racket/src/io/converter/utf-8.rkt index 53ca542f83..69c47ebc28 100644 --- a/racket/src/io/converter/utf-8.rkt +++ b/racket/src/io/converter/utf-8.rkt @@ -19,10 +19,12 @@ (define to (utf-8-converter-to c)) (define-values (in-consumed out-produced status) (if (or (eq? from 'utf-16) - (eq? from 'utf-16-ish)) + (eq? from 'utf-16-ish) + (eq? from 'utf-16-assume)) (utf-16-ish-reencode! src src-start src-end dest dest-start dest-end - #:from-utf-16-ish? (eq? from 'utf-16-ish)) + #:from-utf-16-ish? (eq? from 'utf-16-ish) + #:assume-paired-surrogates? (eq? from 'utf-16-assume)) (utf-8-ish-reencode! src src-start src-end dest dest-start dest-end #:permissive? (or (eq? from 'utf-8-permissive) @@ -30,7 +32,8 @@ #:from-utf-8-ish? (or (eq? from 'utf-8-ish) (eq? from 'utf-8-ish-permissive)) #:to-utf-16? (or (eq? to 'utf-16) - (eq? to 'utf-16-ish))))) + (eq? to 'utf-16-ish) + (eq? to 'utf-16-assume))))) (values in-consumed out-produced (case status @@ -246,7 +249,8 @@ ;; Converts UTF-16 into UTF-8 (define (utf-16-ish-reencode! in-bstr in-start in-end out-bstr out-start out-end - #:from-utf-16-ish? from-utf-16-ish?) + #:from-utf-16-ish? from-utf-16-ish? + #:assume-paired-surrogates? assume-paired-surrogates?) (let loop ([i in-start] [j out-start]) (define (done status) (values (- i in-start) @@ -274,7 +278,8 @@ [(and (v . >= . #xD800) (v . <= . #xDFFF)) (cond - [(v . <= . #xDBFF) + [(or assume-paired-surrogates? + (v . <= . #xDBFF)) ;; Look for surrogate pair (cond [((+ i 4) . > . in-end) @@ -286,8 +291,9 @@ (+ (arithmetic-shift a 8) b) (+ (arithmetic-shift b 8) a))) (cond - [(and (v2 . >= . #xDC00) - (v2 . <= . #xDFFF)) + [(or assume-paired-surrogates? + (and (v2 . >= . #xDC00) + (v2 . <= . #xDFFF))) (define v3 (+ #x10000 (bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10) (bitwise-and v2 #x3FF)))) @@ -303,5 +309,5 @@ [from-utf-16-ish? ;; continue anyway (continue v (+ i 2))] - [else (done 'aborts)])])] + [else (done 'error)])])] [else (continue v (+ i 2))])]))) diff --git a/racket/src/racket/src/string.c b/racket/src/racket/src/string.c index 761cab41dd..9525bd61db 100644 --- a/racket/src/racket/src/string.c +++ b/racket/src/racket/src/string.c @@ -5071,7 +5071,9 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e if ((wc & 0xF800) == 0xD800) { /* Unparse surrogates. We assume that the surrogates are well formed, unless this is Windows or if we're at the - end and _opos is 0. */ + end and _opos is 0. The well-formedness assumption was + probably not a good idea, but note that it's explicitly + documented to behave that way. */ # ifdef WINDOWS_UNICODE_SUPPORT # define UNPAIRED_MASK 0xFC00 # else