io: fix some error corners of encoding conversion

This commit is contained in:
Matthew Flatt 2019-01-20 15:35:36 -07:00
parent 25a4f67912
commit 725af574fd
3 changed files with 20 additions and 11 deletions

View File

@ -22,7 +22,7 @@
(define windows? (eq? 'windows (system-type))) (define windows? (eq? 'windows (system-type)))
(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8)) (define platform-utf-8 (if windows? 'utf-8-ish 'utf-8))
(define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive)) (define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive))
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16)) (define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume))
(define/who (bytes-open-converter from-str to-str) (define/who (bytes-open-converter from-str to-str)
(check who string? from-str) (check who string? from-str)
@ -115,7 +115,8 @@
(check who bytes? src-bstr) (check who bytes? src-bstr)
(check who exact-nonnegative-integer? src-start-pos) (check who exact-nonnegative-integer? src-start-pos)
(check who exact-nonnegative-integer? src-end-pos) (check who exact-nonnegative-integer? src-end-pos)
(check who #:or-false bytes? dest-bstr) (check who (lambda (d) (or (not d) (and (bytes? d) (not (immutable? d)))))
#:contract "(or/c (and/c bytes? (not/c immutable?)) #f)" dest-bstr)
(check who exact-nonnegative-integer? dest-start-pos) (check who exact-nonnegative-integer? dest-start-pos)
(check who #:or-false exact-nonnegative-integer? dest-end-pos) (check who #:or-false exact-nonnegative-integer? dest-end-pos)
(check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr) (check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr)

View File

@ -19,10 +19,12 @@
(define to (utf-8-converter-to c)) (define to (utf-8-converter-to c))
(define-values (in-consumed out-produced status) (define-values (in-consumed out-produced status)
(if (or (eq? from 'utf-16) (if (or (eq? from 'utf-16)
(eq? from 'utf-16-ish)) (eq? from 'utf-16-ish)
(eq? from 'utf-16-assume))
(utf-16-ish-reencode! src src-start src-end (utf-16-ish-reencode! src src-start src-end
dest dest-start dest-end dest dest-start dest-end
#:from-utf-16-ish? (eq? from 'utf-16-ish)) #:from-utf-16-ish? (eq? from 'utf-16-ish)
#:assume-paired-surrogates? (eq? from 'utf-16-assume))
(utf-8-ish-reencode! src src-start src-end (utf-8-ish-reencode! src src-start src-end
dest dest-start dest-end dest dest-start dest-end
#:permissive? (or (eq? from 'utf-8-permissive) #:permissive? (or (eq? from 'utf-8-permissive)
@ -30,7 +32,8 @@
#:from-utf-8-ish? (or (eq? from 'utf-8-ish) #:from-utf-8-ish? (or (eq? from 'utf-8-ish)
(eq? from 'utf-8-ish-permissive)) (eq? from 'utf-8-ish-permissive))
#:to-utf-16? (or (eq? to 'utf-16) #:to-utf-16? (or (eq? to 'utf-16)
(eq? to 'utf-16-ish))))) (eq? to 'utf-16-ish)
(eq? to 'utf-16-assume)))))
(values in-consumed (values in-consumed
out-produced out-produced
(case status (case status
@ -246,7 +249,8 @@
;; Converts UTF-16 into UTF-8 ;; Converts UTF-16 into UTF-8
(define (utf-16-ish-reencode! in-bstr in-start in-end (define (utf-16-ish-reencode! in-bstr in-start in-end
out-bstr out-start out-end out-bstr out-start out-end
#:from-utf-16-ish? from-utf-16-ish?) #:from-utf-16-ish? from-utf-16-ish?
#:assume-paired-surrogates? assume-paired-surrogates?)
(let loop ([i in-start] [j out-start]) (let loop ([i in-start] [j out-start])
(define (done status) (define (done status)
(values (- i in-start) (values (- i in-start)
@ -274,7 +278,8 @@
[(and (v . >= . #xD800) [(and (v . >= . #xD800)
(v . <= . #xDFFF)) (v . <= . #xDFFF))
(cond (cond
[(v . <= . #xDBFF) [(or assume-paired-surrogates?
(v . <= . #xDBFF))
;; Look for surrogate pair ;; Look for surrogate pair
(cond (cond
[((+ i 4) . > . in-end) [((+ i 4) . > . in-end)
@ -286,8 +291,9 @@
(+ (arithmetic-shift a 8) b) (+ (arithmetic-shift a 8) b)
(+ (arithmetic-shift b 8) a))) (+ (arithmetic-shift b 8) a)))
(cond (cond
[(and (v2 . >= . #xDC00) [(or assume-paired-surrogates?
(v2 . <= . #xDFFF)) (and (v2 . >= . #xDC00)
(v2 . <= . #xDFFF)))
(define v3 (+ #x10000 (define v3 (+ #x10000
(bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10) (bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10)
(bitwise-and v2 #x3FF)))) (bitwise-and v2 #x3FF))))
@ -303,5 +309,5 @@
[from-utf-16-ish? [from-utf-16-ish?
;; continue anyway ;; continue anyway
(continue v (+ i 2))] (continue v (+ i 2))]
[else (done 'aborts)])])] [else (done 'error)])])]
[else (continue v (+ i 2))])]))) [else (continue v (+ i 2))])])))

View File

@ -5071,7 +5071,9 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e
if ((wc & 0xF800) == 0xD800) { if ((wc & 0xF800) == 0xD800) {
/* Unparse surrogates. We assume that the surrogates are /* Unparse surrogates. We assume that the surrogates are
well formed, unless this is Windows or if we're at the well formed, unless this is Windows or if we're at the
end and _opos is 0. */ end and _opos is 0. The well-formedness assumption was
probably not a good idea, but note that it's explicitly
documented to behave that way. */
# ifdef WINDOWS_UNICODE_SUPPORT # ifdef WINDOWS_UNICODE_SUPPORT
# define UNPAIRED_MASK 0xFC00 # define UNPAIRED_MASK 0xFC00
# else # else