io: fix some error corners of encoding conversion

This commit is contained in:
Matthew Flatt 2019-01-20 15:35:36 -07:00
parent 25a4f67912
commit 725af574fd
3 changed files with 20 additions and 11 deletions

View File

@ -22,7 +22,7 @@
(define windows? (eq? 'windows (system-type)))
(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8))
(define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive))
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16))
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume))
(define/who (bytes-open-converter from-str to-str)
(check who string? from-str)
@ -115,7 +115,8 @@
(check who bytes? src-bstr)
(check who exact-nonnegative-integer? src-start-pos)
(check who exact-nonnegative-integer? src-end-pos)
(check who #:or-false bytes? dest-bstr)
(check who (lambda (d) (or (not d) (and (bytes? d) (not (immutable? d)))))
#:contract "(or/c (and/c bytes? (not/c immutable?)) #f)" dest-bstr)
(check who exact-nonnegative-integer? dest-start-pos)
(check who #:or-false exact-nonnegative-integer? dest-end-pos)
(check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr)

View File

@ -19,10 +19,12 @@
(define to (utf-8-converter-to c))
(define-values (in-consumed out-produced status)
(if (or (eq? from 'utf-16)
(eq? from 'utf-16-ish))
(eq? from 'utf-16-ish)
(eq? from 'utf-16-assume))
(utf-16-ish-reencode! src src-start src-end
dest dest-start dest-end
#:from-utf-16-ish? (eq? from 'utf-16-ish))
#:from-utf-16-ish? (eq? from 'utf-16-ish)
#:assume-paired-surrogates? (eq? from 'utf-16-assume))
(utf-8-ish-reencode! src src-start src-end
dest dest-start dest-end
#:permissive? (or (eq? from 'utf-8-permissive)
@ -30,7 +32,8 @@
#:from-utf-8-ish? (or (eq? from 'utf-8-ish)
(eq? from 'utf-8-ish-permissive))
#:to-utf-16? (or (eq? to 'utf-16)
(eq? to 'utf-16-ish)))))
(eq? to 'utf-16-ish)
(eq? to 'utf-16-assume)))))
(values in-consumed
out-produced
(case status
@ -246,7 +249,8 @@
;; Converts UTF-16 into UTF-8
(define (utf-16-ish-reencode! in-bstr in-start in-end
out-bstr out-start out-end
#:from-utf-16-ish? from-utf-16-ish?)
#:from-utf-16-ish? from-utf-16-ish?
#:assume-paired-surrogates? assume-paired-surrogates?)
(let loop ([i in-start] [j out-start])
(define (done status)
(values (- i in-start)
@ -274,7 +278,8 @@
[(and (v . >= . #xD800)
(v . <= . #xDFFF))
(cond
[(v . <= . #xDBFF)
[(or assume-paired-surrogates?
(v . <= . #xDBFF))
;; Look for surrogate pair
(cond
[((+ i 4) . > . in-end)
@ -286,8 +291,9 @@
(+ (arithmetic-shift a 8) b)
(+ (arithmetic-shift b 8) a)))
(cond
[(and (v2 . >= . #xDC00)
(v2 . <= . #xDFFF))
[(or assume-paired-surrogates?
(and (v2 . >= . #xDC00)
(v2 . <= . #xDFFF)))
(define v3 (+ #x10000
(bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10)
(bitwise-and v2 #x3FF))))
@ -303,5 +309,5 @@
[from-utf-16-ish?
;; continue anyway
(continue v (+ i 2))]
[else (done 'aborts)])])]
[else (done 'error)])])]
[else (continue v (+ i 2))])])))

View File

@ -5071,7 +5071,9 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e
if ((wc & 0xF800) == 0xD800) {
/* Unparse surrogates. We assume that the surrogates are
well formed, unless this is Windows or if we're at the
end and _opos is 0. */
end and _opos is 0. The well-formedness assumption was
probably not a good idea, but note that it's explicitly
documented to behave that way. */
# ifdef WINDOWS_UNICODE_SUPPORT
# define UNPAIRED_MASK 0xFC00
# else