io: fix some error corners of encoding conversion
This commit is contained in:
parent
25a4f67912
commit
725af574fd
|
@ -22,7 +22,7 @@
|
|||
(define windows? (eq? 'windows (system-type)))
|
||||
(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8))
|
||||
(define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive))
|
||||
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16))
|
||||
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume))
|
||||
|
||||
(define/who (bytes-open-converter from-str to-str)
|
||||
(check who string? from-str)
|
||||
|
@ -115,7 +115,8 @@
|
|||
(check who bytes? src-bstr)
|
||||
(check who exact-nonnegative-integer? src-start-pos)
|
||||
(check who exact-nonnegative-integer? src-end-pos)
|
||||
(check who #:or-false bytes? dest-bstr)
|
||||
(check who (lambda (d) (or (not d) (and (bytes? d) (not (immutable? d)))))
|
||||
#:contract "(or/c (and/c bytes? (not/c immutable?)) #f)" dest-bstr)
|
||||
(check who exact-nonnegative-integer? dest-start-pos)
|
||||
(check who #:or-false exact-nonnegative-integer? dest-end-pos)
|
||||
(check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr)
|
||||
|
|
|
@ -19,10 +19,12 @@
|
|||
(define to (utf-8-converter-to c))
|
||||
(define-values (in-consumed out-produced status)
|
||||
(if (or (eq? from 'utf-16)
|
||||
(eq? from 'utf-16-ish))
|
||||
(eq? from 'utf-16-ish)
|
||||
(eq? from 'utf-16-assume))
|
||||
(utf-16-ish-reencode! src src-start src-end
|
||||
dest dest-start dest-end
|
||||
#:from-utf-16-ish? (eq? from 'utf-16-ish))
|
||||
#:from-utf-16-ish? (eq? from 'utf-16-ish)
|
||||
#:assume-paired-surrogates? (eq? from 'utf-16-assume))
|
||||
(utf-8-ish-reencode! src src-start src-end
|
||||
dest dest-start dest-end
|
||||
#:permissive? (or (eq? from 'utf-8-permissive)
|
||||
|
@ -30,7 +32,8 @@
|
|||
#:from-utf-8-ish? (or (eq? from 'utf-8-ish)
|
||||
(eq? from 'utf-8-ish-permissive))
|
||||
#:to-utf-16? (or (eq? to 'utf-16)
|
||||
(eq? to 'utf-16-ish)))))
|
||||
(eq? to 'utf-16-ish)
|
||||
(eq? to 'utf-16-assume)))))
|
||||
(values in-consumed
|
||||
out-produced
|
||||
(case status
|
||||
|
@ -246,7 +249,8 @@
|
|||
;; Converts UTF-16 into UTF-8
|
||||
(define (utf-16-ish-reencode! in-bstr in-start in-end
|
||||
out-bstr out-start out-end
|
||||
#:from-utf-16-ish? from-utf-16-ish?)
|
||||
#:from-utf-16-ish? from-utf-16-ish?
|
||||
#:assume-paired-surrogates? assume-paired-surrogates?)
|
||||
(let loop ([i in-start] [j out-start])
|
||||
(define (done status)
|
||||
(values (- i in-start)
|
||||
|
@ -274,7 +278,8 @@
|
|||
[(and (v . >= . #xD800)
|
||||
(v . <= . #xDFFF))
|
||||
(cond
|
||||
[(v . <= . #xDBFF)
|
||||
[(or assume-paired-surrogates?
|
||||
(v . <= . #xDBFF))
|
||||
;; Look for surrogate pair
|
||||
(cond
|
||||
[((+ i 4) . > . in-end)
|
||||
|
@ -286,8 +291,9 @@
|
|||
(+ (arithmetic-shift a 8) b)
|
||||
(+ (arithmetic-shift b 8) a)))
|
||||
(cond
|
||||
[(and (v2 . >= . #xDC00)
|
||||
(v2 . <= . #xDFFF))
|
||||
[(or assume-paired-surrogates?
|
||||
(and (v2 . >= . #xDC00)
|
||||
(v2 . <= . #xDFFF)))
|
||||
(define v3 (+ #x10000
|
||||
(bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10)
|
||||
(bitwise-and v2 #x3FF))))
|
||||
|
@ -303,5 +309,5 @@
|
|||
[from-utf-16-ish?
|
||||
;; continue anyway
|
||||
(continue v (+ i 2))]
|
||||
[else (done 'aborts)])])]
|
||||
[else (done 'error)])])]
|
||||
[else (continue v (+ i 2))])])))
|
||||
|
|
|
@ -5071,7 +5071,9 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e
|
|||
if ((wc & 0xF800) == 0xD800) {
|
||||
/* Unparse surrogates. We assume that the surrogates are
|
||||
well formed, unless this is Windows or if we're at the
|
||||
end and _opos is 0. */
|
||||
end and _opos is 0. The well-formedness assumption was
|
||||
probably not a good idea, but note that it's explicitly
|
||||
documented to behave that way. */
|
||||
# ifdef WINDOWS_UNICODE_SUPPORT
|
||||
# define UNPAIRED_MASK 0xFC00
|
||||
# else
|
||||
|
|
Loading…
Reference in New Issue
Block a user