io: fix some error corners of encoding conversion
This commit is contained in:
parent
25a4f67912
commit
725af574fd
|
@ -22,7 +22,7 @@
|
||||||
(define windows? (eq? 'windows (system-type)))
|
(define windows? (eq? 'windows (system-type)))
|
||||||
(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8))
|
(define platform-utf-8 (if windows? 'utf-8-ish 'utf-8))
|
||||||
(define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive))
|
(define platform-utf-8-permissive (if windows? 'utf-8-ish-permissive 'utf-8-permissive))
|
||||||
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16))
|
(define platform-utf-16 (if windows? 'utf-16-ish 'utf-16-assume))
|
||||||
|
|
||||||
(define/who (bytes-open-converter from-str to-str)
|
(define/who (bytes-open-converter from-str to-str)
|
||||||
(check who string? from-str)
|
(check who string? from-str)
|
||||||
|
@ -115,7 +115,8 @@
|
||||||
(check who bytes? src-bstr)
|
(check who bytes? src-bstr)
|
||||||
(check who exact-nonnegative-integer? src-start-pos)
|
(check who exact-nonnegative-integer? src-start-pos)
|
||||||
(check who exact-nonnegative-integer? src-end-pos)
|
(check who exact-nonnegative-integer? src-end-pos)
|
||||||
(check who #:or-false bytes? dest-bstr)
|
(check who (lambda (d) (or (not d) (and (bytes? d) (not (immutable? d)))))
|
||||||
|
#:contract "(or/c (and/c bytes? (not/c immutable?)) #f)" dest-bstr)
|
||||||
(check who exact-nonnegative-integer? dest-start-pos)
|
(check who exact-nonnegative-integer? dest-start-pos)
|
||||||
(check who #:or-false exact-nonnegative-integer? dest-end-pos)
|
(check who #:or-false exact-nonnegative-integer? dest-end-pos)
|
||||||
(check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr)
|
(check-range who src-start-pos src-end-pos (bytes-length src-bstr) src-bstr)
|
||||||
|
|
|
@ -19,10 +19,12 @@
|
||||||
(define to (utf-8-converter-to c))
|
(define to (utf-8-converter-to c))
|
||||||
(define-values (in-consumed out-produced status)
|
(define-values (in-consumed out-produced status)
|
||||||
(if (or (eq? from 'utf-16)
|
(if (or (eq? from 'utf-16)
|
||||||
(eq? from 'utf-16-ish))
|
(eq? from 'utf-16-ish)
|
||||||
|
(eq? from 'utf-16-assume))
|
||||||
(utf-16-ish-reencode! src src-start src-end
|
(utf-16-ish-reencode! src src-start src-end
|
||||||
dest dest-start dest-end
|
dest dest-start dest-end
|
||||||
#:from-utf-16-ish? (eq? from 'utf-16-ish))
|
#:from-utf-16-ish? (eq? from 'utf-16-ish)
|
||||||
|
#:assume-paired-surrogates? (eq? from 'utf-16-assume))
|
||||||
(utf-8-ish-reencode! src src-start src-end
|
(utf-8-ish-reencode! src src-start src-end
|
||||||
dest dest-start dest-end
|
dest dest-start dest-end
|
||||||
#:permissive? (or (eq? from 'utf-8-permissive)
|
#:permissive? (or (eq? from 'utf-8-permissive)
|
||||||
|
@ -30,7 +32,8 @@
|
||||||
#:from-utf-8-ish? (or (eq? from 'utf-8-ish)
|
#:from-utf-8-ish? (or (eq? from 'utf-8-ish)
|
||||||
(eq? from 'utf-8-ish-permissive))
|
(eq? from 'utf-8-ish-permissive))
|
||||||
#:to-utf-16? (or (eq? to 'utf-16)
|
#:to-utf-16? (or (eq? to 'utf-16)
|
||||||
(eq? to 'utf-16-ish)))))
|
(eq? to 'utf-16-ish)
|
||||||
|
(eq? to 'utf-16-assume)))))
|
||||||
(values in-consumed
|
(values in-consumed
|
||||||
out-produced
|
out-produced
|
||||||
(case status
|
(case status
|
||||||
|
@ -246,7 +249,8 @@
|
||||||
;; Converts UTF-16 into UTF-8
|
;; Converts UTF-16 into UTF-8
|
||||||
(define (utf-16-ish-reencode! in-bstr in-start in-end
|
(define (utf-16-ish-reencode! in-bstr in-start in-end
|
||||||
out-bstr out-start out-end
|
out-bstr out-start out-end
|
||||||
#:from-utf-16-ish? from-utf-16-ish?)
|
#:from-utf-16-ish? from-utf-16-ish?
|
||||||
|
#:assume-paired-surrogates? assume-paired-surrogates?)
|
||||||
(let loop ([i in-start] [j out-start])
|
(let loop ([i in-start] [j out-start])
|
||||||
(define (done status)
|
(define (done status)
|
||||||
(values (- i in-start)
|
(values (- i in-start)
|
||||||
|
@ -274,7 +278,8 @@
|
||||||
[(and (v . >= . #xD800)
|
[(and (v . >= . #xD800)
|
||||||
(v . <= . #xDFFF))
|
(v . <= . #xDFFF))
|
||||||
(cond
|
(cond
|
||||||
[(v . <= . #xDBFF)
|
[(or assume-paired-surrogates?
|
||||||
|
(v . <= . #xDBFF))
|
||||||
;; Look for surrogate pair
|
;; Look for surrogate pair
|
||||||
(cond
|
(cond
|
||||||
[((+ i 4) . > . in-end)
|
[((+ i 4) . > . in-end)
|
||||||
|
@ -286,8 +291,9 @@
|
||||||
(+ (arithmetic-shift a 8) b)
|
(+ (arithmetic-shift a 8) b)
|
||||||
(+ (arithmetic-shift b 8) a)))
|
(+ (arithmetic-shift b 8) a)))
|
||||||
(cond
|
(cond
|
||||||
[(and (v2 . >= . #xDC00)
|
[(or assume-paired-surrogates?
|
||||||
(v2 . <= . #xDFFF))
|
(and (v2 . >= . #xDC00)
|
||||||
|
(v2 . <= . #xDFFF)))
|
||||||
(define v3 (+ #x10000
|
(define v3 (+ #x10000
|
||||||
(bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10)
|
(bitwise-ior (arithmetic-shift (bitwise-and v #x3FF) 10)
|
||||||
(bitwise-and v2 #x3FF))))
|
(bitwise-and v2 #x3FF))))
|
||||||
|
@ -303,5 +309,5 @@
|
||||||
[from-utf-16-ish?
|
[from-utf-16-ish?
|
||||||
;; continue anyway
|
;; continue anyway
|
||||||
(continue v (+ i 2))]
|
(continue v (+ i 2))]
|
||||||
[else (done 'aborts)])])]
|
[else (done 'error)])])]
|
||||||
[else (continue v (+ i 2))])])))
|
[else (continue v (+ i 2))])])))
|
||||||
|
|
|
@ -5071,7 +5071,9 @@ static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t e
|
||||||
if ((wc & 0xF800) == 0xD800) {
|
if ((wc & 0xF800) == 0xD800) {
|
||||||
/* Unparse surrogates. We assume that the surrogates are
|
/* Unparse surrogates. We assume that the surrogates are
|
||||||
well formed, unless this is Windows or if we're at the
|
well formed, unless this is Windows or if we're at the
|
||||||
end and _opos is 0. */
|
end and _opos is 0. The well-formedness assumption was
|
||||||
|
probably not a good idea, but note that it's explicitly
|
||||||
|
documented to behave that way. */
|
||||||
# ifdef WINDOWS_UNICODE_SUPPORT
|
# ifdef WINDOWS_UNICODE_SUPPORT
|
||||||
# define UNPAIRED_MASK 0xFC00
|
# define UNPAIRED_MASK 0xFC00
|
||||||
# else
|
# else
|
||||||
|
|
Loading…
Reference in New Issue
Block a user