cs: fix some UTF-16 and UCS-4 conversions

This commit is contained in:
Matthew Flatt 2021-05-08 08:44:33 -06:00
parent f35a92744d
commit c42d64cdbc
6 changed files with 38 additions and 28 deletions

View File

@ -1214,6 +1214,11 @@
(parameterize ([current-locale "C"]) (parameterize ([current-locale "C"])
(go #t)))) (go #t))))
(when (or known-locale?
(eq? 'macosx (system-type)))
(test "\U1F600" string-locale-downcase "\U1F600")
(test "\U1F600" string-locale-upcase "\U1F600"))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; locale<->UTF-8 conversions ;; locale<->UTF-8 conversions

View File

@ -433,14 +433,14 @@
bstr) bstr)
(loop (fx+ i 1))))]))) (loop (fx+ i 1))))])))
(define (subbytes-at-2-byte-nul x) (define (subbytes-at-2-byte-nul x offset)
(let ([len (fxand (bytes-length x) (fxnot 1))]) (let ([len (fxand (bytes-length x) (fxnot 1))])
(let loop ([i 0]) (let loop ([i offset])
(cond (cond
[(fx= i len) x] [(fx= i len) (if (fx= offset 0) x (subbytes x offset len))]
[(and (fx= 0 (bytes-ref x i)) [(and (fx= 0 (bytes-ref x i))
(fx= 0 (bytes-ref x (fx+ i 1)))) (fx= 0 (bytes-ref x (fx+ i 1))))
(subbytes x 0 i)] (subbytes x offset i)]
[else (loop (fx+ i 2))])))) [else (loop (fx+ i 2))]))))
(define (uptr->bytes/2-byte-nul x) (define (uptr->bytes/2-byte-nul x)
@ -452,16 +452,16 @@
bstr) bstr)
(loop (fx+ i 2))))) (loop (fx+ i 2)))))
(define (subbytes-at-4-byte-nul x) (define (subbytes-at-4-byte-nul x offset)
(let ([len (fxand (bytes-length x) (fxnot 3))]) (let ([len (fxand (bytes-length x) (fxnot 3))])
(let loop ([i 0]) (let loop ([i offset])
(cond (cond
[(fx= i len) x] [(fx= i len) (if (fx= offset 0) x (subbytes x offset len))]
[(and (fx= 0 (bytes-ref x i)) [(and (fx= 0 (bytes-ref x i))
(fx= 0 (bytes-ref x (fx+ i 1))) (fx= 0 (bytes-ref x (fx+ i 1)))
(fx= 0 (bytes-ref x (fx+ i 2))) (fx= 0 (bytes-ref x (fx+ i 2)))
(fx= 0 (bytes-ref x (fx+ i 3)))) (fx= 0 (bytes-ref x (fx+ i 3))))
(subbytes x 0 i)] (subbytes x offset i)]
[else (loop (fx+ i 4))])))) [else (loop (fx+ i 4))]))))
(define (uptr->bytes/4-byte-nul x) (define (uptr->bytes/4-byte-nul x)
@ -901,9 +901,9 @@
'big)]) 'big)])
(cond (cond
[(bytevector? v) [(bytevector? v)
(utf16->string (subbytes-at-2-byte-nul v) endian #t)] (utf16->string (subbytes-at-2-byte-nul v offset) endian #t)]
[(integer? v) [(integer? v)
(utf16->string (uptr->bytes/2-byte-nul v) endian #t)] (utf16->string (uptr->bytes/2-byte-nul (+ v offset)) endian #t)]
[else #f]))) [else #f])))
=> (lambda (v) v)] => (lambda (v) v)]
[(and (word-aligned? offset) [(and (word-aligned? offset)
@ -915,9 +915,9 @@
'big)]) 'big)])
(cond (cond
[(bytevector? v) [(bytevector? v)
(utf32->string (subbytes-at-4-byte-nul v) endian #t)] (utf32->string (subbytes-at-4-byte-nul v offset) endian #t)]
[(integer? v) [(integer? v)
(utf32->string (uptr->bytes/4-byte-nul v) endian #t)] (utf32->string (uptr->bytes/4-byte-nul (+ v offset)) endian #t)]
[else #f]))) [else #f])))
=> (lambda (v) v)] => (lambda (v) v)]
[else [else
@ -930,12 +930,12 @@
(eq? 'utf-32le host-rep) (eq? 'utf-32le host-rep)
(eq? 'utf-32be host-rep)) (eq? 'utf-32be host-rep))
(let ([v (with-interrupts-disabled* (let ([v (with-interrupts-disabled*
(foreign-ref 'uptr (cpointer-address p) 0))]) (foreign-ref 'uptr (cpointer-address p) offset))])
(case host-rep (case host-rep
[(utf-16le) (utf16->string (uptr->bytes/2-byte-nul v) 'little #t)] [(utf-16le) (utf16->string (uptr->bytes/2-byte-nul v) 'little #t)]
[(utf-16be) (utf16->string (uptr->bytes/2-byte-nul v) 'big #t)] [(utf-16be) (utf16->string (uptr->bytes/2-byte-nul v) 'big #t)]
[(utf-32le) (utf16->string (uptr->bytes/4-byte-nul v) 'little #t)] [(utf-32le) (utf32->string (uptr->bytes/4-byte-nul v) 'little #t)]
[(utf-32be) (utf16->string (uptr->bytes/4-byte-nul v) 'big #t)]))] [(utf-32be) (utf32->string (uptr->bytes/4-byte-nul v) 'big #t)]))]
[else [else
;; Disable interrupts to avoid a GC: ;; Disable interrupts to avoid a GC:
(with-interrupts-disabled* (with-interrupts-disabled*

View File

@ -31045,11 +31045,14 @@
pos_0 pos_0
(unsafe-fxrshift hi_0 8) (unsafe-fxrshift hi_0 8)
(fxand hi_0 255)) (fxand hi_0 255))
(bytes-set-two! (let ((app_0 (+ pos_0 2)))
bstr_0 (bytes-set-two!
pos_0 bstr_0
(unsafe-fxrshift lo_0 8) app_0
(fxand lo_0 255)) (unsafe-fxrshift
lo_0
8)
(fxand lo_0 255)))
(fx+ pos_0 4))))) (fx+ pos_0 4)))))
(begin (begin
(bytes-set-two! (bytes-set-two!
@ -31119,7 +31122,7 @@
(if big-endian? (if big-endian?
(fxior (unsafe-fxlshift a_0 8) b_0) (fxior (unsafe-fxlshift a_0 8) b_0)
(fxior (unsafe-fxlshift b_0 8) a_0)))) (fxior (unsafe-fxlshift b_0 8) a_0))))
(if (fx= (fxand v_0 56320) 56320) (if (fx= (fxand v_0 56320) 55296)
(let ((a_1 (let ((a_1
(unsafe-bytes-ref bstr_0 (fx+ i_0 2)))) (unsafe-bytes-ref bstr_0 (fx+ i_0 2))))
(let ((b_1 (let ((b_1
@ -34375,11 +34378,11 @@
'subprocess 'subprocess
"(or/c (and/c output-port? file-stream-port?) #f 'stdout)" "(or/c (and/c output-port? file-stream-port?) #f 'stdout)"
stderr_0)) stderr_0))
(let ((lr1322 unsafe-undefined) (let ((lr1323 unsafe-undefined)
(group_0 unsafe-undefined) (group_0 unsafe-undefined)
(command_0 unsafe-undefined) (command_0 unsafe-undefined)
(exact/args_0 unsafe-undefined)) (exact/args_0 unsafe-undefined))
(set! lr1322 (set! lr1323
(call-with-values (call-with-values
(lambda () (lambda ()
(if (path-string? group/command_0) (if (path-string? group/command_0)
@ -34434,9 +34437,9 @@
((group_1 command_1 exact/args_1) ((group_1 command_1 exact/args_1)
(vector group_1 command_1 exact/args_1)) (vector group_1 command_1 exact/args_1))
(args (raise-binding-result-arity-error 3 args))))) (args (raise-binding-result-arity-error 3 args)))))
(set! group_0 (unsafe-vector*-ref lr1322 0)) (set! group_0 (unsafe-vector*-ref lr1323 0))
(set! command_0 (unsafe-vector*-ref lr1322 1)) (set! command_0 (unsafe-vector*-ref lr1323 1))
(set! exact/args_0 (unsafe-vector*-ref lr1322 2)) (set! exact/args_0 (unsafe-vector*-ref lr1323 2))
(call-with-values (call-with-values
(lambda () (lambda ()
(if (if (pair? exact/args_0) (if (if (pair? exact/args_0)

View File

@ -655,6 +655,8 @@
(parameterize ([current-locale "en_US.ISO8859-1"]) (parameterize ([current-locale "en_US.ISO8859-1"])
(test "Éric" (string-locale-downcase "Éric")))) (test "Éric" (string-locale-downcase "Éric"))))
(when (eq? 'macosx (system-type))
(test "\U1F600" (string-locale-downcase "\U1F600")))
;; ---------------------------------------- ;; ----------------------------------------

View File

@ -24,7 +24,7 @@
(fxior (fxlshift a 8) b) (fxior (fxlshift a 8) b)
(fxior (fxlshift b 8) a))) (fxior (fxlshift b 8) a)))
(cond (cond
[(fx= (fxand v #xDC00) #xDC00) [(fx= (fxand v #xDC00) #xD800)
;; surrogate pair ;; surrogate pair
(define a (bytes-ref bstr (fx+ i 2))) (define a (bytes-ref bstr (fx+ i 2)))
(define b (bytes-ref bstr (fx+ i 3))) (define b (bytes-ref bstr (fx+ i 3)))

View File

@ -19,7 +19,7 @@
(define hi (fxior #xD800 (fxand (fxrshift av 10) #x3FF))) (define hi (fxior #xD800 (fxand (fxrshift av 10) #x3FF)))
(define lo (fxior #xDC00 (fxand av #x3FF))) (define lo (fxior #xDC00 (fxand av #x3FF)))
(bytes-set-two! bstr pos (fxrshift hi 8) (fxand hi #xFF)) (bytes-set-two! bstr pos (fxrshift hi 8) (fxand hi #xFF))
(bytes-set-two! bstr pos (fxrshift lo 8) (fxand lo #xFF)) (bytes-set-two! bstr (+ pos 2) (fxrshift lo 8) (fxand lo #xFF))
(fx+ pos 4)] (fx+ pos 4)]
[else [else
(bytes-set-two! bstr pos (fxrshift v 8) (fxand v #xFF)) (bytes-set-two! bstr pos (fxrshift v 8) (fxand v #xFF))