improved/corrected UTF-16 tests

svn: r2901
This commit is contained in:
Matthew Flatt 2006-05-10 16:28:25 +00:00
parent 30ad72f765
commit 97e800aeee

View File

@ -551,35 +551,35 @@
(#(#f #f #f #f #f #f) error
(#o374 #o200 #o200 #o200 #o200 #o200))
;; illedgal surrogates
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate1
(#o355 #o240 #o200))
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate1
(#o355 #o255 #o277))
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate1
(#o355 #o256 #o200))
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate1
(#o355 #o257 #o277))
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate2
(#o355 #o260 #o200))
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate2
(#o355 #o276 #o200))
(#(#f #f #f) surrogate
(#(#f #f #f) surrogate2
(#o355 #o277 #o277))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o240 #o200 #o355 #o260 #o200))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o240 #o200 #o355 #o277 #o277))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o255 #o277 #o355 #o260 #o200))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o255 #o277 #o355 #o277 #o277))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o256 #o200 #o355 #o260 #o200))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o256 #o200 #o355 #o277 #o277))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o257 #o277 #o355 #o260 #o200))
(#(#f #f #f #f #f #f) surrogate
(#(#f #f #f #f #f #f) error
(#o355 #o257 #o277 #o355 #o277 #o277))
;; 0xFFFE and 0xFFFF
(#(#xFFFE) complete
@ -609,7 +609,6 @@
eof
(list->string l)))
#;
(let ([utf-8-iconv (bytes-open-converter "UTF-8" "UTF-8")]
[utf-8-iconv-p (bytes-open-converter "UTF-8-permissive" "UTF-8")])
;; First, check some simple conversions
@ -667,7 +666,7 @@
(test 0 bytes-utf-8-index s 0)
(let-values ([(s2 n status) (bytes-convert utf-8-iconv s)])
(test (case parse-status
[(error/aborts surrogate) 'error]
[(error/aborts surrogate1 surrogate2) 'error]
[else parse-status ])
'status status))
(let ([convert
@ -701,7 +700,7 @@
(bytes-convert utf-8-iconv-p (bytes-append prefix s))]
[(pl) (bytes-length prefix)])
(case parse-status
[(error surrogate)
[(error surrogate1 surrogate2)
(test 'complete 'status status)
(test (+ (bytes-length s) pl) 'n n)
(test (+ (vector-length code-points) pl) bytes-length s2)
@ -863,9 +862,38 @@
(let ([c (bytes-open-converter "platform-UTF-8" "platform-UTF-16")])
(let-values ([(s2 n status) (bytes-convert c s)])
(case parse-status
[(surrogate)
[(surrogate1 surrogate2)
(if (eq? (system-type) 'windows)
(test 'aborts 'status status)
(begin
(if (eq? parse-status 'surrogate1)
(test 'aborts 'status status)
(test 'complete 'status status))
;; Double the surrogate, and then the "unpaired"
;; surrogates are still allowed, and they're doubled
(let-values ([(s2 n status) (bytes-convert c (bytes-append s s))])
(if (eq? parse-status 'surrogate1)
(begin
(test 3 'ds-count n)
(test 'aborts `(status ,s) status))
(test 'complete `(status ,s) status)))
(let ([try-xtra
(lambda (xtra all? d-size)
(when all?
;; Add non-surrogate to the end, and it should always work:
(let-values ([(s2 n status) (bytes-convert c (bytes-append s xtra))])
(test 'complete 'status status)
(test (+ 3 (bytes-length xtra)) 'cnt n)
(test (* 2 (+ 1 d-size)) bytes-length s2)))
;; Same, even if we only accept 2 output bytes:
(let-values ([(n0 n status) (bytes-convert c (bytes-append s xtra)
0 (+ 3 (bytes-length xtra))
(make-bytes 2))])
(test 'continues 'status status)
(test 3 'cnt n)
(test 2 'got n0)))])
(try-xtra #"a" #t 1)
(try-xtra #"\360\220\200\200" #t 2)
(try-xtra #"\355\240\200" #f 1)))
(test 'error 'status status))]
[(error error/aborts)
(test 'error 'status status)]
@ -877,17 +905,18 @@
;; complete => used all bytes
(test n 'count (bytes-length s))
;; complete => unconverting to return to original
(let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
(let-values ([(s3 n2 status2) (bytes-convert c2 s2)])
(test s `(UTF-16-round-trip ,s2) s3)
(test n2 'count (bytes-length s2)))
;; Try partial unconvert
(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (sub1 (bytes-length s2)))])
(test 'aborts `(status ,s2) status4))
;; Try unconvert into to-small buffer:
(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (bytes-length s2)
(make-bytes (sub1 (bytes-length s))))])
(test 'continues 'status status4)))
(unless (memq parse-status '(surrogate1 surrogate2))
(let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
(let-values ([(s3 n2 status2) (bytes-convert c2 s2)])
(test s `(UTF-16-round-trip ,s2) s3)
(test n2 'count (bytes-length s2)))
;; Try partial unconvert
(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (sub1 (bytes-length s2)))])
(test 'aborts `(status ,s2) status4))
;; Try unconvert into to-small buffer:
(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (bytes-length s2)
(make-bytes (sub1 (bytes-length s))))])
(test 'continues 'status status4))))
;; complete => can add bytes onto the end
(let-values ([(s2 n status) (bytes-convert c (bytes-append s #"x"))])
(test 'complete 'status status)
@ -912,7 +941,7 @@
;; Should be the same as decoding corrected UTF-8:
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
(bytes->string/utf-8 s #\?)))])
(test s3 'permissive s2)))))))))
(test s3 `(permissive ,s) s2)))))))))
basic-utf-8-tests))
;; Further UTF-16 tests
@ -926,7 +955,7 @@
(when (eq? 'windows (system-type))
(test-values (list #"" 0 'aborts)
(lambda () (bytes-convert c (integer->integer-bytes #xD8FF 2 #f))))
(test-values (list #"..." 0 'aborts)
(test-values (list #"\355\277\277" 2 'complete)
(lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f)))))
;; Non-windows: after #xD800 bits, surrogate pair is assumed
(unless (eq? 'windows (system-type))
@ -949,6 +978,29 @@
(bytes-convert c
(bytes-append (integer->integer-bytes #xDC00 2 #f)
(integer->integer-bytes #x1000 2 #f))))))))
(when (eq? (system-type) 'windows)
(let ([c (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")])
;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none
(test-values (list 12 6 'complete)
(lambda ()
(bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes 12))))
;; If we can't look all the way to the end, reliably abort without writing:
(let ([s (make-bytes 12 (char->integer #\x))])
(let loop ([n 1])
(unless (= n 6)
(test-values (list 0 0 'aborts)
(lambda ()
(bytes-convert c #"\355\240\200\355\260\200" 0 n s)))
(test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
(loop (add1 n)))))
(let ([s (make-bytes 12 (char->integer #\x))])
(let loop ([n 0])
(unless (= n 12)
(test-values (list 0 0 'continues)
(lambda ()
(bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes n))))
(test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
(loop (add1 n)))))))
;; Seems like this sort of thing should be covered above, and maybe it
;; it after some other corrections. But just in case: