1644 lines
47 KiB
Scheme
1644 lines
47 KiB
Scheme
|
|
(load-relative "loadtest.ss")
|
|
|
|
(Section 'unicode)
|
|
|
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; UTF-8 boundary tests based on Markus Kuhn's test suite
|
|
|
|
(define basic-utf-8-tests
|
|
'((#(#x3ba #x1f79 #x3c3 #x3bc #x3b5) complete
|
|
(#o316 #o272 #o341 #o275 #o271 #o317 #o203 #o316 #o274 #o316 #o265))
|
|
(#(0) complete
|
|
(#o0))
|
|
(#(#x00000080) complete
|
|
(#o302 #o200))
|
|
(#(#x00000800) complete
|
|
(#o340 #o240 #o200))
|
|
(#(#x00010000) complete
|
|
(#o360 #o220 #o200 #o200))
|
|
(#(#x0010FFFF) complete
|
|
(#o364 #o217 #o277 #o277))
|
|
(#(#f #f #f #f) error ; would be #x00110000 without UTF-8 limit
|
|
(#o364 #o220 #o200 #o200))
|
|
(#(#f #f #f #f #f) error
|
|
(#o370 #o210 #o200 #o200 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o374 #o204 #o200 #o200 #o200 #o200))
|
|
(#(#x0000007F) complete
|
|
(127))
|
|
(#(#x000007FF) complete
|
|
(#o337 #o277))
|
|
(#(#f #f #f #f) error ; #x001FFFFF
|
|
(#o367 #o277 #o277 #o277))
|
|
(#(#f #f #f #f #f) error ; #x03FFFFFF
|
|
(#o373 #o277 #o277 #o277 #o277))
|
|
(#(#f #f #f #f #f #f) error ; #x7FFFFFFF
|
|
(#o375 #o277 #o277 #o277 #o277 #o277))
|
|
(#(#x0000D7FF) complete
|
|
(#o355 #o237 #o277))
|
|
(#(#x0000E000) complete
|
|
(#o356 #o200 #o200))
|
|
(#(#x0000FFFD) complete
|
|
(#o357 #o277 #o275))
|
|
(#(#x0010FFFF) complete
|
|
(#o364 #o217 #o277 #o277))
|
|
;; Missing start byte
|
|
(#(#f) error
|
|
(#o200))
|
|
(#(#f) error
|
|
(#o277))
|
|
(#(#f #f) error
|
|
(#o200 #o277))
|
|
(#(#f #f #f) error
|
|
(#o200 #o277 #o200))
|
|
(#(#f #f #f #f) error
|
|
(#o200 #o277 #o200 #o277))
|
|
(#(#f #f #f #f #f) error
|
|
(#o200 #o277 #o200 #o277 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o200 #o277 #o200 #o277 #o200 #o277))
|
|
(#(#f #f #f #f #f #f #f) error
|
|
(#o200 #o277 #o200 #o277 #o200 #o277 #o200))
|
|
(#(#f) error
|
|
(#o200))
|
|
(#(#f) error
|
|
(#o201))
|
|
(#(#f) error
|
|
(#o202))
|
|
(#(#f) error
|
|
(#o203))
|
|
(#(#f) error
|
|
(#o204))
|
|
(#(#f) error
|
|
(#o205))
|
|
(#(#f) error
|
|
(#o206))
|
|
(#(#f) error
|
|
(#o207))
|
|
(#(#f) error
|
|
(#o210))
|
|
(#(#f) error
|
|
(#o211))
|
|
(#(#f) error
|
|
(#o212))
|
|
(#(#f) error
|
|
(#o213))
|
|
(#(#f) error
|
|
(#o214))
|
|
(#(#f) error
|
|
(#o215))
|
|
(#(#f) error
|
|
(#o216))
|
|
(#(#f) error
|
|
(#o217))
|
|
(#(#f) error
|
|
(#o220))
|
|
(#(#f) error
|
|
(#o221))
|
|
(#(#f) error
|
|
(#o222))
|
|
(#(#f) error
|
|
(#o223))
|
|
(#(#f) error
|
|
(#o224))
|
|
(#(#f) error
|
|
(#o225))
|
|
(#(#f) error
|
|
(#o226))
|
|
(#(#f) error
|
|
(#o227))
|
|
(#(#f) error
|
|
(#o230))
|
|
(#(#f) error
|
|
(#o231))
|
|
(#(#f) error
|
|
(#o232))
|
|
(#(#f) error
|
|
(#o233))
|
|
(#(#f) error
|
|
(#o234))
|
|
(#(#f) error
|
|
(#o235))
|
|
(#(#f) error
|
|
(#o236))
|
|
(#(#f) error
|
|
(#o237))
|
|
(#(#f) error
|
|
(#o240))
|
|
(#(#f) error
|
|
(#o241))
|
|
(#(#f) error
|
|
(#o242))
|
|
(#(#f) error
|
|
(#o243))
|
|
(#(#f) error
|
|
(#o244))
|
|
(#(#f) error
|
|
(#o245))
|
|
(#(#f) error
|
|
(#o246))
|
|
(#(#f) error
|
|
(#o247))
|
|
(#(#f) error
|
|
(#o250))
|
|
(#(#f) error
|
|
(#o251))
|
|
(#(#f) error
|
|
(#o252))
|
|
(#(#f) error
|
|
(#o253))
|
|
(#(#f) error
|
|
(#o254))
|
|
(#(#f) error
|
|
(#o255))
|
|
(#(#f) error
|
|
(#o256))
|
|
(#(#f) error
|
|
(#o257))
|
|
(#(#f) error
|
|
(#o260))
|
|
(#(#f) error
|
|
(#o261))
|
|
(#(#f) error
|
|
(#o262))
|
|
(#(#f) error
|
|
(#o263))
|
|
(#(#f) error
|
|
(#o264))
|
|
(#(#f) error
|
|
(#o265))
|
|
(#(#f) error
|
|
(#o266))
|
|
(#(#f) error
|
|
(#o267))
|
|
(#(#f) error
|
|
(#o270))
|
|
(#(#f) error
|
|
(#o271))
|
|
(#(#f) error
|
|
(#o272))
|
|
(#(#f) error
|
|
(#o273))
|
|
(#(#f) error
|
|
(#o274))
|
|
(#(#f) error
|
|
(#o275))
|
|
(#(#f) error
|
|
(#o276))
|
|
(#(#f) error
|
|
(#o277))
|
|
;; 2-byte seqs with no continuation
|
|
(#(#f 32) error
|
|
(#o300 #o40))
|
|
(#(#f 32) error
|
|
(#o301 #o40))
|
|
(#(#f 32) error
|
|
(#o302 #o40))
|
|
(#(#f 32) error
|
|
(#o303 #o40))
|
|
(#(#f 32) error
|
|
(#o304 #o40))
|
|
(#(#f 32) error
|
|
(#o305 #o40))
|
|
(#(#f 32) error
|
|
(#o306 #o40))
|
|
(#(#f 32) error
|
|
(#o307 #o40))
|
|
(#(#f 32) error
|
|
(#o310 #o40))
|
|
(#(#f 32) error
|
|
(#o311 #o40))
|
|
(#(#f 32) error
|
|
(#o312 #o40))
|
|
(#(#f 32) error
|
|
(#o313 #o40))
|
|
(#(#f 32) error
|
|
(#o314 #o40))
|
|
(#(#f 32) error
|
|
(#o315 #o40))
|
|
(#(#f 32) error
|
|
(#o316 #o40))
|
|
(#(#f 32) error
|
|
(#o317 #o40))
|
|
(#(#f 32) error
|
|
(#o320 #o40))
|
|
(#(#f 32) error
|
|
(#o321 #o40))
|
|
(#(#f 32) error
|
|
(#o322 #o40))
|
|
(#(#f 32) error
|
|
(#o323 #o40))
|
|
(#(#f 32) error
|
|
(#o324 #o40))
|
|
(#(#f 32) error
|
|
(#o325 #o40))
|
|
(#(#f 32) error
|
|
(#o326 #o40))
|
|
(#(#f 32) error
|
|
(#o327 #o40))
|
|
(#(#f 32) error
|
|
(#o330 #o40))
|
|
(#(#f 32) error
|
|
(#o331 #o40))
|
|
(#(#f 32) error
|
|
(#o332 #o40))
|
|
(#(#f 32) error
|
|
(#o333 #o40))
|
|
(#(#f 32) error
|
|
(#o334 #o40))
|
|
(#(#f 32) error
|
|
(#o335 #o40))
|
|
(#(#f 32) error
|
|
(#o336 #o40))
|
|
(#(#f 32) error
|
|
(#o337 #o40))
|
|
;; 3-byte seqs with no continuation
|
|
(#(#f 32) error
|
|
(#o340 #o40))
|
|
(#(#f 32) error
|
|
(#o341 #o40))
|
|
(#(#f 32) error
|
|
(#o342 #o40))
|
|
(#(#f 32) error
|
|
(#o343 #o40))
|
|
(#(#f 32) error
|
|
(#o344 #o40))
|
|
(#(#f 32) error
|
|
(#o345 #o40))
|
|
(#(#f 32) error
|
|
(#o346 #o40))
|
|
(#(#f 32) error
|
|
(#o347 #o40))
|
|
(#(#f 32) error
|
|
(#o350 #o40))
|
|
(#(#f 32) error
|
|
(#o351 #o40))
|
|
(#(#f 32) error
|
|
(#o352 #o40))
|
|
(#(#f 32) error
|
|
(#o353 #o40))
|
|
(#(#f 32) error
|
|
(#o354 #o40))
|
|
(#(#f 32) error
|
|
(#o355 #o40))
|
|
(#(#f 32) error
|
|
(#o356 #o40))
|
|
(#(#f 32) error
|
|
(#o357 #o40))
|
|
;; 3-byte seqs with partial continuation
|
|
(#(#f #f 32) error
|
|
(#o340 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o341 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o342 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o343 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o344 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o345 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o346 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o347 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o350 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o351 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o352 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o353 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o354 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o355 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o356 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o357 #o203 #o40))
|
|
;; 4-byte seq with no continuations
|
|
(#(#f 32) error
|
|
(#o360 #o40))
|
|
(#(#f 32) error
|
|
(#o361 #o40))
|
|
(#(#f 32) error
|
|
(#o362 #o40))
|
|
(#(#f 32) error
|
|
(#o363 #o40))
|
|
(#(#f 32) error
|
|
(#o364 #o40))
|
|
(#(#f 32) error
|
|
(#o365 #o40))
|
|
(#(#f 32) error
|
|
(#o366 #o40))
|
|
(#(#f 32) error
|
|
(#o367 #o40))
|
|
;; 4-byte seq with only 1 continuation
|
|
(#(#f #f 32) error
|
|
(#o360 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o361 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o362 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o363 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o364 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o365 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o366 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o367 #o203 #o40))
|
|
;; 4-byte seq with only 2 continuation
|
|
(#(#f #f #f 32) error
|
|
(#o360 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o361 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o362 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o363 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o364 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o365 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o366 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o367 #o203 #o203 #o40))
|
|
;; 5-byte seqs with no continuation
|
|
(#(#f 32) error
|
|
(#o370 #o40))
|
|
(#(#f 32) error
|
|
(#o371 #o40))
|
|
(#(#f 32) error
|
|
(#o372 #o40))
|
|
(#(#f 32) error
|
|
(#o373 #o40))
|
|
;; 5-byte seqs with only 1 continuation
|
|
(#(#f #f 32) error
|
|
(#o370 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o371 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o372 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o373 #o203 #o40))
|
|
;; 5-byte seqs with only 2 continuations
|
|
(#(#f #f #f 32) error
|
|
(#o370 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o371 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o372 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o373 #o203 #o203 #o40))
|
|
;; 5-byte seqs with only 3 continuations
|
|
(#(#f #f #f #f 32) error
|
|
(#o370 #o203 #o203 #o203 #o40))
|
|
(#(#f #f #f #f 32) error
|
|
(#o371 #o203 #o203 #o203 #o40))
|
|
(#(#f #f #f #f 32) error
|
|
(#o372 #o203 #o203 #o203 #o40))
|
|
(#(#f #f #f #f 32) error
|
|
(#o373 #o203 #o203 #o203 #o40))
|
|
;; 6-byte seqs with no continuation
|
|
(#(#f 32) error
|
|
(#o374 #o40))
|
|
(#(#f 32) error
|
|
(#o375 #o40))
|
|
;; 6-byte seqs with only 1 continuation
|
|
(#(#f #f 32) error
|
|
(#o374 #o203 #o40))
|
|
(#(#f #f 32) error
|
|
(#o375 #o203 #o40))
|
|
;; 6-byte seqs with only 2 continuation
|
|
(#(#f #f #f 32) error
|
|
(#o374 #o203 #o203 #o40))
|
|
(#(#f #f #f 32) error
|
|
(#o375 #o203 #o203 #o40))
|
|
;; 6-byte seqs with only 3 continuation
|
|
(#(#f #f #f #f 32) error
|
|
(#o374 #o203 #o203 #o203 #o40))
|
|
(#(#f #f #f #f 32) error
|
|
(#o375 #o203 #o203 #o203 #o40))
|
|
;; 6-byte seqs with only 4 continuation
|
|
(#(#f #f #f #f #f 32) error
|
|
(#o374 #o203 #o203 #o203 #o203 #o40))
|
|
(#(#f #f #f #f #f 32) error
|
|
(#o375 #o203 #o203 #o203 #o203 #o40))
|
|
;; Sequences with last continuation byte missing, eol instead of space
|
|
(#(#f) error ; aborts
|
|
(#o300))
|
|
(#(#f) aborts
|
|
(#o310))
|
|
(#(#f #f) error ; aborts
|
|
(#o340 #o200))
|
|
(#(#f #f) aborts
|
|
(#o350 #o200))
|
|
(#(#f #f) aborts
|
|
(#o340 #o240))
|
|
(#(#f) aborts
|
|
(#o340))
|
|
(#(#f #f #f) error ; aborts
|
|
(#o360 #o200 #o200))
|
|
(#(#f #f #f) aborts
|
|
(#o361 #o200 #o200))
|
|
(#(#f #f) error ; aborts
|
|
(#o360 #o200))
|
|
(#(#f #f) aborts
|
|
(#o361 #o200))
|
|
(#(#f) aborts
|
|
(#o360))
|
|
(#(#f) aborts
|
|
(#o361))
|
|
(#(#f #f #f #f) error ; aborts
|
|
(#o370 #o200 #o200 #o200))
|
|
(#(#f #f #f #f) error ;; aborts
|
|
(#o371 #o200 #o200 #o200))
|
|
(#(#f) error ;; aborts
|
|
(#o370))
|
|
(#(#f) error ;; aborts
|
|
(#o371))
|
|
(#(#f #f) error ; aborts
|
|
(#o370 #o200))
|
|
(#(#f #f) error ;; aborts
|
|
(#o371 #o200))
|
|
(#(#f #f #f) error ; aborts
|
|
(#o370 #o200 #o200))
|
|
(#(#f #f #f) error ;; aborts
|
|
(#o371 #o200 #o200))
|
|
(#(#f #f #f #f #f) error ; aborts
|
|
(#o374 #o200 #o200 #o200 #o200))
|
|
(#(#f #f #f #f #f) error ;; aborts
|
|
(#o375 #o200 #o200 #o200 #o200))
|
|
(#(#f) error ;; aborts
|
|
(#o374))
|
|
(#(#f) error ;; aborts
|
|
(#o375))
|
|
(#(#f #f) error ; aborts
|
|
(#o374 #o200))
|
|
(#(#f #f) error ;; aborts
|
|
(#o375 #o200))
|
|
(#(#f #f #f) error ; aborts
|
|
(#o374 #o200 #o200))
|
|
(#(#f #f #f) error ;; aborts
|
|
(#o375 #o200 #o200))
|
|
(#(#f #f #f #f) error ; aborts
|
|
(#o374 #o200 #o200 #o200))
|
|
(#(#f #f #f #f) error ; aborts
|
|
(#o375 #o200 #o200 #o200))
|
|
(#(#f) aborts
|
|
(#o337))
|
|
(#(#f #f) aborts
|
|
(#o357 #o277))
|
|
(#(#f #f #f) aborts
|
|
(#o364 #o277 #o277))
|
|
(#(#f #f #f) error ;; aborts
|
|
(#o365 #o277 #o277))
|
|
(#(#f #f #f) error ;; aborts
|
|
(#o366 #o277 #o277))
|
|
(#(#f #f #f) error ;; aborts
|
|
(#o367 #o277 #o277))
|
|
(#(#f #f #f #f) error ;; aborts
|
|
(#o373 #o277 #o277 #o277))
|
|
(#(#f #f #f #f #f) error ; aborts
|
|
(#o375 #o277 #o277 #o277 #o277))
|
|
;; Concatenation of incomplete sequences
|
|
(#(#f #f #f #f #f #f #f #f #f) error/aborts
|
|
(#o300 #o340 #o200 #o360 #o200 #o200 #o361 #o200 #o200)
|
|
3)
|
|
;; Impossible bytes
|
|
(#(#f) error
|
|
(#o376))
|
|
(#(#f) error
|
|
(#o377))
|
|
(#(#f #f #f #f) error
|
|
(#o376 #o376 #o377 #o377))
|
|
;; Overlong
|
|
(#(#f #f) error
|
|
(#o300 #o257))
|
|
(#(#f #f #f) error
|
|
(#o340 #o200 #o257))
|
|
(#(#f #f #f #f) error
|
|
(#o360 #o200 #o200 #o257))
|
|
(#(#f #f #f #f #f) error
|
|
(#o370 #o200 #o200 #o200 #o257))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o374 #o200 #o200 #o200 #o200 #o257))
|
|
(#(#f #f) error
|
|
(#o301 #o277))
|
|
(#(#f #f #f) error
|
|
(#o340 #o237 #o277))
|
|
(#(#f #f #f #f) error
|
|
(#o360 #o217 #o277 #o277))
|
|
(#(#f #f #f #f #f) error
|
|
(#o370 #o207 #o277 #o277 #o277))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o374 #o203 #o277 #o277 #o277 #o277))
|
|
(#(#f #f) error
|
|
(#o300 #o200))
|
|
(#(#f #f #f) error
|
|
(#o340 #o200 #o200))
|
|
(#(#f #f #f #f) error
|
|
(#o360 #o200 #o200 #o200))
|
|
(#(#f #f #f #f #f) error
|
|
(#o370 #o200 #o200 #o200 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o374 #o200 #o200 #o200 #o200 #o200))
|
|
;; illedgal surrogates
|
|
(#(#f #f #f) surrogate1
|
|
(#o355 #o240 #o200))
|
|
(#(#f #f #f) surrogate1
|
|
(#o355 #o255 #o277))
|
|
(#(#f #f #f) surrogate1
|
|
(#o355 #o256 #o200))
|
|
(#(#f #f #f) surrogate1
|
|
(#o355 #o257 #o277))
|
|
(#(#f #f #f) surrogate2
|
|
(#o355 #o260 #o200))
|
|
(#(#f #f #f) surrogate2
|
|
(#o355 #o276 #o200))
|
|
(#(#f #f #f) surrogate2
|
|
(#o355 #o277 #o277))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o240 #o200 #o355 #o260 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o240 #o200 #o355 #o277 #o277))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o255 #o277 #o355 #o260 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o255 #o277 #o355 #o277 #o277))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o256 #o200 #o355 #o260 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o256 #o200 #o355 #o277 #o277))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o257 #o277 #o355 #o260 #o200))
|
|
(#(#f #f #f #f #f #f) error
|
|
(#o355 #o257 #o277 #o355 #o277 #o277))
|
|
;; 0xFFFE and 0xFFFF
|
|
(#(#xFFFE) complete
|
|
(#o357 #o277 #o276))
|
|
(#(#xFFFF) complete
|
|
(#o357 #o277 #o277))))
|
|
|
|
(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF
|
|
|
|
(define (string->print s) (map char->integer (string->list s)))
|
|
|
|
(define bytes->unicode-vector
|
|
(case-lambda
|
|
[(s)
|
|
(list->vector (map char->integer (string->list (bytes->string/utf-8 s))))]
|
|
[(s start)
|
|
(bytes->unicode-vector (subbytes s start))]
|
|
[(s start end)
|
|
(bytes->unicode-vector (subbytes s start end))]))
|
|
(define (bytes-any->unicode-vector s non-v)
|
|
(list->vector
|
|
(map (lambda (x) (if (= x (char->integer #\?)) non-v x))
|
|
(map char->integer (string->list (bytes->string/utf-8 s #\?))))))
|
|
(define (unicode-vector->bytes v)
|
|
(string->bytes/utf-8 (list->string (map integer->char (vector->list v)))))
|
|
|
|
(define (list->string/eof l)
|
|
(if (and (pair? l) (eof-object? (car l)) (null? (cdr l)))
|
|
eof
|
|
(list->string l)))
|
|
|
|
(let ([utf-8-iconv (bytes-open-converter "UTF-8" "UTF-8")]
|
|
[utf-8-iconv-p (bytes-open-converter "UTF-8-permissive" "UTF-8")])
|
|
;; First, check some simple conversions
|
|
(let-values ([(z1 z2 more) (bytes-convert utf-8-iconv #"xyz" 0 3 (bytes))])
|
|
(test '(0 0 continues) list z1 z2 more))
|
|
(let-values ([(z1 z2 more) (bytes-convert utf-8-iconv #"xyz" 0 3 (bytes 0))])
|
|
(test '(1 1 continues) list z1 z2 more))
|
|
(let-values ([(z1 z2 more) (bytes-convert utf-8-iconv #"xyz" 0 3 (bytes 0 0 0))])
|
|
(test '(3 3 complete) list z1 z2 more))
|
|
|
|
;; The real tests:
|
|
(for-each (lambda (p)
|
|
(let ([code-points (car p)]
|
|
[parse-status (cadr p)]
|
|
[s (apply bytes (caddr p))])
|
|
(if (and (positive? (vector-length code-points))
|
|
(vector-ref code-points 0))
|
|
(begin
|
|
(test (vector-length code-points) bytes-utf-8-length s)
|
|
(test code-points bytes->unicode-vector s)
|
|
(test code-points bytes-any->unicode-vector s #f)
|
|
(test s unicode-vector->bytes code-points)
|
|
(test 0 bytes-utf-8-index s 0)
|
|
(test (bytes-length s) bytes-utf-8-index
|
|
(bytes-append s #"x")
|
|
(vector-length code-points))
|
|
(if ((vector-length code-points) . > . 1)
|
|
(begin
|
|
(test (integer->char (vector-ref code-points 0))
|
|
bytes-utf-8-ref s 0)
|
|
(test (integer->char (vector-ref code-points
|
|
(sub1 (vector-length code-points))))
|
|
bytes-utf-8-ref s (sub1 (vector-length code-points)))
|
|
(let ([post-1 (bytes-utf-8-index s 1)])
|
|
(test #t positive? post-1)
|
|
(test (list->vector (cdr (vector->list code-points)))
|
|
bytes->unicode-vector s post-1))
|
|
(let ([last-1 (bytes-utf-8-index s (sub1 (vector-length code-points)))])
|
|
(test #t positive? last-1)
|
|
(test code-points
|
|
list->vector (append
|
|
(vector->list (bytes->unicode-vector s 0 last-1))
|
|
(vector->list (bytes->unicode-vector s last-1))))))
|
|
(test #f bytes-utf-8-index s 1))
|
|
(test-values (list s (bytes-length s) 'complete)
|
|
(lambda () (bytes-convert utf-8-iconv s)))
|
|
(test (list s) regexp-match #rx"^.*$" s))
|
|
(begin
|
|
(test code-points bytes-any->unicode-vector s #f)
|
|
(test (list->vector (append '(97) (vector->list code-points) '(98)))
|
|
bytes-any->unicode-vector (bytes-append #"a" s #"b") #f)
|
|
(test (list->vector (append (vector->list code-points) (vector->list code-points)))
|
|
bytes-any->unicode-vector (bytes-append s s) #f)
|
|
(test #f bytes-utf-8-length s)
|
|
(test 0 bytes-utf-8-index s 0)
|
|
(let-values ([(s2 n status) (bytes-convert utf-8-iconv s)])
|
|
(test (case parse-status
|
|
[(error/aborts surrogate1 surrogate2) 'error]
|
|
[else parse-status ])
|
|
'status status))
|
|
(let ([convert
|
|
(lambda (prefix)
|
|
(test (+ (vector-length code-points) (bytes-length prefix))
|
|
bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
|
|
(test (vector-length code-points)
|
|
bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
|
|
(test (if (equal? prefix #"")
|
|
#f
|
|
(integer->char (bytes-ref prefix 0)))
|
|
bytes-utf-8-ref (bytes-append prefix s) 0)
|
|
(test (if (equal? prefix #"")
|
|
(if (equal? #() code-points)
|
|
#f
|
|
#\uFFFD)
|
|
(integer->char (bytes-ref prefix 0)))
|
|
bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
|
|
(test (if (equal? #() code-points)
|
|
(if (equal? #"" prefix)
|
|
#f
|
|
(integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
|
|
(integer->char
|
|
(or (vector-ref code-points (sub1 (vector-length code-points)))
|
|
(char->integer #\uFFFD))))
|
|
bytes-utf-8-ref (bytes-append prefix s)
|
|
(max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
|
|
#\uFFFD)
|
|
|
|
(let-values ([(s2 n status)
|
|
(bytes-convert utf-8-iconv-p (bytes-append prefix s))]
|
|
[(pl) (bytes-length prefix)])
|
|
(case parse-status
|
|
[(error surrogate1 surrogate2)
|
|
(test 'complete 'status status)
|
|
(test (+ (bytes-length s) pl) 'n n)
|
|
(test (+ (* replace-size (vector-length code-points))
|
|
(if (and (positive? (vector-length code-points))
|
|
(eq? (vector-ref code-points (sub1 (vector-length code-points)))
|
|
#o40))
|
|
;; space at end is converted, not replaced by #\xFFFD
|
|
(- 1 replace-size)
|
|
0)
|
|
pl)
|
|
bytes-length s2)
|
|
(test (append (bytes->list prefix)
|
|
(map
|
|
(lambda (i) (or i (char->integer #\uFFFD)))
|
|
(vector->list code-points)))
|
|
vector->list (bytes->unicode-vector s2))]
|
|
[(error/aborts)
|
|
(test 'aborts 'status status)
|
|
(let ([code-points (list->vector
|
|
(reverse
|
|
(list-tail
|
|
(reverse
|
|
(vector->list code-points))
|
|
;; indicates how many to be unused due to abort:
|
|
(cadddr p))))])
|
|
(test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
|
|
(test (append (bytes->list prefix)
|
|
(map
|
|
(lambda (i) (or i (char->integer #\uFFFD)))
|
|
(vector->list code-points)))
|
|
vector->list (bytes->unicode-vector s2)))]
|
|
[else
|
|
(test parse-status 'status status)])))])
|
|
(convert #"")
|
|
(convert #"so"))
|
|
(let-values ([(s2 n status) (bytes-convert utf-8-iconv-p (bytes-append s #"xxxxx"))])
|
|
(test 'complete 'status status)
|
|
(test '(#"xxxxx") regexp-match #rx#"xxxxx$" s2)
|
|
(test (+ 5 (bytes-length s)) 'n n))
|
|
(test #f regexp-match #rx"^.*$" s)))
|
|
;; Test byte reading and port positions
|
|
(let ([v (bytes-any->unicode-vector s #f)])
|
|
(define (check-full-read read-all-bytes)
|
|
(let ([p (open-input-bytes s)])
|
|
(port-count-lines! p)
|
|
(read-all-bytes p)
|
|
(let-values ([(l c p) (port-next-location p)])
|
|
(test (vector-length v) 'c c)
|
|
(test (add1 (vector-length v)) 'p p)))
|
|
(let ([p (open-input-string (format "\t~a\t" s))])
|
|
(port-count-lines! p)
|
|
(read-all-bytes p)
|
|
(let-values ([(l c p) (port-next-location p)])
|
|
(test p 'p (add1 (+ 2 (vector-length v))))
|
|
(test c 'tab (+ 16
|
|
(- (vector-length v)
|
|
(bitwise-and (vector-length v) 7))))))
|
|
(let ([p (open-input-string (format "~a\t~a" s s))])
|
|
(port-count-lines! p)
|
|
(read-all-bytes p)
|
|
(let-values ([(l c p) (port-next-location p)])
|
|
(test p 'p (add1 (+ 1 (* 2 (vector-length v)))))
|
|
(test c 'tab (+ 8
|
|
(- (vector-length v)
|
|
(bitwise-and (vector-length v) 7))
|
|
(vector-length v)))))
|
|
(let ([p (open-input-string (format "~a\n~a" s s))])
|
|
(port-count-lines! p)
|
|
(read-all-bytes p)
|
|
(let-values ([(l c p) (port-next-location p)])
|
|
(test p 'p (+ 2 (* 2 (vector-length v))))
|
|
(test c 'cr (vector-length v)))))
|
|
(check-full-read (lambda (p) (read-bytes 500 p)))
|
|
(check-full-read (lambda (p)
|
|
(let loop ()
|
|
(unless (eof-object? (read-byte p))
|
|
(loop)))))
|
|
;; Check result char-by-char:
|
|
(let ([p (open-input-bytes s)])
|
|
(port-count-lines! p)
|
|
(let loop ([old-c 0][old-p 1])
|
|
(let ([i (read-char p)])
|
|
(let-values ([(l c p) (port-next-location p)])
|
|
(if (eof-object? i)
|
|
(begin
|
|
(test (cons s old-c) values (cons s c))
|
|
(test (cons s old-p) values (cons s p)))
|
|
(begin
|
|
(test (cons s (add1 old-c)) values (cons s c))
|
|
(test (cons s (add1 old-p)) values (cons s p))
|
|
(loop c p))))))))
|
|
;; Test read-string decoding
|
|
(let ([us (apply string (map (lambda (i)
|
|
(if i (integer->char i) #\uFFFD))
|
|
(vector->list code-points)))])
|
|
(test us bytes->string/utf-8 s #\uFFFD)
|
|
(test us read-string (vector-length code-points) (open-input-bytes s))
|
|
(test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
|
|
(unless (string=? "" us)
|
|
(test (substring us 0 1) read-string 1 (open-input-bytes s)))
|
|
(let ([go (lambda (read-string)
|
|
(let-values ([(in out) (make-pipe)])
|
|
(let loop ([i 0])
|
|
(unless (= i 10)
|
|
(write-bytes s out)
|
|
(loop (add1 i))))
|
|
(close-output-port out)
|
|
(test #t 'read-10
|
|
(let loop ([i 0])
|
|
(if (= i 10)
|
|
(let ([v (read-string 1 in)])
|
|
(or (eof-object? v)
|
|
(and (test eof `(eof for ,(caddr p) = ,us with ,read-string) v)
|
|
#t)))
|
|
(let ([rs (read-string (string-length us) in)])
|
|
(and (or (equal? us rs)
|
|
(test (string->print us) `(,i of 10 for ,(caddr p) with ,read-string)
|
|
(string->print rs)))
|
|
(loop (add1 i)))))))))])
|
|
(go read-string)
|
|
;; Peek-string at front, consistent with read-string:
|
|
(go (lambda (n p)
|
|
(let ([a (peek-string n 0 p)]
|
|
[b (read-string n p)])
|
|
(if (equal? a b)
|
|
a
|
|
(list a b)))))
|
|
;; Peek-string, using offset calculated from s
|
|
(go (let ([pos 0])
|
|
(lambda (n p)
|
|
(begin0
|
|
(peek-string n pos p)
|
|
(set! pos (+ (bytes-length s) pos))))))
|
|
;; Read-char
|
|
(go (lambda (n p)
|
|
(list->string/eof
|
|
(let loop ([i 0])
|
|
(if (= i n)
|
|
null
|
|
(cons (read-char p) (loop (add1 i))))))))
|
|
;; Peek-char, consistent with read-char
|
|
(go (lambda (n p)
|
|
(list->string/eof
|
|
(let loop ([i 0])
|
|
(if (= i n)
|
|
null
|
|
(let ([a (peek-char p 0)]
|
|
[b (read-char p)])
|
|
(if (equal? a b)
|
|
(cons a (loop (add1 i)))
|
|
(list* #\* a b #\* (loop (add1 i))))))))))
|
|
;; Read-string, one or two at a time
|
|
(let ([read-n
|
|
(lambda (n p d)
|
|
(let loop ([n n][a null])
|
|
(if (zero? n)
|
|
(apply string-append (reverse a))
|
|
(let ([s (read-string (min d n) p)])
|
|
(if (eof-object? s)
|
|
(if (null? a)
|
|
s
|
|
(loop 0 a))
|
|
(loop (- n (min d n)) (cons s a)))))))])
|
|
(go (lambda (n p) (read-n n p 1)))
|
|
(go (lambda (n p) (read-n n p 2))))))
|
|
;; Test UTF-16
|
|
(let ([c (bytes-open-converter "platform-UTF-8" "platform-UTF-16")])
|
|
(let-values ([(s2 n status) (bytes-convert c s)])
|
|
(case parse-status
|
|
[(surrogate1 surrogate2)
|
|
(if (eq? (system-type) 'windows)
|
|
(begin
|
|
(if (eq? parse-status 'surrogate1)
|
|
(test 'aborts 'status status)
|
|
(test 'complete 'status status))
|
|
;; Double the surrogate, and then the "unpaired"
|
|
;; surrogates are still allowed, and they're doubled
|
|
(let-values ([(s2 n status) (bytes-convert c (bytes-append s s))])
|
|
(if (eq? parse-status 'surrogate1)
|
|
(begin
|
|
(test 3 'ds-count n)
|
|
(test 'aborts `(status ,s) status))
|
|
(test 'complete `(status ,s) status)))
|
|
(let ([try-xtra
|
|
(lambda (xtra all? d-size)
|
|
(when all?
|
|
;; Add non-surrogate to the end, and it should always work:
|
|
(let-values ([(s2 n status) (bytes-convert c (bytes-append s xtra))])
|
|
(test 'complete 'status status)
|
|
(test (+ 3 (bytes-length xtra)) 'cnt n)
|
|
(test (* 2 (+ 1 d-size)) bytes-length s2)))
|
|
;; Same, even if we only accept 2 output bytes:
|
|
(let-values ([(n0 n status) (bytes-convert c (bytes-append s xtra)
|
|
0 (+ 3 (bytes-length xtra))
|
|
(make-bytes 2))])
|
|
(test 'continues 'status status)
|
|
(test 3 'cnt n)
|
|
(test 2 'got n0)))])
|
|
(try-xtra #"a" #t 1)
|
|
(try-xtra #"\360\220\200\200" #t 2)
|
|
(try-xtra #"\355\240\200" #f 1)))
|
|
(test 'error 'status status))]
|
|
[(error error/aborts)
|
|
(test 'error 'status status)]
|
|
[(aborts)
|
|
(test 'aborts 'status status)]
|
|
[else
|
|
(test parse-status 'status status)])
|
|
(when (eq? status 'complete)
|
|
;; complete => used all bytes
|
|
(test n 'count (bytes-length s))
|
|
;; complete => unconverting to return to original
|
|
(unless (memq parse-status '(surrogate1 surrogate2))
|
|
(let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
|
|
(let-values ([(s3 n2 status2) (bytes-convert c2 s2)])
|
|
(test s `(UTF-16-round-trip ,s2) s3)
|
|
(test n2 'count (bytes-length s2)))
|
|
;; Try partial unconvert
|
|
(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (sub1 (bytes-length s2)))])
|
|
(test 'aborts `(status ,s2) status4))
|
|
;; Try unconvert into to-small buffer:
|
|
(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (bytes-length s2)
|
|
(make-bytes (sub1 (bytes-length s))))])
|
|
(test 'continues 'status status4))))
|
|
;; complete => can add bytes onto the end
|
|
(let-values ([(s2 n status) (bytes-convert c (bytes-append s #"x"))])
|
|
(test 'complete 'status status)
|
|
(test n 'count (add1 (bytes-length s))))
|
|
;; complete => need enough room for result
|
|
(when (positive? (bytes-length s2))
|
|
(let ([dest (make-bytes (bytes-length s2))])
|
|
(let-values ([(s3 n2 status) (bytes-convert c s 0 n dest 0 (sub1 (bytes-length s2)))])
|
|
(test 'continues 'status status)
|
|
(when (positive? n2)
|
|
(let-values ([(s4 n3 status) (bytes-convert c s n2 n dest s3)])
|
|
(test 'complete 'status status)
|
|
(test s2 'bytes-append dest)
|
|
(test n + n2 n3)
|
|
(test (bytes-length s2) + s3 s4)))))))
|
|
(when (and (eq? status 'error)
|
|
(not (eq? parse-status 'error/aborts)))
|
|
;; permissive decoder should work:
|
|
(let ([c2 (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")])
|
|
(let-values ([(s2 n status) (bytes-convert c2 s)])
|
|
(test 'complete 'status status)
|
|
;; Should be the same as decoding corrected UTF-8:
|
|
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
|
|
(bytes->string/utf-8 s #\uFFFD)))])
|
|
(test s3 `(permissive ,s) s2)))))))))
|
|
basic-utf-8-tests))
|
|
|
|
;; Further UTF-16 tests
|
|
(let ([c (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
|
|
(let-values ([(s n status) (bytes-convert c (bytes-append
|
|
(integer->integer-bytes #xD800 2 #f)
|
|
(integer->integer-bytes #xDC00 2 #f)))])
|
|
(test-values (list #"" 0 'aborts)
|
|
(lambda () (bytes-convert c (integer->integer-bytes #xD800 2 #f) )))
|
|
;; Windows: unpaired surrogates allowed:
|
|
(when (eq? 'windows (system-type))
|
|
(test-values (list #"" 0 'aborts)
|
|
(lambda () (bytes-convert c (integer->integer-bytes #xD8FF 2 #f))))
|
|
(test-values (list #"\355\277\277" 2 'complete)
|
|
(lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f)))))
|
|
;; Non-windows: after #xD800 bits, surrogate pair is assumed
|
|
(unless (eq? 'windows (system-type))
|
|
(test-values (list #"" 0 'aborts)
|
|
(lambda () (bytes-convert c (integer->integer-bytes #xD800 2 #f))))
|
|
(test-values (list #"" 0 'aborts)
|
|
(lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f) )))
|
|
(test-values (list s 4 'complete)
|
|
(lambda ()
|
|
(bytes-convert c
|
|
(bytes-append (integer->integer-bytes #xD800 2 #f)
|
|
(integer->integer-bytes #xD800 2 #f)))))
|
|
(test-values (list s 4 'complete)
|
|
(lambda ()
|
|
(bytes-convert c
|
|
(bytes-append (integer->integer-bytes #xD800 2 #f)
|
|
(integer->integer-bytes #x0000 2 #f)))))
|
|
(test-values (list s 4 'complete)
|
|
(lambda ()
|
|
(bytes-convert c
|
|
(bytes-append (integer->integer-bytes #xDC00 2 #f)
|
|
(integer->integer-bytes #x1000 2 #f))))))))
|
|
(when (eq? (system-type) 'windows)
|
|
(let ([c (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")])
|
|
;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none
|
|
(test-values (list 12 6 'complete)
|
|
(lambda ()
|
|
(bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes 12))))
|
|
;; If we can't look all the way to the end, reliably abort without writing:
|
|
(let ([s (make-bytes 12 (char->integer #\x))])
|
|
(let loop ([n 1])
|
|
(unless (= n 6)
|
|
(test-values (list 0 0 'aborts)
|
|
(lambda ()
|
|
(bytes-convert c #"\355\240\200\355\260\200" 0 n s)))
|
|
(test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
|
|
(loop (add1 n)))))
|
|
(let ([s (make-bytes 12 (char->integer #\x))])
|
|
(let loop ([n 0])
|
|
(unless (= n 12)
|
|
(test-values (list 0 0 'continues)
|
|
(lambda ()
|
|
(bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes n))))
|
|
(test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
|
|
(loop (add1 n)))))))
|
|
|
|
;; Seems like this sort of thing should be covered above, and maybe it
|
|
;; it after some other corrections. But just in case:
|
|
(let ([check-one
|
|
(lambda (str)
|
|
(for-each
|
|
(lambda (str)
|
|
(for-each
|
|
(lambda (peek?)
|
|
(for-each
|
|
(lambda (byte?)
|
|
(let ([p (open-input-string str)])
|
|
(test str read-string (string-length str) p)
|
|
(test eof
|
|
(if peek?
|
|
(if byte? peek-byte peek-char)
|
|
(if byte? read-byte read-char))
|
|
p)
|
|
(test eof (if byte? read-byte read-char) p)))
|
|
'(#t #f)))
|
|
'(#t #f)))
|
|
(list str
|
|
(string-append str "a")
|
|
(string-append "a" str)
|
|
(string-append str str))))])
|
|
(check-one "a")
|
|
(check-one "\uA4")
|
|
(check-one "\u104")
|
|
(check-one "\u7238")
|
|
(check-one "\Ua7238")
|
|
(check-one "\U107238"))
|
|
|
|
(test '(#o302 #o251) bytes->list (unicode-vector->bytes (vector 169)))
|
|
(test '(#o304 #o250) bytes->list (unicode-vector->bytes (vector 296)))
|
|
|
|
(test "a\0b" bytes->string/utf-8 #"a\xFFb" (integer->char 0))
|
|
|
|
(test '("\uA9") regexp-match #rx"." "\uA9")
|
|
(test '(#"\302") regexp-match #rx#"." #"\302\251")
|
|
(test '(#"\302") regexp-match #rx#"." "\uA9")
|
|
(test '(#"\302\251") regexp-match #rx"." #"\302\251")
|
|
|
|
(test #f regexp-match #rx"[a-z]" "\uA9")
|
|
(test '("\uA9") regexp-match #rx"\uA9" "\uA9")
|
|
(test '("\uA9") regexp-match #rx"\uA9+" "\uA9")
|
|
(test '("\uA9\uA9") regexp-match #rx"\uA9+" "\uA9\uA9")
|
|
(test '("\uA9\uA9") regexp-match #rx"\uA9+" "x\uA9\uA9y")
|
|
(test '("\uA9") regexp-match #rx"[a-z\uA9]" "\uA9mm")
|
|
(test '("\uA9mm") regexp-match #rx"[a-z\uA9]+" "\uA9mm")
|
|
(test '("\uA9") regexp-match #rx"[\uA9-\uE9]+" "xx\uA9mm")
|
|
(test '("\uE9") regexp-match #rx"[\uA9-\uE9]+" "xx\uE9mm")
|
|
(test #f regexp-match #rx"[\uA9-\uE9]+" "xx\u129")
|
|
(test #f regexp-match #rx"[\uA9-\uE9]+" "xx\uA8mm")
|
|
(test '("\uE9\uA9") regexp-match #rx"[\uA9-\u128]+" "xx\uE9\uA9mm")
|
|
|
|
(test #f regexp-match #rx"[^a-z][^a-z]" "\uA9")
|
|
(test #f regexp-match #rx"[^a-z][^a-z]" (string->bytes/utf-8 "\uA9"))
|
|
(test (list (string->bytes/utf-8 "\uA9")) regexp-match #rx#"[^a-z][^a-z]" "\uA9")
|
|
(test (list (string->bytes/utf-8 "\uA9")) regexp-match #rx#"[^a-z][^a-z]" (string->bytes/utf-8 "\uA9"))
|
|
|
|
;; Nots of patterns and ranges:
|
|
(test #f regexp-match #rx"[^a-z\uA9]" "\uA9mm")
|
|
(test '("") regexp-match #rx"[^a-d\uA9]*" "\uA9mm")
|
|
(test '("xx") regexp-match #rx"[^\uA9-\u128]+" "xx\uA9mm")
|
|
(test '("xx") regexp-match #rx"[^\uA9-\u128]+" "xx\303\251mm")
|
|
(test '("xx\u129mm") regexp-match #rx"[^\uA9-\u128]+" "xx\u129mm")
|
|
(test '("xx\uA8mm") regexp-match #rx"[^\uA9-\u128]+" "xx\uA8mm")
|
|
(test '("xx") regexp-match #rx"[^\uA9-\u128]+" "xx\uE9\uA9mm")
|
|
|
|
;; 3-char seqs
|
|
(test '("\u1F79") regexp-match #rx"\u1F79" "a\u1F79\u1F79b")
|
|
(test '("\u1F79\u1F79") regexp-match #rx"\u1F79+" "a\u1F79\u1F79b")
|
|
|
|
(test '("\u1F79\uA9\u1F79r") regexp-match #rx"[c-\u1F79]+" "a\u1F79\uA9\u1F79r")
|
|
(test '("d\u1F79\uA9\u1F79r") regexp-match #rx"[c-\u1F79]+" "d\u1F79\uA9\u1F79r")
|
|
|
|
(test '("\u2F79") regexp-match #rx"[\u1F79-\u3F79]" "\u2F79")
|
|
(test '("\u1F79") regexp-match #rx"[\u1F79-\u3F79]" "\u1F79")
|
|
(test #f regexp-match #rx"[\u1F79-\u3F79]" "\u1F39")
|
|
(test #f regexp-match #rx"[\u1F79-\u3F79]" "\u1F78")
|
|
|
|
;; Nots of 3-char seqs:
|
|
(test #f regexp-match #rx"[^\u1F79-\u3F79]" "\u2F79")
|
|
(test #f regexp-match #rx"[^\u1F79-\u3F79]" "\u1F79")
|
|
(test '("\u1F39") regexp-match #rx"[^\u1F79-\u3F79]" "\u1F39")
|
|
(test '("\u1F78") regexp-match #rx"[^\u1F79-\u3F79]" "\u1F78")
|
|
|
|
|
|
;; Regexps that shouldn't parse:
|
|
(err/rt-test (regexp "[a--b\u1F78]") exn:fail?)
|
|
(err/rt-test (regexp "[a-b-c\u1F78]") exn:fail?)
|
|
|
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;; Let Matthew perform some basic sanity checks for locale-sensitive
|
|
;; comparisons:
|
|
(define known-locale? (and (regexp-match "mflatt|matthewf" (path->string (find-system-path 'home-dir)))
|
|
(or (regexp-match "linux" (path->string (system-library-subpath)))
|
|
(eq? 'macosx (system-type)))))
|
|
|
|
(printf "Known locale?: ~a~n" known-locale?)
|
|
|
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; String comparison. Relies on the default locale knowing
|
|
;; about upper A with hat (\303\202) and lower a with hat (\303\242),
|
|
;; and also relies on a "C" locale that can't encode those
|
|
;; two characters. It doesn't rely on a relative order of A-hat
|
|
;; and a-hat --- only that they're the same case-insensitively.
|
|
(when known-locale?
|
|
(let ()
|
|
(define (stest r comp? a b)
|
|
(test r comp? a b)
|
|
(test r comp? (format "xx~ayy" a) (format "xx~ayy" b))
|
|
(test r comp? (format "x\000x~ay" a) (format "x\000x~ay" b))
|
|
(test r comp? (format "x\000~ay" a) (format "x\000~ay" b))
|
|
(test r comp? (format "x\000~a\000y" a) (format "x\000~a\000y" b)))
|
|
(define (go c?)
|
|
(stest #f string=? "A" "a")
|
|
(stest #t string-ci=? "A" "a")
|
|
(stest #t string-locale-ci=? "A" "a")
|
|
(stest #f string=? "\uC2" "\uE2")
|
|
(stest #t string-ci=? "\uC2" "\uE2")
|
|
(stest #f string-locale=? "\uC2" "\uE2")
|
|
(stest (if c? #f #t) string-locale-ci=? "\uC2" "\uE2")
|
|
(stest #f string<? "\uE2" "b")
|
|
(stest (if c? #f #t) string-locale<? "\uE2" "b")
|
|
(stest #t string>? "\uE2" "b")
|
|
(stest (if c? #t #f) string-locale>? "\uE2" "b")
|
|
(stest #t string<? "b" "\uE2")
|
|
(stest (if c? #t #f) string-locale<? "b" "\uE2")
|
|
(stest #f string>? "b" "\uE2")
|
|
(stest (if c? #f #t) string-locale>? "b" "\uE2")
|
|
(test "ABC" string-locale-upcase "aBc")
|
|
(test (if c? "\uE2" "\uC2") string-locale-upcase "\uE2")
|
|
(test (if c? "A\uE2\0B" "A\uC2\0B") string-locale-upcase "a\uE2\0b")
|
|
(test (if c? "A\uE2\0\uE2\0B" "A\uC2\0\uC2\0B") string-locale-upcase "a\uE2\0\uE2\0b"))
|
|
(go #f)
|
|
(parameterize ([current-locale "C"])
|
|
(go #t))))
|
|
|
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; locale<->UTF-8 conversions
|
|
|
|
(test #f bytes-open-converter "UTF-8" "nowhere")
|
|
(test #f bytes-open-converter "nowhere" "UTF-8")
|
|
(when known-locale?
|
|
(let ([c (bytes-open-converter "UTF-8" "")])
|
|
(test (void) bytes-close-converter c)
|
|
(test (void) bytes-close-converter c)
|
|
(err/rt-test (bytes-convert c #"hello") exn:application:mismatch?)))
|
|
|
|
(define (bytes-encode-... from to)
|
|
(lambda args
|
|
(let ([c (bytes-open-converter from to)])
|
|
(dynamic-wind
|
|
void
|
|
(lambda ()
|
|
(apply bytes-convert c args))
|
|
(lambda ()
|
|
(bytes-close-converter c))))))
|
|
|
|
(define bytes-encode-locale->utf-8 (bytes-encode-... "" "UTF-8"))
|
|
(define bytes-encode-utf-8->locale (bytes-encode-... "UTF-8" ""))
|
|
|
|
;; Relies on a default locale that extends Latin-1:
|
|
(when known-locale?
|
|
(let-values ([(s l ok?) (bytes-encode-utf-8->locale #"abc")])
|
|
(test 3 'count l)
|
|
(test 'complete 'ok ok?))
|
|
(let-values ([(s l ok?) (bytes-encode-utf-8->locale #"a\300\300")])
|
|
(test 1 'count l)
|
|
(test 'error 'ok ok?))
|
|
(let-values ([(s l ok?) (bytes-encode-utf-8->locale #"a\302\200")])
|
|
(test 3 'count l)
|
|
(test 'complete 'ok ok?))
|
|
(test-values '(#"abcd" 4 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd")))
|
|
(test-values '(#"bcd" 3 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1)))
|
|
(test-values '(#"bc" 2 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3)))
|
|
(test-values '(#"a" 1 aborts) (lambda () (bytes-encode-locale->utf-8 #"a\302\200" 0 2)))
|
|
(test-values '(2 2 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3 (make-bytes 2))))
|
|
(let ([s (make-bytes 10)])
|
|
(test-values '(2 2 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3 s 4 7)))
|
|
(test 98 bytes-ref s 4)
|
|
(test 99 bytes-ref s 5))
|
|
(let ([s (make-bytes 10)])
|
|
(test-values '(1 1 continues) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3 s 4 5)))
|
|
(test 98 bytes-ref s 4))
|
|
;; The rest relies on the "C" locale:
|
|
(parameterize ([current-locale "C"])
|
|
(test-values '(#"abc" 3 complete) (lambda () (bytes-encode-utf-8->locale #"abc")))
|
|
(test-values '(#"a" 1 error) (lambda () (bytes-encode-utf-8->locale #"a\300\300")))
|
|
(test-values '(#"ab" 2 aborts) (lambda () (bytes-encode-utf-8->locale #"ab\303")))
|
|
;; Well-formed, but can't be converted to "C":
|
|
(test-values '(#"a" 1 error) (lambda () (bytes-encode-utf-8->locale #"a\303\342")))))
|
|
|
|
(err/rt-test (bytes-encode-utf-8->locale 'ok))
|
|
(err/rt-test (bytes-encode-utf-8->locale "ok"))
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" -1))
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 3) exn:application:mismatch?)
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 0) exn:application:mismatch?)
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3) exn:application:mismatch?)
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 2 'nope))
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 2 #"nope"))
|
|
(let ([s (make-string 4)])
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s -1) exn:application:mismatch?)
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s 5) exn:application:mismatch?)
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s 2 1) exn:application:mismatch?)
|
|
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s 2 7) exn:application:mismatch?))
|
|
|
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; The following tests are derived from SRFI-14
|
|
|
|
(define (check-all-latin-1 ? l)
|
|
(for-each (lambda (c)
|
|
(test #t ? c))
|
|
l)
|
|
(let loop ([i 0])
|
|
(unless (= i 256)
|
|
(unless (memq (integer->char i) l)
|
|
(test #f ? (integer->char i)))
|
|
(loop (add1 i)))))
|
|
|
|
(define (check-all-unicode ? l)
|
|
(define (unless-in-l ? code c)
|
|
(and (? c)
|
|
(not (member c l))))
|
|
(define (qtest expect f . args)
|
|
(unless (equal? expect (apply f args))
|
|
(apply test expect f args)))
|
|
(for-each (lambda (c)
|
|
(test #t ? c))
|
|
l)
|
|
(for-each (lambda (r)
|
|
(if (caddr r)
|
|
(qtest #f unless-in-l ? (car r) (integer->char (car r)))
|
|
(let loop ([i (car r)])
|
|
(qtest #f unless-in-l ? i (integer->char i))
|
|
(unless (= i (cadr r))
|
|
(loop (add1 i))))))
|
|
(make-known-char-range-list)))
|
|
|
|
(check-all-latin-1
|
|
char-lower-case?
|
|
'(#\a
|
|
#\b
|
|
#\c
|
|
#\d
|
|
#\e
|
|
#\f
|
|
#\g
|
|
#\h
|
|
#\i
|
|
#\j
|
|
#\k
|
|
#\l
|
|
#\m
|
|
#\n
|
|
#\o
|
|
#\p
|
|
#\q
|
|
#\r
|
|
#\s
|
|
#\t
|
|
#\u
|
|
#\v
|
|
#\w
|
|
#\x
|
|
#\y
|
|
#\z
|
|
#\u00B5
|
|
#\u00DF
|
|
#\u00E0
|
|
#\u00E1
|
|
#\u00E2
|
|
#\u00E3
|
|
#\u00E4
|
|
#\u00E5
|
|
#\u00E6
|
|
#\u00E7
|
|
#\u00E8
|
|
#\u00E9
|
|
#\u00EA
|
|
#\u00EB
|
|
#\u00EC
|
|
#\u00ED
|
|
#\u00EE
|
|
#\u00EF
|
|
#\u00F0
|
|
#\u00F1
|
|
#\u00F2
|
|
#\u00F3
|
|
#\u00F4
|
|
#\u00F5
|
|
#\u00F6
|
|
#\u00F8
|
|
#\u00F9
|
|
#\u00FA
|
|
#\u00FB
|
|
#\u00FC
|
|
#\u00FD
|
|
#\u00FE
|
|
#\u00FF
|
|
;; New definition of lower case:
|
|
#\u00AA
|
|
#\u00BA))
|
|
|
|
;; No upper case in latin-1
|
|
(check-all-latin-1
|
|
(lambda (x) (and (char-lower-case? x)
|
|
(or (eq? x (char-upcase x))
|
|
(> (char->integer (char-upcase x)) 255))))
|
|
'(#\u00B5
|
|
#\u00DF
|
|
#\u00FF
|
|
#\u00AA
|
|
#\u00BA))
|
|
|
|
;; Latin-1 uppercase:
|
|
(check-all-latin-1
|
|
char-upper-case?
|
|
'(#\A
|
|
#\B
|
|
#\C
|
|
#\D
|
|
#\E
|
|
#\F
|
|
#\G
|
|
#\H
|
|
#\I
|
|
#\J
|
|
#\K
|
|
#\L
|
|
#\M
|
|
#\N
|
|
#\O
|
|
#\P
|
|
#\Q
|
|
#\R
|
|
#\S
|
|
#\T
|
|
#\U
|
|
#\V
|
|
#\W
|
|
#\X
|
|
#\Y
|
|
#\Z
|
|
#\u00C0
|
|
#\u00C1
|
|
#\u00C2
|
|
#\u00C3
|
|
#\u00C4
|
|
#\u00C5
|
|
#\u00C6
|
|
#\u00C7
|
|
#\u00C8
|
|
#\u00C9
|
|
#\u00CA
|
|
#\u00CB
|
|
#\u00CC
|
|
#\u00CD
|
|
#\u00CE
|
|
#\u00CF
|
|
#\u00D0
|
|
#\u00D1
|
|
#\u00D2
|
|
#\u00D3
|
|
#\u00D4
|
|
#\u00D5
|
|
#\u00D6
|
|
#\u00D8
|
|
#\u00D9
|
|
#\u00DA
|
|
#\u00DB
|
|
#\u00DC
|
|
#\u00DD
|
|
#\u00DE))
|
|
|
|
;; Extra letters (not lower or upper) in Latin-1:
|
|
(check-all-latin-1
|
|
(lambda (c)
|
|
(and (char-alphabetic? c)
|
|
(not (char-upper-case? c))
|
|
(not (char-lower-case? c))))
|
|
'())
|
|
|
|
;; Complete titlecase list:
|
|
(check-all-unicode
|
|
char-title-case?
|
|
'(#\u01C5
|
|
#\u01C8
|
|
#\u01CB
|
|
#\u01F2
|
|
#\u1F88
|
|
#\u1F89
|
|
#\u1F8A
|
|
#\u1F8B
|
|
#\u1F8C
|
|
#\u1F8D
|
|
#\u1F8E
|
|
#\u1F8F
|
|
#\u1F98
|
|
#\u1F99
|
|
#\u1F9A
|
|
#\u1F9B
|
|
#\u1F9C
|
|
#\u1F9D
|
|
#\u1F9E
|
|
#\u1F9F
|
|
#\u1FA8
|
|
#\u1FA9
|
|
#\u1FAA
|
|
#\u1FAB
|
|
#\u1FAC
|
|
#\u1FAD
|
|
#\u1FAE
|
|
#\u1FAF
|
|
#\u1FBC
|
|
#\u1FCC
|
|
#\u1FFC))
|
|
|
|
;; Complete whitspace list:
|
|
(check-all-unicode
|
|
char-whitespace?
|
|
'(#\u0009
|
|
#\u000A
|
|
#\u000B
|
|
#\u000C
|
|
#\u000D
|
|
#\u0020
|
|
#\u0085
|
|
#\u00A0
|
|
#\u1680
|
|
#\u2000
|
|
#\u2001
|
|
#\u2002
|
|
#\u2003
|
|
#\u2004
|
|
#\u2005
|
|
#\u2006
|
|
#\u2007
|
|
#\u2008
|
|
#\u2009
|
|
#\u200A
|
|
;; #\u200B --- in Unicode 4.0, this code point changed from Zs to Cf
|
|
#\u2028
|
|
#\u2029
|
|
#\u202F
|
|
#\u3000
|
|
;; Post SRFI-14?
|
|
#\u205F
|
|
#\u180E))
|
|
|
|
;; Punctuation in Latin-1:
|
|
(check-all-latin-1
|
|
char-punctuation?
|
|
'(#\!
|
|
#\"
|
|
#\#
|
|
#\%
|
|
#\&
|
|
#\'
|
|
#\(
|
|
#\)
|
|
#\*
|
|
#\,
|
|
#\-
|
|
#\.
|
|
#\/
|
|
#\:
|
|
#\;
|
|
#\?
|
|
#\@
|
|
#\[
|
|
#\\
|
|
#\]
|
|
#\_
|
|
#\{
|
|
#\}
|
|
#\u00A1
|
|
#\u00AB
|
|
;; #\u00AD ;; Treated as a control character now?
|
|
#\u00B7
|
|
#\u00BB
|
|
#\u00BF))
|
|
|
|
;; Symbols in Latin-1
|
|
(check-all-latin-1
|
|
char-symbolic?
|
|
'(#\$
|
|
#\+
|
|
#\<
|
|
#\=
|
|
#\>
|
|
#\^
|
|
#\`
|
|
#\|
|
|
#\~
|
|
#\u00A2
|
|
#\u00A3
|
|
#\u00A4
|
|
#\u00A5
|
|
#\u00A6
|
|
#\u00A7
|
|
#\u00A8
|
|
#\u00A9
|
|
#\u00AC
|
|
#\u00AE
|
|
#\u00AF
|
|
#\u00B0
|
|
#\u00B1
|
|
#\u00B4
|
|
#\u00B6
|
|
#\u00B8
|
|
#\u00D7
|
|
#\u00F7))
|
|
|
|
;; Complete blank list:
|
|
(check-all-unicode
|
|
char-blank?
|
|
'(#\u0009
|
|
#\u0020
|
|
#\u00A0
|
|
#\u1680
|
|
#\u2000
|
|
#\u2001
|
|
#\u2002
|
|
#\u2003
|
|
#\u2004
|
|
#\u2005
|
|
#\u2006
|
|
#\u2007
|
|
#\u2008
|
|
#\u2009
|
|
#\u200A
|
|
;; #\u200B --- see note above
|
|
#\u202F
|
|
#\u3000
|
|
;; Post SRFI-14?
|
|
#\u205F
|
|
#\u180E
|
|
))
|
|
|
|
|
|
;; Every letter, digit, punct, and symbol is graphic
|
|
(check-all-unicode
|
|
(lambda (c)
|
|
(and (not (char-graphic? c))
|
|
(or (char-alphabetic? c)
|
|
(char-numeric? c)
|
|
(char-punctuation? c)
|
|
(char-symbolic? c))))
|
|
null)
|
|
|
|
;; Letter, digit, punct, and symbol are distinct
|
|
(check-all-unicode
|
|
(lambda (c)
|
|
(> (+ (if (char-alphabetic? c) 1 0)
|
|
(if (char-numeric? c) 1 0)
|
|
(if (char-punctuation? c) 1 0)
|
|
(if (char-symbolic? c)
|
|
(if (char<=? #\u24B6 c #\u24E9)
|
|
0 ;; Those are both alphabetic and symbolic
|
|
1)
|
|
0))
|
|
1))
|
|
null)
|
|
|
|
(check-all-unicode
|
|
char-iso-control?
|
|
(append
|
|
(let loop ([i 0])
|
|
(if (= i #x20)
|
|
null
|
|
(cons (integer->char i) (loop (add1 i)))))
|
|
(let loop ([i #x7F])
|
|
(if (= i #xA0)
|
|
null
|
|
(cons (integer->char i) (loop (add1 i)))))))
|
|
|
|
|
|
;; ----------------------------------------
|
|
;; Whatever bytes->string/locale does with a bad locale, it shouldn't crash
|
|
|
|
(parameterize ([current-locale "no such locale"])
|
|
(with-handlers ([exn:fail:contract? void]) (bytes->string/locale #"xxx"))
|
|
(with-handlers ([exn:fail:contract? void]) (string->bytes/locale "xxx")))
|
|
|
|
|
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; String-upcase, etc.:
|
|
|
|
(test "ABC!" string-upcase "abc!")
|
|
(test "Z\u7238Z" string-upcase "z\u7238z")
|
|
(test "STRASSE" string-upcase "Stra\xDFe")
|
|
(test "\u039A\u0391\u039F\u03A3" string-upcase "\u039A\u03b1\u03BF\u03C2")
|
|
(test "\u039A\u0391\u039F\u03A3" string-upcase "\u039A\u03b1\u03BF\u03C3")
|
|
|
|
(test "abc!" string-downcase "aBC!")
|
|
(test "z\u7238z" string-downcase "z\u7238Z")
|
|
(test "stra\xDFe" string-downcase "Stra\xDFe")
|
|
(test "\u03BA\u03b1\u03BF\u03C2" string-downcase "\u039A\u0391\u039F\u03A3")
|
|
(test "\u03C3" string-downcase "\u03A3")
|
|
(test "x\u03C2" string-downcase "X\u03A3")
|
|
(test "\u03BA\u03b1\u03BF\u03C3\u03C2" string-downcase "\u039A\u0391\u039F\u03A3\u03A3")
|
|
(test "\u03BA\u03b1\u03BF\u03C2 x" string-downcase "\u039A\u0391\u039F\u03A3 x")
|
|
|
|
(test "abc!" string-foldcase "aBC!")
|
|
(test "z\u7238z" string-foldcase "z\u7238Z")
|
|
(test "strasse" string-foldcase "Stra\xDFe")
|
|
(test "\u03BA\u03b1\u03BF\u03C3" string-foldcase "\u039A\u0391\u039F\u03A3")
|
|
(test "\u03C3" string-foldcase "\u03A3")
|
|
(test "x\u03C3" string-foldcase "X\u03A3")
|
|
(test "\u03BA\u03b1\u03BF\u03C3\u03C3" string-foldcase "\u039A\u0391\u039F\u03A3\u03A3")
|
|
(test "\u03BA\u03b1\u03BF\u03C3 x" string-foldcase "\u039A\u0391\u039F\u03A3 x")
|
|
|
|
(test "Abc!" string-titlecase "aBC!")
|
|
(test "Abc Two" string-titlecase "aBC twO")
|
|
(test "Abc!Two" string-titlecase "aBC!twO")
|
|
(test "Z\u7238Z" string-titlecase "z\u7238Z")
|
|
(test "Stra\xDFe" string-titlecase "stra\xDFe")
|
|
(test "Stra Sse" string-titlecase "stra \xDFe")
|
|
(test "\u039A\u03b1\u03BF\u03C2" string-titlecase "\u039A\u0391\u039F\u03A3")
|
|
(test "\u039A\u03b1\u03BF \u03A3x" string-titlecase "\u039A\u0391\u039F \u03A3x")
|
|
(test "\u03A3" string-titlecase "\u03A3")
|
|
(test "X\u03C2" string-titlecase "x\u03A3")
|
|
(test "\u039A\u03b1\u03BF\u03C3\u03C2" string-titlecase "\u039A\u0391\u039F\u03A3\u03A3")
|
|
(test "\u039A\u03b1\u03BF\u03C2 X" string-titlecase "\u039A\u0391\u039F\u03A3 x")
|
|
|
|
|
|
(report-errs)
|