racket/collects/tests/mzscheme/unicode.ss


(load-relative "loadtest.ss")

(Section 'unicode)

;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;  UTF-8 boundary tests based on Markus Kuhn's test suite

(define basic-utf-8-tests
  '((#(#x3ba #x1f79 #x3c3 #x3bc #x3b5) complete
     (#o316 #o272 #o341 #o275 #o271 #o317 #o203 #o316 #o274 #o316 #o265))
    (#(0) complete
     (#o0))
    (#(#x00000080) complete
     (#o302 #o200))
    (#(#x00000800) complete
     (#o340 #o240 #o200))
    (#(#x00010000) complete
     (#o360 #o220 #o200 #o200))
    (#(#x0010FFFF) complete
     (#o364 #o217 #o277 #o277))
    (#(#f #f #f #f) error ; would be #x00110000 without UTF-8 limit
     (#o364 #o220 #o200 #o200))
    (#(#f #f  #f #f #f) error
     (#o370 #o210 #o200 #o200 #o200))
    (#(#f #f #f #f #f #f) error
     (#o374 #o204 #o200 #o200 #o200 #o200))
    (#(#x0000007F) complete
     (127))
    (#(#x000007FF) complete
     (#o337 #o277))
    (#(#f #f #f #f) error ; #x001FFFFF
     (#o367 #o277 #o277 #o277))
    (#(#f #f #f #f #f) error ; #x03FFFFFF
     (#o373 #o277 #o277 #o277 #o277))
    (#(#f #f #f #f #f #f) error ; #x7FFFFFFF
     (#o375 #o277 #o277 #o277 #o277 #o277))
    (#(#x0000D7FF) complete
     (#o355 #o237 #o277))
    (#(#x0000E000) complete
     (#o356 #o200 #o200))
    (#(#x0000FFFD) complete
     (#o357 #o277 #o275))
    (#(#x0010FFFF) complete
     (#o364 #o217 #o277 #o277))
    ;; Missing start byte
    (#(#f) error
     (#o200))
    (#(#f) error
     (#o277))
    (#(#f #f) error
     (#o200 #o277))
    (#(#f #f #f) error
     (#o200 #o277 #o200))
    (#(#f #f #f #f) error
     (#o200 #o277 #o200 #o277))
    (#(#f #f #f #f #f) error
     (#o200 #o277 #o200 #o277 #o200))
    (#(#f #f #f #f #f #f) error
     (#o200 #o277 #o200 #o277 #o200 #o277))
    (#(#f #f #f #f #f #f #f) error
     (#o200 #o277 #o200 #o277 #o200 #o277 #o200))
    (#(#f) error
     (#o200))
    (#(#f) error
     (#o201))
    (#(#f) error
     (#o202))
    (#(#f) error
     (#o203))
    (#(#f) error
     (#o204))
    (#(#f) error
     (#o205))
    (#(#f) error
     (#o206))
    (#(#f) error
     (#o207))
    (#(#f) error
     (#o210))
    (#(#f) error
     (#o211))
    (#(#f) error
     (#o212))
    (#(#f) error
     (#o213))
    (#(#f) error
     (#o214))
    (#(#f) error
     (#o215))
    (#(#f) error
     (#o216))
    (#(#f) error
     (#o217))
    (#(#f) error
     (#o220))
    (#(#f) error
     (#o221))
    (#(#f) error
     (#o222))
    (#(#f) error
     (#o223))
    (#(#f) error
     (#o224))
    (#(#f) error
     (#o225))
    (#(#f) error
     (#o226))
    (#(#f) error
     (#o227))
    (#(#f) error
     (#o230))
    (#(#f) error
     (#o231))
    (#(#f) error
     (#o232))
    (#(#f) error
     (#o233))
    (#(#f) error
     (#o234))
    (#(#f) error
     (#o235))
    (#(#f) error
     (#o236))
    (#(#f) error
     (#o237))
    (#(#f) error
     (#o240))
    (#(#f) error
     (#o241))
    (#(#f) error
     (#o242))
    (#(#f) error
     (#o243))
    (#(#f) error
     (#o244))
    (#(#f) error
     (#o245))
    (#(#f) error
     (#o246))
    (#(#f) error
     (#o247))
    (#(#f) error
     (#o250))
    (#(#f) error
     (#o251))
    (#(#f) error
     (#o252))
    (#(#f) error
     (#o253))
    (#(#f) error
     (#o254))
    (#(#f) error
     (#o255))
    (#(#f) error
     (#o256))
    (#(#f) error
     (#o257))
    (#(#f) error
     (#o260))
    (#(#f) error
     (#o261))
    (#(#f) error
     (#o262))
    (#(#f) error
     (#o263))
    (#(#f) error
     (#o264))
    (#(#f) error
     (#o265))
    (#(#f) error
     (#o266))
    (#(#f) error
     (#o267))
    (#(#f) error
     (#o270))
    (#(#f) error
     (#o271))
    (#(#f) error
     (#o272))
    (#(#f) error
     (#o273))
    (#(#f) error
     (#o274))
    (#(#f) error
     (#o275))
    (#(#f) error
     (#o276))
    (#(#f) error
     (#o277))
    ;; 2-byte seqs with no continuation
    (#(#f 32) error
     (#o300 #o40))
    (#(#f 32) error
     (#o301 #o40))
    (#(#f 32) error
     (#o302 #o40))
    (#(#f 32) error
     (#o303 #o40))
    (#(#f 32) error
     (#o304 #o40))
    (#(#f 32) error
     (#o305 #o40))
    (#(#f 32) error
     (#o306 #o40))
    (#(#f 32) error
     (#o307 #o40))
    (#(#f 32) error
     (#o310 #o40))
    (#(#f 32) error
     (#o311 #o40))
    (#(#f 32) error
     (#o312 #o40))
    (#(#f 32) error
     (#o313 #o40))
    (#(#f 32) error
     (#o314 #o40))
    (#(#f 32) error
     (#o315 #o40))
    (#(#f 32) error
     (#o316 #o40))
    (#(#f 32) error
     (#o317 #o40))
    (#(#f 32) error
     (#o320 #o40))
    (#(#f 32) error
     (#o321 #o40))
    (#(#f 32) error
     (#o322 #o40))
    (#(#f 32) error
     (#o323 #o40))
    (#(#f 32) error
     (#o324 #o40))
    (#(#f 32) error
     (#o325 #o40))
    (#(#f 32) error
     (#o326 #o40))
    (#(#f 32) error
     (#o327 #o40))
    (#(#f 32) error
     (#o330 #o40))
    (#(#f 32) error
     (#o331 #o40))
    (#(#f 32) error
     (#o332 #o40))
    (#(#f 32) error
     (#o333 #o40))
    (#(#f 32) error
     (#o334 #o40))
    (#(#f 32) error
     (#o335 #o40))
    (#(#f 32) error
     (#o336 #o40))
    (#(#f 32) error
     (#o337 #o40))
    ;; 3-byte seqs with no continuation
    (#(#f 32) error
     (#o340 #o40))
    (#(#f 32) error
     (#o341 #o40))
    (#(#f 32) error
     (#o342 #o40))
    (#(#f 32) error
     (#o343 #o40))
    (#(#f 32) error
     (#o344 #o40))
    (#(#f 32) error
     (#o345 #o40))
    (#(#f 32) error
     (#o346 #o40))
    (#(#f 32) error
     (#o347 #o40))
    (#(#f 32) error
     (#o350 #o40))
    (#(#f 32) error
     (#o351 #o40))
    (#(#f 32) error
     (#o352 #o40))
    (#(#f 32) error
     (#o353 #o40))
    (#(#f 32) error
     (#o354 #o40))
    (#(#f 32) error
     (#o355 #o40))
    (#(#f 32) error
     (#o356 #o40))
    (#(#f 32) error
     (#o357 #o40))
    ;; 3-byte seqs with partial continuation
    (#(#f #f 32) error
     (#o340 #o203 #o40))
    (#(#f #f 32) error
     (#o341 #o203 #o40))
    (#(#f #f 32) error
     (#o342 #o203 #o40))
    (#(#f #f 32) error
     (#o343 #o203 #o40))
    (#(#f #f 32) error
     (#o344 #o203 #o40))
    (#(#f #f 32) error
     (#o345 #o203 #o40))
    (#(#f #f 32) error
     (#o346 #o203 #o40))
    (#(#f #f 32) error
     (#o347 #o203 #o40))
    (#(#f #f 32) error
     (#o350 #o203 #o40))
    (#(#f #f 32) error
     (#o351 #o203 #o40))
    (#(#f #f 32) error
     (#o352 #o203 #o40))
    (#(#f #f 32) error
     (#o353 #o203 #o40))
    (#(#f #f 32) error
     (#o354 #o203 #o40))
    (#(#f #f 32) error
     (#o355 #o203 #o40))
    (#(#f #f 32) error
     (#o356 #o203 #o40))
    (#(#f #f 32) error
     (#o357 #o203 #o40))
    ;; 4-byte seq with no continuations
    (#(#f 32) error
     (#o360 #o40))
    (#(#f 32) error
     (#o361 #o40))
    (#(#f 32) error
     (#o362 #o40))
    (#(#f 32) error
     (#o363 #o40))
    (#(#f 32) error
     (#o364 #o40))
    (#(#f 32) error
     (#o365 #o40))
    (#(#f 32) error
     (#o366 #o40))
    (#(#f 32) error
     (#o367 #o40))
    ;; 4-byte seq with only 1 continuation
    (#(#f #f 32) error
     (#o360 #o203 #o40))
    (#(#f #f 32) error
     (#o361 #o203 #o40))
    (#(#f #f 32) error
     (#o362 #o203 #o40))
    (#(#f #f 32) error
     (#o363 #o203 #o40))
    (#(#f #f 32) error
     (#o364 #o203 #o40))
    (#(#f #f 32) error
     (#o365 #o203 #o40))
    (#(#f #f 32) error
     (#o366 #o203 #o40))
    (#(#f #f 32) error
     (#o367 #o203 #o40))
    ;; 4-byte seq with only 2 continuation
    (#(#f #f #f 32) error
     (#o360 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o361 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o362 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o363 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o364 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o365 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o366 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o367 #o203 #o203 #o40))
    ;; 5-byte seqs with no continuation
    (#(#f 32) error
     (#o370 #o40))
    (#(#f 32) error
     (#o371 #o40))
    (#(#f 32) error
     (#o372 #o40))
    (#(#f 32) error
     (#o373 #o40))
    ;; 5-byte seqs with only 1 continuation
    (#(#f #f 32) error
     (#o370 #o203 #o40))
    (#(#f #f 32) error
     (#o371 #o203 #o40))
    (#(#f #f 32) error
     (#o372 #o203 #o40))
    (#(#f #f 32) error
     (#o373 #o203 #o40))
    ;; 5-byte seqs with only 2 continuations
    (#(#f #f #f 32) error
     (#o370 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o371 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o372 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o373 #o203 #o203 #o40))
    ;; 5-byte seqs with only 3 continuations
    (#(#f #f #f #f 32) error
     (#o370 #o203 #o203 #o203 #o40))
    (#(#f #f #f #f 32) error
     (#o371 #o203 #o203 #o203 #o40))
    (#(#f #f #f #f 32) error
     (#o372 #o203 #o203 #o203 #o40))
    (#(#f #f #f #f 32) error
     (#o373 #o203 #o203 #o203 #o40))
    ;; 6-byte seqs with no continuation
    (#(#f 32) error
     (#o374 #o40))
    (#(#f 32) error
     (#o375 #o40))
    ;; 6-byte seqs with only 1 continuation
    (#(#f #f 32) error
     (#o374 #o203 #o40))
    (#(#f #f 32) error
     (#o375 #o203 #o40))
    ;; 6-byte seqs with only 2 continuation
    (#(#f #f #f 32) error
     (#o374 #o203 #o203 #o40))
    (#(#f #f #f 32) error
     (#o375 #o203 #o203 #o40))
    ;; 6-byte seqs with only 3 continuation
    (#(#f #f #f #f 32) error
     (#o374 #o203 #o203 #o203 #o40))
    (#(#f #f #f #f 32) error
     (#o375 #o203 #o203 #o203 #o40))
    ;; 6-byte seqs with only 4 continuation
    (#(#f #f #f #f #f 32) error
     (#o374 #o203 #o203 #o203 #o203 #o40))
    (#(#f #f #f #f #f 32) error
     (#o375 #o203 #o203 #o203 #o203 #o40))
    ;; Sequences with last continuation byte missing, eol instead of space
    (#(#f) error ; aborts
     (#o300))
    (#(#f) aborts
     (#o310))
    (#(#f #f) error ; aborts
     (#o340 #o200))
    (#(#f #f) aborts
     (#o350 #o200))
    (#(#f #f) aborts
     (#o340 #o240))
    (#(#f) aborts
     (#o340))
    (#(#f #f #f) error ; aborts
     (#o360 #o200 #o200))
    (#(#f #f #f) aborts
     (#o361 #o200 #o200))
    (#(#f #f) error ; aborts
     (#o360 #o200))
    (#(#f #f) aborts
     (#o361 #o200))
    (#(#f) aborts
     (#o360))
    (#(#f) aborts
     (#o361))
    (#(#f #f #f #f) error ; aborts
     (#o370 #o200 #o200 #o200))
    (#(#f #f #f #f) error ;; aborts
     (#o371 #o200 #o200 #o200))
    (#(#f) error ;; aborts
     (#o370))
    (#(#f) error ;; aborts
     (#o371))
    (#(#f #f) error ; aborts
     (#o370 #o200))
    (#(#f #f) error ;; aborts
     (#o371 #o200))
    (#(#f #f #f) error ; aborts
     (#o370 #o200 #o200))
    (#(#f #f #f) error ;; aborts
     (#o371 #o200 #o200))
    (#(#f #f #f #f #f) error ; aborts
     (#o374 #o200 #o200 #o200 #o200))
    (#(#f #f #f #f #f) error ;; aborts
     (#o375 #o200 #o200 #o200 #o200))
    (#(#f) error ;; aborts
     (#o374))
    (#(#f) error ;; aborts
     (#o375))
    (#(#f #f) error ; aborts
     (#o374 #o200))
    (#(#f #f) error ;; aborts
     (#o375 #o200))
    (#(#f #f #f) error ; aborts
     (#o374 #o200 #o200))
    (#(#f #f #f) error ;; aborts
     (#o375 #o200 #o200))
    (#(#f #f #f #f) error ; aborts
     (#o374 #o200 #o200 #o200))
    (#(#f #f #f #f) error ; aborts
     (#o375 #o200 #o200 #o200))
    (#(#f) aborts
     (#o337))
    (#(#f #f) aborts
     (#o357 #o277))
    (#(#f #f #f) aborts
     (#o364 #o277 #o277))
    (#(#f #f #f) error ;; aborts
     (#o365 #o277 #o277))
    (#(#f #f #f) error ;; aborts
     (#o366 #o277 #o277))
    (#(#f #f #f) error ;; aborts
     (#o367 #o277 #o277))
    (#(#f #f #f #f) error ;; aborts
     (#o373 #o277 #o277 #o277))
    (#(#f #f #f #f #f) error ; aborts
     (#o375 #o277 #o277 #o277 #o277))
    ;; Concatenation of incomplete sequences
    (#(#f #f #f #f #f #f #f #f #f) error/aborts
     (#o300 #o340 #o200 #o360 #o200 #o200 #o361 #o200 #o200)
     3)
    ;; Impossible bytes
    (#(#f) error
     (#o376))
    (#(#f) error
     (#o377))
    (#(#f #f #f #f) error
     (#o376 #o376 #o377 #o377))
    ;; Overlong
    (#(#f #f) error
     (#o300 #o257))
    (#(#f #f #f) error
     (#o340 #o200 #o257))
    (#(#f #f #f #f) error
     (#o360 #o200 #o200 #o257))
    (#(#f #f #f #f #f) error
     (#o370 #o200 #o200 #o200 #o257))
    (#(#f #f #f #f #f #f) error
     (#o374 #o200 #o200 #o200 #o200 #o257))
    (#(#f #f) error
     (#o301 #o277))
    (#(#f #f #f) error
     (#o340 #o237 #o277))
    (#(#f #f #f #f) error
     (#o360 #o217 #o277 #o277))
    (#(#f #f #f #f #f) error
     (#o370 #o207 #o277 #o277 #o277))
    (#(#f #f #f #f #f #f) error
     (#o374 #o203 #o277 #o277 #o277 #o277))
    (#(#f #f) error
     (#o300 #o200))
    (#(#f #f #f) error
     (#o340 #o200 #o200))
    (#(#f #f #f #f) error
     (#o360 #o200 #o200 #o200))
    (#(#f #f #f #f #f) error
     (#o370 #o200 #o200 #o200 #o200))
    (#(#f #f #f #f #f #f) error
     (#o374 #o200 #o200 #o200 #o200 #o200))
    ;; illedgal surrogates
    (#(#f #f #f) surrogate1
     (#o355 #o240 #o200))
    (#(#f #f #f) surrogate1
     (#o355 #o255 #o277))
    (#(#f #f #f) surrogate1
     (#o355 #o256 #o200))
    (#(#f #f #f) surrogate1
     (#o355 #o257 #o277))
    (#(#f #f #f) surrogate2
     (#o355 #o260 #o200))
    (#(#f #f #f) surrogate2
     (#o355 #o276 #o200))
    (#(#f #f #f) surrogate2
     (#o355 #o277 #o277))
    (#(#f #f #f #f #f #f) error
     (#o355 #o240 #o200 #o355 #o260 #o200))
    (#(#f #f #f #f #f #f) error
     (#o355 #o240 #o200 #o355 #o277 #o277))
    (#(#f #f #f #f #f #f) error
     (#o355 #o255 #o277 #o355 #o260 #o200))
    (#(#f #f #f #f #f #f) error
     (#o355 #o255 #o277 #o355 #o277 #o277))
    (#(#f #f #f #f #f #f) error
     (#o355 #o256 #o200 #o355 #o260 #o200))
    (#(#f #f #f #f #f #f) error
     (#o355 #o256 #o200 #o355 #o277 #o277))
    (#(#f #f #f #f #f #f) error
     (#o355 #o257 #o277 #o355 #o260 #o200))
    (#(#f #f #f #f #f #f) error
     (#o355 #o257 #o277 #o355 #o277 #o277))
    ;; 0xFFFE and 0xFFFF
    (#(#xFFFE) complete
     (#o357 #o277 #o276))
    (#(#xFFFF) complete
     (#o357 #o277 #o277))))

(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF

(define (string->print s) (map char->integer (string->list s)))

(define bytes->unicode-vector
  (case-lambda
   [(s)
    (list->vector (map char->integer (string->list (bytes->string/utf-8 s))))]
   [(s start)
    (bytes->unicode-vector (subbytes s start))]
   [(s start end)
    (bytes->unicode-vector (subbytes s start end))]))
(define (bytes-any->unicode-vector s non-v)
  (list->vector
   (map (lambda (x) (if (= x (char->integer #\?)) non-v x))
	(map char->integer (string->list (bytes->string/utf-8 s #\?))))))
(define (unicode-vector->bytes v)
  (string->bytes/utf-8 (list->string (map integer->char (vector->list v)))))

(define (list->string/eof l)
  (if (and (pair? l) (eof-object? (car l)) (null? (cdr l)))
      eof
      (list->string l)))

(let ([utf-8-iconv (bytes-open-converter "UTF-8" "UTF-8")]
      [utf-8-iconv-p (bytes-open-converter "UTF-8-permissive" "UTF-8")])
  ;; First, check some simple conversions
  (let-values ([(z1 z2 more) (bytes-convert utf-8-iconv #"xyz" 0 3 (bytes))])
    (test '(0 0 continues) list z1 z2 more))
  (let-values ([(z1 z2 more) (bytes-convert utf-8-iconv #"xyz" 0 3 (bytes 0))])
    (test '(1 1 continues) list z1 z2 more))
  (let-values ([(z1 z2 more) (bytes-convert utf-8-iconv #"xyz" 0 3 (bytes 0 0 0))])
    (test '(3 3 complete) list z1 z2 more))

  ;; The real tests:
  (for-each (lambda (p)
	      (let ([code-points (car p)]
		    [parse-status (cadr p)]
		    [s (apply bytes (caddr p))])
		(if (and (positive? (vector-length code-points))
			 (vector-ref code-points 0))
		    (begin
		      (test (vector-length code-points) bytes-utf-8-length s)
		      (test code-points bytes->unicode-vector s)
		      (test code-points bytes-any->unicode-vector s #f)
		      (test s unicode-vector->bytes code-points)
		      (test 0 bytes-utf-8-index s 0)
		      (test (bytes-length s) bytes-utf-8-index
			    (bytes-append s #"x")
			    (vector-length code-points))
		      (if ((vector-length code-points) . > . 1)
			  (begin
			    (test (integer->char (vector-ref code-points 0))
				  bytes-utf-8-ref s 0)
			    (test (integer->char (vector-ref code-points
							     (sub1 (vector-length code-points))))
				  bytes-utf-8-ref s (sub1 (vector-length code-points)))
			    (let ([post-1 (bytes-utf-8-index s 1)])
			      (test #t positive? post-1)
			      (test (list->vector (cdr (vector->list code-points)))
				    bytes->unicode-vector s post-1))
			    (let ([last-1 (bytes-utf-8-index s (sub1 (vector-length code-points)))])
			      (test #t positive? last-1)
			      (test code-points
				    list->vector (append
						  (vector->list (bytes->unicode-vector s 0 last-1))
						  (vector->list (bytes->unicode-vector s last-1))))))
			  (test #f bytes-utf-8-index s 1))
		      (test-values (list s (bytes-length s) 'complete)
				   (lambda () (bytes-convert utf-8-iconv s)))
		      (test (list s) regexp-match #rx"^.*$" s))
		    (begin
		      (test code-points bytes-any->unicode-vector s #f)
		      (test (list->vector (append '(97) (vector->list code-points) '(98)))
			    bytes-any->unicode-vector (bytes-append #"a" s #"b") #f)
		      (test (list->vector (append (vector->list code-points) (vector->list code-points)))
			    bytes-any->unicode-vector (bytes-append s s) #f)
		      (test #f bytes-utf-8-length s)
		      (test 0 bytes-utf-8-index s 0)
		      (let-values ([(s2 n status) (bytes-convert utf-8-iconv s)])
			(test (case parse-status
				[(error/aborts surrogate1 surrogate2) 'error]
				[else parse-status ])
			      'status status))
		      (let ([convert
			     (lambda (prefix)
			       (test (+ (vector-length code-points) (bytes-length prefix))
				     bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
			       (test (vector-length code-points)
				     bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
			       (test (if (equal? prefix #"")
					 #f
					 (integer->char (bytes-ref prefix 0)))
				     bytes-utf-8-ref (bytes-append prefix s) 0)
			       (test (if (equal? prefix #"")
					 (if (equal? #() code-points)
					     #f
					     #\uFFFD)
					 (integer->char (bytes-ref prefix 0)))
				     bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
			       (test (if (equal? #() code-points)
					 (if (equal? #"" prefix)
					     #f
					     (integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
					 (integer->char
					  (or (vector-ref code-points (sub1 (vector-length code-points)))
					      (char->integer #\uFFFD))))
				     bytes-utf-8-ref (bytes-append prefix s)
				     (max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
				     #\uFFFD)

			       (let-values ([(s2 n status)
					     (bytes-convert utf-8-iconv-p (bytes-append prefix s))]
					    [(pl) (bytes-length prefix)])
				 (case parse-status
				   [(error surrogate1 surrogate2)
				    (test 'complete 'status status)
				    (test (+ (bytes-length s) pl) 'n n)
				    (test (+ (* replace-size (vector-length code-points))
                                             (if (and (positive? (vector-length code-points))
                                                      (eq? (vector-ref code-points (sub1 (vector-length code-points)))
                                                           #o40))
                                                 ;; space at end is converted, not replaced by #\xFFFD
                                                 (- 1 replace-size)
                                                 0)
                                             pl)
                                          bytes-length s2)
				    (test (append (bytes->list prefix)
						  (map
						   (lambda (i) (or i (char->integer #\uFFFD)))
						   (vector->list code-points)))
					  vector->list (bytes->unicode-vector s2))]
				   [(error/aborts)
				    (test 'aborts 'status status)
				    (let ([code-points (list->vector
							(reverse
							 (list-tail
							  (reverse
							   (vector->list code-points))
							  ;; indicates how many to be unused due to abort:
							  (cadddr p))))])
				      (test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
				      (test (append (bytes->list prefix)
						    (map
						     (lambda (i) (or i (char->integer #\uFFFD)))
						     (vector->list code-points)))
					    vector->list (bytes->unicode-vector s2)))]
				   [else
				    (test parse-status 'status status)])))])
			(convert #"")
			(convert #"so"))
		      (let-values ([(s2 n status) (bytes-convert utf-8-iconv-p (bytes-append s #"xxxxx"))])
			(test 'complete 'status status)
			(test '(#"xxxxx") regexp-match #rx#"xxxxx$" s2)
			(test (+ 5 (bytes-length s)) 'n n))
		      (test #f regexp-match #rx"^.*$" s)))
		;; Test byte reading and port positions
		(let ([v (bytes-any->unicode-vector s #f)])
		  (define (check-full-read read-all-bytes)
		    (let ([p (open-input-bytes s)])
		      (port-count-lines! p)
		      (read-all-bytes p)
		      (let-values ([(l c p) (port-next-location p)])
			(test (vector-length v) 'c c)
			(test (add1 (vector-length v)) 'p p)))
		    (let ([p (open-input-string (format "\t~a\t" s))])
		      (port-count-lines! p)
		      (read-all-bytes p)
		      (let-values ([(l c p) (port-next-location p)])
			(test p 'p (add1 (+ 2 (vector-length v))))
			(test c 'tab (+ 16
					(- (vector-length v)
					   (bitwise-and (vector-length v) 7))))))
		    (let ([p (open-input-string (format "~a\t~a" s s))])
		      (port-count-lines! p)
		      (read-all-bytes p)
		      (let-values ([(l c p) (port-next-location p)])
			(test p 'p (add1 (+ 1 (* 2 (vector-length v)))))
			(test c 'tab (+ 8
					(- (vector-length v)
					   (bitwise-and (vector-length v) 7))
					(vector-length v)))))
		    (let ([p (open-input-string (format "~a\n~a" s s))])
		      (port-count-lines! p)
		      (read-all-bytes p)
		      (let-values ([(l c p) (port-next-location p)])
			(test p 'p (+ 2 (* 2 (vector-length v))))
			(test c 'cr (vector-length v)))))
		  (check-full-read (lambda (p) (read-bytes 500 p)))
		  (check-full-read (lambda (p)
				     (let loop ()
				       (unless (eof-object? (read-byte p))
					 (loop)))))
		  ;; Check result char-by-char:
		  (let ([p (open-input-bytes s)])
		    (port-count-lines! p)
		    (let loop ([old-c 0][old-p 1])
		      (let ([i (read-char p)])
			(let-values ([(l c p) (port-next-location p)])
			  (if (eof-object? i)
			      (begin
				(test (cons s old-c) values (cons s c))
				(test (cons s old-p) values (cons s p)))
			      (begin
				(test (cons s (add1 old-c)) values (cons s c))
				(test (cons s (add1 old-p)) values (cons s p))
				(loop c p))))))))
		;; Test read-string decoding
		(let ([us (apply string (map (lambda (i)
					       (if i (integer->char i) #\uFFFD))
					     (vector->list code-points)))])
		  (test us bytes->string/utf-8 s #\uFFFD)
		  (test us read-string (vector-length code-points) (open-input-bytes s))
		  (test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
		  (unless (string=? "" us)
		    (test (substring us 0 1) read-string 1 (open-input-bytes s)))
		  (let ([go (lambda (read-string)
			      (let-values ([(in out) (make-pipe)])
				(let loop ([i 0])
				  (unless (= i 10)
				    (write-bytes s out)
				    (loop (add1 i))))
				(close-output-port out)
				(test #t 'read-10
				      (let loop ([i 0])
					(if (= i 10)
					    (let ([v (read-string 1 in)])
					      (or (eof-object? v)
						  (and (test eof `(eof for ,(caddr p) = ,us with ,read-string) v)
						       #t)))
					    (let ([rs (read-string (string-length us) in)])
					      (and (or (equal? us rs)
						       (test (string->print us) `(,i of 10 for ,(caddr p) with ,read-string)
							     (string->print rs)))
						   (loop (add1 i)))))))))])
		    (go read-string)
		    ;; Peek-string at front, consistent with read-string:
		    (go (lambda (n p)
			  (let ([a (peek-string n 0 p)]
				[b (read-string n p)])
			    (if (equal? a b)
				a
				(list a b)))))
		    ;; Peek-string, using offset calculated from s
		    (go (let ([pos 0])
			  (lambda (n p)
			    (begin0
			     (peek-string n pos p)
			     (set! pos (+ (bytes-length s) pos))))))
		    ;; Read-char
		    (go (lambda (n p)
			  (list->string/eof
			   (let loop ([i 0])
			     (if (= i n)
				 null
				 (cons (read-char p) (loop (add1 i))))))))
		    ;; Peek-char, consistent with read-char
		    (go (lambda (n p)
			    (list->string/eof
			     (let loop ([i 0])
			       (if (= i n)
				   null
				   (let ([a (peek-char p 0)]
					 [b (read-char p)])
				     (if (equal? a b)
					 (cons a (loop (add1 i)))
					 (list* #\* a b #\* (loop (add1 i))))))))))
		    ;; Read-string, one or two at a time
		    (let ([read-n
			   (lambda (n p d)
			     (let loop ([n n][a null])
			       (if (zero? n)
				   (apply string-append (reverse a))
				   (let ([s (read-string (min d n) p)])
				     (if (eof-object? s)
					 (if (null? a)
					     s
					     (loop 0 a))
					 (loop (- n (min d n)) (cons s a)))))))])
		      (go (lambda (n p) (read-n n p 1)))
		      (go (lambda (n p) (read-n n p 2))))))
		;; Test UTF-16
		(let ([c (bytes-open-converter "platform-UTF-8" "platform-UTF-16")])
		  (let-values ([(s2 n status) (bytes-convert c s)])
		    (case parse-status
		      [(surrogate1 surrogate2)
		       (if (eq? (system-type) 'windows)
			   (begin
			     (if (eq? parse-status 'surrogate1)
				 (test 'aborts 'status status)
				 (test 'complete 'status status))
			     ;; Double the surrogate, and then the "unpaired"
			     ;; surrogates are still allowed, and they're doubled
			     (let-values ([(s2 n status) (bytes-convert c (bytes-append s s))])
			       (if (eq? parse-status 'surrogate1)
				   (begin
				     (test 3 'ds-count n)
				     (test 'aborts `(status ,s) status))
				   (test 'complete `(status ,s) status)))
			     (let ([try-xtra
				    (lambda (xtra all? d-size)
				      (when all?
					;; Add non-surrogate to the end, and it should always work:
				        (let-values ([(s2 n status) (bytes-convert c (bytes-append s xtra))])
					  (test 'complete 'status status)
					  (test (+ 3 (bytes-length xtra)) 'cnt n)
					  (test (* 2 (+ 1 d-size)) bytes-length s2)))
				      ;; Same, even if we only accept 2 output bytes:
				      (let-values ([(n0 n status) (bytes-convert c (bytes-append s xtra)
										 0 (+ 3 (bytes-length xtra))
										 (make-bytes 2))])
					 (test 'continues 'status status)
					 (test 3 'cnt n)
					 (test 2 'got n0)))])
			       (try-xtra #"a" #t 1)
			       (try-xtra #"\360\220\200\200" #t 2)
			       (try-xtra #"\355\240\200" #f 1)))
			   (test 'error 'status status))]
		      [(error error/aborts)
		       (test 'error 'status status)]
		      [(aborts)
		       (test 'aborts 'status status)]
		      [else
		       (test parse-status 'status status)])
		    (when (eq? status 'complete)
		      ;; complete => used all bytes
		      (test n 'count (bytes-length s))
		      ;; complete => unconverting to return to original
		      (unless (memq parse-status '(surrogate1 surrogate2))
		        (let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
			  (let-values ([(s3 n2 status2) (bytes-convert c2 s2)])
			    (test s `(UTF-16-round-trip ,s2) s3)
			    (test n2 'count (bytes-length s2)))
			  ;; Try partial unconvert
			  (let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (sub1 (bytes-length s2)))])
			    (test 'aborts `(status ,s2) status4))
			  ;; Try unconvert into to-small buffer:
			  (let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (bytes-length s2)
								       (make-bytes (sub1 (bytes-length s))))])
			    (test 'continues 'status status4))))
		      ;; complete => can add bytes onto the end
		      (let-values ([(s2 n status) (bytes-convert c (bytes-append s #"x"))])
			(test 'complete 'status status)
			(test n 'count (add1 (bytes-length s))))
		      ;; complete => need enough room for result
		      (when (positive? (bytes-length s2))
			(let ([dest (make-bytes (bytes-length s2))])
			  (let-values ([(s3 n2 status) (bytes-convert c s 0 n dest 0 (sub1 (bytes-length s2)))])
			    (test 'continues 'status status)
			    (when (positive? n2)
			      (let-values ([(s4 n3 status) (bytes-convert c s n2 n dest s3)])
				(test 'complete 'status status)
				(test s2 'bytes-append dest)
				(test n + n2 n3)
				(test (bytes-length s2) + s3 s4)))))))
		    (when (and (eq? status 'error)
			       (not (eq? parse-status 'error/aborts)))
		      ;; permissive decoder should work:
		      (let ([c2 (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")])
			(let-values ([(s2 n status) (bytes-convert c2 s)])
			  (test 'complete 'status status)
			  ;; Should be the same as decoding corrected UTF-8:
			  (let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
									(bytes->string/utf-8 s #\uFFFD)))])
			    (test s3 `(permissive ,s) s2)))))))))
	    basic-utf-8-tests))

;; Further UTF-16 tests
(let ([c (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
  (let-values ([(s n status) (bytes-convert c (bytes-append
					       (integer->integer-bytes #xD800 2 #f)
					       (integer->integer-bytes #xDC00 2 #f)))])
    (test-values (list #"" 0 'aborts)
		 (lambda () (bytes-convert c (integer->integer-bytes #xD800 2 #f) )))
    ;; Windows: unpaired surrogates allowed:
    (when (eq? 'windows (system-type))
      (test-values (list #"" 0 'aborts)
		   (lambda () (bytes-convert c (integer->integer-bytes #xD8FF 2 #f))))
      (test-values (list #"\355\277\277" 2 'complete)
		   (lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f)))))
    ;; Non-windows: after #xD800 bits, surrogate pair is assumed
    (unless (eq? 'windows (system-type))
      (test-values (list #"" 0 'aborts)
		   (lambda () (bytes-convert c (integer->integer-bytes #xD800 2 #f))))
      (test-values (list #"" 0 'aborts)
		   (lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f) )))
      (test-values (list s 4 'complete)
		   (lambda ()
		     (bytes-convert c
				    (bytes-append (integer->integer-bytes #xD800 2 #f)
						  (integer->integer-bytes #xD800 2 #f)))))
      (test-values (list s 4 'complete)
		   (lambda ()
		     (bytes-convert c
				    (bytes-append (integer->integer-bytes #xD800 2 #f)
						  (integer->integer-bytes #x0000 2 #f)))))
      (test-values (list s 4 'complete)
		   (lambda ()
		     (bytes-convert c
				    (bytes-append (integer->integer-bytes #xDC00 2 #f)
						  (integer->integer-bytes #x1000 2 #f))))))))
(when (eq? (system-type) 'windows)
  (let ([c (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")])
    ;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none
    (test-values (list 12 6 'complete)
		 (lambda ()
		   (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes 12))))
    ;; If we can't look all the way to the end, reliably abort without writing:
    (let ([s (make-bytes 12 (char->integer #\x))])
      (let loop ([n 1])
	(unless (= n 6)
	  (test-values (list 0 0 'aborts)
		       (lambda ()
		         (bytes-convert c #"\355\240\200\355\260\200" 0 n s)))
	  (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
	  (loop (add1 n)))))
    (let ([s (make-bytes 12 (char->integer #\x))])
      (let loop ([n 0])
	(unless (= n 12)
	  (test-values (list 0 0 'continues)
		       (lambda ()
			 (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes n))))
	  (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
	  (loop (add1 n)))))))

;; Seems like this sort of thing should be covered above, and maybe it
;;  it after some other corrections. But just in case:
(let ([check-one
       (lambda (str)
	 (for-each
	  (lambda (str)
	    (for-each
	     (lambda (peek?)
	       (for-each
		(lambda (byte?)
		  (let ([p (open-input-string str)])
		    (test str read-string (string-length str) p)
		    (test eof
			  (if peek?
			      (if byte? peek-byte peek-char)
			      (if byte? read-byte read-char))
			  p)
		    (test eof (if byte? read-byte read-char) p)))
		'(#t #f)))
	     '(#t #f)))
	  (list str
		(string-append str "a")
		(string-append "a" str)
		(string-append str str))))])
  (check-one "a")
  (check-one "\uA4")
  (check-one "\u104")
  (check-one "\u7238")
  (check-one "\Ua7238")
  (check-one "\U107238"))

(test '(#o302 #o251) bytes->list (unicode-vector->bytes (vector 169)))
(test '(#o304 #o250) bytes->list (unicode-vector->bytes (vector 296)))

(test "a\0b" bytes->string/utf-8 #"a\xFFb" (integer->char 0))

(test '("\uA9") regexp-match #rx"." "\uA9")
(test '(#"\302") regexp-match #rx#"." #"\302\251")
(test '(#"\302") regexp-match #rx#"." "\uA9")
(test '(#"\302\251") regexp-match #rx"." #"\302\251")

(test #f regexp-match #rx"[a-z]" "\uA9")
(test '("\uA9") regexp-match #rx"\uA9" "\uA9")
(test '("\uA9") regexp-match #rx"\uA9+" "\uA9")
(test '("\uA9\uA9") regexp-match #rx"\uA9+" "\uA9\uA9")
(test '("\uA9\uA9") regexp-match #rx"\uA9+" "x\uA9\uA9y")
(test '("\uA9") regexp-match #rx"[a-z\uA9]" "\uA9mm")
(test '("\uA9mm") regexp-match #rx"[a-z\uA9]+" "\uA9mm")
(test '("\uA9") regexp-match #rx"[\uA9-\uE9]+" "xx\uA9mm")
(test '("\uE9") regexp-match #rx"[\uA9-\uE9]+" "xx\uE9mm")
(test #f regexp-match #rx"[\uA9-\uE9]+" "xx\u129")
(test #f regexp-match #rx"[\uA9-\uE9]+" "xx\uA8mm")
(test '("\uE9\uA9") regexp-match #rx"[\uA9-\u128]+" "xx\uE9\uA9mm")

(test #f regexp-match #rx"[^a-z][^a-z]" "\uA9")
(test #f regexp-match #rx"[^a-z][^a-z]" (string->bytes/utf-8 "\uA9"))
(test (list (string->bytes/utf-8 "\uA9")) regexp-match #rx#"[^a-z][^a-z]" "\uA9")
(test (list (string->bytes/utf-8 "\uA9")) regexp-match #rx#"[^a-z][^a-z]" (string->bytes/utf-8 "\uA9"))

;; Nots of patterns and ranges:
(test #f regexp-match #rx"[^a-z\uA9]" "\uA9mm")
(test '("") regexp-match #rx"[^a-d\uA9]*" "\uA9mm")
(test '("xx") regexp-match #rx"[^\uA9-\u128]+" "xx\uA9mm")
(test '("xx") regexp-match #rx"[^\uA9-\u128]+" "xx\303\251mm")
(test '("xx\u129mm") regexp-match #rx"[^\uA9-\u128]+" "xx\u129mm")
(test '("xx\uA8mm") regexp-match #rx"[^\uA9-\u128]+" "xx\uA8mm")
(test '("xx") regexp-match #rx"[^\uA9-\u128]+" "xx\uE9\uA9mm")

;; 3-char seqs
(test '("\u1F79") regexp-match #rx"\u1F79" "a\u1F79\u1F79b")
(test '("\u1F79\u1F79") regexp-match #rx"\u1F79+" "a\u1F79\u1F79b")

(test '("\u1F79\uA9\u1F79r") regexp-match #rx"[c-\u1F79]+" "a\u1F79\uA9\u1F79r")
(test '("d\u1F79\uA9\u1F79r") regexp-match #rx"[c-\u1F79]+" "d\u1F79\uA9\u1F79r")

(test '("\u2F79") regexp-match #rx"[\u1F79-\u3F79]" "\u2F79")
(test '("\u1F79") regexp-match #rx"[\u1F79-\u3F79]" "\u1F79")
(test #f regexp-match #rx"[\u1F79-\u3F79]" "\u1F39")
(test #f regexp-match #rx"[\u1F79-\u3F79]" "\u1F78")

;; Nots of 3-char seqs:
(test #f regexp-match #rx"[^\u1F79-\u3F79]" "\u2F79")
(test #f regexp-match #rx"[^\u1F79-\u3F79]" "\u1F79")
(test '("\u1F39") regexp-match #rx"[^\u1F79-\u3F79]" "\u1F39")
(test '("\u1F78") regexp-match #rx"[^\u1F79-\u3F79]" "\u1F78")


;; Regexps that shouldn't parse:
(err/rt-test (regexp "[a--b\u1F78]") exn:fail?)
(err/rt-test (regexp "[a-b-c\u1F78]") exn:fail?)

;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Let Matthew perform some basic sanity checks for locale-sensitive
;; comparisons:
(define known-locale? (and (regexp-match "mflatt|matthewf" (path->string (find-system-path 'home-dir)))
			   (or (regexp-match "linux" (path->string (system-library-subpath)))
			       (eq? 'macosx (system-type)))))

(printf "Known locale?: ~a~n" known-locale?)

;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; String comparison. Relies on the default locale knowing
;;  about upper A with hat (\303\202) and lower a with hat (\303\242),
;;  and also relies on a "C" locale that can't encode those
;;  two characters. It doesn't rely on a relative order of A-hat
;;  and a-hat --- only that they're the same case-insensitively.
(when known-locale?
  (let ()
    (define (stest r comp? a b)
      (test r comp? a b)
      (test r comp? (format "xx~ayy" a) (format "xx~ayy" b))
      (test r comp? (format "x\000x~ay" a) (format "x\000x~ay" b))
      (test r comp? (format "x\000~ay" a) (format "x\000~ay" b))
      (test r comp? (format "x\000~a\000y" a) (format "x\000~a\000y" b)))
    (define (go c?)
      (stest #f string=? "A" "a")
      (stest #t string-ci=? "A" "a")
      (stest #t string-locale-ci=? "A" "a")
      (stest #f string=? "\uC2" "\uE2")
      (stest #t string-ci=? "\uC2" "\uE2")
      (stest #f string-locale=? "\uC2" "\uE2")
      (stest (if c? #f #t) string-locale-ci=? "\uC2" "\uE2")
      (stest #f string<? "\uE2" "b")
      (stest (if c? #f #t) string-locale<? "\uE2" "b")
      (stest #t string>? "\uE2" "b")
      (stest (if c? #t #f) string-locale>? "\uE2" "b")
      (stest #t string<? "b" "\uE2")
      (stest (if c? #t #f) string-locale<? "b" "\uE2")
      (stest #f string>? "b" "\uE2")
      (stest (if c? #f #t) string-locale>? "b" "\uE2")
      (test "ABC" string-locale-upcase "aBc")
      (test (if c? "\uE2" "\uC2") string-locale-upcase "\uE2")
      (test (if c? "A\uE2\0B" "A\uC2\0B") string-locale-upcase "a\uE2\0b")
      (test (if c? "A\uE2\0\uE2\0B" "A\uC2\0\uC2\0B") string-locale-upcase "a\uE2\0\uE2\0b"))
    (go #f)
    (parameterize ([current-locale "C"])
      (go #t))))

;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;  locale<->UTF-8 conversions

(test #f bytes-open-converter "UTF-8" "nowhere")
(test #f bytes-open-converter "nowhere" "UTF-8")
(when known-locale?
  (let ([c (bytes-open-converter "UTF-8" "")])
    (test (void) bytes-close-converter c)
    (test (void) bytes-close-converter c)
    (err/rt-test (bytes-convert c #"hello") exn:application:mismatch?)))

(define (bytes-encode-... from to)
  (lambda args
    (let ([c (bytes-open-converter from to)])
      (dynamic-wind
	  void
	  (lambda ()
	    (apply bytes-convert c args))
	  (lambda ()
	    (bytes-close-converter c))))))

(define bytes-encode-locale->utf-8 (bytes-encode-... "" "UTF-8"))
(define bytes-encode-utf-8->locale (bytes-encode-... "UTF-8" ""))

;; Relies on a default locale that extends Latin-1:
(when known-locale?
  (let-values ([(s l ok?) (bytes-encode-utf-8->locale #"abc")])
    (test 3 'count l)
    (test 'complete 'ok ok?))
  (let-values ([(s l ok?) (bytes-encode-utf-8->locale #"a\300\300")])
    (test 1 'count l)
    (test 'error 'ok ok?))
  (let-values ([(s l ok?) (bytes-encode-utf-8->locale #"a\302\200")])
    (test 3 'count l)
    (test 'complete 'ok ok?))
  (test-values '(#"abcd" 4 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd")))
  (test-values '(#"bcd" 3 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1)))
  (test-values '(#"bc" 2 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3)))
  (test-values '(#"a" 1 aborts) (lambda () (bytes-encode-locale->utf-8 #"a\302\200" 0 2)))
  (test-values '(2 2 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3 (make-bytes 2))))
  (let ([s (make-bytes 10)])
    (test-values '(2 2 complete) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3 s 4 7)))
    (test 98 bytes-ref s 4)
    (test 99 bytes-ref s 5))
  (let ([s (make-bytes 10)])
    (test-values '(1 1 continues) (lambda () (bytes-encode-locale->utf-8 #"abcd" 1 3 s 4 5)))
    (test 98 bytes-ref s 4))
  ;; The rest relies on the "C" locale:
  (parameterize ([current-locale "C"])
    (test-values '(#"abc" 3 complete) (lambda () (bytes-encode-utf-8->locale #"abc")))
    (test-values '(#"a" 1 error) (lambda () (bytes-encode-utf-8->locale #"a\300\300")))
    (test-values '(#"ab" 2 aborts) (lambda () (bytes-encode-utf-8->locale #"ab\303")))
    ;; Well-formed, but can't be converted to "C":
    (test-values '(#"a" 1 error) (lambda () (bytes-encode-utf-8->locale #"a\303\342")))))

(err/rt-test (bytes-encode-utf-8->locale 'ok))
(err/rt-test (bytes-encode-utf-8->locale "ok"))
(err/rt-test (bytes-encode-utf-8->locale #"ok" -1))
(err/rt-test (bytes-encode-utf-8->locale #"ok" 3) exn:application:mismatch?)
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 0) exn:application:mismatch?)
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3) exn:application:mismatch?)
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 2 'nope))
(err/rt-test (bytes-encode-utf-8->locale #"ok" 1 2 #"nope"))
(let ([s (make-string 4)])
  (err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s -1) exn:application:mismatch?)
  (err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s 5) exn:application:mismatch?)
  (err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s 2 1) exn:application:mismatch?)
  (err/rt-test (bytes-encode-utf-8->locale #"ok" 1 3 s 2 7) exn:application:mismatch?))

;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The following tests are derived from SRFI-14

(define (check-all-latin-1 ? l)
  (for-each (lambda (c)
	      (test #t ? c))
	    l)
  (let loop ([i 0])
    (unless (= i 256)
      (unless (memq (integer->char i) l)
	(test #f ? (integer->char i)))
      (loop (add1 i)))))

(define (check-all-unicode ? l)
  (define (unless-in-l ? code c)
    (and (? c)
	 (not (member c l))))
  (define (qtest expect f . args)
    (unless (equal? expect (apply f args))
      (apply test expect f args)))
  (for-each (lambda (c)
	      (test #t ? c))
	    l)
  (for-each (lambda (r)
	      (if (caddr r)
		  (qtest #f unless-in-l ? (car r) (integer->char (car r)))
		  (let loop ([i (car r)])
		    (qtest #f unless-in-l  ? i (integer->char i))
		    (unless (= i (cadr r))
		      (loop (add1 i))))))
	    (make-known-char-range-list)))

(check-all-latin-1
 char-lower-case?
 '(#\a
   #\b
   #\c
   #\d
   #\e
   #\f
   #\g
   #\h
   #\i
   #\j
   #\k
   #\l
   #\m
   #\n
   #\o
   #\p
   #\q
   #\r
   #\s
   #\t
   #\u
   #\v
   #\w
   #\x
   #\y
   #\z
   #\u00B5
   #\u00DF
   #\u00E0
   #\u00E1
   #\u00E2
   #\u00E3
   #\u00E4
   #\u00E5
   #\u00E6
   #\u00E7
   #\u00E8
   #\u00E9
   #\u00EA
   #\u00EB
   #\u00EC
   #\u00ED
   #\u00EE
   #\u00EF
   #\u00F0
   #\u00F1
   #\u00F2
   #\u00F3
   #\u00F4
   #\u00F5
   #\u00F6
   #\u00F8
   #\u00F9
   #\u00FA
   #\u00FB
   #\u00FC
   #\u00FD
   #\u00FE
   #\u00FF
   ;; New definition of lower case:
   #\u00AA
   #\u00BA))

;; No upper case in latin-1
(check-all-latin-1
 (lambda (x) (and (char-lower-case? x)
		  (or (eq? x (char-upcase x))
		      (> (char->integer (char-upcase x)) 255))))
 '(#\u00B5
   #\u00DF
   #\u00FF
   #\u00AA
   #\u00BA))

;; Latin-1 uppercase:
(check-all-latin-1
 char-upper-case?
 '(#\A
   #\B
   #\C
   #\D
   #\E
   #\F
   #\G
   #\H
   #\I
   #\J
   #\K
   #\L
   #\M
   #\N
   #\O
   #\P
   #\Q
   #\R
   #\S
   #\T
   #\U
   #\V
   #\W
   #\X
   #\Y
   #\Z
   #\u00C0
   #\u00C1
   #\u00C2
   #\u00C3
   #\u00C4
   #\u00C5
   #\u00C6
   #\u00C7
   #\u00C8
   #\u00C9
   #\u00CA
   #\u00CB
   #\u00CC
   #\u00CD
   #\u00CE
   #\u00CF
   #\u00D0
   #\u00D1
   #\u00D2
   #\u00D3
   #\u00D4
   #\u00D5
   #\u00D6
   #\u00D8
   #\u00D9
   #\u00DA
   #\u00DB
   #\u00DC
   #\u00DD
   #\u00DE))

;; Extra letters (not lower or upper) in Latin-1:
(check-all-latin-1
 (lambda (c)
   (and (char-alphabetic? c)
	(not (char-upper-case? c))
	(not (char-lower-case? c))))
 '())

;; Complete titlecase list:
(check-all-unicode
 char-title-case?
 '(#\u01C5
   #\u01C8
   #\u01CB
   #\u01F2
   #\u1F88
   #\u1F89
   #\u1F8A
   #\u1F8B
   #\u1F8C
   #\u1F8D
   #\u1F8E
   #\u1F8F
   #\u1F98
   #\u1F99
   #\u1F9A
   #\u1F9B
   #\u1F9C
   #\u1F9D
   #\u1F9E
   #\u1F9F
   #\u1FA8
   #\u1FA9
   #\u1FAA
   #\u1FAB
   #\u1FAC
   #\u1FAD
   #\u1FAE
   #\u1FAF
   #\u1FBC
   #\u1FCC
   #\u1FFC))

;; Complete whitspace list:
(check-all-unicode
 char-whitespace?
 '(#\u0009
   #\u000A
   #\u000B
   #\u000C
   #\u000D
   #\u0020
   #\u0085
   #\u00A0
   #\u1680
   #\u2000
   #\u2001
   #\u2002
   #\u2003
   #\u2004
   #\u2005
   #\u2006
   #\u2007
   #\u2008
   #\u2009
   #\u200A
   ;; #\u200B --- in Unicode 4.0, this code point changed from Zs to Cf
   #\u2028
   #\u2029
   #\u202F
   #\u3000
   ;; Post SRFI-14?
   #\u205F
   #\u180E))

;; Punctuation in Latin-1:
(check-all-latin-1
 char-punctuation?
 '(#\!
   #\"
   #\#
   #\%
   #\&
   #\'
   #\(
   #\)
   #\*
   #\,
   #\-
   #\.
   #\/
   #\:
   #\;
   #\?
   #\@
   #\[
   #\\
   #\]
   #\_
   #\{
   #\}
   #\u00A1
   #\u00AB
   ;; #\u00AD ;; Treated as a control character now?
   #\u00B7
   #\u00BB
   #\u00BF))

;; Symbols in Latin-1
(check-all-latin-1
 char-symbolic?
 '(#\$
   #\+
   #\<
   #\=
   #\>
   #\^
   #\`
   #\|
   #\~
   #\u00A2
   #\u00A3
   #\u00A4
   #\u00A5
   #\u00A6
   #\u00A7
   #\u00A8
   #\u00A9
   #\u00AC
   #\u00AE
   #\u00AF
   #\u00B0
   #\u00B1
   #\u00B4
   #\u00B6
   #\u00B8
   #\u00D7
   #\u00F7))

;; Complete blank list:
(check-all-unicode
 char-blank?
 '(#\u0009
   #\u0020
   #\u00A0
   #\u1680
   #\u2000
   #\u2001
   #\u2002
   #\u2003
   #\u2004
   #\u2005
   #\u2006
   #\u2007
   #\u2008
   #\u2009
   #\u200A
   ;; #\u200B --- see note above
   #\u202F
   #\u3000
   ;; Post SRFI-14?
   #\u205F
   #\u180E
   ))


;; Every letter, digit, punct, and symbol is graphic
(check-all-unicode
 (lambda (c)
   (and (not (char-graphic? c))
	(or (char-alphabetic? c)
	    (char-numeric? c)
	    (char-punctuation? c)
	    (char-symbolic? c))))
 null)

;; Letter, digit, punct, and symbol are distinct
(check-all-unicode
 (lambda (c)
   (> (+ (if (char-alphabetic? c) 1 0)
	 (if (char-numeric? c) 1 0)
	 (if (char-punctuation? c) 1 0)
	 (if (char-symbolic? c)
             (if (char<=? #\u24B6 c #\u24E9)
                 0 ;; Those are both alphabetic and symbolic
                 1)
             0))
      1))
 null)

(check-all-unicode
 char-iso-control?
 (append
  (let loop ([i 0])
    (if (= i #x20)
	null
	(cons (integer->char i) (loop (add1 i)))))
  (let loop ([i #x7F])
    (if (= i #xA0)
	null
	(cons (integer->char i) (loop (add1 i)))))))


;; ----------------------------------------
;; Whatever bytes->string/locale does with a bad locale, it shouldn't crash

(parameterize ([current-locale "no such locale"])
  (with-handlers ([exn:fail:contract? void]) (bytes->string/locale #"xxx"))
  (with-handlers ([exn:fail:contract? void]) (string->bytes/locale "xxx")))


;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;  String-upcase, etc.:

(test "ABC!" string-upcase "abc!")
(test "Z\u7238Z" string-upcase "z\u7238z")
(test "STRASSE" string-upcase "Stra\xDFe")
(test "\u039A\u0391\u039F\u03A3" string-upcase "\u039A\u03b1\u03BF\u03C2")
(test "\u039A\u0391\u039F\u03A3" string-upcase "\u039A\u03b1\u03BF\u03C3")

(test "abc!" string-downcase "aBC!")
(test "z\u7238z" string-downcase "z\u7238Z")
(test "stra\xDFe" string-downcase "Stra\xDFe")
(test "\u03BA\u03b1\u03BF\u03C2" string-downcase "\u039A\u0391\u039F\u03A3")
(test "\u03C3" string-downcase "\u03A3")
(test "x\u03C2" string-downcase "X\u03A3")
(test "\u03BA\u03b1\u03BF\u03C3\u03C2" string-downcase "\u039A\u0391\u039F\u03A3\u03A3")
(test "\u03BA\u03b1\u03BF\u03C2 x" string-downcase "\u039A\u0391\u039F\u03A3 x")

(test "abc!" string-foldcase "aBC!")
(test "z\u7238z" string-foldcase "z\u7238Z")
(test "strasse" string-foldcase "Stra\xDFe")
(test "\u03BA\u03b1\u03BF\u03C3" string-foldcase "\u039A\u0391\u039F\u03A3")
(test "\u03C3" string-foldcase "\u03A3")
(test "x\u03C3" string-foldcase "X\u03A3")
(test "\u03BA\u03b1\u03BF\u03C3\u03C3" string-foldcase "\u039A\u0391\u039F\u03A3\u03A3")
(test "\u03BA\u03b1\u03BF\u03C3 x" string-foldcase "\u039A\u0391\u039F\u03A3 x")

(test "Abc!" string-titlecase "aBC!")
(test "Abc  Two" string-titlecase "aBC  twO")
(test "Abc!Two" string-titlecase "aBC!twO")
(test "Z\u7238Z" string-titlecase "z\u7238Z")
(test "Stra\xDFe" string-titlecase "stra\xDFe")
(test "Stra Sse" string-titlecase "stra \xDFe")
(test "\u039A\u03b1\u03BF\u03C2" string-titlecase "\u039A\u0391\u039F\u03A3")
(test "\u039A\u03b1\u03BF \u03A3x" string-titlecase "\u039A\u0391\u039F \u03A3x")
(test "\u03A3" string-titlecase "\u03A3")
(test "X\u03C2" string-titlecase "x\u03A3")
(test "\u039A\u03b1\u03BF\u03C3\u03C2" string-titlecase "\u039A\u0391\u039F\u03A3\u03A3")
(test "\u039A\u03b1\u03BF\u03C2 X" string-titlecase "\u039A\u0391\u039F\u03A3 x")


(report-errs)