improved/corrected UTF-16 tests

svn: r2901
2006-05-10 16:28:25 +00:00 · 2006-05-10 16:28:25 +00:00 · 97e800aeee
commit 97e800aeee
parent 30ad72f765
1 changed files with 85 additions and 33 deletions
--- a/collects/tests/mzscheme/unicode.ss
+++ b/collects/tests/mzscheme/unicode.ss
@ -551,35 +551,35 @@
    (#(#f #f #f #f #f #f) error
     (#o374 #o200 #o200 #o200 #o200 #o200))
    ;; illedgal surrogates
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate1
     (#o355 #o240 #o200))
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate1
     (#o355 #o255 #o277))
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate1
     (#o355 #o256 #o200))
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate1
     (#o355 #o257 #o277))
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate2
     (#o355 #o260 #o200))
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate2
     (#o355 #o276 #o200))
-    (#(#f #f #f) surrogate
+    (#(#f #f #f) surrogate2
     (#o355 #o277 #o277))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o240 #o200 #o355 #o260 #o200))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o240 #o200 #o355 #o277 #o277))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o255 #o277 #o355 #o260 #o200))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o255 #o277 #o355 #o277 #o277))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o256 #o200 #o355 #o260 #o200))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o256 #o200 #o355 #o277 #o277))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o257 #o277 #o355 #o260 #o200))
-    (#(#f #f #f #f #f #f) surrogate
+    (#(#f #f #f #f #f #f) error
     (#o355 #o257 #o277 #o355 #o277 #o277))
    ;; 0xFFFE and 0xFFFF
    (#(#xFFFE) complete
@ -609,7 +609,6 @@
      eof
      (list->string l)))

-#;
 (let ([utf-8-iconv (bytes-open-converter "UTF-8" "UTF-8")]
      [utf-8-iconv-p (bytes-open-converter "UTF-8-permissive" "UTF-8")])
  ;; First, check some simple conversions
@ -667,7 +666,7 @@
 		      (test 0 bytes-utf-8-index s 0)
 		      (let-values ([(s2 n status) (bytes-convert utf-8-iconv s)])
 			(test (case parse-status 
-				[(error/aborts surrogate) 'error]
+				[(error/aborts surrogate1 surrogate2) 'error]
 				[else parse-status ])
 			      'status status))
 		      (let ([convert
@ -701,7 +700,7 @@
 					     (bytes-convert utf-8-iconv-p (bytes-append prefix s))]
 					    [(pl) (bytes-length prefix)])
 				 (case parse-status
-				   [(error surrogate)
+				   [(error surrogate1 surrogate2)
 				    (test 'complete 'status status)
 				    (test (+ (bytes-length s) pl) 'n n)
 				    (test (+ (vector-length code-points) pl) bytes-length s2)
@ -863,9 +862,38 @@
 		(let ([c (bytes-open-converter "platform-UTF-8" "platform-UTF-16")])
 		  (let-values ([(s2 n status) (bytes-convert c s)])
 		    (case parse-status
-		      [(surrogate)
+		      [(surrogate1 surrogate2)
 		       (if (eq? (system-type) 'windows)
-			   (test 'aborts 'status status)
+			   (begin
+			     (if (eq? parse-status 'surrogate1)
+				 (test 'aborts 'status status)
+				 (test 'complete 'status status))
+			     ;; Double the surrogate, and then the "unpaired"
+			     ;; surrogates are still allowed, and they're doubled
+			     (let-values ([(s2 n status) (bytes-convert c (bytes-append s s))])
+			       (if (eq? parse-status 'surrogate1)
+				   (begin
+				     (test 3 'ds-count n)
+				     (test 'aborts `(status ,s) status))
+				   (test 'complete `(status ,s) status)))
+			     (let ([try-xtra
+				    (lambda (xtra all? d-size)
+				      (when all?
+					;; Add non-surrogate to the end, and it should always work:
+				        (let-values ([(s2 n status) (bytes-convert c (bytes-append s xtra))])
+					  (test 'complete 'status status)
+					  (test (+ 3 (bytes-length xtra)) 'cnt n)
+					  (test (* 2 (+ 1 d-size)) bytes-length s2)))
+				      ;; Same, even if we only accept 2 output bytes:
+				      (let-values ([(n0 n status) (bytes-convert c (bytes-append s xtra)
+										 0 (+ 3 (bytes-length xtra))
+										 (make-bytes 2))])
+					 (test 'continues 'status status)
+					 (test 3 'cnt n)
+					 (test 2 'got n0)))])
+			       (try-xtra #"a" #t 1)
+			       (try-xtra #"\360\220\200\200" #t 2)
+			       (try-xtra #"\355\240\200" #f 1)))
 			   (test 'error 'status status))]
 		      [(error error/aborts)
 		       (test 'error 'status status)]
@ -877,17 +905,18 @@
 		      ;; complete => used all bytes
 		      (test n 'count (bytes-length s))
 		      ;; complete => unconverting to return to original
-		      (let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
-			(let-values ([(s3 n2 status2) (bytes-convert c2 s2)])
-			  (test s `(UTF-16-round-trip ,s2) s3)
-			  (test n2 'count (bytes-length s2)))
-			;; Try partial unconvert
-			(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (sub1 (bytes-length s2)))])
-			  (test 'aborts `(status ,s2) status4))
-			;; Try unconvert into to-small buffer:
-			(let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (bytes-length s2) 
-								     (make-bytes (sub1 (bytes-length s))))])
-			  (test 'continues 'status status4)))
+		      (unless (memq parse-status '(surrogate1 surrogate2))
+		        (let ([c2 (bytes-open-converter "platform-UTF-16" "platform-UTF-8")])
+			  (let-values ([(s3 n2 status2) (bytes-convert c2 s2)])
+			    (test s `(UTF-16-round-trip ,s2) s3)
+			    (test n2 'count (bytes-length s2)))
+			  ;; Try partial unconvert
+			  (let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (sub1 (bytes-length s2)))])
+			    (test 'aborts `(status ,s2) status4))
+			  ;; Try unconvert into to-small buffer:
+			  (let-values ([(s4 n4 status4) (bytes-convert c2 s2 0 (bytes-length s2) 
+								       (make-bytes (sub1 (bytes-length s))))])
+			    (test 'continues 'status status4))))
 		      ;; complete => can add bytes onto the end
 		      (let-values ([(s2 n status) (bytes-convert c (bytes-append s #"x"))])
 			(test 'complete 'status status)
@ -912,7 +941,7 @@
 			  ;; Should be the same as decoding corrected UTF-8:
 			  (let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
 									(bytes->string/utf-8 s #\?)))])
-			    (test s3 'permissive s2)))))))))
+			    (test s3 `(permissive ,s) s2)))))))))
 	    basic-utf-8-tests))

 ;; Further UTF-16 tests
@ -926,7 +955,7 @@
    (when (eq? 'windows (system-type))
      (test-values (list #"" 0 'aborts)
 		   (lambda () (bytes-convert c (integer->integer-bytes #xD8FF 2 #f))))
-      (test-values (list #"..." 0 'aborts)
+      (test-values (list #"\355\277\277" 2 'complete)
 		   (lambda () (bytes-convert c (integer->integer-bytes #xDFFF 2 #f)))))
    ;; Non-windows: after #xD800 bits, surrogate pair is assumed
    (unless (eq? 'windows (system-type))
@ -949,6 +978,29 @@
 		     (bytes-convert c
 				    (bytes-append (integer->integer-bytes #xDC00 2 #f) 
 						  (integer->integer-bytes #x1000 2 #f))))))))
+(when (eq? (system-type) 'windows)
+  (let ([c (bytes-open-converter "platform-UTF-8-permissive" "platform-UTF-16")])
+    ;; Check that we use all 6 bytes of #"\355\240\200\355\260\200" or none
+    (test-values (list 12 6 'complete)
+		 (lambda ()
+		   (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes 12))))
+    ;; If we can't look all the way to the end, reliably abort without writing:
+    (let ([s (make-bytes 12 (char->integer #\x))])
+      (let loop ([n 1])
+	(unless (= n 6)
+	  (test-values (list 0 0 'aborts)
+		       (lambda ()
+		         (bytes-convert c #"\355\240\200\355\260\200" 0 n s)))
+	  (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
+	  (loop (add1 n)))))
+    (let ([s (make-bytes 12 (char->integer #\x))])
+      (let loop ([n 0])
+	(unless (= n 12)
+	  (test-values (list 0 0 'continues)
+		       (lambda ()
+			 (bytes-convert c #"\355\240\200\355\260\200" 0 6 (make-bytes n))))
+	  (test #"xxxxxxxxxxxx" values s) ; no writes to bytes string
+	  (loop (add1 n)))))))

 ;; Seems like this sort of thing should be covered above, and maybe it
 ;;  it after some other corrections. But just in case: