Unicode 5.0, use U+FFFD as replacement char

svn: r8838
2008-03-01 14:17:50 +00:00 · 2008-03-01 14:17:50 +00:00 · b3a47edde1
commit b3a47edde1
parent 57f21b3f2c
12 changed files with 1829 additions and 1569 deletions
--- a/collects/scribblings/reference/bytes.scrbl
+++ b/collects/scribblings/reference/bytes.scrbl
@ -351,9 +351,10 @@ Certain encoding combinations are always available:
 @item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] ---
   @index['("UTF-8-permissive")]{the} identity conversion, except that
   any input byte that is not part of a valid encoding sequence is
-   effectively replaced by @scheme[(char->integer #\?)].  (This
-   handling of invalid sequences is consistent with the interpretation
-   of port bytes streams into characters; see @secref["ports"].)}
+   effectively replaced by the UTF-8 encoding sequence for
+   @scheme[#\uFFFD].  (This handling of invalid sequences is
+   consistent with the interpretation of port bytes streams into
+   characters; see @secref["ports"].)}

 @item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from
   the current locale's default encoding (see @secref["encodings"])
--- a/collects/scribblings/reference/chars.scrbl
+++ b/collects/scribblings/reference/chars.scrbl
@ -139,9 +139,8 @@ otherwise.

@defproc[(char-alphabetic? [char char?]) boolean?]{

-Returns @scheme[#t] if @scheme[char]'s Unicode general category is
-@UCat{Lu}, @UCat{Ll}, @UCat{Lt}, @UCat{Lm}, or @UCat{Lo}, @scheme[#f]
-otherwise.}
+Returns @scheme[#t] if @scheme[char] has the Unicode ``Alphabetic''
+property.}

@defproc[(char-lower-case? [char char?]) boolean?]{

@ -161,8 +160,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is

@defproc[(char-numeric? [char char?]) boolean?]{

-Returns @scheme[#t] if @scheme[char]'s Unicode general category is
-@UCat{Nd}, @scheme[#f] otherwise.}
+Returns @scheme[#t] if @scheme[char] has the Unicode ``Numeric''
+property.}

@defproc[(char-symbolic? [char char?]) boolean?]{

@ -185,10 +184,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is

@defproc[(char-whitespace? [char char?]) boolean?]{

-Returns @scheme[#t] if @scheme[char]'s Unicode general category is
-@UCat{Zs}, @UCat{Zl}, or @UCat{Zp}, or if @scheme[char] is one of the
-following: @scheme[#\tab], @scheme[#\newline], @scheme[#\vtab],
-@scheme[#\page], @scheme[#\return], or @scheme[#\u0085].}
+Returns @scheme[#t] if @scheme[char] has the Unicode ``White_Space''
+property.}

@defproc[(char-blank? [char char?]) boolean?]{

--- a/collects/scribblings/reference/encodings.scrbl
+++ b/collects/scribblings/reference/encodings.scrbl
@ -17,15 +17,15 @@ stream to discover that the stream is not a valid encoding.
 When an input port produces a sequence of bytes that is not a valid
 UTF-8 encoding in a character-reading context, then bytes that
 constitute an invalid sequence are converted to the character
-@litchar{?}. Specifically, bytes 255 and 254 are always converted to
-@litchar{?}, bytes in the range 192 to 253 produce @litchar{?} when
-they are not followed by bytes that form a valid UTF-8 encoding, and
-bytes in the range 128 to 191 are converted to @litchar{?} when they
-are not part of a valid encoding that was started by a preceding byte
-in the range 192 to 253. To put it another way, when reading a
-sequence of bytes as characters, a minimal set of bytes are changed to
-63 (which is the value that @scheme[(char->integer #\?)] produces) so
-that the entire sequence of bytes is a valid UTF-8 encoding.
+@scheme[#\uFFFD]. Specifically, bytes 255 and 254 are always converted
+to @scheme[#\uFFFD], bytes in the range 192 to 253 produce
+@scheme[#\uFFFD] when they are not followed by bytes that form a valid
+UTF-8 encoding, and bytes in the range 128 to 191 are converted to
+@scheme[#\uFFFD] when they are not part of a valid encoding that was
+started by a preceding byte in the range 192 to 253. To put it another
+way, when reading a sequence of bytes as characters, a minimal set of
+bytes are changed to the encoding of @scheme[#\uFFFD] so that the
+entire sequence of bytes is a valid UTF-8 encoding.

 See @secref["bytestrings"] for procedures that facilitate
 conversions using UTF-8 or other encodings. See also
--- a/collects/tests/mzscheme/file.ss
+++ b/collects/tests/mzscheme/file.ss
@ -110,10 +110,10 @@

 ;; Test escape printing:
 (parameterize ([current-locale #f])
-  (test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00012345\""
+  (test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00054321\""
 	'output-escapes
 	(let ([p (open-output-string)])
-	  (write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U12345" p)
+	  (write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U54321" p)
 	  (get-output-string p))))

 (parameterize ([current-locale #f])
--- a/collects/tests/mzscheme/uni-norm.ss
+++ b/collects/tests/mzscheme/uni-norm.ss
@ -16,7 +16,7 @@

 (define (get-test-file)
  (define name "NormalizationTest.txt")
-  (define base "http://www.unicode.org/Public/4.1.0/ucd/")
+  (define base "http://www.unicode.org/Public/5.0.0/ucd/")
  (define (existing)
    (let loop ([dirs (list (current-load-relative-directory)
                           (current-directory))])
--- a/collects/tests/mzscheme/unicode.ss
+++ b/collects/tests/mzscheme/unicode.ss
@ -587,6 +587,8 @@
    (#(#xFFFF) complete
     (#o357 #o277 #o277))))

+(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF
+
 (define (string->print s) (map char->integer (string->list s)))

 (define bytes->unicode-vector
@ -672,9 +674,9 @@
 		      (let ([convert
 			     (lambda (prefix)
 			       (test (+ (vector-length code-points) (bytes-length prefix))
-				     bytes-utf-8-length (bytes-append prefix s) #\?)
+				     bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
 			       (test (vector-length code-points)
-				     bytes-utf-8-length (bytes-append prefix s) #\? (bytes-length prefix))
+				     bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
 			       (test (if (equal? prefix #"") 
 					 #f 
 					 (integer->char (bytes-ref prefix 0)))
@ -682,19 +684,19 @@
 			       (test (if (equal? prefix #"") 
 					 (if (equal? #() code-points)
 					     #f
-					     #\?)
+					     #\uFFFD)
 					 (integer->char (bytes-ref prefix 0)))
-				     bytes-utf-8-ref (bytes-append prefix s) 0 #\?)
+				     bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
 			       (test (if (equal? #() code-points)
 					 (if (equal? #"" prefix)
 					     #f
 					     (integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
 					 (integer->char
 					  (or (vector-ref code-points (sub1 (vector-length code-points)))
-					      (char->integer #\?))))
+					      (char->integer #\uFFFD))))
 				     bytes-utf-8-ref (bytes-append prefix s) 
 				     (max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
-				     #\?)
+				     #\uFFFD)

 			       (let-values ([(s2 n status) 
 					     (bytes-convert utf-8-iconv-p (bytes-append prefix s))]
@ -703,10 +705,18 @@
 				   [(error surrogate1 surrogate2)
 				    (test 'complete 'status status)
 				    (test (+ (bytes-length s) pl) 'n n)
-				    (test (+ (vector-length code-points) pl) bytes-length s2)
+				    (test (+ (* replace-size (vector-length code-points)) 
+                                             (if (and (positive? (vector-length code-points))
+                                                      (eq? (vector-ref code-points (sub1 (vector-length code-points)))
+                                                           #o40))
+                                                 ;; space at end is converted, not replaced by #\xFFFD
+                                                 (- 1 replace-size)
+                                                 0)
+                                             pl) 
+                                          bytes-length s2)
 				    (test (append (bytes->list prefix)
 						  (map 
-						   (lambda (i) (or i (char->integer #\?)))
+						   (lambda (i) (or i (char->integer #\uFFFD)))
 						   (vector->list code-points)))
 					  vector->list (bytes->unicode-vector s2))]
 				   [(error/aborts)
@ -718,10 +728,10 @@
 							   (vector->list code-points))
 							  ;; indicates how many to be unused due to abort:
 							  (cadddr p))))])
-				      (test (+ (vector-length code-points) pl) bytes-length s2)
+				      (test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
 				      (test (append (bytes->list prefix)
 						    (map 
-						     (lambda (i) (or i (char->integer #\?)))
+						     (lambda (i) (or i (char->integer #\uFFFD)))
 						     (vector->list code-points)))
 					    vector->list (bytes->unicode-vector s2)))]
 				   [else
@ -786,9 +796,9 @@
 				(loop c p))))))))
 		;; Test read-string decoding
 		(let ([us (apply string (map (lambda (i)
-					       (if i (integer->char i) #\?))
+					       (if i (integer->char i) #\uFFFD))
 					     (vector->list code-points)))])
-		  (test us bytes->string/utf-8 s #\?)
+		  (test us bytes->string/utf-8 s #\uFFFD)
 		  (test us read-string (vector-length code-points) (open-input-bytes s))
 		  (test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
 		  (unless (string=? "" us)
@ -940,7 +950,7 @@
 			  (test 'complete 'status status)
 			  ;; Should be the same as decoding corrected UTF-8:
 			  (let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
-									(bytes->string/utf-8 s #\?)))])
+									(bytes->string/utf-8 s #\uFFFD)))])
 			    (test s3 `(permissive ,s) s2)))))))))
 	    basic-utf-8-tests))

@ -1224,7 +1234,7 @@
      (loop (add1 i)))))

 (define (check-all-unicode ? l)
-  (define (unless-in-l ? c)
+  (define (unless-in-l ? code c)
    (and (? c)
 	 (not (member c l))))
  (define (qtest expect f . args)
@ -1235,9 +1245,9 @@
 	    l)
  (for-each (lambda (r)
 	      (if (caddr r)
-		  (qtest #f unless-in-l ? (integer->char (car r)))
+		  (qtest #f unless-in-l ? (car r) (integer->char (car r)))
 		  (let loop ([i (car r)])
-		    (qtest #f unless-in-l  ? (integer->char i))
+		    (qtest #f unless-in-l  ? i (integer->char i))
 		    (unless (= i (cadr r))
 		      (loop (add1 i))))))
 	    (make-known-char-range-list)))
@ -1560,7 +1570,11 @@
   (> (+ (if (char-alphabetic? c) 1 0)
 	 (if (char-numeric? c) 1 0)
 	 (if (char-punctuation? c) 1 0)
-	 (if (char-symbolic? c) 1 0))
+	 (if (char-symbolic? c) 
+             (if (char<=? #\u24B6 c #\u24E9)
+                 0 ;; Those are both alphabetic and symbolic
+                 1)
+             0))
      1))
 null)

--- a/doc/release-notes/mzscheme/MzScheme_4.txt
+++ b/doc/release-notes/mzscheme/MzScheme_4.txt
@ -135,6 +135,9 @@ but we start with an enumeration of changes:
 - In a syntax pattern of the form `(a ... . b)', `b' is now allowed
   to match an empty list.

+ - UTF-8 decoding for ports uses #\uFFFD instead of #\? as the
+   replacement character for bad encodings.
+
 ======================================================================
 Porting Advice
 ======================================================================
--- a/src/mzscheme/src/mk-uchar.ss
+++ b/src/mzscheme/src/mk-uchar.ss
@ -1,3 +1,4 @@
+#lang scheme/base 

 ;; This script parses UnicodeData.txt (the standard Unicode database,
 ;; available from the web) and other such files, and it produces
@ -6,13 +7,12 @@
 ;; is used for string operations.

 ;; Run as
-;;   mzscheme -r mk-uchar.ss
+;;   mzscheme mk-uchar.ss
 ;; in the script's directory, and have a copy of UnicodeData.txt, etc.
 ;; in the "Unicode" directory. The file schuchar.inc will be
 ;; overwritten.

-(require mzlib/list)
-(require mzscheme)
+(require scheme/list)

 (define mark-cats '("Mn" "Mc" "Me"))
 (define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
@ -195,7 +195,7 @@

 ;; This code assumes that Final_Sigma is the only condition that we care about:
 (define special-casings (make-hash-table 'equal))
-(define-struct special-casing (lower upper title folding final-sigma?))
+(define-struct special-casing (lower upper title folding final-sigma?) #:mutable)
 (call-with-input-file "Unicode/SpecialCasing.txt"
  (lambda (i)
    (let loop ()
@ -215,26 +215,51 @@

 (define lower-case  (make-hash-table 'equal))
 (define upper-case  (make-hash-table 'equal))
+(define alphabetic  (make-hash-table 'equal))

 (with-input-from-file "Unicode/DerivedCoreProperties.txt"
  (lambda ()
    (let loop ()
      (let ([l (read-line)])
 	(unless (eof-object? l)
-	  (let ([m (regexp-match #rx"^([0-9A-F.]+) *; (Lower|Upper)case" l)])
+	  (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)])
 	    (when m
 	      (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
 		     [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
 			    (if m
 				(string->number (cadr m) 16)
 				start))]
-		     [t (if (string=? (caddr m) "Lower") lower-case upper-case)])
+		     [t (cond
+                         [(string=? (caddr m) "Lowercase") lower-case]
+                         [(string=? (caddr m) "Uppercase") upper-case]
+                         [(string=? (caddr m) "Alphabetic") alphabetic]
+                         [else (error "unknown property section")])])
 		(let loop ([i start])
 		  (hash-table-put! t i #t)
 		  (unless (= i end)
 		    (loop (add1 i)))))))
 	  (loop))))))

+(define white_spaces  (make-hash-table 'equal))
+
+(with-input-from-file "Unicode/PropList.txt"
+  (lambda ()
+    (let loop ()
+      (let ([l (read-line)])
+	(unless (eof-object? l)
+	  (let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)])
+	    (when m
+	      (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
+		     [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
+			    (if m
+				(string->number (cadr m) 16)
+				start))])
+		(let loop ([i start])
+		  (hash-table-put! white_spaces i #t)
+		  (unless (= i end)
+		    (loop (add1 i)))))))
+	  (loop))))))
+
 (define decomp-ht (make-hash-table))
 (define k-decomp-ht (make-hash-table))
 (define compose-initial-ht (make-hash-table))
@ -290,7 +315,7 @@
    (let loop ([prev-code 0])
      (let ([l (read-line i)])
 	(unless (eof-object? l)
-	  (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
+	  (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
 				 l)])
 	    (unless m
 	      (printf "no match: ~a~n" l))
@ -299,74 +324,89 @@
 		  [cat (cadddr m)]
 		  [combining (string->number (cadddr (cdr m)))]
 		  [decomp (cadddr (cddr m))]
-		  [up (string->number (cadddr (cdddr m)) 16)]
-		  [down (string->number (cadddr (cddddr m)) 16)]
-		  [title (string->number (cadddr (cddddr (cdr m))) 16)])
-	      (mapn code
-		    (if (regexp-match #rx", Last>" name)
-			(add1 prev-code)
-			code)
-		    ;; The booleans below are in most-siginficant-bit-first order
-		    (combine
-		     ;; Decomposition
-		     (extract-decomp decomp code)
-		     ;; special-casing
-		     (or (hash-table-get special-casings code (lambda () #f))
-			 (hash-table-get special-case-foldings code (lambda () #f)))
-		     ;; case-ignoreable
-		     (or (member code midletters)
-			 (member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
-		     ;; graphic
-		     (member cat graphic-cats)
-		     ;; lowercase:
-		     (hash-table-get lower-case code (lambda () #f))
-		     #;
-		     (and (not (<= #x2000 code #x2FFF))
-			  (not down)
-			  (or up
-			      (regexp-match #rx"SMALL LETTER" name)
-			      (regexp-match #rx"SMALL LIGATURE" name)))
-		     ;; uppercase;
-		     (hash-table-get upper-case code (lambda () #f))
-		     #;
-		     (and (not (<= #x2000 code #x2FFF))
-			  (not up)
-			  (or down
-			      (regexp-match #rx"CAPITAL LETTER" name)
-			      (regexp-match #rx"CAPITAL LIGATURE" name)))
-		     ;; titlecase:
-		     (string=? cat "Lt")
-		     ;; letter
-		     (member cat letter-cats)
-		     ;; digit
-		     (member cat digit-cats)
-		     ;; SOMETHING - this bit not yet used
-		     #f
-		     ;; whitespace
-		     (or (member cat space-cats)
-			 (member code '(#x9 #xa #xb #xc #xd #x85)))
-		     ;; control
-		     (or (<= #x0000 code #x001F)
-			 (<= #x007F code #x009F))
-		     ;; punctuation
-		     (member cat punc-cats)
-		     ;; symbol
-		     (member cat sym-cats)
-		     ;; blank
-		     (or (string=? cat "Zs")
-			 (= code #x9)))
-		    ;; Cases
-		    (combine-case
-		     (if up (- up code) 0)
-		     (if down (- down code) 0)
-		     (if title (- title code) 0)
-		     (let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
-		       (if case-fold (- case-fold code) 0))
-		     combining)
-		    ;; Category
-		    (combine-cat cat)
-		    ;; Combining class - used again to filter initial composes
-		    combining)
+                  [numeric (cadddr (cdddr m))]
+		  [up (string->number (cadddr (cddddr m)) 16)]
+		  [down (string->number (cadddr (cddddr (cdr m))) 16)]
+		  [title (string->number (cadddr (cddddr (cddr m))) 16)])
+              (let ([alphabetic? (hash-table-get alphabetic code #f)]
+                    [numeric? (not (string=? numeric ""))]
+                    [symbolic? (member cat sym-cats)]
+                    [punctuation? (member cat punc-cats)])
+                (mapn code
+                      (if (regexp-match #rx", Last>" name)
+                          (add1 prev-code)
+                          code)
+                      ;; The booleans below are in most-siginficant-bit-first order
+                      (combine
+                       ;; Decomposition
+                       (extract-decomp decomp code)
+                       ;; special-casing
+                       (or (hash-table-get special-casings code (lambda () #f))
+                           (hash-table-get special-case-foldings code (lambda () #f)))
+                       ;; case-ignoreable
+                       (or (member code midletters)
+                           (member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
+                       ;; graphic
+                       (or alphabetic?
+                           numeric?
+                           symbolic?
+                           punctuation?
+                           (member cat mark-cats))
+                       ;; lowercase:
+                       (hash-table-get lower-case code (lambda () #f))
+                       #;
+                       (and (not (<= #x2000 code #x2FFF))
+                            (not down)
+                            (or up
+                                (regexp-match #rx"SMALL LETTER" name)
+                                (regexp-match #rx"SMALL LIGATURE" name)))
+                       ;; uppercase;
+                       (hash-table-get upper-case code (lambda () #f))
+                       #;
+                       (and (not (<= #x2000 code #x2FFF))
+                            (not up)
+                            (or down
+                                (regexp-match #rx"CAPITAL LETTER" name)
+                                (regexp-match #rx"CAPITAL LIGATURE" name)))
+                       ;; titlecase:
+                       (string=? cat "Lt")
+                       ;; letter
+                       alphabetic?
+                       #;
+                       (member cat letter-cats)
+                       ;; digit
+                       numeric?
+                       #;
+                       (member cat digit-cats)
+                       ;; SOMETHING - this bit not yet used
+                       #f
+                       ;; whitespace
+                       (hash-table-get white_spaces code #f)
+                       #;
+                       (or (member cat space-cats)
+                       (member code '(#x9 #xa #xb #xc #xd #x85)))
+                       ;; control
+                       (or (<= #x0000 code #x001F)
+                           (<= #x007F code #x009F))
+                       ;; punctuation
+                       punctuation?
+                       ;; symbol
+                       symbolic?
+                       ;; blank
+                       (or (string=? cat "Zs")
+                           (= code #x9)))
+                      ;; Cases
+                      (combine-case
+                       (if up (- up code) 0)
+                       (if down (- down code) 0)
+                       (if title (- title code) 0)
+                       (let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
+                         (if case-fold (- case-fold code) 0))
+                       combining)
+                      ;; Category
+                      (combine-cat cat)
+                      ;; Combining class - used again to filter initial composes
+                      combining))
 	      (loop code))))))))

 (hash-table-for-each compose-initial-ht
@ -438,7 +478,7 @@
 (define pos2 0)
 (define pos3 0)

-(current-output-port (open-output-file "schuchar.inc" 'truncate/replace))
+(current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace))

 (define (hash-vectors! top vectors get-pos set-pos!)
  (let loop ([i 0])
@ -628,7 +668,7 @@

 ;; ----------------------------------------

-(current-output-port (open-output-file "schustr.inc" 'truncate/replace))
+(current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace))

 (printf "/* Generated by mk-uchar.ss */~n~n")

--- a/src/mzscheme/src/port.c
+++ b/src/mzscheme/src/port.c
@ -1288,7 +1288,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
    int state = ip->utf8state;
    int n;
    degot += state_len(state);
-    n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, '?');
+    n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, 0xFFFD);
    degot += (i + 1 - n);
    ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */
  }
@ -1331,7 +1331,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
    col -= n;
    for (i = prev_i; i < got; i++) {
      if (buffer[offset + i] == '\t') {
-	n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, '?');
+	n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, 0xFFFD);
 	degot += ((i - prev_i) - n);
 	col += n;
 	col = col - (col & 0x7) + 8;
@ -1339,7 +1339,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
      }
    }
    if (prev_i < i) {
-      n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, '?');
+      n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, 0xFFFD);
      n += state_len(state);
      col += n;
      degot += ((i - prev_i) - n);
@ -2178,7 +2178,7 @@ long scheme_get_char_string(const char *who,
 	  long ulen, glen;
 	  glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
 					      buffer, offset, offset + size,
-					      &ulen, 0, '?');
+					      &ulen, 0, 0xFFFD);
 	  if (glen && (ulen < got + leftover)) {
 	    /* Got one, with a decoding error. If we weren't peeking,
 	       don't read the lookahead bytes after all, yet. */
@ -2221,7 +2221,7 @@ long scheme_get_char_string(const char *who,
 	     sequence in an error. We may have more leftover chars
 	     than we need, but they haven't been read, yet. */
 	  while (leftover && size) {
-	    buffer[offset++] = '?';
+	    buffer[offset++] = 0xFFFD;
 	    total_got++;
 	    --leftover;
 	    --size;
@ -2248,7 +2248,7 @@ long scheme_get_char_string(const char *who,

      glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
 					  buffer, offset, offset + size,
-					  &ulen, 0, '?');
+					  &ulen, 0, 0xFFFD);
      
      total_got += glen;
      if (glen == size) {
@ -2268,7 +2268,7 @@ long scheme_get_char_string(const char *who,

      /* Leftover bytes must be decoding-error bytes: */
      while (leftover) {
-	buffer[offset++] = '?';
+	buffer[offset++] = 0xFFFD;
 	total_got++;
 	--leftover;
      }
@ -2401,7 +2401,7 @@ scheme_getc(Scheme_Object *port)
      else {
 	/* This counts as a decoding error. The high bit
 	   on the first character must be set. */
-	return '?';
+	return 0xFFFD;
      }
    } else {
      v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
@ -2417,7 +2417,7 @@ scheme_getc(Scheme_Object *port)
 	return r[0];
      } else if (v == -2) {
 	/* -2 => decoding error */
-	return '?';
+	return 0xFFFD;
      } else if (v == -1) {
 	/* In middle of sequence; start/continue peeking bytes */
 	delta++;
@ -2543,8 +2543,8 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
      if (!delta)
 	return v;
      else {
-	/* This counts as a decoding error, so return '?' */
-	return '?';
+	/* This counts as a decoding error, so return 0xFFFD */
+	return 0xFFFD;
      }
    } else {
      v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
@ -2552,7 +2552,7 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
 	return r[0];
      else if (v == -2) {
 	/* -2 => decoding error */
-	return '?';
+	return 0xFFFD;
      } else if (v == -1) {
 	/* In middle of sequence - keep getting bytes. */
 	delta++;
--- a/src/mzscheme/src/schuchar.inc
+++ b/src/mzscheme/src/schuchar.inc
--- a/src/mzscheme/src/schustr.inc
+++ b/src/mzscheme/src/schustr.inc
--- a/src/mzscheme/src/string.c
+++ b/src/mzscheme/src/string.c
@ -852,11 +852,11 @@ Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, long d, long le
  if (len) {
    ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
 			      NULL, 0, -1,
-			      NULL, 0 /* not UTF-16 */, '?');
+			      NULL, 0 /* not UTF-16 */, 0xFFFD);
    us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
    scheme_utf8_decode((unsigned char *)chars, d, d + len,
 		       us, 0, -1,
-		       NULL, 0 /* not UTF-16 */, '?');
+		       NULL, 0 /* not UTF-16 */, 0xFFFD);

    us[ulen] = 0;
  } else {
@ -1291,12 +1291,12 @@ byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])

 Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
 {
-  return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?', 0);
+  return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
 }

 Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
 {
-  return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?');
+  return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
 }

 /************************* string->bytes *************************/
@ -4166,7 +4166,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
    /* Use the built-in UTF-8<->UTF-8 converter: */
    kind = mzUTF8_KIND;
    if (!strcmp(from_e, "UTF-8-permissive"))
-      permissive = '?';
+      permissive = 0xFFFD;
    else
      permissive = 0;
    cd = (iconv_t)-1;
@ -4176,7 +4176,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
 	     && !strcmp(to_e, "platform-UTF-16")) {
    kind = mzUTF8_TO_UTF16_KIND;
    if (!strcmp(from_e, "platform-UTF-8-permissive"))
-      permissive = '?';
+      permissive = 0xFFFD;
    else
      permissive = 0;
    cd = (iconv_t)-1;
@ -4546,7 +4546,8 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
 	might_continue => allows -1 result without consuming characters

 	permissive is non-zero => use permissive as value for bad byte
-	sequences*/
+	sequences. When generating UTF-8, this must be an ASCII character
+        or U+FFFD. */

 {
  int i, j, oki, failmode = -3, state;
@ -4796,9 +4797,19 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
 	      j += delta;
 	    } else
 	      break;
-	  } else if (us) {
-	    ((unsigned char *)us)[j] = v;
-	  }
+	  } else if (v == 0xFFFD) {
+            if (j + 3 < dend) {
+              if (us) {
+                ((unsigned char *)us)[j] = 0xEF;
+                ((unsigned char *)us)[j+1] = 0xBF;
+                ((unsigned char *)us)[j+2] = 0xBD;
+              }
+              j += 2;
+            } else
+              break;
+          } else if (us) {
+            ((unsigned char *)us)[j] = v;
+          }
 	}
      } else if (us) {
 	us[j] = v;