Unicode 5.0, use U+FFFD as replacement char

svn: r8838
This commit is contained in:
Matthew Flatt 2008-03-01 14:17:50 +00:00
parent 57f21b3f2c
commit b3a47edde1
12 changed files with 1829 additions and 1569 deletions

View File

@ -351,9 +351,10 @@ Certain encoding combinations are always available:
@item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] ---
@index['("UTF-8-permissive")]{the} identity conversion, except that
any input byte that is not part of a valid encoding sequence is
effectively replaced by @scheme[(char->integer #\?)]. (This
handling of invalid sequences is consistent with the interpretation
of port bytes streams into characters; see @secref["ports"].)}
effectively replaced by the UTF-8 encoding sequence for
@scheme[#\uFFFD]. (This handling of invalid sequences is
consistent with the interpretation of port bytes streams into
characters; see @secref["ports"].)}
@item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from
the current locale's default encoding (see @secref["encodings"])

View File

@ -139,9 +139,8 @@ otherwise.
@defproc[(char-alphabetic? [char char?]) boolean?]{
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@UCat{Lu}, @UCat{Ll}, @UCat{Lt}, @UCat{Lm}, or @UCat{Lo}, @scheme[#f]
otherwise.}
Returns @scheme[#t] if @scheme[char] has the Unicode ``Alphabetic''
property.}
@defproc[(char-lower-case? [char char?]) boolean?]{
@ -161,8 +160,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@defproc[(char-numeric? [char char?]) boolean?]{
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@UCat{Nd}, @scheme[#f] otherwise.}
Returns @scheme[#t] if @scheme[char] has the Unicode ``Numeric''
property.}
@defproc[(char-symbolic? [char char?]) boolean?]{
@ -185,10 +184,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@defproc[(char-whitespace? [char char?]) boolean?]{
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@UCat{Zs}, @UCat{Zl}, or @UCat{Zp}, or if @scheme[char] is one of the
following: @scheme[#\tab], @scheme[#\newline], @scheme[#\vtab],
@scheme[#\page], @scheme[#\return], or @scheme[#\u0085].}
Returns @scheme[#t] if @scheme[char] has the Unicode ``White_Space''
property.}
@defproc[(char-blank? [char char?]) boolean?]{

View File

@ -17,15 +17,15 @@ stream to discover that the stream is not a valid encoding.
When an input port produces a sequence of bytes that is not a valid
UTF-8 encoding in a character-reading context, then bytes that
constitute an invalid sequence are converted to the character
@litchar{?}. Specifically, bytes 255 and 254 are always converted to
@litchar{?}, bytes in the range 192 to 253 produce @litchar{?} when
they are not followed by bytes that form a valid UTF-8 encoding, and
bytes in the range 128 to 191 are converted to @litchar{?} when they
are not part of a valid encoding that was started by a preceding byte
in the range 192 to 253. To put it another way, when reading a
sequence of bytes as characters, a minimal set of bytes are changed to
63 (which is the value that @scheme[(char->integer #\?)] produces) so
that the entire sequence of bytes is a valid UTF-8 encoding.
@scheme[#\uFFFD]. Specifically, bytes 255 and 254 are always converted
to @scheme[#\uFFFD], bytes in the range 192 to 253 produce
@scheme[#\uFFFD] when they are not followed by bytes that form a valid
UTF-8 encoding, and bytes in the range 128 to 191 are converted to
@scheme[#\uFFFD] when they are not part of a valid encoding that was
started by a preceding byte in the range 192 to 253. To put it another
way, when reading a sequence of bytes as characters, a minimal set of
bytes are changed to the encoding of @scheme[#\uFFFD] so that the
entire sequence of bytes is a valid UTF-8 encoding.
See @secref["bytestrings"] for procedures that facilitate
conversions using UTF-8 or other encodings. See also

View File

@ -110,10 +110,10 @@
;; Test escape printing:
(parameterize ([current-locale #f])
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00012345\""
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00054321\""
'output-escapes
(let ([p (open-output-string)])
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U12345" p)
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U54321" p)
(get-output-string p))))
(parameterize ([current-locale #f])

View File

@ -16,7 +16,7 @@
(define (get-test-file)
(define name "NormalizationTest.txt")
(define base "http://www.unicode.org/Public/4.1.0/ucd/")
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
(define (existing)
(let loop ([dirs (list (current-load-relative-directory)
(current-directory))])

View File

@ -587,6 +587,8 @@
(#(#xFFFF) complete
(#o357 #o277 #o277))))
(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF
(define (string->print s) (map char->integer (string->list s)))
(define bytes->unicode-vector
@ -672,9 +674,9 @@
(let ([convert
(lambda (prefix)
(test (+ (vector-length code-points) (bytes-length prefix))
bytes-utf-8-length (bytes-append prefix s) #\?)
bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
(test (vector-length code-points)
bytes-utf-8-length (bytes-append prefix s) #\? (bytes-length prefix))
bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
(test (if (equal? prefix #"")
#f
(integer->char (bytes-ref prefix 0)))
@ -682,19 +684,19 @@
(test (if (equal? prefix #"")
(if (equal? #() code-points)
#f
#\?)
#\uFFFD)
(integer->char (bytes-ref prefix 0)))
bytes-utf-8-ref (bytes-append prefix s) 0 #\?)
bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
(test (if (equal? #() code-points)
(if (equal? #"" prefix)
#f
(integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
(integer->char
(or (vector-ref code-points (sub1 (vector-length code-points)))
(char->integer #\?))))
(char->integer #\uFFFD))))
bytes-utf-8-ref (bytes-append prefix s)
(max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
#\?)
#\uFFFD)
(let-values ([(s2 n status)
(bytes-convert utf-8-iconv-p (bytes-append prefix s))]
@ -703,10 +705,18 @@
[(error surrogate1 surrogate2)
(test 'complete 'status status)
(test (+ (bytes-length s) pl) 'n n)
(test (+ (vector-length code-points) pl) bytes-length s2)
(test (+ (* replace-size (vector-length code-points))
(if (and (positive? (vector-length code-points))
(eq? (vector-ref code-points (sub1 (vector-length code-points)))
#o40))
;; space at end is converted, not replaced by #\xFFFD
(- 1 replace-size)
0)
pl)
bytes-length s2)
(test (append (bytes->list prefix)
(map
(lambda (i) (or i (char->integer #\?)))
(lambda (i) (or i (char->integer #\uFFFD)))
(vector->list code-points)))
vector->list (bytes->unicode-vector s2))]
[(error/aborts)
@ -718,10 +728,10 @@
(vector->list code-points))
;; indicates how many to be unused due to abort:
(cadddr p))))])
(test (+ (vector-length code-points) pl) bytes-length s2)
(test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
(test (append (bytes->list prefix)
(map
(lambda (i) (or i (char->integer #\?)))
(lambda (i) (or i (char->integer #\uFFFD)))
(vector->list code-points)))
vector->list (bytes->unicode-vector s2)))]
[else
@ -786,9 +796,9 @@
(loop c p))))))))
;; Test read-string decoding
(let ([us (apply string (map (lambda (i)
(if i (integer->char i) #\?))
(if i (integer->char i) #\uFFFD))
(vector->list code-points)))])
(test us bytes->string/utf-8 s #\?)
(test us bytes->string/utf-8 s #\uFFFD)
(test us read-string (vector-length code-points) (open-input-bytes s))
(test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
(unless (string=? "" us)
@ -940,7 +950,7 @@
(test 'complete 'status status)
;; Should be the same as decoding corrected UTF-8:
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
(bytes->string/utf-8 s #\?)))])
(bytes->string/utf-8 s #\uFFFD)))])
(test s3 `(permissive ,s) s2)))))))))
basic-utf-8-tests))
@ -1224,7 +1234,7 @@
(loop (add1 i)))))
(define (check-all-unicode ? l)
(define (unless-in-l ? c)
(define (unless-in-l ? code c)
(and (? c)
(not (member c l))))
(define (qtest expect f . args)
@ -1235,9 +1245,9 @@
l)
(for-each (lambda (r)
(if (caddr r)
(qtest #f unless-in-l ? (integer->char (car r)))
(qtest #f unless-in-l ? (car r) (integer->char (car r)))
(let loop ([i (car r)])
(qtest #f unless-in-l ? (integer->char i))
(qtest #f unless-in-l ? i (integer->char i))
(unless (= i (cadr r))
(loop (add1 i))))))
(make-known-char-range-list)))
@ -1560,7 +1570,11 @@
(> (+ (if (char-alphabetic? c) 1 0)
(if (char-numeric? c) 1 0)
(if (char-punctuation? c) 1 0)
(if (char-symbolic? c) 1 0))
(if (char-symbolic? c)
(if (char<=? #\u24B6 c #\u24E9)
0 ;; Those are both alphabetic and symbolic
1)
0))
1))
null)

View File

@ -135,6 +135,9 @@ but we start with an enumeration of changes:
- In a syntax pattern of the form `(a ... . b)', `b' is now allowed
to match an empty list.
- UTF-8 decoding for ports uses #\uFFFD instead of #\? as the
replacement character for bad encodings.
======================================================================
Porting Advice
======================================================================

View File

@ -1,3 +1,4 @@
#lang scheme/base
;; This script parses UnicodeData.txt (the standard Unicode database,
;; available from the web) and other such files, and it produces
@ -6,13 +7,12 @@
;; is used for string operations.
;; Run as
;; mzscheme -r mk-uchar.ss
;; mzscheme mk-uchar.ss
;; in the script's directory, and have a copy of UnicodeData.txt, etc.
;; in the "Unicode" directory. The file schuchar.inc will be
;; overwritten.
(require mzlib/list)
(require mzscheme)
(require scheme/list)
(define mark-cats '("Mn" "Mc" "Me"))
(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
@ -195,7 +195,7 @@
;; This code assumes that Final_Sigma is the only condition that we care about:
(define special-casings (make-hash-table 'equal))
(define-struct special-casing (lower upper title folding final-sigma?))
(define-struct special-casing (lower upper title folding final-sigma?) #:mutable)
(call-with-input-file "Unicode/SpecialCasing.txt"
(lambda (i)
(let loop ()
@ -215,26 +215,51 @@
(define lower-case (make-hash-table 'equal))
(define upper-case (make-hash-table 'equal))
(define alphabetic (make-hash-table 'equal))
(with-input-from-file "Unicode/DerivedCoreProperties.txt"
(lambda ()
(let loop ()
(let ([l (read-line)])
(unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; (Lower|Upper)case" l)])
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)])
(when m
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
(if m
(string->number (cadr m) 16)
start))]
[t (if (string=? (caddr m) "Lower") lower-case upper-case)])
[t (cond
[(string=? (caddr m) "Lowercase") lower-case]
[(string=? (caddr m) "Uppercase") upper-case]
[(string=? (caddr m) "Alphabetic") alphabetic]
[else (error "unknown property section")])])
(let loop ([i start])
(hash-table-put! t i #t)
(unless (= i end)
(loop (add1 i)))))))
(loop))))))
(define white_spaces (make-hash-table 'equal))
(with-input-from-file "Unicode/PropList.txt"
(lambda ()
(let loop ()
(let ([l (read-line)])
(unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)])
(when m
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
(if m
(string->number (cadr m) 16)
start))])
(let loop ([i start])
(hash-table-put! white_spaces i #t)
(unless (= i end)
(loop (add1 i)))))))
(loop))))))
(define decomp-ht (make-hash-table))
(define k-decomp-ht (make-hash-table))
(define compose-initial-ht (make-hash-table))
@ -290,7 +315,7 @@
(let loop ([prev-code 0])
(let ([l (read-line i)])
(unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
l)])
(unless m
(printf "no match: ~a~n" l))
@ -299,74 +324,89 @@
[cat (cadddr m)]
[combining (string->number (cadddr (cdr m)))]
[decomp (cadddr (cddr m))]
[up (string->number (cadddr (cdddr m)) 16)]
[down (string->number (cadddr (cddddr m)) 16)]
[title (string->number (cadddr (cddddr (cdr m))) 16)])
(mapn code
(if (regexp-match #rx", Last>" name)
(add1 prev-code)
code)
;; The booleans below are in most-siginficant-bit-first order
(combine
;; Decomposition
(extract-decomp decomp code)
;; special-casing
(or (hash-table-get special-casings code (lambda () #f))
(hash-table-get special-case-foldings code (lambda () #f)))
;; case-ignoreable
(or (member code midletters)
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
;; graphic
(member cat graphic-cats)
;; lowercase:
(hash-table-get lower-case code (lambda () #f))
#;
(and (not (<= #x2000 code #x2FFF))
(not down)
(or up
(regexp-match #rx"SMALL LETTER" name)
(regexp-match #rx"SMALL LIGATURE" name)))
;; uppercase;
(hash-table-get upper-case code (lambda () #f))
#;
(and (not (<= #x2000 code #x2FFF))
(not up)
(or down
(regexp-match #rx"CAPITAL LETTER" name)
(regexp-match #rx"CAPITAL LIGATURE" name)))
;; titlecase:
(string=? cat "Lt")
;; letter
(member cat letter-cats)
;; digit
(member cat digit-cats)
;; SOMETHING - this bit not yet used
#f
;; whitespace
(or (member cat space-cats)
(member code '(#x9 #xa #xb #xc #xd #x85)))
;; control
(or (<= #x0000 code #x001F)
(<= #x007F code #x009F))
;; punctuation
(member cat punc-cats)
;; symbol
(member cat sym-cats)
;; blank
(or (string=? cat "Zs")
(= code #x9)))
;; Cases
(combine-case
(if up (- up code) 0)
(if down (- down code) 0)
(if title (- title code) 0)
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
(if case-fold (- case-fold code) 0))
combining)
;; Category
(combine-cat cat)
;; Combining class - used again to filter initial composes
combining)
[numeric (cadddr (cdddr m))]
[up (string->number (cadddr (cddddr m)) 16)]
[down (string->number (cadddr (cddddr (cdr m))) 16)]
[title (string->number (cadddr (cddddr (cddr m))) 16)])
(let ([alphabetic? (hash-table-get alphabetic code #f)]
[numeric? (not (string=? numeric ""))]
[symbolic? (member cat sym-cats)]
[punctuation? (member cat punc-cats)])
(mapn code
(if (regexp-match #rx", Last>" name)
(add1 prev-code)
code)
;; The booleans below are in most-siginficant-bit-first order
(combine
;; Decomposition
(extract-decomp decomp code)
;; special-casing
(or (hash-table-get special-casings code (lambda () #f))
(hash-table-get special-case-foldings code (lambda () #f)))
;; case-ignoreable
(or (member code midletters)
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
;; graphic
(or alphabetic?
numeric?
symbolic?
punctuation?
(member cat mark-cats))
;; lowercase:
(hash-table-get lower-case code (lambda () #f))
#;
(and (not (<= #x2000 code #x2FFF))
(not down)
(or up
(regexp-match #rx"SMALL LETTER" name)
(regexp-match #rx"SMALL LIGATURE" name)))
;; uppercase;
(hash-table-get upper-case code (lambda () #f))
#;
(and (not (<= #x2000 code #x2FFF))
(not up)
(or down
(regexp-match #rx"CAPITAL LETTER" name)
(regexp-match #rx"CAPITAL LIGATURE" name)))
;; titlecase:
(string=? cat "Lt")
;; letter
alphabetic?
#;
(member cat letter-cats)
;; digit
numeric?
#;
(member cat digit-cats)
;; SOMETHING - this bit not yet used
#f
;; whitespace
(hash-table-get white_spaces code #f)
#;
(or (member cat space-cats)
(member code '(#x9 #xa #xb #xc #xd #x85)))
;; control
(or (<= #x0000 code #x001F)
(<= #x007F code #x009F))
;; punctuation
punctuation?
;; symbol
symbolic?
;; blank
(or (string=? cat "Zs")
(= code #x9)))
;; Cases
(combine-case
(if up (- up code) 0)
(if down (- down code) 0)
(if title (- title code) 0)
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
(if case-fold (- case-fold code) 0))
combining)
;; Category
(combine-cat cat)
;; Combining class - used again to filter initial composes
combining))
(loop code))))))))
(hash-table-for-each compose-initial-ht
@ -438,7 +478,7 @@
(define pos2 0)
(define pos3 0)
(current-output-port (open-output-file "schuchar.inc" 'truncate/replace))
(current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace))
(define (hash-vectors! top vectors get-pos set-pos!)
(let loop ([i 0])
@ -628,7 +668,7 @@
;; ----------------------------------------
(current-output-port (open-output-file "schustr.inc" 'truncate/replace))
(current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace))
(printf "/* Generated by mk-uchar.ss */~n~n")

View File

@ -1288,7 +1288,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
int state = ip->utf8state;
int n;
degot += state_len(state);
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, '?');
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, 0xFFFD);
degot += (i + 1 - n);
ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */
}
@ -1331,7 +1331,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
col -= n;
for (i = prev_i; i < got; i++) {
if (buffer[offset + i] == '\t') {
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, '?');
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, 0xFFFD);
degot += ((i - prev_i) - n);
col += n;
col = col - (col & 0x7) + 8;
@ -1339,7 +1339,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
}
}
if (prev_i < i) {
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, '?');
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, 0xFFFD);
n += state_len(state);
col += n;
degot += ((i - prev_i) - n);
@ -2178,7 +2178,7 @@ long scheme_get_char_string(const char *who,
long ulen, glen;
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
buffer, offset, offset + size,
&ulen, 0, '?');
&ulen, 0, 0xFFFD);
if (glen && (ulen < got + leftover)) {
/* Got one, with a decoding error. If we weren't peeking,
don't read the lookahead bytes after all, yet. */
@ -2221,7 +2221,7 @@ long scheme_get_char_string(const char *who,
sequence in an error. We may have more leftover chars
than we need, but they haven't been read, yet. */
while (leftover && size) {
buffer[offset++] = '?';
buffer[offset++] = 0xFFFD;
total_got++;
--leftover;
--size;
@ -2248,7 +2248,7 @@ long scheme_get_char_string(const char *who,
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
buffer, offset, offset + size,
&ulen, 0, '?');
&ulen, 0, 0xFFFD);
total_got += glen;
if (glen == size) {
@ -2268,7 +2268,7 @@ long scheme_get_char_string(const char *who,
/* Leftover bytes must be decoding-error bytes: */
while (leftover) {
buffer[offset++] = '?';
buffer[offset++] = 0xFFFD;
total_got++;
--leftover;
}
@ -2401,7 +2401,7 @@ scheme_getc(Scheme_Object *port)
else {
/* This counts as a decoding error. The high bit
on the first character must be set. */
return '?';
return 0xFFFD;
}
} else {
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
@ -2417,7 +2417,7 @@ scheme_getc(Scheme_Object *port)
return r[0];
} else if (v == -2) {
/* -2 => decoding error */
return '?';
return 0xFFFD;
} else if (v == -1) {
/* In middle of sequence; start/continue peeking bytes */
delta++;
@ -2543,8 +2543,8 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
if (!delta)
return v;
else {
/* This counts as a decoding error, so return '?' */
return '?';
/* This counts as a decoding error, so return 0xFFFD */
return 0xFFFD;
}
} else {
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
@ -2552,7 +2552,7 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
return r[0];
else if (v == -2) {
/* -2 => decoding error */
return '?';
return 0xFFFD;
} else if (v == -1) {
/* In middle of sequence - keep getting bytes. */
delta++;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -852,11 +852,11 @@ Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, long d, long le
if (len) {
ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
NULL, 0, -1,
NULL, 0 /* not UTF-16 */, '?');
NULL, 0 /* not UTF-16 */, 0xFFFD);
us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
scheme_utf8_decode((unsigned char *)chars, d, d + len,
us, 0, -1,
NULL, 0 /* not UTF-16 */, '?');
NULL, 0 /* not UTF-16 */, 0xFFFD);
us[ulen] = 0;
} else {
@ -1291,12 +1291,12 @@ byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
{
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?', 0);
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
}
Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
{
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?');
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
}
/************************* string->bytes *************************/
@ -4166,7 +4166,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
/* Use the built-in UTF-8<->UTF-8 converter: */
kind = mzUTF8_KIND;
if (!strcmp(from_e, "UTF-8-permissive"))
permissive = '?';
permissive = 0xFFFD;
else
permissive = 0;
cd = (iconv_t)-1;
@ -4176,7 +4176,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
&& !strcmp(to_e, "platform-UTF-16")) {
kind = mzUTF8_TO_UTF16_KIND;
if (!strcmp(from_e, "platform-UTF-8-permissive"))
permissive = '?';
permissive = 0xFFFD;
else
permissive = 0;
cd = (iconv_t)-1;
@ -4546,7 +4546,8 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
might_continue => allows -1 result without consuming characters
permissive is non-zero => use permissive as value for bad byte
sequences*/
sequences. When generating UTF-8, this must be an ASCII character
or U+FFFD. */
{
int i, j, oki, failmode = -3, state;
@ -4796,9 +4797,19 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
j += delta;
} else
break;
} else if (us) {
((unsigned char *)us)[j] = v;
}
} else if (v == 0xFFFD) {
if (j + 3 < dend) {
if (us) {
((unsigned char *)us)[j] = 0xEF;
((unsigned char *)us)[j+1] = 0xBF;
((unsigned char *)us)[j+2] = 0xBD;
}
j += 2;
} else
break;
} else if (us) {
((unsigned char *)us)[j] = v;
}
}
} else if (us) {
us[j] = v;