Unicode 5.0, use U+FFFD as replacement char

svn: r8838
This commit is contained in:
Matthew Flatt 2008-03-01 14:17:50 +00:00
parent 57f21b3f2c
commit b3a47edde1
12 changed files with 1829 additions and 1569 deletions

View File

@ -351,9 +351,10 @@ Certain encoding combinations are always available:
@item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] --- @item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] ---
@index['("UTF-8-permissive")]{the} identity conversion, except that @index['("UTF-8-permissive")]{the} identity conversion, except that
any input byte that is not part of a valid encoding sequence is any input byte that is not part of a valid encoding sequence is
effectively replaced by @scheme[(char->integer #\?)]. (This effectively replaced by the UTF-8 encoding sequence for
handling of invalid sequences is consistent with the interpretation @scheme[#\uFFFD]. (This handling of invalid sequences is
of port bytes streams into characters; see @secref["ports"].)} consistent with the interpretation of port bytes streams into
characters; see @secref["ports"].)}
@item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from @item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from
the current locale's default encoding (see @secref["encodings"]) the current locale's default encoding (see @secref["encodings"])

View File

@ -139,9 +139,8 @@ otherwise.
@defproc[(char-alphabetic? [char char?]) boolean?]{ @defproc[(char-alphabetic? [char char?]) boolean?]{
Returns @scheme[#t] if @scheme[char]'s Unicode general category is Returns @scheme[#t] if @scheme[char] has the Unicode ``Alphabetic''
@UCat{Lu}, @UCat{Ll}, @UCat{Lt}, @UCat{Lm}, or @UCat{Lo}, @scheme[#f] property.}
otherwise.}
@defproc[(char-lower-case? [char char?]) boolean?]{ @defproc[(char-lower-case? [char char?]) boolean?]{
@ -161,8 +160,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@defproc[(char-numeric? [char char?]) boolean?]{ @defproc[(char-numeric? [char char?]) boolean?]{
Returns @scheme[#t] if @scheme[char]'s Unicode general category is Returns @scheme[#t] if @scheme[char] has the Unicode ``Numeric''
@UCat{Nd}, @scheme[#f] otherwise.} property.}
@defproc[(char-symbolic? [char char?]) boolean?]{ @defproc[(char-symbolic? [char char?]) boolean?]{
@ -185,10 +184,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
@defproc[(char-whitespace? [char char?]) boolean?]{ @defproc[(char-whitespace? [char char?]) boolean?]{
Returns @scheme[#t] if @scheme[char]'s Unicode general category is Returns @scheme[#t] if @scheme[char] has the Unicode ``White_Space''
@UCat{Zs}, @UCat{Zl}, or @UCat{Zp}, or if @scheme[char] is one of the property.}
following: @scheme[#\tab], @scheme[#\newline], @scheme[#\vtab],
@scheme[#\page], @scheme[#\return], or @scheme[#\u0085].}
@defproc[(char-blank? [char char?]) boolean?]{ @defproc[(char-blank? [char char?]) boolean?]{

View File

@ -17,15 +17,15 @@ stream to discover that the stream is not a valid encoding.
When an input port produces a sequence of bytes that is not a valid When an input port produces a sequence of bytes that is not a valid
UTF-8 encoding in a character-reading context, then bytes that UTF-8 encoding in a character-reading context, then bytes that
constitute an invalid sequence are converted to the character constitute an invalid sequence are converted to the character
@litchar{?}. Specifically, bytes 255 and 254 are always converted to @scheme[#\uFFFD]. Specifically, bytes 255 and 254 are always converted
@litchar{?}, bytes in the range 192 to 253 produce @litchar{?} when to @scheme[#\uFFFD], bytes in the range 192 to 253 produce
they are not followed by bytes that form a valid UTF-8 encoding, and @scheme[#\uFFFD] when they are not followed by bytes that form a valid
bytes in the range 128 to 191 are converted to @litchar{?} when they UTF-8 encoding, and bytes in the range 128 to 191 are converted to
are not part of a valid encoding that was started by a preceding byte @scheme[#\uFFFD] when they are not part of a valid encoding that was
in the range 192 to 253. To put it another way, when reading a started by a preceding byte in the range 192 to 253. To put it another
sequence of bytes as characters, a minimal set of bytes are changed to way, when reading a sequence of bytes as characters, a minimal set of
63 (which is the value that @scheme[(char->integer #\?)] produces) so bytes are changed to the encoding of @scheme[#\uFFFD] so that the
that the entire sequence of bytes is a valid UTF-8 encoding. entire sequence of bytes is a valid UTF-8 encoding.
See @secref["bytestrings"] for procedures that facilitate See @secref["bytestrings"] for procedures that facilitate
conversions using UTF-8 or other encodings. See also conversions using UTF-8 or other encodings. See also

View File

@ -110,10 +110,10 @@
;; Test escape printing: ;; Test escape printing:
(parameterize ([current-locale #f]) (parameterize ([current-locale #f])
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00012345\"" (test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00054321\""
'output-escapes 'output-escapes
(let ([p (open-output-string)]) (let ([p (open-output-string)])
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U12345" p) (write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U54321" p)
(get-output-string p)))) (get-output-string p))))
(parameterize ([current-locale #f]) (parameterize ([current-locale #f])

View File

@ -16,7 +16,7 @@
(define (get-test-file) (define (get-test-file)
(define name "NormalizationTest.txt") (define name "NormalizationTest.txt")
(define base "http://www.unicode.org/Public/4.1.0/ucd/") (define base "http://www.unicode.org/Public/5.0.0/ucd/")
(define (existing) (define (existing)
(let loop ([dirs (list (current-load-relative-directory) (let loop ([dirs (list (current-load-relative-directory)
(current-directory))]) (current-directory))])

View File

@ -587,6 +587,8 @@
(#(#xFFFF) complete (#(#xFFFF) complete
(#o357 #o277 #o277)))) (#o357 #o277 #o277))))
(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF
(define (string->print s) (map char->integer (string->list s))) (define (string->print s) (map char->integer (string->list s)))
(define bytes->unicode-vector (define bytes->unicode-vector
@ -672,9 +674,9 @@
(let ([convert (let ([convert
(lambda (prefix) (lambda (prefix)
(test (+ (vector-length code-points) (bytes-length prefix)) (test (+ (vector-length code-points) (bytes-length prefix))
bytes-utf-8-length (bytes-append prefix s) #\?) bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
(test (vector-length code-points) (test (vector-length code-points)
bytes-utf-8-length (bytes-append prefix s) #\? (bytes-length prefix)) bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
(test (if (equal? prefix #"") (test (if (equal? prefix #"")
#f #f
(integer->char (bytes-ref prefix 0))) (integer->char (bytes-ref prefix 0)))
@ -682,19 +684,19 @@
(test (if (equal? prefix #"") (test (if (equal? prefix #"")
(if (equal? #() code-points) (if (equal? #() code-points)
#f #f
#\?) #\uFFFD)
(integer->char (bytes-ref prefix 0))) (integer->char (bytes-ref prefix 0)))
bytes-utf-8-ref (bytes-append prefix s) 0 #\?) bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
(test (if (equal? #() code-points) (test (if (equal? #() code-points)
(if (equal? #"" prefix) (if (equal? #"" prefix)
#f #f
(integer->char (bytes-ref prefix (sub1 (bytes-length prefix))))) (integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
(integer->char (integer->char
(or (vector-ref code-points (sub1 (vector-length code-points))) (or (vector-ref code-points (sub1 (vector-length code-points)))
(char->integer #\?)))) (char->integer #\uFFFD))))
bytes-utf-8-ref (bytes-append prefix s) bytes-utf-8-ref (bytes-append prefix s)
(max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points)))) (max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
#\?) #\uFFFD)
(let-values ([(s2 n status) (let-values ([(s2 n status)
(bytes-convert utf-8-iconv-p (bytes-append prefix s))] (bytes-convert utf-8-iconv-p (bytes-append prefix s))]
@ -703,10 +705,18 @@
[(error surrogate1 surrogate2) [(error surrogate1 surrogate2)
(test 'complete 'status status) (test 'complete 'status status)
(test (+ (bytes-length s) pl) 'n n) (test (+ (bytes-length s) pl) 'n n)
(test (+ (vector-length code-points) pl) bytes-length s2) (test (+ (* replace-size (vector-length code-points))
(if (and (positive? (vector-length code-points))
(eq? (vector-ref code-points (sub1 (vector-length code-points)))
#o40))
;; space at end is converted, not replaced by #\xFFFD
(- 1 replace-size)
0)
pl)
bytes-length s2)
(test (append (bytes->list prefix) (test (append (bytes->list prefix)
(map (map
(lambda (i) (or i (char->integer #\?))) (lambda (i) (or i (char->integer #\uFFFD)))
(vector->list code-points))) (vector->list code-points)))
vector->list (bytes->unicode-vector s2))] vector->list (bytes->unicode-vector s2))]
[(error/aborts) [(error/aborts)
@ -718,10 +728,10 @@
(vector->list code-points)) (vector->list code-points))
;; indicates how many to be unused due to abort: ;; indicates how many to be unused due to abort:
(cadddr p))))]) (cadddr p))))])
(test (+ (vector-length code-points) pl) bytes-length s2) (test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
(test (append (bytes->list prefix) (test (append (bytes->list prefix)
(map (map
(lambda (i) (or i (char->integer #\?))) (lambda (i) (or i (char->integer #\uFFFD)))
(vector->list code-points))) (vector->list code-points)))
vector->list (bytes->unicode-vector s2)))] vector->list (bytes->unicode-vector s2)))]
[else [else
@ -786,9 +796,9 @@
(loop c p)))))))) (loop c p))))))))
;; Test read-string decoding ;; Test read-string decoding
(let ([us (apply string (map (lambda (i) (let ([us (apply string (map (lambda (i)
(if i (integer->char i) #\?)) (if i (integer->char i) #\uFFFD))
(vector->list code-points)))]) (vector->list code-points)))])
(test us bytes->string/utf-8 s #\?) (test us bytes->string/utf-8 s #\uFFFD)
(test us read-string (vector-length code-points) (open-input-bytes s)) (test us read-string (vector-length code-points) (open-input-bytes s))
(test us read-string (* 100 (vector-length code-points)) (open-input-bytes s)) (test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
(unless (string=? "" us) (unless (string=? "" us)
@ -940,7 +950,7 @@
(test 'complete 'status status) (test 'complete 'status status)
;; Should be the same as decoding corrected UTF-8: ;; Should be the same as decoding corrected UTF-8:
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8 (let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
(bytes->string/utf-8 s #\?)))]) (bytes->string/utf-8 s #\uFFFD)))])
(test s3 `(permissive ,s) s2))))))))) (test s3 `(permissive ,s) s2)))))))))
basic-utf-8-tests)) basic-utf-8-tests))
@ -1224,7 +1234,7 @@
(loop (add1 i))))) (loop (add1 i)))))
(define (check-all-unicode ? l) (define (check-all-unicode ? l)
(define (unless-in-l ? c) (define (unless-in-l ? code c)
(and (? c) (and (? c)
(not (member c l)))) (not (member c l))))
(define (qtest expect f . args) (define (qtest expect f . args)
@ -1235,9 +1245,9 @@
l) l)
(for-each (lambda (r) (for-each (lambda (r)
(if (caddr r) (if (caddr r)
(qtest #f unless-in-l ? (integer->char (car r))) (qtest #f unless-in-l ? (car r) (integer->char (car r)))
(let loop ([i (car r)]) (let loop ([i (car r)])
(qtest #f unless-in-l ? (integer->char i)) (qtest #f unless-in-l ? i (integer->char i))
(unless (= i (cadr r)) (unless (= i (cadr r))
(loop (add1 i)))))) (loop (add1 i))))))
(make-known-char-range-list))) (make-known-char-range-list)))
@ -1560,7 +1570,11 @@
(> (+ (if (char-alphabetic? c) 1 0) (> (+ (if (char-alphabetic? c) 1 0)
(if (char-numeric? c) 1 0) (if (char-numeric? c) 1 0)
(if (char-punctuation? c) 1 0) (if (char-punctuation? c) 1 0)
(if (char-symbolic? c) 1 0)) (if (char-symbolic? c)
(if (char<=? #\u24B6 c #\u24E9)
0 ;; Those are both alphabetic and symbolic
1)
0))
1)) 1))
null) null)

View File

@ -135,6 +135,9 @@ but we start with an enumeration of changes:
- In a syntax pattern of the form `(a ... . b)', `b' is now allowed - In a syntax pattern of the form `(a ... . b)', `b' is now allowed
to match an empty list. to match an empty list.
- UTF-8 decoding for ports uses #\uFFFD instead of #\? as the
replacement character for bad encodings.
====================================================================== ======================================================================
Porting Advice Porting Advice
====================================================================== ======================================================================

View File

@ -1,3 +1,4 @@
#lang scheme/base
;; This script parses UnicodeData.txt (the standard Unicode database, ;; This script parses UnicodeData.txt (the standard Unicode database,
;; available from the web) and other such files, and it produces ;; available from the web) and other such files, and it produces
@ -6,13 +7,12 @@
;; is used for string operations. ;; is used for string operations.
;; Run as ;; Run as
;; mzscheme -r mk-uchar.ss ;; mzscheme mk-uchar.ss
;; in the script's directory, and have a copy of UnicodeData.txt, etc. ;; in the script's directory, and have a copy of UnicodeData.txt, etc.
;; in the "Unicode" directory. The file schuchar.inc will be ;; in the "Unicode" directory. The file schuchar.inc will be
;; overwritten. ;; overwritten.
(require mzlib/list) (require scheme/list)
(require mzscheme)
(define mark-cats '("Mn" "Mc" "Me")) (define mark-cats '("Mn" "Mc" "Me"))
(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo")) (define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
@ -195,7 +195,7 @@
;; This code assumes that Final_Sigma is the only condition that we care about: ;; This code assumes that Final_Sigma is the only condition that we care about:
(define special-casings (make-hash-table 'equal)) (define special-casings (make-hash-table 'equal))
(define-struct special-casing (lower upper title folding final-sigma?)) (define-struct special-casing (lower upper title folding final-sigma?) #:mutable)
(call-with-input-file "Unicode/SpecialCasing.txt" (call-with-input-file "Unicode/SpecialCasing.txt"
(lambda (i) (lambda (i)
(let loop () (let loop ()
@ -215,26 +215,51 @@
(define lower-case (make-hash-table 'equal)) (define lower-case (make-hash-table 'equal))
(define upper-case (make-hash-table 'equal)) (define upper-case (make-hash-table 'equal))
(define alphabetic (make-hash-table 'equal))
(with-input-from-file "Unicode/DerivedCoreProperties.txt" (with-input-from-file "Unicode/DerivedCoreProperties.txt"
(lambda () (lambda ()
(let loop () (let loop ()
(let ([l (read-line)]) (let ([l (read-line)])
(unless (eof-object? l) (unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; (Lower|Upper)case" l)]) (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)])
(when m (when m
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)] (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))]) [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
(if m (if m
(string->number (cadr m) 16) (string->number (cadr m) 16)
start))] start))]
[t (if (string=? (caddr m) "Lower") lower-case upper-case)]) [t (cond
[(string=? (caddr m) "Lowercase") lower-case]
[(string=? (caddr m) "Uppercase") upper-case]
[(string=? (caddr m) "Alphabetic") alphabetic]
[else (error "unknown property section")])])
(let loop ([i start]) (let loop ([i start])
(hash-table-put! t i #t) (hash-table-put! t i #t)
(unless (= i end) (unless (= i end)
(loop (add1 i))))))) (loop (add1 i)))))))
(loop)))))) (loop))))))
(define white_spaces (make-hash-table 'equal))
(with-input-from-file "Unicode/PropList.txt"
(lambda ()
(let loop ()
(let ([l (read-line)])
(unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)])
(when m
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
(if m
(string->number (cadr m) 16)
start))])
(let loop ([i start])
(hash-table-put! white_spaces i #t)
(unless (= i end)
(loop (add1 i)))))))
(loop))))))
(define decomp-ht (make-hash-table)) (define decomp-ht (make-hash-table))
(define k-decomp-ht (make-hash-table)) (define k-decomp-ht (make-hash-table))
(define compose-initial-ht (make-hash-table)) (define compose-initial-ht (make-hash-table))
@ -290,7 +315,7 @@
(let loop ([prev-code 0]) (let loop ([prev-code 0])
(let ([l (read-line i)]) (let ([l (read-line i)])
(unless (eof-object? l) (unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)" (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
l)]) l)])
(unless m (unless m
(printf "no match: ~a~n" l)) (printf "no match: ~a~n" l))
@ -299,74 +324,89 @@
[cat (cadddr m)] [cat (cadddr m)]
[combining (string->number (cadddr (cdr m)))] [combining (string->number (cadddr (cdr m)))]
[decomp (cadddr (cddr m))] [decomp (cadddr (cddr m))]
[up (string->number (cadddr (cdddr m)) 16)] [numeric (cadddr (cdddr m))]
[down (string->number (cadddr (cddddr m)) 16)] [up (string->number (cadddr (cddddr m)) 16)]
[title (string->number (cadddr (cddddr (cdr m))) 16)]) [down (string->number (cadddr (cddddr (cdr m))) 16)]
(mapn code [title (string->number (cadddr (cddddr (cddr m))) 16)])
(if (regexp-match #rx", Last>" name) (let ([alphabetic? (hash-table-get alphabetic code #f)]
(add1 prev-code) [numeric? (not (string=? numeric ""))]
code) [symbolic? (member cat sym-cats)]
;; The booleans below are in most-siginficant-bit-first order [punctuation? (member cat punc-cats)])
(combine (mapn code
;; Decomposition (if (regexp-match #rx", Last>" name)
(extract-decomp decomp code) (add1 prev-code)
;; special-casing code)
(or (hash-table-get special-casings code (lambda () #f)) ;; The booleans below are in most-siginficant-bit-first order
(hash-table-get special-case-foldings code (lambda () #f))) (combine
;; case-ignoreable ;; Decomposition
(or (member code midletters) (extract-decomp decomp code)
(member cat '("Mn" "Me" "Cf" "Lm" "Sk"))) ;; special-casing
;; graphic (or (hash-table-get special-casings code (lambda () #f))
(member cat graphic-cats) (hash-table-get special-case-foldings code (lambda () #f)))
;; lowercase: ;; case-ignoreable
(hash-table-get lower-case code (lambda () #f)) (or (member code midletters)
#; (member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
(and (not (<= #x2000 code #x2FFF)) ;; graphic
(not down) (or alphabetic?
(or up numeric?
(regexp-match #rx"SMALL LETTER" name) symbolic?
(regexp-match #rx"SMALL LIGATURE" name))) punctuation?
;; uppercase; (member cat mark-cats))
(hash-table-get upper-case code (lambda () #f)) ;; lowercase:
#; (hash-table-get lower-case code (lambda () #f))
(and (not (<= #x2000 code #x2FFF)) #;
(not up) (and (not (<= #x2000 code #x2FFF))
(or down (not down)
(regexp-match #rx"CAPITAL LETTER" name) (or up
(regexp-match #rx"CAPITAL LIGATURE" name))) (regexp-match #rx"SMALL LETTER" name)
;; titlecase: (regexp-match #rx"SMALL LIGATURE" name)))
(string=? cat "Lt") ;; uppercase;
;; letter (hash-table-get upper-case code (lambda () #f))
(member cat letter-cats) #;
;; digit (and (not (<= #x2000 code #x2FFF))
(member cat digit-cats) (not up)
;; SOMETHING - this bit not yet used (or down
#f (regexp-match #rx"CAPITAL LETTER" name)
;; whitespace (regexp-match #rx"CAPITAL LIGATURE" name)))
(or (member cat space-cats) ;; titlecase:
(member code '(#x9 #xa #xb #xc #xd #x85))) (string=? cat "Lt")
;; control ;; letter
(or (<= #x0000 code #x001F) alphabetic?
(<= #x007F code #x009F)) #;
;; punctuation (member cat letter-cats)
(member cat punc-cats) ;; digit
;; symbol numeric?
(member cat sym-cats) #;
;; blank (member cat digit-cats)
(or (string=? cat "Zs") ;; SOMETHING - this bit not yet used
(= code #x9))) #f
;; Cases ;; whitespace
(combine-case (hash-table-get white_spaces code #f)
(if up (- up code) 0) #;
(if down (- down code) 0) (or (member cat space-cats)
(if title (- title code) 0) (member code '(#x9 #xa #xb #xc #xd #x85)))
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))]) ;; control
(if case-fold (- case-fold code) 0)) (or (<= #x0000 code #x001F)
combining) (<= #x007F code #x009F))
;; Category ;; punctuation
(combine-cat cat) punctuation?
;; Combining class - used again to filter initial composes ;; symbol
combining) symbolic?
;; blank
(or (string=? cat "Zs")
(= code #x9)))
;; Cases
(combine-case
(if up (- up code) 0)
(if down (- down code) 0)
(if title (- title code) 0)
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
(if case-fold (- case-fold code) 0))
combining)
;; Category
(combine-cat cat)
;; Combining class - used again to filter initial composes
combining))
(loop code)))))))) (loop code))))))))
(hash-table-for-each compose-initial-ht (hash-table-for-each compose-initial-ht
@ -438,7 +478,7 @@
(define pos2 0) (define pos2 0)
(define pos3 0) (define pos3 0)
(current-output-port (open-output-file "schuchar.inc" 'truncate/replace)) (current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace))
(define (hash-vectors! top vectors get-pos set-pos!) (define (hash-vectors! top vectors get-pos set-pos!)
(let loop ([i 0]) (let loop ([i 0])
@ -628,7 +668,7 @@
;; ---------------------------------------- ;; ----------------------------------------
(current-output-port (open-output-file "schustr.inc" 'truncate/replace)) (current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace))
(printf "/* Generated by mk-uchar.ss */~n~n") (printf "/* Generated by mk-uchar.ss */~n~n")

View File

@ -1288,7 +1288,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
int state = ip->utf8state; int state = ip->utf8state;
int n; int n;
degot += state_len(state); degot += state_len(state);
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, '?'); n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, 0xFFFD);
degot += (i + 1 - n); degot += (i + 1 - n);
ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */ ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */
} }
@ -1331,7 +1331,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
col -= n; col -= n;
for (i = prev_i; i < got; i++) { for (i = prev_i; i < got; i++) {
if (buffer[offset + i] == '\t') { if (buffer[offset + i] == '\t') {
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, '?'); n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, 0xFFFD);
degot += ((i - prev_i) - n); degot += ((i - prev_i) - n);
col += n; col += n;
col = col - (col & 0x7) + 8; col = col - (col & 0x7) + 8;
@ -1339,7 +1339,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
} }
} }
if (prev_i < i) { if (prev_i < i) {
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, '?'); n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, 0xFFFD);
n += state_len(state); n += state_len(state);
col += n; col += n;
degot += ((i - prev_i) - n); degot += ((i - prev_i) - n);
@ -2178,7 +2178,7 @@ long scheme_get_char_string(const char *who,
long ulen, glen; long ulen, glen;
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover, glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
buffer, offset, offset + size, buffer, offset, offset + size,
&ulen, 0, '?'); &ulen, 0, 0xFFFD);
if (glen && (ulen < got + leftover)) { if (glen && (ulen < got + leftover)) {
/* Got one, with a decoding error. If we weren't peeking, /* Got one, with a decoding error. If we weren't peeking,
don't read the lookahead bytes after all, yet. */ don't read the lookahead bytes after all, yet. */
@ -2221,7 +2221,7 @@ long scheme_get_char_string(const char *who,
sequence in an error. We may have more leftover chars sequence in an error. We may have more leftover chars
than we need, but they haven't been read, yet. */ than we need, but they haven't been read, yet. */
while (leftover && size) { while (leftover && size) {
buffer[offset++] = '?'; buffer[offset++] = 0xFFFD;
total_got++; total_got++;
--leftover; --leftover;
--size; --size;
@ -2248,7 +2248,7 @@ long scheme_get_char_string(const char *who,
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover, glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
buffer, offset, offset + size, buffer, offset, offset + size,
&ulen, 0, '?'); &ulen, 0, 0xFFFD);
total_got += glen; total_got += glen;
if (glen == size) { if (glen == size) {
@ -2268,7 +2268,7 @@ long scheme_get_char_string(const char *who,
/* Leftover bytes must be decoding-error bytes: */ /* Leftover bytes must be decoding-error bytes: */
while (leftover) { while (leftover) {
buffer[offset++] = '?'; buffer[offset++] = 0xFFFD;
total_got++; total_got++;
--leftover; --leftover;
} }
@ -2401,7 +2401,7 @@ scheme_getc(Scheme_Object *port)
else { else {
/* This counts as a decoding error. The high bit /* This counts as a decoding error. The high bit
on the first character must be set. */ on the first character must be set. */
return '?'; return 0xFFFD;
} }
} else { } else {
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0); v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
@ -2417,7 +2417,7 @@ scheme_getc(Scheme_Object *port)
return r[0]; return r[0];
} else if (v == -2) { } else if (v == -2) {
/* -2 => decoding error */ /* -2 => decoding error */
return '?'; return 0xFFFD;
} else if (v == -1) { } else if (v == -1) {
/* In middle of sequence; start/continue peeking bytes */ /* In middle of sequence; start/continue peeking bytes */
delta++; delta++;
@ -2543,8 +2543,8 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
if (!delta) if (!delta)
return v; return v;
else { else {
/* This counts as a decoding error, so return '?' */ /* This counts as a decoding error, so return 0xFFFD */
return '?'; return 0xFFFD;
} }
} else { } else {
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0); v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
@ -2552,7 +2552,7 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
return r[0]; return r[0];
else if (v == -2) { else if (v == -2) {
/* -2 => decoding error */ /* -2 => decoding error */
return '?'; return 0xFFFD;
} else if (v == -1) { } else if (v == -1) {
/* In middle of sequence - keep getting bytes. */ /* In middle of sequence - keep getting bytes. */
delta++; delta++;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -852,11 +852,11 @@ Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, long d, long le
if (len) { if (len) {
ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len, ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
NULL, 0, -1, NULL, 0, -1,
NULL, 0 /* not UTF-16 */, '?'); NULL, 0 /* not UTF-16 */, 0xFFFD);
us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1)); us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
scheme_utf8_decode((unsigned char *)chars, d, d + len, scheme_utf8_decode((unsigned char *)chars, d, d + len,
us, 0, -1, us, 0, -1,
NULL, 0 /* not UTF-16 */, '?'); NULL, 0 /* not UTF-16 */, 0xFFFD);
us[ulen] = 0; us[ulen] = 0;
} else { } else {
@ -1291,12 +1291,12 @@ byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o) Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
{ {
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?', 0); return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
} }
Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o) Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
{ {
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?'); return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
} }
/************************* string->bytes *************************/ /************************* string->bytes *************************/
@ -4166,7 +4166,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
/* Use the built-in UTF-8<->UTF-8 converter: */ /* Use the built-in UTF-8<->UTF-8 converter: */
kind = mzUTF8_KIND; kind = mzUTF8_KIND;
if (!strcmp(from_e, "UTF-8-permissive")) if (!strcmp(from_e, "UTF-8-permissive"))
permissive = '?'; permissive = 0xFFFD;
else else
permissive = 0; permissive = 0;
cd = (iconv_t)-1; cd = (iconv_t)-1;
@ -4176,7 +4176,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
&& !strcmp(to_e, "platform-UTF-16")) { && !strcmp(to_e, "platform-UTF-16")) {
kind = mzUTF8_TO_UTF16_KIND; kind = mzUTF8_TO_UTF16_KIND;
if (!strcmp(from_e, "platform-UTF-8-permissive")) if (!strcmp(from_e, "platform-UTF-8-permissive"))
permissive = '?'; permissive = 0xFFFD;
else else
permissive = 0; permissive = 0;
cd = (iconv_t)-1; cd = (iconv_t)-1;
@ -4546,7 +4546,8 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
might_continue => allows -1 result without consuming characters might_continue => allows -1 result without consuming characters
permissive is non-zero => use permissive as value for bad byte permissive is non-zero => use permissive as value for bad byte
sequences*/ sequences. When generating UTF-8, this must be an ASCII character
or U+FFFD. */
{ {
int i, j, oki, failmode = -3, state; int i, j, oki, failmode = -3, state;
@ -4796,9 +4797,19 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
j += delta; j += delta;
} else } else
break; break;
} else if (us) { } else if (v == 0xFFFD) {
((unsigned char *)us)[j] = v; if (j + 3 < dend) {
} if (us) {
((unsigned char *)us)[j] = 0xEF;
((unsigned char *)us)[j+1] = 0xBF;
((unsigned char *)us)[j+2] = 0xBD;
}
j += 2;
} else
break;
} else if (us) {
((unsigned char *)us)[j] = v;
}
} }
} else if (us) { } else if (us) {
us[j] = v; us[j] = v;