Unicode 5.0, use U+FFFD as replacement char
svn: r8838
This commit is contained in:
parent
57f21b3f2c
commit
b3a47edde1
|
@ -351,9 +351,10 @@ Certain encoding combinations are always available:
|
|||
@item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] ---
|
||||
@index['("UTF-8-permissive")]{the} identity conversion, except that
|
||||
any input byte that is not part of a valid encoding sequence is
|
||||
effectively replaced by @scheme[(char->integer #\?)]. (This
|
||||
handling of invalid sequences is consistent with the interpretation
|
||||
of port bytes streams into characters; see @secref["ports"].)}
|
||||
effectively replaced by the UTF-8 encoding sequence for
|
||||
@scheme[#\uFFFD]. (This handling of invalid sequences is
|
||||
consistent with the interpretation of port bytes streams into
|
||||
characters; see @secref["ports"].)}
|
||||
|
||||
@item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from
|
||||
the current locale's default encoding (see @secref["encodings"])
|
||||
|
|
|
@ -139,9 +139,8 @@ otherwise.
|
|||
|
||||
@defproc[(char-alphabetic? [char char?]) boolean?]{
|
||||
|
||||
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
||||
@UCat{Lu}, @UCat{Ll}, @UCat{Lt}, @UCat{Lm}, or @UCat{Lo}, @scheme[#f]
|
||||
otherwise.}
|
||||
Returns @scheme[#t] if @scheme[char] has the Unicode ``Alphabetic''
|
||||
property.}
|
||||
|
||||
@defproc[(char-lower-case? [char char?]) boolean?]{
|
||||
|
||||
|
@ -161,8 +160,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
|||
|
||||
@defproc[(char-numeric? [char char?]) boolean?]{
|
||||
|
||||
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
||||
@UCat{Nd}, @scheme[#f] otherwise.}
|
||||
Returns @scheme[#t] if @scheme[char] has the Unicode ``Numeric''
|
||||
property.}
|
||||
|
||||
@defproc[(char-symbolic? [char char?]) boolean?]{
|
||||
|
||||
|
@ -185,10 +184,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
|||
|
||||
@defproc[(char-whitespace? [char char?]) boolean?]{
|
||||
|
||||
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
||||
@UCat{Zs}, @UCat{Zl}, or @UCat{Zp}, or if @scheme[char] is one of the
|
||||
following: @scheme[#\tab], @scheme[#\newline], @scheme[#\vtab],
|
||||
@scheme[#\page], @scheme[#\return], or @scheme[#\u0085].}
|
||||
Returns @scheme[#t] if @scheme[char] has the Unicode ``White_Space''
|
||||
property.}
|
||||
|
||||
@defproc[(char-blank? [char char?]) boolean?]{
|
||||
|
||||
|
|
|
@ -17,15 +17,15 @@ stream to discover that the stream is not a valid encoding.
|
|||
When an input port produces a sequence of bytes that is not a valid
|
||||
UTF-8 encoding in a character-reading context, then bytes that
|
||||
constitute an invalid sequence are converted to the character
|
||||
@litchar{?}. Specifically, bytes 255 and 254 are always converted to
|
||||
@litchar{?}, bytes in the range 192 to 253 produce @litchar{?} when
|
||||
they are not followed by bytes that form a valid UTF-8 encoding, and
|
||||
bytes in the range 128 to 191 are converted to @litchar{?} when they
|
||||
are not part of a valid encoding that was started by a preceding byte
|
||||
in the range 192 to 253. To put it another way, when reading a
|
||||
sequence of bytes as characters, a minimal set of bytes are changed to
|
||||
63 (which is the value that @scheme[(char->integer #\?)] produces) so
|
||||
that the entire sequence of bytes is a valid UTF-8 encoding.
|
||||
@scheme[#\uFFFD]. Specifically, bytes 255 and 254 are always converted
|
||||
to @scheme[#\uFFFD], bytes in the range 192 to 253 produce
|
||||
@scheme[#\uFFFD] when they are not followed by bytes that form a valid
|
||||
UTF-8 encoding, and bytes in the range 128 to 191 are converted to
|
||||
@scheme[#\uFFFD] when they are not part of a valid encoding that was
|
||||
started by a preceding byte in the range 192 to 253. To put it another
|
||||
way, when reading a sequence of bytes as characters, a minimal set of
|
||||
bytes are changed to the encoding of @scheme[#\uFFFD] so that the
|
||||
entire sequence of bytes is a valid UTF-8 encoding.
|
||||
|
||||
See @secref["bytestrings"] for procedures that facilitate
|
||||
conversions using UTF-8 or other encodings. See also
|
||||
|
|
|
@ -110,10 +110,10 @@
|
|||
|
||||
;; Test escape printing:
|
||||
(parameterize ([current-locale #f])
|
||||
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00012345\""
|
||||
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00054321\""
|
||||
'output-escapes
|
||||
(let ([p (open-output-string)])
|
||||
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U12345" p)
|
||||
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U54321" p)
|
||||
(get-output-string p))))
|
||||
|
||||
(parameterize ([current-locale #f])
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
(define (get-test-file)
|
||||
(define name "NormalizationTest.txt")
|
||||
(define base "http://www.unicode.org/Public/4.1.0/ucd/")
|
||||
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
|
||||
(define (existing)
|
||||
(let loop ([dirs (list (current-load-relative-directory)
|
||||
(current-directory))])
|
||||
|
|
|
@ -587,6 +587,8 @@
|
|||
(#(#xFFFF) complete
|
||||
(#o357 #o277 #o277))))
|
||||
|
||||
(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF
|
||||
|
||||
(define (string->print s) (map char->integer (string->list s)))
|
||||
|
||||
(define bytes->unicode-vector
|
||||
|
@ -672,9 +674,9 @@
|
|||
(let ([convert
|
||||
(lambda (prefix)
|
||||
(test (+ (vector-length code-points) (bytes-length prefix))
|
||||
bytes-utf-8-length (bytes-append prefix s) #\?)
|
||||
bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
|
||||
(test (vector-length code-points)
|
||||
bytes-utf-8-length (bytes-append prefix s) #\? (bytes-length prefix))
|
||||
bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
|
||||
(test (if (equal? prefix #"")
|
||||
#f
|
||||
(integer->char (bytes-ref prefix 0)))
|
||||
|
@ -682,19 +684,19 @@
|
|||
(test (if (equal? prefix #"")
|
||||
(if (equal? #() code-points)
|
||||
#f
|
||||
#\?)
|
||||
#\uFFFD)
|
||||
(integer->char (bytes-ref prefix 0)))
|
||||
bytes-utf-8-ref (bytes-append prefix s) 0 #\?)
|
||||
bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
|
||||
(test (if (equal? #() code-points)
|
||||
(if (equal? #"" prefix)
|
||||
#f
|
||||
(integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
|
||||
(integer->char
|
||||
(or (vector-ref code-points (sub1 (vector-length code-points)))
|
||||
(char->integer #\?))))
|
||||
(char->integer #\uFFFD))))
|
||||
bytes-utf-8-ref (bytes-append prefix s)
|
||||
(max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
|
||||
#\?)
|
||||
#\uFFFD)
|
||||
|
||||
(let-values ([(s2 n status)
|
||||
(bytes-convert utf-8-iconv-p (bytes-append prefix s))]
|
||||
|
@ -703,10 +705,18 @@
|
|||
[(error surrogate1 surrogate2)
|
||||
(test 'complete 'status status)
|
||||
(test (+ (bytes-length s) pl) 'n n)
|
||||
(test (+ (vector-length code-points) pl) bytes-length s2)
|
||||
(test (+ (* replace-size (vector-length code-points))
|
||||
(if (and (positive? (vector-length code-points))
|
||||
(eq? (vector-ref code-points (sub1 (vector-length code-points)))
|
||||
#o40))
|
||||
;; space at end is converted, not replaced by #\xFFFD
|
||||
(- 1 replace-size)
|
||||
0)
|
||||
pl)
|
||||
bytes-length s2)
|
||||
(test (append (bytes->list prefix)
|
||||
(map
|
||||
(lambda (i) (or i (char->integer #\?)))
|
||||
(lambda (i) (or i (char->integer #\uFFFD)))
|
||||
(vector->list code-points)))
|
||||
vector->list (bytes->unicode-vector s2))]
|
||||
[(error/aborts)
|
||||
|
@ -718,10 +728,10 @@
|
|||
(vector->list code-points))
|
||||
;; indicates how many to be unused due to abort:
|
||||
(cadddr p))))])
|
||||
(test (+ (vector-length code-points) pl) bytes-length s2)
|
||||
(test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
|
||||
(test (append (bytes->list prefix)
|
||||
(map
|
||||
(lambda (i) (or i (char->integer #\?)))
|
||||
(lambda (i) (or i (char->integer #\uFFFD)))
|
||||
(vector->list code-points)))
|
||||
vector->list (bytes->unicode-vector s2)))]
|
||||
[else
|
||||
|
@ -786,9 +796,9 @@
|
|||
(loop c p))))))))
|
||||
;; Test read-string decoding
|
||||
(let ([us (apply string (map (lambda (i)
|
||||
(if i (integer->char i) #\?))
|
||||
(if i (integer->char i) #\uFFFD))
|
||||
(vector->list code-points)))])
|
||||
(test us bytes->string/utf-8 s #\?)
|
||||
(test us bytes->string/utf-8 s #\uFFFD)
|
||||
(test us read-string (vector-length code-points) (open-input-bytes s))
|
||||
(test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
|
||||
(unless (string=? "" us)
|
||||
|
@ -940,7 +950,7 @@
|
|||
(test 'complete 'status status)
|
||||
;; Should be the same as decoding corrected UTF-8:
|
||||
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
|
||||
(bytes->string/utf-8 s #\?)))])
|
||||
(bytes->string/utf-8 s #\uFFFD)))])
|
||||
(test s3 `(permissive ,s) s2)))))))))
|
||||
basic-utf-8-tests))
|
||||
|
||||
|
@ -1224,7 +1234,7 @@
|
|||
(loop (add1 i)))))
|
||||
|
||||
(define (check-all-unicode ? l)
|
||||
(define (unless-in-l ? c)
|
||||
(define (unless-in-l ? code c)
|
||||
(and (? c)
|
||||
(not (member c l))))
|
||||
(define (qtest expect f . args)
|
||||
|
@ -1235,9 +1245,9 @@
|
|||
l)
|
||||
(for-each (lambda (r)
|
||||
(if (caddr r)
|
||||
(qtest #f unless-in-l ? (integer->char (car r)))
|
||||
(qtest #f unless-in-l ? (car r) (integer->char (car r)))
|
||||
(let loop ([i (car r)])
|
||||
(qtest #f unless-in-l ? (integer->char i))
|
||||
(qtest #f unless-in-l ? i (integer->char i))
|
||||
(unless (= i (cadr r))
|
||||
(loop (add1 i))))))
|
||||
(make-known-char-range-list)))
|
||||
|
@ -1560,7 +1570,11 @@
|
|||
(> (+ (if (char-alphabetic? c) 1 0)
|
||||
(if (char-numeric? c) 1 0)
|
||||
(if (char-punctuation? c) 1 0)
|
||||
(if (char-symbolic? c) 1 0))
|
||||
(if (char-symbolic? c)
|
||||
(if (char<=? #\u24B6 c #\u24E9)
|
||||
0 ;; Those are both alphabetic and symbolic
|
||||
1)
|
||||
0))
|
||||
1))
|
||||
null)
|
||||
|
||||
|
|
|
@ -135,6 +135,9 @@ but we start with an enumeration of changes:
|
|||
- In a syntax pattern of the form `(a ... . b)', `b' is now allowed
|
||||
to match an empty list.
|
||||
|
||||
- UTF-8 decoding for ports uses #\uFFFD instead of #\? as the
|
||||
replacement character for bad encodings.
|
||||
|
||||
======================================================================
|
||||
Porting Advice
|
||||
======================================================================
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#lang scheme/base
|
||||
|
||||
;; This script parses UnicodeData.txt (the standard Unicode database,
|
||||
;; available from the web) and other such files, and it produces
|
||||
|
@ -6,13 +7,12 @@
|
|||
;; is used for string operations.
|
||||
|
||||
;; Run as
|
||||
;; mzscheme -r mk-uchar.ss
|
||||
;; mzscheme mk-uchar.ss
|
||||
;; in the script's directory, and have a copy of UnicodeData.txt, etc.
|
||||
;; in the "Unicode" directory. The file schuchar.inc will be
|
||||
;; overwritten.
|
||||
|
||||
(require mzlib/list)
|
||||
(require mzscheme)
|
||||
(require scheme/list)
|
||||
|
||||
(define mark-cats '("Mn" "Mc" "Me"))
|
||||
(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
|
||||
|
@ -195,7 +195,7 @@
|
|||
|
||||
;; This code assumes that Final_Sigma is the only condition that we care about:
|
||||
(define special-casings (make-hash-table 'equal))
|
||||
(define-struct special-casing (lower upper title folding final-sigma?))
|
||||
(define-struct special-casing (lower upper title folding final-sigma?) #:mutable)
|
||||
(call-with-input-file "Unicode/SpecialCasing.txt"
|
||||
(lambda (i)
|
||||
(let loop ()
|
||||
|
@ -215,26 +215,51 @@
|
|||
|
||||
(define lower-case (make-hash-table 'equal))
|
||||
(define upper-case (make-hash-table 'equal))
|
||||
(define alphabetic (make-hash-table 'equal))
|
||||
|
||||
(with-input-from-file "Unicode/DerivedCoreProperties.txt"
|
||||
(lambda ()
|
||||
(let loop ()
|
||||
(let ([l (read-line)])
|
||||
(unless (eof-object? l)
|
||||
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; (Lower|Upper)case" l)])
|
||||
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)])
|
||||
(when m
|
||||
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
||||
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
||||
(if m
|
||||
(string->number (cadr m) 16)
|
||||
start))]
|
||||
[t (if (string=? (caddr m) "Lower") lower-case upper-case)])
|
||||
[t (cond
|
||||
[(string=? (caddr m) "Lowercase") lower-case]
|
||||
[(string=? (caddr m) "Uppercase") upper-case]
|
||||
[(string=? (caddr m) "Alphabetic") alphabetic]
|
||||
[else (error "unknown property section")])])
|
||||
(let loop ([i start])
|
||||
(hash-table-put! t i #t)
|
||||
(unless (= i end)
|
||||
(loop (add1 i)))))))
|
||||
(loop))))))
|
||||
|
||||
(define white_spaces (make-hash-table 'equal))
|
||||
|
||||
(with-input-from-file "Unicode/PropList.txt"
|
||||
(lambda ()
|
||||
(let loop ()
|
||||
(let ([l (read-line)])
|
||||
(unless (eof-object? l)
|
||||
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)])
|
||||
(when m
|
||||
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
||||
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
||||
(if m
|
||||
(string->number (cadr m) 16)
|
||||
start))])
|
||||
(let loop ([i start])
|
||||
(hash-table-put! white_spaces i #t)
|
||||
(unless (= i end)
|
||||
(loop (add1 i)))))))
|
||||
(loop))))))
|
||||
|
||||
(define decomp-ht (make-hash-table))
|
||||
(define k-decomp-ht (make-hash-table))
|
||||
(define compose-initial-ht (make-hash-table))
|
||||
|
@ -290,7 +315,7 @@
|
|||
(let loop ([prev-code 0])
|
||||
(let ([l (read-line i)])
|
||||
(unless (eof-object? l)
|
||||
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
|
||||
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
|
||||
l)])
|
||||
(unless m
|
||||
(printf "no match: ~a~n" l))
|
||||
|
@ -299,74 +324,89 @@
|
|||
[cat (cadddr m)]
|
||||
[combining (string->number (cadddr (cdr m)))]
|
||||
[decomp (cadddr (cddr m))]
|
||||
[up (string->number (cadddr (cdddr m)) 16)]
|
||||
[down (string->number (cadddr (cddddr m)) 16)]
|
||||
[title (string->number (cadddr (cddddr (cdr m))) 16)])
|
||||
(mapn code
|
||||
(if (regexp-match #rx", Last>" name)
|
||||
(add1 prev-code)
|
||||
code)
|
||||
;; The booleans below are in most-siginficant-bit-first order
|
||||
(combine
|
||||
;; Decomposition
|
||||
(extract-decomp decomp code)
|
||||
;; special-casing
|
||||
(or (hash-table-get special-casings code (lambda () #f))
|
||||
(hash-table-get special-case-foldings code (lambda () #f)))
|
||||
;; case-ignoreable
|
||||
(or (member code midletters)
|
||||
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
|
||||
;; graphic
|
||||
(member cat graphic-cats)
|
||||
;; lowercase:
|
||||
(hash-table-get lower-case code (lambda () #f))
|
||||
#;
|
||||
(and (not (<= #x2000 code #x2FFF))
|
||||
(not down)
|
||||
(or up
|
||||
(regexp-match #rx"SMALL LETTER" name)
|
||||
(regexp-match #rx"SMALL LIGATURE" name)))
|
||||
;; uppercase;
|
||||
(hash-table-get upper-case code (lambda () #f))
|
||||
#;
|
||||
(and (not (<= #x2000 code #x2FFF))
|
||||
(not up)
|
||||
(or down
|
||||
(regexp-match #rx"CAPITAL LETTER" name)
|
||||
(regexp-match #rx"CAPITAL LIGATURE" name)))
|
||||
;; titlecase:
|
||||
(string=? cat "Lt")
|
||||
;; letter
|
||||
(member cat letter-cats)
|
||||
;; digit
|
||||
(member cat digit-cats)
|
||||
;; SOMETHING - this bit not yet used
|
||||
#f
|
||||
;; whitespace
|
||||
(or (member cat space-cats)
|
||||
(member code '(#x9 #xa #xb #xc #xd #x85)))
|
||||
;; control
|
||||
(or (<= #x0000 code #x001F)
|
||||
(<= #x007F code #x009F))
|
||||
;; punctuation
|
||||
(member cat punc-cats)
|
||||
;; symbol
|
||||
(member cat sym-cats)
|
||||
;; blank
|
||||
(or (string=? cat "Zs")
|
||||
(= code #x9)))
|
||||
;; Cases
|
||||
(combine-case
|
||||
(if up (- up code) 0)
|
||||
(if down (- down code) 0)
|
||||
(if title (- title code) 0)
|
||||
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
|
||||
(if case-fold (- case-fold code) 0))
|
||||
combining)
|
||||
;; Category
|
||||
(combine-cat cat)
|
||||
;; Combining class - used again to filter initial composes
|
||||
combining)
|
||||
[numeric (cadddr (cdddr m))]
|
||||
[up (string->number (cadddr (cddddr m)) 16)]
|
||||
[down (string->number (cadddr (cddddr (cdr m))) 16)]
|
||||
[title (string->number (cadddr (cddddr (cddr m))) 16)])
|
||||
(let ([alphabetic? (hash-table-get alphabetic code #f)]
|
||||
[numeric? (not (string=? numeric ""))]
|
||||
[symbolic? (member cat sym-cats)]
|
||||
[punctuation? (member cat punc-cats)])
|
||||
(mapn code
|
||||
(if (regexp-match #rx", Last>" name)
|
||||
(add1 prev-code)
|
||||
code)
|
||||
;; The booleans below are in most-siginficant-bit-first order
|
||||
(combine
|
||||
;; Decomposition
|
||||
(extract-decomp decomp code)
|
||||
;; special-casing
|
||||
(or (hash-table-get special-casings code (lambda () #f))
|
||||
(hash-table-get special-case-foldings code (lambda () #f)))
|
||||
;; case-ignoreable
|
||||
(or (member code midletters)
|
||||
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
|
||||
;; graphic
|
||||
(or alphabetic?
|
||||
numeric?
|
||||
symbolic?
|
||||
punctuation?
|
||||
(member cat mark-cats))
|
||||
;; lowercase:
|
||||
(hash-table-get lower-case code (lambda () #f))
|
||||
#;
|
||||
(and (not (<= #x2000 code #x2FFF))
|
||||
(not down)
|
||||
(or up
|
||||
(regexp-match #rx"SMALL LETTER" name)
|
||||
(regexp-match #rx"SMALL LIGATURE" name)))
|
||||
;; uppercase;
|
||||
(hash-table-get upper-case code (lambda () #f))
|
||||
#;
|
||||
(and (not (<= #x2000 code #x2FFF))
|
||||
(not up)
|
||||
(or down
|
||||
(regexp-match #rx"CAPITAL LETTER" name)
|
||||
(regexp-match #rx"CAPITAL LIGATURE" name)))
|
||||
;; titlecase:
|
||||
(string=? cat "Lt")
|
||||
;; letter
|
||||
alphabetic?
|
||||
#;
|
||||
(member cat letter-cats)
|
||||
;; digit
|
||||
numeric?
|
||||
#;
|
||||
(member cat digit-cats)
|
||||
;; SOMETHING - this bit not yet used
|
||||
#f
|
||||
;; whitespace
|
||||
(hash-table-get white_spaces code #f)
|
||||
#;
|
||||
(or (member cat space-cats)
|
||||
(member code '(#x9 #xa #xb #xc #xd #x85)))
|
||||
;; control
|
||||
(or (<= #x0000 code #x001F)
|
||||
(<= #x007F code #x009F))
|
||||
;; punctuation
|
||||
punctuation?
|
||||
;; symbol
|
||||
symbolic?
|
||||
;; blank
|
||||
(or (string=? cat "Zs")
|
||||
(= code #x9)))
|
||||
;; Cases
|
||||
(combine-case
|
||||
(if up (- up code) 0)
|
||||
(if down (- down code) 0)
|
||||
(if title (- title code) 0)
|
||||
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
|
||||
(if case-fold (- case-fold code) 0))
|
||||
combining)
|
||||
;; Category
|
||||
(combine-cat cat)
|
||||
;; Combining class - used again to filter initial composes
|
||||
combining))
|
||||
(loop code))))))))
|
||||
|
||||
(hash-table-for-each compose-initial-ht
|
||||
|
@ -438,7 +478,7 @@
|
|||
(define pos2 0)
|
||||
(define pos3 0)
|
||||
|
||||
(current-output-port (open-output-file "schuchar.inc" 'truncate/replace))
|
||||
(current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace))
|
||||
|
||||
(define (hash-vectors! top vectors get-pos set-pos!)
|
||||
(let loop ([i 0])
|
||||
|
@ -628,7 +668,7 @@
|
|||
|
||||
;; ----------------------------------------
|
||||
|
||||
(current-output-port (open-output-file "schustr.inc" 'truncate/replace))
|
||||
(current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace))
|
||||
|
||||
(printf "/* Generated by mk-uchar.ss */~n~n")
|
||||
|
||||
|
|
|
@ -1288,7 +1288,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
|
|||
int state = ip->utf8state;
|
||||
int n;
|
||||
degot += state_len(state);
|
||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, '?');
|
||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, 0xFFFD);
|
||||
degot += (i + 1 - n);
|
||||
ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */
|
||||
}
|
||||
|
@ -1331,7 +1331,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
|
|||
col -= n;
|
||||
for (i = prev_i; i < got; i++) {
|
||||
if (buffer[offset + i] == '\t') {
|
||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, '?');
|
||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, 0xFFFD);
|
||||
degot += ((i - prev_i) - n);
|
||||
col += n;
|
||||
col = col - (col & 0x7) + 8;
|
||||
|
@ -1339,7 +1339,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
|
|||
}
|
||||
}
|
||||
if (prev_i < i) {
|
||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, '?');
|
||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, 0xFFFD);
|
||||
n += state_len(state);
|
||||
col += n;
|
||||
degot += ((i - prev_i) - n);
|
||||
|
@ -2178,7 +2178,7 @@ long scheme_get_char_string(const char *who,
|
|||
long ulen, glen;
|
||||
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
|
||||
buffer, offset, offset + size,
|
||||
&ulen, 0, '?');
|
||||
&ulen, 0, 0xFFFD);
|
||||
if (glen && (ulen < got + leftover)) {
|
||||
/* Got one, with a decoding error. If we weren't peeking,
|
||||
don't read the lookahead bytes after all, yet. */
|
||||
|
@ -2221,7 +2221,7 @@ long scheme_get_char_string(const char *who,
|
|||
sequence in an error. We may have more leftover chars
|
||||
than we need, but they haven't been read, yet. */
|
||||
while (leftover && size) {
|
||||
buffer[offset++] = '?';
|
||||
buffer[offset++] = 0xFFFD;
|
||||
total_got++;
|
||||
--leftover;
|
||||
--size;
|
||||
|
@ -2248,7 +2248,7 @@ long scheme_get_char_string(const char *who,
|
|||
|
||||
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
|
||||
buffer, offset, offset + size,
|
||||
&ulen, 0, '?');
|
||||
&ulen, 0, 0xFFFD);
|
||||
|
||||
total_got += glen;
|
||||
if (glen == size) {
|
||||
|
@ -2268,7 +2268,7 @@ long scheme_get_char_string(const char *who,
|
|||
|
||||
/* Leftover bytes must be decoding-error bytes: */
|
||||
while (leftover) {
|
||||
buffer[offset++] = '?';
|
||||
buffer[offset++] = 0xFFFD;
|
||||
total_got++;
|
||||
--leftover;
|
||||
}
|
||||
|
@ -2401,7 +2401,7 @@ scheme_getc(Scheme_Object *port)
|
|||
else {
|
||||
/* This counts as a decoding error. The high bit
|
||||
on the first character must be set. */
|
||||
return '?';
|
||||
return 0xFFFD;
|
||||
}
|
||||
} else {
|
||||
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
|
||||
|
@ -2417,7 +2417,7 @@ scheme_getc(Scheme_Object *port)
|
|||
return r[0];
|
||||
} else if (v == -2) {
|
||||
/* -2 => decoding error */
|
||||
return '?';
|
||||
return 0xFFFD;
|
||||
} else if (v == -1) {
|
||||
/* In middle of sequence; start/continue peeking bytes */
|
||||
delta++;
|
||||
|
@ -2543,8 +2543,8 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
|
|||
if (!delta)
|
||||
return v;
|
||||
else {
|
||||
/* This counts as a decoding error, so return '?' */
|
||||
return '?';
|
||||
/* This counts as a decoding error, so return 0xFFFD */
|
||||
return 0xFFFD;
|
||||
}
|
||||
} else {
|
||||
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
|
||||
|
@ -2552,7 +2552,7 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
|
|||
return r[0];
|
||||
else if (v == -2) {
|
||||
/* -2 => decoding error */
|
||||
return '?';
|
||||
return 0xFFFD;
|
||||
} else if (v == -1) {
|
||||
/* In middle of sequence - keep getting bytes. */
|
||||
delta++;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -852,11 +852,11 @@ Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, long d, long le
|
|||
if (len) {
|
||||
ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
|
||||
NULL, 0, -1,
|
||||
NULL, 0 /* not UTF-16 */, '?');
|
||||
NULL, 0 /* not UTF-16 */, 0xFFFD);
|
||||
us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
|
||||
scheme_utf8_decode((unsigned char *)chars, d, d + len,
|
||||
us, 0, -1,
|
||||
NULL, 0 /* not UTF-16 */, '?');
|
||||
NULL, 0 /* not UTF-16 */, 0xFFFD);
|
||||
|
||||
us[ulen] = 0;
|
||||
} else {
|
||||
|
@ -1291,12 +1291,12 @@ byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
|
|||
|
||||
Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
|
||||
{
|
||||
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?', 0);
|
||||
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
|
||||
}
|
||||
|
||||
Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
|
||||
{
|
||||
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?');
|
||||
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
|
||||
}
|
||||
|
||||
/************************* string->bytes *************************/
|
||||
|
@ -4166,7 +4166,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
|
|||
/* Use the built-in UTF-8<->UTF-8 converter: */
|
||||
kind = mzUTF8_KIND;
|
||||
if (!strcmp(from_e, "UTF-8-permissive"))
|
||||
permissive = '?';
|
||||
permissive = 0xFFFD;
|
||||
else
|
||||
permissive = 0;
|
||||
cd = (iconv_t)-1;
|
||||
|
@ -4176,7 +4176,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
|
|||
&& !strcmp(to_e, "platform-UTF-16")) {
|
||||
kind = mzUTF8_TO_UTF16_KIND;
|
||||
if (!strcmp(from_e, "platform-UTF-8-permissive"))
|
||||
permissive = '?';
|
||||
permissive = 0xFFFD;
|
||||
else
|
||||
permissive = 0;
|
||||
cd = (iconv_t)-1;
|
||||
|
@ -4546,7 +4546,8 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
|
|||
might_continue => allows -1 result without consuming characters
|
||||
|
||||
permissive is non-zero => use permissive as value for bad byte
|
||||
sequences*/
|
||||
sequences. When generating UTF-8, this must be an ASCII character
|
||||
or U+FFFD. */
|
||||
|
||||
{
|
||||
int i, j, oki, failmode = -3, state;
|
||||
|
@ -4796,9 +4797,19 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
|
|||
j += delta;
|
||||
} else
|
||||
break;
|
||||
} else if (us) {
|
||||
((unsigned char *)us)[j] = v;
|
||||
}
|
||||
} else if (v == 0xFFFD) {
|
||||
if (j + 3 < dend) {
|
||||
if (us) {
|
||||
((unsigned char *)us)[j] = 0xEF;
|
||||
((unsigned char *)us)[j+1] = 0xBF;
|
||||
((unsigned char *)us)[j+2] = 0xBD;
|
||||
}
|
||||
j += 2;
|
||||
} else
|
||||
break;
|
||||
} else if (us) {
|
||||
((unsigned char *)us)[j] = v;
|
||||
}
|
||||
}
|
||||
} else if (us) {
|
||||
us[j] = v;
|
||||
|
|
Loading…
Reference in New Issue
Block a user