Unicode 5.0, use U+FFFD as replacement char
svn: r8838
This commit is contained in:
parent
57f21b3f2c
commit
b3a47edde1
|
@ -351,9 +351,10 @@ Certain encoding combinations are always available:
|
||||||
@item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] ---
|
@item{@scheme[(bytes-open-converter "UTF-8-permissive" "UTF-8")] ---
|
||||||
@index['("UTF-8-permissive")]{the} identity conversion, except that
|
@index['("UTF-8-permissive")]{the} identity conversion, except that
|
||||||
any input byte that is not part of a valid encoding sequence is
|
any input byte that is not part of a valid encoding sequence is
|
||||||
effectively replaced by @scheme[(char->integer #\?)]. (This
|
effectively replaced by the UTF-8 encoding sequence for
|
||||||
handling of invalid sequences is consistent with the interpretation
|
@scheme[#\uFFFD]. (This handling of invalid sequences is
|
||||||
of port bytes streams into characters; see @secref["ports"].)}
|
consistent with the interpretation of port bytes streams into
|
||||||
|
characters; see @secref["ports"].)}
|
||||||
|
|
||||||
@item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from
|
@item{@scheme[(bytes-open-converter "" "UTF-8")] --- converts from
|
||||||
the current locale's default encoding (see @secref["encodings"])
|
the current locale's default encoding (see @secref["encodings"])
|
||||||
|
|
|
@ -139,9 +139,8 @@ otherwise.
|
||||||
|
|
||||||
@defproc[(char-alphabetic? [char char?]) boolean?]{
|
@defproc[(char-alphabetic? [char char?]) boolean?]{
|
||||||
|
|
||||||
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
Returns @scheme[#t] if @scheme[char] has the Unicode ``Alphabetic''
|
||||||
@UCat{Lu}, @UCat{Ll}, @UCat{Lt}, @UCat{Lm}, or @UCat{Lo}, @scheme[#f]
|
property.}
|
||||||
otherwise.}
|
|
||||||
|
|
||||||
@defproc[(char-lower-case? [char char?]) boolean?]{
|
@defproc[(char-lower-case? [char char?]) boolean?]{
|
||||||
|
|
||||||
|
@ -161,8 +160,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
||||||
|
|
||||||
@defproc[(char-numeric? [char char?]) boolean?]{
|
@defproc[(char-numeric? [char char?]) boolean?]{
|
||||||
|
|
||||||
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
Returns @scheme[#t] if @scheme[char] has the Unicode ``Numeric''
|
||||||
@UCat{Nd}, @scheme[#f] otherwise.}
|
property.}
|
||||||
|
|
||||||
@defproc[(char-symbolic? [char char?]) boolean?]{
|
@defproc[(char-symbolic? [char char?]) boolean?]{
|
||||||
|
|
||||||
|
@ -185,10 +184,8 @@ Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
||||||
|
|
||||||
@defproc[(char-whitespace? [char char?]) boolean?]{
|
@defproc[(char-whitespace? [char char?]) boolean?]{
|
||||||
|
|
||||||
Returns @scheme[#t] if @scheme[char]'s Unicode general category is
|
Returns @scheme[#t] if @scheme[char] has the Unicode ``White_Space''
|
||||||
@UCat{Zs}, @UCat{Zl}, or @UCat{Zp}, or if @scheme[char] is one of the
|
property.}
|
||||||
following: @scheme[#\tab], @scheme[#\newline], @scheme[#\vtab],
|
|
||||||
@scheme[#\page], @scheme[#\return], or @scheme[#\u0085].}
|
|
||||||
|
|
||||||
@defproc[(char-blank? [char char?]) boolean?]{
|
@defproc[(char-blank? [char char?]) boolean?]{
|
||||||
|
|
||||||
|
|
|
@ -17,15 +17,15 @@ stream to discover that the stream is not a valid encoding.
|
||||||
When an input port produces a sequence of bytes that is not a valid
|
When an input port produces a sequence of bytes that is not a valid
|
||||||
UTF-8 encoding in a character-reading context, then bytes that
|
UTF-8 encoding in a character-reading context, then bytes that
|
||||||
constitute an invalid sequence are converted to the character
|
constitute an invalid sequence are converted to the character
|
||||||
@litchar{?}. Specifically, bytes 255 and 254 are always converted to
|
@scheme[#\uFFFD]. Specifically, bytes 255 and 254 are always converted
|
||||||
@litchar{?}, bytes in the range 192 to 253 produce @litchar{?} when
|
to @scheme[#\uFFFD], bytes in the range 192 to 253 produce
|
||||||
they are not followed by bytes that form a valid UTF-8 encoding, and
|
@scheme[#\uFFFD] when they are not followed by bytes that form a valid
|
||||||
bytes in the range 128 to 191 are converted to @litchar{?} when they
|
UTF-8 encoding, and bytes in the range 128 to 191 are converted to
|
||||||
are not part of a valid encoding that was started by a preceding byte
|
@scheme[#\uFFFD] when they are not part of a valid encoding that was
|
||||||
in the range 192 to 253. To put it another way, when reading a
|
started by a preceding byte in the range 192 to 253. To put it another
|
||||||
sequence of bytes as characters, a minimal set of bytes are changed to
|
way, when reading a sequence of bytes as characters, a minimal set of
|
||||||
63 (which is the value that @scheme[(char->integer #\?)] produces) so
|
bytes are changed to the encoding of @scheme[#\uFFFD] so that the
|
||||||
that the entire sequence of bytes is a valid UTF-8 encoding.
|
entire sequence of bytes is a valid UTF-8 encoding.
|
||||||
|
|
||||||
See @secref["bytestrings"] for procedures that facilitate
|
See @secref["bytestrings"] for procedures that facilitate
|
||||||
conversions using UTF-8 or other encodings. See also
|
conversions using UTF-8 or other encodings. See also
|
||||||
|
|
|
@ -110,10 +110,10 @@
|
||||||
|
|
||||||
;; Test escape printing:
|
;; Test escape printing:
|
||||||
(parameterize ([current-locale #f])
|
(parameterize ([current-locale #f])
|
||||||
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00012345\""
|
(test "\"\\a\\b\\t\\n\\f\\r\\e\\v\\\\\\\"A \\u0005A\\u000FP\\u000FP\u00DDD\u00FF7\\u00011\\U00054321\""
|
||||||
'output-escapes
|
'output-escapes
|
||||||
(let ([p (open-output-string)])
|
(let ([p (open-output-string)])
|
||||||
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U12345" p)
|
(write "\a\b\t\n\f\r\e\v\\\"\101\40\5A\xFP\xfP\xdDD\3777\0011\U54321" p)
|
||||||
(get-output-string p))))
|
(get-output-string p))))
|
||||||
|
|
||||||
(parameterize ([current-locale #f])
|
(parameterize ([current-locale #f])
|
||||||
|
|
|
@ -16,7 +16,7 @@
|
||||||
|
|
||||||
(define (get-test-file)
|
(define (get-test-file)
|
||||||
(define name "NormalizationTest.txt")
|
(define name "NormalizationTest.txt")
|
||||||
(define base "http://www.unicode.org/Public/4.1.0/ucd/")
|
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
|
||||||
(define (existing)
|
(define (existing)
|
||||||
(let loop ([dirs (list (current-load-relative-directory)
|
(let loop ([dirs (list (current-load-relative-directory)
|
||||||
(current-directory))])
|
(current-directory))])
|
||||||
|
|
|
@ -587,6 +587,8 @@
|
||||||
(#(#xFFFF) complete
|
(#(#xFFFF) complete
|
||||||
(#o357 #o277 #o277))))
|
(#o357 #o277 #o277))))
|
||||||
|
|
||||||
|
(define replace-size 3) ; #\uFFFD -> 3 bytes in UTF
|
||||||
|
|
||||||
(define (string->print s) (map char->integer (string->list s)))
|
(define (string->print s) (map char->integer (string->list s)))
|
||||||
|
|
||||||
(define bytes->unicode-vector
|
(define bytes->unicode-vector
|
||||||
|
@ -672,9 +674,9 @@
|
||||||
(let ([convert
|
(let ([convert
|
||||||
(lambda (prefix)
|
(lambda (prefix)
|
||||||
(test (+ (vector-length code-points) (bytes-length prefix))
|
(test (+ (vector-length code-points) (bytes-length prefix))
|
||||||
bytes-utf-8-length (bytes-append prefix s) #\?)
|
bytes-utf-8-length (bytes-append prefix s) #\uFFFD)
|
||||||
(test (vector-length code-points)
|
(test (vector-length code-points)
|
||||||
bytes-utf-8-length (bytes-append prefix s) #\? (bytes-length prefix))
|
bytes-utf-8-length (bytes-append prefix s) #\uFFFD (bytes-length prefix))
|
||||||
(test (if (equal? prefix #"")
|
(test (if (equal? prefix #"")
|
||||||
#f
|
#f
|
||||||
(integer->char (bytes-ref prefix 0)))
|
(integer->char (bytes-ref prefix 0)))
|
||||||
|
@ -682,19 +684,19 @@
|
||||||
(test (if (equal? prefix #"")
|
(test (if (equal? prefix #"")
|
||||||
(if (equal? #() code-points)
|
(if (equal? #() code-points)
|
||||||
#f
|
#f
|
||||||
#\?)
|
#\uFFFD)
|
||||||
(integer->char (bytes-ref prefix 0)))
|
(integer->char (bytes-ref prefix 0)))
|
||||||
bytes-utf-8-ref (bytes-append prefix s) 0 #\?)
|
bytes-utf-8-ref (bytes-append prefix s) 0 #\uFFFD)
|
||||||
(test (if (equal? #() code-points)
|
(test (if (equal? #() code-points)
|
||||||
(if (equal? #"" prefix)
|
(if (equal? #"" prefix)
|
||||||
#f
|
#f
|
||||||
(integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
|
(integer->char (bytes-ref prefix (sub1 (bytes-length prefix)))))
|
||||||
(integer->char
|
(integer->char
|
||||||
(or (vector-ref code-points (sub1 (vector-length code-points)))
|
(or (vector-ref code-points (sub1 (vector-length code-points)))
|
||||||
(char->integer #\?))))
|
(char->integer #\uFFFD))))
|
||||||
bytes-utf-8-ref (bytes-append prefix s)
|
bytes-utf-8-ref (bytes-append prefix s)
|
||||||
(max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
|
(max 0 (+ (bytes-length prefix) (sub1 (vector-length code-points))))
|
||||||
#\?)
|
#\uFFFD)
|
||||||
|
|
||||||
(let-values ([(s2 n status)
|
(let-values ([(s2 n status)
|
||||||
(bytes-convert utf-8-iconv-p (bytes-append prefix s))]
|
(bytes-convert utf-8-iconv-p (bytes-append prefix s))]
|
||||||
|
@ -703,10 +705,18 @@
|
||||||
[(error surrogate1 surrogate2)
|
[(error surrogate1 surrogate2)
|
||||||
(test 'complete 'status status)
|
(test 'complete 'status status)
|
||||||
(test (+ (bytes-length s) pl) 'n n)
|
(test (+ (bytes-length s) pl) 'n n)
|
||||||
(test (+ (vector-length code-points) pl) bytes-length s2)
|
(test (+ (* replace-size (vector-length code-points))
|
||||||
|
(if (and (positive? (vector-length code-points))
|
||||||
|
(eq? (vector-ref code-points (sub1 (vector-length code-points)))
|
||||||
|
#o40))
|
||||||
|
;; space at end is converted, not replaced by #\xFFFD
|
||||||
|
(- 1 replace-size)
|
||||||
|
0)
|
||||||
|
pl)
|
||||||
|
bytes-length s2)
|
||||||
(test (append (bytes->list prefix)
|
(test (append (bytes->list prefix)
|
||||||
(map
|
(map
|
||||||
(lambda (i) (or i (char->integer #\?)))
|
(lambda (i) (or i (char->integer #\uFFFD)))
|
||||||
(vector->list code-points)))
|
(vector->list code-points)))
|
||||||
vector->list (bytes->unicode-vector s2))]
|
vector->list (bytes->unicode-vector s2))]
|
||||||
[(error/aborts)
|
[(error/aborts)
|
||||||
|
@ -718,10 +728,10 @@
|
||||||
(vector->list code-points))
|
(vector->list code-points))
|
||||||
;; indicates how many to be unused due to abort:
|
;; indicates how many to be unused due to abort:
|
||||||
(cadddr p))))])
|
(cadddr p))))])
|
||||||
(test (+ (vector-length code-points) pl) bytes-length s2)
|
(test (+ (* replace-size (vector-length code-points)) pl) bytes-length s2)
|
||||||
(test (append (bytes->list prefix)
|
(test (append (bytes->list prefix)
|
||||||
(map
|
(map
|
||||||
(lambda (i) (or i (char->integer #\?)))
|
(lambda (i) (or i (char->integer #\uFFFD)))
|
||||||
(vector->list code-points)))
|
(vector->list code-points)))
|
||||||
vector->list (bytes->unicode-vector s2)))]
|
vector->list (bytes->unicode-vector s2)))]
|
||||||
[else
|
[else
|
||||||
|
@ -786,9 +796,9 @@
|
||||||
(loop c p))))))))
|
(loop c p))))))))
|
||||||
;; Test read-string decoding
|
;; Test read-string decoding
|
||||||
(let ([us (apply string (map (lambda (i)
|
(let ([us (apply string (map (lambda (i)
|
||||||
(if i (integer->char i) #\?))
|
(if i (integer->char i) #\uFFFD))
|
||||||
(vector->list code-points)))])
|
(vector->list code-points)))])
|
||||||
(test us bytes->string/utf-8 s #\?)
|
(test us bytes->string/utf-8 s #\uFFFD)
|
||||||
(test us read-string (vector-length code-points) (open-input-bytes s))
|
(test us read-string (vector-length code-points) (open-input-bytes s))
|
||||||
(test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
|
(test us read-string (* 100 (vector-length code-points)) (open-input-bytes s))
|
||||||
(unless (string=? "" us)
|
(unless (string=? "" us)
|
||||||
|
@ -940,7 +950,7 @@
|
||||||
(test 'complete 'status status)
|
(test 'complete 'status status)
|
||||||
;; Should be the same as decoding corrected UTF-8:
|
;; Should be the same as decoding corrected UTF-8:
|
||||||
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
|
(let-values ([(s3 n status) (bytes-convert c (string->bytes/utf-8
|
||||||
(bytes->string/utf-8 s #\?)))])
|
(bytes->string/utf-8 s #\uFFFD)))])
|
||||||
(test s3 `(permissive ,s) s2)))))))))
|
(test s3 `(permissive ,s) s2)))))))))
|
||||||
basic-utf-8-tests))
|
basic-utf-8-tests))
|
||||||
|
|
||||||
|
@ -1224,7 +1234,7 @@
|
||||||
(loop (add1 i)))))
|
(loop (add1 i)))))
|
||||||
|
|
||||||
(define (check-all-unicode ? l)
|
(define (check-all-unicode ? l)
|
||||||
(define (unless-in-l ? c)
|
(define (unless-in-l ? code c)
|
||||||
(and (? c)
|
(and (? c)
|
||||||
(not (member c l))))
|
(not (member c l))))
|
||||||
(define (qtest expect f . args)
|
(define (qtest expect f . args)
|
||||||
|
@ -1235,9 +1245,9 @@
|
||||||
l)
|
l)
|
||||||
(for-each (lambda (r)
|
(for-each (lambda (r)
|
||||||
(if (caddr r)
|
(if (caddr r)
|
||||||
(qtest #f unless-in-l ? (integer->char (car r)))
|
(qtest #f unless-in-l ? (car r) (integer->char (car r)))
|
||||||
(let loop ([i (car r)])
|
(let loop ([i (car r)])
|
||||||
(qtest #f unless-in-l ? (integer->char i))
|
(qtest #f unless-in-l ? i (integer->char i))
|
||||||
(unless (= i (cadr r))
|
(unless (= i (cadr r))
|
||||||
(loop (add1 i))))))
|
(loop (add1 i))))))
|
||||||
(make-known-char-range-list)))
|
(make-known-char-range-list)))
|
||||||
|
@ -1560,7 +1570,11 @@
|
||||||
(> (+ (if (char-alphabetic? c) 1 0)
|
(> (+ (if (char-alphabetic? c) 1 0)
|
||||||
(if (char-numeric? c) 1 0)
|
(if (char-numeric? c) 1 0)
|
||||||
(if (char-punctuation? c) 1 0)
|
(if (char-punctuation? c) 1 0)
|
||||||
(if (char-symbolic? c) 1 0))
|
(if (char-symbolic? c)
|
||||||
|
(if (char<=? #\u24B6 c #\u24E9)
|
||||||
|
0 ;; Those are both alphabetic and symbolic
|
||||||
|
1)
|
||||||
|
0))
|
||||||
1))
|
1))
|
||||||
null)
|
null)
|
||||||
|
|
||||||
|
|
|
@ -135,6 +135,9 @@ but we start with an enumeration of changes:
|
||||||
- In a syntax pattern of the form `(a ... . b)', `b' is now allowed
|
- In a syntax pattern of the form `(a ... . b)', `b' is now allowed
|
||||||
to match an empty list.
|
to match an empty list.
|
||||||
|
|
||||||
|
- UTF-8 decoding for ports uses #\uFFFD instead of #\? as the
|
||||||
|
replacement character for bad encodings.
|
||||||
|
|
||||||
======================================================================
|
======================================================================
|
||||||
Porting Advice
|
Porting Advice
|
||||||
======================================================================
|
======================================================================
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#lang scheme/base
|
||||||
|
|
||||||
;; This script parses UnicodeData.txt (the standard Unicode database,
|
;; This script parses UnicodeData.txt (the standard Unicode database,
|
||||||
;; available from the web) and other such files, and it produces
|
;; available from the web) and other such files, and it produces
|
||||||
|
@ -6,13 +7,12 @@
|
||||||
;; is used for string operations.
|
;; is used for string operations.
|
||||||
|
|
||||||
;; Run as
|
;; Run as
|
||||||
;; mzscheme -r mk-uchar.ss
|
;; mzscheme mk-uchar.ss
|
||||||
;; in the script's directory, and have a copy of UnicodeData.txt, etc.
|
;; in the script's directory, and have a copy of UnicodeData.txt, etc.
|
||||||
;; in the "Unicode" directory. The file schuchar.inc will be
|
;; in the "Unicode" directory. The file schuchar.inc will be
|
||||||
;; overwritten.
|
;; overwritten.
|
||||||
|
|
||||||
(require mzlib/list)
|
(require scheme/list)
|
||||||
(require mzscheme)
|
|
||||||
|
|
||||||
(define mark-cats '("Mn" "Mc" "Me"))
|
(define mark-cats '("Mn" "Mc" "Me"))
|
||||||
(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
|
(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
|
||||||
|
@ -195,7 +195,7 @@
|
||||||
|
|
||||||
;; This code assumes that Final_Sigma is the only condition that we care about:
|
;; This code assumes that Final_Sigma is the only condition that we care about:
|
||||||
(define special-casings (make-hash-table 'equal))
|
(define special-casings (make-hash-table 'equal))
|
||||||
(define-struct special-casing (lower upper title folding final-sigma?))
|
(define-struct special-casing (lower upper title folding final-sigma?) #:mutable)
|
||||||
(call-with-input-file "Unicode/SpecialCasing.txt"
|
(call-with-input-file "Unicode/SpecialCasing.txt"
|
||||||
(lambda (i)
|
(lambda (i)
|
||||||
(let loop ()
|
(let loop ()
|
||||||
|
@ -215,26 +215,51 @@
|
||||||
|
|
||||||
(define lower-case (make-hash-table 'equal))
|
(define lower-case (make-hash-table 'equal))
|
||||||
(define upper-case (make-hash-table 'equal))
|
(define upper-case (make-hash-table 'equal))
|
||||||
|
(define alphabetic (make-hash-table 'equal))
|
||||||
|
|
||||||
(with-input-from-file "Unicode/DerivedCoreProperties.txt"
|
(with-input-from-file "Unicode/DerivedCoreProperties.txt"
|
||||||
(lambda ()
|
(lambda ()
|
||||||
(let loop ()
|
(let loop ()
|
||||||
(let ([l (read-line)])
|
(let ([l (read-line)])
|
||||||
(unless (eof-object? l)
|
(unless (eof-object? l)
|
||||||
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; (Lower|Upper)case" l)])
|
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)])
|
||||||
(when m
|
(when m
|
||||||
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
||||||
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
||||||
(if m
|
(if m
|
||||||
(string->number (cadr m) 16)
|
(string->number (cadr m) 16)
|
||||||
start))]
|
start))]
|
||||||
[t (if (string=? (caddr m) "Lower") lower-case upper-case)])
|
[t (cond
|
||||||
|
[(string=? (caddr m) "Lowercase") lower-case]
|
||||||
|
[(string=? (caddr m) "Uppercase") upper-case]
|
||||||
|
[(string=? (caddr m) "Alphabetic") alphabetic]
|
||||||
|
[else (error "unknown property section")])])
|
||||||
(let loop ([i start])
|
(let loop ([i start])
|
||||||
(hash-table-put! t i #t)
|
(hash-table-put! t i #t)
|
||||||
(unless (= i end)
|
(unless (= i end)
|
||||||
(loop (add1 i)))))))
|
(loop (add1 i)))))))
|
||||||
(loop))))))
|
(loop))))))
|
||||||
|
|
||||||
|
(define white_spaces (make-hash-table 'equal))
|
||||||
|
|
||||||
|
(with-input-from-file "Unicode/PropList.txt"
|
||||||
|
(lambda ()
|
||||||
|
(let loop ()
|
||||||
|
(let ([l (read-line)])
|
||||||
|
(unless (eof-object? l)
|
||||||
|
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)])
|
||||||
|
(when m
|
||||||
|
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
||||||
|
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
||||||
|
(if m
|
||||||
|
(string->number (cadr m) 16)
|
||||||
|
start))])
|
||||||
|
(let loop ([i start])
|
||||||
|
(hash-table-put! white_spaces i #t)
|
||||||
|
(unless (= i end)
|
||||||
|
(loop (add1 i)))))))
|
||||||
|
(loop))))))
|
||||||
|
|
||||||
(define decomp-ht (make-hash-table))
|
(define decomp-ht (make-hash-table))
|
||||||
(define k-decomp-ht (make-hash-table))
|
(define k-decomp-ht (make-hash-table))
|
||||||
(define compose-initial-ht (make-hash-table))
|
(define compose-initial-ht (make-hash-table))
|
||||||
|
@ -290,7 +315,7 @@
|
||||||
(let loop ([prev-code 0])
|
(let loop ([prev-code 0])
|
||||||
(let ([l (read-line i)])
|
(let ([l (read-line i)])
|
||||||
(unless (eof-object? l)
|
(unless (eof-object? l)
|
||||||
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
|
(let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
|
||||||
l)])
|
l)])
|
||||||
(unless m
|
(unless m
|
||||||
(printf "no match: ~a~n" l))
|
(printf "no match: ~a~n" l))
|
||||||
|
@ -299,74 +324,89 @@
|
||||||
[cat (cadddr m)]
|
[cat (cadddr m)]
|
||||||
[combining (string->number (cadddr (cdr m)))]
|
[combining (string->number (cadddr (cdr m)))]
|
||||||
[decomp (cadddr (cddr m))]
|
[decomp (cadddr (cddr m))]
|
||||||
[up (string->number (cadddr (cdddr m)) 16)]
|
[numeric (cadddr (cdddr m))]
|
||||||
[down (string->number (cadddr (cddddr m)) 16)]
|
[up (string->number (cadddr (cddddr m)) 16)]
|
||||||
[title (string->number (cadddr (cddddr (cdr m))) 16)])
|
[down (string->number (cadddr (cddddr (cdr m))) 16)]
|
||||||
(mapn code
|
[title (string->number (cadddr (cddddr (cddr m))) 16)])
|
||||||
(if (regexp-match #rx", Last>" name)
|
(let ([alphabetic? (hash-table-get alphabetic code #f)]
|
||||||
(add1 prev-code)
|
[numeric? (not (string=? numeric ""))]
|
||||||
code)
|
[symbolic? (member cat sym-cats)]
|
||||||
;; The booleans below are in most-siginficant-bit-first order
|
[punctuation? (member cat punc-cats)])
|
||||||
(combine
|
(mapn code
|
||||||
;; Decomposition
|
(if (regexp-match #rx", Last>" name)
|
||||||
(extract-decomp decomp code)
|
(add1 prev-code)
|
||||||
;; special-casing
|
code)
|
||||||
(or (hash-table-get special-casings code (lambda () #f))
|
;; The booleans below are in most-siginficant-bit-first order
|
||||||
(hash-table-get special-case-foldings code (lambda () #f)))
|
(combine
|
||||||
;; case-ignoreable
|
;; Decomposition
|
||||||
(or (member code midletters)
|
(extract-decomp decomp code)
|
||||||
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
|
;; special-casing
|
||||||
;; graphic
|
(or (hash-table-get special-casings code (lambda () #f))
|
||||||
(member cat graphic-cats)
|
(hash-table-get special-case-foldings code (lambda () #f)))
|
||||||
;; lowercase:
|
;; case-ignoreable
|
||||||
(hash-table-get lower-case code (lambda () #f))
|
(or (member code midletters)
|
||||||
#;
|
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
|
||||||
(and (not (<= #x2000 code #x2FFF))
|
;; graphic
|
||||||
(not down)
|
(or alphabetic?
|
||||||
(or up
|
numeric?
|
||||||
(regexp-match #rx"SMALL LETTER" name)
|
symbolic?
|
||||||
(regexp-match #rx"SMALL LIGATURE" name)))
|
punctuation?
|
||||||
;; uppercase;
|
(member cat mark-cats))
|
||||||
(hash-table-get upper-case code (lambda () #f))
|
;; lowercase:
|
||||||
#;
|
(hash-table-get lower-case code (lambda () #f))
|
||||||
(and (not (<= #x2000 code #x2FFF))
|
#;
|
||||||
(not up)
|
(and (not (<= #x2000 code #x2FFF))
|
||||||
(or down
|
(not down)
|
||||||
(regexp-match #rx"CAPITAL LETTER" name)
|
(or up
|
||||||
(regexp-match #rx"CAPITAL LIGATURE" name)))
|
(regexp-match #rx"SMALL LETTER" name)
|
||||||
;; titlecase:
|
(regexp-match #rx"SMALL LIGATURE" name)))
|
||||||
(string=? cat "Lt")
|
;; uppercase;
|
||||||
;; letter
|
(hash-table-get upper-case code (lambda () #f))
|
||||||
(member cat letter-cats)
|
#;
|
||||||
;; digit
|
(and (not (<= #x2000 code #x2FFF))
|
||||||
(member cat digit-cats)
|
(not up)
|
||||||
;; SOMETHING - this bit not yet used
|
(or down
|
||||||
#f
|
(regexp-match #rx"CAPITAL LETTER" name)
|
||||||
;; whitespace
|
(regexp-match #rx"CAPITAL LIGATURE" name)))
|
||||||
(or (member cat space-cats)
|
;; titlecase:
|
||||||
(member code '(#x9 #xa #xb #xc #xd #x85)))
|
(string=? cat "Lt")
|
||||||
;; control
|
;; letter
|
||||||
(or (<= #x0000 code #x001F)
|
alphabetic?
|
||||||
(<= #x007F code #x009F))
|
#;
|
||||||
;; punctuation
|
(member cat letter-cats)
|
||||||
(member cat punc-cats)
|
;; digit
|
||||||
;; symbol
|
numeric?
|
||||||
(member cat sym-cats)
|
#;
|
||||||
;; blank
|
(member cat digit-cats)
|
||||||
(or (string=? cat "Zs")
|
;; SOMETHING - this bit not yet used
|
||||||
(= code #x9)))
|
#f
|
||||||
;; Cases
|
;; whitespace
|
||||||
(combine-case
|
(hash-table-get white_spaces code #f)
|
||||||
(if up (- up code) 0)
|
#;
|
||||||
(if down (- down code) 0)
|
(or (member cat space-cats)
|
||||||
(if title (- title code) 0)
|
(member code '(#x9 #xa #xb #xc #xd #x85)))
|
||||||
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
|
;; control
|
||||||
(if case-fold (- case-fold code) 0))
|
(or (<= #x0000 code #x001F)
|
||||||
combining)
|
(<= #x007F code #x009F))
|
||||||
;; Category
|
;; punctuation
|
||||||
(combine-cat cat)
|
punctuation?
|
||||||
;; Combining class - used again to filter initial composes
|
;; symbol
|
||||||
combining)
|
symbolic?
|
||||||
|
;; blank
|
||||||
|
(or (string=? cat "Zs")
|
||||||
|
(= code #x9)))
|
||||||
|
;; Cases
|
||||||
|
(combine-case
|
||||||
|
(if up (- up code) 0)
|
||||||
|
(if down (- down code) 0)
|
||||||
|
(if title (- title code) 0)
|
||||||
|
(let ([case-fold (hash-table-get case-foldings code (lambda () #f))])
|
||||||
|
(if case-fold (- case-fold code) 0))
|
||||||
|
combining)
|
||||||
|
;; Category
|
||||||
|
(combine-cat cat)
|
||||||
|
;; Combining class - used again to filter initial composes
|
||||||
|
combining))
|
||||||
(loop code))))))))
|
(loop code))))))))
|
||||||
|
|
||||||
(hash-table-for-each compose-initial-ht
|
(hash-table-for-each compose-initial-ht
|
||||||
|
@ -438,7 +478,7 @@
|
||||||
(define pos2 0)
|
(define pos2 0)
|
||||||
(define pos3 0)
|
(define pos3 0)
|
||||||
|
|
||||||
(current-output-port (open-output-file "schuchar.inc" 'truncate/replace))
|
(current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace))
|
||||||
|
|
||||||
(define (hash-vectors! top vectors get-pos set-pos!)
|
(define (hash-vectors! top vectors get-pos set-pos!)
|
||||||
(let loop ([i 0])
|
(let loop ([i 0])
|
||||||
|
@ -628,7 +668,7 @@
|
||||||
|
|
||||||
;; ----------------------------------------
|
;; ----------------------------------------
|
||||||
|
|
||||||
(current-output-port (open-output-file "schustr.inc" 'truncate/replace))
|
(current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace))
|
||||||
|
|
||||||
(printf "/* Generated by mk-uchar.ss */~n~n")
|
(printf "/* Generated by mk-uchar.ss */~n~n")
|
||||||
|
|
||||||
|
|
|
@ -1288,7 +1288,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
|
||||||
int state = ip->utf8state;
|
int state = ip->utf8state;
|
||||||
int n;
|
int n;
|
||||||
degot += state_len(state);
|
degot += state_len(state);
|
||||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, '?');
|
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset, offset + i + 1, &state, 0, 0xFFFD);
|
||||||
degot += (i + 1 - n);
|
degot += (i + 1 - n);
|
||||||
ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */
|
ip->utf8state = 0; /* assert: state == 0, because we ended with a newline */
|
||||||
}
|
}
|
||||||
|
@ -1331,7 +1331,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
|
||||||
col -= n;
|
col -= n;
|
||||||
for (i = prev_i; i < got; i++) {
|
for (i = prev_i; i < got; i++) {
|
||||||
if (buffer[offset + i] == '\t') {
|
if (buffer[offset + i] == '\t') {
|
||||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, '?');
|
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 0, 0xFFFD);
|
||||||
degot += ((i - prev_i) - n);
|
degot += ((i - prev_i) - n);
|
||||||
col += n;
|
col += n;
|
||||||
col = col - (col & 0x7) + 8;
|
col = col - (col & 0x7) + 8;
|
||||||
|
@ -1339,7 +1339,7 @@ XFORM_NONGCING static void do_count_lines(Scheme_Port *ip, const char *buffer, l
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (prev_i < i) {
|
if (prev_i < i) {
|
||||||
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, '?');
|
n = scheme_utf8_decode_count((const unsigned char *)buffer, offset + prev_i, offset + i, &state, 1, 0xFFFD);
|
||||||
n += state_len(state);
|
n += state_len(state);
|
||||||
col += n;
|
col += n;
|
||||||
degot += ((i - prev_i) - n);
|
degot += ((i - prev_i) - n);
|
||||||
|
@ -2178,7 +2178,7 @@ long scheme_get_char_string(const char *who,
|
||||||
long ulen, glen;
|
long ulen, glen;
|
||||||
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
|
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
|
||||||
buffer, offset, offset + size,
|
buffer, offset, offset + size,
|
||||||
&ulen, 0, '?');
|
&ulen, 0, 0xFFFD);
|
||||||
if (glen && (ulen < got + leftover)) {
|
if (glen && (ulen < got + leftover)) {
|
||||||
/* Got one, with a decoding error. If we weren't peeking,
|
/* Got one, with a decoding error. If we weren't peeking,
|
||||||
don't read the lookahead bytes after all, yet. */
|
don't read the lookahead bytes after all, yet. */
|
||||||
|
@ -2221,7 +2221,7 @@ long scheme_get_char_string(const char *who,
|
||||||
sequence in an error. We may have more leftover chars
|
sequence in an error. We may have more leftover chars
|
||||||
than we need, but they haven't been read, yet. */
|
than we need, but they haven't been read, yet. */
|
||||||
while (leftover && size) {
|
while (leftover && size) {
|
||||||
buffer[offset++] = '?';
|
buffer[offset++] = 0xFFFD;
|
||||||
total_got++;
|
total_got++;
|
||||||
--leftover;
|
--leftover;
|
||||||
--size;
|
--size;
|
||||||
|
@ -2248,7 +2248,7 @@ long scheme_get_char_string(const char *who,
|
||||||
|
|
||||||
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
|
glen = scheme_utf8_decode_as_prefix((const unsigned char *)s, 0, got + leftover,
|
||||||
buffer, offset, offset + size,
|
buffer, offset, offset + size,
|
||||||
&ulen, 0, '?');
|
&ulen, 0, 0xFFFD);
|
||||||
|
|
||||||
total_got += glen;
|
total_got += glen;
|
||||||
if (glen == size) {
|
if (glen == size) {
|
||||||
|
@ -2268,7 +2268,7 @@ long scheme_get_char_string(const char *who,
|
||||||
|
|
||||||
/* Leftover bytes must be decoding-error bytes: */
|
/* Leftover bytes must be decoding-error bytes: */
|
||||||
while (leftover) {
|
while (leftover) {
|
||||||
buffer[offset++] = '?';
|
buffer[offset++] = 0xFFFD;
|
||||||
total_got++;
|
total_got++;
|
||||||
--leftover;
|
--leftover;
|
||||||
}
|
}
|
||||||
|
@ -2401,7 +2401,7 @@ scheme_getc(Scheme_Object *port)
|
||||||
else {
|
else {
|
||||||
/* This counts as a decoding error. The high bit
|
/* This counts as a decoding error. The high bit
|
||||||
on the first character must be set. */
|
on the first character must be set. */
|
||||||
return '?';
|
return 0xFFFD;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
|
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
|
||||||
|
@ -2417,7 +2417,7 @@ scheme_getc(Scheme_Object *port)
|
||||||
return r[0];
|
return r[0];
|
||||||
} else if (v == -2) {
|
} else if (v == -2) {
|
||||||
/* -2 => decoding error */
|
/* -2 => decoding error */
|
||||||
return '?';
|
return 0xFFFD;
|
||||||
} else if (v == -1) {
|
} else if (v == -1) {
|
||||||
/* In middle of sequence; start/continue peeking bytes */
|
/* In middle of sequence; start/continue peeking bytes */
|
||||||
delta++;
|
delta++;
|
||||||
|
@ -2543,8 +2543,8 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
|
||||||
if (!delta)
|
if (!delta)
|
||||||
return v;
|
return v;
|
||||||
else {
|
else {
|
||||||
/* This counts as a decoding error, so return '?' */
|
/* This counts as a decoding error, so return 0xFFFD */
|
||||||
return '?';
|
return 0xFFFD;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
|
v = scheme_utf8_decode_prefix((const unsigned char *)s, delta + 1, r, 0);
|
||||||
|
@ -2552,7 +2552,7 @@ static int do_peekc_skip(Scheme_Object *port, Scheme_Object *skip,
|
||||||
return r[0];
|
return r[0];
|
||||||
else if (v == -2) {
|
else if (v == -2) {
|
||||||
/* -2 => decoding error */
|
/* -2 => decoding error */
|
||||||
return '?';
|
return 0xFFFD;
|
||||||
} else if (v == -1) {
|
} else if (v == -1) {
|
||||||
/* In middle of sequence - keep getting bytes. */
|
/* In middle of sequence - keep getting bytes. */
|
||||||
delta++;
|
delta++;
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -852,11 +852,11 @@ Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, long d, long le
|
||||||
if (len) {
|
if (len) {
|
||||||
ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
|
ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
|
||||||
NULL, 0, -1,
|
NULL, 0, -1,
|
||||||
NULL, 0 /* not UTF-16 */, '?');
|
NULL, 0 /* not UTF-16 */, 0xFFFD);
|
||||||
us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
|
us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
|
||||||
scheme_utf8_decode((unsigned char *)chars, d, d + len,
|
scheme_utf8_decode((unsigned char *)chars, d, d + len,
|
||||||
us, 0, -1,
|
us, 0, -1,
|
||||||
NULL, 0 /* not UTF-16 */, '?');
|
NULL, 0 /* not UTF-16 */, 0xFFFD);
|
||||||
|
|
||||||
us[ulen] = 0;
|
us[ulen] = 0;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1291,12 +1291,12 @@ byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
|
||||||
|
|
||||||
Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
|
Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
|
||||||
{
|
{
|
||||||
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?', 0);
|
return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
|
Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
|
||||||
{
|
{
|
||||||
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), '?');
|
return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
|
||||||
}
|
}
|
||||||
|
|
||||||
/************************* string->bytes *************************/
|
/************************* string->bytes *************************/
|
||||||
|
@ -4166,7 +4166,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
|
||||||
/* Use the built-in UTF-8<->UTF-8 converter: */
|
/* Use the built-in UTF-8<->UTF-8 converter: */
|
||||||
kind = mzUTF8_KIND;
|
kind = mzUTF8_KIND;
|
||||||
if (!strcmp(from_e, "UTF-8-permissive"))
|
if (!strcmp(from_e, "UTF-8-permissive"))
|
||||||
permissive = '?';
|
permissive = 0xFFFD;
|
||||||
else
|
else
|
||||||
permissive = 0;
|
permissive = 0;
|
||||||
cd = (iconv_t)-1;
|
cd = (iconv_t)-1;
|
||||||
|
@ -4176,7 +4176,7 @@ Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
|
||||||
&& !strcmp(to_e, "platform-UTF-16")) {
|
&& !strcmp(to_e, "platform-UTF-16")) {
|
||||||
kind = mzUTF8_TO_UTF16_KIND;
|
kind = mzUTF8_TO_UTF16_KIND;
|
||||||
if (!strcmp(from_e, "platform-UTF-8-permissive"))
|
if (!strcmp(from_e, "platform-UTF-8-permissive"))
|
||||||
permissive = '?';
|
permissive = 0xFFFD;
|
||||||
else
|
else
|
||||||
permissive = 0;
|
permissive = 0;
|
||||||
cd = (iconv_t)-1;
|
cd = (iconv_t)-1;
|
||||||
|
@ -4546,7 +4546,8 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
|
||||||
might_continue => allows -1 result without consuming characters
|
might_continue => allows -1 result without consuming characters
|
||||||
|
|
||||||
permissive is non-zero => use permissive as value for bad byte
|
permissive is non-zero => use permissive as value for bad byte
|
||||||
sequences*/
|
sequences. When generating UTF-8, this must be an ASCII character
|
||||||
|
or U+FFFD. */
|
||||||
|
|
||||||
{
|
{
|
||||||
int i, j, oki, failmode = -3, state;
|
int i, j, oki, failmode = -3, state;
|
||||||
|
@ -4796,9 +4797,19 @@ static int utf8_decode_x(const unsigned char *s, int start, int end,
|
||||||
j += delta;
|
j += delta;
|
||||||
} else
|
} else
|
||||||
break;
|
break;
|
||||||
} else if (us) {
|
} else if (v == 0xFFFD) {
|
||||||
((unsigned char *)us)[j] = v;
|
if (j + 3 < dend) {
|
||||||
}
|
if (us) {
|
||||||
|
((unsigned char *)us)[j] = 0xEF;
|
||||||
|
((unsigned char *)us)[j+1] = 0xBF;
|
||||||
|
((unsigned char *)us)[j+2] = 0xBD;
|
||||||
|
}
|
||||||
|
j += 2;
|
||||||
|
} else
|
||||||
|
break;
|
||||||
|
} else if (us) {
|
||||||
|
((unsigned char *)us)[j] = v;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else if (us) {
|
} else if (us) {
|
||||||
us[j] = v;
|
us[j] = v;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user