fix string-titlecase based on case-ignorable chars

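Fix extraction of case-ignorable characters from the Unicode
database: read the Case_Ignorable property directly from
DerivedCoreProperties.txt instead of approximating it from the
MidLetter entries of WordBreakProperty.txt combined with the
Mn/Me/Cf/Lm/Sk general categories.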
This commit is contained in:
Matthew Flatt 2015-04-10 13:03:22 -06:00
parent e807e848f9
commit 8e22b22630
2 changed files with 9 additions and 18 deletions

View File

@ -158,16 +158,6 @@
[i (bitwise-and c low) ]) [i (bitwise-and c low) ])
(vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i)))))) (vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i))))))
;; midletters: list of code points (as exact integers) that are tagged
;; `MidLetter` in "Unicode/WordBreakProperty.txt", in the order they
;; appear in that file.
;;
;; Only entries written as a single hex code point are collected: the
;; pattern requires optional spaces and then "; MidLetter" immediately
;; after the hex digits, so entries written as a range (XXXX..YYYY) do
;; not match. The pattern also begins with "\n", so an entry on the
;; very first line of the file would not be matched.
(define midletters
(call-with-input-file "Unicode/WordBreakProperty.txt"
(lambda (i)
;; Each `regexp-match` call searches the port `i` forward from where
;; the previous match ended; the loop stops at the first call that
;; finds no further match.
(let loop ()
(let ([re (regexp-match #rx"\n([0-9A-F]+) *; *MidLetter" i)])
(if re
;; (cadr re) is the captured hex-digit group; it is a byte string
;; here, so it is decoded before being parsed as base 16.
(cons (string->number (bytes->string/latin-1 (cadr re)) 16)
(loop))
null))))))
(define (string->codes s) (define (string->codes s)
(let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)]) (let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)])
(if m (if m
@ -216,13 +206,14 @@
(define lower-case (make-hash)) (define lower-case (make-hash))
(define upper-case (make-hash)) (define upper-case (make-hash))
(define alphabetic (make-hash)) (define alphabetic (make-hash))
(define case-ignorable (make-hash))
(with-input-from-file "Unicode/DerivedCoreProperties.txt" (with-input-from-file "Unicode/DerivedCoreProperties.txt"
(lambda () (lambda ()
(let loop () (let loop ()
(let ([l (read-line)]) (let ([l (read-line)])
(unless (eof-object? l) (unless (eof-object? l)
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)]) (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic|Case_Ignorable)" l)])
(when m (when m
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)] (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))]) [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
@ -233,6 +224,7 @@
[(string=? (caddr m) "Lowercase") lower-case] [(string=? (caddr m) "Lowercase") lower-case]
[(string=? (caddr m) "Uppercase") upper-case] [(string=? (caddr m) "Uppercase") upper-case]
[(string=? (caddr m) "Alphabetic") alphabetic] [(string=? (caddr m) "Alphabetic") alphabetic]
[(string=? (caddr m) "Case_Ignorable") case-ignorable]
[else (error "unknown property section")])]) [else (error "unknown property section")])])
(let loop ([i start]) (let loop ([i start])
(hash-set! t i #t) (hash-set! t i #t)
@ -359,8 +351,7 @@
(or (hash-ref special-casings code (lambda () #f)) (or (hash-ref special-casings code (lambda () #f))
(hash-ref special-case-foldings code (lambda () #f))) (hash-ref special-case-foldings code (lambda () #f)))
;; case-ignorable ;; case-ignorable
(or (member code midletters) (hash-ref case-ignorable code #f)
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
;; graphic ;; graphic
(or alphabetic? (or alphabetic?
numeric? numeric?

View File

@ -38,7 +38,7 @@ READ_ONLY static unsigned short udata[] = {
/* 1 */ /* 1 */
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x19, 0x18, 0x18, 0x18, 0x18, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x19, 0x18, 0x18, 0x18, 0x18, 0x8, 0x8,
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,
0x11, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x11, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x1804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x1804, 0x804,
0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x1804, 0x804, 0x8802, 0x8802, 0x8802, 0x804, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x1804, 0x804, 0x8802, 0x8802, 0x8802, 0x804,
0x804, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x804, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80,
0x8a80, 0xa80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x804, 0x804, 0x804, 0x1802, 0x804, 0x8a80, 0xa80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x804, 0x804, 0x804, 0x1802, 0x804,
@ -564,8 +564,8 @@ READ_ONLY static unsigned short udata[] = {
0x0, 0x0, 0x6c80, 0x6c80, 0x6c80, 0x0, 0xec80, 0x6c80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x6980, 0x5802, 0xd802, 0x0, 0x0, 0x0, 0x6c80, 0x6c80, 0x6c80, 0x0, 0xec80, 0x6c80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x6980, 0x5802, 0xd802, 0x0,
/* 32 */ /* 32 */
0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x1804, 0x1804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804,
0x804, 0x804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x1804, 0x10, 0x10, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x4011, 0x804, 0x804, 0x804, 0x804, 0x5804, 0x4804, 0x4804, 0x1804, 0x10, 0x10, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x4011,
0x804, 0x804, 0x804, 0x4804, 0x4804, 0x804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x4804, 0x804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x4804, 0x804,
0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804,
0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4011, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4011,
@ -1146,7 +1146,7 @@ READ_ONLY static unsigned short udata[] = {
0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x0, 0x0, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x0, 0x0,
0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
0x4804, 0x4804, 0x4804, 0x0, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x5804, 0x0, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
0x4804, 0x4804, 0x4802, 0x4804, 0x4802, 0x4802, 0x4802, 0x0, 0x4804, 0x4802, 0x4804, 0x4804, 0x0, 0x0, 0x0, 0x0, 0x4804, 0x4804, 0x4802, 0x4804, 0x4802, 0x4802, 0x4802, 0x0, 0x4804, 0x4802, 0x4804, 0x4804, 0x0, 0x0, 0x0, 0x0,
0x4880, 0x4880, 0x4880, 0x880, 0x4880, 0x0, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x880, 0x4880, 0x0, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
@ -1158,7 +1158,7 @@ READ_ONLY static unsigned short udata[] = {
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x0, 0x0, 0x1000, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x0, 0x0, 0x1000,
/* 67 */ /* 67 */
0x0, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x4804, 0x4804, 0x0, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x5804, 0x4804,
0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x5804, 0x4804, 0x4802, 0x4802, 0x4802, 0x4804, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x5804, 0x4804, 0x4802, 0x4802, 0x4802, 0x4804,
0x4804, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4804, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80,
0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4804, 0x4804, 0x4804, 0x5802, 0x4804, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4804, 0x4804, 0x4804, 0x5802, 0x4804,