fix string-titlecase
based on case-ignorable chars
Fix extraction of case-ignorable characters from the Unicode database.
This commit is contained in:
parent
e807e848f9
commit
8e22b22630
|
@ -158,16 +158,6 @@
|
||||||
[i (bitwise-and c low) ])
|
[i (bitwise-and c low) ])
|
||||||
(vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i))))))
|
(vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i))))))
|
||||||
|
|
||||||
(define midletters
|
|
||||||
(call-with-input-file "Unicode/WordBreakProperty.txt"
|
|
||||||
(lambda (i)
|
|
||||||
(let loop ()
|
|
||||||
(let ([re (regexp-match #rx"\n([0-9A-F]+) *; *MidLetter" i)])
|
|
||||||
(if re
|
|
||||||
(cons (string->number (bytes->string/latin-1 (cadr re)) 16)
|
|
||||||
(loop))
|
|
||||||
null))))))
|
|
||||||
|
|
||||||
(define (string->codes s)
|
(define (string->codes s)
|
||||||
(let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)])
|
(let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)])
|
||||||
(if m
|
(if m
|
||||||
|
@ -216,13 +206,14 @@
|
||||||
(define lower-case (make-hash))
|
(define lower-case (make-hash))
|
||||||
(define upper-case (make-hash))
|
(define upper-case (make-hash))
|
||||||
(define alphabetic (make-hash))
|
(define alphabetic (make-hash))
|
||||||
|
(define case-ignorable (make-hash))
|
||||||
|
|
||||||
(with-input-from-file "Unicode/DerivedCoreProperties.txt"
|
(with-input-from-file "Unicode/DerivedCoreProperties.txt"
|
||||||
(lambda ()
|
(lambda ()
|
||||||
(let loop ()
|
(let loop ()
|
||||||
(let ([l (read-line)])
|
(let ([l (read-line)])
|
||||||
(unless (eof-object? l)
|
(unless (eof-object? l)
|
||||||
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)])
|
(let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic|Case_Ignorable)" l)])
|
||||||
(when m
|
(when m
|
||||||
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
(let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
|
||||||
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
[end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
|
||||||
|
@ -233,6 +224,7 @@
|
||||||
[(string=? (caddr m) "Lowercase") lower-case]
|
[(string=? (caddr m) "Lowercase") lower-case]
|
||||||
[(string=? (caddr m) "Uppercase") upper-case]
|
[(string=? (caddr m) "Uppercase") upper-case]
|
||||||
[(string=? (caddr m) "Alphabetic") alphabetic]
|
[(string=? (caddr m) "Alphabetic") alphabetic]
|
||||||
|
[(string=? (caddr m) "Case_Ignorable") case-ignorable]
|
||||||
[else (error "unknown property section")])])
|
[else (error "unknown property section")])])
|
||||||
(let loop ([i start])
|
(let loop ([i start])
|
||||||
(hash-set! t i #t)
|
(hash-set! t i #t)
|
||||||
|
@ -359,8 +351,7 @@
|
||||||
(or (hash-ref special-casings code (lambda () #f))
|
(or (hash-ref special-casings code (lambda () #f))
|
||||||
(hash-ref special-case-foldings code (lambda () #f)))
|
(hash-ref special-case-foldings code (lambda () #f)))
|
||||||
;; case-ignoreable
|
;; case-ignoreable
|
||||||
(or (member code midletters)
|
(hash-ref case-ignorable code #f)
|
||||||
(member cat '("Mn" "Me" "Cf" "Lm" "Sk")))
|
|
||||||
;; graphic
|
;; graphic
|
||||||
(or alphabetic?
|
(or alphabetic?
|
||||||
numeric?
|
numeric?
|
||||||
|
|
|
@ -38,7 +38,7 @@ READ_ONLY static unsigned short udata[] = {
|
||||||
/* 1 */
|
/* 1 */
|
||||||
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x19, 0x18, 0x18, 0x18, 0x18, 0x8, 0x8,
|
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x19, 0x18, 0x18, 0x18, 0x18, 0x8, 0x8,
|
||||||
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,
|
0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,
|
||||||
0x11, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804,
|
0x11, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x1804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x1804, 0x804,
|
||||||
0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x1804, 0x804, 0x8802, 0x8802, 0x8802, 0x804,
|
0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x1804, 0x804, 0x8802, 0x8802, 0x8802, 0x804,
|
||||||
0x804, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80,
|
0x804, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80,
|
||||||
0x8a80, 0xa80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x804, 0x804, 0x804, 0x1802, 0x804,
|
0x8a80, 0xa80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x804, 0x804, 0x804, 0x1802, 0x804,
|
||||||
|
@ -564,8 +564,8 @@ READ_ONLY static unsigned short udata[] = {
|
||||||
0x0, 0x0, 0x6c80, 0x6c80, 0x6c80, 0x0, 0xec80, 0x6c80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x6980, 0x5802, 0xd802, 0x0,
|
0x0, 0x0, 0x6c80, 0x6c80, 0x6c80, 0x0, 0xec80, 0x6c80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x6980, 0x5802, 0xd802, 0x0,
|
||||||
/* 32 */
|
/* 32 */
|
||||||
0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
|
0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
|
||||||
0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804,
|
0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x1804, 0x1804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804,
|
||||||
0x804, 0x804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x1804, 0x10, 0x10, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x4011,
|
0x804, 0x804, 0x804, 0x804, 0x5804, 0x4804, 0x4804, 0x1804, 0x10, 0x10, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x4011,
|
||||||
0x804, 0x804, 0x804, 0x4804, 0x4804, 0x804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x4804, 0x804,
|
0x804, 0x804, 0x804, 0x4804, 0x4804, 0x804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x4804, 0x804,
|
||||||
0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804,
|
0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804,
|
||||||
0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4011,
|
0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4011,
|
||||||
|
@ -1146,7 +1146,7 @@ READ_ONLY static unsigned short udata[] = {
|
||||||
0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x0, 0x0,
|
0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x0, 0x0,
|
||||||
0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
|
0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
|
||||||
0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
|
0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
|
||||||
0x4804, 0x4804, 0x4804, 0x0, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
|
0x4804, 0x4804, 0x5804, 0x0, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804,
|
||||||
0x4804, 0x4804, 0x4802, 0x4804, 0x4802, 0x4802, 0x4802, 0x0, 0x4804, 0x4802, 0x4804, 0x4804, 0x0, 0x0, 0x0, 0x0,
|
0x4804, 0x4804, 0x4802, 0x4804, 0x4802, 0x4802, 0x4802, 0x0, 0x4804, 0x4802, 0x4804, 0x4804, 0x0, 0x0, 0x0, 0x0,
|
||||||
0x4880, 0x4880, 0x4880, 0x880, 0x4880, 0x0, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
|
0x4880, 0x4880, 0x4880, 0x880, 0x4880, 0x0, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
|
||||||
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
|
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
|
||||||
|
@ -1158,7 +1158,7 @@ READ_ONLY static unsigned short udata[] = {
|
||||||
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
|
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880,
|
||||||
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x0, 0x0, 0x1000,
|
0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x0, 0x0, 0x1000,
|
||||||
/* 67 */
|
/* 67 */
|
||||||
0x0, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x4804, 0x4804,
|
0x0, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x5804, 0x4804,
|
||||||
0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x5804, 0x4804, 0x4802, 0x4802, 0x4802, 0x4804,
|
0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x5804, 0x4804, 0x4802, 0x4802, 0x4802, 0x4804,
|
||||||
0x4804, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80,
|
0x4804, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80,
|
||||||
0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4804, 0x4804, 0x4804, 0x5802, 0x4804,
|
0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4804, 0x4804, 0x4804, 0x5802, 0x4804,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user