From 8e22b226307c0065d411a8bf16ae57d8fccafc66 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Fri, 10 Apr 2015 13:03:22 -0600 Subject: [PATCH] fix `string-titlecase` based on case-ignorable chars Fix extraction of case-ignorable characters from the Unicode database. --- racket/src/racket/src/mk-uchar.rkt | 17 ++++------------- racket/src/racket/src/schuchar.inc | 10 +++++----- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/racket/src/racket/src/mk-uchar.rkt b/racket/src/racket/src/mk-uchar.rkt index a3aa5cd126..5de34358e6 100644 --- a/racket/src/racket/src/mk-uchar.rkt +++ b/racket/src/racket/src/mk-uchar.rkt @@ -158,16 +158,6 @@ [i (bitwise-and c low) ]) (vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i)))))) -(define midletters - (call-with-input-file "Unicode/WordBreakProperty.txt" - (lambda (i) - (let loop () - (let ([re (regexp-match #rx"\n([0-9A-F]+) *; *MidLetter" i)]) - (if re - (cons (string->number (bytes->string/latin-1 (cadr re)) 16) - (loop)) - null)))))) - (define (string->codes s) (let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)]) (if m @@ -216,13 +206,14 @@ (define lower-case (make-hash)) (define upper-case (make-hash)) (define alphabetic (make-hash)) +(define case-ignorable (make-hash)) (with-input-from-file "Unicode/DerivedCoreProperties.txt" (lambda () (let loop () (let ([l (read-line)]) (unless (eof-object? l) - (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic)" l)]) + (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic|Case_Ignorable)" l)]) (when m (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)] [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))]) @@ -233,6 +224,7 @@ [(string=? (caddr m) "Lowercase") lower-case] [(string=? (caddr m) "Uppercase") upper-case] [(string=? (caddr m) "Alphabetic") alphabetic] + [(string=? 
(caddr m) "Case_Ignorable") case-ignorable] [else (error "unknown property section")])]) (let loop ([i start]) (hash-set! t i #t) @@ -359,8 +351,7 @@ (or (hash-ref special-casings code (lambda () #f)) (hash-ref special-case-foldings code (lambda () #f))) ;; case-ignoreable - (or (member code midletters) - (member cat '("Mn" "Me" "Cf" "Lm" "Sk"))) + (hash-ref case-ignorable code #f) ;; graphic (or alphabetic? numeric? diff --git a/racket/src/racket/src/schuchar.inc b/racket/src/racket/src/schuchar.inc index d5672400a5..0b0bde9d81 100644 --- a/racket/src/racket/src/schuchar.inc +++ b/racket/src/racket/src/schuchar.inc @@ -38,7 +38,7 @@ READ_ONLY static unsigned short udata[] = { /* 1 */ 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x19, 0x18, 0x18, 0x18, 0x18, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, - 0x11, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, + 0x11, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x1804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x1804, 0x804, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x840, 0x1804, 0x804, 0x8802, 0x8802, 0x8802, 0x804, 0x804, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0xa80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x8a80, 0x804, 0x804, 0x804, 0x1802, 0x804, @@ -564,8 +564,8 @@ READ_ONLY static unsigned short udata[] = { 0x0, 0x0, 0x6c80, 0x6c80, 0x6c80, 0x0, 0xec80, 0x6c80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x6980, 0x5802, 0xd802, 0x0, /* 32 */ 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x4011, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, - 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, - 0x804, 0x804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x1804, 0x10, 0x10, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 
0x4011, + 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x1804, 0x1804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, + 0x804, 0x804, 0x804, 0x804, 0x5804, 0x4804, 0x4804, 0x1804, 0x10, 0x10, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x4011, 0x804, 0x804, 0x804, 0x4804, 0x4804, 0x804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x802, 0x804, 0x804, 0x804, 0x804, 0x4804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x804, 0x4011, @@ -1146,7 +1146,7 @@ READ_ONLY static unsigned short udata[] = { 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x1800, 0x0, 0x0, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x804, 0x804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, - 0x4804, 0x4804, 0x4804, 0x0, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, + 0x4804, 0x4804, 0x5804, 0x0, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4802, 0x4802, 0x4802, 0x0, 0x4804, 0x4802, 0x4804, 0x4804, 0x0, 0x0, 0x0, 0x0, 0x4880, 0x4880, 0x4880, 0x880, 0x4880, 0x0, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, @@ -1158,7 +1158,7 @@ READ_ONLY static unsigned short udata[] = { 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x4880, 0x0, 0x0, 0x1000, /* 67 */ - 0x0, 0x4804, 
0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x4804, 0x4804, + 0x0, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x5804, 0x4804, 0x4804, 0x4804, 0x4802, 0x4804, 0x4804, 0x5804, 0x4804, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x4840, 0x5804, 0x4804, 0x4802, 0x4802, 0x4802, 0x4804, 0x4804, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4a80, 0x4804, 0x4804, 0x4804, 0x5802, 0x4804,