Unicode 7.0

Closes PR 14971
This commit is contained in:
Matthew Flatt 2015-02-09 11:31:16 -07:00
parent fe68c9ab81
commit 9c7d0b8794
7 changed files with 5022 additions and 2845 deletions

View File

@ -21,6 +21,8 @@ reader are @tech{interned} in @racket[read-syntax] mode.
@see-read-print["character"]{characters}
@history[#:changed "6.1.1.8" @elem{Updated from Unicode 5.0.1 to Unicode 7.0.0.}]
@; ----------------------------------------
@section{Characters and Scalar Values}

View File

@ -16,7 +16,7 @@
(define (get-test-file)
(define name "NormalizationTest.txt")
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
(define base "http://www.unicode.org/Public/7.0.0/ucd/")
(define here (current-load-relative-directory))
(or (for/or ([dir (list here (current-directory))])
(define path (build-path dir name))

View File

@ -1469,7 +1469,8 @@
#\u3000
;; Post SRFI-14?
#\u205F
#\u180E))
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
))
;; Punctuation in Latin-1:
(check-all-latin-1
@ -1498,8 +1499,10 @@
#\{
#\}
#\u00A1
#\u00A7 ;; Made punctuation in Unicode 7.0
#\u00AB
;; #\u00AD ;; Treated as a control character now?
#\u00B6 ;; Made punctuation in Unicode 7.0
#\u00B7
#\u00BB
#\u00BF))
@ -1521,7 +1524,7 @@
#\u00A4
#\u00A5
#\u00A6
#\u00A7
;; #\u00A7 ;; Made punctuation in Unicode 7.0
#\u00A8
#\u00A9
#\u00AC
@ -1530,7 +1533,7 @@
#\u00B0
#\u00B1
#\u00B4
#\u00B6
;; #\u00B6 ;; Made punctuation in Unicode 7.0
#\u00B8
#\u00D7
#\u00F7))
@ -1558,7 +1561,7 @@
#\u3000
;; Post SRFI-14?
#\u205F
#\u180E
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
))
@ -1579,7 +1582,8 @@
(if (char-numeric? c) 1 0)
(if (char-punctuation? c) 1 0)
(if (char-symbolic? c)
(if (char<=? #\u24B6 c #\u24E9)
(if (or (char<=? #\u24B6 c #\u24E9)
(char<=? #\U1F130 c #\U1F189)) ;; added in Unicode 6.0
0 ;; Those are both alphabetic and symbolic
1)
0))

View File

@ -277,6 +277,21 @@
(hash-set! do-not-compose-ht code #t))))
(loop))))))
(define (composition-key a b)
;; If `a` and `b` are both in the BMP (i.e., both fit in 16 bits),
;; map to a 32-bit key.
(bitwise-ior (arithmetic-shift (bitwise-and a #xFFFF) 16)
(bitwise-and b #xFFFF)
(arithmetic-shift
(bitwise-ior (arithmetic-shift (arithmetic-shift a -16)
5)
(arithmetic-shift b -16))
32)))
(define (composition-key-first k)
(bitwise-ior (bitwise-and (arithmetic-shift k -16) #xFFFF)
(arithmetic-shift (arithmetic-shift k -37) 16)))
(define (extract-decomp decomp code)
(if (string=? decomp "")
#f
@ -293,9 +308,9 @@
code
(lambda () #f))))
(hash-set! compose-initial-ht a #t)
(let ([key (bitwise-ior (arithmetic-shift a 16) b)])
(let ([key (composition-key a b)])
(when (hash-ref compose-map key (lambda () #f))
(error 'decomp "composition already mapped: ~e" key))
(error 'decomp "composition already mapped: ~x for: ~x" key code))
(hash-set! compose-map key code)))
(hash-set! decomp-ht code (cons a b))
#t)
@ -423,7 +438,7 @@
;; 4.0, there are only four of these: U+0344, U+0F73,
;; U+0F75, and U+0F81.
(for-each (lambda (k)
(let ([a (arithmetic-shift k -16)])
(let ([a (composition-key-first k)])
(unless (zero? (hash-ref combining-class-ht a))
(hash-remove! compose-map k))))
(hash-map compose-map (lambda (k v) k)))
@ -734,11 +749,21 @@
(let ()
(define (make-composes-table ps)
(list->vector (sort ps (lambda (a b) (< (car a) (car b))))))
(define canon-composes
(list->vector (sort (hash-map compose-map cons)
(lambda (a b) (< (car a) (car b))))))
(define count (hash-count compose-map))
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
#:when (k . <= . #xFFFFFFFF))
(cons k v))))
(define count (vector-length canon-composes))
(define long-canon-composes
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
#:when (k . > . #xFFFFFFFF))
(cons k v))))
(define long-count (vector-length long-canon-composes))
(define-values (all-composes decomp-vector long-composes)
(let ([decomp-pos-ht (make-hasheq)]
[counter count]
@ -748,7 +773,7 @@
(hash-for-each decomp-ht
(lambda (k v)
;; Use table of composed shorts:
(let ([key (+ (arithmetic-shift (car v) 16) (cdr v))])
(let ([key (composition-key (car v) (cdr v))])
(let ([pos
(if (and ((car v) . <= . #xFFFF)
((cdr v) . <= . #xFFFF))
@ -813,6 +838,15 @@
(printf " the mapped index, negate, then multiply by 2 to find the pair. */\n")
(print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8)
(printf "\n")
(printf "/* utable_canon_compose_long_pairs repeats information from utable_compose_long_pairs,\n")
(printf " but for canonical compositions only. The two characters are combined by putting the\n")
(printf " lower 16 bits of the combined numbers in the low 32 bits, and then the next higher 10\n")
(printf " bits provide the remaining 5 bits of each character, and the array is sorted. The\n")
(printf " canon_compose_long_result array provides in parellel the composed character. */\n")
(printf "#define LONG_COMPOSE_TABLE_SIZE ~a\n\n" long-count)
(print-compose-data "mzlonglong" "canon_compose_long_pairs" car long-canon-composes long-count #t 8)
(print-compose-data "unsigned int" "canon_compose_long_result" cdr long-canon-composes long-count #t 8)
(printf "\n")
(printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n")
(printf " it is sorted, so binary search can be used, but use scheme_needs_decompose()\n")
(printf " from scheme.h to first determine whether a character may have a mapping in this table.\n")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -4140,38 +4140,68 @@ static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
#define MZ_JAMO_SYLLABLE_START 0xAC00
#define MZ_JAMO_SYLLABLE_END (MZ_JAMO_SYLLABLE_START + 11171)
static mzchar get_composition(mzchar a, mzchar b)
XFORM_NONGCING static mzchar get_composition(mzchar a, mzchar b)
{
uintptr_t key = (a << 16) | b;
int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
int below_len = pos;
int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
if (a > 0xFFFF) return 0;
if ((a > 0xFFFF) || (b > 0xFFFF)) {
/* Look in long-composes table. */
mzlonglong key = ((((mzlonglong)a & 0x1F0000) << 21)
| (((mzlonglong)a & 0xFFFF) << 16)
| (((mzlonglong)b & 0x1F0000) << 16)
| ((mzlonglong)b & 0xFFFF));
int pos = (LONG_COMPOSE_TABLE_SIZE >> 1), new_pos;
int below_len = pos;
int above_len = (LONG_COMPOSE_TABLE_SIZE - pos - 1);
/* Binary search: */
while (key != utable_compose_pairs[pos]) {
if (key > utable_compose_pairs[pos]) {
if (!above_len)
return 0;
new_pos = pos + (above_len >> 1) + 1;
below_len = (new_pos - pos - 1);
above_len = (above_len - below_len - 1);
pos = new_pos;
} else if (key < utable_compose_pairs[pos]) {
if (!below_len)
return 0;
new_pos = pos - ((below_len >> 1) + 1);
above_len = (pos - new_pos - 1);
below_len = (below_len - above_len - 1);
pos = new_pos;
/* Binary search: */
while (key != utable_canon_compose_long_pairs[pos]) {
if (key > utable_canon_compose_long_pairs[pos]) {
if (!above_len)
return 0;
new_pos = pos + (above_len >> 1) + 1;
below_len = (new_pos - pos - 1);
above_len = (above_len - below_len - 1);
pos = new_pos;
} else if (key < utable_canon_compose_long_pairs[pos]) {
if (!below_len)
return 0;
new_pos = pos - ((below_len >> 1) + 1);
above_len = (pos - new_pos - 1);
below_len = (below_len - above_len - 1);
pos = new_pos;
}
}
}
return utable_canon_compose_long_result[pos];
} else {
uintptr_t key = (a << 16) | b;
int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
int below_len = pos;
int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
return utable_compose_result[pos];
/* Binary search: */
while (key != utable_compose_pairs[pos]) {
if (key > utable_compose_pairs[pos]) {
if (!above_len)
return 0;
new_pos = pos + (above_len >> 1) + 1;
below_len = (new_pos - pos - 1);
above_len = (above_len - below_len - 1);
pos = new_pos;
} else if (key < utable_compose_pairs[pos]) {
if (!below_len)
return 0;
new_pos = pos - ((below_len >> 1) + 1);
above_len = (pos - new_pos - 1);
below_len = (below_len - above_len - 1);
pos = new_pos;
}
}
return utable_compose_result[pos];
}
}
mzchar get_canon_decomposition(mzchar key, mzchar *b)
XFORM_NONGCING mzchar get_canon_decomposition(mzchar key, mzchar *b)
{
int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
int below_len = pos;
@ -4209,7 +4239,7 @@ mzchar get_canon_decomposition(mzchar key, mzchar *b)
}
}
int get_kompat_decomposition(mzchar key, unsigned short **chars)
XFORM_NONGCING int get_kompat_decomposition(mzchar key, unsigned short **chars)
{
int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
int below_len = pos;