Unicode 7.0
Closes PR 14971
This commit is contained in:
parent
fe68c9ab81
commit
9c7d0b8794
|
@ -21,6 +21,8 @@ reader are @tech{interned} in @racket[read-syntax] mode.
|
|||
|
||||
@see-read-print["character"]{characters}
|
||||
|
||||
@history[#:changed "6.1.1.8" @elem{Updated from Unicode 5.0.1 to Unicode 7.0.0.}]
|
||||
|
||||
@; ----------------------------------------
|
||||
@section{Characters and Scalar Values}
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
(define (get-test-file)
|
||||
(define name "NormalizationTest.txt")
|
||||
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
|
||||
(define base "http://www.unicode.org/Public/7.0.0/ucd/")
|
||||
(define here (current-load-relative-directory))
|
||||
(or (for/or ([dir (list here (current-directory))])
|
||||
(define path (build-path dir name))
|
||||
|
|
|
@ -1469,7 +1469,8 @@
|
|||
#\u3000
|
||||
;; Post SRFI-14?
|
||||
#\u205F
|
||||
#\u180E))
|
||||
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
|
||||
))
|
||||
|
||||
;; Punctuation in Latin-1:
|
||||
(check-all-latin-1
|
||||
|
@ -1498,8 +1499,10 @@
|
|||
#\{
|
||||
#\}
|
||||
#\u00A1
|
||||
#\u00A7 ;; Made punctuation in Unicode 7.0
|
||||
#\u00AB
|
||||
;; #\u00AD ;; Treated as a control character now?
|
||||
#\u00B6 ;; Made punctuation in Unicode 7.0
|
||||
#\u00B7
|
||||
#\u00BB
|
||||
#\u00BF))
|
||||
|
@ -1521,7 +1524,7 @@
|
|||
#\u00A4
|
||||
#\u00A5
|
||||
#\u00A6
|
||||
#\u00A7
|
||||
;; #\u00A7 ;; Made punctuation in Unicode 7.0
|
||||
#\u00A8
|
||||
#\u00A9
|
||||
#\u00AC
|
||||
|
@ -1530,7 +1533,7 @@
|
|||
#\u00B0
|
||||
#\u00B1
|
||||
#\u00B4
|
||||
#\u00B6
|
||||
;; #\u00B6 ;; Made punctuation in Unicode 7.0
|
||||
#\u00B8
|
||||
#\u00D7
|
||||
#\u00F7))
|
||||
|
@ -1558,7 +1561,7 @@
|
|||
#\u3000
|
||||
;; Post SRFI-14?
|
||||
#\u205F
|
||||
#\u180E
|
||||
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
|
||||
))
|
||||
|
||||
|
||||
|
@ -1579,7 +1582,8 @@
|
|||
(if (char-numeric? c) 1 0)
|
||||
(if (char-punctuation? c) 1 0)
|
||||
(if (char-symbolic? c)
|
||||
(if (char<=? #\u24B6 c #\u24E9)
|
||||
(if (or (char<=? #\u24B6 c #\u24E9)
|
||||
(char<=? #\U1F130 c #\U1F189)) ;; added in Unicode 6.0
|
||||
0 ;; Those are both alphabetic and symbolic
|
||||
1)
|
||||
0))
|
||||
|
|
|
@ -277,6 +277,21 @@
|
|||
(hash-set! do-not-compose-ht code #t))))
|
||||
(loop))))))
|
||||
|
||||
;; Packs the pair of code points `a` and `b` into a single integer that
;; is used as the key into `compose-map`.
;; Key layout, from the low bits up:
;;   bits  0-15 : low 16 bits of `b`
;;   bits 16-31 : low 16 bits of `a`
;;   bits 32+   : bits 16 and up of `b`
;;   bits 37+   : bits 16 and up of `a`
(define (composition-key a b)
|
||||
;; If `a` and `b` are both in the BMP (i.e., both fit in 16 bits),
|
||||
;; map to a 32-bit key.
;; In that case the key equals `a` shifted left by 16, or-ed with `b`.
;; Otherwise, the bits above the low 16 of each argument are placed
;; above bit 31, producing a key larger than #xFFFFFFFF.
;; NOTE(review): the two upper parts stay disjoint only if the upper part
;; of `b` fits in 5 bits (true for code points up to #x10FFFF) -- confirm
;; that callers only pass valid code points.
|
||||
(bitwise-ior (arithmetic-shift (bitwise-and a #xFFFF) 16)
|
||||
(bitwise-and b #xFFFF)
|
||||
(arithmetic-shift
|
||||
(bitwise-ior (arithmetic-shift (arithmetic-shift a -16)
|
||||
5)
|
||||
(arithmetic-shift b -16))
|
||||
32)))
|
||||
|
||||
;; Recovers the first code point (`a`) from a key `k` built by
;; `composition-key`: bits 16-31 of `k` supply the low 16 bits of `a`,
;; and the bits of `k` from bit 37 upward supply the bits of `a` above
;; its low 16.
;; NOTE(review): this is an exact inverse only when the upper part of the
;; second code point occupied at most 5 bits in the key (bits 32-36).
(define (composition-key-first k)
|
||||
(bitwise-ior (bitwise-and (arithmetic-shift k -16) #xFFFF)
|
||||
(arithmetic-shift (arithmetic-shift k -37) 16)))
|
||||
|
||||
(define (extract-decomp decomp code)
|
||||
(if (string=? decomp "")
|
||||
#f
|
||||
|
@ -293,9 +308,9 @@
|
|||
code
|
||||
(lambda () #f))))
|
||||
(hash-set! compose-initial-ht a #t)
|
||||
(let ([key (bitwise-ior (arithmetic-shift a 16) b)])
|
||||
(let ([key (composition-key a b)])
|
||||
(when (hash-ref compose-map key (lambda () #f))
|
||||
(error 'decomp "composition already mapped: ~e" key))
|
||||
(error 'decomp "composition already mapped: ~x for: ~x" key code))
|
||||
(hash-set! compose-map key code)))
|
||||
(hash-set! decomp-ht code (cons a b))
|
||||
#t)
|
||||
|
@ -423,7 +438,7 @@
|
|||
;; 4.0, there are only four of these: U+0344, U+0F73,
|
||||
;; U+0F75, and U+0F81.
|
||||
(for-each (lambda (k)
|
||||
(let ([a (arithmetic-shift k -16)])
|
||||
(let ([a (composition-key-first k)])
|
||||
(unless (zero? (hash-ref combining-class-ht a))
|
||||
(hash-remove! compose-map k))))
|
||||
(hash-map compose-map (lambda (k v) k)))
|
||||
|
@ -734,11 +749,21 @@
|
|||
|
||||
|
||||
(let ()
|
||||
;; Converts `ps`, a list of pairs, into a vector sorted in ascending
;; order of each pair's `car` (compared with `<`, so the `car`s must be
;; real numbers).
(define (make-composes-table ps)
|
||||
(list->vector (sort ps (lambda (a b) (< (car a) (car b))))))
|
||||
|
||||
(define canon-composes
|
||||
(list->vector (sort (hash-map compose-map cons)
|
||||
(lambda (a b) (< (car a) (car b))))))
|
||||
(define count (hash-count compose-map))
|
||||
|
||||
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
|
||||
#:when (k . <= . #xFFFFFFFF))
|
||||
(cons k v))))
|
||||
(define count (vector-length canon-composes))
|
||||
|
||||
(define long-canon-composes
|
||||
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
|
||||
#:when (k . > . #xFFFFFFFF))
|
||||
(cons k v))))
|
||||
(define long-count (vector-length long-canon-composes))
|
||||
|
||||
(define-values (all-composes decomp-vector long-composes)
|
||||
(let ([decomp-pos-ht (make-hasheq)]
|
||||
[counter count]
|
||||
|
@ -748,7 +773,7 @@
|
|||
(hash-for-each decomp-ht
|
||||
(lambda (k v)
|
||||
;; Use table of composed shorts:
|
||||
(let ([key (+ (arithmetic-shift (car v) 16) (cdr v))])
|
||||
(let ([key (composition-key (car v) (cdr v))])
|
||||
(let ([pos
|
||||
(if (and ((car v) . <= . #xFFFF)
|
||||
((cdr v) . <= . #xFFFF))
|
||||
|
@ -813,6 +838,15 @@
|
|||
(printf " the mapped index, negate, then multiply by 2 to find the pair. */\n")
|
||||
(print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8)
|
||||
(printf "\n")
|
||||
(printf "/* utable_canon_compose_long_pairs repeats information from utable_compose_long_pairs,\n")
|
||||
(printf " but for canonical compositions only. The two characters are combined by putting the\n")
|
||||
(printf " lower 16 bits of the combined numbers in the low 32 bits, and then the next higher 10\n")
|
||||
(printf " bits provide the remaining 5 bits of each character, and the array is sorted. The\n")
|
||||
(printf " canon_compose_long_result array provides in parellel the composed character. */\n")
|
||||
(printf "#define LONG_COMPOSE_TABLE_SIZE ~a\n\n" long-count)
|
||||
(print-compose-data "mzlonglong" "canon_compose_long_pairs" car long-canon-composes long-count #t 8)
|
||||
(print-compose-data "unsigned int" "canon_compose_long_result" cdr long-canon-composes long-count #t 8)
|
||||
(printf "\n")
|
||||
(printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n")
|
||||
(printf " it is sorted, so binary search can be used, but use scheme_needs_decompose()\n")
|
||||
(printf " from scheme.h to first determine whether a character may have a mapping in this table.\n")
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -4140,38 +4140,68 @@ static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
|
|||
#define MZ_JAMO_SYLLABLE_START 0xAC00
|
||||
#define MZ_JAMO_SYLLABLE_END (MZ_JAMO_SYLLABLE_START + 11171)
|
||||
|
||||
static mzchar get_composition(mzchar a, mzchar b)
|
||||
XFORM_NONGCING static mzchar get_composition(mzchar a, mzchar b)
|
||||
{
|
||||
uintptr_t key = (a << 16) | b;
|
||||
int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||
int below_len = pos;
|
||||
int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
|
||||
|
||||
if (a > 0xFFFF) return 0;
|
||||
if ((a > 0xFFFF) || (b > 0xFFFF)) {
|
||||
/* Look in long-composes table. */
|
||||
mzlonglong key = ((((mzlonglong)a & 0x1F0000) << 21)
|
||||
| (((mzlonglong)a & 0xFFFF) << 16)
|
||||
| (((mzlonglong)b & 0x1F0000) << 16)
|
||||
| ((mzlonglong)b & 0xFFFF));
|
||||
int pos = (LONG_COMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||
int below_len = pos;
|
||||
int above_len = (LONG_COMPOSE_TABLE_SIZE - pos - 1);
|
||||
|
||||
/* Binary search: */
|
||||
while (key != utable_compose_pairs[pos]) {
|
||||
if (key > utable_compose_pairs[pos]) {
|
||||
if (!above_len)
|
||||
return 0;
|
||||
new_pos = pos + (above_len >> 1) + 1;
|
||||
below_len = (new_pos - pos - 1);
|
||||
above_len = (above_len - below_len - 1);
|
||||
pos = new_pos;
|
||||
} else if (key < utable_compose_pairs[pos]) {
|
||||
if (!below_len)
|
||||
return 0;
|
||||
new_pos = pos - ((below_len >> 1) + 1);
|
||||
above_len = (pos - new_pos - 1);
|
||||
below_len = (below_len - above_len - 1);
|
||||
pos = new_pos;
|
||||
/* Binary search: */
|
||||
while (key != utable_canon_compose_long_pairs[pos]) {
|
||||
if (key > utable_canon_compose_long_pairs[pos]) {
|
||||
if (!above_len)
|
||||
return 0;
|
||||
new_pos = pos + (above_len >> 1) + 1;
|
||||
below_len = (new_pos - pos - 1);
|
||||
above_len = (above_len - below_len - 1);
|
||||
pos = new_pos;
|
||||
} else if (key < utable_canon_compose_long_pairs[pos]) {
|
||||
if (!below_len)
|
||||
return 0;
|
||||
new_pos = pos - ((below_len >> 1) + 1);
|
||||
above_len = (pos - new_pos - 1);
|
||||
below_len = (below_len - above_len - 1);
|
||||
pos = new_pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return utable_canon_compose_long_result[pos];
|
||||
} else {
|
||||
uintptr_t key = (a << 16) | b;
|
||||
int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||
int below_len = pos;
|
||||
int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
|
||||
|
||||
return utable_compose_result[pos];
|
||||
/* Binary search: */
|
||||
while (key != utable_compose_pairs[pos]) {
|
||||
if (key > utable_compose_pairs[pos]) {
|
||||
if (!above_len)
|
||||
return 0;
|
||||
new_pos = pos + (above_len >> 1) + 1;
|
||||
below_len = (new_pos - pos - 1);
|
||||
above_len = (above_len - below_len - 1);
|
||||
pos = new_pos;
|
||||
} else if (key < utable_compose_pairs[pos]) {
|
||||
if (!below_len)
|
||||
return 0;
|
||||
new_pos = pos - ((below_len >> 1) + 1);
|
||||
above_len = (pos - new_pos - 1);
|
||||
below_len = (below_len - above_len - 1);
|
||||
pos = new_pos;
|
||||
}
|
||||
}
|
||||
|
||||
return utable_compose_result[pos];
|
||||
}
|
||||
}
|
||||
|
||||
mzchar get_canon_decomposition(mzchar key, mzchar *b)
|
||||
XFORM_NONGCING mzchar get_canon_decomposition(mzchar key, mzchar *b)
|
||||
{
|
||||
int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||
int below_len = pos;
|
||||
|
@ -4209,7 +4239,7 @@ mzchar get_canon_decomposition(mzchar key, mzchar *b)
|
|||
}
|
||||
}
|
||||
|
||||
int get_kompat_decomposition(mzchar key, unsigned short **chars)
|
||||
XFORM_NONGCING int get_kompat_decomposition(mzchar key, unsigned short **chars)
|
||||
{
|
||||
int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||
int below_len = pos;
|
||||
|
|
Loading…
Reference in New Issue
Block a user