Unicode 7.0
Closes PR 14971
This commit is contained in:
parent
fe68c9ab81
commit
9c7d0b8794
|
@ -21,6 +21,8 @@ reader are @tech{interned} in @racket[read-syntax] mode.
|
||||||
|
|
||||||
@see-read-print["character"]{characters}
|
@see-read-print["character"]{characters}
|
||||||
|
|
||||||
|
@history[#:changed "6.1.1.8" @elem{Updated from Unicode 5.0.1 to Unicode 7.0.0.}]
|
||||||
|
|
||||||
@; ----------------------------------------
|
@; ----------------------------------------
|
||||||
@section{Characters and Scalar Values}
|
@section{Characters and Scalar Values}
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@
|
||||||
|
|
||||||
(define (get-test-file)
|
(define (get-test-file)
|
||||||
(define name "NormalizationTest.txt")
|
(define name "NormalizationTest.txt")
|
||||||
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
|
(define base "http://www.unicode.org/Public/7.0.0/ucd/")
|
||||||
(define here (current-load-relative-directory))
|
(define here (current-load-relative-directory))
|
||||||
(or (for/or ([dir (list here (current-directory))])
|
(or (for/or ([dir (list here (current-directory))])
|
||||||
(define path (build-path dir name))
|
(define path (build-path dir name))
|
||||||
|
|
|
@ -1469,7 +1469,8 @@
|
||||||
#\u3000
|
#\u3000
|
||||||
;; Post SRFI-14?
|
;; Post SRFI-14?
|
||||||
#\u205F
|
#\u205F
|
||||||
#\u180E))
|
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
|
||||||
|
))
|
||||||
|
|
||||||
;; Punctuation in Latin-1:
|
;; Punctuation in Latin-1:
|
||||||
(check-all-latin-1
|
(check-all-latin-1
|
||||||
|
@ -1498,8 +1499,10 @@
|
||||||
#\{
|
#\{
|
||||||
#\}
|
#\}
|
||||||
#\u00A1
|
#\u00A1
|
||||||
|
#\u00A7 ;; Made punctuation in Unicode 7.0
|
||||||
#\u00AB
|
#\u00AB
|
||||||
;; #\u00AD ;; Treated as a control character now?
|
;; #\u00AD ;; Treated as a control character now?
|
||||||
|
#\u00B6 ;; Made punctuation in Unicode 7.0
|
||||||
#\u00B7
|
#\u00B7
|
||||||
#\u00BB
|
#\u00BB
|
||||||
#\u00BF))
|
#\u00BF))
|
||||||
|
@ -1521,7 +1524,7 @@
|
||||||
#\u00A4
|
#\u00A4
|
||||||
#\u00A5
|
#\u00A5
|
||||||
#\u00A6
|
#\u00A6
|
||||||
#\u00A7
|
;; #\u00A7 ;; Made punctuation in Unicode 7.0
|
||||||
#\u00A8
|
#\u00A8
|
||||||
#\u00A9
|
#\u00A9
|
||||||
#\u00AC
|
#\u00AC
|
||||||
|
@ -1530,7 +1533,7 @@
|
||||||
#\u00B0
|
#\u00B0
|
||||||
#\u00B1
|
#\u00B1
|
||||||
#\u00B4
|
#\u00B4
|
||||||
#\u00B6
|
;; #\u00B6 ;; Made punctuation in Unicode 7.0
|
||||||
#\u00B8
|
#\u00B8
|
||||||
#\u00D7
|
#\u00D7
|
||||||
#\u00F7))
|
#\u00F7))
|
||||||
|
@ -1558,7 +1561,7 @@
|
||||||
#\u3000
|
#\u3000
|
||||||
;; Post SRFI-14?
|
;; Post SRFI-14?
|
||||||
#\u205F
|
#\u205F
|
||||||
#\u180E
|
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
|
@ -1579,7 +1582,8 @@
|
||||||
(if (char-numeric? c) 1 0)
|
(if (char-numeric? c) 1 0)
|
||||||
(if (char-punctuation? c) 1 0)
|
(if (char-punctuation? c) 1 0)
|
||||||
(if (char-symbolic? c)
|
(if (char-symbolic? c)
|
||||||
(if (char<=? #\u24B6 c #\u24E9)
|
(if (or (char<=? #\u24B6 c #\u24E9)
|
||||||
|
(char<=? #\U1F130 c #\U1F189)) ;; added in Unicode 6.0
|
||||||
0 ;; Those are both alphabetic and symbolic
|
0 ;; Those are both alphabetic and symbolic
|
||||||
1)
|
1)
|
||||||
0))
|
0))
|
||||||
|
|
|
@ -277,6 +277,21 @@
|
||||||
(hash-set! do-not-compose-ht code #t))))
|
(hash-set! do-not-compose-ht code #t))))
|
||||||
(loop))))))
|
(loop))))))
|
||||||
|
|
||||||
|
;; Packs the code-point pair (`a`, `b`) into a single integer that is used
;; as the key into `compose-map`. Layout of the result:
;;   bits 0-15   low 16 bits of `b`
;;   bits 16-31  low 16 bits of `a`
;;   bits 32+    bits of `b` above bit 15
;;   bits 37+    bits of `a` above bit 15
;; NOTE(review): the 5-bit slot for `b`'s upper bits assumes `b` is below
;; #x200000 -- confirm against the range of code points fed in.
(define (composition-key a b)
|
||||||
|
;; If `a` and `b` are both in the BMP (i.e., both fit in 16 bits),
|
||||||
|
;; the upper part computed below is zero, so the result is a 32-bit key.
|
||||||
|
(bitwise-ior (arithmetic-shift (bitwise-and a #xFFFF) 16)
|
||||||
|
(bitwise-and b #xFFFF)
|
||||||
|
(arithmetic-shift
|
||||||
|
(bitwise-ior (arithmetic-shift (arithmetic-shift a -16)
|
||||||
|
5)
|
||||||
|
(arithmetic-shift b -16))
|
||||||
|
32)))
|
||||||
|
|
||||||
|
;; Recovers the first code point of the pair from a key `k` produced by
;; `composition-key`: bits 16-31 of `k` supply its low 16 bits, and the
;; bits from position 37 upward supply the bits above bit 15.
(define (composition-key-first k)
|
||||||
|
(bitwise-ior (bitwise-and (arithmetic-shift k -16) #xFFFF)
|
||||||
|
(arithmetic-shift (arithmetic-shift k -37) 16)))
|
||||||
|
|
||||||
(define (extract-decomp decomp code)
|
(define (extract-decomp decomp code)
|
||||||
(if (string=? decomp "")
|
(if (string=? decomp "")
|
||||||
#f
|
#f
|
||||||
|
@ -293,9 +308,9 @@
|
||||||
code
|
code
|
||||||
(lambda () #f))))
|
(lambda () #f))))
|
||||||
(hash-set! compose-initial-ht a #t)
|
(hash-set! compose-initial-ht a #t)
|
||||||
(let ([key (bitwise-ior (arithmetic-shift a 16) b)])
|
(let ([key (composition-key a b)])
|
||||||
(when (hash-ref compose-map key (lambda () #f))
|
(when (hash-ref compose-map key (lambda () #f))
|
||||||
(error 'decomp "composition already mapped: ~e" key))
|
(error 'decomp "composition already mapped: ~x for: ~x" key code))
|
||||||
(hash-set! compose-map key code)))
|
(hash-set! compose-map key code)))
|
||||||
(hash-set! decomp-ht code (cons a b))
|
(hash-set! decomp-ht code (cons a b))
|
||||||
#t)
|
#t)
|
||||||
|
@ -423,7 +438,7 @@
|
||||||
;; 4.0, there are only four of these: U+0344, U+0F73,
|
;; 4.0, there are only four of these: U+0344, U+0F73,
|
||||||
;; U+0F75, and U+0F81.
|
;; U+0F75, and U+0F81.
|
||||||
(for-each (lambda (k)
|
(for-each (lambda (k)
|
||||||
(let ([a (arithmetic-shift k -16)])
|
(let ([a (composition-key-first k)])
|
||||||
(unless (zero? (hash-ref combining-class-ht a))
|
(unless (zero? (hash-ref combining-class-ht a))
|
||||||
(hash-remove! compose-map k))))
|
(hash-remove! compose-map k))))
|
||||||
(hash-map compose-map (lambda (k v) k)))
|
(hash-map compose-map (lambda (k v) k)))
|
||||||
|
@ -734,10 +749,20 @@
|
||||||
|
|
||||||
|
|
||||||
(let ()
|
(let ()
|
||||||
|
;; Turns the list of pairs `ps` into a vector ordered by ascending `car`.
(define (make-composes-table ps)
|
||||||
|
(list->vector (sort ps (lambda (a b) (< (car a) (car b))))))
|
||||||
|
|
||||||
(define canon-composes
|
(define canon-composes
|
||||||
(list->vector (sort (hash-map compose-map cons)
|
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
|
||||||
(lambda (a b) (< (car a) (car b))))))
|
#:when (k . <= . #xFFFFFFFF))
|
||||||
(define count (hash-count compose-map))
|
(cons k v))))
|
||||||
|
(define count (vector-length canon-composes))
|
||||||
|
|
||||||
|
;; Sorted vector of (key . composed-code) entries from `compose-map` whose
;; key does not fit in 32 bits, i.e. the keys that `composition-key` builds
;; when at least one of the two code points is above #xFFFF.
(define long-canon-composes
|
||||||
|
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
|
||||||
|
#:when (k . > . #xFFFFFFFF))
|
||||||
|
(cons k v))))
|
||||||
|
;; Number of entries in the long table.
(define long-count (vector-length long-canon-composes))
|
||||||
|
|
||||||
(define-values (all-composes decomp-vector long-composes)
|
(define-values (all-composes decomp-vector long-composes)
|
||||||
(let ([decomp-pos-ht (make-hasheq)]
|
(let ([decomp-pos-ht (make-hasheq)]
|
||||||
|
@ -748,7 +773,7 @@
|
||||||
(hash-for-each decomp-ht
|
(hash-for-each decomp-ht
|
||||||
(lambda (k v)
|
(lambda (k v)
|
||||||
;; Use table of composed shorts:
|
;; Use table of composed shorts:
|
||||||
(let ([key (+ (arithmetic-shift (car v) 16) (cdr v))])
|
(let ([key (composition-key (car v) (cdr v))])
|
||||||
(let ([pos
|
(let ([pos
|
||||||
(if (and ((car v) . <= . #xFFFF)
|
(if (and ((car v) . <= . #xFFFF)
|
||||||
((cdr v) . <= . #xFFFF))
|
((cdr v) . <= . #xFFFF))
|
||||||
|
@ -813,6 +838,15 @@
|
||||||
(printf " the mapped index, negate, then multiply by 2 to find the pair. */\n")
|
(printf " the mapped index, negate, then multiply by 2 to find the pair. */\n")
|
||||||
(print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8)
|
(print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8)
|
||||||
(printf "\n")
|
(printf "\n")
|
||||||
|
(printf "/* utable_canon_compose_long_pairs repeats information from utable_compose_long_pairs,\n")
|
||||||
|
(printf " but for canonical compositions only. The two characters are combined by putting the\n")
|
||||||
|
(printf " lower 16 bits of the combined numbers in the low 32 bits, and then the next higher 10\n")
|
||||||
|
(printf " bits provide the remaining 5 bits of each character, and the array is sorted. The\n")
|
||||||
|
(printf " canon_compose_long_result array provides in parallel the composed character. */\n")
|
||||||
|
(printf "#define LONG_COMPOSE_TABLE_SIZE ~a\n\n" long-count)
|
||||||
|
(print-compose-data "mzlonglong" "canon_compose_long_pairs" car long-canon-composes long-count #t 8)
|
||||||
|
(print-compose-data "unsigned int" "canon_compose_long_result" cdr long-canon-composes long-count #t 8)
|
||||||
|
(printf "\n")
|
||||||
(printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n")
|
(printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n")
|
||||||
(printf " it is sorted, so binary search can be used, but use scheme_needs_decompose()\n")
|
(printf " it is sorted, so binary search can be used, but use scheme_needs_decompose()\n")
|
||||||
(printf " from scheme.h to first determine whether a character may have a mapping in this table.\n")
|
(printf " from scheme.h to first determine whether a character may have a mapping in this table.\n")
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -4140,15 +4140,44 @@ static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
|
||||||
#define MZ_JAMO_SYLLABLE_START 0xAC00
|
#define MZ_JAMO_SYLLABLE_START 0xAC00
|
||||||
#define MZ_JAMO_SYLLABLE_END (MZ_JAMO_SYLLABLE_START + 11171)
|
#define MZ_JAMO_SYLLABLE_END (MZ_JAMO_SYLLABLE_START + 11171)
|
||||||
|
|
||||||
static mzchar get_composition(mzchar a, mzchar b)
|
XFORM_NONGCING static mzchar get_composition(mzchar a, mzchar b)
|
||||||
{
|
{
|
||||||
|
if ((a > 0xFFFF) || (b > 0xFFFF)) {
|
||||||
|
/* Look in long-composes table. */
|
||||||
|
mzlonglong key = ((((mzlonglong)a & 0x1F0000) << 21)
|
||||||
|
| (((mzlonglong)a & 0xFFFF) << 16)
|
||||||
|
| (((mzlonglong)b & 0x1F0000) << 16)
|
||||||
|
| ((mzlonglong)b & 0xFFFF));
|
||||||
|
int pos = (LONG_COMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||||
|
int below_len = pos;
|
||||||
|
int above_len = (LONG_COMPOSE_TABLE_SIZE - pos - 1);
|
||||||
|
|
||||||
|
/* Binary search: */
|
||||||
|
while (key != utable_canon_compose_long_pairs[pos]) {
|
||||||
|
if (key > utable_canon_compose_long_pairs[pos]) {
|
||||||
|
if (!above_len)
|
||||||
|
return 0;
|
||||||
|
new_pos = pos + (above_len >> 1) + 1;
|
||||||
|
below_len = (new_pos - pos - 1);
|
||||||
|
above_len = (above_len - below_len - 1);
|
||||||
|
pos = new_pos;
|
||||||
|
} else if (key < utable_canon_compose_long_pairs[pos]) {
|
||||||
|
if (!below_len)
|
||||||
|
return 0;
|
||||||
|
new_pos = pos - ((below_len >> 1) + 1);
|
||||||
|
above_len = (pos - new_pos - 1);
|
||||||
|
below_len = (below_len - above_len - 1);
|
||||||
|
pos = new_pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return utable_canon_compose_long_result[pos];
|
||||||
|
} else {
|
||||||
uintptr_t key = (a << 16) | b;
|
uintptr_t key = (a << 16) | b;
|
||||||
int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
|
int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||||
int below_len = pos;
|
int below_len = pos;
|
||||||
int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
|
int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
|
||||||
|
|
||||||
if (a > 0xFFFF) return 0;
|
|
||||||
|
|
||||||
/* Binary search: */
|
/* Binary search: */
|
||||||
while (key != utable_compose_pairs[pos]) {
|
while (key != utable_compose_pairs[pos]) {
|
||||||
if (key > utable_compose_pairs[pos]) {
|
if (key > utable_compose_pairs[pos]) {
|
||||||
|
@ -4170,8 +4199,9 @@ static mzchar get_composition(mzchar a, mzchar b)
|
||||||
|
|
||||||
return utable_compose_result[pos];
|
return utable_compose_result[pos];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
mzchar get_canon_decomposition(mzchar key, mzchar *b)
|
XFORM_NONGCING mzchar get_canon_decomposition(mzchar key, mzchar *b)
|
||||||
{
|
{
|
||||||
int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
|
int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||||
int below_len = pos;
|
int below_len = pos;
|
||||||
|
@ -4209,7 +4239,7 @@ mzchar get_canon_decomposition(mzchar key, mzchar *b)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_kompat_decomposition(mzchar key, unsigned short **chars)
|
XFORM_NONGCING int get_kompat_decomposition(mzchar key, unsigned short **chars)
|
||||||
{
|
{
|
||||||
int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
|
int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
|
||||||
int below_len = pos;
|
int below_len = pos;
|
||||||
|
|
Loading…
Reference in New Issue
Block a user