\f
;;;; UCD accessor functions
-;;; The first (* 8 395) => 3160 entries in **CHARACTER-DATABASE**
+;;; The first (* 8 396) => 3168 entries in **CHARACTER-DATABASE**
;;; contain entries for the distinct character attributes:
;;; specifically, indexes into the GC kinds, Bidi kinds, CCC kinds,
;;; the decimal digit property, the digit property and the
;;;
;;; To look up information about a character, take the high 13 bits of
;;; its code point, and index the character database with that and a
-;;; base of 3160 (going past the miscellaneous information[*], so
+;;; base of 3168 (going past the miscellaneous information[*], so
;;; treating (a) as the start of the array). This, labelled A, gives
;;; us another index into the detailed pages[-], which we can use to
;;; look up the details for the character in question: we add the low
;;; 8 bits of the character, shifted twice (because we have four-byte
-;;; table entries) to 1024 times the `page' index, with a base of 6088
+;;; table entries) to 1024 times the `page' index, with a base of 7520
;;; to skip over everything else. This gets us to point B. If we're
;;; after a transformed code point (i.e. an upcase or downcase
;;; operation), we can simply read it off now, beginning with an
(defun ucd-index (char)
(let* ((cp (char-code char))
(cp-high (ash cp -8))
- (page (aref **character-database** (+ 3160 cp-high))))
- (+ 7512 (ash page 10) (ash (ldb (byte 8 0) cp) 2))))
+ (page (aref **character-database** (+ 3168 cp-high))))
+ (+ 7520 (ash page 10) (ash (ldb (byte 8 0) cp) 2))))
(declaim (ftype (sfunction (t) (unsigned-byte 11)) ucd-value-0))
(defun ucd-value-0 (char)
right-bidi-mirrored))))))))))))))))))
(defun build-misc-table ()
- (sort *misc-table* #'compare-misc-entry)
+ (let ((table (sort *misc-table* #'compare-misc-entry)))
+ ;; after sorting, insert at the end a special entry to handle
+ ;; unallocated characters.
+ (setf *misc-table* (make-array (1+ (length table))))
+ (replace *misc-table* table)
+ (setf (aref *misc-table* (length table))
+ ;; unallocated characters have a GC index of 31 (not
+ ;; colliding with any other GC), are not digits or decimal
+ ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't
+ ;; interestingly bidi or combining.
+ '(31 0 0 "" "" "" nil 0)))
(setq *misc-mapping* (make-array (1+ *misc-index*)))
(loop for i from 0 to *misc-index*
do (setf (aref *misc-mapping*
(loop for page across array
do (loop for entry across page
do (write-4-byte
- ;; KLUDGE: while tests indicate that this works
- ;; by accident, actually this causes lookups on
- ;; characters undefined by Unicode (e.g. U+2FB00)
- ;; to zoom off into unrelated bits of
- ;; **CHARACTER-DATABASE** (see UCD-VALUE-[01] in
- ;; src/code/target-char.lisp). It would be good
- ;; to make this work deliberately.
- (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
+ (dpb (if entry
+ (aref *misc-mapping* (ucd-misc entry))
+ ;; the last entry in *MISC-TABLE* (see
+ ;; BUILD-MISC-TABLE) is special,
+ ;; reserved for the information for
+ ;; characters unallocated by Unicode.
+ (1- (length *misc-table*)))
(byte 11 21)
(if entry (ucd-transform entry) 0))
stream))))))