From 7230ec4e3e75e0c81750d7682ba5b9ea349b4acf Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Thu, 9 May 2013 15:22:26 +0100 Subject: [PATCH] better UCD treatment of characters not allocated by Unicode fixes lp#1178038 (reported by Ken Harris) --- src/code/target-char.lisp | 10 +++++----- tools-for-build/ucd.lisp | 27 ++++++++++++++++++--------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp index 0e12980..beb84ee 100644 --- a/src/code/target-char.lisp +++ b/src/code/target-char.lisp @@ -167,7 +167,7 @@ ;;;; UCD accessor functions -;;; The first (* 8 395) => 3160 entries in **CHARACTER-DATABASE** +;;; The first (* 8 396) => 3168 entries in **CHARACTER-DATABASE** ;;; contain entries for the distinct character attributes: ;;; specifically, indexes into the GC kinds, Bidi kinds, CCC kinds, ;;; the decimal digit property, the digit property and the @@ -194,12 +194,12 @@ ;;; ;;; To look up information about a character, take the high 13 bits of ;;; its code point, and index the character database with that and a -;;; base of 3160 (going past the miscellaneous information[*], so +;;; base of 3168 (going past the miscellaneous information[*], so ;;; treating (a) as the start of the array). This, labelled A, gives ;;; us another index into the detailed pages[-], which we can use to ;;; look up the details for the character in question: we add the low ;;; 8 bits of the character, shifted twice (because we have four-byte -;;; table entries) to 1024 times the `page' index, with a base of 6088 +;;; table entries) to 1024 times the `page' index, with a base of 7520 ;;; to skip over everything else. This gets us to point B. If we're ;;; after a transformed code point (i.e. an upcase or downcase ;;; operation), we can simply read it off now, beginning with an @@ -224,8 +224,8 @@ (defun ucd-index (char) (let* ((cp (char-code char)) (cp-high (ash cp -8)) - (page (aref **character-database** (+ 3160 cp-high)))) - (+ 7512 (ash page 10) (ash (ldb (byte 8 0) cp) 2)))) + (page (aref **character-database** (+ 3168 cp-high)))) + (+ 7520 (ash page 10) (ash (ldb (byte 8 0) cp) 2)))) (declaim (ftype (sfunction (t) (unsigned-byte 11)) ucd-value-0)) (defun ucd-value-0 (char) diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index 935460b..fa2014f 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -80,7 +80,17 @@ right-bidi-mirrored)))))))))))))))))) (defun build-misc-table () - (sort *misc-table* #'compare-misc-entry) + (let ((table (sort *misc-table* #'compare-misc-entry))) + ;; after sorting, insert at the end a special entry to handle + ;; unallocated characters. + (setf *misc-table* (make-array (1+ (length table)))) + (replace *misc-table* table) + (setf (aref *misc-table* (length table)) + ;; unallocated characters have a GC index of 31 (not + ;; colliding with any other GC), are not digits or decimal + ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't + ;; interestingly bidi or combining. + '(31 0 0 "" "" "" nil 0))) (setq *misc-mapping* (make-array (1+ *misc-index*))) (loop for i from 0 to *misc-index* do (setf (aref *misc-mapping* @@ -426,14 +436,13 @@ (loop for page across array do (loop for entry across page do (write-4-byte - ;; KLUDGE: while tests indicate that this works - ;; by accident, actually this causes lookups on - ;; characters undefined by Unicode (e.g. U+2FB00) - ;; to zoom off into unrelated bits of - ;; **CHARACTER-DATABASE** (see UCD-VALUE-[01] in - ;; src/code/target-char.lisp). It would be good - ;; to make this work deliberately. - (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff) + (dpb (if entry + (aref *misc-mapping* (ucd-misc entry)) + ;; the last entry in *MISC-TABLE* (see + ;; BUILD-MISC-TABLE) is special, + ;; reserved for the information for + ;; characters unallocated by Unicode. + (1- (length *misc-table*))) (byte 11 21) (if entry (ucd-transform entry) 0)) stream)))))) -- 1.7.10.4