From 7230ec4e3e75e0c81750d7682ba5b9ea349b4acf Mon Sep 17 00:00:00 2001
From: Christophe Rhodes <csr21@cantab.net>
Date: Thu, 9 May 2013 15:22:26 +0100
Subject: [PATCH] better UCD treatment of characters not allocated by Unicode

fixes lp#1178038 (reported by Ken Harris)
---
 src/code/target-char.lisp |   10 +++++-----
 tools-for-build/ucd.lisp  |   27 ++++++++++++++++++---------
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp
index 0e12980..beb84ee 100644
--- a/src/code/target-char.lisp
+++ b/src/code/target-char.lisp
@@ -167,7 +167,7 @@
 
 ;;;; UCD accessor functions
 
-;;; The first (* 8 395) => 3160 entries in **CHARACTER-DATABASE**
+;;; The first (* 8 396) => 3168 entries in **CHARACTER-DATABASE**
 ;;; contain entries for the distinct character attributes:
 ;;; specifically, indexes into the GC kinds, Bidi kinds, CCC kinds,
 ;;; the decimal digit property, the digit property and the
@@ -194,12 +194,12 @@
 ;;;
 ;;; To look up information about a character, take the high 13 bits of
 ;;; its code point, and index the character database with that and a
-;;; base of 3160 (going past the miscellaneous information[*], so
+;;; base of 3168 (going past the miscellaneous information[*], so
 ;;; treating (a) as the start of the array).  This, labelled A, gives
 ;;; us another index into the detailed pages[-], which we can use to
 ;;; look up the details for the character in question: we add the low
 ;;; 8 bits of the character, shifted twice (because we have four-byte
-;;; table entries) to 1024 times the `page' index, with a base of 6088
+;;; table entries) to 1024 times the `page' index, with a base of 7520
 ;;; to skip over everything else.  This gets us to point B.  If we're
 ;;; after a transformed code point (i.e. an upcase or downcase
 ;;; operation), we can simply read it off now, beginning with an
@@ -224,8 +224,8 @@
 (defun ucd-index (char)
   (let* ((cp (char-code char))
          (cp-high (ash cp -8))
-         (page (aref **character-database** (+ 3160 cp-high))))
-    (+ 7512 (ash page 10) (ash (ldb (byte 8 0) cp) 2))))
+         (page (aref **character-database** (+ 3168 cp-high))))
+    (+ 7520 (ash page 10) (ash (ldb (byte 8 0) cp) 2))))
 
 (declaim (ftype (sfunction (t) (unsigned-byte 11)) ucd-value-0))
 (defun ucd-value-0 (char)
diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp
index 935460b..fa2014f 100644
--- a/tools-for-build/ucd.lisp
+++ b/tools-for-build/ucd.lisp
@@ -80,7 +80,17 @@
                                                                               right-bidi-mirrored))))))))))))))))))
 
 (defun build-misc-table ()
-  (sort *misc-table* #'compare-misc-entry)
+  (let ((table (sort *misc-table* #'compare-misc-entry)))
+    ;; after sorting, insert at the end a special entry to handle
+    ;; unallocated characters.
+    (setf *misc-table* (make-array (1+ (length table))))
+    (replace *misc-table* table)
+    (setf (aref *misc-table* (length table))
+          ;; unallocated characters have a GC index of 31 (not
+          ;; colliding with any other GC), are not digits or decimal
+          ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't
+          ;; interestingly bidi or combining.
+          '(31 0 0 "" "" "" nil 0)))
   (setq *misc-mapping* (make-array (1+ *misc-index*)))
   (loop for i from 0 to *misc-index*
      do (setf (aref *misc-mapping*
@@ -426,14 +436,13 @@
         (loop for page across array
            do (loop for entry across page
                  do (write-4-byte
-                     ;; KLUDGE: while tests indicate that this works
-                     ;; by accident, actually this causes lookups on
-                     ;; characters undefined by Unicode (e.g. U+2FB00)
-                     ;; to zoom off into unrelated bits of
-                     ;; **CHARACTER-DATABASE** (see UCD-VALUE-[01] in
-                     ;; src/code/target-char.lisp).  It would be good
-                     ;; to make this work deliberately.
-                     (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
+                     (dpb (if entry
+                              (aref *misc-mapping* (ucd-misc entry))
+                              ;; the last entry in *MISC-TABLE* (see
+                              ;; BUILD-MISC-TABLE) is special,
+                              ;; reserved for the information for
+                              ;; characters unallocated by Unicode.
+                              (1- (length *misc-table*)))
                           (byte 11 21)
                           (if entry (ucd-transform entry) 0))
                      stream))))))
-- 
1.7.10.4