0.9.10.1: Unicode character names -- aka More Bloat
authorNikodemus Siivola <nikodemus@random-state.net>
Mon, 27 Feb 2006 05:13:29 +0000 (05:13 +0000)
committerNikodemus Siivola <nikodemus@random-state.net>
Mon, 27 Feb 2006 05:13:29 +0000 (05:13 +0000)
 * Make CHAR-NAME and NAME-CHAR aware of the Unicode names when building
   with :SB-UNICODE.

build-order.lisp-expr
src/code/cold-init.lisp
src/code/huffman.lisp [new file with mode: 0644]
src/code/target-char.lisp
src/code/target-extensions.lisp
tests/character.pure.lisp
tools-for-build/ucd.lisp
version.lisp-expr

index 7d0fdff..3cb8a46 100644 (file)
  ("src/code/irrat"      :not-host)
 
  ("src/code/char")
+ ("src/code/huffman")
  ("src/code/target-char" :not-host)
  ("src/code/target-misc" :not-host)
  ("src/code/misc")
index 4545f0c..edbcf42 100644 (file)
   (show-and-call !random-cold-init)
 
   (show-and-call !character-database-cold-init)
+  (show-and-call !character-name-database-cold-init)
 
   (show-and-call !early-package-cold-init)
   (show-and-call !package-cold-init)
diff --git a/src/code/huffman.lisp b/src/code/huffman.lisp
new file mode 100644 (file)
index 0000000..51f088f
--- /dev/null
@@ -0,0 +1,106 @@
+;;;; a simple huffman encoder/decoder, used to compress unicode
+;;;; character names.
+
+;;;; This software is part of the SBCL system. See the README file for
+;;;; more information.
+;;;;
+;;;; This software is derived from the CMU CL system, which was
+;;;; written at Carnegie Mellon University and released into the
+;;;; public domain. The software is in the public domain and is
+;;;; provided with absolutely no warranty. See the COPYING and CREDITS
+;;;; files for more information.
+
+(in-package "SB!IMPL")
+
+(defstruct (huffman-node (:constructor make-huffman-node (key weight)))
+  key weight)
+
+(defstruct (huffman-pair
+             (:include huffman-node)
+             (:constructor make-huffman-pair
+                           (left right &aux
+                                 (key   (concatenate 'string
+                                                     (huffman-node-key left)
+                                                     (huffman-node-key right)))
+                                 (weight (+ (huffman-node-weight left)
+                                            (huffman-node-weight right))))))
+  left right)
+
+(defun huffman-weights (corpus)
+  (let ((weight-table (make-hash-table :test #'equal)))
+    (loop for string in corpus
+       do (loop for char across string
+             do (incf (gethash char weight-table 0))))
+    (let (alist)
+      (maphash (lambda (char weight)
+                 (push (make-huffman-node (string char) weight) alist))
+               weight-table)
+      alist)))
+
+(defun make-huffman-tree (corpus)
+  (labels ((merge-table (table)
+             (setf table (sort table #'< :key #'huffman-node-weight))
+             (push (make-huffman-pair (pop table) (pop table))
+                   table)
+             (if (second table)
+                 (merge-table table)
+                 (car table)))
+           (finish-tree (tree)
+             (if (huffman-pair-p tree)
+                 (list (huffman-node-key tree)
+                       (finish-tree (huffman-pair-left tree))
+                       (finish-tree (huffman-pair-right tree)))
+                 (huffman-node-key tree))))
+    (finish-tree (merge-table (huffman-weights corpus)))))
+
+(defun huffman-decode (code tree)
+  (let ((original code))
+   (labels ((pop-bit ()
+              (let* ((bits (integer-length code))
+                     (bit (ldb (byte 1 (- bits 2)) code)))
+                (setf code (dpb 1 (byte 1 (- bits 2))
+                                (ldb (byte (- bits 1) 0) code)))
+                bit))
+            (choose (branch)
+              (destructuring-bind (key left right) branch
+                  (declare (ignore key))
+                (if (zerop (pop-bit))
+                    left
+                    right)))
+            (decode (branch)
+              (when (zerop code)
+                (error "Invalid Huffman-code: ~S" original))
+              (let ((next (choose branch)))
+                 (cond ((consp next)
+                        (decode next))
+                       ((< 1 code)
+                        (concatenate 'string next (decode tree)))
+                       (t
+                        next)))))
+     (decode tree))))
+
+(defun huffman-match (char node)
+  (if (consp node)
+      (find char (the string (car node)) :test #'equal)
+      (eql char (character node))))
+
+(defun huffman-encode (string tree)
+  (let ((code 1))
+    (labels ((encode (bit char tree)
+               (when bit
+                 (setf code (+ (ash code 1) bit)))
+               (if (consp tree)
+                   (destructuring-bind (key left right) tree
+                     (declare (ignore key))
+                     (cond ((huffman-match char left)
+                            (encode 0 char left))
+                           ((huffman-match char right)
+                            (encode 1 char right))
+                           (t
+                            ;; unknown
+                            (return-from huffman-encode nil))))
+                   (unless (huffman-match char tree)
+                     (error "Error encoding ~S (bad tree)." char)))))
+      (loop for char across string
+         do (encode nil char tree))
+      code)))
index 68b02d0..92d529b 100644 (file)
 (defvar *character-database*)
 (declaim (type (simple-array (unsigned-byte 8) (*)) *character-database*))
 
+#!+sb-unicode
+(progn
+ (defvar *unicode-character-name-database*)
+ (defvar *unicode-character-name-huffman-tree*))
+
 (macrolet ((frob ()
-             (with-open-file (stream (merge-pathnames
-                                      (make-pathname
-                                       :directory
-                                       '(:relative :up :up "output")
-                                       :name "ucd"
-                                       :type "dat")
-                                      sb!xc:*compile-file-truename*)
-                                     :direction :input
-                                     :element-type '(unsigned-byte 8))
-               (let* ((length (file-length stream))
-                      (array (make-array length
-                                         :element-type '(unsigned-byte 8))))
-                 (read-sequence array stream)
-                 `(defun !character-database-cold-init ()
-                    (setq *character-database* ',array))))))
+             (flet ((file (name type)
+                      (merge-pathnames (make-pathname
+                                        :directory
+                                        '(:relative :up :up "output")
+                                        :name name :type type)
+                                       sb!xc:*compile-file-truename*)))
+               `(progn
+                  ,(with-open-file (stream (file "ucd" "dat")
+                                           :direction :input
+                                           :element-type '(unsigned-byte 8))
+                     (let* ((length (file-length stream))
+                            (array (make-array
+                                    length :element-type '(unsigned-byte 8))))
+                       (read-sequence array stream)
+                       `(defun !character-database-cold-init ()
+                          (setq *character-database* ',array))))
+                  ,(with-open-file (stream (file "ucd-names" "lisp-expr")
+                                           :direction :input
+                                           :element-type 'character)
+                     (let ((names (make-hash-table)))
+                       #!+sb-unicode
+                       (loop
+                          for code-point = (read stream nil nil)
+                          for char-name = (string-upcase (read stream nil nil))
+                          while code-point
+                          do (setf (gethash code-point names) char-name))
+                       (let ((tree (make-huffman-tree
+                                    (let (list)
+                                      (maphash (lambda (code name)
+                                                 (declare (ignore code))
+                                                 (push name list))
+                                               names)
+                                      list)))
+                             (code->name
+                              (make-array (hash-table-count names)
+                                          :fill-pointer 0))
+                             (name->code nil))
+                         (maphash (lambda (code name)
+                                    (vector-push
+                                     (cons code (huffman-encode name tree))
+                                     code->name))
+                                   names)
+                         (setf name->code
+                               (sort (copy-seq code->name) #'< :key #'cdr))
+                         (setf code->name
+                               (sort (copy-seq name->code) #'< :key #'car))
+                         (setf names nil)
+                         `(defun !character-name-database-cold-init ()
+                            #!+sb-unicode
+                            (setq *unicode-character-name-database*
+                                  (cons ',code->name ',name->code)
+                                  *unicode-character-name-huffman-tree* ',tree)))))))))
   (frob))
 #+sb-xc-host (!character-database-cold-init)
+#+sb-xc-host (!character-name-database-cold-init)
 
-;;; This is the alist of (character-name . character) for characters
-;;; with long names. The first name in this list for a given character
-;;; is used on typeout and is the preferred form for input.
-(macrolet ((frob (char-names-list)
-             (collect ((results))
-               (dolist (code char-names-list)
-                 (destructuring-bind (ccode names) code
-                   (dolist (name names)
-                     (results (cons name ccode)))))
-               `(defparameter *char-name-alist*
-                 (mapcar (lambda (x) (cons (car x) (code-char (cdr x))))
-                         ',(results))))))
+(defparameter *base-char-name-alist*
   ;; Note: The *** markers here indicate character names which are
   ;; required by the ANSI specification of #'CHAR-NAME. For the others,
   ;; we prefer the ASCII standard name.
-  (frob ((#x00 ("Nul" "Null" "^@"))
-         (#x01 ("Soh" "^a"))
-         (#x02 ("Stx" "^b"))
-         (#x03 ("Etx" "^c"))
-         (#x04 ("Eot" "^d"))
-         (#x05 ("Enq" "^e"))
-         (#x06 ("Ack" "^f"))
-         (#x07 ("Bel" "Bell" "^g"))
-         (#x08 ("Backspace" "^h" "Bs")) ; *** See Note above.
-         (#x09 ("Tab" "^i" "Ht")) ; *** See Note above.
-         (#x0A ("Newline" "Linefeed" "^j" "Lf" "Nl" )) ; *** See Note above.
-         (#x0B ("Vt" "^k"))
-         (#x0C ("Page" "^l" "Form" "Formfeed" "Ff" "Np")) ; *** See Note above.
-         (#x0D ("Return" "^m" "Cr")) ; *** See Note above.
-         (#x0E ("So" "^n"))
-         (#x0F ("Si" "^o"))
-         (#x10 ("Dle" "^p"))
-         (#x11 ("Dc1" "^q"))
-         (#x12 ("Dc2" "^r"))
-         (#x13 ("Dc3" "^s"))
-         (#x14 ("Dc4" "^t"))
-         (#x15 ("Nak" "^u"))
-         (#x16 ("Syn" "^v"))
-         (#x17 ("Etb" "^w"))
-         (#x18 ("Can" "^x"))
-         (#x19 ("Em" "^y"))
-         (#x1A ("Sub" "^z"))
-         (#x1B ("Esc" "Escape" "^[" "Altmode" "Alt"))
-         (#x1C ("Fs" "^\\"))
-         (#x1D ("Gs" "^]"))
-         (#x1E ("Rs" "^^"))
-         (#x1F ("Us" "^_"))
-         (#x20 ("Space" "Sp")) ; *** See Note above.
-         (#x7f ("Rubout" "Delete" "Del"))
-         (#x80 ("C80"))
-         (#x81 ("C81"))
-         (#x82 ("Break-Permitted"))
-         (#x83 ("No-Break-Permitted"))
-         (#x84 ("C84"))
-         (#x85 ("Next-Line"))
-         (#x86 ("Start-Selected-Area"))
-         (#x87 ("End-Selected-Area"))
-         (#x88 ("Character-Tabulation-Set"))
-         (#x89 ("Character-Tabulation-With-Justification"))
-         (#x8A ("Line-Tabulation-Set"))
-         (#x8B ("Partial-Line-Forward"))
-         (#x8C ("Partial-Line-Backward"))
-         (#x8D ("Reverse-Linefeed"))
-         (#x8E ("Single-Shift-Two"))
-         (#x8F ("Single-Shift-Three"))
-         (#x90 ("Device-Control-String"))
-         (#x91 ("Private-Use-One"))
-         (#x92 ("Private-Use-Two"))
-         (#x93 ("Set-Transmit-State"))
-         (#x94 ("Cancel-Character"))
-         (#x95 ("Message-Waiting"))
-         (#x96 ("Start-Guarded-Area"))
-         (#x97 ("End-Guarded-Area"))
-         (#x98 ("Start-String"))
-         (#x99 ("C99"))
-         (#x9A ("Single-Character-Introducer"))
-         (#x9B ("Control-Sequence-Introducer"))
-         (#x9C ("String-Terminator"))
-         (#x9D ("Operating-System-Command"))
-         (#x9E ("Privacy-Message"))
-         (#x9F ("Application-Program-Command"))))) ; *** See Note above.
+  '((#x00 "Nul" "Null" "^@")
+    (#x01 "Soh" "^a")
+    (#x02 "Stx" "^b")
+    (#x03 "Etx" "^c")
+    (#x04 "Eot" "^d")
+    (#x05 "Enq" "^e")
+    (#x06 "Ack" "^f")
+    (#x07 "Bel" "Bell" "^g")
+    (#x08 "Backspace" "^h" "Bs") ; *** See Note above
+    (#x09 "Tab" "^i" "Ht") ; *** See Note above
+    (#x0A "Newline" "Linefeed" "^j" "Lf" "Nl") ; *** See Note above
+    (#x0B "Vt" "^k")
+    (#x0C "Page" "^l" "Form" "Formfeed" "Ff" "Np") ; *** See Note above
+    (#x0D "Return" "^m" "Cr") ; *** See Note above
+    (#x0E "So" "^n")
+    (#x0F "Si" "^o")
+    (#x10 "Dle" "^p")
+    (#x11 "Dc1" "^q")
+    (#x12 "Dc2" "^r")
+    (#x13 "Dc3" "^s")
+    (#x14 "Dc4" "^t")
+    (#x15 "Nak" "^u")
+    (#x16 "Syn" "^v")
+    (#x17 "Etb" "^w")
+    (#x18 "Can" "^x")
+    (#x19 "Em" "^y")
+    (#x1A "Sub" "^z")
+    (#x1B "Esc" "Escape" "^[" "Altmode" "Alt")
+    (#x1C "Fs" "^\\")
+    (#x1D "Gs" "^]")
+    (#x1E "Rs" "^^")
+    (#x1F "Us" "^_")
+    (#x20 "Space" "Sp") ; *** See Note above
+    (#x7f "Rubout" "Delete" "Del")
+    (#x80 "C80")
+    (#x81 "C81")
+    (#x82 "Break-Permitted")
+    (#x83 "No-Break-Permitted")
+    (#x84 "C84")
+    (#x85 "Next-Line")
+    (#x86 "Start-Selected-Area")
+    (#x87 "End-Selected-Area")
+    (#x88 "Character-Tabulation-Set")
+    (#x89 "Character-Tabulation-With-Justification")
+    (#x8A "Line-Tabulation-Set")
+    (#x8B "Partial-Line-Forward")
+    (#x8C "Partial-Line-Backward")
+    (#x8D "Reverse-Linefeed")
+    (#x8E "Single-Shift-Two")
+    (#x8F "Single-Shift-Three")
+    (#x90 "Device-Control-String")
+    (#x91 "Private-Use-One")
+    (#x92 "Private-Use-Two")
+    (#x93 "Set-Transmit-State")
+    (#x94 "Cancel-Character")
+    (#x95 "Message-Waiting")
+    (#x96 "Start-Guarded-Area")
+    (#x97 "End-Guarded-Area")
+    (#x98 "Start-String")
+    (#x99 "C99")
+    (#x9A "Single-Character-Introducer")
+    (#x9B "Control-Sequence-Introducer")
+    (#x9C "String-Terminator")
+    (#x9D "Operating-System-Command")
+    (#x9E "Privacy-Message")
+    (#x9F "Application-Program-Command"))) ; *** See Note above
 \f
 ;;;; accessor functions
 
 (defun char-name (char)
   #!+sb-doc
   "Return the name (a STRING) for a CHARACTER object."
-  (car (rassoc char *char-name-alist*)))
+  (let ((char-code (char-code char)))
+    (or (second (assoc char-code *base-char-name-alist*))
+        #!+sb-unicode
+        (let ((h-code (cdr (binary-search char-code
+                                          (car *unicode-character-name-database*)
+                                          :key #'car))))
+          (when h-code
+            (huffman-decode h-code *unicode-character-name-huffman-tree*))))))
 
 (defun name-char (name)
   #!+sb-doc
   "Given an argument acceptable to STRING, NAME-CHAR returns a character
   whose name is that string, if one exists. Otherwise, NIL is returned."
-  (cdr (assoc (string name) *char-name-alist* :test #'string-equal)))
+  (or (let ((char-code (car (rassoc-if (lambda (names)
+                                         (member name names :test #'string-equal))
+                                       *base-char-name-alist*))))
+        (when char-code
+          (code-char char-code)))
+      #!+sb-unicode
+      (let ((encoding (huffman-encode (string-upcase name)
+                                       *unicode-character-name-huffman-tree*)))
+        (when encoding
+          (let ((char-code
+                 (car (binary-search encoding
+                                     (cdr *unicode-character-name-database*)
+                                     :key #'cdr))))
+            (when char-code
+              (code-char char-code)))))))
 \f
 ;;;; predicates
 
index 62f062a..f97607b 100644 (file)
@@ -34,6 +34,24 @@ reserved for user and applications.")
 order when a saved core image starts up, after the system itself has
 been initialized. Unused by SBCL itself: reserved for user and
 applications.")
+
+\f
+;;; Binary search for simple vectors
+(defun binary-search (value seq &key (key #'identity))
+  (declare (simple-vector seq))
+  (labels ((recurse (start end)
+             (when (< start end)
+               (let* ((i (+ start (truncate (- end start) 2)))
+                      (elt (svref seq i))
+                      (key-value (funcall key elt)))
+                 (cond ((< value key-value)
+                        (recurse start i))
+                       ((> value key-value)
+                        (recurse (1+ i) end))
+                       (t
+                        elt))))))
+    (recurse 0 (length seq))))
+
 \f
 ;;; like LISTEN, but any whitespace in the input stream will be flushed
 (defun listen-skip-whitespace (&optional (stream *standard-input*))
@@ -62,3 +80,4 @@ applications.")
          :format-control "~@<~A: ~2I~_~A~:>"
          :format-arguments (list prefix-string (strerror errno))
          other-condition-args))
+
index 1a35bbb..56e899d 100644 (file)
       (let ((coded-char-name (char-name coded-char)))
         (assert (string= name coded-char-name))))))
 
+;;; Trivial tests for some unicode names
+#+sb-unicode
+(dolist (d '(("LATIN_CAPITAL_LETTER_A" 65)
+             ("LATIN_SMALL_LETTER_A" 97)
+             ("LATIN_SMALL_LETTER_CLOSED_OPEN_E" 666)
+             ("DIGRAM_FOR_GREATER_YIN" 9871)))
+  (destructuring-bind (name code) d
+    (assert (eql (code-char code) (name-char (string-downcase name))))
+    (assert (equal name (char-name (code-char code))))))
+
 ;;; bug 230: CHAR= didn't check types of &REST arguments
 (dolist (form '((code-char char-code-limit)
                 (standard-char-p "a")
index eb21782..8ddcb67 100644 (file)
 
 ;;; Generator
 
+(defstruct ucd misc transform)
+
 (defparameter *unicode-character-database*
   (make-pathname :directory (pathname-directory *load-truename*)))
 
 (defparameter *ucd-base* nil)
+(defparameter *unicode-names* (make-hash-table))
 
 (defparameter *last-uppercase* nil)
 (defparameter *uppercase-transition-count* 0)
 
 (defparameter *block-first* nil)
 
+(defun normalize-character-name (name)
+  (when (find #\_ name)
+    (error "Bad name for a character: ~A" name))
+  (unless (or (zerop (length name)) (find #\< name) (find #\> name))
+    (substitute #\_ #\Space name)))
+
 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
 ;;;   D800  --  F8FF  : surrogates and private use
              (setq *last-uppercase* nil)))
           (when (> ccc-index 255)
             (error "canonical combining class too large ~A" ccc-index))
-          (let ((result (vector misc-index (or upper-index lower-index 0))))
+          (let ((result (make-ucd :misc misc-index
+                                  :transform (or upper-index lower-index 0))))
             (when (and (> (length name) 7)
                        (string= ", Last>" name :start2 (- (length name) 7)))
               (let ((page-start (ash (+ *block-first*
                       do (setf (aref (aref *ucd-base* (cp-high point))
                                      (cp-low point))
                                result))))
-            result)))))
+            (values result (normalize-character-name name)))))))
 
 (defun slurp-ucd-line (line)
   (let* ((split-line (split-string line #\;))
       (setf (aref *ucd-base* code-high)
             (make-array (ash 1 *page-size-exponent*)
                         :initial-element nil)))
-    (setf (aref (aref *ucd-base* code-high) code-low)
-          (encode-ucd-line (cdr split-line) code-point))))
+    (multiple-value-bind (encoding name)
+        (encode-ucd-line (cdr split-line) code-point)
+      (setf (aref (aref *ucd-base* code-high) code-low) encoding
+            (gethash code-point *unicode-names*) name))))
 
 (defun second-pass ()
   (loop for i from 0 below (length *ucd-base*)
         do (loop for j from 0 below (length (aref *ucd-base* i))
                  for result = (aref (aref *ucd-base* i) j)
                  when result
-                 when (let* ((transform-point (aref result 1))
+                 when (let* ((transform-point (ucd-transform result))
                              (transform-high (ash transform-point
                                                   (- *page-size-exponent*)))
                              (transform-low (ldb (byte *page-size-exponent* 0)
                                                  transform-point)))
                         (and (plusp transform-point)
-                             (/= (aref (aref (aref *ucd-base* transform-high)
-                                             transform-low)
-                                       1)
+                             (/= (ucd-transform
+                                  (aref (aref *ucd-base* transform-high)
+                                        transform-low))
                                  (+ (ash i *page-size-exponent*) j))))
                  do (destructuring-bind (gc-index bidi-index ccc-index
                                          decimal-digit digit bidi-mirrored
                                          cl-both-case-p)
-                        (aref *misc-table* (aref result 0))
+                        (aref *misc-table* (ucd-misc result))
                       (declare (ignore cl-both-case-p))
                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
-                      (setf (aref result 0)
+                      (setf (ucd-misc result)
                             (hash-misc gc-index bidi-index ccc-index
                                        decimal-digit digit bidi-mirrored
                                        nil))))))
               do (write-byte 0 stream)
               do (write-byte 0 stream))
         (loop for page across *ucd-base*
-              do (write-byte (if page (gethash page hash) 0) stream))
+           do (write-byte (if page (gethash page hash) 0) stream))
         (loop for page across array
-              do (loop for entry across page
-                       do (write-byte (if entry
-                                          (aref *misc-mapping* (aref entry 0))
-                                          255)
-                                      stream)
-                       do (write-3-byte (if entry (aref entry 1) 0)
-                                        stream))))))
+           do (loop for entry across page
+                 do (write-byte (if entry
+                                    (aref *misc-mapping* (ucd-misc entry))
+                                    255)
+                                stream)
+                 do (write-3-byte (if entry (ucd-transform entry) 0)
+                                  stream))))))
+  (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
+                                    :defaults *output-directory*)
+                     :direction :output
+                     :if-exists :supersede
+                     :if-does-not-exist :create)
+    (with-standard-io-syntax
+      (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
+      (maphash (lambda (code name)
+                 (when name
+                  (print code f)
+                  (prin1 name f)))
+               *unicode-names*))
+    (setf *unicode-names* nil))
   (with-open-file (*standard-output*
                    (make-pathname :name "numerics"
                                   :type "lisp-expr"
index 804c615..1c09ad3 100644 (file)
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.9.10"
+"0.9.10.1"