("src/code/irrat" :not-host)
("src/code/char")
+ ("src/code/huffman")
("src/code/target-char" :not-host)
("src/code/target-misc" :not-host)
("src/code/misc")
(show-and-call !random-cold-init)
(show-and-call !character-database-cold-init)
+ (show-and-call !character-name-database-cold-init)
(show-and-call !early-package-cold-init)
(show-and-call !package-cold-init)
--- /dev/null
+;;;; a simple huffman encoder/decoder, used to compress unicode
+;;;; character names.
+
+;;;; This software is part of the SBCL system. See the README file for
+;;;; more information.
+;;;;
+;;;; This software is derived from the CMU CL system, which was
+;;;; written at Carnegie Mellon University and released into the
+;;;; public domain. The software is in the public domain and is
+;;;; provided with absolutely no warranty. See the COPYING and CREDITS
+;;;; files for more information.
+
+(in-package "SB!IMPL")
+
+(defstruct (huffman-node (:constructor make-huffman-node (key weight)))
+ key weight)
+
+(defstruct (huffman-pair
+ (:include huffman-node)
+ (:constructor make-huffman-pair
+ (left right &aux
+ (key (concatenate 'string
+ (huffman-node-key left)
+ (huffman-node-key right)))
+ (weight (+ (huffman-node-weight left)
+ (huffman-node-weight right))))))
+ left right)
+
+(defun huffman-weights (corpus)
+ (let ((weight-table (make-hash-table :test #'equal)))
+ (loop for string in corpus
+ do (loop for char across string
+ do (incf (gethash char weight-table 0))))
+ (let (alist)
+ (maphash (lambda (char weight)
+ (push (make-huffman-node (string char) weight) alist))
+ weight-table)
+ alist)))
+
+(defun make-huffman-tree (corpus)
+ (labels ((merge-table (table)
+ (setf table (sort table #'< :key #'huffman-node-weight))
+ (push (make-huffman-pair (pop table) (pop table))
+ table)
+ (if (second table)
+ (merge-table table)
+ (car table)))
+ (finish-tree (tree)
+ (if (huffman-pair-p tree)
+ (list (huffman-node-key tree)
+ (finish-tree (huffman-pair-left tree))
+ (finish-tree (huffman-pair-right tree)))
+ (huffman-node-key tree))))
+ (finish-tree (merge-table (huffman-weights corpus)))))
+
+(defun huffman-decode (code tree)
+ (let ((original code))
+ (labels ((pop-bit ()
+ (let* ((bits (integer-length code))
+ (bit (ldb (byte 1 (- bits 2)) code)))
+ (setf code (dpb 1 (byte 1 (- bits 2))
+ (ldb (byte (- bits 1) 0) code)))
+ bit))
+ (choose (branch)
+ (destructuring-bind (key left right) branch
+ (declare (ignore key))
+ (if (zerop (pop-bit))
+ left
+ right)))
+ (decode (branch)
+ (when (zerop code)
+ (error "Invalid Huffman-code: ~S" original))
+ (let ((next (choose branch)))
+ (cond ((consp next)
+ (decode next))
+ ((< 1 code)
+ (concatenate 'string next (decode tree)))
+ (t
+ next)))))
+ (decode tree))))
+
+(defun huffman-match (char node)
+ (if (consp node)
+ (find char (the string (car node)) :test #'equal)
+ (eql char (character node))))
+
+(defun huffman-encode (string tree)
+ (let ((code 1))
+ (labels ((encode (bit char tree)
+ (when bit
+ (setf code (+ (ash code 1) bit)))
+ (if (consp tree)
+ (destructuring-bind (key left right) tree
+ (declare (ignore key))
+ (cond ((huffman-match char left)
+ (encode 0 char left))
+ ((huffman-match char right)
+ (encode 1 char right))
+ (t
+ ;; unknown
+ (return-from huffman-encode nil))))
+ (unless (huffman-match char tree)
+ (error "Error encoding ~S (bad tree)." char)))))
+ (loop for char across string
+ do (encode nil char tree))
+ code)))
(defvar *character-database*)
(declaim (type (simple-array (unsigned-byte 8) (*)) *character-database*))
+#!+sb-unicode
+(progn
+ (defvar *unicode-character-name-database*)
+ (defvar *unicode-character-name-huffman-tree*))
+
(macrolet ((frob ()
- (with-open-file (stream (merge-pathnames
- (make-pathname
- :directory
- '(:relative :up :up "output")
- :name "ucd"
- :type "dat")
- sb!xc:*compile-file-truename*)
- :direction :input
- :element-type '(unsigned-byte 8))
- (let* ((length (file-length stream))
- (array (make-array length
- :element-type '(unsigned-byte 8))))
- (read-sequence array stream)
- `(defun !character-database-cold-init ()
- (setq *character-database* ',array))))))
+ (flet ((file (name type)
+ (merge-pathnames (make-pathname
+ :directory
+ '(:relative :up :up "output")
+ :name name :type type)
+ sb!xc:*compile-file-truename*)))
+ `(progn
+ ,(with-open-file (stream (file "ucd" "dat")
+ :direction :input
+ :element-type '(unsigned-byte 8))
+ (let* ((length (file-length stream))
+ (array (make-array
+ length :element-type '(unsigned-byte 8))))
+ (read-sequence array stream)
+ `(defun !character-database-cold-init ()
+ (setq *character-database* ',array))))
+ ,(with-open-file (stream (file "ucd-names" "lisp-expr")
+ :direction :input
+ :element-type 'character)
+ (let ((names (make-hash-table)))
+ #!+sb-unicode
+ (loop
+ for code-point = (read stream nil nil)
+ for char-name = (string-upcase (read stream nil nil))
+ while code-point
+ do (setf (gethash code-point names) char-name))
+ (let ((tree (make-huffman-tree
+ (let (list)
+ (maphash (lambda (code name)
+ (declare (ignore code))
+ (push name list))
+ names)
+ list)))
+ (code->name
+ (make-array (hash-table-count names)
+ :fill-pointer 0))
+ (name->code nil))
+ (maphash (lambda (code name)
+ (vector-push
+ (cons code (huffman-encode name tree))
+ code->name))
+ names)
+ (setf name->code
+ (sort (copy-seq code->name) #'< :key #'cdr))
+ (setf code->name
+ (sort (copy-seq name->code) #'< :key #'car))
+ (setf names nil)
+ `(defun !character-name-database-cold-init ()
+ #!+sb-unicode
+ (setq *unicode-character-name-database*
+ (cons ',code->name ',name->code)
+ *unicode-character-name-huffman-tree* ',tree)))))))))
(frob))
#+sb-xc-host (!character-database-cold-init)
+#+sb-xc-host (!character-name-database-cold-init)
-;;; This is the alist of (character-name . character) for characters
-;;; with long names. The first name in this list for a given character
-;;; is used on typeout and is the preferred form for input.
-(macrolet ((frob (char-names-list)
- (collect ((results))
- (dolist (code char-names-list)
- (destructuring-bind (ccode names) code
- (dolist (name names)
- (results (cons name ccode)))))
- `(defparameter *char-name-alist*
- (mapcar (lambda (x) (cons (car x) (code-char (cdr x))))
- ',(results))))))
+(defparameter *base-char-name-alist*
;; Note: The *** markers here indicate character names which are
;; required by the ANSI specification of #'CHAR-NAME. For the others,
;; we prefer the ASCII standard name.
- (frob ((#x00 ("Nul" "Null" "^@"))
- (#x01 ("Soh" "^a"))
- (#x02 ("Stx" "^b"))
- (#x03 ("Etx" "^c"))
- (#x04 ("Eot" "^d"))
- (#x05 ("Enq" "^e"))
- (#x06 ("Ack" "^f"))
- (#x07 ("Bel" "Bell" "^g"))
- (#x08 ("Backspace" "^h" "Bs")) ; *** See Note above.
- (#x09 ("Tab" "^i" "Ht")) ; *** See Note above.
- (#x0A ("Newline" "Linefeed" "^j" "Lf" "Nl" )) ; *** See Note above.
- (#x0B ("Vt" "^k"))
- (#x0C ("Page" "^l" "Form" "Formfeed" "Ff" "Np")) ; *** See Note above.
- (#x0D ("Return" "^m" "Cr")) ; *** See Note above.
- (#x0E ("So" "^n"))
- (#x0F ("Si" "^o"))
- (#x10 ("Dle" "^p"))
- (#x11 ("Dc1" "^q"))
- (#x12 ("Dc2" "^r"))
- (#x13 ("Dc3" "^s"))
- (#x14 ("Dc4" "^t"))
- (#x15 ("Nak" "^u"))
- (#x16 ("Syn" "^v"))
- (#x17 ("Etb" "^w"))
- (#x18 ("Can" "^x"))
- (#x19 ("Em" "^y"))
- (#x1A ("Sub" "^z"))
- (#x1B ("Esc" "Escape" "^[" "Altmode" "Alt"))
- (#x1C ("Fs" "^\\"))
- (#x1D ("Gs" "^]"))
- (#x1E ("Rs" "^^"))
- (#x1F ("Us" "^_"))
- (#x20 ("Space" "Sp")) ; *** See Note above.
- (#x7f ("Rubout" "Delete" "Del"))
- (#x80 ("C80"))
- (#x81 ("C81"))
- (#x82 ("Break-Permitted"))
- (#x83 ("No-Break-Permitted"))
- (#x84 ("C84"))
- (#x85 ("Next-Line"))
- (#x86 ("Start-Selected-Area"))
- (#x87 ("End-Selected-Area"))
- (#x88 ("Character-Tabulation-Set"))
- (#x89 ("Character-Tabulation-With-Justification"))
- (#x8A ("Line-Tabulation-Set"))
- (#x8B ("Partial-Line-Forward"))
- (#x8C ("Partial-Line-Backward"))
- (#x8D ("Reverse-Linefeed"))
- (#x8E ("Single-Shift-Two"))
- (#x8F ("Single-Shift-Three"))
- (#x90 ("Device-Control-String"))
- (#x91 ("Private-Use-One"))
- (#x92 ("Private-Use-Two"))
- (#x93 ("Set-Transmit-State"))
- (#x94 ("Cancel-Character"))
- (#x95 ("Message-Waiting"))
- (#x96 ("Start-Guarded-Area"))
- (#x97 ("End-Guarded-Area"))
- (#x98 ("Start-String"))
- (#x99 ("C99"))
- (#x9A ("Single-Character-Introducer"))
- (#x9B ("Control-Sequence-Introducer"))
- (#x9C ("String-Terminator"))
- (#x9D ("Operating-System-Command"))
- (#x9E ("Privacy-Message"))
- (#x9F ("Application-Program-Command"))))) ; *** See Note above.
+ '((#x00 "Nul" "Null" "^@")
+ (#x01 "Soh" "^a")
+ (#x02 "Stx" "^b")
+ (#x03 "Etx" "^c")
+ (#x04 "Eot" "^d")
+ (#x05 "Enq" "^e")
+ (#x06 "Ack" "^f")
+ (#x07 "Bel" "Bell" "^g")
+ (#x08 "Backspace" "^h" "Bs") ; *** See Note above
+ (#x09 "Tab" "^i" "Ht") ; *** See Note above
+ (#x0A "Newline" "Linefeed" "^j" "Lf" "Nl") ; *** See Note above
+ (#x0B "Vt" "^k")
+ (#x0C "Page" "^l" "Form" "Formfeed" "Ff" "Np") ; *** See Note above
+ (#x0D "Return" "^m" "Cr") ; *** See Note above
+ (#x0E "So" "^n")
+ (#x0F "Si" "^o")
+ (#x10 "Dle" "^p")
+ (#x11 "Dc1" "^q")
+ (#x12 "Dc2" "^r")
+ (#x13 "Dc3" "^s")
+ (#x14 "Dc4" "^t")
+ (#x15 "Nak" "^u")
+ (#x16 "Syn" "^v")
+ (#x17 "Etb" "^w")
+ (#x18 "Can" "^x")
+ (#x19 "Em" "^y")
+ (#x1A "Sub" "^z")
+ (#x1B "Esc" "Escape" "^[" "Altmode" "Alt")
+ (#x1C "Fs" "^\\")
+ (#x1D "Gs" "^]")
+ (#x1E "Rs" "^^")
+ (#x1F "Us" "^_")
+ (#x20 "Space" "Sp") ; *** See Note above
+ (#x7f "Rubout" "Delete" "Del")
+ (#x80 "C80")
+ (#x81 "C81")
+ (#x82 "Break-Permitted")
+ (#x83 "No-Break-Permitted")
+ (#x84 "C84")
+ (#x85 "Next-Line")
+ (#x86 "Start-Selected-Area")
+ (#x87 "End-Selected-Area")
+ (#x88 "Character-Tabulation-Set")
+ (#x89 "Character-Tabulation-With-Justification")
+ (#x8A "Line-Tabulation-Set")
+ (#x8B "Partial-Line-Forward")
+ (#x8C "Partial-Line-Backward")
+ (#x8D "Reverse-Linefeed")
+ (#x8E "Single-Shift-Two")
+ (#x8F "Single-Shift-Three")
+ (#x90 "Device-Control-String")
+ (#x91 "Private-Use-One")
+ (#x92 "Private-Use-Two")
+ (#x93 "Set-Transmit-State")
+ (#x94 "Cancel-Character")
+ (#x95 "Message-Waiting")
+ (#x96 "Start-Guarded-Area")
+ (#x97 "End-Guarded-Area")
+ (#x98 "Start-String")
+ (#x99 "C99")
+ (#x9A "Single-Character-Introducer")
+ (#x9B "Control-Sequence-Introducer")
+ (#x9C "String-Terminator")
+ (#x9D "Operating-System-Command")
+ (#x9E "Privacy-Message")
+ (#x9F "Application-Program-Command"))) ; *** See Note above
\f
;;;; accessor functions
(defun char-name (char)
#!+sb-doc
"Return the name (a STRING) for a CHARACTER object."
- (car (rassoc char *char-name-alist*)))
+ (let ((char-code (char-code char)))
+ (or (second (assoc char-code *base-char-name-alist*))
+ #!+sb-unicode
+ (let ((h-code (cdr (binary-search char-code
+ (car *unicode-character-name-database*)
+ :key #'car))))
+ (when h-code
+ (huffman-decode h-code *unicode-character-name-huffman-tree*))))))
(defun name-char (name)
#!+sb-doc
"Given an argument acceptable to STRING, NAME-CHAR returns a character
whose name is that string, if one exists. Otherwise, NIL is returned."
- (cdr (assoc (string name) *char-name-alist* :test #'string-equal)))
+ (or (let ((char-code (car (rassoc-if (lambda (names)
+ (member name names :test #'string-equal))
+ *base-char-name-alist*))))
+ (when char-code
+ (code-char char-code)))
+ #!+sb-unicode
+ (let ((encoding (huffman-encode (string-upcase name)
+ *unicode-character-name-huffman-tree*)))
+ (when encoding
+ (let ((char-code
+ (car (binary-search encoding
+ (cdr *unicode-character-name-database*)
+ :key #'cdr))))
+ (when char-code
+ (code-char char-code)))))))
\f
;;;; predicates
order when a saved core image starts up, after the system itself has
been initialized. Unused by SBCL itself: reserved for user and
applications.")
+
+\f
+;;; Binary search for simple vectors
+(defun binary-search (value seq &key (key #'identity))
+ (declare (simple-vector seq))
+ (labels ((recurse (start end)
+ (when (< start end)
+ (let* ((i (+ start (truncate (- end start) 2)))
+ (elt (svref seq i))
+ (key-value (funcall key elt)))
+ (cond ((< value key-value)
+ (recurse start i))
+ ((> value key-value)
+ (recurse (1+ i) end))
+ (t
+ elt))))))
+ (recurse 0 (length seq))))
+
\f
;;; like LISTEN, but any whitespace in the input stream will be flushed
(defun listen-skip-whitespace (&optional (stream *standard-input*))
:format-control "~@<~A: ~2I~_~A~:>"
:format-arguments (list prefix-string (strerror errno))
other-condition-args))
+
(let ((coded-char-name (char-name coded-char)))
(assert (string= name coded-char-name))))))
+;;; Trivial tests for some unicode names
+#+sb-unicode
+(dolist (d '(("LATIN_CAPITAL_LETTER_A" 65)
+ ("LATIN_SMALL_LETTER_A" 97)
+ ("LATIN_SMALL_LETTER_CLOSED_OPEN_E" 666)
+ ("DIGRAM_FOR_GREATER_YIN" 9871)))
+ (destructuring-bind (name code) d
+ (assert (eql (code-char code) (name-char (string-downcase name))))
+ (assert (equal name (char-name (code-char code))))))
+
;;; bug 230: CHAR= didn't check types of &REST arguments
(dolist (form '((code-char char-code-limit)
(standard-char-p "a")
;;; Generator
+(defstruct ucd misc transform)
+
(defparameter *unicode-character-database*
(make-pathname :directory (pathname-directory *load-truename*)))
(defparameter *ucd-base* nil)
+(defparameter *unicode-names* (make-hash-table))
(defparameter *last-uppercase* nil)
(defparameter *uppercase-transition-count* 0)
(defparameter *block-first* nil)
+(defun normalize-character-name (name)
+ (when (find #\_ name)
+ (error "Bad name for a character: ~A" name))
+ (unless (or (zerop (length name)) (find #\< name) (find #\> name))
+ (substitute #\_ #\Space name)))
+
;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
;;; D800 -- F8FF : surrogates and private use
(setq *last-uppercase* nil)))
(when (> ccc-index 255)
(error "canonical combining class too large ~A" ccc-index))
- (let ((result (vector misc-index (or upper-index lower-index 0))))
+ (let ((result (make-ucd :misc misc-index
+ :transform (or upper-index lower-index 0))))
(when (and (> (length name) 7)
(string= ", Last>" name :start2 (- (length name) 7)))
(let ((page-start (ash (+ *block-first*
do (setf (aref (aref *ucd-base* (cp-high point))
(cp-low point))
result))))
- result)))))
+ (values result (normalize-character-name name)))))))
(defun slurp-ucd-line (line)
(let* ((split-line (split-string line #\;))
(setf (aref *ucd-base* code-high)
(make-array (ash 1 *page-size-exponent*)
:initial-element nil)))
- (setf (aref (aref *ucd-base* code-high) code-low)
- (encode-ucd-line (cdr split-line) code-point))))
+ (multiple-value-bind (encoding name)
+ (encode-ucd-line (cdr split-line) code-point)
+ (setf (aref (aref *ucd-base* code-high) code-low) encoding
+ (gethash code-point *unicode-names*) name))))
(defun second-pass ()
(loop for i from 0 below (length *ucd-base*)
do (loop for j from 0 below (length (aref *ucd-base* i))
for result = (aref (aref *ucd-base* i) j)
when result
- when (let* ((transform-point (aref result 1))
+ when (let* ((transform-point (ucd-transform result))
(transform-high (ash transform-point
(- *page-size-exponent*)))
(transform-low (ldb (byte *page-size-exponent* 0)
transform-point)))
(and (plusp transform-point)
- (/= (aref (aref (aref *ucd-base* transform-high)
- transform-low)
- 1)
+ (/= (ucd-transform
+ (aref (aref *ucd-base* transform-high)
+ transform-low))
(+ (ash i *page-size-exponent*) j))))
do (destructuring-bind (gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
cl-both-case-p)
- (aref *misc-table* (aref result 0))
+ (aref *misc-table* (ucd-misc result))
(declare (ignore cl-both-case-p))
(format t "~A~%" (+ (ash i *page-size-exponent*) j))
- (setf (aref result 0)
+ (setf (ucd-misc result)
(hash-misc gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
nil))))))
do (write-byte 0 stream)
do (write-byte 0 stream))
(loop for page across *ucd-base*
- do (write-byte (if page (gethash page hash) 0) stream))
+ do (write-byte (if page (gethash page hash) 0) stream))
(loop for page across array
- do (loop for entry across page
- do (write-byte (if entry
- (aref *misc-mapping* (aref entry 0))
- 255)
- stream)
- do (write-3-byte (if entry (aref entry 1) 0)
- stream))))))
+ do (loop for entry across page
+ do (write-byte (if entry
+ (aref *misc-mapping* (ucd-misc entry))
+ 255)
+ stream)
+ do (write-3-byte (if entry (ucd-transform entry) 0)
+ stream))))))
+ (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
+ :defaults *output-directory*)
+ :direction :output
+ :if-exists :supersede
+ :if-does-not-exist :create)
+ (with-standard-io-syntax
+ (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
+ (maphash (lambda (code name)
+ (when name
+ (print code f)
+ (prin1 name f)))
+ *unicode-names*))
+ (setf *unicode-names* nil))
(with-open-file (*standard-output*
(make-pathname :name "numerics"
:type "lisp-expr"
;;; checkins which aren't released. (And occasionally for internal
;;; versions, especially for internal versions off the main CVS
;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"0.9.10"
+"0.9.10.1"