(defparameter *misc-table* nil)
(defparameter *misc-mapping* nil)
(defparameter *both-cases* nil)
-(defparameter *decompositions* nil)
-(defparameter *decomposition-length-max* nil)
+(defparameter *long-decompositions* nil)
(defparameter *decomposition-types* nil)
(defparameter *decomposition-base* nil)
(defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
- bidi-mirrored cl-both-case-p)
+ bidi-mirrored cl-both-case-p decomposition-info)
(let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
- bidi-mirrored cl-both-case-p))
+ bidi-mirrored cl-both-case-p decomposition-info))
(index (gethash list *misc-hash*)))
(or index
(progn
(setf (gethash list *misc-hash*)
(incf *misc-index*))))))
+(defun gc-index-sort-key (gc-index)
+ (or (cdr (assoc gc-index '((1 . 2) (2 . 1)))) gc-index))
+
(defun compare-misc-entry (left right)
(destructuring-bind (left-gc-index left-bidi-index left-ccc-index
left-decimal-digit left-digit left-bidi-mirrored
- left-cl-both-case-p)
+ left-cl-both-case-p left-decomposition-info)
left
(destructuring-bind (right-gc-index right-bidi-index right-ccc-index
right-decimal-digit right-digit right-bidi-mirrored
- right-cl-both-case-p)
+ right-cl-both-case-p right-decomposition-info)
right
(or (and left-cl-both-case-p (not right-cl-both-case-p))
(and (or left-cl-both-case-p (not right-cl-both-case-p))
- (or (< left-gc-index right-gc-index)
+ (or (< (gc-index-sort-key left-gc-index)
+ (gc-index-sort-key right-gc-index))
(and (= left-gc-index right-gc-index)
- (or (< left-bidi-index right-bidi-index)
- (and (= left-bidi-index right-bidi-index)
- (or (< left-ccc-index right-ccc-index)
- (and (= left-ccc-index right-ccc-index)
- (or (string< left-decimal-digit
- right-decimal-digit)
- (and (string= left-decimal-digit
- right-decimal-digit)
- (or (string< left-digit right-digit)
- (and (string= left-digit
- right-digit)
- (string< left-bidi-mirrored
- right-bidi-mirrored))))))))))))))))
+ (or (< left-decomposition-info right-decomposition-info)
+ (and (= left-decomposition-info right-decomposition-info)
+ (or (< left-bidi-index right-bidi-index)
+ (and (= left-bidi-index right-bidi-index)
+ (or (< left-ccc-index right-ccc-index)
+ (and (= left-ccc-index right-ccc-index)
+ (or (string< left-decimal-digit
+ right-decimal-digit)
+ (and (string= left-decimal-digit
+ right-decimal-digit)
+ (or (string< left-digit right-digit)
+ (and (string= left-digit
+ right-digit)
+ (string< left-bidi-mirrored
+ right-bidi-mirrored))))))))))))))))))
(defun build-misc-table ()
- (sort *misc-table* #'compare-misc-entry)
+ (let ((table (sort *misc-table* #'compare-misc-entry)))
+ ;; after sorting, insert at the end a special entry to handle
+ ;; unallocated characters.
+ (setf *misc-table* (make-array (1+ (length table))))
+ (replace *misc-table* table)
+ (setf (aref *misc-table* (length table))
+ ;; unallocated characters have a GC index of 31 (not
+ ;; colliding with any other GC), are not digits or decimal
+ ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't
+ ;; interestingly bidi or combining.
+ '(31 0 0 "" "" "" nil 0)))
(setq *misc-mapping* (make-array (1+ *misc-index*)))
(loop for i from 0 to *misc-index*
- do (setf (aref *misc-mapping*
- (gethash (aref *misc-table* i) *misc-hash*))
- i)))
+ do (setf (aref *misc-mapping*
+ (gethash (aref *misc-table* i) *misc-hash*))
+ i)))
+
+(defvar *comp-table*)
+
+(defvar *exclusions*
+ (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt"
+ :defaults *unicode-character-database*))
+ (do ((line (read-line s nil nil) (read-line s nil nil))
+ result)
+ ((null line) result)
+ (when (and (> (length line) 0)
+ (char/= (char line 0) #\#))
+ (push (parse-integer line :end (position #\Space line) :radix 16)
+ result)))))
(defun slurp-ucd ()
+ (setf *comp-table* (make-hash-table :test 'equal))
(setq *last-uppercase* nil)
(setq *uppercase-transition-count* 0)
(setq *different-titlecases* nil)
(setq *name-size* 0)
(setq *misc-hash* (make-hash-table :test #'equal))
(setq *misc-index* -1)
- (setq *misc-table* (make-array 256 :fill-pointer 0))
+ (setq *misc-table* (make-array 2048 :fill-pointer 0))
(setq *both-cases* nil)
- (setq *decompositions* 0)
- (setq *decomposition-types* (make-hash-table :test #'equal))
- (setq *decomposition-length-max* 0)
+ (setq *long-decompositions*
+ (make-array 2048 :fill-pointer 0 :adjustable t))
+ (setq *decomposition-types*
+ (let ((array (make-array 256 :initial-element nil :fill-pointer 1)))
+ (vector-push "" array)
+ (vector-push "<compat>" array)
+ array))
(setq *decomposition-base* (make-array (ash #x110000
(- *page-size-exponent*))
:initial-element nil))
while line
do (slurp-ucd-line line)))
(second-pass)
- (build-misc-table)
+ (fixup-compositions)
(fixup-hangul-syllables)
- *decompositions*)
+ (build-misc-table)
+ (length *long-decompositions*))
+
+(defun fixup-compositions ()
+ (flet ((fixup (k v)
+ (let* ((cp (car k))
+ (ucd (aref (aref *ucd-base* (cp-high cp)) (cp-low cp)))
+ (misc (aref *misc-table* (ucd-misc ucd)))
+ (ccc-index (third misc)))
+ ;; we can do everything in the first pass except for
+ ;; accounting for decompositions where the first
+ ;; character of the decomposition is not a starter.
+ (when (/= ccc-index 0)
+ (remhash k *comp-table*)))))
+ (maphash #'fixup *comp-table*)))
(defun fixup-hangul-syllables ()
;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
(let* ((l (+ lbase (floor sindex ncount)))
(v (+ vbase (floor (mod sindex ncount) tcount)))
(tee (+ tbase (mod sindex tcount)))
+ (code-point (+ sbase sindex))
(name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
(gethash l table) (gethash v table)
(= tee tbase) (gethash tee table))))
- (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
+ (setf (gethash code-point *unicode-names*) name)
+ (unless (aref *decomposition-base* (cp-high code-point))
+ (setf (aref *decomposition-base* (cp-high code-point))
+ (make-array (ash 1 *page-size-exponent*)
+ :initial-element nil)))
+ (setf (aref (aref *decomposition-base* (cp-high code-point))
+ (cp-low code-point))
+ (cons (if (= tee tbase) 2 3) 0))))))
(defun add-jamo-information (line table)
(let* ((split (split-string line #\;))
(defun split-string (line character)
(loop for prev-position = 0 then (1+ position)
- for position = (position character line :start prev-position)
- collect (subseq line prev-position position)
- do (unless position
- (loop-finish))))
+ for position = (position character line :start prev-position)
+ collect (subseq line prev-position position)
+ do (unless position
+ (loop-finish))))
(defun init-indices (strings)
(let ((hash (make-hash-table :test #'equal)))
(loop for string in strings
- for index from 0
- do (setf (gethash string hash) index))
+ for index from 0
+ do (setf (gethash string hash) index))
hash))
(defparameter *general-categories*
(parse-integer simple-titlecase :radix 16)))
(cl-both-case-p
(not (null (or (and (= gc-index 0) lower-index)
- (and (= gc-index 1) upper-index)))))
- (misc-index (hash-misc gc-index bidi-index ccc-index
- decimal-digit digit bidi-mirrored
- cl-both-case-p)))
+ (and (= gc-index 1) upper-index)
+ ;; deal with prosgegrammeni / titlecase
+ (and (= gc-index 2)
+ (typep code-point '(integer #x1000 #x1fff))
+ lower-index)))))
+ (decomposition-info 0))
(declare (ignore digit-index))
(when (and (not cl-both-case-p)
(< gc-index 2))
(format t "~A~%" name))
(incf *name-size* (length name))
(when (string/= "" decomposition-type-and-mapping)
- (let ((split (split-string decomposition-type-and-mapping
- #\Space)))
- (when (char= #\< (aref (first split) 0))
- (setf (gethash (pop split) *decomposition-types*) t))
+ (let ((split (split-string decomposition-type-and-mapping #\Space)))
+ (cond
+ ((char= #\< (aref (first split) 0))
+ (unless (position (first split) *decomposition-types*
+ :test #'equal)
+ (vector-push (first split) *decomposition-types*))
+ (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal)))
+ (t (setf decomposition-info 1)))
(unless (aref *decomposition-base* (cp-high code-point))
(setf (aref *decomposition-base* (cp-high code-point))
(make-array (ash 1 *page-size-exponent*)
:initial-element nil)))
(setf (aref (aref *decomposition-base* (cp-high code-point))
(cp-low code-point))
- (mapcar #'(lambda (string)
- (parse-integer string :radix 16))
- split))
- (setq *decomposition-length-max*
- (max *decomposition-length-max* (length split)))
- (incf *decompositions* (length split))))
+ (let ((decomposition
+ (mapcar #'(lambda (string)
+ (parse-integer string :radix 16))
+ split)))
+ (when (= decomposition-info 1)
+ ;; Primary composition excludes:
+ ;; * singleton decompositions;
+ ;; * decompositions of non-starters;
+ ;; * script-specific decompositions;
+ ;; * later-version decompositions;
+ ;; * decompositions whose first character is a
+ ;; non-starter.
+ ;; All but the last case can be handled here;
+ ;; for the fixup, see FIXUP-COMPOSITIONS
+ (when (and (> (length decomposition) 1)
+ (= ccc-index 0)
+ (not (member code-point *exclusions*)))
+ (unless (= (length decomposition) 2)
+ (error "canonical decomposition unexpectedly long"))
+ (setf (gethash (cons (first decomposition)
+ (second decomposition))
+ *comp-table*)
+ code-point)))
+ (if (= (length decomposition) 1)
+ (cons 1 (car decomposition))
+ (cons (length decomposition)
+ (prog1 (fill-pointer *long-decompositions*)
+ (dolist (code decomposition)
+ (vector-push-extend code *long-decompositions*)))))))))
+ ;; Hangul decomposition; see Unicode 6.2 section 3-12
+ (when (= code-point #xd7a3)
+ ;; KLUDGE: it's a bit ugly to do this here when we've got
+ ;; a reasonable function to do this in
+ ;; (FIXUP-HANGUL-SYLLABLES). The problem is that the
+ ;; fixup would be somewhat tedious to do, what with all
+ ;; the careful hashing of misc data going on.
+ (setf decomposition-info 1)
+ ;; the construction of *decomposition-base* entries is,
+ ;; however, easy to handle within FIXUP-HANGUL-SYLLABLES.
+ )
(when (and (string/= "" simple-uppercase)
(string/= "" simple-lowercase))
(push (list code-point upper-index lower-index) *both-cases*))
(setq *last-uppercase* nil)))
(when (> ccc-index 255)
(error "canonical combining class too large ~A" ccc-index))
- (let ((result (make-ucd :misc misc-index
- :transform (or upper-index lower-index 0))))
+ (let* ((misc-index (hash-misc gc-index bidi-index ccc-index
+ decimal-digit digit bidi-mirrored
+ cl-both-case-p decomposition-info))
+ (result (make-ucd :misc misc-index
+ :transform (or upper-index lower-index 0))))
(when (and (> (length name) 7)
(string= ", Last>" name :start2 (- (length name) 7)))
(let ((page-start (ash (+ *block-first*
(- *page-size-exponent*)))
(page-end (ash code-point (- *page-size-exponent*))))
(loop for point from *block-first*
- below (ash page-start *page-size-exponent*)
- do (setf (aref (aref *ucd-base* (cp-high point))
- (cp-low point))
- result))
+ below (ash page-start *page-size-exponent*)
+ do (setf (aref (aref *ucd-base* (cp-high point))
+ (cp-low point))
+ result))
(loop for page from page-start below page-end
- do (setf (aref *ucd-base* page)
- (make-array (ash 1 *page-size-exponent*)
- :initial-element result)))
+ do (setf (aref *ucd-base* page)
+ (make-array (ash 1 *page-size-exponent*)
+ :initial-element result)))
(loop for point from (ash page-end *page-size-exponent*)
- below code-point
- do (setf (aref (aref *ucd-base* (cp-high point))
- (cp-low point))
- result))))
+ below code-point
+ do (setf (aref (aref *ucd-base* (cp-high point))
+ (cp-low point))
+ result))))
(values result (normalize-character-name name)))))))
(defun slurp-ucd-line (line)
(setf (aref (aref *ucd-base* code-high) code-low) encoding
(gethash code-point *unicode-names*) name))))
+;;; this fixes up the case conversion discrepancy between CL and
+;;; Unicode: CL operators depend on char-downcase / char-upcase being
+;;; inverses, which is not true in general in Unicode even for
+;;; characters which change case to single characters.
(defun second-pass ()
- (loop for i from 0 below (length *ucd-base*)
- when (aref *ucd-base* i)
- do (loop for j from 0 below (length (aref *ucd-base* i))
- for result = (aref (aref *ucd-base* i) j)
- when result
- when (let* ((transform-point (ucd-transform result))
- (transform-high (ash transform-point
- (- *page-size-exponent*)))
- (transform-low (ldb (byte *page-size-exponent* 0)
- transform-point)))
- (and (plusp transform-point)
- (/= (ucd-transform
- (aref (aref *ucd-base* transform-high)
- transform-low))
- (+ (ash i *page-size-exponent*) j))))
- do (destructuring-bind (gc-index bidi-index ccc-index
- decimal-digit digit bidi-mirrored
- cl-both-case-p)
+ (dotimes (i (length *ucd-base*))
+ (let ((base (aref *ucd-base* i)))
+ (dotimes (j (length base)) ; base is NIL or an array
+ (let ((result (aref base j)))
+ (when result
+ ;; fixup case mappings for CL/Unicode mismatch
+ (let* ((transform-point (ucd-transform result))
+ (transform-high (ash transform-point
+ (- *page-size-exponent*)))
+ (transform-low (ldb (byte *page-size-exponent* 0)
+ transform-point)))
+ (when (and (plusp transform-point)
+ (/= (ucd-transform
+ (aref (aref *ucd-base* transform-high)
+ transform-low))
+ (+ (ash i *page-size-exponent*) j)))
+ (destructuring-bind (gc-index bidi-index ccc-index
+ decimal-digit digit bidi-mirrored
+ cl-both-case-p decomposition-info)
(aref *misc-table* (ucd-misc result))
(declare (ignore cl-both-case-p))
(format t "~A~%" (+ (ash i *page-size-exponent*) j))
(setf (ucd-misc result)
(hash-misc gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
- nil))))))
+ nil decomposition-info)))))))))))
-(defun write-3-byte (triplet stream)
- (write-byte (ldb (byte 8 0) triplet) stream)
- (write-byte (ldb (byte 8 8) triplet) stream)
- (write-byte (ldb (byte 8 16) triplet) stream))
+(defun write-4-byte (quadruplet stream)
+ (write-byte (ldb (byte 8 24) quadruplet) stream)
+ (write-byte (ldb (byte 8 16) quadruplet) stream)
+ (write-byte (ldb (byte 8 8) quadruplet) stream)
+ (write-byte (ldb (byte 8 0) quadruplet) stream))
(defun digit-to-byte (digit)
(if (string= "" digit)
255
(parse-integer digit)))
-(defun output ()
+(defun output-ucd-data ()
(let ((hash (make-hash-table :test #'equalp))
(index 0))
(loop for page across *ucd-base*
:if-exists :supersede
:if-does-not-exist :create)
(loop for (gc-index bidi-index ccc-index decimal-digit digit
- bidi-mirrored)
+ bidi-mirrored nil decomposition-info)
across *misc-table*
+ ;; three bits spare here
do (write-byte gc-index stream)
+ ;; three bits spare here
do (write-byte bidi-index stream)
do (write-byte ccc-index stream)
+ ;; we could save some space here: decimal-digit and
+ ;; digit are constrained (CHECKME) to be between 0 and
+ ;; 9, so we could encode the pair in a single byte.
+ ;; (Also, decimal-digit is equal to digit or undefined,
+ ;; so we could encode decimal-digit as a single bit,
+ ;; meaning that we could save 11 bits here.
do (write-byte (digit-to-byte decimal-digit) stream)
do (write-byte (digit-to-byte digit) stream)
+ ;; there's an easy 7 bits to spare here
do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
- do (write-byte 0 stream)
+ ;; at the moment we store information about which type
+ ;; of compatibility decomposition is used, costing c.3
+ ;; bits. We could elide that.
+ do (write-byte decomposition-info stream)
do (write-byte 0 stream))
(loop for page across *ucd-base*
do (write-byte (if page (gethash page hash) 0) stream))
(loop for page across array
do (loop for entry across page
- do (write-byte (if entry
- (aref *misc-mapping* (ucd-misc entry))
- 255)
- stream)
- do (write-3-byte (if entry (ucd-transform entry) 0)
- stream))))))
+ do (write-4-byte
+ (dpb (if entry
+ (aref *misc-mapping* (ucd-misc entry))
+ ;; the last entry in *MISC-TABLE* (see
+ ;; BUILD-MISC-TABLE) is special,
+ ;; reserved for the information for
+ ;; characters unallocated by Unicode.
+ (1- (length *misc-table*)))
+ (byte 11 21)
+ (if entry (ucd-transform entry) 0))
+ stream)))))))
+
+;;; KLUDGE: this code, to write out decomposition information, is a
+;;; little bit very similar to the ucd entries above. Try factoring
+;;; out the common stuff?
+(defun output-decomposition-data ()
+ (let ((hash (make-hash-table :test #'equalp))
+ (index 0))
+ (loop for page across *decomposition-base*
+ do (when page
+ (unless (gethash page hash)
+ (setf (gethash page hash)
+ (prog1 index (incf index))))))
+ (let ((array (make-array index)))
+ (maphash #'(lambda (key value)
+ (setf (aref array value) key))
+ hash)
+ (with-open-file (stream (make-pathname :name "decomp" :type "dat"
+ :defaults *output-directory*)
+ :direction :output
+ :element-type '(unsigned-byte 8)
+ :if-exists :supersede
+ :if-does-not-exist :create)
+ (loop for page across *decomposition-base*
+ do (write-byte (if page (gethash page hash) 0) stream))
+ (loop for page across array
+ do (loop for entry across page
+ do (write-4-byte
+ (dpb (if entry (car entry) 0)
+ (byte 11 21)
+ (if entry (cdr entry) 0))
+ stream))))
+ (with-open-file (stream (make-pathname :name "ldecomp" :type "dat"
+ :defaults *output-directory*)
+ :direction :output
+ :element-type '(unsigned-byte 8)
+ :if-exists :supersede
+ :if-does-not-exist :create)
+ (loop for code across (copy-seq *long-decompositions*)
+ do (write-4-byte code stream))))))
+
+(defun output-composition-data ()
+ #+nil ; later
+ (let (firsts seconds)
+ (flet ((frob (k v)
+ (declare (ignore v))
+ (pushnew (car k) firsts)
+ (pushnew (cdr k) seconds)))
+ (maphash #'frob *comp-table*)))
+ (with-open-file (stream (make-pathname :name "comp" :type "dat"
+ :defaults *output-directory*)
+ :direction :output
+ :element-type '(unsigned-byte 8)
+ :if-exists :supersede :if-does-not-exist :create)
+ (maphash (lambda (k v)
+ (write-4-byte (car k) stream)
+ (write-4-byte (cdr k) stream)
+ (write-4-byte v stream))
+ *comp-table*)))
+
+(defun output ()
+ (output-ucd-data)
+ (output-decomposition-data)
+ (output-composition-data)
(with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
:defaults *output-directory*)
:direction :output
(write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
(maphash (lambda (code name)
(when name
- (print code f)
- (prin1 name f)))
+ (print code f)
+ (prin1 name f)))
*unicode-names*))
(setf *unicode-names* nil))
(with-open-file (*standard-output*
(with-standard-io-syntax
(let ((*print-pretty* t))
(prin1 `(:length ,(length *misc-table*)
- :uppercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 0)
- collect i)
- :lowercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 1)
- collect i)
- :titlecase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 2)
- collect i))))))
+ :uppercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 0)
+ collect i)
+ :lowercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 1)
+ collect i)
+ :titlecase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 2)
+ collect i))))))
(values))
;;; Use of the generated files
(values))
;;; The stuff below is dependent on misc.lisp-expr being
-;;; (:LENGTH 217 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
;;;
-;;; There are two entries for UPPERCASE and LOWERCASE because some
-;;; characters have case (by Unicode standards) but are not
-;;; transformable character-by-character in a locale-independent way
-;;; (as CL requires for its standard operators).
+;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17))
+;;;
+;;; There are two groups of entries for UPPERCASE and LOWERCASE
+;;; because some characters have case (by Unicode standards) but are
+;;; not transformable character-by-character in a locale-independent
+;;; way (as CL requires for its standard operators).
;;;
;;; for more details on these debugging functions, see the description
;;; of the character database format in src/code/target-char.lisp
-(defparameter *length* 217)
+(defparameter *length* 395)
(defun cp-index (cp)
(let* ((cp-high (cp-high cp))
(* 4 (cp-low cp)))))
(defun cp-value-0 (cp)
- (aref *compiled-ucd* (cp-index cp)))
+ (let ((index (cp-index cp)))
+ (dpb (aref *compiled-ucd* index)
+ (byte 8 3)
+ (ldb (byte 3 5) (aref *compiled-ucd* (1+ index))))))
(defun cp-value-1 (cp)
(let ((index (cp-index cp)))
- (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
+ (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16)
(dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
- (aref *compiled-ucd* (1+ index))))))
+ (aref *compiled-ucd* (+ index 3))))))
(defun cp-general-category (cp)
(aref *compiled-ucd* (* 8 (cp-value-0 cp))))
(<= 160 cp)))
(defun cp-char-upcase (cp)
- (if (= (cp-value-0 cp) 1)
+ (if (< 3 (cp-value-0 cp) 8)
(cp-value-1 cp)
cp))
(defun cp-char-downcase (cp)
- (if (= (cp-value-0 cp) 0)
+ (if (< (cp-value-0 cp) 4)
(cp-value-1 cp)
cp))
(defun cp-upper-case-p (cp)
- (= (cp-value-0 cp) 0))
+ (< (cp-value-0 cp) 4))
(defun cp-lower-case-p (cp)
- (= (cp-value-0 cp) 1))
+ (< 3 (cp-value-0 cp) 8))
(defun cp-both-case-p (cp)
- (< (cp-value-0 cp) 2))
+ (< (cp-value-0 cp) 8))