(setf (gethash list *misc-hash*)
(incf *misc-index*))))))
+(defun gc-index-sort-key (gc-index)
+ (or (cdr (assoc gc-index '((1 . 2) (2 . 1)))) gc-index))
+
(defun compare-misc-entry (left right)
(destructuring-bind (left-gc-index left-bidi-index left-ccc-index
left-decimal-digit left-digit left-bidi-mirrored
right
(or (and left-cl-both-case-p (not right-cl-both-case-p))
(and (or left-cl-both-case-p (not right-cl-both-case-p))
- (or (< left-gc-index right-gc-index)
+ (or (< (gc-index-sort-key left-gc-index)
+ (gc-index-sort-key right-gc-index))
(and (= left-gc-index right-gc-index)
(or (< left-decomposition-info right-decomposition-info)
(and (= left-decomposition-info right-decomposition-info)
right-bidi-mirrored))))))))))))))))))
(defun build-misc-table ()
- (sort *misc-table* #'compare-misc-entry)
+ (let ((table (sort *misc-table* #'compare-misc-entry)))
+ ;; after sorting, insert at the end a special entry to handle
+ ;; unallocated characters.
+ (setf *misc-table* (make-array (1+ (length table))))
+ (replace *misc-table* table)
+ (setf (aref *misc-table* (length table))
+ ;; unallocated characters have a GC index of 31 (not
+ ;; colliding with any other GC), are not digits or decimal
+ ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't
+ ;; interestingly bidi or combining.
+ '(31 0 0 "" "" "" nil 0)))
(setq *misc-mapping* (make-array (1+ *misc-index*)))
(loop for i from 0 to *misc-index*
do (setf (aref *misc-mapping*
(gethash (aref *misc-table* i) *misc-hash*))
i)))
+(defvar *comp-table*)
+
+(defvar *exclusions*
+ (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt"
+ :defaults *unicode-character-database*))
+ (do ((line (read-line s nil nil) (read-line s nil nil))
+ result)
+ ((null line) result)
+ (when (and (> (length line) 0)
+ (char/= (char line 0) #\#))
+ (push (parse-integer line :end (position #\Space line) :radix 16)
+ result)))))
+
(defun slurp-ucd ()
+ (setf *comp-table* (make-hash-table :test 'equal))
(setq *last-uppercase* nil)
(setq *uppercase-transition-count* 0)
(setq *different-titlecases* nil)
while line
do (slurp-ucd-line line)))
(second-pass)
- (build-misc-table)
+ (fixup-compositions)
(fixup-hangul-syllables)
+ (build-misc-table)
(length *long-decompositions*))
+(defun fixup-compositions ()
+ (flet ((fixup (k v)
+ (let* ((cp (car k))
+ (ucd (aref (aref *ucd-base* (cp-high cp)) (cp-low cp)))
+ (misc (aref *misc-table* (ucd-misc ucd)))
+ (ccc-index (third misc)))
+ ;; we can do everything in the first pass except for
+ ;; accounting for decompositions where the first
+ ;; character of the decomposition is not a starter.
+ (when (/= ccc-index 0)
+ (remhash k *comp-table*)))))
+ (maphash #'fixup *comp-table*)))
+
(defun fixup-hangul-syllables ()
;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
(let* ((sbase #xac00)
(let* ((l (+ lbase (floor sindex ncount)))
(v (+ vbase (floor (mod sindex ncount) tcount)))
(tee (+ tbase (mod sindex tcount)))
+ (code-point (+ sbase sindex))
(name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
(gethash l table) (gethash v table)
(= tee tbase) (gethash tee table))))
- (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
+ (setf (gethash code-point *unicode-names*) name)
+ (unless (aref *decomposition-base* (cp-high code-point))
+ (setf (aref *decomposition-base* (cp-high code-point))
+ (make-array (ash 1 *page-size-exponent*)
+ :initial-element nil)))
+ (setf (aref (aref *decomposition-base* (cp-high code-point))
+ (cp-low code-point))
+ (cons (if (= tee tbase) 2 3) 0))))))
(defun add-jamo-information (line table)
(let* ((split (split-string line #\;))
(defun split-string (line character)
(loop for prev-position = 0 then (1+ position)
- for position = (position character line :start prev-position)
- collect (subseq line prev-position position)
- do (unless position
- (loop-finish))))
+ for position = (position character line :start prev-position)
+ collect (subseq line prev-position position)
+ do (unless position
+ (loop-finish))))
(defun init-indices (strings)
(let ((hash (make-hash-table :test #'equal)))
(loop for string in strings
- for index from 0
- do (setf (gethash string hash) index))
+ for index from 0
+ do (setf (gethash string hash) index))
hash))
(defparameter *general-categories*
(parse-integer simple-titlecase :radix 16)))
(cl-both-case-p
(not (null (or (and (= gc-index 0) lower-index)
- (and (= gc-index 1) upper-index)))))
+ (and (= gc-index 1) upper-index)
+ ;; deal with prosgegrammeni / titlecase
+ (and (= gc-index 2)
+ (typep code-point '(integer #x1000 #x1fff))
+ lower-index)))))
(decomposition-info 0))
(declare (ignore digit-index))
(when (and (not cl-both-case-p)
(mapcar #'(lambda (string)
(parse-integer string :radix 16))
split)))
+ (when (= decomposition-info 1)
+ ;; Primary composition excludes:
+ ;; * singleton decompositions;
+ ;; * decompositions of non-starters;
+ ;; * script-specific decompositions;
+ ;; * later-version decompositions;
+ ;; * decompositions whose first character is a
+ ;; non-starter.
+ ;; All but the last case can be handled here;
+ ;; for the fixup, see FIXUP-COMPOSITIONS
+ (when (and (> (length decomposition) 1)
+ (= ccc-index 0)
+ (not (member code-point *exclusions*)))
+ (unless (= (length decomposition) 2)
+ (error "canonical decomposition unexpectedly long"))
+ (setf (gethash (cons (first decomposition)
+ (second decomposition))
+ *comp-table*)
+ code-point)))
(if (= (length decomposition) 1)
(cons 1 (car decomposition))
(cons (length decomposition)
(prog1 (fill-pointer *long-decompositions*)
(dolist (code decomposition)
(vector-push-extend code *long-decompositions*)))))))))
+ ;; Hangul decomposition; see Unicode 6.2 section 3-12
+ (when (= code-point #xd7a3)
+ ;; KLUDGE: it's a bit ugly to do this here when we've got
+ ;; a reasonable function to do this in
+ ;; (FIXUP-HANGUL-SYLLABLES). The problem is that the
+ ;; fixup would be somewhat tedious to do, what with all
+ ;; the careful hashing of misc data going on.
+ (setf decomposition-info 1)
+ ;; the construction of *decomposition-base* entries is,
+ ;; however, easy to handle within FIXUP-HANGUL-SYLLABLES.
+ )
(when (and (string/= "" simple-uppercase)
(string/= "" simple-lowercase))
(push (list code-point upper-index lower-index) *both-cases*))
(- *page-size-exponent*)))
(page-end (ash code-point (- *page-size-exponent*))))
(loop for point from *block-first*
- below (ash page-start *page-size-exponent*)
- do (setf (aref (aref *ucd-base* (cp-high point))
- (cp-low point))
- result))
+ below (ash page-start *page-size-exponent*)
+ do (setf (aref (aref *ucd-base* (cp-high point))
+ (cp-low point))
+ result))
(loop for page from page-start below page-end
- do (setf (aref *ucd-base* page)
- (make-array (ash 1 *page-size-exponent*)
- :initial-element result)))
+ do (setf (aref *ucd-base* page)
+ (make-array (ash 1 *page-size-exponent*)
+ :initial-element result)))
(loop for point from (ash page-end *page-size-exponent*)
- below code-point
- do (setf (aref (aref *ucd-base* (cp-high point))
- (cp-low point))
- result))))
+ below code-point
+ do (setf (aref (aref *ucd-base* (cp-high point))
+ (cp-low point))
+ result))))
(values result (normalize-character-name name)))))))
(defun slurp-ucd-line (line)
;;; inverses, which is not true in general in Unicode even for
;;; characters which change case to single characters.
(defun second-pass ()
- (loop for i from 0 below (length *ucd-base*)
- when (aref *ucd-base* i)
- do (loop for j from 0 below (length (aref *ucd-base* i))
- for result = (aref (aref *ucd-base* i) j)
- when result
- when (let* ((transform-point (ucd-transform result))
- (transform-high (ash transform-point
- (- *page-size-exponent*)))
- (transform-low (ldb (byte *page-size-exponent* 0)
- transform-point)))
- (and (plusp transform-point)
- (/= (ucd-transform
- (aref (aref *ucd-base* transform-high)
- transform-low))
- (+ (ash i *page-size-exponent*) j))))
- do (destructuring-bind (gc-index bidi-index ccc-index
- decimal-digit digit bidi-mirrored
- cl-both-case-p decomposition-info)
+ (dotimes (i (length *ucd-base*))
+ (let ((base (aref *ucd-base* i)))
+ (dotimes (j (length base)) ; base is NIL or an array
+ (let ((result (aref base j)))
+ (when result
+ ;; fixup case mappings for CL/Unicode mismatch
+ (let* ((transform-point (ucd-transform result))
+ (transform-high (ash transform-point
+ (- *page-size-exponent*)))
+ (transform-low (ldb (byte *page-size-exponent* 0)
+ transform-point)))
+ (when (and (plusp transform-point)
+ (/= (ucd-transform
+ (aref (aref *ucd-base* transform-high)
+ transform-low))
+ (+ (ash i *page-size-exponent*) j)))
+ (destructuring-bind (gc-index bidi-index ccc-index
+ decimal-digit digit bidi-mirrored
+ cl-both-case-p decomposition-info)
(aref *misc-table* (ucd-misc result))
(declare (ignore cl-both-case-p))
(format t "~A~%" (+ (ash i *page-size-exponent*) j))
(setf (ucd-misc result)
(hash-misc gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
- nil decomposition-info))))))
+ nil decomposition-info)))))))))))
(defun write-4-byte (quadruplet stream)
(write-byte (ldb (byte 8 24) quadruplet) stream)
255
(parse-integer digit)))
-(defun output ()
+(defun output-ucd-data ()
(let ((hash (make-hash-table :test #'equalp))
(index 0))
(loop for page across *ucd-base*
(loop for (gc-index bidi-index ccc-index decimal-digit digit
bidi-mirrored nil decomposition-info)
across *misc-table*
+ ;; three bits spare here
do (write-byte gc-index stream)
+ ;; three bits spare here
do (write-byte bidi-index stream)
do (write-byte ccc-index stream)
+ ;; we could save some space here: decimal-digit and
+ ;; digit are constrained (CHECKME) to be between 0 and
+ ;; 9, so we could encode the pair in a single byte.
+ ;; (Also, decimal-digit is equal to digit or undefined,
+ ;; so we could encode decimal-digit as a single bit,
+ ;; meaning that we could save 11 bits here.
do (write-byte (digit-to-byte decimal-digit) stream)
do (write-byte (digit-to-byte digit) stream)
+ ;; there's an easy 7 bits to spare here
do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
+ ;; at the moment we store information about which type
+ ;; of compatibility decomposition is used, costing c.3
+ ;; bits. We could elide that.
do (write-byte decomposition-info stream)
do (write-byte 0 stream))
(loop for page across *ucd-base*
(loop for page across array
do (loop for entry across page
do (write-4-byte
- (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
+ (dpb (if entry
+ (aref *misc-mapping* (ucd-misc entry))
+ ;; the last entry in *MISC-TABLE* (see
+ ;; BUILD-MISC-TABLE) is special,
+ ;; reserved for the information for
+ ;; characters unallocated by Unicode.
+ (1- (length *misc-table*)))
(byte 11 21)
(if entry (ucd-transform entry) 0))
- stream))))))
- ;; KLUDGE: this code, to write out decomposition information, is a
- ;; little bit very similar to the ucd entries above. Try factoring
- ;; out the common stuff?
+ stream)))))))
+
+;;; KLUDGE: this code, to write out decomposition information, is a
+;;; little bit very similar to the ucd entries above. Try factoring
+;;; out the common stuff?
+(defun output-decomposition-data ()
(let ((hash (make-hash-table :test #'equalp))
(index 0))
(loop for page across *decomposition-base*
- do (when page
- (unless (gethash page hash)
- (setf (gethash page hash)
- (prog1 index (incf index))))))
+ do (when page
+ (unless (gethash page hash)
+ (setf (gethash page hash)
+ (prog1 index (incf index))))))
(let ((array (make-array index)))
(maphash #'(lambda (key value)
(setf (aref array value) key))
:if-exists :supersede
:if-does-not-exist :create)
(loop for code across (copy-seq *long-decompositions*)
- do (write-4-byte code stream)))))
+ do (write-4-byte code stream))))))
+
+(defun output-composition-data ()
+ #+nil ; later
+ (let (firsts seconds)
+ (flet ((frob (k v)
+ (declare (ignore v))
+ (pushnew (car k) firsts)
+ (pushnew (cdr k) seconds)))
+ (maphash #'frob *comp-table*)))
+ (with-open-file (stream (make-pathname :name "comp" :type "dat"
+ :defaults *output-directory*)
+ :direction :output
+ :element-type '(unsigned-byte 8)
+ :if-exists :supersede :if-does-not-exist :create)
+ (maphash (lambda (k v)
+ (write-4-byte (car k) stream)
+ (write-4-byte (cdr k) stream)
+ (write-4-byte v stream))
+ *comp-table*)))
+
+(defun output ()
+ (output-ucd-data)
+ (output-decomposition-data)
+ (output-composition-data)
(with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
:defaults *output-directory*)
:direction :output
(write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
(maphash (lambda (code name)
(when name
- (print code f)
- (prin1 name f)))
+ (print code f)
+ (prin1 name f)))
*unicode-names*))
(setf *unicode-names* nil))
(with-open-file (*standard-output*
(with-standard-io-syntax
(let ((*print-pretty* t))
(prin1 `(:length ,(length *misc-table*)
- :uppercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 0)
- collect i)
- :lowercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 1)
- collect i)
- :titlecase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 2)
- collect i))))))
+ :uppercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 0)
+ collect i)
+ :lowercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 1)
+ collect i)
+ :titlecase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 2)
+ collect i))))))
(values))
;;; Use of the generated files