beginnings of decomposition
author: Christophe Rhodes <csr21@cantab.net>
Sun, 17 Mar 2013 21:23:59 +0000 (21:23 +0000)
committer: Christophe Rhodes <csr21@cantab.net>
Sat, 18 May 2013 19:04:04 +0000 (20:04 +0100)
Store enough information in output from ucd.lisp to be able to actually
decompose individual characters.  Include proof-of-concept implementation
of decomposition, not hooked into anything yet.

src/code/target-char.lisp
tools-for-build/ucd.lisp

index 262cead..0c2b5f7 100644 (file)
@@ -33,9 +33,9 @@
                                   :directory
                                   '(:relative :up :up "output")
                                   :name name :type type)
-                                 sb!xc:*compile-file-truename*)))
-         (let ((character-database
-                (with-open-file (stream (file "ucd" "dat")
+                                 sb!xc:*compile-file-truename*))
+              (read-ub8-vector (pathname)
+                (with-open-file (stream pathname
                                         :direction :input
                                         :element-type '(unsigned-byte 8))
                   (let* ((length (file-length stream))
                                  length :element-type '(unsigned-byte 8))))
                     (read-sequence array stream)
                     array))))
+         (let ((character-database (read-ub8-vector (file "ucd" "dat")))
+               (decompositions (read-ub8-vector (file "decomp" "dat")))
+               (long-decompositions (read-ub8-vector (file "ldecomp" "dat"))))
            `(progn
-              (declaim (type (simple-array (unsigned-byte 8) (*)) **character-database**))
+              (declaim (type (simple-array (unsigned-byte 8) (*)) **character-database** **character-decompositions** **character-long-decompositions**))
               (defglobal **character-database** ,character-database)
+              (defglobal **character-decompositions** ,decompositions)
+              (defglobal **character-long-decompositions** ,long-decompositions)
               (defun !character-database-cold-init ()
                 (setf **character-database** ,character-database))
               ,(with-open-file (stream (file "ucd-names" "lisp-expr")
@@ -602,3 +607,39 @@ character exists."
   (and (typep weight 'fixnum)
        (>= weight 0) (< weight radix) (< weight 36)
        (code-char (if (< weight 10) (+ 48 weight) (+ 55 weight)))))
+\f
+(defun char-decomposition-info (char)
+  ;; (UCD-VALUE-0 CHAR) selects a slot in **CHARACTER-DATABASE**; slots are
+  ;; addressed with a stride of 8 octets, and the octet at offset 6 within
+  ;; the selected slot is the value returned.
+  (let ((slot-start (* 8 (ucd-value-0 char))))
+    (aref **character-database** (+ 6 slot-start))))
+
+(defun char-decomposition (char)
+  "Return the decomposition recorded for CHAR, as a string, by reading
+**CHARACTER-DECOMPOSITIONS** and, for decompositions longer than one
+character, **CHARACTER-LONG-DECOMPOSITIONS**."
+  (let* ((code (char-code char))
+         (table **character-decompositions**)
+         (long-table **character-long-decompositions**)
+         ;; TABLE begins with #x1100 octets indexed by the code point shifted
+         ;; right by 8; each holds a page number.  Pages follow that prefix,
+         ;; 1024 octets apiece (256 entries of 4 octets).
+         (page (aref table (ash code -8)))
+         (base (+ #x1100
+                  (ash page 10)
+                  (ash (ldb (byte 8 0) code) 2)))
+         (b0 (aref table base))
+         (b1 (aref table (+ base 1)))
+         (b2 (aref table (+ base 2)))
+         (b3 (aref table (+ base 3)))
+         ;; Reading the four octets most-significant first: the top 11 bits
+         ;; are the decomposition length, the low 21 bits are the payload.
+         (n-chars (logior (ash b0 3) (ldb (byte 3 5) b1)))
+         (payload (logior (ash (ldb (byte 5 0) b1) 16) (ash b2 8) b3)))
+    (if (= n-chars 1)
+        ;; Singleton decomposition: the payload is the code point itself.
+        (string (code-char payload))
+        ;; Otherwise the payload indexes 4-octet entries in LONG-TABLE; the
+        ;; low three octets of each consecutive entry give one code point.
+        (loop with result = (make-string n-chars)
+              for i from 0 below n-chars
+              for offset from (* 4 payload) by 4
+              do (setf (char result i)
+                       (code-char
+                        (logior (ash (aref long-table (+ offset 1)) 16)
+                                (ash (aref long-table (+ offset 2)) 8)
+                                (aref long-table (+ offset 3)))))
+              finally (return result)))))
+
+(defun decompose-char (char)
+  ;; A decomposition-info value of zero means CHAR is returned as a
+  ;; one-character string; any other value defers to CHAR-DECOMPOSITION.
+  (cond ((zerop (char-decomposition-info char))
+         (string char))
+        (t
+         (char-decomposition char))))
index 1a717f7..623cb21 100644 (file)
@@ -35,8 +35,7 @@
 (defparameter *misc-table* nil)
 (defparameter *misc-mapping* nil)
 (defparameter *both-cases* nil)
-(defparameter *decompositions* nil)
-(defparameter *decomposition-length-max* nil)
+(defparameter *long-decompositions* nil)
 (defparameter *decomposition-types* nil)
 (defparameter *decomposition-base* nil)
 
   (setq *misc-index* -1)
   (setq *misc-table* (make-array 2048 :fill-pointer 0))
   (setq *both-cases* nil)
-  (setq *decompositions* 0)
+  (setq *long-decompositions*
+        (make-array 2048 :fill-pointer 0 :adjustable t))
   (setq *decomposition-types*
         (let ((array (make-array 256 :initial-element nil :fill-pointer 1)))
           (vector-push "" array)
           (vector-push "<compat>" array)
           array))
-  (setq *decomposition-length-max* 0)
   (setq *decomposition-base* (make-array (ash #x110000
                                               (- *page-size-exponent*))
                                          :initial-element nil))
   (second-pass)
   (build-misc-table)
   (fixup-hangul-syllables)
-  *decompositions*)
+  (length *long-decompositions*))
 
 (defun fixup-hangul-syllables ()
   ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
                                   :initial-element nil)))
               (setf (aref (aref *decomposition-base* (cp-high code-point))
                           (cp-low code-point))
-                    (mapcar #'(lambda (string)
-                                (parse-integer string :radix 16))
-                            split))
-              (setq *decomposition-length-max*
-                    (max *decomposition-length-max* (length split)))
-              (incf *decompositions* (length split))))
+                    (let ((decomposition
+                           (mapcar #'(lambda (string)
+                                       (parse-integer string :radix 16))
+                                   split)))
+                      (if (= (length decomposition) 1)
+                          (cons 1 (car decomposition))
+                          (cons (length decomposition)
+                                (prog1 (fill-pointer *long-decompositions*)
+                                  (dolist (code decomposition)
+                                    (vector-push-extend code *long-decompositions*)))))))))
           (when (and (string/= "" simple-uppercase)
                      (string/= "" simple-lowercase))
             (push (list code-point upper-index lower-index) *both-cases*))
                           (byte 11 21)
                           (if entry (ucd-transform entry) 0))
                      stream))))))
+  ;; KLUDGE: this code, to write out decomposition information, is a
+  ;; little bit very similar to the ucd entries above.  Try factoring
+  ;; out the common stuff?
+  (let ((hash (make-hash-table :test #'equalp))
+        (index 0))
+    (loop for page across *decomposition-base*
+          do (when page
+               (unless (gethash page hash)
+                 (setf (gethash page hash)
+                       (prog1 index (incf index))))))
+    (let ((array (make-array index)))
+      (maphash #'(lambda (key value)
+                   (setf (aref array value) key))
+               hash)
+      (with-open-file (stream (make-pathname :name "decomp" :type "dat"
+                                             :defaults *output-directory*)
+                              :direction :output
+                              :element-type '(unsigned-byte 8)
+                              :if-exists :supersede
+                              :if-does-not-exist :create)
+        (loop for page across *decomposition-base*
+           do (write-byte (if page (gethash page hash) 0) stream))
+        (loop for page across array
+           do (loop for entry across page
+                 do (write-4-byte
+                     (dpb (if entry (car entry) 0)
+                          (byte 11 21)
+                          (if entry (cdr entry) 0))
+                     stream))))
+      (with-open-file (stream (make-pathname :name "ldecomp" :type "dat"
+                                             :defaults *output-directory*)
+                              :direction :output
+                              :element-type '(unsigned-byte 8)
+                              :if-exists :supersede
+                              :if-does-not-exist :create)
+        (loop for code across (copy-seq *long-decompositions*)
+           do (write-4-byte code stream)))))
   (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
                                     :defaults *output-directory*)
                      :direction :output