From 4aad92d800164f49530ad5f2bb07d81f61e2911b Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Wed, 27 Mar 2013 12:58:19 +0000 Subject: [PATCH] handle Hangul syllable decomposition Entries for the codepoint range (#xac00 -- #xd7a3) have 1 for their decomposition-info, a decomposition length of 2 or 3, but a zero decomposition index (the decomposition is handled algorithmically instead). --- src/code/target-char.lisp | 42 ++++++++++++++++++++++++++++++++---------- tools-for-build/ucd.lisp | 35 +++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp index 6d02245..77ec7b2 100644 --- a/src/code/target-char.lisp +++ b/src/code/target-char.lisp @@ -631,16 +631,38 @@ character exists." (dpb v2 (byte 8 8) v3)))) (if (= length 1) (string (code-char entry)) - (let ((result (make-string length)) - (e (* 4 entry))) - (dotimes (i length result) - (let ((code (dpb (aref long-decompositions (+ e 1)) - (byte 8 16) - (dpb (aref long-decompositions (+ e 2)) - (byte 8 8) - (aref long-decompositions (+ e 3)))))) - (setf (char result i) (code-char code))) - (incf e 4)))))) + (if (<= #xac00 cp #xd7a3) + ;; see Unicode 6.2, section 3-12 + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (lcount 19) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (scount (* lcount ncount)) + (sindex (- cp sbase)) + (lindex (floor sindex ncount)) + (vindex (floor (mod sindex ncount) tcount)) + (tindex (mod sindex tcount)) + (result (make-string length))) + (declare (ignore scount)) + (setf (char result 0) (code-char (+ lbase lindex))) + (setf (char result 1) (code-char (+ vbase vindex))) + (when (> tindex 0) + (setf (char result 2) (code-char (+ tbase tindex)))) + result) + (let ((result (make-string length)) + (e (* 4 entry))) + (dotimes (i length result) + (let ((code (dpb (aref long-decompositions (+ e 1)) + (byte 8 16) + (dpb (aref long-decompositions (+ e 2)) + (byte 8 8) + (aref long-decompositions (+ e 3)))))) + (setf (char result i) (code-char code))) + (incf e 4))))))) (defun decompose-char (char) (if (= (char-decomposition-info char) 0) diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index f88238f..ddc49b0 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -118,8 +118,8 @@ while line do (slurp-ucd-line line))) (second-pass) - (build-misc-table) (fixup-hangul-syllables) + (build-misc-table) (length *long-decompositions*)) (defun fixup-hangul-syllables () @@ -145,10 +145,18 @@ (let* ((l (+ lbase (floor sindex ncount))) (v (+ vbase (floor (mod sindex ncount) tcount))) (tee (+ tbase (mod sindex tcount))) + (code-point (+ sbase sindex)) (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]" (gethash l table) (gethash v table) (= tee tbase) (gethash tee table)))) - (setf (gethash (+ sbase sindex) *unicode-names*) name))))) + (setf (gethash code-point *unicode-names*) name) + (unless (aref *decomposition-base* (cp-high code-point)) + (setf (aref *decomposition-base* (cp-high code-point)) + (make-array (ash 1 *page-size-exponent*) + :initial-element nil))) + (setf (aref (aref *decomposition-base* (cp-high code-point)) + (cp-low code-point)) + (cons (if (= tee tbase) 2 3) 0)))))) (defun add-jamo-information (line table) (let* ((split (split-string line #\;)) @@ -159,16 +167,16 @@ (defun split-string (line character) (loop for prev-position = 0 then (1+ position) - for position = (position character line :start prev-position) - collect (subseq line prev-position position) - do (unless position - (loop-finish)))) + for position = (position character line :start prev-position) + collect (subseq line prev-position position) + do (unless position + (loop-finish)))) (defun init-indices (strings) (let ((hash (make-hash-table :test #'equal))) (loop for string in strings - for index from 0 - do (setf (gethash string hash) index)) + for index from 0 + do (setf (gethash string hash) index)) hash)) (defparameter *general-categories* @@ -256,6 +264,17 @@ (prog1 (fill-pointer *long-decompositions*) (dolist (code decomposition) (vector-push-extend code *long-decompositions*))))))))) + ;; Hangul decomposition; see Unicode 6.2 section 3-12 + (when (= code-point #xd7a3) + ;; KLUDGE: it's a bit ugly to do this here when we've got + ;; a reasonable function to do this in + ;; (FIXUP-HANGUL-SYLLABLES). The problem is that the + ;; fixup would be somewhat tedious to do, what with all + ;; the careful hashing of misc data going on. + (setf decomposition-info 1) + ;; the construction of *decomposition-base* entries is, + ;; however, easy to handle within FIXUP-HANGUL-SYLLABLES. + ) (when (and (string/= "" simple-uppercase) (string/= "" simple-lowercase)) (push (list code-point upper-index lower-index) *both-cases*)) -- 1.7.10.4