X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=tools-for-build%2Fucd.lisp;h=7851c367ca5febaecaf7768b9ccc2395e70ba7b7;hb=59b27c998ce5fc950b5efc5a82627b94192c03cf;hp=8ddcb6757fc65ff7eeccc7f8ad0fbb18f6a6c7c9;hpb=acc978383105b5a2bfd970f8a34214fd5774bb2a;p=sbcl.git diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index 8ddcb67..7851c36 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -114,8 +114,44 @@ do (slurp-ucd-line line))) (second-pass) (build-misc-table) + (fixup-hangul-syllables) *decompositions*) +(defun fixup-hangul-syllables () + ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12" + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (scount 11172) + (lcount 19) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (table (make-hash-table))) + (with-open-file (*standard-input* + (make-pathname :name "Jamo" :type "txt" + :defaults *unicode-character-database*)) + (loop for line = (read-line nil nil) + while line + if (position #\; line) + do (add-jamo-information line table))) + (dotimes (sindex scount) + (let* ((l (+ lbase (floor sindex ncount))) + (v (+ vbase (floor (mod sindex ncount) tcount))) + (tee (+ tbase (mod sindex tcount))) + (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]" + (gethash l table) (gethash v table) + (= tee tbase) (gethash tee table)))) + (setf (gethash (+ sbase sindex) *unicode-names*) name))))) + +(defun add-jamo-information (line table) + (let* ((split (split-string line #\;)) + (code (parse-integer (first split) :radix 16)) + (syllable (string-trim '(#\Space) + (subseq (second split) 0 (position #\# (second split)))))) + (setf (gethash code table) syllable))) + (defun split-string (line character) (loop for prev-position = 0 then (1+ position) for position = (position character line :start prev-position) @@ -188,6 +224,9 @@ decimal-digit digit bidi-mirrored cl-both-case-p))) (declare (ignore digit-index)) + (when (and (not cl-both-case-p) + (< gc-index 2)) + (format t "~A~%" name)) (incf *name-size* (length name)) (when (string/= "" decomposition-type-and-mapping) (let ((split (split-string decomposition-type-and-mapping @@ -362,9 +401,10 @@ :direction :output :if-exists :supersede :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x)))) - *different-numerics*)))) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x)))) + *different-numerics*))))) (with-open-file (*standard-output* (make-pathname :name "titlecases" :type "lisp-expr" @@ -372,8 +412,9 @@ :direction :output :if-exists :supersede :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 *different-titlecases*))) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 *different-titlecases*)))) (with-open-file (*standard-output* (make-pathname :name "misc" :type "lisp-expr" @@ -381,20 +422,21 @@ :direction :output :if-exists :supersede :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 `(:length ,(length *misc-table*) - :uppercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 0) - collect i) - :lowercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 1) - collect i) - :titlecase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 2) - collect i))))) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 `(:length ,(length *misc-table*) + :uppercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 0) + collect i) + :lowercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 1) + collect i) + :titlecase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 2) + collect i)))))) (values)) ;;; Use of the generated files @@ -414,9 +456,17 @@ (values)) ;;; The stuff below is dependent on misc.lisp-expr being -;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2)) - -(defparameter *length* 186) +;;; (:LENGTH 206 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)). +;;; +;;; There are two entries for UPPERCASE and LOWERCASE because some +;;; characters have case (by Unicode standards) but are not +;;; transformable character-by-character in a locale-independet way +;;; (as CL requires for its standard operators). +;;; +;;; for more details on these debugging functions, see the description +;;; of the character database format in src/code/target-char.lisp + +(defparameter *length* 206) (defun cp-index (cp) (let* ((cp-high (cp-high cp))