X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=tools-for-build%2Fucd.lisp;h=7851c367ca5febaecaf7768b9ccc2395e70ba7b7;hb=59b27c998ce5fc950b5efc5a82627b94192c03cf;hp=eb217823dbc7ad9d7afc782ca8b7600f4fdabb55;hpb=e8607908388c96db633bb7046a4b97844642768b;p=sbcl.git diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index eb21782..7851c36 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -17,10 +17,13 @@ ;;; Generator +(defstruct ucd misc transform) + (defparameter *unicode-character-database* (make-pathname :directory (pathname-directory *load-truename*))) (defparameter *ucd-base* nil) +(defparameter *unicode-names* (make-hash-table)) (defparameter *last-uppercase* nil) (defparameter *uppercase-transition-count* 0) @@ -111,8 +114,44 @@ do (slurp-ucd-line line))) (second-pass) (build-misc-table) + (fixup-hangul-syllables) *decompositions*) +(defun fixup-hangul-syllables () + ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12" + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (scount 11172) + (lcount 19) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (table (make-hash-table))) + (with-open-file (*standard-input* + (make-pathname :name "Jamo" :type "txt" + :defaults *unicode-character-database*)) + (loop for line = (read-line nil nil) + while line + if (position #\; line) + do (add-jamo-information line table))) + (dotimes (sindex scount) + (let* ((l (+ lbase (floor sindex ncount))) + (v (+ vbase (floor (mod sindex ncount) tcount))) + (tee (+ tbase (mod sindex tcount))) + (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]" + (gethash l table) (gethash v table) + (= tee tbase) (gethash tee table)))) + (setf (gethash (+ sbase sindex) *unicode-names*) name))))) + +(defun add-jamo-information (line table) + (let* ((split (split-string line #\;)) + (code (parse-integer (first split) :radix 16)) + (syllable (string-trim '(#\Space) + (subseq (second split) 0 (position #\# (second split)))))) + (setf (gethash code table) syllable))) + (defun split-string (line character) (loop for prev-position = 0 then (1+ position) for position = (position character line :start prev-position) @@ -138,6 +177,12 @@ (defparameter *block-first* nil) +(defun normalize-character-name (name) + (when (find #\_ name) + (error "Bad name for a character: ~A" name)) + (unless (or (zerop (length name)) (find #\< name) (find #\> name)) + (substitute #\_ #\Space name))) + ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;; ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;; ;;; D800 -- F8FF : surrogates and private use @@ -179,6 +224,9 @@ decimal-digit digit bidi-mirrored cl-both-case-p))) (declare (ignore digit-index)) + (when (and (not cl-both-case-p) + (< gc-index 2)) + (format t "~A~%" name)) (incf *name-size* (length name)) (when (string/= "" decomposition-type-and-mapping) (let ((split (split-string decomposition-type-and-mapping @@ -215,7 +263,8 @@ (setq *last-uppercase* nil))) (when (> ccc-index 255) (error "canonical combining class too large ~A" ccc-index)) - (let ((result (vector misc-index (or upper-index lower-index 0)))) + (let ((result (make-ucd :misc misc-index + :transform (or upper-index lower-index 0)))) (when (and (> (length name) 7) (string= ", Last>" name :start2 (- (length name) 7))) (let ((page-start (ash (+ *block-first* @@ -237,7 +286,7 @@ do (setf (aref (aref *ucd-base* (cp-high point)) (cp-low point)) result)))) - result))))) + (values result (normalize-character-name name))))))) (defun slurp-ucd-line (line) (let* ((split-line (split-string line #\;)) @@ -248,8 +297,10 @@ (setf (aref *ucd-base* code-high) (make-array (ash 1 *page-size-exponent*) :initial-element nil))) - (setf (aref (aref *ucd-base* code-high) code-low) - (encode-ucd-line (cdr split-line) code-point)))) + (multiple-value-bind (encoding name) + (encode-ucd-line (cdr split-line) code-point) + (setf (aref (aref *ucd-base* code-high) code-low) encoding + (gethash code-point *unicode-names*) name)))) (defun second-pass () (loop for i from 0 below (length *ucd-base*) @@ -257,23 +308,23 @@ do (loop for j from 0 below (length (aref *ucd-base* i)) for result = (aref (aref *ucd-base* i) j) when result - when (let* ((transform-point (aref result 1)) + when (let* ((transform-point (ucd-transform result)) (transform-high (ash transform-point (- *page-size-exponent*))) (transform-low (ldb (byte *page-size-exponent* 0) transform-point))) (and (plusp transform-point) - (/= (aref (aref (aref *ucd-base* transform-high) - transform-low) - 1) + (/= (ucd-transform + (aref (aref *ucd-base* transform-high) + transform-low)) (+ (ash i *page-size-exponent*) j)))) do (destructuring-bind (gc-index bidi-index ccc-index decimal-digit digit bidi-mirrored cl-both-case-p) - (aref *misc-table* (aref result 0)) + (aref *misc-table* (ucd-misc result)) (declare (ignore cl-both-case-p)) (format t "~A~%" (+ (ash i *page-size-exponent*) j)) - (setf (aref result 0) + (setf (ucd-misc result) (hash-misc gc-index bidi-index ccc-index decimal-digit digit bidi-mirrored nil)))))) @@ -321,15 +372,28 @@ do (write-byte 0 stream) do (write-byte 0 stream)) (loop for page across *ucd-base* - do (write-byte (if page (gethash page hash) 0) stream)) + do (write-byte (if page (gethash page hash) 0) stream)) (loop for page across array - do (loop for entry across page - do (write-byte (if entry - (aref *misc-mapping* (aref entry 0)) - 255) - stream) - do (write-3-byte (if entry (aref entry 1) 0) - stream)))))) + do (loop for entry across page + do (write-byte (if entry + (aref *misc-mapping* (ucd-misc entry)) + 255) + stream) + do (write-3-byte (if entry (ucd-transform entry) 0) + stream)))))) + (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr" + :defaults *output-directory*) + :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (with-standard-io-syntax + (write-string ";;; Do not edit by hand: generated by ucd.lisp" f) + (maphash (lambda (code name) + (when name + (print code f) + (prin1 name f))) + *unicode-names*)) + (setf *unicode-names* nil)) (with-open-file (*standard-output* (make-pathname :name "numerics" :type "lisp-expr" @@ -337,9 +401,10 @@ :direction :output :if-exists :supersede :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x)))) - *different-numerics*)))) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x)))) + *different-numerics*))))) (with-open-file (*standard-output* (make-pathname :name "titlecases" :type "lisp-expr" @@ -347,8 +412,9 @@ :direction :output :if-exists :supersede :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 *different-titlecases*))) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 *different-titlecases*)))) (with-open-file (*standard-output* (make-pathname :name "misc" :type "lisp-expr" @@ -356,20 +422,21 @@ :direction :output :if-exists :supersede :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 `(:length ,(length *misc-table*) - :uppercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 0) - collect i) - :lowercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 1) - collect i) - :titlecase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 2) - collect i))))) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 `(:length ,(length *misc-table*) + :uppercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 0) + collect i) + :lowercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 1) + collect i) + :titlecase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 2) + collect i)))))) (values)) ;;; Use of the generated files @@ -389,9 +456,17 @@ (values)) ;;; The stuff below is dependent on misc.lisp-expr being -;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2)) - -(defparameter *length* 186) +;;; (:LENGTH 206 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)). +;;; +;;; There are two entries for UPPERCASE and LOWERCASE because some +;;; characters have case (by Unicode standards) but are not +;;; transformable character-by-character in a locale-independet way +;;; (as CL requires for its standard operators). +;;; +;;; for more details on these debugging functions, see the description +;;; of the character database format in src/code/target-char.lisp + +(defparameter *length* 206) (defun cp-index (cp) (let* ((cp-high (cp-high cp))