X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=tools-for-build%2Fucd.lisp;h=ddc49b0f0b1aa7f4a0f989740f206805cf46b023;hb=4aad92d800164f49530ad5f2bb07d81f61e2911b;hp=599fda87849f4efa97de5307ee52314da2929f6f;hpb=ba1d157b8797b9c0e66b457221d4c2fbbd0261fb;p=sbcl.git diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index 599fda8..ddc49b0 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -35,15 +35,14 @@ (defparameter *misc-table* nil) (defparameter *misc-mapping* nil) (defparameter *both-cases* nil) -(defparameter *decompositions* nil) -(defparameter *decomposition-length-max* nil) +(defparameter *long-decompositions* nil) (defparameter *decomposition-types* nil) (defparameter *decomposition-base* nil) (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored cl-both-case-p) + bidi-mirrored cl-both-case-p decomposition-info) (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored cl-both-case-p)) + bidi-mirrored cl-both-case-p decomposition-info)) (index (gethash list *misc-hash*))) (or index (progn @@ -54,37 +53,39 @@ (defun compare-misc-entry (left right) (destructuring-bind (left-gc-index left-bidi-index left-ccc-index left-decimal-digit left-digit left-bidi-mirrored - left-cl-both-case-p) + left-cl-both-case-p left-decomposition-info) left (destructuring-bind (right-gc-index right-bidi-index right-ccc-index right-decimal-digit right-digit right-bidi-mirrored - right-cl-both-case-p) + right-cl-both-case-p right-decomposition-info) right (or (and left-cl-both-case-p (not right-cl-both-case-p)) (and (or left-cl-both-case-p (not right-cl-both-case-p)) (or (< left-gc-index right-gc-index) (and (= left-gc-index right-gc-index) - (or (< left-bidi-index right-bidi-index) - (and (= left-bidi-index right-bidi-index) - (or (< left-ccc-index right-ccc-index) - (and (= left-ccc-index right-ccc-index) - (or (string< left-decimal-digit - right-decimal-digit) - (and (string= left-decimal-digit - right-decimal-digit) - (or (string< left-digit right-digit) - (and (string= left-digit - right-digit) - (string< left-bidi-mirrored - right-bidi-mirrored)))))))))))))))) + (or (< left-decomposition-info right-decomposition-info) + (and (= left-decomposition-info right-decomposition-info) + (or (< left-bidi-index right-bidi-index) + (and (= left-bidi-index right-bidi-index) + (or (< left-ccc-index right-ccc-index) + (and (= left-ccc-index right-ccc-index) + (or (string< left-decimal-digit + right-decimal-digit) + (and (string= left-decimal-digit + right-decimal-digit) + (or (string< left-digit right-digit) + (and (string= left-digit + right-digit) + (string< left-bidi-mirrored + right-bidi-mirrored)))))))))))))))))) (defun build-misc-table () (sort *misc-table* #'compare-misc-entry) (setq *misc-mapping* (make-array (1+ *misc-index*))) (loop for i from 0 to *misc-index* - do (setf (aref *misc-mapping* - (gethash (aref *misc-table* i) *misc-hash*)) - i))) + do (setf (aref *misc-mapping* + (gethash (aref *misc-table* i) *misc-hash*)) + i))) (defun slurp-ucd () (setq *last-uppercase* nil) @@ -94,11 +95,15 @@ (setq *name-size* 0) (setq *misc-hash* (make-hash-table :test #'equal)) (setq *misc-index* -1) - (setq *misc-table* (make-array 256 :fill-pointer 0)) + (setq *misc-table* (make-array 2048 :fill-pointer 0)) (setq *both-cases* nil) - (setq *decompositions* 0) - (setq *decomposition-types* (make-hash-table :test #'equal)) - (setq *decomposition-length-max* 0) + (setq *long-decompositions* + (make-array 2048 :fill-pointer 0 :adjustable t)) + (setq *decomposition-types* + (let ((array (make-array 256 :initial-element nil :fill-pointer 1))) + (vector-push "" array) + (vector-push "" array) + array)) (setq *decomposition-base* (make-array (ash #x110000 (- *page-size-exponent*)) :initial-element nil)) @@ -113,21 +118,65 @@ while line do (slurp-ucd-line line))) (second-pass) + (fixup-hangul-syllables) (build-misc-table) - *decompositions*) + (length *long-decompositions*)) + +(defun fixup-hangul-syllables () + ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12" + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (scount 11172) + (lcount 19) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (table (make-hash-table))) + (with-open-file (*standard-input* + (make-pathname :name "Jamo" :type "txt" + :defaults *unicode-character-database*)) + (loop for line = (read-line nil nil) + while line + if (position #\; line) + do (add-jamo-information line table))) + (dotimes (sindex scount) + (let* ((l (+ lbase (floor sindex ncount))) + (v (+ vbase (floor (mod sindex ncount) tcount))) + (tee (+ tbase (mod sindex tcount))) + (code-point (+ sbase sindex)) + (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]" + (gethash l table) (gethash v table) + (= tee tbase) (gethash tee table)))) + (setf (gethash code-point *unicode-names*) name) + (unless (aref *decomposition-base* (cp-high code-point)) + (setf (aref *decomposition-base* (cp-high code-point)) + (make-array (ash 1 *page-size-exponent*) + :initial-element nil))) + (setf (aref (aref *decomposition-base* (cp-high code-point)) + (cp-low code-point)) + (cons (if (= tee tbase) 2 3) 0)))))) + +(defun add-jamo-information (line table) + (let* ((split (split-string line #\;)) + (code (parse-integer (first split) :radix 16)) + (syllable (string-trim '(#\Space) + (subseq (second split) 0 (position #\# (second split)))))) + (setf (gethash code table) syllable))) (defun split-string (line character) (loop for prev-position = 0 then (1+ position) - for position = (position character line :start prev-position) - collect (subseq line prev-position position) - do (unless position - (loop-finish)))) + for position = (position character line :start prev-position) + collect (subseq line prev-position position) + do (unless position + (loop-finish)))) (defun init-indices (strings) (let ((hash (make-hash-table :test #'equal))) (loop for string in strings - for index from 0 - do (setf (gethash string hash) index)) + for index from 0 + do (setf (gethash string hash) index)) hash)) (defparameter *general-categories* @@ -184,31 +233,48 @@ (cl-both-case-p (not (null (or (and (= gc-index 0) lower-index) (and (= gc-index 1) upper-index))))) - (misc-index (hash-misc gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - cl-both-case-p))) + (decomposition-info 0)) (declare (ignore digit-index)) (when (and (not cl-both-case-p) (< gc-index 2)) (format t "~A~%" name)) (incf *name-size* (length name)) (when (string/= "" decomposition-type-and-mapping) - (let ((split (split-string decomposition-type-and-mapping - #\Space))) - (when (char= #\< (aref (first split) 0)) - (setf (gethash (pop split) *decomposition-types*) t)) + (let ((split (split-string decomposition-type-and-mapping #\Space))) + (cond + ((char= #\< (aref (first split) 0)) + (unless (position (first split) *decomposition-types* + :test #'equal) + (vector-push (first split) *decomposition-types*)) + (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal))) + (t (setf decomposition-info 1))) (unless (aref *decomposition-base* (cp-high code-point)) (setf (aref *decomposition-base* (cp-high code-point)) (make-array (ash 1 *page-size-exponent*) :initial-element nil))) (setf (aref (aref *decomposition-base* (cp-high code-point)) (cp-low code-point)) - (mapcar #'(lambda (string) - (parse-integer string :radix 16)) - split)) - (setq *decomposition-length-max* - (max *decomposition-length-max* (length split))) - (incf *decompositions* (length split)))) + (let ((decomposition + (mapcar #'(lambda (string) + (parse-integer string :radix 16)) + split))) + (if (= (length decomposition) 1) + (cons 1 (car decomposition)) + (cons (length decomposition) + (prog1 (fill-pointer *long-decompositions*) + (dolist (code decomposition) + (vector-push-extend code *long-decompositions*))))))))) + ;; Hangul decomposition; see Unicode 6.2 section 3-12 + (when (= code-point #xd7a3) + ;; KLUDGE: it's a bit ugly to do this here when we've got + ;; a reasonable function to do this in + ;; (FIXUP-HANGUL-SYLLABLES). The problem is that the + ;; fixup would be somewhat tedious to do, what with all + ;; the careful hashing of misc data going on. + (setf decomposition-info 1) + ;; the construction of *decomposition-base* entries is, + ;; however, easy to handle within FIXUP-HANGUL-SYLLABLES. + ) (when (and (string/= "" simple-uppercase) (string/= "" simple-lowercase)) (push (list code-point upper-index lower-index) *both-cases*)) @@ -227,8 +293,11 @@ (setq *last-uppercase* nil))) (when (> ccc-index 255) (error "canonical combining class too large ~A" ccc-index)) - (let ((result (make-ucd :misc misc-index - :transform (or upper-index lower-index 0)))) + (let* ((misc-index (hash-misc gc-index bidi-index ccc-index + decimal-digit digit bidi-mirrored + cl-both-case-p decomposition-info)) + (result (make-ucd :misc misc-index + :transform (or upper-index lower-index 0)))) (when (and (> (length name) 7) (string= ", Last>" name :start2 (- (length name) 7))) (let ((page-start (ash (+ *block-first* @@ -266,6 +335,10 @@ (setf (aref (aref *ucd-base* code-high) code-low) encoding (gethash code-point *unicode-names*) name)))) +;;; this fixes up the case conversion discrepancy between CL and +;;; Unicode: CL operators depend on char-downcase / char-upcase being +;;; inverses, which is not true in general in Unicode even for +;;; characters which change case to single characters. (defun second-pass () (loop for i from 0 below (length *ucd-base*) when (aref *ucd-base* i) @@ -284,19 +357,20 @@ (+ (ash i *page-size-exponent*) j)))) do (destructuring-bind (gc-index bidi-index ccc-index decimal-digit digit bidi-mirrored - cl-both-case-p) + cl-both-case-p decomposition-info) (aref *misc-table* (ucd-misc result)) (declare (ignore cl-both-case-p)) (format t "~A~%" (+ (ash i *page-size-exponent*) j)) (setf (ucd-misc result) (hash-misc gc-index bidi-index ccc-index decimal-digit digit bidi-mirrored - nil)))))) + nil decomposition-info)))))) -(defun write-3-byte (triplet stream) - (write-byte (ldb (byte 8 0) triplet) stream) - (write-byte (ldb (byte 8 8) triplet) stream) - (write-byte (ldb (byte 8 16) triplet) stream)) +(defun write-4-byte (quadruplet stream) + (write-byte (ldb (byte 8 24) quadruplet) stream) + (write-byte (ldb (byte 8 16) quadruplet) stream) + (write-byte (ldb (byte 8 8) quadruplet) stream) + (write-byte (ldb (byte 8 0) quadruplet) stream)) (defun digit-to-byte (digit) (if (string= "" digit) @@ -325,26 +399,74 @@ :if-exists :supersede :if-does-not-exist :create) (loop for (gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored) + bidi-mirrored nil decomposition-info) across *misc-table* + ;; three bits spare here do (write-byte gc-index stream) + ;; three bits spare here do (write-byte bidi-index stream) do (write-byte ccc-index stream) + ;; we could save some space here: decimal-digit and + ;; digit are constrained (CHECKME) to be between 0 and + ;; 9, so we could encode the pair in a single byte. + ;; (Also, decimal-digit is equal to digit or undefined, + ;; so we could encode decimal-digit as a single bit, + ;; meaning that we could save 11 bits here. do (write-byte (digit-to-byte decimal-digit) stream) do (write-byte (digit-to-byte digit) stream) + ;; there's an easy 7 bits to spare here do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream) - do (write-byte 0 stream) + ;; at the moment we store information about which type + ;; of compatibility decomposition is used, costing c.3 + ;; bits. We could elide that. + do (write-byte decomposition-info stream) do (write-byte 0 stream)) (loop for page across *ucd-base* do (write-byte (if page (gethash page hash) 0) stream)) (loop for page across array do (loop for entry across page - do (write-byte (if entry - (aref *misc-mapping* (ucd-misc entry)) - 255) - stream) - do (write-3-byte (if entry (ucd-transform entry) 0) - stream)))))) + do (write-4-byte + (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff) + (byte 11 21) + (if entry (ucd-transform entry) 0)) + stream)))))) + ;; KLUDGE: this code, to write out decomposition information, is a + ;; little bit very similar to the ucd entries above. Try factoring + ;; out the common stuff? + (let ((hash (make-hash-table :test #'equalp)) + (index 0)) + (loop for page across *decomposition-base* + do (when page + (unless (gethash page hash) + (setf (gethash page hash) + (prog1 index (incf index)))))) + (let ((array (make-array index))) + (maphash #'(lambda (key value) + (setf (aref array value) key)) + hash) + (with-open-file (stream (make-pathname :name "decomp" :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede + :if-does-not-exist :create) + (loop for page across *decomposition-base* + do (write-byte (if page (gethash page hash) 0) stream)) + (loop for page across array + do (loop for entry across page + do (write-4-byte + (dpb (if entry (car entry) 0) + (byte 11 21) + (if entry (cdr entry) 0)) + stream)))) + (with-open-file (stream (make-pathname :name "ldecomp" :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede + :if-does-not-exist :create) + (loop for code across (copy-seq *long-decompositions*) + do (write-4-byte code stream))))) (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr" :defaults *output-directory*) :direction :output @@ -420,17 +542,18 @@ (values)) ;;; The stuff below is dependent on misc.lisp-expr being -;;; (:LENGTH 206 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)). ;;; -;;; There are two entries for UPPERCASE and LOWERCASE because some -;;; characters have case (by Unicode standards) but are not -;;; transformable character-by-character in a locale-independet way -;;; (as CL requires for its standard operators). +;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17)) +;;; +;;; There are two groups of entries for UPPERCASE and LOWERCASE +;;; because some characters have case (by Unicode standards) but are +;;; not transformable character-by-character in a locale-independent +;;; way (as CL requires for its standard operators). ;;; ;;; for more details on these debugging functions, see the description ;;; of the character database format in src/code/target-char.lisp -(defparameter *length* 206) +(defparameter *length* 395) (defun cp-index (cp) (let* ((cp-high (cp-high cp)) @@ -441,13 +564,16 @@ (* 4 (cp-low cp))))) (defun cp-value-0 (cp) - (aref *compiled-ucd* (cp-index cp))) + (let ((index (cp-index cp))) + (dpb (aref *compiled-ucd* index) + (byte 8 3) + (ldb (byte 3 5) (aref *compiled-ucd* (1+ index)))))) (defun cp-value-1 (cp) (let ((index (cp-index cp))) - (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16) + (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16) (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8) - (aref *compiled-ucd* (1+ index)))))) + (aref *compiled-ucd* (+ index 3)))))) (defun cp-general-category (cp) (aref *compiled-ucd* (* 8 (cp-value-0 cp)))) @@ -479,20 +605,20 @@ (<= 160 cp))) (defun cp-char-upcase (cp) - (if (= (cp-value-0 cp) 1) + (if (< 3 (cp-value-0 cp) 8) (cp-value-1 cp) cp)) (defun cp-char-downcase (cp) - (if (= (cp-value-0 cp) 0) + (if (< (cp-value-0 cp) 4) (cp-value-1 cp) cp)) (defun cp-upper-case-p (cp) - (= (cp-value-0 cp) 0)) + (< (cp-value-0 cp) 4)) (defun cp-lower-case-p (cp) - (= (cp-value-0 cp) 1)) + (< 3 (cp-value-0 cp) 8)) (defun cp-both-case-p (cp) - (< (cp-value-0 cp) 2)) + (< (cp-value-0 cp) 8))