X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=tools-for-build%2Fucd.lisp;h=1a470045ff012a323f9f979307a61cb7363c3a42;hb=979539d20a27f4315db9e1bde0a4413135cf8603;hp=ddc49b0f0b1aa7f4a0f989740f206805cf46b023;hpb=4aad92d800164f49530ad5f2bb07d81f61e2911b;p=sbcl.git diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index ddc49b0..1a47004 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -50,6 +50,9 @@ (setf (gethash list *misc-hash*) (incf *misc-index*)))))) +(defun gc-index-sort-key (gc-index) + (or (cdr (assoc gc-index '((1 . 2) (2 . 1)))) gc-index)) + (defun compare-misc-entry (left right) (destructuring-bind (left-gc-index left-bidi-index left-ccc-index left-decimal-digit left-digit left-bidi-mirrored @@ -61,7 +64,8 @@ right (or (and left-cl-both-case-p (not right-cl-both-case-p)) (and (or left-cl-both-case-p (not right-cl-both-case-p)) - (or (< left-gc-index right-gc-index) + (or (< (gc-index-sort-key left-gc-index) + (gc-index-sort-key right-gc-index)) (and (= left-gc-index right-gc-index) (or (< left-decomposition-info right-decomposition-info) (and (= left-decomposition-info right-decomposition-info) @@ -80,14 +84,38 @@ right-bidi-mirrored)))))))))))))))))) (defun build-misc-table () - (sort *misc-table* #'compare-misc-entry) + (let ((table (sort *misc-table* #'compare-misc-entry))) + ;; after sorting, insert at the end a special entry to handle + ;; unallocated characters. + (setf *misc-table* (make-array (1+ (length table)))) + (replace *misc-table* table) + (setf (aref *misc-table* (length table)) + ;; unallocated characters have a GC index of 31 (not + ;; colliding with any other GC), are not digits or decimal + ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't + ;; interestingly bidi or combining. + '(31 0 0 "" "" "" nil 0))) (setq *misc-mapping* (make-array (1+ *misc-index*))) (loop for i from 0 to *misc-index* do (setf (aref *misc-mapping* (gethash (aref *misc-table* i) *misc-hash*)) i))) +(defvar *comp-table*) + +(defvar *exclusions* + (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt" + :defaults *unicode-character-database*)) + (do ((line (read-line s nil nil) (read-line s nil nil)) + result) + ((null line) result) + (when (and (> (length line) 0) + (char/= (char line 0) #\#)) + (push (parse-integer line :end (position #\Space line) :radix 16) + result))))) + (defun slurp-ucd () + (setf *comp-table* (make-hash-table :test 'equal)) (setq *last-uppercase* nil) (setq *uppercase-transition-count* 0) (setq *different-titlecases* nil) @@ -118,10 +146,24 @@ while line do (slurp-ucd-line line))) (second-pass) + (fixup-compositions) (fixup-hangul-syllables) (build-misc-table) (length *long-decompositions*)) +(defun fixup-compositions () + (flet ((fixup (k v) + (let* ((cp (car k)) + (ucd (aref (aref *ucd-base* (cp-high cp)) (cp-low cp))) + (misc (aref *misc-table* (ucd-misc ucd))) + (ccc-index (third misc))) + ;; we can do everything in the first pass except for + ;; accounting for decompositions where the first + ;; character of the decomposition is not a starter. + (when (/= ccc-index 0) + (remhash k *comp-table*))))) + (maphash #'fixup *comp-table*))) + (defun fixup-hangul-syllables () ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12" (let* ((sbase #xac00) @@ -232,7 +274,11 @@ (parse-integer simple-titlecase :radix 16))) (cl-both-case-p (not (null (or (and (= gc-index 0) lower-index) - (and (= gc-index 1) upper-index))))) + (and (= gc-index 1) upper-index) + ;; deal with prosgegrammeni / titlecase + (and (= gc-index 2) + (typep code-point '(integer #x1000 #x1fff)) + lower-index))))) (decomposition-info 0)) (declare (ignore digit-index)) (when (and (not cl-both-case-p) @@ -258,6 +304,25 @@ (mapcar #'(lambda (string) (parse-integer string :radix 16)) split))) + (when (= decomposition-info 1) + ;; Primary composition excludes: + ;; * singleton decompositions; + ;; * decompositions of non-starters; + ;; * script-specific decompositions; + ;; * later-version decompositions; + ;; * decompositions whose first character is a + ;; non-starter. + ;; All but the last case can be handled here; + ;; for the fixup, see FIXUP-COMPOSITIONS + (when (and (> (length decomposition) 1) + (= ccc-index 0) + (not (member code-point *exclusions*))) + (unless (= (length decomposition) 2) + (error "canonical decomposition unexpectedly long")) + (setf (gethash (cons (first decomposition) + (second decomposition)) + *comp-table*) + code-point))) (if (= (length decomposition) 1) (cons 1 (car decomposition)) (cons (length decomposition) @@ -306,19 +371,19 @@ (- *page-size-exponent*))) (page-end (ash code-point (- *page-size-exponent*)))) (loop for point from *block-first* - below (ash page-start *page-size-exponent*) - do (setf (aref (aref *ucd-base* (cp-high point)) - (cp-low point)) - result)) + below (ash page-start *page-size-exponent*) + do (setf (aref (aref *ucd-base* (cp-high point)) + (cp-low point)) + result)) (loop for page from page-start below page-end - do (setf (aref *ucd-base* page) - (make-array (ash 1 *page-size-exponent*) - :initial-element result))) + do (setf (aref *ucd-base* page) + (make-array (ash 1 *page-size-exponent*) + :initial-element result))) (loop for point from (ash page-end *page-size-exponent*) - below code-point - do (setf (aref (aref *ucd-base* (cp-high point)) - (cp-low point)) - result)))) + below code-point + do (setf (aref (aref *ucd-base* (cp-high point)) + (cp-low point)) + result)))) (values result (normalize-character-name name))))))) (defun slurp-ucd-line (line) @@ -340,31 +405,32 @@ ;;; inverses, which is not true in general in Unicode even for ;;; characters which change case to single characters. (defun second-pass () - (loop for i from 0 below (length *ucd-base*) - when (aref *ucd-base* i) - do (loop for j from 0 below (length (aref *ucd-base* i)) - for result = (aref (aref *ucd-base* i) j) - when result - when (let* ((transform-point (ucd-transform result)) - (transform-high (ash transform-point - (- *page-size-exponent*))) - (transform-low (ldb (byte *page-size-exponent* 0) - transform-point))) - (and (plusp transform-point) - (/= (ucd-transform - (aref (aref *ucd-base* transform-high) - transform-low)) - (+ (ash i *page-size-exponent*) j)))) - do (destructuring-bind (gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - cl-both-case-p decomposition-info) + (dotimes (i (length *ucd-base*)) + (let ((base (aref *ucd-base* i))) + (dotimes (j (length base)) ; base is NIL or an array + (let ((result (aref base j))) + (when result + ;; fixup case mappings for CL/Unicode mismatch + (let* ((transform-point (ucd-transform result)) + (transform-high (ash transform-point + (- *page-size-exponent*))) + (transform-low (ldb (byte *page-size-exponent* 0) + transform-point))) + (when (and (plusp transform-point) + (/= (ucd-transform + (aref (aref *ucd-base* transform-high) + transform-low)) + (+ (ash i *page-size-exponent*) j))) + (destructuring-bind (gc-index bidi-index ccc-index + decimal-digit digit bidi-mirrored + cl-both-case-p decomposition-info) (aref *misc-table* (ucd-misc result)) (declare (ignore cl-both-case-p)) (format t "~A~%" (+ (ash i *page-size-exponent*) j)) (setf (ucd-misc result) (hash-misc gc-index bidi-index ccc-index decimal-digit digit bidi-mirrored - nil decomposition-info)))))) + nil decomposition-info))))))))))) (defun write-4-byte (quadruplet stream) (write-byte (ldb (byte 8 24) quadruplet) stream) @@ -377,7 +443,7 @@ 255 (parse-integer digit))) -(defun output () +(defun output-ucd-data () (let ((hash (make-hash-table :test #'equalp)) (index 0)) (loop for page across *ucd-base* @@ -426,20 +492,28 @@ (loop for page across array do (loop for entry across page do (write-4-byte - (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff) + (dpb (if entry + (aref *misc-mapping* (ucd-misc entry)) + ;; the last entry in *MISC-TABLE* (see + ;; BUILD-MISC-TABLE) is special, + ;; reserved for the information for + ;; characters unallocated by Unicode. + (1- (length *misc-table*))) (byte 11 21) (if entry (ucd-transform entry) 0)) - stream)))))) - ;; KLUDGE: this code, to write out decomposition information, is a - ;; little bit very similar to the ucd entries above. Try factoring - ;; out the common stuff? + stream))))))) + +;;; KLUDGE: this code, to write out decomposition information, is a +;;; little bit very similar to the ucd entries above. Try factoring +;;; out the common stuff? +(defun output-decomposition-data () (let ((hash (make-hash-table :test #'equalp)) (index 0)) (loop for page across *decomposition-base* - do (when page - (unless (gethash page hash) - (setf (gethash page hash) - (prog1 index (incf index)))))) + do (when page + (unless (gethash page hash) + (setf (gethash page hash) + (prog1 index (incf index)))))) (let ((array (make-array index))) (maphash #'(lambda (key value) (setf (aref array value) key)) @@ -466,7 +540,31 @@ :if-exists :supersede :if-does-not-exist :create) (loop for code across (copy-seq *long-decompositions*) - do (write-4-byte code stream))))) + do (write-4-byte code stream)))))) + +(defun output-composition-data () + #+nil ; later + (let (firsts seconds) + (flet ((frob (k v) + (declare (ignore v)) + (pushnew (car k) firsts) + (pushnew (cdr k) seconds))) + (maphash #'frob *comp-table*))) + (with-open-file (stream (make-pathname :name "comp" :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede :if-does-not-exist :create) + (maphash (lambda (k v) + (write-4-byte (car k) stream) + (write-4-byte (cdr k) stream) + (write-4-byte v stream)) + *comp-table*))) + +(defun output () + (output-ucd-data) + (output-decomposition-data) + (output-composition-data) (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr" :defaults *output-directory*) :direction :output @@ -476,8 +574,8 @@ (write-string ";;; Do not edit by hand: generated by ucd.lisp" f) (maphash (lambda (code name) (when name - (print code f) - (prin1 name f))) + (print code f) + (prin1 name f))) *unicode-names*)) (setf *unicode-names* nil)) (with-open-file (*standard-output* @@ -511,18 +609,18 @@ (with-standard-io-syntax (let ((*print-pretty* t)) (prin1 `(:length ,(length *misc-table*) - :uppercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 0) - collect i) - :lowercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 1) - collect i) - :titlecase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 2) - collect i)))))) + :uppercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 0) + collect i) + :lowercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 1) + collect i) + :titlecase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 2) + collect i)))))) (values)) ;;; Use of the generated files