X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=tools-for-build%2Fucd.lisp;h=1a470045ff012a323f9f979307a61cb7363c3a42;hb=d306e2d23b38487488eb93881dad836e439e0c77;hp=4ecfdbb40bc4fbe3de7e7291375629cad580ee1e;hpb=9434bfa27a999690d05a196aae45f54cb05e9526;p=sbcl.git diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index 4ecfdbb..1a47004 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -17,10 +17,13 @@ ;;; Generator +(defstruct ucd misc transform) + (defparameter *unicode-character-database* (make-pathname :directory (pathname-directory *load-truename*))) (defparameter *ucd-base* nil) +(defparameter *unicode-names* (make-hash-table)) (defparameter *last-uppercase* nil) (defparameter *uppercase-transition-count* 0) @@ -32,58 +35,87 @@ (defparameter *misc-table* nil) (defparameter *misc-mapping* nil) (defparameter *both-cases* nil) -(defparameter *decompositions* nil) -(defparameter *decomposition-length-max* nil) +(defparameter *long-decompositions* nil) (defparameter *decomposition-types* nil) (defparameter *decomposition-base* nil) (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored cl-both-case-p) + bidi-mirrored cl-both-case-p decomposition-info) (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored cl-both-case-p)) - (index (gethash list *misc-hash*))) + bidi-mirrored cl-both-case-p decomposition-info)) + (index (gethash list *misc-hash*))) (or index - (progn - (vector-push list *misc-table*) - (setf (gethash list *misc-hash*) - (incf *misc-index*)))))) + (progn + (vector-push list *misc-table*) + (setf (gethash list *misc-hash*) + (incf *misc-index*)))))) + +(defun gc-index-sort-key (gc-index) + (or (cdr (assoc gc-index '((1 . 2) (2 . 1)))) gc-index)) (defun compare-misc-entry (left right) (destructuring-bind (left-gc-index left-bidi-index left-ccc-index - left-decimal-digit left-digit left-bidi-mirrored - left-cl-both-case-p) + left-decimal-digit left-digit left-bidi-mirrored + left-cl-both-case-p left-decomposition-info) left (destructuring-bind (right-gc-index right-bidi-index right-ccc-index - right-decimal-digit right-digit right-bidi-mirrored - right-cl-both-case-p) - right + right-decimal-digit right-digit right-bidi-mirrored + right-cl-both-case-p right-decomposition-info) + right (or (and left-cl-both-case-p (not right-cl-both-case-p)) - (and (or left-cl-both-case-p (not right-cl-both-case-p)) - (or (< left-gc-index right-gc-index) - (and (= left-gc-index right-gc-index) - (or (< left-bidi-index right-bidi-index) - (and (= left-bidi-index right-bidi-index) - (or (< left-ccc-index right-ccc-index) - (and (= left-ccc-index right-ccc-index) - (or (string< left-decimal-digit - right-decimal-digit) - (and (string= left-decimal-digit - right-decimal-digit) - (or (string< left-digit right-digit) - (and (string= left-digit - right-digit) - (string< left-bidi-mirrored - right-bidi-mirrored)))))))))))))))) + (and (or left-cl-both-case-p (not right-cl-both-case-p)) + (or (< (gc-index-sort-key left-gc-index) + (gc-index-sort-key right-gc-index)) + (and (= left-gc-index right-gc-index) + (or (< left-decomposition-info right-decomposition-info) + (and (= left-decomposition-info right-decomposition-info) + (or (< left-bidi-index right-bidi-index) + (and (= left-bidi-index right-bidi-index) + (or (< left-ccc-index right-ccc-index) + (and (= left-ccc-index right-ccc-index) + (or (string< left-decimal-digit + right-decimal-digit) + (and (string= left-decimal-digit + right-decimal-digit) + (or (string< left-digit right-digit) + (and (string= left-digit + right-digit) + (string< left-bidi-mirrored + right-bidi-mirrored)))))))))))))))))) (defun build-misc-table () - (sort *misc-table* #'compare-misc-entry) + (let ((table (sort *misc-table* #'compare-misc-entry))) + ;; after sorting, insert at the end a special entry to handle + ;; unallocated characters. + (setf *misc-table* (make-array (1+ (length table)))) + (replace *misc-table* table) + (setf (aref *misc-table* (length table)) + ;; unallocated characters have a GC index of 31 (not + ;; colliding with any other GC), are not digits or decimal + ;; digits, aren't BOTH-CASE-P, don't decompose, and aren't + ;; interestingly bidi or combining. + '(31 0 0 "" "" "" nil 0))) (setq *misc-mapping* (make-array (1+ *misc-index*))) (loop for i from 0 to *misc-index* - do (setf (aref *misc-mapping* - (gethash (aref *misc-table* i) *misc-hash*)) - i))) + do (setf (aref *misc-mapping* + (gethash (aref *misc-table* i) *misc-hash*)) + i))) + +(defvar *comp-table*) + +(defvar *exclusions* + (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt" + :defaults *unicode-character-database*)) + (do ((line (read-line s nil nil) (read-line s nil nil)) + result) + ((null line) result) + (when (and (> (length line) 0) + (char/= (char line 0) #\#)) + (push (parse-integer line :end (position #\Space line) :radix 16) + result))))) (defun slurp-ucd () + (setf *comp-table* (make-hash-table :test 'equal)) (setq *last-uppercase* nil) (setq *uppercase-transition-count* 0) (setq *different-titlecases* nil) @@ -91,53 +123,121 @@ (setq *name-size* 0) (setq *misc-hash* (make-hash-table :test #'equal)) (setq *misc-index* -1) - (setq *misc-table* (make-array 256 :fill-pointer 0)) + (setq *misc-table* (make-array 2048 :fill-pointer 0)) (setq *both-cases* nil) - (setq *decompositions* 0) - (setq *decomposition-types* (make-hash-table :test #'equal)) - (setq *decomposition-length-max* 0) + (setq *long-decompositions* + (make-array 2048 :fill-pointer 0 :adjustable t)) + (setq *decomposition-types* + (let ((array (make-array 256 :initial-element nil :fill-pointer 1))) + (vector-push "" array) + (vector-push "" array) + array)) (setq *decomposition-base* (make-array (ash #x110000 - (- *page-size-exponent*)) - :initial-element nil)) + (- *page-size-exponent*)) + :initial-element nil)) (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*)) - :initial-element nil)) + :initial-element nil)) (with-open-file (*standard-input* - (make-pathname :name "UnicodeData" - :type "txt" - :defaults *unicode-character-database*) - :direction :input) + (make-pathname :name "UnicodeData" + :type "txt" + :defaults *unicode-character-database*) + :direction :input) (loop for line = (read-line nil nil) - while line - do (slurp-ucd-line line))) + while line + do (slurp-ucd-line line))) (second-pass) + (fixup-compositions) + (fixup-hangul-syllables) (build-misc-table) - *decompositions*) + (length *long-decompositions*)) + +(defun fixup-compositions () + (flet ((fixup (k v) + (let* ((cp (car k)) + (ucd (aref (aref *ucd-base* (cp-high cp)) (cp-low cp))) + (misc (aref *misc-table* (ucd-misc ucd))) + (ccc-index (third misc))) + ;; we can do everything in the first pass except for + ;; accounting for decompositions where the first + ;; character of the decomposition is not a starter. + (when (/= ccc-index 0) + (remhash k *comp-table*))))) + (maphash #'fixup *comp-table*))) + +(defun fixup-hangul-syllables () + ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12" + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (scount 11172) + (lcount 19) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (table (make-hash-table))) + (with-open-file (*standard-input* + (make-pathname :name "Jamo" :type "txt" + :defaults *unicode-character-database*)) + (loop for line = (read-line nil nil) + while line + if (position #\; line) + do (add-jamo-information line table))) + (dotimes (sindex scount) + (let* ((l (+ lbase (floor sindex ncount))) + (v (+ vbase (floor (mod sindex ncount) tcount))) + (tee (+ tbase (mod sindex tcount))) + (code-point (+ sbase sindex)) + (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]" + (gethash l table) (gethash v table) + (= tee tbase) (gethash tee table)))) + (setf (gethash code-point *unicode-names*) name) + (unless (aref *decomposition-base* (cp-high code-point)) + (setf (aref *decomposition-base* (cp-high code-point)) + (make-array (ash 1 *page-size-exponent*) + :initial-element nil))) + (setf (aref (aref *decomposition-base* (cp-high code-point)) + (cp-low code-point)) + (cons (if (= tee tbase) 2 3) 0)))))) + +(defun add-jamo-information (line table) + (let* ((split (split-string line #\;)) + (code (parse-integer (first split) :radix 16)) + (syllable (string-trim '(#\Space) + (subseq (second split) 0 (position #\# (second split)))))) + (setf (gethash code table) syllable))) (defun split-string (line character) (loop for prev-position = 0 then (1+ position) - for position = (position character line :start prev-position) - collect (subseq line prev-position position) - do (unless position - (loop-finish)))) + for position = (position character line :start prev-position) + collect (subseq line prev-position position) + do (unless position + (loop-finish)))) (defun init-indices (strings) (let ((hash (make-hash-table :test #'equal))) (loop for string in strings - for index from 0 - do (setf (gethash string hash) index)) + for index from 0 + do (setf (gethash string hash) index)) hash)) (defparameter *general-categories* (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc" - "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi" - "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs"))) + "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi" + "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs"))) (defparameter *bidi-classes* (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO" - "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS"))) + "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS"))) (defparameter *block-first* nil) +(defun normalize-character-name (name) + (when (find #\_ name) + (error "Bad name for a character: ~A" name)) + (unless (or (zerop (length name)) (find #\< name) (find #\> name)) + (substitute #\_ #\Space name))) + ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;; ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;; ;;; D800 -- F8FF : surrogates and private use @@ -146,230 +246,381 @@ ;;; 100000 -- 10FFFD: private use (defun encode-ucd-line (line code-point) (destructuring-bind (name general-category canonical-combining-class - bidi-class decomposition-type-and-mapping - decimal-digit digit numeric bidi-mirrored - unicode-1-name iso-10646-comment simple-uppercase - simple-lowercase simple-titlecase) + bidi-class decomposition-type-and-mapping + decimal-digit digit numeric bidi-mirrored + unicode-1-name iso-10646-comment simple-uppercase + simple-lowercase simple-titlecase) line (declare (ignore unicode-1-name iso-10646-comment)) (if (and (> (length name) 8) - (string= ", First>" name :start2 (- (length name) 8))) - (progn - (setq *block-first* code-point) - nil) - (let* ((gc-index (or (gethash general-category *general-categories*) - (error "unknown general category ~A" - general-category))) - (bidi-index (or (gethash bidi-class *bidi-classes*) - (error "unknown bidirectional class ~A" - bidi-class))) - (ccc-index (parse-integer canonical-combining-class)) - (digit-index (unless (string= "" decimal-digit) - (parse-integer decimal-digit))) - (upper-index (unless (string= "" simple-uppercase) - (parse-integer simple-uppercase :radix 16))) - (lower-index (unless (string= "" simple-lowercase) - (parse-integer simple-lowercase :radix 16))) - (title-index (unless (string= "" simple-titlecase) - (parse-integer simple-titlecase :radix 16))) - (cl-both-case-p - (not (null (or (and (= gc-index 0) lower-index) - (and (= gc-index 1) upper-index))))) - (misc-index (hash-misc gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - cl-both-case-p))) - (declare (ignore digit-index)) - (incf *name-size* (length name)) - (when (string/= "" decomposition-type-and-mapping) - (let ((split (split-string decomposition-type-and-mapping - #\Space))) - (when (char= #\< (aref (first split) 0)) - (setf (gethash (pop split) *decomposition-types*) t)) - (unless (aref *decomposition-base* (cp-high code-point)) - (setf (aref *decomposition-base* (cp-high code-point)) - (make-array (ash 1 *page-size-exponent*) - :initial-element nil))) - (setf (aref (aref *decomposition-base* (cp-high code-point)) - (cp-low code-point)) - (mapcar #'(lambda (string) - (parse-integer string :radix 16)) - split)) - (setq *decomposition-length-max* - (max *decomposition-length-max* (length split))) - (incf *decompositions* (length split)))) - (when (and (string/= "" simple-uppercase) - (string/= "" simple-lowercase)) - (push (list code-point upper-index lower-index) *both-cases*)) - (when (string/= simple-uppercase simple-titlecase) - (push (cons code-point title-index) *different-titlecases*)) - (when (string/= digit numeric) - (push (cons code-point numeric) *different-numerics*)) - (cond - ((= gc-index 8) - (unless *last-uppercase* - (incf *uppercase-transition-count*)) - (setq *last-uppercase* t)) - (t - (when *last-uppercase* - (incf *uppercase-transition-count*)) - (setq *last-uppercase* nil))) - (when (> ccc-index 255) - (error "canonical combining class too large ~A" ccc-index)) - (let ((result (vector misc-index (or upper-index lower-index 0)))) - (when (and (> (length name) 7) - (string= ", Last>" name :start2 (- (length name) 7))) - (let ((page-start (ash (+ *block-first* - (ash 1 *page-size-exponent*) - -1) - (- *page-size-exponent*))) - (page-end (ash code-point (- *page-size-exponent*)))) - (loop for point from *block-first* - below (ash page-start *page-size-exponent*) - do (setf (aref (aref *ucd-base* (cp-high point)) - (cp-low point)) - result)) - (loop for page from page-start below page-end - do (setf (aref *ucd-base* page) - (make-array (ash 1 *page-size-exponent*) - :initial-element result))) - (loop for point from (ash page-end *page-size-exponent*) - below code-point - do (setf (aref (aref *ucd-base* (cp-high point)) - (cp-low point)) - result)))) - result))))) + (string= ", First>" name :start2 (- (length name) 8))) + (progn + (setq *block-first* code-point) + nil) + (let* ((gc-index (or (gethash general-category *general-categories*) + (error "unknown general category ~A" + general-category))) + (bidi-index (or (gethash bidi-class *bidi-classes*) + (error "unknown bidirectional class ~A" + bidi-class))) + (ccc-index (parse-integer canonical-combining-class)) + (digit-index (unless (string= "" decimal-digit) + (parse-integer decimal-digit))) + (upper-index (unless (string= "" simple-uppercase) + (parse-integer simple-uppercase :radix 16))) + (lower-index (unless (string= "" simple-lowercase) + (parse-integer simple-lowercase :radix 16))) + (title-index (unless (string= "" simple-titlecase) + (parse-integer simple-titlecase :radix 16))) + (cl-both-case-p + (not (null (or (and (= gc-index 0) lower-index) + (and (= gc-index 1) upper-index) + ;; deal with prosgegrammeni / titlecase + (and (= gc-index 2) + (typep code-point '(integer #x1000 #x1fff)) + lower-index))))) + (decomposition-info 0)) + (declare (ignore digit-index)) + (when (and (not cl-both-case-p) + (< gc-index 2)) + (format t "~A~%" name)) + (incf *name-size* (length name)) + (when (string/= "" decomposition-type-and-mapping) + (let ((split (split-string decomposition-type-and-mapping #\Space))) + (cond + ((char= #\< (aref (first split) 0)) + (unless (position (first split) *decomposition-types* + :test #'equal) + (vector-push (first split) *decomposition-types*)) + (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal))) + (t (setf decomposition-info 1))) + (unless (aref *decomposition-base* (cp-high code-point)) + (setf (aref *decomposition-base* (cp-high code-point)) + (make-array (ash 1 *page-size-exponent*) + :initial-element nil))) + (setf (aref (aref *decomposition-base* (cp-high code-point)) + (cp-low code-point)) + (let ((decomposition + (mapcar #'(lambda (string) + (parse-integer string :radix 16)) + split))) + (when (= decomposition-info 1) + ;; Primary composition excludes: + ;; * singleton decompositions; + ;; * decompositions of non-starters; + ;; * script-specific decompositions; + ;; * later-version decompositions; + ;; * decompositions whose first character is a + ;; non-starter. + ;; All but the last case can be handled here; + ;; for the fixup, see FIXUP-COMPOSITIONS + (when (and (> (length decomposition) 1) + (= ccc-index 0) + (not (member code-point *exclusions*))) + (unless (= (length decomposition) 2) + (error "canonical decomposition unexpectedly long")) + (setf (gethash (cons (first decomposition) + (second decomposition)) + *comp-table*) + code-point))) + (if (= (length decomposition) 1) + (cons 1 (car decomposition)) + (cons (length decomposition) + (prog1 (fill-pointer *long-decompositions*) + (dolist (code decomposition) + (vector-push-extend code *long-decompositions*))))))))) + ;; Hangul decomposition; see Unicode 6.2 section 3-12 + (when (= code-point #xd7a3) + ;; KLUDGE: it's a bit ugly to do this here when we've got + ;; a reasonable function to do this in + ;; (FIXUP-HANGUL-SYLLABLES). The problem is that the + ;; fixup would be somewhat tedious to do, what with all + ;; the careful hashing of misc data going on. + (setf decomposition-info 1) + ;; the construction of *decomposition-base* entries is, + ;; however, easy to handle within FIXUP-HANGUL-SYLLABLES. + ) + (when (and (string/= "" simple-uppercase) + (string/= "" simple-lowercase)) + (push (list code-point upper-index lower-index) *both-cases*)) + (when (string/= simple-uppercase simple-titlecase) + (push (cons code-point title-index) *different-titlecases*)) + (when (string/= digit numeric) + (push (cons code-point numeric) *different-numerics*)) + (cond + ((= gc-index 8) + (unless *last-uppercase* + (incf *uppercase-transition-count*)) + (setq *last-uppercase* t)) + (t + (when *last-uppercase* + (incf *uppercase-transition-count*)) + (setq *last-uppercase* nil))) + (when (> ccc-index 255) + (error "canonical combining class too large ~A" ccc-index)) + (let* ((misc-index (hash-misc gc-index bidi-index ccc-index + decimal-digit digit bidi-mirrored + cl-both-case-p decomposition-info)) + (result (make-ucd :misc misc-index + :transform (or upper-index lower-index 0)))) + (when (and (> (length name) 7) + (string= ", Last>" name :start2 (- (length name) 7))) + (let ((page-start (ash (+ *block-first* + (ash 1 *page-size-exponent*) + -1) + (- *page-size-exponent*))) + (page-end (ash code-point (- *page-size-exponent*)))) + (loop for point from *block-first* + below (ash page-start *page-size-exponent*) + do (setf (aref (aref *ucd-base* (cp-high point)) + (cp-low point)) + result)) + (loop for page from page-start below page-end + do (setf (aref *ucd-base* page) + (make-array (ash 1 *page-size-exponent*) + :initial-element result))) + (loop for point from (ash page-end *page-size-exponent*) + below code-point + do (setf (aref (aref *ucd-base* (cp-high point)) + (cp-low point)) + result)))) + (values result (normalize-character-name name))))))) (defun slurp-ucd-line (line) (let* ((split-line (split-string line #\;)) - (code-point (parse-integer (first split-line) :radix 16)) - (code-high (ash code-point (- *page-size-exponent*))) - (code-low (ldb (byte *page-size-exponent* 0) code-point))) + (code-point (parse-integer (first split-line) :radix 16)) + (code-high (ash code-point (- *page-size-exponent*))) + (code-low (ldb (byte *page-size-exponent* 0) code-point))) (unless (aref *ucd-base* code-high) (setf (aref *ucd-base* code-high) - (make-array (ash 1 *page-size-exponent*) - :initial-element nil))) - (setf (aref (aref *ucd-base* code-high) code-low) - (encode-ucd-line (cdr split-line) code-point)))) - + (make-array (ash 1 *page-size-exponent*) + :initial-element nil))) + (multiple-value-bind (encoding name) + (encode-ucd-line (cdr split-line) code-point) + (setf (aref (aref *ucd-base* code-high) code-low) encoding + (gethash code-point *unicode-names*) name)))) + +;;; this fixes up the case conversion discrepancy between CL and +;;; Unicode: CL operators depend on char-downcase / char-upcase being +;;; inverses, which is not true in general in Unicode even for +;;; characters which change case to single characters. (defun second-pass () - (loop for i from 0 below (length *ucd-base*) - when (aref *ucd-base* i) - do (loop for j from 0 below (length (aref *ucd-base* i)) - for result = (aref (aref *ucd-base* i) j) - when result - when (let* ((transform-point (aref result 1)) - (transform-high (ash transform-point - (- *page-size-exponent*))) - (transform-low (ldb (byte *page-size-exponent* 0) - transform-point))) - (and (plusp transform-point) - (/= (aref (aref (aref *ucd-base* transform-high) - transform-low) - 1) - (+ (ash i *page-size-exponent*) j)))) - do (destructuring-bind (gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - cl-both-case-p) - (aref *misc-table* (aref result 0)) - (declare (ignore cl-both-case-p)) - (format t "~A~%" (+ (ash i *page-size-exponent*) j)) - (setf (aref result 0) - (hash-misc gc-index bidi-index ccc-index - decimal-digit digit bidi-mirrored - nil)))))) - -(defun write-3-byte (triplet stream) - (write-byte (ldb (byte 8 0) triplet) stream) - (write-byte (ldb (byte 8 8) triplet) stream) - (write-byte (ldb (byte 8 16) triplet) stream)) + (dotimes (i (length *ucd-base*)) + (let ((base (aref *ucd-base* i))) + (dotimes (j (length base)) ; base is NIL or an array + (let ((result (aref base j))) + (when result + ;; fixup case mappings for CL/Unicode mismatch + (let* ((transform-point (ucd-transform result)) + (transform-high (ash transform-point + (- *page-size-exponent*))) + (transform-low (ldb (byte *page-size-exponent* 0) + transform-point))) + (when (and (plusp transform-point) + (/= (ucd-transform + (aref (aref *ucd-base* transform-high) + transform-low)) + (+ (ash i *page-size-exponent*) j))) + (destructuring-bind (gc-index bidi-index ccc-index + decimal-digit digit bidi-mirrored + cl-both-case-p decomposition-info) + (aref *misc-table* (ucd-misc result)) + (declare (ignore cl-both-case-p)) + (format t "~A~%" (+ (ash i *page-size-exponent*) j)) + (setf (ucd-misc result) + (hash-misc gc-index bidi-index ccc-index + decimal-digit digit bidi-mirrored + nil decomposition-info))))))))))) + +(defun write-4-byte (quadruplet stream) + (write-byte (ldb (byte 8 24) quadruplet) stream) + (write-byte (ldb (byte 8 16) quadruplet) stream) + (write-byte (ldb (byte 8 8) quadruplet) stream) + (write-byte (ldb (byte 8 0) quadruplet) stream)) (defun digit-to-byte (digit) (if (string= "" digit) 255 (parse-integer digit))) -(defun output () +(defun output-ucd-data () (let ((hash (make-hash-table :test #'equalp)) - (index 0)) + (index 0)) (loop for page across *ucd-base* - do (when page - (unless (gethash page hash) - (setf (gethash page hash) - (incf index))))) + do (when page + (unless (gethash page hash) + (setf (gethash page hash) + (incf index))))) (let ((array (make-array (1+ index)))) (maphash #'(lambda (key value) - (setf (aref array value) key)) - hash) + (setf (aref array value) key)) + hash) (setf (aref array 0) - (make-array (ash 1 *page-size-exponent*) :initial-element nil)) + (make-array (ash 1 *page-size-exponent*) :initial-element nil)) (with-open-file (stream (make-pathname :name "ucd" - :type "dat" - :defaults *output-directory*) - :direction :output - :element-type '(unsigned-byte 8) - :if-exists :supersede - :if-does-not-exist :create) - (loop for (gc-index bidi-index ccc-index decimal-digit digit - bidi-mirrored) - across *misc-table* - do (write-byte gc-index stream) - do (write-byte bidi-index stream) - do (write-byte ccc-index stream) - do (write-byte (digit-to-byte decimal-digit) stream) - do (write-byte (digit-to-byte digit) stream) - do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream) - do (write-byte 0 stream) - do (write-byte 0 stream)) - (loop for page across *ucd-base* - do (write-byte (if page (gethash page hash) 0) stream)) - (loop for page across array - do (loop for entry across page - do (write-byte (if entry - (aref *misc-mapping* (aref entry 0)) - 255) - stream) - do (write-3-byte (if entry (aref entry 1) 0) - stream)))))) + :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede + :if-does-not-exist :create) + (loop for (gc-index bidi-index ccc-index decimal-digit digit + bidi-mirrored nil decomposition-info) + across *misc-table* + ;; three bits spare here + do (write-byte gc-index stream) + ;; three bits spare here + do (write-byte bidi-index stream) + do (write-byte ccc-index stream) + ;; we could save some space here: decimal-digit and + ;; digit are constrained (CHECKME) to be between 0 and + ;; 9, so we could encode the pair in a single byte. + ;; (Also, decimal-digit is equal to digit or undefined, + ;; so we could encode decimal-digit as a single bit, + ;; meaning that we could save 11 bits here. + do (write-byte (digit-to-byte decimal-digit) stream) + do (write-byte (digit-to-byte digit) stream) + ;; there's an easy 7 bits to spare here + do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream) + ;; at the moment we store information about which type + ;; of compatibility decomposition is used, costing c.3 + ;; bits. We could elide that. + do (write-byte decomposition-info stream) + do (write-byte 0 stream)) + (loop for page across *ucd-base* + do (write-byte (if page (gethash page hash) 0) stream)) + (loop for page across array + do (loop for entry across page + do (write-4-byte + (dpb (if entry + (aref *misc-mapping* (ucd-misc entry)) + ;; the last entry in *MISC-TABLE* (see + ;; BUILD-MISC-TABLE) is special, + ;; reserved for the information for + ;; characters unallocated by Unicode. + (1- (length *misc-table*))) + (byte 11 21) + (if entry (ucd-transform entry) 0)) + stream))))))) + +;;; KLUDGE: this code, to write out decomposition information, is a +;;; little bit very similar to the ucd entries above. Try factoring +;;; out the common stuff? +(defun output-decomposition-data () + (let ((hash (make-hash-table :test #'equalp)) + (index 0)) + (loop for page across *decomposition-base* + do (when page + (unless (gethash page hash) + (setf (gethash page hash) + (prog1 index (incf index)))))) + (let ((array (make-array index))) + (maphash #'(lambda (key value) + (setf (aref array value) key)) + hash) + (with-open-file (stream (make-pathname :name "decomp" :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede + :if-does-not-exist :create) + (loop for page across *decomposition-base* + do (write-byte (if page (gethash page hash) 0) stream)) + (loop for page across array + do (loop for entry across page + do (write-4-byte + (dpb (if entry (car entry) 0) + (byte 11 21) + (if entry (cdr entry) 0)) + stream)))) + (with-open-file (stream (make-pathname :name "ldecomp" :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede + :if-does-not-exist :create) + (loop for code across (copy-seq *long-decompositions*) + do (write-4-byte code stream)))))) + +(defun output-composition-data () + #+nil ; later + (let (firsts seconds) + (flet ((frob (k v) + (declare (ignore v)) + (pushnew (car k) firsts) + (pushnew (cdr k) seconds))) + (maphash #'frob *comp-table*))) + (with-open-file (stream (make-pathname :name "comp" :type "dat" + :defaults *output-directory*) + :direction :output + :element-type '(unsigned-byte 8) + :if-exists :supersede :if-does-not-exist :create) + (maphash (lambda (k v) + (write-4-byte (car k) stream) + (write-4-byte (cdr k) stream) + (write-4-byte v stream)) + *comp-table*))) + +(defun output () + (output-ucd-data) + (output-decomposition-data) + (output-composition-data) + (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr" + :defaults *output-directory*) + :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (with-standard-io-syntax + (write-string ";;; Do not edit by hand: generated by ucd.lisp" f) + (maphash (lambda (code name) + (when name + (print code f) + (prin1 name f))) + *unicode-names*)) + (setf *unicode-names* nil)) (with-open-file (*standard-output* - (make-pathname :name "numerics" - :type "lisp-expr" - :defaults *output-directory*) - :direction :output - :if-exists :supersede - :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x)))) - *different-numerics*)))) + (make-pathname :name "numerics" + :type "lisp-expr" + :defaults *output-directory*) + :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x)))) + *different-numerics*))))) (with-open-file (*standard-output* - (make-pathname :name "titlecases" - :type "lisp-expr" - :defaults *output-directory*) - :direction :output - :if-exists :supersede - :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 *different-titlecases*))) + (make-pathname :name "titlecases" + :type "lisp-expr" + :defaults *output-directory*) + :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 *different-titlecases*)))) (with-open-file (*standard-output* - (make-pathname :name "misc" - :type "lisp-expr" - :defaults *output-directory*) - :direction :output - :if-exists :supersede - :if-does-not-exist :create) - (let ((*print-pretty* t)) - (prin1 `(:length ,(length *misc-table*) - :uppercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 0) - collect i) - :lowercase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 1) - collect i) - :titlecase ,(loop for (gc-index) across *misc-table* - for i from 0 - when (= gc-index 2) - collect i))))) + (make-pathname :name "misc" + :type "lisp-expr" + :defaults *output-directory*) + :direction :output + :if-exists :supersede + :if-does-not-exist :create) + (with-standard-io-syntax + (let ((*print-pretty* t)) + (prin1 `(:length ,(length *misc-table*) + :uppercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 0) + collect i) + :lowercase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 1) + collect i) + :titlecase ,(loop for (gc-index) across *misc-table* + for i from 0 + when (= gc-index 2) + collect i)))))) (values)) ;;; Use of the generated files @@ -378,37 +629,49 @@ (defun read-compiled-ucd () (with-open-file (stream (make-pathname :name "ucd" - :type "dat" - :defaults *output-directory*) - :direction :input - :element-type '(unsigned-byte 8)) + :type "dat" + :defaults *output-directory*) + :direction :input + :element-type '(unsigned-byte 8)) (let ((length (file-length stream))) (setq *compiled-ucd* - (make-array length :element-type '(unsigned-byte 8))) + (make-array length :element-type '(unsigned-byte 8))) (read-sequence *compiled-ucd* stream))) (values)) ;;; The stuff below is dependent on misc.lisp-expr being -;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2)) - -(defparameter *length* 186) +;;; +;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17)) +;;; +;;; There are two groups of entries for UPPERCASE and LOWERCASE +;;; because some characters have case (by Unicode standards) but are +;;; not transformable character-by-character in a locale-independent +;;; way (as CL requires for its standard operators). +;;; +;;; for more details on these debugging functions, see the description +;;; of the character database format in src/code/target-char.lisp + +(defparameter *length* 395) (defun cp-index (cp) (let* ((cp-high (cp-high cp)) - (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high)))) + (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high)))) (+ (* 8 *length*) (ash #x110000 (- *page-size-exponent*)) (* (ash 4 *page-size-exponent*) page) (* 4 (cp-low cp))))) (defun cp-value-0 (cp) - (aref *compiled-ucd* (cp-index cp))) + (let ((index (cp-index cp))) + (dpb (aref *compiled-ucd* index) + (byte 8 3) + (ldb (byte 3 5) (aref *compiled-ucd* (1+ index)))))) (defun cp-value-1 (cp) (let ((index (cp-index cp))) - (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16) - (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8) - (aref *compiled-ucd* (1+ index)))))) + (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16) + (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8) + (aref *compiled-ucd* (+ index 3)))))) (defun cp-general-category (cp) (aref *compiled-ucd* (* 8 (cp-value-0 cp)))) @@ -416,7 +679,7 @@ (defun cp-decimal-digit (cp) (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp)))))) (and (< decimal-digit 10) - decimal-digit))) + decimal-digit))) (defun cp-alpha-char-p (cp) (< (cp-general-category cp) 5)) @@ -424,14 +687,14 @@ (defun cp-alphanumericp (cp) (let ((gc (cp-general-category cp))) (or (< gc 5) - (= gc 12)))) + (= gc 12)))) (defun cp-digit-char-p (cp &optional (radix 10)) (let ((number (or (cp-decimal-digit cp) - (and (<= 65 cp 90) - (- cp 55)) - (and (<= 97 cp 122) - (- cp 87))))) + (and (<= 65 cp 90) + (- cp 55)) + (and (<= 97 cp 122) + (- cp 87))))) (when (and number (< number radix)) number))) @@ -440,20 +703,20 @@ (<= 160 cp))) (defun cp-char-upcase (cp) - (if (= (cp-value-0 cp) 1) + (if (< 3 (cp-value-0 cp) 8) (cp-value-1 cp) cp)) (defun cp-char-downcase (cp) - (if (= (cp-value-0 cp) 0) + (if (< (cp-value-0 cp) 4) (cp-value-1 cp) cp)) (defun cp-upper-case-p (cp) - (= (cp-value-0 cp) 0)) + (< (cp-value-0 cp) 4)) (defun cp-lower-case-p (cp) - (= (cp-value-0 cp) 1)) + (< 3 (cp-value-0 cp) 8)) (defun cp-both-case-p (cp) - (< (cp-value-0 cp) 2)) + (< (cp-value-0 cp) 8))