explicit :BIG-ENDIAN feature
[sbcl.git] / tools-for-build / ucd.lisp
index eb21782..b1cae49 100644 (file)
 
 ;;; Generator
 
+(defstruct ucd misc transform)
+
 (defparameter *unicode-character-database*
   (make-pathname :directory (pathname-directory *load-truename*)))
 
 (defparameter *ucd-base* nil)
+(defparameter *unicode-names* (make-hash-table))
 
 (defparameter *last-uppercase* nil)
 (defparameter *uppercase-transition-count* 0)
           do (slurp-ucd-line line)))
   (second-pass)
   (build-misc-table)
+  (fixup-hangul-syllables)
   *decompositions*)
 
+(defun fixup-hangul-syllables ()
+  ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
+  (let* ((sbase #xac00)
+         (lbase #x1100)
+         (vbase #x1161)
+         (tbase #x11a7)
+         (scount 11172)
+         (lcount 19)
+         (vcount 21)
+         (tcount 28)
+         (ncount (* vcount tcount))
+         (table (make-hash-table)))
+    (with-open-file (*standard-input*
+                     (make-pathname :name "Jamo" :type "txt"
+                                    :defaults *unicode-character-database*))
+      (loop for line = (read-line nil nil)
+            while line
+            if (position #\; line)
+            do (add-jamo-information line table)))
+    (dotimes (sindex scount)
+      (let* ((l (+ lbase (floor sindex ncount)))
+             (v (+ vbase (floor (mod sindex ncount) tcount)))
+             (tee (+ tbase (mod sindex tcount)))
+             (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
+                           (gethash l table) (gethash v table)
+                           (= tee tbase) (gethash tee table))))
+        (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
+
+(defun add-jamo-information (line table)
+  (let* ((split (split-string line #\;))
+         (code (parse-integer (first split) :radix 16))
+         (syllable (string-trim '(#\Space)
+                                (subseq (second split) 0 (position #\# (second split))))))
+    (setf (gethash code table) syllable)))
+
 (defun split-string (line character)
   (loop for prev-position = 0 then (1+ position)
         for position = (position character line :start prev-position)
 
 (defparameter *block-first* nil)
 
+(defun normalize-character-name (name)
+  (when (find #\_ name)
+    (error "Bad name for a character: ~A" name))
+  (unless (or (zerop (length name)) (find #\< name) (find #\> name))
+    (substitute #\_ #\Space name)))
+
 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
 ;;;   D800  --  F8FF  : surrogates and private use
                                       decimal-digit digit bidi-mirrored
                                       cl-both-case-p)))
           (declare (ignore digit-index))
+          (when (and (not cl-both-case-p)
+                     (< gc-index 2))
+            (format t "~A~%" name))
           (incf *name-size* (length name))
           (when (string/= "" decomposition-type-and-mapping)
             (let ((split (split-string decomposition-type-and-mapping
              (setq *last-uppercase* nil)))
           (when (> ccc-index 255)
             (error "canonical combining class too large ~A" ccc-index))
-          (let ((result (vector misc-index (or upper-index lower-index 0))))
+          (let ((result (make-ucd :misc misc-index
+                                  :transform (or upper-index lower-index 0))))
             (when (and (> (length name) 7)
                        (string= ", Last>" name :start2 (- (length name) 7)))
               (let ((page-start (ash (+ *block-first*
                       do (setf (aref (aref *ucd-base* (cp-high point))
                                      (cp-low point))
                                result))))
-            result)))))
+            (values result (normalize-character-name name)))))))
 
 (defun slurp-ucd-line (line)
   (let* ((split-line (split-string line #\;))
       (setf (aref *ucd-base* code-high)
             (make-array (ash 1 *page-size-exponent*)
                         :initial-element nil)))
-    (setf (aref (aref *ucd-base* code-high) code-low)
-          (encode-ucd-line (cdr split-line) code-point))))
+    (multiple-value-bind (encoding name)
+        (encode-ucd-line (cdr split-line) code-point)
+      (setf (aref (aref *ucd-base* code-high) code-low) encoding
+            (gethash code-point *unicode-names*) name))))
 
 (defun second-pass ()
   (loop for i from 0 below (length *ucd-base*)
         do (loop for j from 0 below (length (aref *ucd-base* i))
                  for result = (aref (aref *ucd-base* i) j)
                  when result
-                 when (let* ((transform-point (aref result 1))
+                 when (let* ((transform-point (ucd-transform result))
                              (transform-high (ash transform-point
                                                   (- *page-size-exponent*)))
                              (transform-low (ldb (byte *page-size-exponent* 0)
                                                  transform-point)))
                         (and (plusp transform-point)
-                             (/= (aref (aref (aref *ucd-base* transform-high)
-                                             transform-low)
-                                       1)
+                             (/= (ucd-transform
+                                  (aref (aref *ucd-base* transform-high)
+                                        transform-low))
                                  (+ (ash i *page-size-exponent*) j))))
                  do (destructuring-bind (gc-index bidi-index ccc-index
                                          decimal-digit digit bidi-mirrored
                                          cl-both-case-p)
-                        (aref *misc-table* (aref result 0))
+                        (aref *misc-table* (ucd-misc result))
                       (declare (ignore cl-both-case-p))
                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
-                      (setf (aref result 0)
+                      (setf (ucd-misc result)
                             (hash-misc gc-index bidi-index ccc-index
                                        decimal-digit digit bidi-mirrored
                                        nil))))))
               do (write-byte 0 stream)
               do (write-byte 0 stream))
         (loop for page across *ucd-base*
-              do (write-byte (if page (gethash page hash) 0) stream))
+           do (write-byte (if page (gethash page hash) 0) stream))
         (loop for page across array
-              do (loop for entry across page
-                       do (write-byte (if entry
-                                          (aref *misc-mapping* (aref entry 0))
-                                          255)
-                                      stream)
-                       do (write-3-byte (if entry (aref entry 1) 0)
-                                        stream))))))
+           do (loop for entry across page
+                 do (write-byte (if entry
+                                    (aref *misc-mapping* (ucd-misc entry))
+                                    255)
+                                stream)
+                 do (write-3-byte (if entry (ucd-transform entry) 0)
+                                  stream))))))
+  (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
+                                    :defaults *output-directory*)
+                     :direction :output
+                     :if-exists :supersede
+                     :if-does-not-exist :create)
+    (with-standard-io-syntax
+      (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
+      (maphash (lambda (code name)
+                 (when name
+                  (print code f)
+                  (prin1 name f)))
+               *unicode-names*))
+    (setf *unicode-names* nil))
   (with-open-file (*standard-output*
                    (make-pathname :name "numerics"
                                   :type "lisp-expr"
                    :direction :output
                    :if-exists :supersede
                    :if-does-not-exist :create)
-    (let ((*print-pretty* t))
-      (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
-                     *different-numerics*))))
+    (with-standard-io-syntax
+      (let ((*print-pretty* t))
+        (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
+                       *different-numerics*)))))
   (with-open-file (*standard-output*
                    (make-pathname :name "titlecases"
                                   :type "lisp-expr"
                    :direction :output
                    :if-exists :supersede
                    :if-does-not-exist :create)
-    (let ((*print-pretty* t))
-      (prin1 *different-titlecases*)))
+    (with-standard-io-syntax
+      (let ((*print-pretty* t))
+        (prin1 *different-titlecases*))))
   (with-open-file (*standard-output*
                    (make-pathname :name "misc"
                                   :type "lisp-expr"
                    :direction :output
                    :if-exists :supersede
                    :if-does-not-exist :create)
-    (let ((*print-pretty* t))
-      (prin1 `(:length ,(length *misc-table*)
-               :uppercase ,(loop for (gc-index) across *misc-table*
-                                 for i from 0
-                                 when (= gc-index 0)
-                                 collect i)
-               :lowercase ,(loop for (gc-index) across *misc-table*
-                                 for i from 0
-                                 when (= gc-index 1)
-                                 collect i)
-               :titlecase ,(loop for (gc-index) across *misc-table*
-                                 for i from 0
-                                 when (= gc-index 2)
-                                 collect i)))))
+    (with-standard-io-syntax
+      (let ((*print-pretty* t))
+        (prin1 `(:length ,(length *misc-table*)
+                 :uppercase ,(loop for (gc-index) across *misc-table*
+                                for i from 0
+                                when (= gc-index 0)
+                                collect i)
+                 :lowercase ,(loop for (gc-index) across *misc-table*
+                                for i from 0
+                                when (= gc-index 1)
+                                collect i)
+                 :titlecase ,(loop for (gc-index) across *misc-table*
+                                for i from 0
+                                when (= gc-index 2)
+                                collect i))))))
   (values))
 
 ;;; Use of the generated files
   (values))
 
 ;;; The stuff below is dependent on misc.lisp-expr being
-;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
-
-(defparameter *length* 186)
+;;; (:LENGTH 215 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
+;;;
+;;; There are two entries for UPPERCASE and LOWERCASE because some
+;;; characters have case (by Unicode standards) but are not
+;;; transformable character-by-character in a locale-independent way
+;;; (as CL requires for its standard operators).
+;;;
+;;; for more details on these debugging functions, see the description
+;;; of the character database format in src/code/target-char.lisp
+
+(defparameter *length* 215)
 
 (defun cp-index (cp)
   (let* ((cp-high (cp-high cp))