0.9.18.22:
[sbcl.git] / tools-for-build / ucd.lisp
index eb21782..8ddcb67 100644 (file)
 
 ;;; Generator
 
+(defstruct ucd misc transform)
+
 (defparameter *unicode-character-database*
   (make-pathname :directory (pathname-directory *load-truename*)))
 
 (defparameter *ucd-base* nil)
+(defparameter *unicode-names* (make-hash-table))
 
 (defparameter *last-uppercase* nil)
 (defparameter *uppercase-transition-count* 0)
 
 (defparameter *block-first* nil)
 
+(defun normalize-character-name (name)
+  (when (find #\_ name)
+    (error "Bad name for a character: ~A" name))
+  (unless (or (zerop (length name)) (find #\< name) (find #\> name))
+    (substitute #\_ #\Space name)))
+
 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
 ;;;   D800  --  F8FF  : surrogates and private use
              (setq *last-uppercase* nil)))
           (when (> ccc-index 255)
             (error "canonical combining class too large ~A" ccc-index))
-          (let ((result (vector misc-index (or upper-index lower-index 0))))
+          (let ((result (make-ucd :misc misc-index
+                                  :transform (or upper-index lower-index 0))))
             (when (and (> (length name) 7)
                        (string= ", Last>" name :start2 (- (length name) 7)))
               (let ((page-start (ash (+ *block-first*
                       do (setf (aref (aref *ucd-base* (cp-high point))
                                      (cp-low point))
                                result))))
-            result)))))
+            (values result (normalize-character-name name)))))))
 
 (defun slurp-ucd-line (line)
   (let* ((split-line (split-string line #\;))
       (setf (aref *ucd-base* code-high)
             (make-array (ash 1 *page-size-exponent*)
                         :initial-element nil)))
-    (setf (aref (aref *ucd-base* code-high) code-low)
-          (encode-ucd-line (cdr split-line) code-point))))
+    (multiple-value-bind (encoding name)
+        (encode-ucd-line (cdr split-line) code-point)
+      (setf (aref (aref *ucd-base* code-high) code-low) encoding
+            (gethash code-point *unicode-names*) name))))
 
 (defun second-pass ()
   (loop for i from 0 below (length *ucd-base*)
         do (loop for j from 0 below (length (aref *ucd-base* i))
                  for result = (aref (aref *ucd-base* i) j)
                  when result
-                 when (let* ((transform-point (aref result 1))
+                 when (let* ((transform-point (ucd-transform result))
                              (transform-high (ash transform-point
                                                   (- *page-size-exponent*)))
                              (transform-low (ldb (byte *page-size-exponent* 0)
                                                  transform-point)))
                         (and (plusp transform-point)
-                             (/= (aref (aref (aref *ucd-base* transform-high)
-                                             transform-low)
-                                       1)
+                             (/= (ucd-transform
+                                  (aref (aref *ucd-base* transform-high)
+                                        transform-low))
                                  (+ (ash i *page-size-exponent*) j))))
                  do (destructuring-bind (gc-index bidi-index ccc-index
                                          decimal-digit digit bidi-mirrored
                                          cl-both-case-p)
-                        (aref *misc-table* (aref result 0))
+                        (aref *misc-table* (ucd-misc result))
                       (declare (ignore cl-both-case-p))
                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
-                      (setf (aref result 0)
+                      (setf (ucd-misc result)
                             (hash-misc gc-index bidi-index ccc-index
                                        decimal-digit digit bidi-mirrored
                                        nil))))))
               do (write-byte 0 stream)
               do (write-byte 0 stream))
         (loop for page across *ucd-base*
-              do (write-byte (if page (gethash page hash) 0) stream))
+           do (write-byte (if page (gethash page hash) 0) stream))
         (loop for page across array
-              do (loop for entry across page
-                       do (write-byte (if entry
-                                          (aref *misc-mapping* (aref entry 0))
-                                          255)
-                                      stream)
-                       do (write-3-byte (if entry (aref entry 1) 0)
-                                        stream))))))
+           do (loop for entry across page
+                 do (write-byte (if entry
+                                    (aref *misc-mapping* (ucd-misc entry))
+                                    255)
+                                stream)
+                 do (write-3-byte (if entry (ucd-transform entry) 0)
+                                  stream))))))
+  (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
+                                    :defaults *output-directory*)
+                     :direction :output
+                     :if-exists :supersede
+                     :if-does-not-exist :create)
+    (with-standard-io-syntax
+      (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
+      (maphash (lambda (code name)
+                 (when name
+                  (print code f)
+                  (prin1 name f)))
+               *unicode-names*))
+    (setf *unicode-names* nil))
   (with-open-file (*standard-output*
                    (make-pathname :name "numerics"
                                   :type "lisp-expr"