;;; Generator
+(defstruct ucd misc transform)
+
(defparameter *unicode-character-database*
(make-pathname :directory (pathname-directory *load-truename*)))
(defparameter *ucd-base* nil)
+(defparameter *unicode-names* (make-hash-table))
(defparameter *last-uppercase* nil)
(defparameter *uppercase-transition-count* 0)
do (slurp-ucd-line line)))
(second-pass)
(build-misc-table)
+ (fixup-hangul-syllables)
*decompositions*)
+(defun fixup-hangul-syllables ()
+ ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
+ (let* ((sbase #xac00)
+ (lbase #x1100)
+ (vbase #x1161)
+ (tbase #x11a7)
+ (scount 11172)
+ (lcount 19)
+ (vcount 21)
+ (tcount 28)
+ (ncount (* vcount tcount))
+ (table (make-hash-table)))
+ (with-open-file (*standard-input*
+ (make-pathname :name "Jamo" :type "txt"
+ :defaults *unicode-character-database*))
+ (loop for line = (read-line nil nil)
+ while line
+ if (position #\; line)
+ do (add-jamo-information line table)))
+ (dotimes (sindex scount)
+ (let* ((l (+ lbase (floor sindex ncount)))
+ (v (+ vbase (floor (mod sindex ncount) tcount)))
+ (tee (+ tbase (mod sindex tcount)))
+ (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
+ (gethash l table) (gethash v table)
+ (= tee tbase) (gethash tee table))))
+ (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
+
+(defun add-jamo-information (line table)
+ (let* ((split (split-string line #\;))
+ (code (parse-integer (first split) :radix 16))
+ (syllable (string-trim '(#\Space)
+ (subseq (second split) 0 (position #\# (second split))))))
+ (setf (gethash code table) syllable)))
+
(defun split-string (line character)
(loop for prev-position = 0 then (1+ position)
for position = (position character line :start prev-position)
(defparameter *block-first* nil)
+(defun normalize-character-name (name)
+ (when (find #\_ name)
+ (error "Bad name for a character: ~A" name))
+ (unless (or (zerop (length name)) (find #\< name) (find #\> name))
+ (substitute #\_ #\Space name)))
+
;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
;;; D800 -- F8FF : surrogates and private use
decimal-digit digit bidi-mirrored
cl-both-case-p)))
(declare (ignore digit-index))
+ (when (and (not cl-both-case-p)
+ (< gc-index 2))
+ (format t "~A~%" name))
(incf *name-size* (length name))
(when (string/= "" decomposition-type-and-mapping)
(let ((split (split-string decomposition-type-and-mapping
(setq *last-uppercase* nil)))
(when (> ccc-index 255)
(error "canonical combining class too large ~A" ccc-index))
- (let ((result (vector misc-index (or upper-index lower-index 0))))
+ (let ((result (make-ucd :misc misc-index
+ :transform (or upper-index lower-index 0))))
(when (and (> (length name) 7)
(string= ", Last>" name :start2 (- (length name) 7)))
(let ((page-start (ash (+ *block-first*
do (setf (aref (aref *ucd-base* (cp-high point))
(cp-low point))
result))))
- result)))))
+ (values result (normalize-character-name name)))))))
(defun slurp-ucd-line (line)
(let* ((split-line (split-string line #\;))
(setf (aref *ucd-base* code-high)
(make-array (ash 1 *page-size-exponent*)
:initial-element nil)))
- (setf (aref (aref *ucd-base* code-high) code-low)
- (encode-ucd-line (cdr split-line) code-point))))
+ (multiple-value-bind (encoding name)
+ (encode-ucd-line (cdr split-line) code-point)
+ (setf (aref (aref *ucd-base* code-high) code-low) encoding
+ (gethash code-point *unicode-names*) name))))
(defun second-pass ()
(loop for i from 0 below (length *ucd-base*)
do (loop for j from 0 below (length (aref *ucd-base* i))
for result = (aref (aref *ucd-base* i) j)
when result
- when (let* ((transform-point (aref result 1))
+ when (let* ((transform-point (ucd-transform result))
(transform-high (ash transform-point
(- *page-size-exponent*)))
(transform-low (ldb (byte *page-size-exponent* 0)
transform-point)))
(and (plusp transform-point)
- (/= (aref (aref (aref *ucd-base* transform-high)
- transform-low)
- 1)
+ (/= (ucd-transform
+ (aref (aref *ucd-base* transform-high)
+ transform-low))
(+ (ash i *page-size-exponent*) j))))
do (destructuring-bind (gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
cl-both-case-p)
- (aref *misc-table* (aref result 0))
+ (aref *misc-table* (ucd-misc result))
(declare (ignore cl-both-case-p))
(format t "~A~%" (+ (ash i *page-size-exponent*) j))
- (setf (aref result 0)
+ (setf (ucd-misc result)
(hash-misc gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
nil))))))
do (write-byte 0 stream)
do (write-byte 0 stream))
(loop for page across *ucd-base*
- do (write-byte (if page (gethash page hash) 0) stream))
+ do (write-byte (if page (gethash page hash) 0) stream))
(loop for page across array
- do (loop for entry across page
- do (write-byte (if entry
- (aref *misc-mapping* (aref entry 0))
- 255)
- stream)
- do (write-3-byte (if entry (aref entry 1) 0)
- stream))))))
+ do (loop for entry across page
+ do (write-byte (if entry
+ (aref *misc-mapping* (ucd-misc entry))
+ 255)
+ stream)
+ do (write-3-byte (if entry (ucd-transform entry) 0)
+ stream))))))
+ (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
+ :defaults *output-directory*)
+ :direction :output
+ :if-exists :supersede
+ :if-does-not-exist :create)
+ (with-standard-io-syntax
+ (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
+ (maphash (lambda (code name)
+ (when name
+ (print code f)
+ (prin1 name f)))
+ *unicode-names*))
+ (setf *unicode-names* nil))
(with-open-file (*standard-output*
(make-pathname :name "numerics"
:type "lisp-expr"
:direction :output
:if-exists :supersede
:if-does-not-exist :create)
- (let ((*print-pretty* t))
- (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
- *different-numerics*))))
+ (with-standard-io-syntax
+ (let ((*print-pretty* t))
+ (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
+ *different-numerics*)))))
(with-open-file (*standard-output*
(make-pathname :name "titlecases"
:type "lisp-expr"
:direction :output
:if-exists :supersede
:if-does-not-exist :create)
- (let ((*print-pretty* t))
- (prin1 *different-titlecases*)))
+ (with-standard-io-syntax
+ (let ((*print-pretty* t))
+ (prin1 *different-titlecases*))))
(with-open-file (*standard-output*
(make-pathname :name "misc"
:type "lisp-expr"
:direction :output
:if-exists :supersede
:if-does-not-exist :create)
- (let ((*print-pretty* t))
- (prin1 `(:length ,(length *misc-table*)
- :uppercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 0)
- collect i)
- :lowercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 1)
- collect i)
- :titlecase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 2)
- collect i)))))
+ (with-standard-io-syntax
+ (let ((*print-pretty* t))
+ (prin1 `(:length ,(length *misc-table*)
+ :uppercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 0)
+ collect i)
+ :lowercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 1)
+ collect i)
+ :titlecase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 2)
+ collect i))))))
(values))
;;; Use of the generated files
(values))
;;; The stuff below is dependent on misc.lisp-expr being
-;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
-
-(defparameter *length* 186)
+;;; (:LENGTH 215 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
+;;;
+;;; There are two entries for UPPERCASE and LOWERCASE because some
+;;; characters have case (by Unicode standards) but are not
+;;; transformable character-by-character in a locale-independent way
+;;; (as CL requires for its standard operators).
+;;;
+;;; for more details on these debugging functions, see the description
+;;; of the character database format in src/code/target-char.lisp
+
+(defparameter *length* 215)
(defun cp-index (cp)
(let* ((cp-high (cp-high cp))