5 (defparameter *output-directory*
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *page-size-exponent* 8)
13 (ash cp (- *page-size-exponent*)))
16 (ldb (byte *page-size-exponent* 0) cp))
20 (defstruct ucd misc transform)
22 (defparameter *unicode-character-database*
23 (make-pathname :directory (pathname-directory *load-truename*)))
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *long-decompositions* nil)
39 (defparameter *decomposition-types* nil)
40 (defparameter *decomposition-base* nil)
42 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
43 bidi-mirrored cl-both-case-p decomposition-info)
44 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
45 bidi-mirrored cl-both-case-p decomposition-info))
46 (index (gethash list *misc-hash*)))
49 (vector-push list *misc-table*)
50 (setf (gethash list *misc-hash*)
51 (incf *misc-index*))))))
53 (defun compare-misc-entry (left right)
54 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
55 left-decimal-digit left-digit left-bidi-mirrored
56 left-cl-both-case-p left-decomposition-info)
58 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
59 right-decimal-digit right-digit right-bidi-mirrored
60 right-cl-both-case-p right-decomposition-info)
62 (or (and left-cl-both-case-p (not right-cl-both-case-p))
63 (and (or left-cl-both-case-p (not right-cl-both-case-p))
64 (or (< left-gc-index right-gc-index)
65 (and (= left-gc-index right-gc-index)
66 (or (< left-decomposition-info right-decomposition-info)
67 (and (= left-decomposition-info right-decomposition-info)
68 (or (< left-bidi-index right-bidi-index)
69 (and (= left-bidi-index right-bidi-index)
70 (or (< left-ccc-index right-ccc-index)
71 (and (= left-ccc-index right-ccc-index)
72 (or (string< left-decimal-digit
74 (and (string= left-decimal-digit
76 (or (string< left-digit right-digit)
77 (and (string= left-digit
79 (string< left-bidi-mirrored
80 right-bidi-mirrored))))))))))))))))))
82 (defun build-misc-table ()
83 (sort *misc-table* #'compare-misc-entry)
84 (setq *misc-mapping* (make-array (1+ *misc-index*)))
85 (loop for i from 0 to *misc-index*
86 do (setf (aref *misc-mapping*
87 (gethash (aref *misc-table* i) *misc-hash*))
91 (setq *last-uppercase* nil)
92 (setq *uppercase-transition-count* 0)
93 (setq *different-titlecases* nil)
94 (setq *different-numerics* nil)
96 (setq *misc-hash* (make-hash-table :test #'equal))
97 (setq *misc-index* -1)
98 (setq *misc-table* (make-array 2048 :fill-pointer 0))
99 (setq *both-cases* nil)
100 (setq *long-decompositions*
101 (make-array 2048 :fill-pointer 0 :adjustable t))
102 (setq *decomposition-types*
103 (let ((array (make-array 256 :initial-element nil :fill-pointer 1)))
104 (vector-push "" array)
105 (vector-push "<compat>" array)
107 (setq *decomposition-base* (make-array (ash #x110000
108 (- *page-size-exponent*))
109 :initial-element nil))
110 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
111 :initial-element nil))
112 (with-open-file (*standard-input*
113 (make-pathname :name "UnicodeData"
115 :defaults *unicode-character-database*)
117 (loop for line = (read-line nil nil)
119 do (slurp-ucd-line line)))
122 (fixup-hangul-syllables)
123 (length *long-decompositions*))
125 (defun fixup-hangul-syllables ()
126 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
127 (let* ((sbase #xac00)
135 (ncount (* vcount tcount))
136 (table (make-hash-table)))
137 (with-open-file (*standard-input*
138 (make-pathname :name "Jamo" :type "txt"
139 :defaults *unicode-character-database*))
140 (loop for line = (read-line nil nil)
142 if (position #\; line)
143 do (add-jamo-information line table)))
144 (dotimes (sindex scount)
145 (let* ((l (+ lbase (floor sindex ncount)))
146 (v (+ vbase (floor (mod sindex ncount) tcount)))
147 (tee (+ tbase (mod sindex tcount)))
148 (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
149 (gethash l table) (gethash v table)
150 (= tee tbase) (gethash tee table))))
151 (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
153 (defun add-jamo-information (line table)
154 (let* ((split (split-string line #\;))
155 (code (parse-integer (first split) :radix 16))
156 (syllable (string-trim '(#\Space)
157 (subseq (second split) 0 (position #\# (second split))))))
158 (setf (gethash code table) syllable)))
160 (defun split-string (line character)
161 (loop for prev-position = 0 then (1+ position)
162 for position = (position character line :start prev-position)
163 collect (subseq line prev-position position)
167 (defun init-indices (strings)
168 (let ((hash (make-hash-table :test #'equal)))
169 (loop for string in strings
171 do (setf (gethash string hash) index))
174 (defparameter *general-categories*
175 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
176 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
177 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
178 (defparameter *bidi-classes*
179 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
180 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
183 (defparameter *block-first* nil)
185 (defun normalize-character-name (name)
186 (when (find #\_ name)
187 (error "Bad name for a character: ~A" name))
188 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
189 (substitute #\_ #\Space name)))
191 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
192 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
193 ;;; D800 -- F8FF : surrogates and private use
194 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
195 ;;; F0000 -- FFFFD : private use
196 ;;; 100000 -- 10FFFD: private use
197 (defun encode-ucd-line (line code-point)
198 (destructuring-bind (name general-category canonical-combining-class
199 bidi-class decomposition-type-and-mapping
200 decimal-digit digit numeric bidi-mirrored
201 unicode-1-name iso-10646-comment simple-uppercase
202 simple-lowercase simple-titlecase)
204 (declare (ignore unicode-1-name iso-10646-comment))
205 (if (and (> (length name) 8)
206 (string= ", First>" name :start2 (- (length name) 8)))
208 (setq *block-first* code-point)
210 (let* ((gc-index (or (gethash general-category *general-categories*)
211 (error "unknown general category ~A"
213 (bidi-index (or (gethash bidi-class *bidi-classes*)
214 (error "unknown bidirectional class ~A"
216 (ccc-index (parse-integer canonical-combining-class))
217 (digit-index (unless (string= "" decimal-digit)
218 (parse-integer decimal-digit)))
219 (upper-index (unless (string= "" simple-uppercase)
220 (parse-integer simple-uppercase :radix 16)))
221 (lower-index (unless (string= "" simple-lowercase)
222 (parse-integer simple-lowercase :radix 16)))
223 (title-index (unless (string= "" simple-titlecase)
224 (parse-integer simple-titlecase :radix 16)))
226 (not (null (or (and (= gc-index 0) lower-index)
227 (and (= gc-index 1) upper-index)))))
228 (decomposition-info 0))
229 (declare (ignore digit-index))
230 (when (and (not cl-both-case-p)
232 (format t "~A~%" name))
233 (incf *name-size* (length name))
234 (when (string/= "" decomposition-type-and-mapping)
235 (let ((split (split-string decomposition-type-and-mapping #\Space)))
237 ((char= #\< (aref (first split) 0))
238 (unless (position (first split) *decomposition-types*
240 (vector-push (first split) *decomposition-types*))
241 (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal)))
242 (t (setf decomposition-info 1)))
243 (unless (aref *decomposition-base* (cp-high code-point))
244 (setf (aref *decomposition-base* (cp-high code-point))
245 (make-array (ash 1 *page-size-exponent*)
246 :initial-element nil)))
247 (setf (aref (aref *decomposition-base* (cp-high code-point))
250 (mapcar #'(lambda (string)
251 (parse-integer string :radix 16))
253 (if (= (length decomposition) 1)
254 (cons 1 (car decomposition))
255 (cons (length decomposition)
256 (prog1 (fill-pointer *long-decompositions*)
257 (dolist (code decomposition)
258 (vector-push-extend code *long-decompositions*)))))))))
259 (when (and (string/= "" simple-uppercase)
260 (string/= "" simple-lowercase))
261 (push (list code-point upper-index lower-index) *both-cases*))
262 (when (string/= simple-uppercase simple-titlecase)
263 (push (cons code-point title-index) *different-titlecases*))
264 (when (string/= digit numeric)
265 (push (cons code-point numeric) *different-numerics*))
268 (unless *last-uppercase*
269 (incf *uppercase-transition-count*))
270 (setq *last-uppercase* t))
272 (when *last-uppercase*
273 (incf *uppercase-transition-count*))
274 (setq *last-uppercase* nil)))
275 (when (> ccc-index 255)
276 (error "canonical combining class too large ~A" ccc-index))
277 (let* ((misc-index (hash-misc gc-index bidi-index ccc-index
278 decimal-digit digit bidi-mirrored
279 cl-both-case-p decomposition-info))
280 (result (make-ucd :misc misc-index
281 :transform (or upper-index lower-index 0))))
282 (when (and (> (length name) 7)
283 (string= ", Last>" name :start2 (- (length name) 7)))
284 (let ((page-start (ash (+ *block-first*
285 (ash 1 *page-size-exponent*)
287 (- *page-size-exponent*)))
288 (page-end (ash code-point (- *page-size-exponent*))))
289 (loop for point from *block-first*
290 below (ash page-start *page-size-exponent*)
291 do (setf (aref (aref *ucd-base* (cp-high point))
294 (loop for page from page-start below page-end
295 do (setf (aref *ucd-base* page)
296 (make-array (ash 1 *page-size-exponent*)
297 :initial-element result)))
298 (loop for point from (ash page-end *page-size-exponent*)
300 do (setf (aref (aref *ucd-base* (cp-high point))
303 (values result (normalize-character-name name)))))))
305 (defun slurp-ucd-line (line)
306 (let* ((split-line (split-string line #\;))
307 (code-point (parse-integer (first split-line) :radix 16))
308 (code-high (ash code-point (- *page-size-exponent*)))
309 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
310 (unless (aref *ucd-base* code-high)
311 (setf (aref *ucd-base* code-high)
312 (make-array (ash 1 *page-size-exponent*)
313 :initial-element nil)))
314 (multiple-value-bind (encoding name)
315 (encode-ucd-line (cdr split-line) code-point)
316 (setf (aref (aref *ucd-base* code-high) code-low) encoding
317 (gethash code-point *unicode-names*) name))))
319 ;;; this fixes up the case conversion discrepancy between CL and
320 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
321 ;;; inverses, which is not true in general in Unicode even for
322 ;;; characters which change case to single characters.
323 (defun second-pass ()
324 (loop for i from 0 below (length *ucd-base*)
325 when (aref *ucd-base* i)
326 do (loop for j from 0 below (length (aref *ucd-base* i))
327 for result = (aref (aref *ucd-base* i) j)
329 when (let* ((transform-point (ucd-transform result))
330 (transform-high (ash transform-point
331 (- *page-size-exponent*)))
332 (transform-low (ldb (byte *page-size-exponent* 0)
334 (and (plusp transform-point)
336 (aref (aref *ucd-base* transform-high)
338 (+ (ash i *page-size-exponent*) j))))
339 do (destructuring-bind (gc-index bidi-index ccc-index
340 decimal-digit digit bidi-mirrored
341 cl-both-case-p decomposition-info)
342 (aref *misc-table* (ucd-misc result))
343 (declare (ignore cl-both-case-p))
344 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
345 (setf (ucd-misc result)
346 (hash-misc gc-index bidi-index ccc-index
347 decimal-digit digit bidi-mirrored
348 nil decomposition-info))))))
350 (defun write-4-byte (quadruplet stream)
351 (write-byte (ldb (byte 8 24) quadruplet) stream)
352 (write-byte (ldb (byte 8 16) quadruplet) stream)
353 (write-byte (ldb (byte 8 8) quadruplet) stream)
354 (write-byte (ldb (byte 8 0) quadruplet) stream))
356 (defun digit-to-byte (digit)
357 (if (string= "" digit)
359 (parse-integer digit)))
362 (let ((hash (make-hash-table :test #'equalp))
364 (loop for page across *ucd-base*
366 (unless (gethash page hash)
367 (setf (gethash page hash)
369 (let ((array (make-array (1+ index))))
370 (maphash #'(lambda (key value)
371 (setf (aref array value) key))
374 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
375 (with-open-file (stream (make-pathname :name "ucd"
377 :defaults *output-directory*)
379 :element-type '(unsigned-byte 8)
380 :if-exists :supersede
381 :if-does-not-exist :create)
382 (loop for (gc-index bidi-index ccc-index decimal-digit digit
383 bidi-mirrored nil decomposition-info)
385 ;; three bits spare here
386 do (write-byte gc-index stream)
387 ;; three bits spare here
388 do (write-byte bidi-index stream)
389 do (write-byte ccc-index stream)
390 ;; we could save some space here: decimal-digit and
391 ;; digit are constrained (CHECKME) to be between 0 and
392 ;; 9, so we could encode the pair in a single byte.
393 ;; (Also, decimal-digit is equal to digit or undefined,
394 ;; so we could encode decimal-digit as a single bit,
395 ;; meaning that we could save 11 bits here.
396 do (write-byte (digit-to-byte decimal-digit) stream)
397 do (write-byte (digit-to-byte digit) stream)
398 ;; there's an easy 7 bits to spare here
399 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
400 ;; at the moment we store information about which type
401 ;; of compatibility decomposition is used, costing c.3
402 ;; bits. We could elide that.
403 do (write-byte decomposition-info stream)
404 do (write-byte 0 stream))
405 (loop for page across *ucd-base*
406 do (write-byte (if page (gethash page hash) 0) stream))
407 (loop for page across array
408 do (loop for entry across page
410 (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
412 (if entry (ucd-transform entry) 0))
414 ;; KLUDGE: this code, to write out decomposition information, is a
415 ;; little bit very similar to the ucd entries above. Try factoring
416 ;; out the common stuff?
417 (let ((hash (make-hash-table :test #'equalp))
419 (loop for page across *decomposition-base*
421 (unless (gethash page hash)
422 (setf (gethash page hash)
423 (prog1 index (incf index))))))
424 (let ((array (make-array index)))
425 (maphash #'(lambda (key value)
426 (setf (aref array value) key))
428 (with-open-file (stream (make-pathname :name "decomp" :type "dat"
429 :defaults *output-directory*)
431 :element-type '(unsigned-byte 8)
432 :if-exists :supersede
433 :if-does-not-exist :create)
434 (loop for page across *decomposition-base*
435 do (write-byte (if page (gethash page hash) 0) stream))
436 (loop for page across array
437 do (loop for entry across page
439 (dpb (if entry (car entry) 0)
441 (if entry (cdr entry) 0))
443 (with-open-file (stream (make-pathname :name "ldecomp" :type "dat"
444 :defaults *output-directory*)
446 :element-type '(unsigned-byte 8)
447 :if-exists :supersede
448 :if-does-not-exist :create)
449 (loop for code across (copy-seq *long-decompositions*)
450 do (write-4-byte code stream)))))
451 (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
452 :defaults *output-directory*)
454 :if-exists :supersede
455 :if-does-not-exist :create)
456 (with-standard-io-syntax
457 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
458 (maphash (lambda (code name)
463 (setf *unicode-names* nil))
464 (with-open-file (*standard-output*
465 (make-pathname :name "numerics"
467 :defaults *output-directory*)
469 :if-exists :supersede
470 :if-does-not-exist :create)
471 (with-standard-io-syntax
472 (let ((*print-pretty* t))
473 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
474 *different-numerics*)))))
475 (with-open-file (*standard-output*
476 (make-pathname :name "titlecases"
478 :defaults *output-directory*)
480 :if-exists :supersede
481 :if-does-not-exist :create)
482 (with-standard-io-syntax
483 (let ((*print-pretty* t))
484 (prin1 *different-titlecases*))))
485 (with-open-file (*standard-output*
486 (make-pathname :name "misc"
488 :defaults *output-directory*)
490 :if-exists :supersede
491 :if-does-not-exist :create)
492 (with-standard-io-syntax
493 (let ((*print-pretty* t))
494 (prin1 `(:length ,(length *misc-table*)
495 :uppercase ,(loop for (gc-index) across *misc-table*
499 :lowercase ,(loop for (gc-index) across *misc-table*
503 :titlecase ,(loop for (gc-index) across *misc-table*
509 ;;; Use of the generated files
511 (defparameter *compiled-ucd* nil)
513 (defun read-compiled-ucd ()
514 (with-open-file (stream (make-pathname :name "ucd"
516 :defaults *output-directory*)
518 :element-type '(unsigned-byte 8))
519 (let ((length (file-length stream)))
521 (make-array length :element-type '(unsigned-byte 8)))
522 (read-sequence *compiled-ucd* stream)))
525 ;;; The stuff below is dependent on misc.lisp-expr being
527 ;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17))
529 ;;; There are two groups of entries for UPPERCASE and LOWERCASE
530 ;;; because some characters have case (by Unicode standards) but are
531 ;;; not transformable character-by-character in a locale-independent
532 ;;; way (as CL requires for its standard operators).
534 ;;; for more details on these debugging functions, see the description
535 ;;; of the character database format in src/code/target-char.lisp
537 (defparameter *length* 395)
540 (let* ((cp-high (cp-high cp))
541 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
543 (ash #x110000 (- *page-size-exponent*))
544 (* (ash 4 *page-size-exponent*) page)
547 (defun cp-value-0 (cp)
548 (let ((index (cp-index cp)))
549 (dpb (aref *compiled-ucd* index)
551 (ldb (byte 3 5) (aref *compiled-ucd* (1+ index))))))
553 (defun cp-value-1 (cp)
554 (let ((index (cp-index cp)))
555 (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16)
556 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
557 (aref *compiled-ucd* (+ index 3))))))
559 (defun cp-general-category (cp)
560 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
562 (defun cp-decimal-digit (cp)
563 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
564 (and (< decimal-digit 10)
567 (defun cp-alpha-char-p (cp)
568 (< (cp-general-category cp) 5))
570 (defun cp-alphanumericp (cp)
571 (let ((gc (cp-general-category cp)))
575 (defun cp-digit-char-p (cp &optional (radix 10))
576 (let ((number (or (cp-decimal-digit cp)
581 (when (and number (< number radix))
584 (defun cp-graphic-char-p (cp)
588 (defun cp-char-upcase (cp)
589 (if (< 3 (cp-value-0 cp) 8)
593 (defun cp-char-downcase (cp)
594 (if (< (cp-value-0 cp) 4)
598 (defun cp-upper-case-p (cp)
599 (< (cp-value-0 cp) 4))
601 (defun cp-lower-case-p (cp)
602 (< 3 (cp-value-0 cp) 8))
604 (defun cp-both-case-p (cp)
605 (< (cp-value-0 cp) 8))