5 (defparameter *output-directory*
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *page-size-exponent* 8)
13 (ash cp (- *page-size-exponent*)))
16 (ldb (byte *page-size-exponent* 0) cp))
20 (defstruct ucd misc transform)
22 (defparameter *unicode-character-database*
23 (make-pathname :directory (pathname-directory *load-truename*)))
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44 bidi-mirrored cl-both-case-p decomposition-info)
45 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46 bidi-mirrored cl-both-case-p decomposition-info))
47 (index (gethash list *misc-hash*)))
50 (vector-push list *misc-table*)
51 (setf (gethash list *misc-hash*)
52 (incf *misc-index*))))))
54 (defun compare-misc-entry (left right)
55 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56 left-decimal-digit left-digit left-bidi-mirrored
57 left-cl-both-case-p left-decomposition-info)
59 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60 right-decimal-digit right-digit right-bidi-mirrored
61 right-cl-both-case-p right-decomposition-info)
63 (or (and left-cl-both-case-p (not right-cl-both-case-p))
64 (and (or left-cl-both-case-p (not right-cl-both-case-p))
65 (or (< left-gc-index right-gc-index)
66 (and (= left-gc-index right-gc-index)
67 (or (< left-decomposition-info right-decomposition-info)
68 (and (= left-decomposition-info right-decomposition-info)
69 (or (< left-bidi-index right-bidi-index)
70 (and (= left-bidi-index right-bidi-index)
71 (or (< left-ccc-index right-ccc-index)
72 (and (= left-ccc-index right-ccc-index)
73 (or (string< left-decimal-digit
75 (and (string= left-decimal-digit
77 (or (string< left-digit right-digit)
78 (and (string= left-digit
80 (string< left-bidi-mirrored
81 right-bidi-mirrored))))))))))))))))))
83 (defun build-misc-table ()
84 (sort *misc-table* #'compare-misc-entry)
85 (setq *misc-mapping* (make-array (1+ *misc-index*)))
86 (loop for i from 0 to *misc-index*
87 do (setf (aref *misc-mapping*
88 (gethash (aref *misc-table* i) *misc-hash*))
92 (setq *last-uppercase* nil)
93 (setq *uppercase-transition-count* 0)
94 (setq *different-titlecases* nil)
95 (setq *different-numerics* nil)
97 (setq *misc-hash* (make-hash-table :test #'equal))
98 (setq *misc-index* -1)
99 (setq *misc-table* (make-array 2048 :fill-pointer 0))
100 (setq *both-cases* nil)
101 (setq *decompositions* 0)
102 (setq *decomposition-types*
103 (let ((array (make-array 256 :initial-element nil :fill-pointer 1)))
104 (vector-push "" array)
105 (vector-push "<compat>" array)
107 (setq *decomposition-length-max* 0)
108 (setq *decomposition-base* (make-array (ash #x110000
109 (- *page-size-exponent*))
110 :initial-element nil))
111 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
112 :initial-element nil))
113 (with-open-file (*standard-input*
114 (make-pathname :name "UnicodeData"
116 :defaults *unicode-character-database*)
118 (loop for line = (read-line nil nil)
120 do (slurp-ucd-line line)))
123 (fixup-hangul-syllables)
126 (defun fixup-hangul-syllables ()
127 ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
128 (let* ((sbase #xac00)
136 (ncount (* vcount tcount))
137 (table (make-hash-table)))
138 (with-open-file (*standard-input*
139 (make-pathname :name "Jamo" :type "txt"
140 :defaults *unicode-character-database*))
141 (loop for line = (read-line nil nil)
143 if (position #\; line)
144 do (add-jamo-information line table)))
145 (dotimes (sindex scount)
146 (let* ((l (+ lbase (floor sindex ncount)))
147 (v (+ vbase (floor (mod sindex ncount) tcount)))
148 (tee (+ tbase (mod sindex tcount)))
149 (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
150 (gethash l table) (gethash v table)
151 (= tee tbase) (gethash tee table))))
152 (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
154 (defun add-jamo-information (line table)
155 (let* ((split (split-string line #\;))
156 (code (parse-integer (first split) :radix 16))
157 (syllable (string-trim '(#\Space)
158 (subseq (second split) 0 (position #\# (second split))))))
159 (setf (gethash code table) syllable)))
161 (defun split-string (line character)
162 (loop for prev-position = 0 then (1+ position)
163 for position = (position character line :start prev-position)
164 collect (subseq line prev-position position)
168 (defun init-indices (strings)
169 (let ((hash (make-hash-table :test #'equal)))
170 (loop for string in strings
172 do (setf (gethash string hash) index))
175 (defparameter *general-categories*
176 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
177 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
178 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
179 (defparameter *bidi-classes*
180 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
181 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
184 (defparameter *block-first* nil)
186 (defun normalize-character-name (name)
187 (when (find #\_ name)
188 (error "Bad name for a character: ~A" name))
189 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
190 (substitute #\_ #\Space name)))
192 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
193 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
194 ;;; D800 -- F8FF : surrogates and private use
195 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
196 ;;; F0000 -- FFFFD : private use
197 ;;; 100000 -- 10FFFD: private use
198 (defun encode-ucd-line (line code-point)
199 (destructuring-bind (name general-category canonical-combining-class
200 bidi-class decomposition-type-and-mapping
201 decimal-digit digit numeric bidi-mirrored
202 unicode-1-name iso-10646-comment simple-uppercase
203 simple-lowercase simple-titlecase)
205 (declare (ignore unicode-1-name iso-10646-comment))
206 (if (and (> (length name) 8)
207 (string= ", First>" name :start2 (- (length name) 8)))
209 (setq *block-first* code-point)
211 (let* ((gc-index (or (gethash general-category *general-categories*)
212 (error "unknown general category ~A"
214 (bidi-index (or (gethash bidi-class *bidi-classes*)
215 (error "unknown bidirectional class ~A"
217 (ccc-index (parse-integer canonical-combining-class))
218 (digit-index (unless (string= "" decimal-digit)
219 (parse-integer decimal-digit)))
220 (upper-index (unless (string= "" simple-uppercase)
221 (parse-integer simple-uppercase :radix 16)))
222 (lower-index (unless (string= "" simple-lowercase)
223 (parse-integer simple-lowercase :radix 16)))
224 (title-index (unless (string= "" simple-titlecase)
225 (parse-integer simple-titlecase :radix 16)))
227 (not (null (or (and (= gc-index 0) lower-index)
228 (and (= gc-index 1) upper-index)))))
229 (decomposition-info 0))
230 (declare (ignore digit-index))
231 (when (and (not cl-both-case-p)
233 (format t "~A~%" name))
234 (incf *name-size* (length name))
235 (when (string/= "" decomposition-type-and-mapping)
236 (let ((split (split-string decomposition-type-and-mapping #\Space)))
238 ((char= #\< (aref (first split) 0))
239 (unless (position (first split) *decomposition-types*
241 (vector-push (first split) *decomposition-types*))
242 (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal)))
243 (t (setf decomposition-info 1)))
244 (unless (aref *decomposition-base* (cp-high code-point))
245 (setf (aref *decomposition-base* (cp-high code-point))
246 (make-array (ash 1 *page-size-exponent*)
247 :initial-element nil)))
248 (setf (aref (aref *decomposition-base* (cp-high code-point))
250 (mapcar #'(lambda (string)
251 (parse-integer string :radix 16))
253 (setq *decomposition-length-max*
254 (max *decomposition-length-max* (length split)))
255 (incf *decompositions* (length split))))
256 (when (and (string/= "" simple-uppercase)
257 (string/= "" simple-lowercase))
258 (push (list code-point upper-index lower-index) *both-cases*))
259 (when (string/= simple-uppercase simple-titlecase)
260 (push (cons code-point title-index) *different-titlecases*))
261 (when (string/= digit numeric)
262 (push (cons code-point numeric) *different-numerics*))
265 (unless *last-uppercase*
266 (incf *uppercase-transition-count*))
267 (setq *last-uppercase* t))
269 (when *last-uppercase*
270 (incf *uppercase-transition-count*))
271 (setq *last-uppercase* nil)))
272 (when (> ccc-index 255)
273 (error "canonical combining class too large ~A" ccc-index))
274 (let* ((misc-index (hash-misc gc-index bidi-index ccc-index
275 decimal-digit digit bidi-mirrored
276 cl-both-case-p decomposition-info))
277 (result (make-ucd :misc misc-index
278 :transform (or upper-index lower-index 0))))
279 (when (and (> (length name) 7)
280 (string= ", Last>" name :start2 (- (length name) 7)))
281 (let ((page-start (ash (+ *block-first*
282 (ash 1 *page-size-exponent*)
284 (- *page-size-exponent*)))
285 (page-end (ash code-point (- *page-size-exponent*))))
286 (loop for point from *block-first*
287 below (ash page-start *page-size-exponent*)
288 do (setf (aref (aref *ucd-base* (cp-high point))
291 (loop for page from page-start below page-end
292 do (setf (aref *ucd-base* page)
293 (make-array (ash 1 *page-size-exponent*)
294 :initial-element result)))
295 (loop for point from (ash page-end *page-size-exponent*)
297 do (setf (aref (aref *ucd-base* (cp-high point))
300 (values result (normalize-character-name name)))))))
302 (defun slurp-ucd-line (line)
303 (let* ((split-line (split-string line #\;))
304 (code-point (parse-integer (first split-line) :radix 16))
305 (code-high (ash code-point (- *page-size-exponent*)))
306 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
307 (unless (aref *ucd-base* code-high)
308 (setf (aref *ucd-base* code-high)
309 (make-array (ash 1 *page-size-exponent*)
310 :initial-element nil)))
311 (multiple-value-bind (encoding name)
312 (encode-ucd-line (cdr split-line) code-point)
313 (setf (aref (aref *ucd-base* code-high) code-low) encoding
314 (gethash code-point *unicode-names*) name))))
316 ;;; this fixes up the case conversion discrepancy between CL and
317 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
318 ;;; inverses, which is not true in general in Unicode even for
319 ;;; characters which change case to single characters.
320 (defun second-pass ()
321 (loop for i from 0 below (length *ucd-base*)
322 when (aref *ucd-base* i)
323 do (loop for j from 0 below (length (aref *ucd-base* i))
324 for result = (aref (aref *ucd-base* i) j)
326 when (let* ((transform-point (ucd-transform result))
327 (transform-high (ash transform-point
328 (- *page-size-exponent*)))
329 (transform-low (ldb (byte *page-size-exponent* 0)
331 (and (plusp transform-point)
333 (aref (aref *ucd-base* transform-high)
335 (+ (ash i *page-size-exponent*) j))))
336 do (destructuring-bind (gc-index bidi-index ccc-index
337 decimal-digit digit bidi-mirrored
338 cl-both-case-p decomposition-info)
339 (aref *misc-table* (ucd-misc result))
340 (declare (ignore cl-both-case-p))
341 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
342 (setf (ucd-misc result)
343 (hash-misc gc-index bidi-index ccc-index
344 decimal-digit digit bidi-mirrored
345 nil decomposition-info))))))
347 (defun write-4-byte (quadruplet stream)
348 (write-byte (ldb (byte 8 24) quadruplet) stream)
349 (write-byte (ldb (byte 8 16) quadruplet) stream)
350 (write-byte (ldb (byte 8 8) quadruplet) stream)
351 (write-byte (ldb (byte 8 0) quadruplet) stream))
353 (defun digit-to-byte (digit)
354 (if (string= "" digit)
356 (parse-integer digit)))
359 (let ((hash (make-hash-table :test #'equalp))
361 (loop for page across *ucd-base*
363 (unless (gethash page hash)
364 (setf (gethash page hash)
366 (let ((array (make-array (1+ index))))
367 (maphash #'(lambda (key value)
368 (setf (aref array value) key))
371 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
372 (with-open-file (stream (make-pathname :name "ucd"
374 :defaults *output-directory*)
376 :element-type '(unsigned-byte 8)
377 :if-exists :supersede
378 :if-does-not-exist :create)
379 (loop for (gc-index bidi-index ccc-index decimal-digit digit
380 bidi-mirrored nil decomposition-info)
382 do (write-byte gc-index stream)
383 do (write-byte bidi-index stream)
384 do (write-byte ccc-index stream)
385 do (write-byte (digit-to-byte decimal-digit) stream)
386 do (write-byte (digit-to-byte digit) stream)
387 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
388 do (write-byte decomposition-info stream)
389 do (write-byte 0 stream))
390 (loop for page across *ucd-base*
391 do (write-byte (if page (gethash page hash) 0) stream))
392 (loop for page across array
393 do (loop for entry across page
395 (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
397 (if entry (ucd-transform entry) 0))
399 (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
400 :defaults *output-directory*)
402 :if-exists :supersede
403 :if-does-not-exist :create)
404 (with-standard-io-syntax
405 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
406 (maphash (lambda (code name)
411 (setf *unicode-names* nil))
412 (with-open-file (*standard-output*
413 (make-pathname :name "numerics"
415 :defaults *output-directory*)
417 :if-exists :supersede
418 :if-does-not-exist :create)
419 (with-standard-io-syntax
420 (let ((*print-pretty* t))
421 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
422 *different-numerics*)))))
423 (with-open-file (*standard-output*
424 (make-pathname :name "titlecases"
426 :defaults *output-directory*)
428 :if-exists :supersede
429 :if-does-not-exist :create)
430 (with-standard-io-syntax
431 (let ((*print-pretty* t))
432 (prin1 *different-titlecases*))))
433 (with-open-file (*standard-output*
434 (make-pathname :name "misc"
436 :defaults *output-directory*)
438 :if-exists :supersede
439 :if-does-not-exist :create)
440 (with-standard-io-syntax
441 (let ((*print-pretty* t))
442 (prin1 `(:length ,(length *misc-table*)
443 :uppercase ,(loop for (gc-index) across *misc-table*
447 :lowercase ,(loop for (gc-index) across *misc-table*
451 :titlecase ,(loop for (gc-index) across *misc-table*
457 ;;; Use of the generated files
459 (defparameter *compiled-ucd* nil)
461 (defun read-compiled-ucd ()
462 (with-open-file (stream (make-pathname :name "ucd"
464 :defaults *output-directory*)
466 :element-type '(unsigned-byte 8))
467 (let ((length (file-length stream)))
469 (make-array length :element-type '(unsigned-byte 8)))
470 (read-sequence *compiled-ucd* stream)))
473 ;;; The stuff below is dependent on misc.lisp-expr being
475 ;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17))
477 ;;; There are two groups of entries for UPPERCASE and LOWERCASE
478 ;;; because some characters have case (by Unicode standards) but are
479 ;;; not transformable character-by-character in a locale-independent
480 ;;; way (as CL requires for its standard operators).
482 ;;; for more details on these debugging functions, see the description
483 ;;; of the character database format in src/code/target-char.lisp
485 (defparameter *length* 395)
488 (let* ((cp-high (cp-high cp))
489 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
491 (ash #x110000 (- *page-size-exponent*))
492 (* (ash 4 *page-size-exponent*) page)
495 (defun cp-value-0 (cp)
496 (let ((index (cp-index cp)))
497 (dpb (aref *compiled-ucd* index)
499 (ldb (byte 3 5) (aref *compiled-ucd* (1+ index))))))
501 (defun cp-value-1 (cp)
502 (let ((index (cp-index cp)))
503 (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16)
504 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
505 (aref *compiled-ucd* (+ index 3))))))
507 (defun cp-general-category (cp)
508 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
510 (defun cp-decimal-digit (cp)
511 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
512 (and (< decimal-digit 10)
515 (defun cp-alpha-char-p (cp)
516 (< (cp-general-category cp) 5))
518 (defun cp-alphanumericp (cp)
519 (let ((gc (cp-general-category cp)))
523 (defun cp-digit-char-p (cp &optional (radix 10))
524 (let ((number (or (cp-decimal-digit cp)
529 (when (and number (< number radix))
532 (defun cp-graphic-char-p (cp)
536 (defun cp-char-upcase (cp)
537 (if (< 3 (cp-value-0 cp) 8)
541 (defun cp-char-downcase (cp)
542 (if (< (cp-value-0 cp) 4)
546 (defun cp-upper-case-p (cp)
547 (< (cp-value-0 cp) 4))
549 (defun cp-lower-case-p (cp)
550 (< 3 (cp-value-0 cp) 8))
552 (defun cp-both-case-p (cp)
553 (< (cp-value-0 cp) 8))