5 (defparameter *output-directory*
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-truename*))))
10 (defparameter *page-size-exponent* 8)
13 (ash cp (- *page-size-exponent*)))
16 (ldb (byte *page-size-exponent* 0) cp))
20 (defstruct ucd misc transform)
22 (defparameter *unicode-character-database*
23 (make-pathname :directory (pathname-directory *load-truename*)))
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44 bidi-mirrored cl-both-case-p)
45 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46 bidi-mirrored cl-both-case-p))
47 (index (gethash list *misc-hash*)))
50 (vector-push list *misc-table*)
51 (setf (gethash list *misc-hash*)
52 (incf *misc-index*))))))
54 (defun compare-misc-entry (left right)
55 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56 left-decimal-digit left-digit left-bidi-mirrored
59 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60 right-decimal-digit right-digit right-bidi-mirrored
63 (or (and left-cl-both-case-p (not right-cl-both-case-p))
64 (and (or left-cl-both-case-p (not right-cl-both-case-p))
65 (or (< left-gc-index right-gc-index)
66 (and (= left-gc-index right-gc-index)
67 (or (< left-bidi-index right-bidi-index)
68 (and (= left-bidi-index right-bidi-index)
69 (or (< left-ccc-index right-ccc-index)
70 (and (= left-ccc-index right-ccc-index)
71 (or (string< left-decimal-digit
73 (and (string= left-decimal-digit
75 (or (string< left-digit right-digit)
76 (and (string= left-digit
78 (string< left-bidi-mirrored
79 right-bidi-mirrored))))))))))))))))
81 (defun build-misc-table ()
82 (sort *misc-table* #'compare-misc-entry)
83 (setq *misc-mapping* (make-array (1+ *misc-index*)))
84 (loop for i from 0 to *misc-index*
85 do (setf (aref *misc-mapping*
86 (gethash (aref *misc-table* i) *misc-hash*))
90 (setq *last-uppercase* nil)
91 (setq *uppercase-transition-count* 0)
92 (setq *different-titlecases* nil)
93 (setq *different-numerics* nil)
95 (setq *misc-hash* (make-hash-table :test #'equal))
96 (setq *misc-index* -1)
97 (setq *misc-table* (make-array 256 :fill-pointer 0))
98 (setq *both-cases* nil)
99 (setq *decompositions* 0)
100 (setq *decomposition-types* (make-hash-table :test #'equal))
101 (setq *decomposition-length-max* 0)
102 (setq *decomposition-base* (make-array (ash #x110000
103 (- *page-size-exponent*))
104 :initial-element nil))
105 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
106 :initial-element nil))
107 (with-open-file (*standard-input*
108 (make-pathname :name "UnicodeData"
110 :defaults *unicode-character-database*)
112 (loop for line = (read-line nil nil)
114 do (slurp-ucd-line line)))
119 (defun split-string (line character)
120 (loop for prev-position = 0 then (1+ position)
121 for position = (position character line :start prev-position)
122 collect (subseq line prev-position position)
126 (defun init-indices (strings)
127 (let ((hash (make-hash-table :test #'equal)))
128 (loop for string in strings
130 do (setf (gethash string hash) index))
133 (defparameter *general-categories*
134 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
135 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
136 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
137 (defparameter *bidi-classes*
138 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
139 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
142 (defparameter *block-first* nil)
144 (defun normalize-character-name (name)
145 (when (find #\_ name)
146 (error "Bad name for a character: ~A" name))
147 (unless (or (zerop (length name)) (find #\< name) (find #\> name))
148 (substitute #\_ #\Space name)))
150 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
151 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
152 ;;; D800 -- F8FF : surrogates and private use
153 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
154 ;;; F0000 -- FFFFD : private use
155 ;;; 100000 -- 10FFFD: private use
156 (defun encode-ucd-line (line code-point)
157 (destructuring-bind (name general-category canonical-combining-class
158 bidi-class decomposition-type-and-mapping
159 decimal-digit digit numeric bidi-mirrored
160 unicode-1-name iso-10646-comment simple-uppercase
161 simple-lowercase simple-titlecase)
163 (declare (ignore unicode-1-name iso-10646-comment))
164 (if (and (> (length name) 8)
165 (string= ", First>" name :start2 (- (length name) 8)))
167 (setq *block-first* code-point)
169 (let* ((gc-index (or (gethash general-category *general-categories*)
170 (error "unknown general category ~A"
172 (bidi-index (or (gethash bidi-class *bidi-classes*)
173 (error "unknown bidirectional class ~A"
175 (ccc-index (parse-integer canonical-combining-class))
176 (digit-index (unless (string= "" decimal-digit)
177 (parse-integer decimal-digit)))
178 (upper-index (unless (string= "" simple-uppercase)
179 (parse-integer simple-uppercase :radix 16)))
180 (lower-index (unless (string= "" simple-lowercase)
181 (parse-integer simple-lowercase :radix 16)))
182 (title-index (unless (string= "" simple-titlecase)
183 (parse-integer simple-titlecase :radix 16)))
185 (not (null (or (and (= gc-index 0) lower-index)
186 (and (= gc-index 1) upper-index)))))
187 (misc-index (hash-misc gc-index bidi-index ccc-index
188 decimal-digit digit bidi-mirrored
190 (declare (ignore digit-index))
191 (when (and (not cl-both-case-p)
193 (format t "~A~%" name))
194 (incf *name-size* (length name))
195 (when (string/= "" decomposition-type-and-mapping)
196 (let ((split (split-string decomposition-type-and-mapping
198 (when (char= #\< (aref (first split) 0))
199 (setf (gethash (pop split) *decomposition-types*) t))
200 (unless (aref *decomposition-base* (cp-high code-point))
201 (setf (aref *decomposition-base* (cp-high code-point))
202 (make-array (ash 1 *page-size-exponent*)
203 :initial-element nil)))
204 (setf (aref (aref *decomposition-base* (cp-high code-point))
206 (mapcar #'(lambda (string)
207 (parse-integer string :radix 16))
209 (setq *decomposition-length-max*
210 (max *decomposition-length-max* (length split)))
211 (incf *decompositions* (length split))))
212 (when (and (string/= "" simple-uppercase)
213 (string/= "" simple-lowercase))
214 (push (list code-point upper-index lower-index) *both-cases*))
215 (when (string/= simple-uppercase simple-titlecase)
216 (push (cons code-point title-index) *different-titlecases*))
217 (when (string/= digit numeric)
218 (push (cons code-point numeric) *different-numerics*))
221 (unless *last-uppercase*
222 (incf *uppercase-transition-count*))
223 (setq *last-uppercase* t))
225 (when *last-uppercase*
226 (incf *uppercase-transition-count*))
227 (setq *last-uppercase* nil)))
228 (when (> ccc-index 255)
229 (error "canonical combining class too large ~A" ccc-index))
230 (let ((result (make-ucd :misc misc-index
231 :transform (or upper-index lower-index 0))))
232 (when (and (> (length name) 7)
233 (string= ", Last>" name :start2 (- (length name) 7)))
234 (let ((page-start (ash (+ *block-first*
235 (ash 1 *page-size-exponent*)
237 (- *page-size-exponent*)))
238 (page-end (ash code-point (- *page-size-exponent*))))
239 (loop for point from *block-first*
240 below (ash page-start *page-size-exponent*)
241 do (setf (aref (aref *ucd-base* (cp-high point))
244 (loop for page from page-start below page-end
245 do (setf (aref *ucd-base* page)
246 (make-array (ash 1 *page-size-exponent*)
247 :initial-element result)))
248 (loop for point from (ash page-end *page-size-exponent*)
250 do (setf (aref (aref *ucd-base* (cp-high point))
253 (values result (normalize-character-name name)))))))
255 (defun slurp-ucd-line (line)
256 (let* ((split-line (split-string line #\;))
257 (code-point (parse-integer (first split-line) :radix 16))
258 (code-high (ash code-point (- *page-size-exponent*)))
259 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
260 (unless (aref *ucd-base* code-high)
261 (setf (aref *ucd-base* code-high)
262 (make-array (ash 1 *page-size-exponent*)
263 :initial-element nil)))
264 (multiple-value-bind (encoding name)
265 (encode-ucd-line (cdr split-line) code-point)
266 (setf (aref (aref *ucd-base* code-high) code-low) encoding
267 (gethash code-point *unicode-names*) name))))
269 (defun second-pass ()
270 (loop for i from 0 below (length *ucd-base*)
271 when (aref *ucd-base* i)
272 do (loop for j from 0 below (length (aref *ucd-base* i))
273 for result = (aref (aref *ucd-base* i) j)
275 when (let* ((transform-point (ucd-transform result))
276 (transform-high (ash transform-point
277 (- *page-size-exponent*)))
278 (transform-low (ldb (byte *page-size-exponent* 0)
280 (and (plusp transform-point)
282 (aref (aref *ucd-base* transform-high)
284 (+ (ash i *page-size-exponent*) j))))
285 do (destructuring-bind (gc-index bidi-index ccc-index
286 decimal-digit digit bidi-mirrored
288 (aref *misc-table* (ucd-misc result))
289 (declare (ignore cl-both-case-p))
290 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
291 (setf (ucd-misc result)
292 (hash-misc gc-index bidi-index ccc-index
293 decimal-digit digit bidi-mirrored
296 (defun write-3-byte (triplet stream)
297 (write-byte (ldb (byte 8 0) triplet) stream)
298 (write-byte (ldb (byte 8 8) triplet) stream)
299 (write-byte (ldb (byte 8 16) triplet) stream))
301 (defun digit-to-byte (digit)
302 (if (string= "" digit)
304 (parse-integer digit)))
307 (let ((hash (make-hash-table :test #'equalp))
309 (loop for page across *ucd-base*
311 (unless (gethash page hash)
312 (setf (gethash page hash)
314 (let ((array (make-array (1+ index))))
315 (maphash #'(lambda (key value)
316 (setf (aref array value) key))
319 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
320 (with-open-file (stream (make-pathname :name "ucd"
322 :defaults *output-directory*)
324 :element-type '(unsigned-byte 8)
325 :if-exists :supersede
326 :if-does-not-exist :create)
327 (loop for (gc-index bidi-index ccc-index decimal-digit digit
330 do (write-byte gc-index stream)
331 do (write-byte bidi-index stream)
332 do (write-byte ccc-index stream)
333 do (write-byte (digit-to-byte decimal-digit) stream)
334 do (write-byte (digit-to-byte digit) stream)
335 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
336 do (write-byte 0 stream)
337 do (write-byte 0 stream))
338 (loop for page across *ucd-base*
339 do (write-byte (if page (gethash page hash) 0) stream))
340 (loop for page across array
341 do (loop for entry across page
342 do (write-byte (if entry
343 (aref *misc-mapping* (ucd-misc entry))
346 do (write-3-byte (if entry (ucd-transform entry) 0)
348 (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
349 :defaults *output-directory*)
351 :if-exists :supersede
352 :if-does-not-exist :create)
353 (with-standard-io-syntax
354 (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
355 (maphash (lambda (code name)
360 (setf *unicode-names* nil))
361 (with-open-file (*standard-output*
362 (make-pathname :name "numerics"
364 :defaults *output-directory*)
366 :if-exists :supersede
367 :if-does-not-exist :create)
368 (with-standard-io-syntax
369 (let ((*print-pretty* t))
370 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
371 *different-numerics*)))))
372 (with-open-file (*standard-output*
373 (make-pathname :name "titlecases"
375 :defaults *output-directory*)
377 :if-exists :supersede
378 :if-does-not-exist :create)
379 (with-standard-io-syntax
380 (let ((*print-pretty* t))
381 (prin1 *different-titlecases*))))
382 (with-open-file (*standard-output*
383 (make-pathname :name "misc"
385 :defaults *output-directory*)
387 :if-exists :supersede
388 :if-does-not-exist :create)
389 (with-standard-io-syntax
390 (let ((*print-pretty* t))
391 (prin1 `(:length ,(length *misc-table*)
392 :uppercase ,(loop for (gc-index) across *misc-table*
396 :lowercase ,(loop for (gc-index) across *misc-table*
400 :titlecase ,(loop for (gc-index) across *misc-table*
406 ;;; Use of the generated files
408 (defparameter *compiled-ucd* nil)
410 (defun read-compiled-ucd ()
411 (with-open-file (stream (make-pathname :name "ucd"
413 :defaults *output-directory*)
415 :element-type '(unsigned-byte 8))
416 (let ((length (file-length stream)))
418 (make-array length :element-type '(unsigned-byte 8)))
419 (read-sequence *compiled-ucd* stream)))
422 ;;; The stuff below is dependent on misc.lisp-expr being
423 ;;; (:LENGTH 206 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
425 ;;; There are two entries for UPPERCASE and LOWERCASE because some
426 ;;; characters have case (by Unicode standards) but are not
427 ;;; transformable character-by-character in a locale-independet way
428 ;;; (as CL requires for its standard operators).
430 ;;; for more details on these debugging functions, see the description
431 ;;; of the character database format in src/code/target-char.lisp
433 (defparameter *length* 206)
436 (let* ((cp-high (cp-high cp))
437 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
439 (ash #x110000 (- *page-size-exponent*))
440 (* (ash 4 *page-size-exponent*) page)
443 (defun cp-value-0 (cp)
444 (aref *compiled-ucd* (cp-index cp)))
446 (defun cp-value-1 (cp)
447 (let ((index (cp-index cp)))
448 (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
449 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
450 (aref *compiled-ucd* (1+ index))))))
452 (defun cp-general-category (cp)
453 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
455 (defun cp-decimal-digit (cp)
456 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
457 (and (< decimal-digit 10)
460 (defun cp-alpha-char-p (cp)
461 (< (cp-general-category cp) 5))
463 (defun cp-alphanumericp (cp)
464 (let ((gc (cp-general-category cp)))
468 (defun cp-digit-char-p (cp &optional (radix 10))
469 (let ((number (or (cp-decimal-digit cp)
474 (when (and number (< number radix))
477 (defun cp-graphic-char-p (cp)
481 (defun cp-char-upcase (cp)
482 (if (= (cp-value-0 cp) 1)
486 (defun cp-char-downcase (cp)
487 (if (= (cp-value-0 cp) 0)
491 (defun cp-upper-case-p (cp)
492 (= (cp-value-0 cp) 0))
494 (defun cp-lower-case-p (cp)
495 (= (cp-value-0 cp) 1))
497 (defun cp-both-case-p (cp)
498 (< (cp-value-0 cp) 2))