5 (defparameter *output-directory*
7 (make-pathname :directory '(:relative :up "output"))
8 (make-pathname :directory (pathname-directory *load-pathname*))))
10 (defparameter *page-size-exponent* 8)
13 (ash cp (- *page-size-exponent*)))
16 (ldb (byte *page-size-exponent* 0) cp))
20 (defparameter *unicode-character-database*
21 (make-pathname :directory (pathname-directory *load-pathname*)))
23 (defparameter *ucd-base* nil)
25 (defparameter *last-uppercase* nil)
26 (defparameter *uppercase-transition-count* 0)
27 (defparameter *different-titlecases* nil)
28 (defparameter *different-numerics* nil)
29 (defparameter *name-size* 0)
30 (defparameter *misc-hash* (make-hash-table :test #'equal))
31 (defparameter *misc-index* -1)
32 (defparameter *misc-table* nil)
33 (defparameter *misc-mapping* nil)
34 (defparameter *both-cases* nil)
35 (defparameter *decompositions* nil)
36 (defparameter *decomposition-length-max* nil)
37 (defparameter *decomposition-types* nil)
38 (defparameter *decomposition-base* nil)
40 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
41 bidi-mirrored cl-both-case-p)
42 (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
43 bidi-mirrored cl-both-case-p))
44 (index (gethash list *misc-hash*)))
47 (vector-push list *misc-table*)
48 (setf (gethash list *misc-hash*)
49 (incf *misc-index*))))))
51 (defun compare-misc-entry (left right)
52 (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
53 left-decimal-digit left-digit left-bidi-mirrored
56 (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
57 right-decimal-digit right-digit right-bidi-mirrored
60 (or (and left-cl-both-case-p (not right-cl-both-case-p))
61 (and (or left-cl-both-case-p (not right-cl-both-case-p))
62 (or (< left-gc-index right-gc-index)
63 (and (= left-gc-index right-gc-index)
64 (or (< left-bidi-index right-bidi-index)
65 (and (= left-bidi-index right-bidi-index)
66 (or (< left-ccc-index right-ccc-index)
67 (and (= left-ccc-index right-ccc-index)
68 (or (string< left-decimal-digit
70 (and (string= left-decimal-digit
72 (or (string< left-digit right-digit)
73 (and (string= left-digit
75 (string< left-bidi-mirrored
76 right-bidi-mirrored))))))))))))))))
78 (defun build-misc-table ()
79 (sort *misc-table* #'compare-misc-entry)
80 (setq *misc-mapping* (make-array (1+ *misc-index*)))
81 (loop for i from 0 to *misc-index*
82 do (setf (aref *misc-mapping*
83 (gethash (aref *misc-table* i) *misc-hash*))
87 (setq *last-uppercase* nil)
88 (setq *uppercase-transition-count* 0)
89 (setq *different-titlecases* nil)
90 (setq *different-numerics* nil)
92 (setq *misc-hash* (make-hash-table :test #'equal))
93 (setq *misc-index* -1)
94 (setq *misc-table* (make-array 256 :fill-pointer 0))
95 (setq *both-cases* nil)
96 (setq *decompositions* 0)
97 (setq *decomposition-types* (make-hash-table :test #'equal))
98 (setq *decomposition-length-max* 0)
99 (setq *decomposition-base* (make-array (ash #x110000
100 (- *page-size-exponent*))
101 :initial-element nil))
102 (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
103 :initial-element nil))
104 (with-open-file (*standard-input*
105 (make-pathname :name "UnicodeData"
107 :defaults *unicode-character-database*)
109 (loop for line = (read-line nil nil)
111 do (slurp-ucd-line line)))
116 (defun split-string (line character)
117 (loop for prev-position = 0 then (1+ position)
118 for position = (position character line :start prev-position)
119 collect (subseq line prev-position position)
123 (defun init-indices (strings)
124 (let ((hash (make-hash-table :test #'equal)))
125 (loop for string in strings
127 do (setf (gethash string hash) index))
130 (defparameter *general-categories*
131 (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
132 "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
133 "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
134 (defparameter *bidi-classes*
135 (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
136 "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
139 (defparameter *block-first* nil)
141 ;;; 3400 -- 4DB5 : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
142 ;;; AC00 -- D7A3 : hangul syllables ;Lo;0;L;;;;;N;;;;;
143 ;;; D800 -- F8FF : surrogates and private use
144 ;;; 20000 -- 2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
145 ;;; F0000 -- FFFFD : private use
146 ;;; 100000 -- 10FFFD: private use
147 (defun encode-ucd-line (line code-point)
148 (destructuring-bind (name general-category canonical-combining-class
149 bidi-class decomposition-type-and-mapping
150 decimal-digit digit numeric bidi-mirrored
151 unicode-1-name iso-10646-comment simple-uppercase
152 simple-lowercase simple-titlecase)
154 (declare (ignore unicode-1-name iso-10646-comment))
155 (if (and (> (length name) 8)
156 (string= ", First>" name :start2 (- (length name) 8)))
158 (setq *block-first* code-point)
160 (let* ((gc-index (or (gethash general-category *general-categories*)
161 (error "unknown general category ~A"
163 (bidi-index (or (gethash bidi-class *bidi-classes*)
164 (error "unknown bidirectional class ~A"
166 (ccc-index (parse-integer canonical-combining-class))
167 (digit-index (unless (string= "" decimal-digit)
168 (parse-integer decimal-digit)))
169 (upper-index (unless (string= "" simple-uppercase)
170 (parse-integer simple-uppercase :radix 16)))
171 (lower-index (unless (string= "" simple-lowercase)
172 (parse-integer simple-lowercase :radix 16)))
173 (title-index (unless (string= "" simple-titlecase)
174 (parse-integer simple-titlecase :radix 16)))
176 (not (null (or (and (= gc-index 0) lower-index)
177 (and (= gc-index 1) upper-index)))))
178 (misc-index (hash-misc gc-index bidi-index ccc-index
179 decimal-digit digit bidi-mirrored
181 (declare (ignore digit-index))
182 (incf *name-size* (length name))
183 (when (string/= "" decomposition-type-and-mapping)
184 (let ((split (split-string decomposition-type-and-mapping
186 (when (char= #\< (aref (first split) 0))
187 (setf (gethash (pop split) *decomposition-types*) t))
188 (unless (aref *decomposition-base* (cp-high code-point))
189 (setf (aref *decomposition-base* (cp-high code-point))
190 (make-array (ash 1 *page-size-exponent*)
191 :initial-element nil)))
192 (setf (aref (aref *decomposition-base* (cp-high code-point))
194 (mapcar #'(lambda (string)
195 (parse-integer string :radix 16))
197 (setq *decomposition-length-max*
198 (max *decomposition-length-max* (length split)))
199 (incf *decompositions* (length split))))
200 (when (and (string/= "" simple-uppercase)
201 (string/= "" simple-lowercase))
202 (push (list code-point upper-index lower-index) *both-cases*))
203 (when (string/= simple-uppercase simple-titlecase)
204 (push (cons code-point title-index) *different-titlecases*))
205 (when (string/= digit numeric)
206 (push (cons code-point numeric) *different-numerics*))
209 (unless *last-uppercase*
210 (incf *uppercase-transition-count*))
211 (setq *last-uppercase* t))
213 (when *last-uppercase*
214 (incf *uppercase-transition-count*))
215 (setq *last-uppercase* nil)))
216 (when (> ccc-index 255)
217 (error "canonical combining class too large ~A" ccc-index))
218 (let ((result (vector misc-index (or upper-index lower-index 0))))
219 (when (and (> (length name) 7)
220 (string= ", Last>" name :start2 (- (length name) 7)))
221 (let ((page-start (ash (+ *block-first*
222 (ash 1 *page-size-exponent*)
224 (- *page-size-exponent*)))
225 (page-end (ash code-point (- *page-size-exponent*))))
226 (loop for point from *block-first*
227 below (ash page-start *page-size-exponent*)
228 do (setf (aref (aref *ucd-base* (cp-high point))
231 (loop for page from page-start below page-end
232 do (setf (aref *ucd-base* page)
233 (make-array (ash 1 *page-size-exponent*)
234 :initial-element result)))
235 (loop for point from (ash page-end *page-size-exponent*)
237 do (setf (aref (aref *ucd-base* (cp-high point))
242 (defun slurp-ucd-line (line)
243 (let* ((split-line (split-string line #\;))
244 (code-point (parse-integer (first split-line) :radix 16))
245 (code-high (ash code-point (- *page-size-exponent*)))
246 (code-low (ldb (byte *page-size-exponent* 0) code-point)))
247 (unless (aref *ucd-base* code-high)
248 (setf (aref *ucd-base* code-high)
249 (make-array (ash 1 *page-size-exponent*)
250 :initial-element nil)))
251 (setf (aref (aref *ucd-base* code-high) code-low)
252 (encode-ucd-line (cdr split-line) code-point))))
254 (defun second-pass ()
255 (loop for i from 0 below (length *ucd-base*)
256 when (aref *ucd-base* i)
257 do (loop for j from 0 below (length (aref *ucd-base* i))
258 for result = (aref (aref *ucd-base* i) j)
260 when (let* ((transform-point (aref result 1))
261 (transform-high (ash transform-point
262 (- *page-size-exponent*)))
263 (transform-low (ldb (byte *page-size-exponent* 0)
265 (and (plusp transform-point)
266 (/= (aref (aref (aref *ucd-base* transform-high)
269 (+ (ash i *page-size-exponent*) j))))
270 do (destructuring-bind (gc-index bidi-index ccc-index
271 decimal-digit digit bidi-mirrored
273 (aref *misc-table* (aref result 0))
274 (declare (ignore cl-both-case-p))
275 (format t "~A~%" (+ (ash i *page-size-exponent*) j))
276 (setf (aref result 0)
277 (hash-misc gc-index bidi-index ccc-index
278 decimal-digit digit bidi-mirrored
281 (defun write-3-byte (triplet stream)
282 (write-byte (ldb (byte 8 0) triplet) stream)
283 (write-byte (ldb (byte 8 8) triplet) stream)
284 (write-byte (ldb (byte 8 16) triplet) stream))
286 (defun digit-to-byte (digit)
287 (if (string= "" digit)
289 (parse-integer digit)))
292 (let ((hash (make-hash-table :test #'equalp))
294 (loop for page across *ucd-base*
296 (unless (gethash page hash)
297 (setf (gethash page hash)
299 (let ((array (make-array (1+ index))))
300 (maphash #'(lambda (key value)
301 (setf (aref array value) key))
304 (make-array (ash 1 *page-size-exponent*) :initial-element nil))
305 (with-open-file (stream (make-pathname :name "ucd"
307 :defaults *output-directory*)
309 :element-type '(unsigned-byte 8)
310 :if-exists :supersede
311 :if-does-not-exist :create)
312 (loop for (gc-index bidi-index ccc-index decimal-digit digit
315 do (write-byte gc-index stream)
316 do (write-byte bidi-index stream)
317 do (write-byte ccc-index stream)
318 do (write-byte (digit-to-byte decimal-digit) stream)
319 do (write-byte (digit-to-byte digit) stream)
320 do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
321 do (write-byte 0 stream)
322 do (write-byte 0 stream))
323 (loop for page across *ucd-base*
324 do (write-byte (if page (gethash page hash) 0) stream))
325 (loop for page across array
326 do (loop for entry across page
327 do (write-byte (if entry
328 (aref *misc-mapping* (aref entry 0))
331 do (write-3-byte (if entry (aref entry 1) 0)
333 (with-open-file (*standard-output*
334 (make-pathname :name "numerics"
336 :defaults *output-directory*)
338 :if-exists :supersede
339 :if-does-not-exist :create)
340 (let ((*print-pretty* t))
341 (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
342 *different-numerics*))))
343 (with-open-file (*standard-output*
344 (make-pathname :name "titlecases"
346 :defaults *output-directory*)
348 :if-exists :supersede
349 :if-does-not-exist :create)
350 (let ((*print-pretty* t))
351 (prin1 *different-titlecases*)))
352 (with-open-file (*standard-output*
353 (make-pathname :name "misc"
355 :defaults *output-directory*)
357 :if-exists :supersede
358 :if-does-not-exist :create)
359 (let ((*print-pretty* t))
360 (prin1 `(:length ,(length *misc-table*)
361 :uppercase ,(loop for (gc-index) across *misc-table*
365 :lowercase ,(loop for (gc-index) across *misc-table*
369 :titlecase ,(loop for (gc-index) across *misc-table*
375 ;;; Use of the generated files
377 (defparameter *compiled-ucd* nil)
379 (defun read-compiled-ucd ()
380 (with-open-file (stream (make-pathname :name "ucd"
382 :defaults *output-directory*)
384 :element-type '(unsigned-byte 8))
385 (let ((length (file-length stream)))
387 (make-array length :element-type '(unsigned-byte 8)))
388 (read-sequence *compiled-ucd* stream)))
391 ;;; The stuff below is dependent on misc.lisp-expr being
392 ;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
394 (defparameter *length* 186)
397 (let* ((cp-high (cp-high cp))
398 (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
400 (ash #x110000 (- *page-size-exponent*))
401 (* (ash 4 *page-size-exponent*) page)
404 (defun cp-value-0 (cp)
405 (aref *compiled-ucd* (cp-index cp)))
407 (defun cp-value-1 (cp)
408 (let ((index (cp-index cp)))
409 (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
410 (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
411 (aref *compiled-ucd* (1+ index))))))
413 (defun cp-general-category (cp)
414 (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
416 (defun cp-decimal-digit (cp)
417 (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
418 (and (< decimal-digit 10)
421 (defun cp-alpha-char-p (cp)
422 (< (cp-general-category cp) 5))
424 (defun cp-alphanumericp (cp)
425 (let ((gc (cp-general-category cp)))
429 (defun cp-digit-char-p (cp &optional (radix 10))
430 (let ((number (or (cp-decimal-digit cp)
435 (when (and number (< number radix))
438 (defun cp-graphic-char-p (cp)
442 (defun cp-char-upcase (cp)
443 (if (= (cp-value-0 cp) 1)
447 (defun cp-char-downcase (cp)
448 (if (= (cp-value-0 cp) 0)
452 (defun cp-upper-case-p (cp)
453 (= (cp-value-0 cp) 0))
455 (defun cp-lower-case-p (cp)
456 (= (cp-value-0 cp) 1))
458 (defun cp-both-case-p (cp)
459 (< (cp-value-0 cp) 2))