1.0.31.32: Update to Unicode 5.0.1
[sbcl.git] / tools-for-build / ucd.lisp
1 (in-package "SB-COLD")
2
3 ;;; Common
4
5 (defparameter *output-directory*
6   (merge-pathnames
7    (make-pathname :directory '(:relative :up "output"))
8    (make-pathname :directory (pathname-directory *load-truename*))))
9
10 (defparameter *page-size-exponent* 8)
11
12 (defun cp-high (cp)
13   (ash cp (- *page-size-exponent*)))
14
15 (defun cp-low (cp)
16   (ldb (byte *page-size-exponent* 0) cp))
17
18 ;;; Generator
19
20 (defstruct ucd misc transform)
21
22 (defparameter *unicode-character-database*
23   (make-pathname :directory (pathname-directory *load-truename*)))
24
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
27
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
42
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44                   bidi-mirrored cl-both-case-p)
45   (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46                      bidi-mirrored cl-both-case-p))
47          (index (gethash list *misc-hash*)))
48     (or index
49         (progn
50           (vector-push list *misc-table*)
51           (setf (gethash list *misc-hash*)
52                 (incf *misc-index*))))))
53
54 (defun compare-misc-entry (left right)
55   (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56                        left-decimal-digit left-digit left-bidi-mirrored
57                        left-cl-both-case-p)
58       left
59     (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60                          right-decimal-digit right-digit right-bidi-mirrored
61                          right-cl-both-case-p)
62         right
63       (or (and left-cl-both-case-p (not right-cl-both-case-p))
64           (and (or left-cl-both-case-p (not right-cl-both-case-p))
65                (or (< left-gc-index right-gc-index)
66                    (and (= left-gc-index right-gc-index)
67                         (or (< left-bidi-index right-bidi-index)
68                             (and (= left-bidi-index right-bidi-index)
69                                  (or (< left-ccc-index right-ccc-index)
70                                      (and (= left-ccc-index right-ccc-index)
71                                           (or (string< left-decimal-digit
72                                                        right-decimal-digit)
73                                               (and (string= left-decimal-digit
74                                                             right-decimal-digit)
75                                                    (or (string< left-digit right-digit)
76                                                        (and (string= left-digit
77                                                                      right-digit)
78                                                             (string< left-bidi-mirrored
79                                                                      right-bidi-mirrored))))))))))))))))
80
81 (defun build-misc-table ()
82   (sort *misc-table* #'compare-misc-entry)
83   (setq *misc-mapping* (make-array (1+ *misc-index*)))
84   (loop for i from 0 to *misc-index*
85         do (setf (aref *misc-mapping*
86                        (gethash (aref *misc-table* i) *misc-hash*))
87                  i)))
88
89 (defun slurp-ucd ()
90   (setq *last-uppercase* nil)
91   (setq *uppercase-transition-count* 0)
92   (setq *different-titlecases* nil)
93   (setq *different-numerics* nil)
94   (setq *name-size* 0)
95   (setq *misc-hash* (make-hash-table :test #'equal))
96   (setq *misc-index* -1)
97   (setq *misc-table* (make-array 256 :fill-pointer 0))
98   (setq *both-cases* nil)
99   (setq *decompositions* 0)
100   (setq *decomposition-types* (make-hash-table :test #'equal))
101   (setq *decomposition-length-max* 0)
102   (setq *decomposition-base* (make-array (ash #x110000
103                                               (- *page-size-exponent*))
104                                          :initial-element nil))
105   (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
106                                :initial-element nil))
107   (with-open-file (*standard-input*
108                    (make-pathname :name "UnicodeData"
109                                   :type "txt"
110                                   :defaults *unicode-character-database*)
111                    :direction :input)
112     (loop for line = (read-line nil nil)
113           while line
114           do (slurp-ucd-line line)))
115   (second-pass)
116   (build-misc-table)
117   *decompositions*)
118
119 (defun split-string (line character)
120   (loop for prev-position = 0 then (1+ position)
121         for position = (position character line :start prev-position)
122         collect (subseq line prev-position position)
123         do (unless position
124              (loop-finish))))
125
126 (defun init-indices (strings)
127   (let ((hash (make-hash-table :test #'equal)))
128     (loop for string in strings
129           for index from 0
130           do (setf (gethash string hash) index))
131     hash))
132
133 (defparameter *general-categories*
134   (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
135                   "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
136                   "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
137 (defparameter *bidi-classes*
138   (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
139                   "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
140
141
142 (defparameter *block-first* nil)
143
144 (defun normalize-character-name (name)
145   (when (find #\_ name)
146     (error "Bad name for a character: ~A" name))
147   (unless (or (zerop (length name)) (find #\< name) (find #\> name))
148     (substitute #\_ #\Space name)))
149
150 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
151 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
152 ;;;   D800  --  F8FF  : surrogates and private use
153 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
154 ;;;  F0000  --  FFFFD : private use
155 ;;; 100000  --  10FFFD: private use
156 (defun encode-ucd-line (line code-point)
157   (destructuring-bind (name general-category canonical-combining-class
158                             bidi-class decomposition-type-and-mapping
159                             decimal-digit digit numeric bidi-mirrored
160                             unicode-1-name iso-10646-comment simple-uppercase
161                             simple-lowercase simple-titlecase)
162       line
163     (declare (ignore unicode-1-name iso-10646-comment))
164     (if (and (> (length name) 8)
165              (string= ", First>" name :start2 (- (length name) 8)))
166         (progn
167           (setq *block-first* code-point)
168           nil)
169         (let* ((gc-index (or (gethash general-category *general-categories*)
170                              (error "unknown general category ~A"
171                                     general-category)))
172                (bidi-index (or (gethash bidi-class *bidi-classes*)
173                                (error "unknown bidirectional class ~A"
174                                       bidi-class)))
175                (ccc-index (parse-integer canonical-combining-class))
176                (digit-index (unless (string= "" decimal-digit)
177                               (parse-integer decimal-digit)))
178                (upper-index (unless (string= "" simple-uppercase)
179                               (parse-integer simple-uppercase :radix 16)))
180                (lower-index (unless (string= "" simple-lowercase)
181                               (parse-integer simple-lowercase :radix 16)))
182                (title-index (unless (string= "" simple-titlecase)
183                               (parse-integer simple-titlecase :radix 16)))
184                (cl-both-case-p
185                 (not (null (or (and (= gc-index 0) lower-index)
186                                (and (= gc-index 1) upper-index)))))
187                (misc-index (hash-misc gc-index bidi-index ccc-index
188                                       decimal-digit digit bidi-mirrored
189                                       cl-both-case-p)))
190           (declare (ignore digit-index))
191           (when (and (not cl-both-case-p)
192                      (< gc-index 2))
193             (format t "~A~%" name))
194           (incf *name-size* (length name))
195           (when (string/= "" decomposition-type-and-mapping)
196             (let ((split (split-string decomposition-type-and-mapping
197                                        #\Space)))
198               (when (char= #\< (aref (first split) 0))
199                 (setf (gethash (pop split) *decomposition-types*) t))
200               (unless (aref *decomposition-base* (cp-high code-point))
201                 (setf (aref *decomposition-base* (cp-high code-point))
202                       (make-array (ash 1 *page-size-exponent*)
203                                   :initial-element nil)))
204               (setf (aref (aref *decomposition-base* (cp-high code-point))
205                           (cp-low code-point))
206                     (mapcar #'(lambda (string)
207                                 (parse-integer string :radix 16))
208                             split))
209               (setq *decomposition-length-max*
210                     (max *decomposition-length-max* (length split)))
211               (incf *decompositions* (length split))))
212           (when (and (string/= "" simple-uppercase)
213                      (string/= "" simple-lowercase))
214             (push (list code-point upper-index lower-index) *both-cases*))
215           (when (string/= simple-uppercase simple-titlecase)
216             (push (cons code-point title-index) *different-titlecases*))
217           (when (string/= digit numeric)
218             (push (cons code-point numeric) *different-numerics*))
219           (cond
220             ((= gc-index 8)
221              (unless *last-uppercase*
222                (incf *uppercase-transition-count*))
223              (setq *last-uppercase* t))
224             (t
225              (when *last-uppercase*
226                (incf *uppercase-transition-count*))
227              (setq *last-uppercase* nil)))
228           (when (> ccc-index 255)
229             (error "canonical combining class too large ~A" ccc-index))
230           (let ((result (make-ucd :misc misc-index
231                                   :transform (or upper-index lower-index 0))))
232             (when (and (> (length name) 7)
233                        (string= ", Last>" name :start2 (- (length name) 7)))
234               (let ((page-start (ash (+ *block-first*
235                                         (ash 1 *page-size-exponent*)
236                                         -1)
237                                      (- *page-size-exponent*)))
238                     (page-end (ash code-point (- *page-size-exponent*))))
239                 (loop for point from *block-first*
240                       below (ash page-start *page-size-exponent*)
241                       do (setf (aref (aref *ucd-base* (cp-high point))
242                                      (cp-low point))
243                                result))
244                 (loop for page from page-start below page-end
245                       do (setf (aref *ucd-base* page)
246                                (make-array (ash 1 *page-size-exponent*)
247                                            :initial-element result)))
248                 (loop for point from (ash page-end *page-size-exponent*)
249                       below code-point
250                       do (setf (aref (aref *ucd-base* (cp-high point))
251                                      (cp-low point))
252                                result))))
253             (values result (normalize-character-name name)))))))
254
255 (defun slurp-ucd-line (line)
256   (let* ((split-line (split-string line #\;))
257          (code-point (parse-integer (first split-line) :radix 16))
258          (code-high (ash code-point (- *page-size-exponent*)))
259          (code-low (ldb (byte *page-size-exponent* 0) code-point)))
260     (unless (aref *ucd-base* code-high)
261       (setf (aref *ucd-base* code-high)
262             (make-array (ash 1 *page-size-exponent*)
263                         :initial-element nil)))
264     (multiple-value-bind (encoding name)
265         (encode-ucd-line (cdr split-line) code-point)
266       (setf (aref (aref *ucd-base* code-high) code-low) encoding
267             (gethash code-point *unicode-names*) name))))
268
269 (defun second-pass ()
270   (loop for i from 0 below (length *ucd-base*)
271         when (aref *ucd-base* i)
272         do (loop for j from 0 below (length (aref *ucd-base* i))
273                  for result = (aref (aref *ucd-base* i) j)
274                  when result
275                  when (let* ((transform-point (ucd-transform result))
276                              (transform-high (ash transform-point
277                                                   (- *page-size-exponent*)))
278                              (transform-low (ldb (byte *page-size-exponent* 0)
279                                                  transform-point)))
280                         (and (plusp transform-point)
281                              (/= (ucd-transform
282                                   (aref (aref *ucd-base* transform-high)
283                                         transform-low))
284                                  (+ (ash i *page-size-exponent*) j))))
285                  do (destructuring-bind (gc-index bidi-index ccc-index
286                                          decimal-digit digit bidi-mirrored
287                                          cl-both-case-p)
288                         (aref *misc-table* (ucd-misc result))
289                       (declare (ignore cl-both-case-p))
290                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
291                       (setf (ucd-misc result)
292                             (hash-misc gc-index bidi-index ccc-index
293                                        decimal-digit digit bidi-mirrored
294                                        nil))))))
295
296 (defun write-3-byte (triplet stream)
297   (write-byte (ldb (byte 8 0) triplet) stream)
298   (write-byte (ldb (byte 8 8) triplet) stream)
299   (write-byte (ldb (byte 8 16) triplet) stream))
300
301 (defun digit-to-byte (digit)
302   (if (string= "" digit)
303       255
304       (parse-integer digit)))
305
306 (defun output ()
307   (let ((hash (make-hash-table :test #'equalp))
308         (index 0))
309     (loop for page across *ucd-base*
310           do (when page
311                (unless (gethash page hash)
312                  (setf (gethash page hash)
313                        (incf index)))))
314     (let ((array (make-array (1+ index))))
315       (maphash #'(lambda (key value)
316                    (setf (aref array value) key))
317                hash)
318       (setf (aref array 0)
319             (make-array (ash 1 *page-size-exponent*) :initial-element nil))
320       (with-open-file (stream (make-pathname :name "ucd"
321                                              :type "dat"
322                                              :defaults *output-directory*)
323                               :direction :output
324                               :element-type '(unsigned-byte 8)
325                               :if-exists :supersede
326                               :if-does-not-exist :create)
327         (loop for (gc-index bidi-index ccc-index decimal-digit digit
328                             bidi-mirrored)
329               across *misc-table*
330               do (write-byte gc-index stream)
331               do (write-byte bidi-index stream)
332               do (write-byte ccc-index stream)
333               do (write-byte (digit-to-byte decimal-digit) stream)
334               do (write-byte (digit-to-byte digit) stream)
335               do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
336               do (write-byte 0 stream)
337               do (write-byte 0 stream))
338         (loop for page across *ucd-base*
339            do (write-byte (if page (gethash page hash) 0) stream))
340         (loop for page across array
341            do (loop for entry across page
342                  do (write-byte (if entry
343                                     (aref *misc-mapping* (ucd-misc entry))
344                                     255)
345                                 stream)
346                  do (write-3-byte (if entry (ucd-transform entry) 0)
347                                   stream))))))
348   (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
349                                     :defaults *output-directory*)
350                      :direction :output
351                      :if-exists :supersede
352                      :if-does-not-exist :create)
353     (with-standard-io-syntax
354       (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
355       (maphash (lambda (code name)
356                  (when name
357                   (print code f)
358                   (prin1 name f)))
359                *unicode-names*))
360     (setf *unicode-names* nil))
361   (with-open-file (*standard-output*
362                    (make-pathname :name "numerics"
363                                   :type "lisp-expr"
364                                   :defaults *output-directory*)
365                    :direction :output
366                    :if-exists :supersede
367                    :if-does-not-exist :create)
368     (with-standard-io-syntax
369       (let ((*print-pretty* t))
370         (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
371                        *different-numerics*)))))
372   (with-open-file (*standard-output*
373                    (make-pathname :name "titlecases"
374                                   :type "lisp-expr"
375                                   :defaults *output-directory*)
376                    :direction :output
377                    :if-exists :supersede
378                    :if-does-not-exist :create)
379     (with-standard-io-syntax
380       (let ((*print-pretty* t))
381         (prin1 *different-titlecases*))))
382   (with-open-file (*standard-output*
383                    (make-pathname :name "misc"
384                                   :type "lisp-expr"
385                                   :defaults *output-directory*)
386                    :direction :output
387                    :if-exists :supersede
388                    :if-does-not-exist :create)
389     (with-standard-io-syntax
390       (let ((*print-pretty* t))
391         (prin1 `(:length ,(length *misc-table*)
392                  :uppercase ,(loop for (gc-index) across *misc-table*
393                                 for i from 0
394                                 when (= gc-index 0)
395                                 collect i)
396                  :lowercase ,(loop for (gc-index) across *misc-table*
397                                 for i from 0
398                                 when (= gc-index 1)
399                                 collect i)
400                  :titlecase ,(loop for (gc-index) across *misc-table*
401                                 for i from 0
402                                 when (= gc-index 2)
403                                 collect i))))))
404   (values))
405
406 ;;; Use of the generated files
407
408 (defparameter *compiled-ucd* nil)
409
410 (defun read-compiled-ucd ()
411   (with-open-file (stream (make-pathname :name "ucd"
412                                          :type "dat"
413                                          :defaults *output-directory*)
414                           :direction :input
415                           :element-type '(unsigned-byte 8))
416     (let ((length (file-length stream)))
417       (setq *compiled-ucd*
418             (make-array length :element-type '(unsigned-byte 8)))
419       (read-sequence *compiled-ucd* stream)))
420   (values))
421
422 ;;; The stuff below is dependent on misc.lisp-expr being
423 ;;; (:LENGTH 206 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
424 ;;;
425 ;;; There are two entries for UPPERCASE and LOWERCASE because some
426 ;;; characters have case (by Unicode standards) but are not
427 ;;; transformable character-by-character in a locale-independet way
428 ;;; (as CL requires for its standard operators).
429 ;;;
430 ;;; for more details on these debugging functions, see the description
431 ;;; of the character database format in src/code/target-char.lisp
432
433 (defparameter *length* 206)
434
435 (defun cp-index (cp)
436   (let* ((cp-high (cp-high cp))
437          (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
438     (+ (* 8 *length*)
439        (ash #x110000 (- *page-size-exponent*))
440        (* (ash 4 *page-size-exponent*) page)
441        (* 4 (cp-low cp)))))
442
443 (defun cp-value-0 (cp)
444   (aref *compiled-ucd* (cp-index cp)))
445
446 (defun cp-value-1 (cp)
447   (let ((index (cp-index cp)))
448     (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
449          (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
450               (aref *compiled-ucd* (1+ index))))))
451
452 (defun cp-general-category (cp)
453   (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
454
455 (defun cp-decimal-digit (cp)
456   (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
457     (and (< decimal-digit 10)
458          decimal-digit)))
459
460 (defun cp-alpha-char-p (cp)
461   (< (cp-general-category cp) 5))
462
463 (defun cp-alphanumericp (cp)
464   (let ((gc (cp-general-category cp)))
465     (or (< gc 5)
466         (= gc 12))))
467
468 (defun cp-digit-char-p (cp &optional (radix 10))
469   (let ((number (or (cp-decimal-digit cp)
470                     (and (<= 65 cp 90)
471                          (- cp 55))
472                     (and (<= 97 cp 122)
473                          (- cp 87)))))
474     (when (and number (< number radix))
475       number)))
476
477 (defun cp-graphic-char-p (cp)
478   (or (<= 32 cp 127)
479       (<= 160 cp)))
480
481 (defun cp-char-upcase (cp)
482   (if (= (cp-value-0 cp) 1)
483       (cp-value-1 cp)
484       cp))
485
486 (defun cp-char-downcase (cp)
487   (if (= (cp-value-0 cp) 0)
488       (cp-value-1 cp)
489       cp))
490
491 (defun cp-upper-case-p (cp)
492   (= (cp-value-0 cp) 0))
493
494 (defun cp-lower-case-p (cp)
495   (= (cp-value-0 cp) 1))
496
497 (defun cp-both-case-p (cp)
498   (< (cp-value-0 cp) 2))