8b7c3e5e2500b08537d003039fefbca3cc4c238a
[sbcl.git] / tools-for-build / ucd.lisp
1 (in-package "SB-COLD")
2
3 ;;; Common
4
5 (defparameter *output-directory*
6   (merge-pathnames
7    (make-pathname :directory '(:relative :up "output"))
8    (make-pathname :directory (pathname-directory *load-truename*))))
9
10 (defparameter *page-size-exponent* 8)
11
12 (defun cp-high (cp)
13   (ash cp (- *page-size-exponent*)))
14
15 (defun cp-low (cp)
16   (ldb (byte *page-size-exponent* 0) cp))
17
18 ;;; Generator
19
20 (defstruct ucd misc transform)
21
22 (defparameter *unicode-character-database*
23   (make-pathname :directory (pathname-directory *load-truename*)))
24
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
27
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
42
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44                   bidi-mirrored cl-both-case-p decomposition-info)
45   (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46                      bidi-mirrored cl-both-case-p decomposition-info))
47          (index (gethash list *misc-hash*)))
48     (or index
49         (progn
50           (vector-push list *misc-table*)
51           (setf (gethash list *misc-hash*)
52                 (incf *misc-index*))))))
53
54 (defun compare-misc-entry (left right)
55   (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56                        left-decimal-digit left-digit left-bidi-mirrored
57                        left-cl-both-case-p left-decomposition-info)
58       left
59     (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60                          right-decimal-digit right-digit right-bidi-mirrored
61                          right-cl-both-case-p right-decomposition-info)
62         right
63       (or (and left-cl-both-case-p (not right-cl-both-case-p))
64           (and (or left-cl-both-case-p (not right-cl-both-case-p))
65                (or (< left-gc-index right-gc-index)
66                    (and (= left-gc-index right-gc-index)
67                         (or (< left-decomposition-info right-decomposition-info)
68                             (and (= left-decomposition-info right-decomposition-info)
69                                  (or (< left-bidi-index right-bidi-index)
70                                      (and (= left-bidi-index right-bidi-index)
71                                           (or (< left-ccc-index right-ccc-index)
72                                               (and (= left-ccc-index right-ccc-index)
73                                                    (or (string< left-decimal-digit
74                                                                 right-decimal-digit)
75                                                        (and (string= left-decimal-digit
76                                                                      right-decimal-digit)
77                                                             (or (string< left-digit right-digit)
78                                                                 (and (string= left-digit
79                                                                               right-digit)
80                                                                      (string< left-bidi-mirrored
81                                                                               right-bidi-mirrored))))))))))))))))))
82
83 (defun build-misc-table ()
84   (sort *misc-table* #'compare-misc-entry)
85   (setq *misc-mapping* (make-array (1+ *misc-index*)))
86   (loop for i from 0 to *misc-index*
87      do (setf (aref *misc-mapping*
88                     (gethash (aref *misc-table* i) *misc-hash*))
89               i)))
90
91 (defun slurp-ucd ()
92   (setq *last-uppercase* nil)
93   (setq *uppercase-transition-count* 0)
94   (setq *different-titlecases* nil)
95   (setq *different-numerics* nil)
96   (setq *name-size* 0)
97   (setq *misc-hash* (make-hash-table :test #'equal))
98   (setq *misc-index* -1)
99   (setq *misc-table* (make-array 2048 :fill-pointer 0))
100   (setq *both-cases* nil)
101   (setq *decompositions* 0)
102   (setq *decomposition-types*
103         (let ((array (make-array 256 :initial-element nil :fill-pointer 1)))
104           (vector-push "" array)
105           (vector-push "<compat>" array)
106           array))
107   (setq *decomposition-length-max* 0)
108   (setq *decomposition-base* (make-array (ash #x110000
109                                               (- *page-size-exponent*))
110                                          :initial-element nil))
111   (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
112                                :initial-element nil))
113   (with-open-file (*standard-input*
114                    (make-pathname :name "UnicodeData"
115                                   :type "txt"
116                                   :defaults *unicode-character-database*)
117                    :direction :input)
118     (loop for line = (read-line nil nil)
119           while line
120           do (slurp-ucd-line line)))
121   (second-pass)
122   (build-misc-table)
123   (fixup-hangul-syllables)
124   *decompositions*)
125
126 (defun fixup-hangul-syllables ()
127   ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
128   (let* ((sbase #xac00)
129          (lbase #x1100)
130          (vbase #x1161)
131          (tbase #x11a7)
132          (scount 11172)
133          (lcount 19)
134          (vcount 21)
135          (tcount 28)
136          (ncount (* vcount tcount))
137          (table (make-hash-table)))
138     (with-open-file (*standard-input*
139                      (make-pathname :name "Jamo" :type "txt"
140                                     :defaults *unicode-character-database*))
141       (loop for line = (read-line nil nil)
142             while line
143             if (position #\; line)
144             do (add-jamo-information line table)))
145     (dotimes (sindex scount)
146       (let* ((l (+ lbase (floor sindex ncount)))
147              (v (+ vbase (floor (mod sindex ncount) tcount)))
148              (tee (+ tbase (mod sindex tcount)))
149              (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
150                            (gethash l table) (gethash v table)
151                            (= tee tbase) (gethash tee table))))
152         (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
153
154 (defun add-jamo-information (line table)
155   (let* ((split (split-string line #\;))
156          (code (parse-integer (first split) :radix 16))
157          (syllable (string-trim '(#\Space)
158                                 (subseq (second split) 0 (position #\# (second split))))))
159     (setf (gethash code table) syllable)))
160
161 (defun split-string (line character)
162   (loop for prev-position = 0 then (1+ position)
163         for position = (position character line :start prev-position)
164         collect (subseq line prev-position position)
165         do (unless position
166              (loop-finish))))
167
168 (defun init-indices (strings)
169   (let ((hash (make-hash-table :test #'equal)))
170     (loop for string in strings
171           for index from 0
172           do (setf (gethash string hash) index))
173     hash))
174
175 (defparameter *general-categories*
176   (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
177                   "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
178                   "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
179 (defparameter *bidi-classes*
180   (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
181                   "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
182
183
184 (defparameter *block-first* nil)
185
186 (defun normalize-character-name (name)
187   (when (find #\_ name)
188     (error "Bad name for a character: ~A" name))
189   (unless (or (zerop (length name)) (find #\< name) (find #\> name))
190     (substitute #\_ #\Space name)))
191
192 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
193 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
194 ;;;   D800  --  F8FF  : surrogates and private use
195 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
196 ;;;  F0000  --  FFFFD : private use
197 ;;; 100000  --  10FFFD: private use
198 (defun encode-ucd-line (line code-point)
199   (destructuring-bind (name general-category canonical-combining-class
200                             bidi-class decomposition-type-and-mapping
201                             decimal-digit digit numeric bidi-mirrored
202                             unicode-1-name iso-10646-comment simple-uppercase
203                             simple-lowercase simple-titlecase)
204       line
205     (declare (ignore unicode-1-name iso-10646-comment))
206     (if (and (> (length name) 8)
207              (string= ", First>" name :start2 (- (length name) 8)))
208         (progn
209           (setq *block-first* code-point)
210           nil)
211         (let* ((gc-index (or (gethash general-category *general-categories*)
212                              (error "unknown general category ~A"
213                                     general-category)))
214                (bidi-index (or (gethash bidi-class *bidi-classes*)
215                                (error "unknown bidirectional class ~A"
216                                       bidi-class)))
217                (ccc-index (parse-integer canonical-combining-class))
218                (digit-index (unless (string= "" decimal-digit)
219                               (parse-integer decimal-digit)))
220                (upper-index (unless (string= "" simple-uppercase)
221                               (parse-integer simple-uppercase :radix 16)))
222                (lower-index (unless (string= "" simple-lowercase)
223                               (parse-integer simple-lowercase :radix 16)))
224                (title-index (unless (string= "" simple-titlecase)
225                               (parse-integer simple-titlecase :radix 16)))
226                (cl-both-case-p
227                 (not (null (or (and (= gc-index 0) lower-index)
228                                (and (= gc-index 1) upper-index)))))
229                (decomposition-info 0))
230           (declare (ignore digit-index))
231           (when (and (not cl-both-case-p)
232                      (< gc-index 2))
233             (format t "~A~%" name))
234           (incf *name-size* (length name))
235           (when (string/= "" decomposition-type-and-mapping)
236             (let ((split (split-string decomposition-type-and-mapping #\Space)))
237               (cond
238                 ((char= #\< (aref (first split) 0))
239                  (unless (position (first split) *decomposition-types*
240                                    :test #'equal)
241                    (vector-push (first split) *decomposition-types*))
242                  (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal)))
243                 (t (setf decomposition-info 1)))
244               (unless (aref *decomposition-base* (cp-high code-point))
245                 (setf (aref *decomposition-base* (cp-high code-point))
246                       (make-array (ash 1 *page-size-exponent*)
247                                   :initial-element nil)))
248               (setf (aref (aref *decomposition-base* (cp-high code-point))
249                           (cp-low code-point))
250                     (mapcar #'(lambda (string)
251                                 (parse-integer string :radix 16))
252                             split))
253               (setq *decomposition-length-max*
254                     (max *decomposition-length-max* (length split)))
255               (incf *decompositions* (length split))))
256           (when (and (string/= "" simple-uppercase)
257                      (string/= "" simple-lowercase))
258             (push (list code-point upper-index lower-index) *both-cases*))
259           (when (string/= simple-uppercase simple-titlecase)
260             (push (cons code-point title-index) *different-titlecases*))
261           (when (string/= digit numeric)
262             (push (cons code-point numeric) *different-numerics*))
263           (cond
264             ((= gc-index 8)
265              (unless *last-uppercase*
266                (incf *uppercase-transition-count*))
267              (setq *last-uppercase* t))
268             (t
269              (when *last-uppercase*
270                (incf *uppercase-transition-count*))
271              (setq *last-uppercase* nil)))
272           (when (> ccc-index 255)
273             (error "canonical combining class too large ~A" ccc-index))
274           (let* ((misc-index (hash-misc gc-index bidi-index ccc-index
275                                         decimal-digit digit bidi-mirrored
276                                         cl-both-case-p decomposition-info))
277                  (result (make-ucd :misc misc-index
278                                    :transform (or upper-index lower-index 0))))
279             (when (and (> (length name) 7)
280                        (string= ", Last>" name :start2 (- (length name) 7)))
281               (let ((page-start (ash (+ *block-first*
282                                         (ash 1 *page-size-exponent*)
283                                         -1)
284                                      (- *page-size-exponent*)))
285                     (page-end (ash code-point (- *page-size-exponent*))))
286                 (loop for point from *block-first*
287                       below (ash page-start *page-size-exponent*)
288                       do (setf (aref (aref *ucd-base* (cp-high point))
289                                      (cp-low point))
290                                result))
291                 (loop for page from page-start below page-end
292                       do (setf (aref *ucd-base* page)
293                                (make-array (ash 1 *page-size-exponent*)
294                                            :initial-element result)))
295                 (loop for point from (ash page-end *page-size-exponent*)
296                       below code-point
297                       do (setf (aref (aref *ucd-base* (cp-high point))
298                                      (cp-low point))
299                                result))))
300             (values result (normalize-character-name name)))))))
301
302 (defun slurp-ucd-line (line)
303   (let* ((split-line (split-string line #\;))
304          (code-point (parse-integer (first split-line) :radix 16))
305          (code-high (ash code-point (- *page-size-exponent*)))
306          (code-low (ldb (byte *page-size-exponent* 0) code-point)))
307     (unless (aref *ucd-base* code-high)
308       (setf (aref *ucd-base* code-high)
309             (make-array (ash 1 *page-size-exponent*)
310                         :initial-element nil)))
311     (multiple-value-bind (encoding name)
312         (encode-ucd-line (cdr split-line) code-point)
313       (setf (aref (aref *ucd-base* code-high) code-low) encoding
314             (gethash code-point *unicode-names*) name))))
315
316 ;;; this fixes up the case conversion discrepancy between CL and
317 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
318 ;;; inverses, which is not true in general in Unicode even for
319 ;;; characters which change case to single characters.
320 (defun second-pass ()
321   (loop for i from 0 below (length *ucd-base*)
322         when (aref *ucd-base* i)
323         do (loop for j from 0 below (length (aref *ucd-base* i))
324                  for result = (aref (aref *ucd-base* i) j)
325                  when result
326                  when (let* ((transform-point (ucd-transform result))
327                              (transform-high (ash transform-point
328                                                   (- *page-size-exponent*)))
329                              (transform-low (ldb (byte *page-size-exponent* 0)
330                                                  transform-point)))
331                         (and (plusp transform-point)
332                              (/= (ucd-transform
333                                   (aref (aref *ucd-base* transform-high)
334                                         transform-low))
335                                  (+ (ash i *page-size-exponent*) j))))
336                  do (destructuring-bind (gc-index bidi-index ccc-index
337                                          decimal-digit digit bidi-mirrored
338                                          cl-both-case-p decomposition-info)
339                         (aref *misc-table* (ucd-misc result))
340                       (declare (ignore cl-both-case-p))
341                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
342                       (setf (ucd-misc result)
343                             (hash-misc gc-index bidi-index ccc-index
344                                        decimal-digit digit bidi-mirrored
345                                        nil decomposition-info))))))
346
347 (defun write-3-byte (triplet stream)
348   (write-byte (ldb (byte 8 0) triplet) stream)
349   (write-byte (ldb (byte 8 8) triplet) stream)
350   (write-byte (ldb (byte 8 16) triplet) stream))
351
352 (defun write-4-byte (quadruplet stream)
353   (write-byte (ldb (byte 8 24) quadruplet) stream)
354   (write-byte (ldb (byte 8 16) quadruplet) stream)
355   (write-byte (ldb (byte 8 8) quadruplet) stream)
356   (write-byte (ldb (byte 8 0) quadruplet) stream))
357
358 (defun digit-to-byte (digit)
359   (if (string= "" digit)
360       255
361       (parse-integer digit)))
362
363 (defun output ()
364   (let ((hash (make-hash-table :test #'equalp))
365         (index 0))
366     (loop for page across *ucd-base*
367           do (when page
368                (unless (gethash page hash)
369                  (setf (gethash page hash)
370                        (incf index)))))
371     (let ((array (make-array (1+ index))))
372       (maphash #'(lambda (key value)
373                    (setf (aref array value) key))
374                hash)
375       (setf (aref array 0)
376             (make-array (ash 1 *page-size-exponent*) :initial-element nil))
377       (with-open-file (stream (make-pathname :name "ucd"
378                                              :type "dat"
379                                              :defaults *output-directory*)
380                               :direction :output
381                               :element-type '(unsigned-byte 8)
382                               :if-exists :supersede
383                               :if-does-not-exist :create)
384         (loop for (gc-index bidi-index ccc-index decimal-digit digit
385                             bidi-mirrored nil decomposition-info)
386               across *misc-table*
387               do (write-byte gc-index stream)
388               do (write-byte bidi-index stream)
389               do (write-byte ccc-index stream)
390               do (write-byte (digit-to-byte decimal-digit) stream)
391               do (write-byte (digit-to-byte digit) stream)
392               do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
393               do (write-byte decomposition-info stream)
394               do (write-byte 0 stream))
395         (loop for page across *ucd-base*
396            do (write-byte (if page (gethash page hash) 0) stream))
397         (loop for page across array
398            do (loop for entry across page
399                  do (write-4-byte
400                      (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
401                           (byte 11 21)
402                           (if entry (ucd-transform entry) 0))
403                      stream)
404                    #+nil #+nil
405                  do (write-byte (if entry
406                                     (aref *misc-mapping* (ucd-misc entry))
407                                     255)
408                                 stream)
409                    #+nil #+nil
410                  do (write-3-byte (if entry (ucd-transform entry) 0)
411                                   stream))))))
412   (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
413                                     :defaults *output-directory*)
414                      :direction :output
415                      :if-exists :supersede
416                      :if-does-not-exist :create)
417     (with-standard-io-syntax
418       (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
419       (maphash (lambda (code name)
420                  (when name
421                   (print code f)
422                   (prin1 name f)))
423                *unicode-names*))
424     (setf *unicode-names* nil))
425   (with-open-file (*standard-output*
426                    (make-pathname :name "numerics"
427                                   :type "lisp-expr"
428                                   :defaults *output-directory*)
429                    :direction :output
430                    :if-exists :supersede
431                    :if-does-not-exist :create)
432     (with-standard-io-syntax
433       (let ((*print-pretty* t))
434         (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
435                        *different-numerics*)))))
436   (with-open-file (*standard-output*
437                    (make-pathname :name "titlecases"
438                                   :type "lisp-expr"
439                                   :defaults *output-directory*)
440                    :direction :output
441                    :if-exists :supersede
442                    :if-does-not-exist :create)
443     (with-standard-io-syntax
444       (let ((*print-pretty* t))
445         (prin1 *different-titlecases*))))
446   (with-open-file (*standard-output*
447                    (make-pathname :name "misc"
448                                   :type "lisp-expr"
449                                   :defaults *output-directory*)
450                    :direction :output
451                    :if-exists :supersede
452                    :if-does-not-exist :create)
453     (with-standard-io-syntax
454       (let ((*print-pretty* t))
455         (prin1 `(:length ,(length *misc-table*)
456                  :uppercase ,(loop for (gc-index) across *misc-table*
457                                 for i from 0
458                                 when (= gc-index 0)
459                                 collect i)
460                  :lowercase ,(loop for (gc-index) across *misc-table*
461                                 for i from 0
462                                 when (= gc-index 1)
463                                 collect i)
464                  :titlecase ,(loop for (gc-index) across *misc-table*
465                                 for i from 0
466                                 when (= gc-index 2)
467                                 collect i))))))
468   (values))
469
470 ;;; Use of the generated files
471
472 (defparameter *compiled-ucd* nil)
473
474 (defun read-compiled-ucd ()
475   (with-open-file (stream (make-pathname :name "ucd"
476                                          :type "dat"
477                                          :defaults *output-directory*)
478                           :direction :input
479                           :element-type '(unsigned-byte 8))
480     (let ((length (file-length stream)))
481       (setq *compiled-ucd*
482             (make-array length :element-type '(unsigned-byte 8)))
483       (read-sequence *compiled-ucd* stream)))
484   (values))
485
486 ;;; The stuff below is dependent on misc.lisp-expr being
487 ;;;
488 ;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17))
489 ;;;
490 ;;; There are two groups of entries for UPPERCASE and LOWERCASE
491 ;;; because some characters have case (by Unicode standards) but are
492 ;;; not transformable character-by-character in a locale-independent
493 ;;; way (as CL requires for its standard operators).
494 ;;;
495 ;;; for more details on these debugging functions, see the description
496 ;;; of the character database format in src/code/target-char.lisp
497
498 (defparameter *length* 395)
499
500 (defun cp-index (cp)
501   (let* ((cp-high (cp-high cp))
502          (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
503     (+ (* 8 *length*)
504        (ash #x110000 (- *page-size-exponent*))
505        (* (ash 4 *page-size-exponent*) page)
506        (* 4 (cp-low cp)))))
507
508 (defun cp-value-0 (cp)
509   (let ((index (cp-index cp)))
510     (dpb (aref *compiled-ucd* index)
511          (byte 8 3)
512          (ldb (byte 3 5) (aref *compiled-ucd* (1+ index))))))
513
514 (defun cp-value-1 (cp)
515   (let ((index (cp-index cp)))
516     (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16)
517          (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
518               (aref *compiled-ucd* (+ index 3))))))
519
520 (defun cp-general-category (cp)
521   (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
522
523 (defun cp-decimal-digit (cp)
524   (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
525     (and (< decimal-digit 10)
526          decimal-digit)))
527
528 (defun cp-alpha-char-p (cp)
529   (< (cp-general-category cp) 5))
530
531 (defun cp-alphanumericp (cp)
532   (let ((gc (cp-general-category cp)))
533     (or (< gc 5)
534         (= gc 12))))
535
536 (defun cp-digit-char-p (cp &optional (radix 10))
537   (let ((number (or (cp-decimal-digit cp)
538                     (and (<= 65 cp 90)
539                          (- cp 55))
540                     (and (<= 97 cp 122)
541                          (- cp 87)))))
542     (when (and number (< number radix))
543       number)))
544
545 (defun cp-graphic-char-p (cp)
546   (or (<= 32 cp 127)
547       (<= 160 cp)))
548
549 (defun cp-char-upcase (cp)
550   (if (< 3 (cp-value-0 cp) 8)
551       (cp-value-1 cp)
552       cp))
553
554 (defun cp-char-downcase (cp)
555   (if (< (cp-value-0 cp) 4)
556       (cp-value-1 cp)
557       cp))
558
559 (defun cp-upper-case-p (cp)
560   (< (cp-value-0 cp) 4))
561
562 (defun cp-lower-case-p (cp)
563   (< 3 (cp-value-0 cp) 8))
564
565 (defun cp-both-case-p (cp)
566   (< (cp-value-0 cp) 8))