b1cae492216d061b327bb402bbbae9dfbeaf1ab0
[sbcl.git] / tools-for-build / ucd.lisp
1 (in-package "SB-COLD")
2
3 ;;; Common
4
5 (defparameter *output-directory*
6   (merge-pathnames
7    (make-pathname :directory '(:relative :up "output"))
8    (make-pathname :directory (pathname-directory *load-truename*))))
9
10 (defparameter *page-size-exponent* 8)
11
12 (defun cp-high (cp)
13   (ash cp (- *page-size-exponent*)))
14
15 (defun cp-low (cp)
16   (ldb (byte *page-size-exponent* 0) cp))
17
18 ;;; Generator
19
20 (defstruct ucd misc transform)
21
22 (defparameter *unicode-character-database*
23   (make-pathname :directory (pathname-directory *load-truename*)))
24
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
27
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
42
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44                   bidi-mirrored cl-both-case-p)
45   (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46                      bidi-mirrored cl-both-case-p))
47          (index (gethash list *misc-hash*)))
48     (or index
49         (progn
50           (vector-push list *misc-table*)
51           (setf (gethash list *misc-hash*)
52                 (incf *misc-index*))))))
53
54 (defun compare-misc-entry (left right)
55   (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56                        left-decimal-digit left-digit left-bidi-mirrored
57                        left-cl-both-case-p)
58       left
59     (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60                          right-decimal-digit right-digit right-bidi-mirrored
61                          right-cl-both-case-p)
62         right
63       (or (and left-cl-both-case-p (not right-cl-both-case-p))
64           (and (or left-cl-both-case-p (not right-cl-both-case-p))
65                (or (< left-gc-index right-gc-index)
66                    (and (= left-gc-index right-gc-index)
67                         (or (< left-bidi-index right-bidi-index)
68                             (and (= left-bidi-index right-bidi-index)
69                                  (or (< left-ccc-index right-ccc-index)
70                                      (and (= left-ccc-index right-ccc-index)
71                                           (or (string< left-decimal-digit
72                                                        right-decimal-digit)
73                                               (and (string= left-decimal-digit
74                                                             right-decimal-digit)
75                                                    (or (string< left-digit right-digit)
76                                                        (and (string= left-digit
77                                                                      right-digit)
78                                                             (string< left-bidi-mirrored
79                                                                      right-bidi-mirrored))))))))))))))))
80
81 (defun build-misc-table ()
82   (sort *misc-table* #'compare-misc-entry)
83   (setq *misc-mapping* (make-array (1+ *misc-index*)))
84   (loop for i from 0 to *misc-index*
85         do (setf (aref *misc-mapping*
86                        (gethash (aref *misc-table* i) *misc-hash*))
87                  i)))
88
89 (defun slurp-ucd ()
90   (setq *last-uppercase* nil)
91   (setq *uppercase-transition-count* 0)
92   (setq *different-titlecases* nil)
93   (setq *different-numerics* nil)
94   (setq *name-size* 0)
95   (setq *misc-hash* (make-hash-table :test #'equal))
96   (setq *misc-index* -1)
97   (setq *misc-table* (make-array 256 :fill-pointer 0))
98   (setq *both-cases* nil)
99   (setq *decompositions* 0)
100   (setq *decomposition-types* (make-hash-table :test #'equal))
101   (setq *decomposition-length-max* 0)
102   (setq *decomposition-base* (make-array (ash #x110000
103                                               (- *page-size-exponent*))
104                                          :initial-element nil))
105   (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
106                                :initial-element nil))
107   (with-open-file (*standard-input*
108                    (make-pathname :name "UnicodeData"
109                                   :type "txt"
110                                   :defaults *unicode-character-database*)
111                    :direction :input)
112     (loop for line = (read-line nil nil)
113           while line
114           do (slurp-ucd-line line)))
115   (second-pass)
116   (build-misc-table)
117   (fixup-hangul-syllables)
118   *decompositions*)
119
120 (defun fixup-hangul-syllables ()
121   ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
122   (let* ((sbase #xac00)
123          (lbase #x1100)
124          (vbase #x1161)
125          (tbase #x11a7)
126          (scount 11172)
127          (lcount 19)
128          (vcount 21)
129          (tcount 28)
130          (ncount (* vcount tcount))
131          (table (make-hash-table)))
132     (with-open-file (*standard-input*
133                      (make-pathname :name "Jamo" :type "txt"
134                                     :defaults *unicode-character-database*))
135       (loop for line = (read-line nil nil)
136             while line
137             if (position #\; line)
138             do (add-jamo-information line table)))
139     (dotimes (sindex scount)
140       (let* ((l (+ lbase (floor sindex ncount)))
141              (v (+ vbase (floor (mod sindex ncount) tcount)))
142              (tee (+ tbase (mod sindex tcount)))
143              (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
144                            (gethash l table) (gethash v table)
145                            (= tee tbase) (gethash tee table))))
146         (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
147
148 (defun add-jamo-information (line table)
149   (let* ((split (split-string line #\;))
150          (code (parse-integer (first split) :radix 16))
151          (syllable (string-trim '(#\Space)
152                                 (subseq (second split) 0 (position #\# (second split))))))
153     (setf (gethash code table) syllable)))
154
155 (defun split-string (line character)
156   (loop for prev-position = 0 then (1+ position)
157         for position = (position character line :start prev-position)
158         collect (subseq line prev-position position)
159         do (unless position
160              (loop-finish))))
161
162 (defun init-indices (strings)
163   (let ((hash (make-hash-table :test #'equal)))
164     (loop for string in strings
165           for index from 0
166           do (setf (gethash string hash) index))
167     hash))
168
169 (defparameter *general-categories*
170   (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
171                   "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
172                   "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
173 (defparameter *bidi-classes*
174   (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
175                   "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
176
177
178 (defparameter *block-first* nil)
179
180 (defun normalize-character-name (name)
181   (when (find #\_ name)
182     (error "Bad name for a character: ~A" name))
183   (unless (or (zerop (length name)) (find #\< name) (find #\> name))
184     (substitute #\_ #\Space name)))
185
186 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
187 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
188 ;;;   D800  --  F8FF  : surrogates and private use
189 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
190 ;;;  F0000  --  FFFFD : private use
191 ;;; 100000  --  10FFFD: private use
192 (defun encode-ucd-line (line code-point)
193   (destructuring-bind (name general-category canonical-combining-class
194                             bidi-class decomposition-type-and-mapping
195                             decimal-digit digit numeric bidi-mirrored
196                             unicode-1-name iso-10646-comment simple-uppercase
197                             simple-lowercase simple-titlecase)
198       line
199     (declare (ignore unicode-1-name iso-10646-comment))
200     (if (and (> (length name) 8)
201              (string= ", First>" name :start2 (- (length name) 8)))
202         (progn
203           (setq *block-first* code-point)
204           nil)
205         (let* ((gc-index (or (gethash general-category *general-categories*)
206                              (error "unknown general category ~A"
207                                     general-category)))
208                (bidi-index (or (gethash bidi-class *bidi-classes*)
209                                (error "unknown bidirectional class ~A"
210                                       bidi-class)))
211                (ccc-index (parse-integer canonical-combining-class))
212                (digit-index (unless (string= "" decimal-digit)
213                               (parse-integer decimal-digit)))
214                (upper-index (unless (string= "" simple-uppercase)
215                               (parse-integer simple-uppercase :radix 16)))
216                (lower-index (unless (string= "" simple-lowercase)
217                               (parse-integer simple-lowercase :radix 16)))
218                (title-index (unless (string= "" simple-titlecase)
219                               (parse-integer simple-titlecase :radix 16)))
220                (cl-both-case-p
221                 (not (null (or (and (= gc-index 0) lower-index)
222                                (and (= gc-index 1) upper-index)))))
223                (misc-index (hash-misc gc-index bidi-index ccc-index
224                                       decimal-digit digit bidi-mirrored
225                                       cl-both-case-p)))
226           (declare (ignore digit-index))
227           (when (and (not cl-both-case-p)
228                      (< gc-index 2))
229             (format t "~A~%" name))
230           (incf *name-size* (length name))
231           (when (string/= "" decomposition-type-and-mapping)
232             (let ((split (split-string decomposition-type-and-mapping
233                                        #\Space)))
234               (when (char= #\< (aref (first split) 0))
235                 (setf (gethash (pop split) *decomposition-types*) t))
236               (unless (aref *decomposition-base* (cp-high code-point))
237                 (setf (aref *decomposition-base* (cp-high code-point))
238                       (make-array (ash 1 *page-size-exponent*)
239                                   :initial-element nil)))
240               (setf (aref (aref *decomposition-base* (cp-high code-point))
241                           (cp-low code-point))
242                     (mapcar #'(lambda (string)
243                                 (parse-integer string :radix 16))
244                             split))
245               (setq *decomposition-length-max*
246                     (max *decomposition-length-max* (length split)))
247               (incf *decompositions* (length split))))
248           (when (and (string/= "" simple-uppercase)
249                      (string/= "" simple-lowercase))
250             (push (list code-point upper-index lower-index) *both-cases*))
251           (when (string/= simple-uppercase simple-titlecase)
252             (push (cons code-point title-index) *different-titlecases*))
253           (when (string/= digit numeric)
254             (push (cons code-point numeric) *different-numerics*))
255           (cond
256             ((= gc-index 8)
257              (unless *last-uppercase*
258                (incf *uppercase-transition-count*))
259              (setq *last-uppercase* t))
260             (t
261              (when *last-uppercase*
262                (incf *uppercase-transition-count*))
263              (setq *last-uppercase* nil)))
264           (when (> ccc-index 255)
265             (error "canonical combining class too large ~A" ccc-index))
266           (let ((result (make-ucd :misc misc-index
267                                   :transform (or upper-index lower-index 0))))
268             (when (and (> (length name) 7)
269                        (string= ", Last>" name :start2 (- (length name) 7)))
270               (let ((page-start (ash (+ *block-first*
271                                         (ash 1 *page-size-exponent*)
272                                         -1)
273                                      (- *page-size-exponent*)))
274                     (page-end (ash code-point (- *page-size-exponent*))))
275                 (loop for point from *block-first*
276                       below (ash page-start *page-size-exponent*)
277                       do (setf (aref (aref *ucd-base* (cp-high point))
278                                      (cp-low point))
279                                result))
280                 (loop for page from page-start below page-end
281                       do (setf (aref *ucd-base* page)
282                                (make-array (ash 1 *page-size-exponent*)
283                                            :initial-element result)))
284                 (loop for point from (ash page-end *page-size-exponent*)
285                       below code-point
286                       do (setf (aref (aref *ucd-base* (cp-high point))
287                                      (cp-low point))
288                                result))))
289             (values result (normalize-character-name name)))))))
290
291 (defun slurp-ucd-line (line)
292   (let* ((split-line (split-string line #\;))
293          (code-point (parse-integer (first split-line) :radix 16))
294          (code-high (ash code-point (- *page-size-exponent*)))
295          (code-low (ldb (byte *page-size-exponent* 0) code-point)))
296     (unless (aref *ucd-base* code-high)
297       (setf (aref *ucd-base* code-high)
298             (make-array (ash 1 *page-size-exponent*)
299                         :initial-element nil)))
300     (multiple-value-bind (encoding name)
301         (encode-ucd-line (cdr split-line) code-point)
302       (setf (aref (aref *ucd-base* code-high) code-low) encoding
303             (gethash code-point *unicode-names*) name))))
304
305 (defun second-pass ()
306   (loop for i from 0 below (length *ucd-base*)
307         when (aref *ucd-base* i)
308         do (loop for j from 0 below (length (aref *ucd-base* i))
309                  for result = (aref (aref *ucd-base* i) j)
310                  when result
311                  when (let* ((transform-point (ucd-transform result))
312                              (transform-high (ash transform-point
313                                                   (- *page-size-exponent*)))
314                              (transform-low (ldb (byte *page-size-exponent* 0)
315                                                  transform-point)))
316                         (and (plusp transform-point)
317                              (/= (ucd-transform
318                                   (aref (aref *ucd-base* transform-high)
319                                         transform-low))
320                                  (+ (ash i *page-size-exponent*) j))))
321                  do (destructuring-bind (gc-index bidi-index ccc-index
322                                          decimal-digit digit bidi-mirrored
323                                          cl-both-case-p)
324                         (aref *misc-table* (ucd-misc result))
325                       (declare (ignore cl-both-case-p))
326                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
327                       (setf (ucd-misc result)
328                             (hash-misc gc-index bidi-index ccc-index
329                                        decimal-digit digit bidi-mirrored
330                                        nil))))))
331
332 (defun write-3-byte (triplet stream)
333   (write-byte (ldb (byte 8 0) triplet) stream)
334   (write-byte (ldb (byte 8 8) triplet) stream)
335   (write-byte (ldb (byte 8 16) triplet) stream))
336
337 (defun digit-to-byte (digit)
338   (if (string= "" digit)
339       255
340       (parse-integer digit)))
341
342 (defun output ()
343   (let ((hash (make-hash-table :test #'equalp))
344         (index 0))
345     (loop for page across *ucd-base*
346           do (when page
347                (unless (gethash page hash)
348                  (setf (gethash page hash)
349                        (incf index)))))
350     (let ((array (make-array (1+ index))))
351       (maphash #'(lambda (key value)
352                    (setf (aref array value) key))
353                hash)
354       (setf (aref array 0)
355             (make-array (ash 1 *page-size-exponent*) :initial-element nil))
356       (with-open-file (stream (make-pathname :name "ucd"
357                                              :type "dat"
358                                              :defaults *output-directory*)
359                               :direction :output
360                               :element-type '(unsigned-byte 8)
361                               :if-exists :supersede
362                               :if-does-not-exist :create)
363         (loop for (gc-index bidi-index ccc-index decimal-digit digit
364                             bidi-mirrored)
365               across *misc-table*
366               do (write-byte gc-index stream)
367               do (write-byte bidi-index stream)
368               do (write-byte ccc-index stream)
369               do (write-byte (digit-to-byte decimal-digit) stream)
370               do (write-byte (digit-to-byte digit) stream)
371               do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
372               do (write-byte 0 stream)
373               do (write-byte 0 stream))
374         (loop for page across *ucd-base*
375            do (write-byte (if page (gethash page hash) 0) stream))
376         (loop for page across array
377            do (loop for entry across page
378                  do (write-byte (if entry
379                                     (aref *misc-mapping* (ucd-misc entry))
380                                     255)
381                                 stream)
382                  do (write-3-byte (if entry (ucd-transform entry) 0)
383                                   stream))))))
384   (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
385                                     :defaults *output-directory*)
386                      :direction :output
387                      :if-exists :supersede
388                      :if-does-not-exist :create)
389     (with-standard-io-syntax
390       (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
391       (maphash (lambda (code name)
392                  (when name
393                   (print code f)
394                   (prin1 name f)))
395                *unicode-names*))
396     (setf *unicode-names* nil))
397   (with-open-file (*standard-output*
398                    (make-pathname :name "numerics"
399                                   :type "lisp-expr"
400                                   :defaults *output-directory*)
401                    :direction :output
402                    :if-exists :supersede
403                    :if-does-not-exist :create)
404     (with-standard-io-syntax
405       (let ((*print-pretty* t))
406         (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
407                        *different-numerics*)))))
408   (with-open-file (*standard-output*
409                    (make-pathname :name "titlecases"
410                                   :type "lisp-expr"
411                                   :defaults *output-directory*)
412                    :direction :output
413                    :if-exists :supersede
414                    :if-does-not-exist :create)
415     (with-standard-io-syntax
416       (let ((*print-pretty* t))
417         (prin1 *different-titlecases*))))
418   (with-open-file (*standard-output*
419                    (make-pathname :name "misc"
420                                   :type "lisp-expr"
421                                   :defaults *output-directory*)
422                    :direction :output
423                    :if-exists :supersede
424                    :if-does-not-exist :create)
425     (with-standard-io-syntax
426       (let ((*print-pretty* t))
427         (prin1 `(:length ,(length *misc-table*)
428                  :uppercase ,(loop for (gc-index) across *misc-table*
429                                 for i from 0
430                                 when (= gc-index 0)
431                                 collect i)
432                  :lowercase ,(loop for (gc-index) across *misc-table*
433                                 for i from 0
434                                 when (= gc-index 1)
435                                 collect i)
436                  :titlecase ,(loop for (gc-index) across *misc-table*
437                                 for i from 0
438                                 when (= gc-index 2)
439                                 collect i))))))
440   (values))
441
442 ;;; Use of the generated files
443
444 (defparameter *compiled-ucd* nil)
445
446 (defun read-compiled-ucd ()
447   (with-open-file (stream (make-pathname :name "ucd"
448                                          :type "dat"
449                                          :defaults *output-directory*)
450                           :direction :input
451                           :element-type '(unsigned-byte 8))
452     (let ((length (file-length stream)))
453       (setq *compiled-ucd*
454             (make-array length :element-type '(unsigned-byte 8)))
455       (read-sequence *compiled-ucd* stream)))
456   (values))
457
458 ;;; The stuff below is dependent on misc.lisp-expr being
459 ;;; (:LENGTH 215 :UPPERCASE (0 2) :LOWERCASE (1 3) :TITLECASE (4)).
460 ;;;
461 ;;; There are two entries for UPPERCASE and LOWERCASE because some
462 ;;; characters have case (by Unicode standards) but are not
463 ;;; transformable character-by-character in a locale-independent way
464 ;;; (as CL requires for its standard operators).
465 ;;;
466 ;;; for more details on these debugging functions, see the description
467 ;;; of the character database format in src/code/target-char.lisp
468
469 (defparameter *length* 215)
470
471 (defun cp-index (cp)
472   (let* ((cp-high (cp-high cp))
473          (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
474     (+ (* 8 *length*)
475        (ash #x110000 (- *page-size-exponent*))
476        (* (ash 4 *page-size-exponent*) page)
477        (* 4 (cp-low cp)))))
478
479 (defun cp-value-0 (cp)
480   (aref *compiled-ucd* (cp-index cp)))
481
482 (defun cp-value-1 (cp)
483   (let ((index (cp-index cp)))
484     (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
485          (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
486               (aref *compiled-ucd* (1+ index))))))
487
488 (defun cp-general-category (cp)
489   (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
490
491 (defun cp-decimal-digit (cp)
492   (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
493     (and (< decimal-digit 10)
494          decimal-digit)))
495
496 (defun cp-alpha-char-p (cp)
497   (< (cp-general-category cp) 5))
498
499 (defun cp-alphanumericp (cp)
500   (let ((gc (cp-general-category cp)))
501     (or (< gc 5)
502         (= gc 12))))
503
504 (defun cp-digit-char-p (cp &optional (radix 10))
505   (let ((number (or (cp-decimal-digit cp)
506                     (and (<= 65 cp 90)
507                          (- cp 55))
508                     (and (<= 97 cp 122)
509                          (- cp 87)))))
510     (when (and number (< number radix))
511       number)))
512
513 (defun cp-graphic-char-p (cp)
514   (or (<= 32 cp 127)
515       (<= 160 cp)))
516
517 (defun cp-char-upcase (cp)
518   (if (= (cp-value-0 cp) 1)
519       (cp-value-1 cp)
520       cp))
521
522 (defun cp-char-downcase (cp)
523   (if (= (cp-value-0 cp) 0)
524       (cp-value-1 cp)
525       cp))
526
527 (defun cp-upper-case-p (cp)
528   (= (cp-value-0 cp) 0))
529
530 (defun cp-lower-case-p (cp)
531   (= (cp-value-0 cp) 1))
532
533 (defun cp-both-case-p (cp)
534   (< (cp-value-0 cp) 2))