delete now-unused code from ucd.dat
[sbcl.git] / tools-for-build / ucd.lisp
1 (in-package "SB-COLD")
2
3 ;;; Common
4
5 (defparameter *output-directory*
6   (merge-pathnames
7    (make-pathname :directory '(:relative :up "output"))
8    (make-pathname :directory (pathname-directory *load-truename*))))
9
10 (defparameter *page-size-exponent* 8)
11
12 (defun cp-high (cp)
13   (ash cp (- *page-size-exponent*)))
14
15 (defun cp-low (cp)
16   (ldb (byte *page-size-exponent* 0) cp))
17
18 ;;; Generator
19
20 (defstruct ucd misc transform)
21
22 (defparameter *unicode-character-database*
23   (make-pathname :directory (pathname-directory *load-truename*)))
24
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
27
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
42
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44                   bidi-mirrored cl-both-case-p decomposition-info)
45   (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46                      bidi-mirrored cl-both-case-p decomposition-info))
47          (index (gethash list *misc-hash*)))
48     (or index
49         (progn
50           (vector-push list *misc-table*)
51           (setf (gethash list *misc-hash*)
52                 (incf *misc-index*))))))
53
54 (defun compare-misc-entry (left right)
55   (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56                        left-decimal-digit left-digit left-bidi-mirrored
57                        left-cl-both-case-p left-decomposition-info)
58       left
59     (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60                          right-decimal-digit right-digit right-bidi-mirrored
61                          right-cl-both-case-p right-decomposition-info)
62         right
63       (or (and left-cl-both-case-p (not right-cl-both-case-p))
64           (and (or left-cl-both-case-p (not right-cl-both-case-p))
65                (or (< left-gc-index right-gc-index)
66                    (and (= left-gc-index right-gc-index)
67                         (or (< left-decomposition-info right-decomposition-info)
68                             (and (= left-decomposition-info right-decomposition-info)
69                                  (or (< left-bidi-index right-bidi-index)
70                                      (and (= left-bidi-index right-bidi-index)
71                                           (or (< left-ccc-index right-ccc-index)
72                                               (and (= left-ccc-index right-ccc-index)
73                                                    (or (string< left-decimal-digit
74                                                                 right-decimal-digit)
75                                                        (and (string= left-decimal-digit
76                                                                      right-decimal-digit)
77                                                             (or (string< left-digit right-digit)
78                                                                 (and (string= left-digit
79                                                                               right-digit)
80                                                                      (string< left-bidi-mirrored
81                                                                               right-bidi-mirrored))))))))))))))))))
82
83 (defun build-misc-table ()
84   (sort *misc-table* #'compare-misc-entry)
85   (setq *misc-mapping* (make-array (1+ *misc-index*)))
86   (loop for i from 0 to *misc-index*
87      do (setf (aref *misc-mapping*
88                     (gethash (aref *misc-table* i) *misc-hash*))
89               i)))
90
91 (defun slurp-ucd ()
92   (setq *last-uppercase* nil)
93   (setq *uppercase-transition-count* 0)
94   (setq *different-titlecases* nil)
95   (setq *different-numerics* nil)
96   (setq *name-size* 0)
97   (setq *misc-hash* (make-hash-table :test #'equal))
98   (setq *misc-index* -1)
99   (setq *misc-table* (make-array 2048 :fill-pointer 0))
100   (setq *both-cases* nil)
101   (setq *decompositions* 0)
102   (setq *decomposition-types*
103         (let ((array (make-array 256 :initial-element nil :fill-pointer 1)))
104           (vector-push "" array)
105           (vector-push "<compat>" array)
106           array))
107   (setq *decomposition-length-max* 0)
108   (setq *decomposition-base* (make-array (ash #x110000
109                                               (- *page-size-exponent*))
110                                          :initial-element nil))
111   (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
112                                :initial-element nil))
113   (with-open-file (*standard-input*
114                    (make-pathname :name "UnicodeData"
115                                   :type "txt"
116                                   :defaults *unicode-character-database*)
117                    :direction :input)
118     (loop for line = (read-line nil nil)
119           while line
120           do (slurp-ucd-line line)))
121   (second-pass)
122   (build-misc-table)
123   (fixup-hangul-syllables)
124   *decompositions*)
125
126 (defun fixup-hangul-syllables ()
127   ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
128   (let* ((sbase #xac00)
129          (lbase #x1100)
130          (vbase #x1161)
131          (tbase #x11a7)
132          (scount 11172)
133          (lcount 19)
134          (vcount 21)
135          (tcount 28)
136          (ncount (* vcount tcount))
137          (table (make-hash-table)))
138     (with-open-file (*standard-input*
139                      (make-pathname :name "Jamo" :type "txt"
140                                     :defaults *unicode-character-database*))
141       (loop for line = (read-line nil nil)
142             while line
143             if (position #\; line)
144             do (add-jamo-information line table)))
145     (dotimes (sindex scount)
146       (let* ((l (+ lbase (floor sindex ncount)))
147              (v (+ vbase (floor (mod sindex ncount) tcount)))
148              (tee (+ tbase (mod sindex tcount)))
149              (name (format nil "HANGUL_SYLLABLE_~A~A~:[~A~;~]"
150                            (gethash l table) (gethash v table)
151                            (= tee tbase) (gethash tee table))))
152         (setf (gethash (+ sbase sindex) *unicode-names*) name)))))
153
154 (defun add-jamo-information (line table)
155   (let* ((split (split-string line #\;))
156          (code (parse-integer (first split) :radix 16))
157          (syllable (string-trim '(#\Space)
158                                 (subseq (second split) 0 (position #\# (second split))))))
159     (setf (gethash code table) syllable)))
160
161 (defun split-string (line character)
162   (loop for prev-position = 0 then (1+ position)
163         for position = (position character line :start prev-position)
164         collect (subseq line prev-position position)
165         do (unless position
166              (loop-finish))))
167
168 (defun init-indices (strings)
169   (let ((hash (make-hash-table :test #'equal)))
170     (loop for string in strings
171           for index from 0
172           do (setf (gethash string hash) index))
173     hash))
174
175 (defparameter *general-categories*
176   (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
177                   "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
178                   "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
179 (defparameter *bidi-classes*
180   (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
181                   "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
182
183
184 (defparameter *block-first* nil)
185
186 (defun normalize-character-name (name)
187   (when (find #\_ name)
188     (error "Bad name for a character: ~A" name))
189   (unless (or (zerop (length name)) (find #\< name) (find #\> name))
190     (substitute #\_ #\Space name)))
191
192 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
193 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
194 ;;;   D800  --  F8FF  : surrogates and private use
195 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
196 ;;;  F0000  --  FFFFD : private use
197 ;;; 100000  --  10FFFD: private use
198 (defun encode-ucd-line (line code-point)
199   (destructuring-bind (name general-category canonical-combining-class
200                             bidi-class decomposition-type-and-mapping
201                             decimal-digit digit numeric bidi-mirrored
202                             unicode-1-name iso-10646-comment simple-uppercase
203                             simple-lowercase simple-titlecase)
204       line
205     (declare (ignore unicode-1-name iso-10646-comment))
206     (if (and (> (length name) 8)
207              (string= ", First>" name :start2 (- (length name) 8)))
208         (progn
209           (setq *block-first* code-point)
210           nil)
211         (let* ((gc-index (or (gethash general-category *general-categories*)
212                              (error "unknown general category ~A"
213                                     general-category)))
214                (bidi-index (or (gethash bidi-class *bidi-classes*)
215                                (error "unknown bidirectional class ~A"
216                                       bidi-class)))
217                (ccc-index (parse-integer canonical-combining-class))
218                (digit-index (unless (string= "" decimal-digit)
219                               (parse-integer decimal-digit)))
220                (upper-index (unless (string= "" simple-uppercase)
221                               (parse-integer simple-uppercase :radix 16)))
222                (lower-index (unless (string= "" simple-lowercase)
223                               (parse-integer simple-lowercase :radix 16)))
224                (title-index (unless (string= "" simple-titlecase)
225                               (parse-integer simple-titlecase :radix 16)))
226                (cl-both-case-p
227                 (not (null (or (and (= gc-index 0) lower-index)
228                                (and (= gc-index 1) upper-index)))))
229                (decomposition-info 0))
230           (declare (ignore digit-index))
231           (when (and (not cl-both-case-p)
232                      (< gc-index 2))
233             (format t "~A~%" name))
234           (incf *name-size* (length name))
235           (when (string/= "" decomposition-type-and-mapping)
236             (let ((split (split-string decomposition-type-and-mapping #\Space)))
237               (cond
238                 ((char= #\< (aref (first split) 0))
239                  (unless (position (first split) *decomposition-types*
240                                    :test #'equal)
241                    (vector-push (first split) *decomposition-types*))
242                  (setf decomposition-info (position (pop split) *decomposition-types* :test #'equal)))
243                 (t (setf decomposition-info 1)))
244               (unless (aref *decomposition-base* (cp-high code-point))
245                 (setf (aref *decomposition-base* (cp-high code-point))
246                       (make-array (ash 1 *page-size-exponent*)
247                                   :initial-element nil)))
248               (setf (aref (aref *decomposition-base* (cp-high code-point))
249                           (cp-low code-point))
250                     (mapcar #'(lambda (string)
251                                 (parse-integer string :radix 16))
252                             split))
253               (setq *decomposition-length-max*
254                     (max *decomposition-length-max* (length split)))
255               (incf *decompositions* (length split))))
256           (when (and (string/= "" simple-uppercase)
257                      (string/= "" simple-lowercase))
258             (push (list code-point upper-index lower-index) *both-cases*))
259           (when (string/= simple-uppercase simple-titlecase)
260             (push (cons code-point title-index) *different-titlecases*))
261           (when (string/= digit numeric)
262             (push (cons code-point numeric) *different-numerics*))
263           (cond
264             ((= gc-index 8)
265              (unless *last-uppercase*
266                (incf *uppercase-transition-count*))
267              (setq *last-uppercase* t))
268             (t
269              (when *last-uppercase*
270                (incf *uppercase-transition-count*))
271              (setq *last-uppercase* nil)))
272           (when (> ccc-index 255)
273             (error "canonical combining class too large ~A" ccc-index))
274           (let* ((misc-index (hash-misc gc-index bidi-index ccc-index
275                                         decimal-digit digit bidi-mirrored
276                                         cl-both-case-p decomposition-info))
277                  (result (make-ucd :misc misc-index
278                                    :transform (or upper-index lower-index 0))))
279             (when (and (> (length name) 7)
280                        (string= ", Last>" name :start2 (- (length name) 7)))
281               (let ((page-start (ash (+ *block-first*
282                                         (ash 1 *page-size-exponent*)
283                                         -1)
284                                      (- *page-size-exponent*)))
285                     (page-end (ash code-point (- *page-size-exponent*))))
286                 (loop for point from *block-first*
287                       below (ash page-start *page-size-exponent*)
288                       do (setf (aref (aref *ucd-base* (cp-high point))
289                                      (cp-low point))
290                                result))
291                 (loop for page from page-start below page-end
292                       do (setf (aref *ucd-base* page)
293                                (make-array (ash 1 *page-size-exponent*)
294                                            :initial-element result)))
295                 (loop for point from (ash page-end *page-size-exponent*)
296                       below code-point
297                       do (setf (aref (aref *ucd-base* (cp-high point))
298                                      (cp-low point))
299                                result))))
300             (values result (normalize-character-name name)))))))
301
302 (defun slurp-ucd-line (line)
303   (let* ((split-line (split-string line #\;))
304          (code-point (parse-integer (first split-line) :radix 16))
305          (code-high (ash code-point (- *page-size-exponent*)))
306          (code-low (ldb (byte *page-size-exponent* 0) code-point)))
307     (unless (aref *ucd-base* code-high)
308       (setf (aref *ucd-base* code-high)
309             (make-array (ash 1 *page-size-exponent*)
310                         :initial-element nil)))
311     (multiple-value-bind (encoding name)
312         (encode-ucd-line (cdr split-line) code-point)
313       (setf (aref (aref *ucd-base* code-high) code-low) encoding
314             (gethash code-point *unicode-names*) name))))
315
316 ;;; this fixes up the case conversion discrepancy between CL and
317 ;;; Unicode: CL operators depend on char-downcase / char-upcase being
318 ;;; inverses, which is not true in general in Unicode even for
319 ;;; characters which change case to single characters.
320 (defun second-pass ()
321   (loop for i from 0 below (length *ucd-base*)
322         when (aref *ucd-base* i)
323         do (loop for j from 0 below (length (aref *ucd-base* i))
324                  for result = (aref (aref *ucd-base* i) j)
325                  when result
326                  when (let* ((transform-point (ucd-transform result))
327                              (transform-high (ash transform-point
328                                                   (- *page-size-exponent*)))
329                              (transform-low (ldb (byte *page-size-exponent* 0)
330                                                  transform-point)))
331                         (and (plusp transform-point)
332                              (/= (ucd-transform
333                                   (aref (aref *ucd-base* transform-high)
334                                         transform-low))
335                                  (+ (ash i *page-size-exponent*) j))))
336                  do (destructuring-bind (gc-index bidi-index ccc-index
337                                          decimal-digit digit bidi-mirrored
338                                          cl-both-case-p decomposition-info)
339                         (aref *misc-table* (ucd-misc result))
340                       (declare (ignore cl-both-case-p))
341                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
342                       (setf (ucd-misc result)
343                             (hash-misc gc-index bidi-index ccc-index
344                                        decimal-digit digit bidi-mirrored
345                                        nil decomposition-info))))))
346
347 (defun write-4-byte (quadruplet stream)
348   (write-byte (ldb (byte 8 24) quadruplet) stream)
349   (write-byte (ldb (byte 8 16) quadruplet) stream)
350   (write-byte (ldb (byte 8 8) quadruplet) stream)
351   (write-byte (ldb (byte 8 0) quadruplet) stream))
352
353 (defun digit-to-byte (digit)
354   (if (string= "" digit)
355       255
356       (parse-integer digit)))
357
358 (defun output ()
359   (let ((hash (make-hash-table :test #'equalp))
360         (index 0))
361     (loop for page across *ucd-base*
362           do (when page
363                (unless (gethash page hash)
364                  (setf (gethash page hash)
365                        (incf index)))))
366     (let ((array (make-array (1+ index))))
367       (maphash #'(lambda (key value)
368                    (setf (aref array value) key))
369                hash)
370       (setf (aref array 0)
371             (make-array (ash 1 *page-size-exponent*) :initial-element nil))
372       (with-open-file (stream (make-pathname :name "ucd"
373                                              :type "dat"
374                                              :defaults *output-directory*)
375                               :direction :output
376                               :element-type '(unsigned-byte 8)
377                               :if-exists :supersede
378                               :if-does-not-exist :create)
379         (loop for (gc-index bidi-index ccc-index decimal-digit digit
380                             bidi-mirrored nil decomposition-info)
381               across *misc-table*
382               do (write-byte gc-index stream)
383               do (write-byte bidi-index stream)
384               do (write-byte ccc-index stream)
385               do (write-byte (digit-to-byte decimal-digit) stream)
386               do (write-byte (digit-to-byte digit) stream)
387               do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
388               do (write-byte decomposition-info stream)
389               do (write-byte 0 stream))
390         (loop for page across *ucd-base*
391            do (write-byte (if page (gethash page hash) 0) stream))
392         (loop for page across array
393            do (loop for entry across page
394                  do (write-4-byte
395                      (dpb (if entry (aref *misc-mapping* (ucd-misc entry)) #x7ff)
396                           (byte 11 21)
397                           (if entry (ucd-transform entry) 0))
398                      stream))))))
399   (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
400                                     :defaults *output-directory*)
401                      :direction :output
402                      :if-exists :supersede
403                      :if-does-not-exist :create)
404     (with-standard-io-syntax
405       (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
406       (maphash (lambda (code name)
407                  (when name
408                   (print code f)
409                   (prin1 name f)))
410                *unicode-names*))
411     (setf *unicode-names* nil))
412   (with-open-file (*standard-output*
413                    (make-pathname :name "numerics"
414                                   :type "lisp-expr"
415                                   :defaults *output-directory*)
416                    :direction :output
417                    :if-exists :supersede
418                    :if-does-not-exist :create)
419     (with-standard-io-syntax
420       (let ((*print-pretty* t))
421         (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
422                        *different-numerics*)))))
423   (with-open-file (*standard-output*
424                    (make-pathname :name "titlecases"
425                                   :type "lisp-expr"
426                                   :defaults *output-directory*)
427                    :direction :output
428                    :if-exists :supersede
429                    :if-does-not-exist :create)
430     (with-standard-io-syntax
431       (let ((*print-pretty* t))
432         (prin1 *different-titlecases*))))
433   (with-open-file (*standard-output*
434                    (make-pathname :name "misc"
435                                   :type "lisp-expr"
436                                   :defaults *output-directory*)
437                    :direction :output
438                    :if-exists :supersede
439                    :if-does-not-exist :create)
440     (with-standard-io-syntax
441       (let ((*print-pretty* t))
442         (prin1 `(:length ,(length *misc-table*)
443                  :uppercase ,(loop for (gc-index) across *misc-table*
444                                 for i from 0
445                                 when (= gc-index 0)
446                                 collect i)
447                  :lowercase ,(loop for (gc-index) across *misc-table*
448                                 for i from 0
449                                 when (= gc-index 1)
450                                 collect i)
451                  :titlecase ,(loop for (gc-index) across *misc-table*
452                                 for i from 0
453                                 when (= gc-index 2)
454                                 collect i))))))
455   (values))
456
457 ;;; Use of the generated files
458
459 (defparameter *compiled-ucd* nil)
460
461 (defun read-compiled-ucd ()
462   (with-open-file (stream (make-pathname :name "ucd"
463                                          :type "dat"
464                                          :defaults *output-directory*)
465                           :direction :input
466                           :element-type '(unsigned-byte 8))
467     (let ((length (file-length stream)))
468       (setq *compiled-ucd*
469             (make-array length :element-type '(unsigned-byte 8)))
470       (read-sequence *compiled-ucd* stream)))
471   (values))
472
473 ;;; The stuff below is dependent on misc.lisp-expr being
474 ;;;
475 ;;; (:LENGTH 395 :UPPERCASE (0 1 2 3 8 9 10 11) :LOWERCASE (4 5 6 7 12 13 14 15) :TITLECASE (16 17))
476 ;;;
477 ;;; There are two groups of entries for UPPERCASE and LOWERCASE
478 ;;; because some characters have case (by Unicode standards) but are
479 ;;; not transformable character-by-character in a locale-independent
480 ;;; way (as CL requires for its standard operators).
481 ;;;
482 ;;; for more details on these debugging functions, see the description
483 ;;; of the character database format in src/code/target-char.lisp
484
485 (defparameter *length* 395)
486
487 (defun cp-index (cp)
488   (let* ((cp-high (cp-high cp))
489          (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
490     (+ (* 8 *length*)
491        (ash #x110000 (- *page-size-exponent*))
492        (* (ash 4 *page-size-exponent*) page)
493        (* 4 (cp-low cp)))))
494
495 (defun cp-value-0 (cp)
496   (let ((index (cp-index cp)))
497     (dpb (aref *compiled-ucd* index)
498          (byte 8 3)
499          (ldb (byte 3 5) (aref *compiled-ucd* (1+ index))))))
500
501 (defun cp-value-1 (cp)
502   (let ((index (cp-index cp)))
503     (dpb (aref *compiled-ucd* (1+ index)) (byte 5 16)
504          (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
505               (aref *compiled-ucd* (+ index 3))))))
506
507 (defun cp-general-category (cp)
508   (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
509
510 (defun cp-decimal-digit (cp)
511   (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
512     (and (< decimal-digit 10)
513          decimal-digit)))
514
515 (defun cp-alpha-char-p (cp)
516   (< (cp-general-category cp) 5))
517
518 (defun cp-alphanumericp (cp)
519   (let ((gc (cp-general-category cp)))
520     (or (< gc 5)
521         (= gc 12))))
522
523 (defun cp-digit-char-p (cp &optional (radix 10))
524   (let ((number (or (cp-decimal-digit cp)
525                     (and (<= 65 cp 90)
526                          (- cp 55))
527                     (and (<= 97 cp 122)
528                          (- cp 87)))))
529     (when (and number (< number radix))
530       number)))
531
532 (defun cp-graphic-char-p (cp)
533   (or (<= 32 cp 127)
534       (<= 160 cp)))
535
536 (defun cp-char-upcase (cp)
537   (if (< 3 (cp-value-0 cp) 8)
538       (cp-value-1 cp)
539       cp))
540
541 (defun cp-char-downcase (cp)
542   (if (< (cp-value-0 cp) 4)
543       (cp-value-1 cp)
544       cp))
545
546 (defun cp-upper-case-p (cp)
547   (< (cp-value-0 cp) 4))
548
549 (defun cp-lower-case-p (cp)
550   (< 3 (cp-value-0 cp) 8))
551
552 (defun cp-both-case-p (cp)
553   (< (cp-value-0 cp) 8))