0.8.16.8:
[sbcl.git] / tools-for-build / ucd.lisp
1 (in-package "SB-COLD")
2
3 ;;; Common
4
5 (defparameter *output-directory*
6   (merge-pathnames
7    (make-pathname :directory '(:relative :up "output"))
8    (make-pathname :directory (pathname-directory *load-pathname*))))
9
10 (defparameter *page-size-exponent* 8)
11
12 (defun cp-high (cp)
13   (ash cp (- *page-size-exponent*)))
14
15 (defun cp-low (cp)
16   (ldb (byte *page-size-exponent* 0) cp))
17
18 ;;; Generator
19
20 (defparameter *unicode-character-database*
21   (make-pathname :directory (pathname-directory *load-pathname*)))
22
23 (defparameter *ucd-base* nil)
24
25 (defparameter *last-uppercase* nil)
26 (defparameter *uppercase-transition-count* 0)
27 (defparameter *different-titlecases* nil)
28 (defparameter *different-numerics* nil)
29 (defparameter *name-size* 0)
30 (defparameter *misc-hash* (make-hash-table :test #'equal))
31 (defparameter *misc-index* -1)
32 (defparameter *misc-table* nil)
33 (defparameter *misc-mapping* nil)
34 (defparameter *both-cases* nil)
35 (defparameter *decompositions* nil)
36 (defparameter *decomposition-length-max* nil)
37 (defparameter *decomposition-types* nil)
38 (defparameter *decomposition-base* nil)
39
40 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
41                   bidi-mirrored cl-both-case-p)
42   (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
43                      bidi-mirrored cl-both-case-p))
44          (index (gethash list *misc-hash*)))
45     (or index
46         (progn
47           (vector-push list *misc-table*)
48           (setf (gethash list *misc-hash*)
49                 (incf *misc-index*))))))
50
51 (defun compare-misc-entry (left right)
52   (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
53                        left-decimal-digit left-digit left-bidi-mirrored
54                        left-cl-both-case-p)
55       left
56     (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
57                          right-decimal-digit right-digit right-bidi-mirrored
58                          right-cl-both-case-p)
59         right
60       (or (and left-cl-both-case-p (not right-cl-both-case-p))
61           (and (or left-cl-both-case-p (not right-cl-both-case-p))
62                (or (< left-gc-index right-gc-index)
63                    (and (= left-gc-index right-gc-index)
64                         (or (< left-bidi-index right-bidi-index)
65                             (and (= left-bidi-index right-bidi-index)
66                                  (or (< left-ccc-index right-ccc-index)
67                                      (and (= left-ccc-index right-ccc-index)
68                                           (or (string< left-decimal-digit
69                                                        right-decimal-digit)
70                                               (and (string= left-decimal-digit
71                                                             right-decimal-digit)
72                                                    (or (string< left-digit right-digit)
73                                                        (and (string= left-digit
74                                                                      right-digit)
75                                                             (string< left-bidi-mirrored
76                                                                      right-bidi-mirrored))))))))))))))))
77
78 (defun build-misc-table ()
79   (sort *misc-table* #'compare-misc-entry)
80   (setq *misc-mapping* (make-array (1+ *misc-index*)))
81   (loop for i from 0 to *misc-index*
82         do (setf (aref *misc-mapping*
83                        (gethash (aref *misc-table* i) *misc-hash*))
84                  i)))
85
86 (defun slurp-ucd ()
87   (setq *last-uppercase* nil)
88   (setq *uppercase-transition-count* 0)
89   (setq *different-titlecases* nil)
90   (setq *different-numerics* nil)
91   (setq *name-size* 0)
92   (setq *misc-hash* (make-hash-table :test #'equal))
93   (setq *misc-index* -1)
94   (setq *misc-table* (make-array 256 :fill-pointer 0))
95   (setq *both-cases* nil)
96   (setq *decompositions* 0)
97   (setq *decomposition-types* (make-hash-table :test #'equal))
98   (setq *decomposition-length-max* 0)
99   (setq *decomposition-base* (make-array (ash #x110000
100                                               (- *page-size-exponent*))
101                                          :initial-element nil))
102   (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
103                                :initial-element nil))
104   (with-open-file (*standard-input*
105                    (make-pathname :name "UnicodeData"
106                                   :type "txt"
107                                   :defaults *unicode-character-database*)
108                    :direction :input)
109     (loop for line = (read-line nil nil)
110           while line
111           do (slurp-ucd-line line)))
112   (second-pass)
113   (build-misc-table)
114   *decompositions*)
115
116 (defun split-string (line character)
117   (loop for prev-position = 0 then (1+ position)
118         for position = (position character line :start prev-position)
119         collect (subseq line prev-position position)
120         do (unless position
121              (loop-finish))))
122
123 (defun init-indices (strings)
124   (let ((hash (make-hash-table :test #'equal)))
125     (loop for string in strings
126           for index from 0
127           do (setf (gethash string hash) index))
128     hash))
129
130 (defparameter *general-categories*
131   (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
132                   "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
133                   "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
134 (defparameter *bidi-classes*
135   (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
136                   "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
137
138
139 (defparameter *block-first* nil)
140
141 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
142 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
143 ;;;   D800  --  F8FF  : surrogates and private use
144 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
145 ;;;  F0000  --  FFFFD : private use
146 ;;; 100000  --  10FFFD: private use
147 (defun encode-ucd-line (line code-point)
148   (destructuring-bind (name general-category canonical-combining-class
149                             bidi-class decomposition-type-and-mapping
150                             decimal-digit digit numeric bidi-mirrored
151                             unicode-1-name iso-10646-comment simple-uppercase
152                             simple-lowercase simple-titlecase)
153       line
154     (declare (ignore unicode-1-name iso-10646-comment))
155     (if (and (> (length name) 8)
156              (string= ", First>" name :start2 (- (length name) 8)))
157         (progn
158           (setq *block-first* code-point)
159           nil)
160         (let* ((gc-index (or (gethash general-category *general-categories*)
161                              (error "unknown general category ~A"
162                                     general-category)))
163                (bidi-index (or (gethash bidi-class *bidi-classes*)
164                                (error "unknown bidirectional class ~A"
165                                       bidi-class)))
166                (ccc-index (parse-integer canonical-combining-class))
167                (digit-index (unless (string= "" decimal-digit)
168                               (parse-integer decimal-digit)))
169                (upper-index (unless (string= "" simple-uppercase)
170                               (parse-integer simple-uppercase :radix 16)))
171                (lower-index (unless (string= "" simple-lowercase)
172                               (parse-integer simple-lowercase :radix 16)))
173                (title-index (unless (string= "" simple-titlecase)
174                               (parse-integer simple-titlecase :radix 16)))
175                (cl-both-case-p
176                 (not (null (or (and (= gc-index 0) lower-index)
177                                (and (= gc-index 1) upper-index)))))
178                (misc-index (hash-misc gc-index bidi-index ccc-index
179                                       decimal-digit digit bidi-mirrored
180                                       cl-both-case-p)))
181           (declare (ignore digit-index))
182           (incf *name-size* (length name))
183           (when (string/= "" decomposition-type-and-mapping)
184             (let ((split (split-string decomposition-type-and-mapping
185                                        #\Space)))
186               (when (char= #\< (aref (first split) 0))
187                 (setf (gethash (pop split) *decomposition-types*) t))
188               (unless (aref *decomposition-base* (cp-high code-point))
189                 (setf (aref *decomposition-base* (cp-high code-point))
190                       (make-array (ash 1 *page-size-exponent*)
191                                   :initial-element nil)))
192               (setf (aref (aref *decomposition-base* (cp-high code-point))
193                           (cp-low code-point))
194                     (mapcar #'(lambda (string)
195                                 (parse-integer string :radix 16))
196                             split))
197               (setq *decomposition-length-max*
198                     (max *decomposition-length-max* (length split)))
199               (incf *decompositions* (length split))))
200           (when (and (string/= "" simple-uppercase)
201                      (string/= "" simple-lowercase))
202             (push (list code-point upper-index lower-index) *both-cases*))
203           (when (string/= simple-uppercase simple-titlecase)
204             (push (cons code-point title-index) *different-titlecases*))
205           (when (string/= digit numeric)
206             (push (cons code-point numeric) *different-numerics*))
207           (cond
208             ((= gc-index 8)
209              (unless *last-uppercase*
210                (incf *uppercase-transition-count*))
211              (setq *last-uppercase* t))
212             (t
213              (when *last-uppercase*
214                (incf *uppercase-transition-count*))
215              (setq *last-uppercase* nil)))
216           (when (> ccc-index 255)
217             (error "canonical combining class too large ~A" ccc-index))
218           (let ((result (vector misc-index (or upper-index lower-index 0))))
219             (when (and (> (length name) 7)
220                        (string= ", Last>" name :start2 (- (length name) 7)))
221               (let ((page-start (ash (+ *block-first*
222                                         (ash 1 *page-size-exponent*)
223                                         -1)
224                                      (- *page-size-exponent*)))
225                     (page-end (ash code-point (- *page-size-exponent*))))
226                 (loop for point from *block-first*
227                       below (ash page-start *page-size-exponent*)
228                       do (setf (aref (aref *ucd-base* (cp-high point))
229                                      (cp-low point))
230                                result))
231                 (loop for page from page-start below page-end
232                       do (setf (aref *ucd-base* page)
233                                (make-array (ash 1 *page-size-exponent*)
234                                            :initial-element result)))
235                 (loop for point from (ash page-end *page-size-exponent*)
236                       below code-point
237                       do (setf (aref (aref *ucd-base* (cp-high point))
238                                      (cp-low point))
239                                result))))
240             result)))))
241
242 (defun slurp-ucd-line (line)
243   (let* ((split-line (split-string line #\;))
244          (code-point (parse-integer (first split-line) :radix 16))
245          (code-high (ash code-point (- *page-size-exponent*)))
246          (code-low (ldb (byte *page-size-exponent* 0) code-point)))
247     (unless (aref *ucd-base* code-high)
248       (setf (aref *ucd-base* code-high)
249             (make-array (ash 1 *page-size-exponent*)
250                         :initial-element nil)))
251     (setf (aref (aref *ucd-base* code-high) code-low)
252           (encode-ucd-line (cdr split-line) code-point))))
253
254 (defun second-pass ()
255   (loop for i from 0 below (length *ucd-base*)
256         when (aref *ucd-base* i)
257         do (loop for j from 0 below (length (aref *ucd-base* i))
258                  for result = (aref (aref *ucd-base* i) j)
259                  when result
260                  when (let* ((transform-point (aref result 1))
261                              (transform-high (ash transform-point
262                                                   (- *page-size-exponent*)))
263                              (transform-low (ldb (byte *page-size-exponent* 0)
264                                                  transform-point)))
265                         (and (plusp transform-point)
266                              (/= (aref (aref (aref *ucd-base* transform-high)
267                                              transform-low)
268                                        1)
269                                  (+ (ash i *page-size-exponent*) j))))
270                  do (destructuring-bind (gc-index bidi-index ccc-index
271                                          decimal-digit digit bidi-mirrored
272                                          cl-both-case-p)
273                         (aref *misc-table* (aref result 0))
274                       (declare (ignore cl-both-case-p))
275                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
276                       (setf (aref result 0)
277                             (hash-misc gc-index bidi-index ccc-index
278                                        decimal-digit digit bidi-mirrored
279                                        nil))))))
280
281 (defun write-3-byte (triplet stream)
282   (write-byte (ldb (byte 8 0) triplet) stream)
283   (write-byte (ldb (byte 8 8) triplet) stream)
284   (write-byte (ldb (byte 8 16) triplet) stream))
285
286 (defun digit-to-byte (digit)
287   (if (string= "" digit)
288       255
289       (parse-integer digit)))
290
291 (defun output ()
292   (let ((hash (make-hash-table :test #'equalp))
293         (index 0))
294     (loop for page across *ucd-base*
295           do (when page
296                (unless (gethash page hash)
297                  (setf (gethash page hash)
298                        (incf index)))))
299     (let ((array (make-array (1+ index))))
300       (maphash #'(lambda (key value)
301                    (setf (aref array value) key))
302                hash)
303       (setf (aref array 0)
304             (make-array (ash 1 *page-size-exponent*) :initial-element nil))
305       (with-open-file (stream (make-pathname :name "ucd"
306                                              :type "dat"
307                                              :defaults *output-directory*)
308                               :direction :output
309                               :element-type '(unsigned-byte 8)
310                               :if-exists :supersede
311                               :if-does-not-exist :create)
312         (loop for (gc-index bidi-index ccc-index decimal-digit digit
313                             bidi-mirrored)
314               across *misc-table*
315               do (write-byte gc-index stream)
316               do (write-byte bidi-index stream)
317               do (write-byte ccc-index stream)
318               do (write-byte (digit-to-byte decimal-digit) stream)
319               do (write-byte (digit-to-byte digit) stream)
320               do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
321               do (write-byte 0 stream)
322               do (write-byte 0 stream))
323         (loop for page across *ucd-base*
324               do (write-byte (if page (gethash page hash) 0) stream))
325         (loop for page across array
326               do (loop for entry across page
327                        do (write-byte (if entry
328                                           (aref *misc-mapping* (aref entry 0))
329                                           255)
330                                       stream)
331                        do (write-3-byte (if entry (aref entry 1) 0)
332                                         stream))))))
333   (with-open-file (*standard-output*
334                    (make-pathname :name "numerics"
335                                   :type "lisp-expr"
336                                   :defaults *output-directory*)
337                    :direction :output
338                    :if-exists :supersede
339                    :if-does-not-exist :create)
340     (let ((*print-pretty* t))
341       (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
342                      *different-numerics*))))
343   (with-open-file (*standard-output*
344                    (make-pathname :name "titlecases"
345                                   :type "lisp-expr"
346                                   :defaults *output-directory*)
347                    :direction :output
348                    :if-exists :supersede
349                    :if-does-not-exist :create)
350     (let ((*print-pretty* t))
351       (prin1 *different-titlecases*)))
352   (with-open-file (*standard-output*
353                    (make-pathname :name "misc"
354                                   :type "lisp-expr"
355                                   :defaults *output-directory*)
356                    :direction :output
357                    :if-exists :supersede
358                    :if-does-not-exist :create)
359     (let ((*print-pretty* t))
360       (prin1 `(:length ,(length *misc-table*)
361                :uppercase ,(loop for (gc-index) across *misc-table*
362                                  for i from 0
363                                  when (= gc-index 0)
364                                  collect i)
365                :lowercase ,(loop for (gc-index) across *misc-table*
366                                  for i from 0
367                                  when (= gc-index 1)
368                                  collect i)
369                :titlecase ,(loop for (gc-index) across *misc-table*
370                                  for i from 0
371                                  when (= gc-index 2)
372                                  collect i)))))
373   (values))
374
375 ;;; Use of the generated files
376
377 (defparameter *compiled-ucd* nil)
378
379 (defun read-compiled-ucd ()
380   (with-open-file (stream (make-pathname :name "ucd"
381                                          :type "dat"
382                                          :defaults *output-directory*)
383                           :direction :input
384                           :element-type '(unsigned-byte 8))
385     (let ((length (file-length stream)))
386       (setq *compiled-ucd*
387             (make-array length :element-type '(unsigned-byte 8)))
388       (read-sequence *compiled-ucd* stream)))
389   (values))
390
391 ;;; The stuff below is dependent on misc.lisp-expr being
392 ;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
393
394 (defparameter *length* 186)
395
396 (defun cp-index (cp)
397   (let* ((cp-high (cp-high cp))
398          (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
399     (+ (* 8 *length*)
400        (ash #x110000 (- *page-size-exponent*))
401        (* (ash 4 *page-size-exponent*) page)
402        (* 4 (cp-low cp)))))
403
404 (defun cp-value-0 (cp)
405   (aref *compiled-ucd* (cp-index cp)))
406
407 (defun cp-value-1 (cp)
408   (let ((index (cp-index cp)))
409     (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
410          (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
411               (aref *compiled-ucd* (1+ index))))))
412
413 (defun cp-general-category (cp)
414   (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
415
416 (defun cp-decimal-digit (cp)
417   (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
418     (and (< decimal-digit 10)
419          decimal-digit)))
420
421 (defun cp-alpha-char-p (cp)
422   (< (cp-general-category cp) 5))
423
424 (defun cp-alphanumericp (cp)
425   (let ((gc (cp-general-category cp)))
426     (or (< gc 5)
427         (= gc 12))))
428
429 (defun cp-digit-char-p (cp &optional (radix 10))
430   (let ((number (or (cp-decimal-digit cp)
431                     (and (<= 65 cp 90)
432                          (- cp 55))
433                     (and (<= 97 cp 122)
434                          (- cp 87)))))
435     (when (and number (< number radix))
436       number)))
437
438 (defun cp-graphic-char-p (cp)
439   (or (<= 32 cp 127)
440       (<= 160 cp)))
441
442 (defun cp-char-upcase (cp)
443   (if (= (cp-value-0 cp) 1)
444       (cp-value-1 cp)
445       cp))
446
447 (defun cp-char-downcase (cp)
448   (if (= (cp-value-0 cp) 0)
449       (cp-value-1 cp)
450       cp))
451
452 (defun cp-upper-case-p (cp)
453   (= (cp-value-0 cp) 0))
454
455 (defun cp-lower-case-p (cp)
456   (= (cp-value-0 cp) 1))
457
458 (defun cp-both-case-p (cp)
459   (< (cp-value-0 cp) 2))