1.0.10.1: Cleanup code
[sbcl.git] / tools-for-build / ucd.lisp
1 (in-package "SB-COLD")
2
3 ;;; Common
4
5 (defparameter *output-directory*
6   (merge-pathnames
7    (make-pathname :directory '(:relative :up "output"))
8    (make-pathname :directory (pathname-directory *load-truename*))))
9
10 (defparameter *page-size-exponent* 8)
11
12 (defun cp-high (cp)
13   (ash cp (- *page-size-exponent*)))
14
15 (defun cp-low (cp)
16   (ldb (byte *page-size-exponent* 0) cp))
17
18 ;;; Generator
19
20 (defstruct ucd misc transform)
21
22 (defparameter *unicode-character-database*
23   (make-pathname :directory (pathname-directory *load-truename*)))
24
25 (defparameter *ucd-base* nil)
26 (defparameter *unicode-names* (make-hash-table))
27
28 (defparameter *last-uppercase* nil)
29 (defparameter *uppercase-transition-count* 0)
30 (defparameter *different-titlecases* nil)
31 (defparameter *different-numerics* nil)
32 (defparameter *name-size* 0)
33 (defparameter *misc-hash* (make-hash-table :test #'equal))
34 (defparameter *misc-index* -1)
35 (defparameter *misc-table* nil)
36 (defparameter *misc-mapping* nil)
37 (defparameter *both-cases* nil)
38 (defparameter *decompositions* nil)
39 (defparameter *decomposition-length-max* nil)
40 (defparameter *decomposition-types* nil)
41 (defparameter *decomposition-base* nil)
42
43 (defun hash-misc (gc-index bidi-index ccc-index decimal-digit digit
44                   bidi-mirrored cl-both-case-p)
45   (let* ((list (list gc-index bidi-index ccc-index decimal-digit digit
46                      bidi-mirrored cl-both-case-p))
47          (index (gethash list *misc-hash*)))
48     (or index
49         (progn
50           (vector-push list *misc-table*)
51           (setf (gethash list *misc-hash*)
52                 (incf *misc-index*))))))
53
54 (defun compare-misc-entry (left right)
55   (destructuring-bind (left-gc-index left-bidi-index left-ccc-index
56                        left-decimal-digit left-digit left-bidi-mirrored
57                        left-cl-both-case-p)
58       left
59     (destructuring-bind (right-gc-index right-bidi-index right-ccc-index
60                          right-decimal-digit right-digit right-bidi-mirrored
61                          right-cl-both-case-p)
62         right
63       (or (and left-cl-both-case-p (not right-cl-both-case-p))
64           (and (or left-cl-both-case-p (not right-cl-both-case-p))
65                (or (< left-gc-index right-gc-index)
66                    (and (= left-gc-index right-gc-index)
67                         (or (< left-bidi-index right-bidi-index)
68                             (and (= left-bidi-index right-bidi-index)
69                                  (or (< left-ccc-index right-ccc-index)
70                                      (and (= left-ccc-index right-ccc-index)
71                                           (or (string< left-decimal-digit
72                                                        right-decimal-digit)
73                                               (and (string= left-decimal-digit
74                                                             right-decimal-digit)
75                                                    (or (string< left-digit right-digit)
76                                                        (and (string= left-digit
77                                                                      right-digit)
78                                                             (string< left-bidi-mirrored
79                                                                      right-bidi-mirrored))))))))))))))))
80
81 (defun build-misc-table ()
82   (sort *misc-table* #'compare-misc-entry)
83   (setq *misc-mapping* (make-array (1+ *misc-index*)))
84   (loop for i from 0 to *misc-index*
85         do (setf (aref *misc-mapping*
86                        (gethash (aref *misc-table* i) *misc-hash*))
87                  i)))
88
89 (defun slurp-ucd ()
90   (setq *last-uppercase* nil)
91   (setq *uppercase-transition-count* 0)
92   (setq *different-titlecases* nil)
93   (setq *different-numerics* nil)
94   (setq *name-size* 0)
95   (setq *misc-hash* (make-hash-table :test #'equal))
96   (setq *misc-index* -1)
97   (setq *misc-table* (make-array 256 :fill-pointer 0))
98   (setq *both-cases* nil)
99   (setq *decompositions* 0)
100   (setq *decomposition-types* (make-hash-table :test #'equal))
101   (setq *decomposition-length-max* 0)
102   (setq *decomposition-base* (make-array (ash #x110000
103                                               (- *page-size-exponent*))
104                                          :initial-element nil))
105   (setq *ucd-base* (make-array (ash #x110000 (- *page-size-exponent*))
106                                :initial-element nil))
107   (with-open-file (*standard-input*
108                    (make-pathname :name "UnicodeData"
109                                   :type "txt"
110                                   :defaults *unicode-character-database*)
111                    :direction :input)
112     (loop for line = (read-line nil nil)
113           while line
114           do (slurp-ucd-line line)))
115   (second-pass)
116   (build-misc-table)
117   *decompositions*)
118
119 (defun split-string (line character)
120   (loop for prev-position = 0 then (1+ position)
121         for position = (position character line :start prev-position)
122         collect (subseq line prev-position position)
123         do (unless position
124              (loop-finish))))
125
126 (defun init-indices (strings)
127   (let ((hash (make-hash-table :test #'equal)))
128     (loop for string in strings
129           for index from 0
130           do (setf (gethash string hash) index))
131     hash))
132
133 (defparameter *general-categories*
134   (init-indices '("Lu" "Ll" "Lt" "Lm" "Lo" "Cc" "Cf" "Co" "Cs" "Mc"
135                   "Me" "Mn" "Nd" "Nl" "No" "Pc" "Pd" "Pe" "Pf" "Pi"
136                   "Po" "Ps" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")))
137 (defparameter *bidi-classes*
138   (init-indices '("AL" "AN" "B" "BN" "CS" "EN" "ES" "ET" "L" "LRE" "LRO"
139                   "NSM" "ON" "PDF" "R" "RLE" "RLO" "S" "WS")))
140
141
142 (defparameter *block-first* nil)
143
144 (defun normalize-character-name (name)
145   (when (find #\_ name)
146     (error "Bad name for a character: ~A" name))
147   (unless (or (zerop (length name)) (find #\< name) (find #\> name))
148     (substitute #\_ #\Space name)))
149
150 ;;;   3400  --  4DB5  : cjk ideograph extension a ;Lo;0;L;;;;;N;;;;;
151 ;;;   AC00  --  D7A3  : hangul syllables ;Lo;0;L;;;;;N;;;;;
152 ;;;   D800  --  F8FF  : surrogates and private use
153 ;;;  20000  --  2A6D6 : cjk ideograph extension b ;Lo;0;L;;;;;N;;;;;
154 ;;;  F0000  --  FFFFD : private use
155 ;;; 100000  --  10FFFD: private use
156 (defun encode-ucd-line (line code-point)
157   (destructuring-bind (name general-category canonical-combining-class
158                             bidi-class decomposition-type-and-mapping
159                             decimal-digit digit numeric bidi-mirrored
160                             unicode-1-name iso-10646-comment simple-uppercase
161                             simple-lowercase simple-titlecase)
162       line
163     (declare (ignore unicode-1-name iso-10646-comment))
164     (if (and (> (length name) 8)
165              (string= ", First>" name :start2 (- (length name) 8)))
166         (progn
167           (setq *block-first* code-point)
168           nil)
169         (let* ((gc-index (or (gethash general-category *general-categories*)
170                              (error "unknown general category ~A"
171                                     general-category)))
172                (bidi-index (or (gethash bidi-class *bidi-classes*)
173                                (error "unknown bidirectional class ~A"
174                                       bidi-class)))
175                (ccc-index (parse-integer canonical-combining-class))
176                (digit-index (unless (string= "" decimal-digit)
177                               (parse-integer decimal-digit)))
178                (upper-index (unless (string= "" simple-uppercase)
179                               (parse-integer simple-uppercase :radix 16)))
180                (lower-index (unless (string= "" simple-lowercase)
181                               (parse-integer simple-lowercase :radix 16)))
182                (title-index (unless (string= "" simple-titlecase)
183                               (parse-integer simple-titlecase :radix 16)))
184                (cl-both-case-p
185                 (not (null (or (and (= gc-index 0) lower-index)
186                                (and (= gc-index 1) upper-index)))))
187                (misc-index (hash-misc gc-index bidi-index ccc-index
188                                       decimal-digit digit bidi-mirrored
189                                       cl-both-case-p)))
190           (declare (ignore digit-index))
191           (incf *name-size* (length name))
192           (when (string/= "" decomposition-type-and-mapping)
193             (let ((split (split-string decomposition-type-and-mapping
194                                        #\Space)))
195               (when (char= #\< (aref (first split) 0))
196                 (setf (gethash (pop split) *decomposition-types*) t))
197               (unless (aref *decomposition-base* (cp-high code-point))
198                 (setf (aref *decomposition-base* (cp-high code-point))
199                       (make-array (ash 1 *page-size-exponent*)
200                                   :initial-element nil)))
201               (setf (aref (aref *decomposition-base* (cp-high code-point))
202                           (cp-low code-point))
203                     (mapcar #'(lambda (string)
204                                 (parse-integer string :radix 16))
205                             split))
206               (setq *decomposition-length-max*
207                     (max *decomposition-length-max* (length split)))
208               (incf *decompositions* (length split))))
209           (when (and (string/= "" simple-uppercase)
210                      (string/= "" simple-lowercase))
211             (push (list code-point upper-index lower-index) *both-cases*))
212           (when (string/= simple-uppercase simple-titlecase)
213             (push (cons code-point title-index) *different-titlecases*))
214           (when (string/= digit numeric)
215             (push (cons code-point numeric) *different-numerics*))
216           (cond
217             ((= gc-index 8)
218              (unless *last-uppercase*
219                (incf *uppercase-transition-count*))
220              (setq *last-uppercase* t))
221             (t
222              (when *last-uppercase*
223                (incf *uppercase-transition-count*))
224              (setq *last-uppercase* nil)))
225           (when (> ccc-index 255)
226             (error "canonical combining class too large ~A" ccc-index))
227           (let ((result (make-ucd :misc misc-index
228                                   :transform (or upper-index lower-index 0))))
229             (when (and (> (length name) 7)
230                        (string= ", Last>" name :start2 (- (length name) 7)))
231               (let ((page-start (ash (+ *block-first*
232                                         (ash 1 *page-size-exponent*)
233                                         -1)
234                                      (- *page-size-exponent*)))
235                     (page-end (ash code-point (- *page-size-exponent*))))
236                 (loop for point from *block-first*
237                       below (ash page-start *page-size-exponent*)
238                       do (setf (aref (aref *ucd-base* (cp-high point))
239                                      (cp-low point))
240                                result))
241                 (loop for page from page-start below page-end
242                       do (setf (aref *ucd-base* page)
243                                (make-array (ash 1 *page-size-exponent*)
244                                            :initial-element result)))
245                 (loop for point from (ash page-end *page-size-exponent*)
246                       below code-point
247                       do (setf (aref (aref *ucd-base* (cp-high point))
248                                      (cp-low point))
249                                result))))
250             (values result (normalize-character-name name)))))))
251
252 (defun slurp-ucd-line (line)
253   (let* ((split-line (split-string line #\;))
254          (code-point (parse-integer (first split-line) :radix 16))
255          (code-high (ash code-point (- *page-size-exponent*)))
256          (code-low (ldb (byte *page-size-exponent* 0) code-point)))
257     (unless (aref *ucd-base* code-high)
258       (setf (aref *ucd-base* code-high)
259             (make-array (ash 1 *page-size-exponent*)
260                         :initial-element nil)))
261     (multiple-value-bind (encoding name)
262         (encode-ucd-line (cdr split-line) code-point)
263       (setf (aref (aref *ucd-base* code-high) code-low) encoding
264             (gethash code-point *unicode-names*) name))))
265
266 (defun second-pass ()
267   (loop for i from 0 below (length *ucd-base*)
268         when (aref *ucd-base* i)
269         do (loop for j from 0 below (length (aref *ucd-base* i))
270                  for result = (aref (aref *ucd-base* i) j)
271                  when result
272                  when (let* ((transform-point (ucd-transform result))
273                              (transform-high (ash transform-point
274                                                   (- *page-size-exponent*)))
275                              (transform-low (ldb (byte *page-size-exponent* 0)
276                                                  transform-point)))
277                         (and (plusp transform-point)
278                              (/= (ucd-transform
279                                   (aref (aref *ucd-base* transform-high)
280                                         transform-low))
281                                  (+ (ash i *page-size-exponent*) j))))
282                  do (destructuring-bind (gc-index bidi-index ccc-index
283                                          decimal-digit digit bidi-mirrored
284                                          cl-both-case-p)
285                         (aref *misc-table* (ucd-misc result))
286                       (declare (ignore cl-both-case-p))
287                       (format t "~A~%" (+ (ash i *page-size-exponent*) j))
288                       (setf (ucd-misc result)
289                             (hash-misc gc-index bidi-index ccc-index
290                                        decimal-digit digit bidi-mirrored
291                                        nil))))))
292
293 (defun write-3-byte (triplet stream)
294   (write-byte (ldb (byte 8 0) triplet) stream)
295   (write-byte (ldb (byte 8 8) triplet) stream)
296   (write-byte (ldb (byte 8 16) triplet) stream))
297
298 (defun digit-to-byte (digit)
299   (if (string= "" digit)
300       255
301       (parse-integer digit)))
302
303 (defun output ()
304   (let ((hash (make-hash-table :test #'equalp))
305         (index 0))
306     (loop for page across *ucd-base*
307           do (when page
308                (unless (gethash page hash)
309                  (setf (gethash page hash)
310                        (incf index)))))
311     (let ((array (make-array (1+ index))))
312       (maphash #'(lambda (key value)
313                    (setf (aref array value) key))
314                hash)
315       (setf (aref array 0)
316             (make-array (ash 1 *page-size-exponent*) :initial-element nil))
317       (with-open-file (stream (make-pathname :name "ucd"
318                                              :type "dat"
319                                              :defaults *output-directory*)
320                               :direction :output
321                               :element-type '(unsigned-byte 8)
322                               :if-exists :supersede
323                               :if-does-not-exist :create)
324         (loop for (gc-index bidi-index ccc-index decimal-digit digit
325                             bidi-mirrored)
326               across *misc-table*
327               do (write-byte gc-index stream)
328               do (write-byte bidi-index stream)
329               do (write-byte ccc-index stream)
330               do (write-byte (digit-to-byte decimal-digit) stream)
331               do (write-byte (digit-to-byte digit) stream)
332               do (write-byte (if (string= "N" bidi-mirrored) 0 1) stream)
333               do (write-byte 0 stream)
334               do (write-byte 0 stream))
335         (loop for page across *ucd-base*
336            do (write-byte (if page (gethash page hash) 0) stream))
337         (loop for page across array
338            do (loop for entry across page
339                  do (write-byte (if entry
340                                     (aref *misc-mapping* (ucd-misc entry))
341                                     255)
342                                 stream)
343                  do (write-3-byte (if entry (ucd-transform entry) 0)
344                                   stream))))))
345   (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
346                                     :defaults *output-directory*)
347                      :direction :output
348                      :if-exists :supersede
349                      :if-does-not-exist :create)
350     (with-standard-io-syntax
351       (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
352       (maphash (lambda (code name)
353                  (when name
354                   (print code f)
355                   (prin1 name f)))
356                *unicode-names*))
357     (setf *unicode-names* nil))
358   (with-open-file (*standard-output*
359                    (make-pathname :name "numerics"
360                                   :type "lisp-expr"
361                                   :defaults *output-directory*)
362                    :direction :output
363                    :if-exists :supersede
364                    :if-does-not-exist :create)
365     (let ((*print-pretty* t))
366       (prin1 (mapcar #'(lambda (x) (cons (car x) (read-from-string (cdr x))))
367                      *different-numerics*))))
368   (with-open-file (*standard-output*
369                    (make-pathname :name "titlecases"
370                                   :type "lisp-expr"
371                                   :defaults *output-directory*)
372                    :direction :output
373                    :if-exists :supersede
374                    :if-does-not-exist :create)
375     (let ((*print-pretty* t))
376       (prin1 *different-titlecases*)))
377   (with-open-file (*standard-output*
378                    (make-pathname :name "misc"
379                                   :type "lisp-expr"
380                                   :defaults *output-directory*)
381                    :direction :output
382                    :if-exists :supersede
383                    :if-does-not-exist :create)
384     (let ((*print-pretty* t))
385       (prin1 `(:length ,(length *misc-table*)
386                :uppercase ,(loop for (gc-index) across *misc-table*
387                                  for i from 0
388                                  when (= gc-index 0)
389                                  collect i)
390                :lowercase ,(loop for (gc-index) across *misc-table*
391                                  for i from 0
392                                  when (= gc-index 1)
393                                  collect i)
394                :titlecase ,(loop for (gc-index) across *misc-table*
395                                  for i from 0
396                                  when (= gc-index 2)
397                                  collect i)))))
398   (values))
399
400 ;;; Use of the generated files
401
402 (defparameter *compiled-ucd* nil)
403
404 (defun read-compiled-ucd ()
405   (with-open-file (stream (make-pathname :name "ucd"
406                                          :type "dat"
407                                          :defaults *output-directory*)
408                           :direction :input
409                           :element-type '(unsigned-byte 8))
410     (let ((length (file-length stream)))
411       (setq *compiled-ucd*
412             (make-array length :element-type '(unsigned-byte 8)))
413       (read-sequence *compiled-ucd* stream)))
414   (values))
415
416 ;;; The stuff below is dependent on misc.lisp-expr being
417 ;;; (:LENGTH 186 :UPPERCASE (0) :LOWERCASE (1) :TITLECASE (2))
418
419 (defparameter *length* 186)
420
421 (defun cp-index (cp)
422   (let* ((cp-high (cp-high cp))
423          (page (aref *compiled-ucd* (+ (* 8 *length*) cp-high))))
424     (+ (* 8 *length*)
425        (ash #x110000 (- *page-size-exponent*))
426        (* (ash 4 *page-size-exponent*) page)
427        (* 4 (cp-low cp)))))
428
429 (defun cp-value-0 (cp)
430   (aref *compiled-ucd* (cp-index cp)))
431
432 (defun cp-value-1 (cp)
433   (let ((index (cp-index cp)))
434     (dpb (aref *compiled-ucd* (+ index 3)) (byte 8 16)
435          (dpb (aref *compiled-ucd* (+ index 2)) (byte 8 8)
436               (aref *compiled-ucd* (1+ index))))))
437
438 (defun cp-general-category (cp)
439   (aref *compiled-ucd* (* 8 (cp-value-0 cp))))
440
441 (defun cp-decimal-digit (cp)
442   (let ((decimal-digit (aref *compiled-ucd* (+ 3 (* 8 (cp-value-0 cp))))))
443     (and (< decimal-digit 10)
444          decimal-digit)))
445
446 (defun cp-alpha-char-p (cp)
447   (< (cp-general-category cp) 5))
448
449 (defun cp-alphanumericp (cp)
450   (let ((gc (cp-general-category cp)))
451     (or (< gc 5)
452         (= gc 12))))
453
454 (defun cp-digit-char-p (cp &optional (radix 10))
455   (let ((number (or (cp-decimal-digit cp)
456                     (and (<= 65 cp 90)
457                          (- cp 55))
458                     (and (<= 97 cp 122)
459                          (- cp 87)))))
460     (when (and number (< number radix))
461       number)))
462
463 (defun cp-graphic-char-p (cp)
464   (or (<= 32 cp 127)
465       (<= 160 cp)))
466
467 (defun cp-char-upcase (cp)
468   (if (= (cp-value-0 cp) 1)
469       (cp-value-1 cp)
470       cp))
471
472 (defun cp-char-downcase (cp)
473   (if (= (cp-value-0 cp) 0)
474       (cp-value-1 cp)
475       cp))
476
477 (defun cp-upper-case-p (cp)
478   (= (cp-value-0 cp) 0))
479
480 (defun cp-lower-case-p (cp)
481   (= (cp-value-0 cp) 1))
482
483 (defun cp-both-case-p (cp)
484   (< (cp-value-0 cp) 2))