implement primary and canonical composition, and hence NFC/NFKC

author Christophe Rhodes <csr21@cantab.net>

Sat, 18 May 2013 12:37:55 +0000 (13:37 +0100)

committer Christophe Rhodes <csr21@cantab.net>

Sat, 18 May 2013 19:04:05 +0000 (20:04 +0100)
author Christophe Rhodes <csr21@cantab.net>
Sat, 18 May 2013 12:37:55 +0000 (13:37 +0100)
committer Christophe Rhodes <csr21@cantab.net>
Sat, 18 May 2013 19:04:05 +0000 (20:04 +0100)
diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp

index beb84ee..fdd7b9c 100644 (file)
--- a/src/code/target-char.lisp
+++ b/src/code/target-char.lisp
@@ -51,6 +51,22 @@
                (defglobal **character-database** ,character-database)
                (defglobal **character-decompositions** ,decompositions)
                (defglobal **character-long-decompositions** ,long-decompositions)
+              (defglobal **character-primary-compositions**
+                (let ((table (make-hash-table))
+                      (info ,(read-ub8-vector (file "comp" "dat"))))
+                  (flet ((code (j)
+                           (dpb (aref info (* 4 j))
+                                (byte 8 24)
+                                (dpb (aref info (+ (* 4 j) 1))
+                                     (byte 8 16)
+                                     (dpb (aref info (+ (* 4 j) 2))
+                                          (byte 8 8)
+                                          (aref info (+ (* 4 j) 3)))))))
+                    (dotimes (i (/ (length info) 12) table)
+                      (setf (gethash (dpb (code (* 3 i)) (byte 21 21)
+                                          (code (1+ (* 3 i))))
+                                     table)
+                            (code-char (code (+ (* 3 i) 2))))))))
                (defun !character-database-cold-init ()
                  (setf **character-database** ,character-database))
                ,(with-open-file (stream (file "ucd-names" "lisp-expr")
@@ -710,11 +726,21 @@ character exists."
           (go again)))
      (apply 'concatenate 'string (nreverse result))))
  
-#+nil
  (defun primary-composition (char1 char2)
-  (when (and (char= char1 #\e)
-             (char= char2 #\combining_acute_accent))
-    #\latin_small_letter_e_with_acute))
+  (let ((c1 (char-code char1))
+        (c2 (char-code char2)))
+    (cond
+      ((gethash (dpb (char-code char1) (byte 21 21) (char-code char2))
+                **character-primary-compositions**))
+      ((and (<= #x1100 c1) (<= c1 #x1112)
+            (<= #x1161 c2) (<= c2 #x1175))
+       (let ((lindex (- c1 #x1100))
+             (vindex (- c2 #x1161)))
+         (code-char (+ #xac00 (* lindex 588) (* vindex 28)))))
+      ((and (<= #xac00 c1) (<= c1 #.(+ #xac00 11171))
+            (<= #x11a8 c2) (<= c2 #x11c2)
+            (= 0 (rem (- c1 #xac00) 28)))
+       (code-char (+ c1 (- c2 #x11a7)))))))
  
  ;;; This implements a sequence data structure, specialized for
  ;;; efficient deletion of characters at an index, along with tolerable
@@ -774,13 +800,16 @@ character exists."
    (labels ()
      (let* ((result (list (list 0 (length string) string)))
             (previous-starter-index (position 0 string :key #'ucd-ccc))
-           (i (1+ previous-starter-index)))
-      (when (= i (length string))
+           (i (and previous-starter-index (1+ previous-starter-index))))
+      (when (or (not i) (= i (length string)))
          (return-from canonically-compose string))
        (tagbody
         again
-         (when (and (> (- i previous-starter-index) 2)
+         (when (and (>= (- i previous-starter-index) 2)
                      ;; test for Blocked (Unicode 3.11 para. D115)
+                    ;;
+                    ;; (assumes here that string has sorted combiners,
+                    ;; so can look back just one step)
                      (>= (ucd-ccc (lref result (1- i)))
                          (ucd-ccc (lref result i))))
             (when (= (ucd-ccc (lref result i)) 0)
@@ -812,8 +841,12 @@ character exists."
      (base-string string)
      ((or (array character (*)) #!-sb-unicode base-string)
       (ecase form
+       ((:nfc)
+        (canonically-compose (sort-combiners (decompose-string string))))
         ((:nfd)
          (sort-combiners (decompose-string string)))
+       ((:nfkc)
+        (canonically-compose (sort-combiners (decompose-string string :compatibility))))
         ((:nfkd)
          (sort-combiners (decompose-string string :compatibility)))))
      ((array nil (*)) string)))
diff --git a/tests/unicode-normalization.impure.lisp b/tests/unicode-normalization.impure.lisp

index e930440..7cd0fdd 100644 (file)
--- a/tests/unicode-normalization.impure.lisp
+++ b/tests/unicode-normalization.impure.lisp
@@ -17,12 +17,10 @@
  
  (defun test-line (c1 c2 c3 c4 c5)
    ;; NFC
-  #+nil
    (assert-all-string= c2
      (normalize-string c1 :nfc)
      (normalize-string c2 :nfc)
      (normalize-string c3 :nfc))
-  #+nil
    (assert-all-string= c4
      (normalize-string c4 :nfc)
      (normalize-string c5 :nfc))
@@ -37,7 +35,6 @@
      (normalize-string c5 :nfd))
  
    ;; NFKC
-  #+nil
    (assert-all-string= c4
      (normalize-string c1 :nfkc)
      (normalize-string c2 :nfkc)
@@ -55,10 +52,8 @@
  
  (defun test-no-normalization (string)
    (assert-all-string= string
-    #+nil
      (normalize-string string :nfc)
      (normalize-string string :nfd)
-    #+nil
      (normalize-string string :nfkc)
      (normalize-string string :nfkd)))
  
diff --git a/tools-for-build/CompositionExclusions.txt b/tools-for-build/CompositionExclusions.txt

new file mode 100644 (file)

index 0000000..cd19f42
--- /dev/null
+++ b/tools-for-build/CompositionExclusions.txt
@@ -0,0 +1,206 @@
+# CompositionExclusions-6.2.0.txt
+# Date: 2012-05-15, 22:21:00 GMT [KW, LI]
+#
+# This file lists the characters for the Composition Exclusion Table
+# defined in UAX #15, Unicode Normalization Forms.
+#
+# This file is a normative contributory data file in the
+# Unicode Character Database.
+#
+# Copyright (c) 1991-2012 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# For more information, see
+# http://www.unicode.org/unicode/reports/tr15/#Primary_Exclusion_List_Table
+#
+# For a full derivation of composition exclusions, see the derived property
+# Full_Composition_Exclusion in DerivedNormalizationProps.txt
+#
+
+# ================================================
+# (1) Script Specifics
+#
+# This list of characters cannot be derived from the UnicodeData.txt file.
+# ================================================
+
+0958    #  DEVANAGARI LETTER QA
+0959    #  DEVANAGARI LETTER KHHA
+095A    #  DEVANAGARI LETTER GHHA
+095B    #  DEVANAGARI LETTER ZA
+095C    #  DEVANAGARI LETTER DDDHA
+095D    #  DEVANAGARI LETTER RHA
+095E    #  DEVANAGARI LETTER FA
+095F    #  DEVANAGARI LETTER YYA
+09DC    #  BENGALI LETTER RRA
+09DD    #  BENGALI LETTER RHA
+09DF    #  BENGALI LETTER YYA
+0A33    #  GURMUKHI LETTER LLA
+0A36    #  GURMUKHI LETTER SHA
+0A59    #  GURMUKHI LETTER KHHA
+0A5A    #  GURMUKHI LETTER GHHA
+0A5B    #  GURMUKHI LETTER ZA
+0A5E    #  GURMUKHI LETTER FA
+0B5C    #  ORIYA LETTER RRA
+0B5D    #  ORIYA LETTER RHA
+0F43    #  TIBETAN LETTER GHA
+0F4D    #  TIBETAN LETTER DDHA
+0F52    #  TIBETAN LETTER DHA
+0F57    #  TIBETAN LETTER BHA
+0F5C    #  TIBETAN LETTER DZHA
+0F69    #  TIBETAN LETTER KSSA
+0F76    #  TIBETAN VOWEL SIGN VOCALIC R
+0F78    #  TIBETAN VOWEL SIGN VOCALIC L
+0F93    #  TIBETAN SUBJOINED LETTER GHA
+0F9D    #  TIBETAN SUBJOINED LETTER DDHA
+0FA2    #  TIBETAN SUBJOINED LETTER DHA
+0FA7    #  TIBETAN SUBJOINED LETTER BHA
+0FAC    #  TIBETAN SUBJOINED LETTER DZHA
+0FB9    #  TIBETAN SUBJOINED LETTER KSSA
+FB1D    #  HEBREW LETTER YOD WITH HIRIQ
+FB1F    #  HEBREW LIGATURE YIDDISH YOD YOD PATAH
+FB2A    #  HEBREW LETTER SHIN WITH SHIN DOT
+FB2B    #  HEBREW LETTER SHIN WITH SIN DOT
+FB2C    #  HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
+FB2D    #  HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
+FB2E    #  HEBREW LETTER ALEF WITH PATAH
+FB2F    #  HEBREW LETTER ALEF WITH QAMATS
+FB30    #  HEBREW LETTER ALEF WITH MAPIQ
+FB31    #  HEBREW LETTER BET WITH DAGESH
+FB32    #  HEBREW LETTER GIMEL WITH DAGESH
+FB33    #  HEBREW LETTER DALET WITH DAGESH
+FB34    #  HEBREW LETTER HE WITH MAPIQ
+FB35    #  HEBREW LETTER VAV WITH DAGESH
+FB36    #  HEBREW LETTER ZAYIN WITH DAGESH
+FB38    #  HEBREW LETTER TET WITH DAGESH
+FB39    #  HEBREW LETTER YOD WITH DAGESH
+FB3A    #  HEBREW LETTER FINAL KAF WITH DAGESH
+FB3B    #  HEBREW LETTER KAF WITH DAGESH
+FB3C    #  HEBREW LETTER LAMED WITH DAGESH
+FB3E    #  HEBREW LETTER MEM WITH DAGESH
+FB40    #  HEBREW LETTER NUN WITH DAGESH
+FB41    #  HEBREW LETTER SAMEKH WITH DAGESH
+FB43    #  HEBREW LETTER FINAL PE WITH DAGESH
+FB44    #  HEBREW LETTER PE WITH DAGESH
+FB46    #  HEBREW LETTER TSADI WITH DAGESH
+FB47    #  HEBREW LETTER QOF WITH DAGESH
+FB48    #  HEBREW LETTER RESH WITH DAGESH
+FB49    #  HEBREW LETTER SHIN WITH DAGESH
+FB4A    #  HEBREW LETTER TAV WITH DAGESH
+FB4B    #  HEBREW LETTER VAV WITH HOLAM
+FB4C    #  HEBREW LETTER BET WITH RAFE
+FB4D    #  HEBREW LETTER KAF WITH RAFE
+FB4E    #  HEBREW LETTER PE WITH RAFE
+
+# Total code points: 67
+
+# ================================================
+# (2) Post Composition Version precomposed characters
+#
+# These characters cannot be derived solely from the UnicodeData.txt file
+# in this version of Unicode.
+#
+# Note that characters added to the standard after the
+# Composition Version and which have canonical decomposition mappings
+# are not automatically added to this list of Post Composition
+# Version precomposed characters.
+# ================================================
+
+2ADC    #  FORKING
+1D15E   #  MUSICAL SYMBOL HALF NOTE
+1D15F   #  MUSICAL SYMBOL QUARTER NOTE
+1D160   #  MUSICAL SYMBOL EIGHTH NOTE
+1D161   #  MUSICAL SYMBOL SIXTEENTH NOTE
+1D162   #  MUSICAL SYMBOL THIRTY-SECOND NOTE
+1D163   #  MUSICAL SYMBOL SIXTY-FOURTH NOTE
+1D164   #  MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
+1D1BB   #  MUSICAL SYMBOL MINIMA
+1D1BC   #  MUSICAL SYMBOL MINIMA BLACK
+1D1BD   #  MUSICAL SYMBOL SEMIMINIMA WHITE
+1D1BE   #  MUSICAL SYMBOL SEMIMINIMA BLACK
+1D1BF   #  MUSICAL SYMBOL FUSA WHITE
+1D1C0   #  MUSICAL SYMBOL FUSA BLACK
+
+# Total code points: 14
+
+# ================================================
+# (3) Singleton Decompositions
+#
+# These characters can be derived from the UnicodeData.txt file
+# by including all canonically decomposable characters whose
+# canonical decomposition consists of a single character.
+#
+# These characters are simply quoted here for reference.
+# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
+# ================================================
+
+# 0340..0341       [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
+# 0343                 COMBINING GREEK KORONIS
+# 0374                 GREEK NUMERAL SIGN
+# 037E                 GREEK QUESTION MARK
+# 0387                 GREEK ANO TELEIA
+# 1F71                 GREEK SMALL LETTER ALPHA WITH OXIA
+# 1F73                 GREEK SMALL LETTER EPSILON WITH OXIA
+# 1F75                 GREEK SMALL LETTER ETA WITH OXIA
+# 1F77                 GREEK SMALL LETTER IOTA WITH OXIA
+# 1F79                 GREEK SMALL LETTER OMICRON WITH OXIA
+# 1F7B                 GREEK SMALL LETTER UPSILON WITH OXIA
+# 1F7D                 GREEK SMALL LETTER OMEGA WITH OXIA
+# 1FBB                 GREEK CAPITAL LETTER ALPHA WITH OXIA
+# 1FBE                 GREEK PROSGEGRAMMENI
+# 1FC9                 GREEK CAPITAL LETTER EPSILON WITH OXIA
+# 1FCB                 GREEK CAPITAL LETTER ETA WITH OXIA
+# 1FD3                 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+# 1FDB                 GREEK CAPITAL LETTER IOTA WITH OXIA
+# 1FE3                 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+# 1FEB                 GREEK CAPITAL LETTER UPSILON WITH OXIA
+# 1FEE..1FEF       [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
+# 1FF9                 GREEK CAPITAL LETTER OMICRON WITH OXIA
+# 1FFB                 GREEK CAPITAL LETTER OMEGA WITH OXIA
+# 1FFD                 GREEK OXIA
+# 2000..2001       [2] EN QUAD..EM QUAD
+# 2126                 OHM SIGN
+# 212A..212B       [2] KELVIN SIGN..ANGSTROM SIGN
+# 2329                 LEFT-POINTING ANGLE BRACKET
+# 232A                 RIGHT-POINTING ANGLE BRACKET
+# F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
+# FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10
+# FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12
+# FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
+# FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20
+# FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22
+# FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
+# FA2A..FA6D      [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
+# FA70..FAD9     [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
+# 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
+
+# Total code points: 1035
+
+# ================================================
+# (4) Non-Starter Decompositions
+#
+# These characters can be derived from the UnicodeData.txt file
+# by including each expanding canonical decomposition
+# (i.e., those which canonically decompose to a sequence
+# of characters instead of a single character), such that:
+#
+# A. The character is not a Starter.
+#
+# OR (inclusive)
+#
+# B. The character's canonical decomposition begins
+# with a character that is not a Starter.
+#
+# Note that a "Starter" is any character with a zero combining class.
+#
+# These characters are simply quoted here for reference.
+# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
+# ================================================
+
+# 0344                 COMBINING GREEK DIALYTIKA TONOS
+# 0F73                 TIBETAN VOWEL SIGN II
+# 0F75                 TIBETAN VOWEL SIGN UU
+# 0F81                 TIBETAN VOWEL SIGN REVERSED II
+
+# Total code points: 4
+
+# EOF
diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp

index fa2014f..8741bcc 100644 (file)
--- a/tools-for-build/ucd.lisp
+++ b/tools-for-build/ucd.lisp
@@ -97,7 +97,21 @@
                      (gethash (aref *misc-table* i) *misc-hash*))
                i)))
  
+(defvar *comp-table*)
+
+(defvar *exclusions*
+  (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt"
+                                    :defaults *unicode-character-database*))
+    (do ((line (read-line s nil nil) (read-line s nil nil))
+         result)
+        ((null line) result)
+      (when (and (> (length line) 0)
+                 (char/= (char line 0) #\#))
+        (push (parse-integer line :end (position #\Space line) :radix 16)
+              result)))))
+
  (defun slurp-ucd ()
+  (setf *comp-table* (make-hash-table :test 'equal))
    (setq *last-uppercase* nil)
    (setq *uppercase-transition-count* 0)
    (setq *different-titlecases* nil)
@@ -128,10 +142,24 @@
            while line
            do (slurp-ucd-line line)))
    (second-pass)
+  (fixup-compositions)
    (fixup-hangul-syllables)
    (build-misc-table)
    (length *long-decompositions*))
  
+(defun fixup-compositions ()
+  (flet ((fixup (k v)
+           (let* ((cp (car k))
+                  (ucd (aref (aref *ucd-base* (cp-high cp)) (cp-low cp)))
+                  (misc (aref *misc-table* (ucd-misc ucd)))
+                  (ccc-index (third misc)))
+             ;; we can do everything in the first pass except for
+             ;; accounting for decompositions where the first
+             ;; character of the decomposition is not a starter.
+             (when (/= ccc-index 0)
+               (remhash k *comp-table*)))))
+    (maphash #'fixup *comp-table*)))
+
  (defun fixup-hangul-syllables ()
    ;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
    (let* ((sbase #xac00)
@@ -268,6 +296,25 @@
                             (mapcar #'(lambda (string)
                                         (parse-integer string :radix 16))
                                     split)))
+                      (when (= decomposition-info 1)
+                        ;; Primary composition excludes:
+                        ;; * singleton decompositions;
+                        ;; * decompositions of non-starters;
+                        ;; * script-specific decompositions;
+                        ;; * later-version decompositions;
+                        ;; * decompositions whose first character is a
+                        ;;   non-starter.
+                        ;; All but the last case can be handled here;
+                        ;; for the fixup, see FIXUP-COMPOSITIONS
+                        (when (and (> (length decomposition) 1)
+                                   (= ccc-index 0)
+                                   (not (member code-point *exclusions*)))
+                          (unless (= (length decomposition) 2)
+                            (error "canonical decomposition unexpectedly long"))
+                          (setf (gethash (cons (first decomposition)
+                                               (second decomposition))
+                                         *comp-table*)
+                                code-point)))
                        (if (= (length decomposition) 1)
                            (cons 1 (car decomposition))
                            (cons (length decomposition)
@@ -316,19 +363,19 @@
                                       (- *page-size-exponent*)))
                      (page-end (ash code-point (- *page-size-exponent*))))
                  (loop for point from *block-first*
-                      below (ash page-start *page-size-exponent*)
-                      do (setf (aref (aref *ucd-base* (cp-high point))
-                                     (cp-low point))
-                               result))
+                   below (ash page-start *page-size-exponent*)
+                   do (setf (aref (aref *ucd-base* (cp-high point))
+                                  (cp-low point))
+                            result))
                  (loop for page from page-start below page-end
-                      do (setf (aref *ucd-base* page)
-                               (make-array (ash 1 *page-size-exponent*)
-                                           :initial-element result)))
+                   do (setf (aref *ucd-base* page)
+                            (make-array (ash 1 *page-size-exponent*)
+                                        :initial-element result)))
                  (loop for point from (ash page-end *page-size-exponent*)
-                      below code-point
-                      do (setf (aref (aref *ucd-base* (cp-high point))
-                                     (cp-low point))
-                               result))))
+                   below code-point
+                   do (setf (aref (aref *ucd-base* (cp-high point))
+                                  (cp-low point))
+                            result))))
              (values result (normalize-character-name name)))))))
  
  (defun slurp-ucd-line (line)
@@ -350,31 +397,32 @@
  ;;; inverses, which is not true in general in Unicode even for
  ;;; characters which change case to single characters.
  (defun second-pass ()
-  (loop for i from 0 below (length *ucd-base*)
-        when (aref *ucd-base* i)
-        do (loop for j from 0 below (length (aref *ucd-base* i))
-                 for result = (aref (aref *ucd-base* i) j)
-                 when result
-                 when (let* ((transform-point (ucd-transform result))
-                             (transform-high (ash transform-point
-                                                  (- *page-size-exponent*)))
-                             (transform-low (ldb (byte *page-size-exponent* 0)
-                                                 transform-point)))
-                        (and (plusp transform-point)
-                             (/= (ucd-transform
-                                  (aref (aref *ucd-base* transform-high)
-                                        transform-low))
-                                 (+ (ash i *page-size-exponent*) j))))
-                 do (destructuring-bind (gc-index bidi-index ccc-index
-                                         decimal-digit digit bidi-mirrored
-                                         cl-both-case-p decomposition-info)
+  (dotimes (i (length *ucd-base*))
+    (let ((base (aref *ucd-base* i)))
+      (dotimes (j (length base)) ; base is NIL or an array
+        (let ((result (aref base j)))
+          (when result
+            ;; fixup case mappings for CL/Unicode mismatch
+            (let* ((transform-point (ucd-transform result))
+                   (transform-high (ash transform-point
+                                        (- *page-size-exponent*)))
+                   (transform-low (ldb (byte *page-size-exponent* 0)
+                                       transform-point)))
+              (when (and (plusp transform-point)
+                         (/= (ucd-transform
+                              (aref (aref *ucd-base* transform-high)
+                                    transform-low))
+                             (+ (ash i *page-size-exponent*) j)))
+                (destructuring-bind (gc-index bidi-index ccc-index
+                                     decimal-digit digit bidi-mirrored
+                                     cl-both-case-p decomposition-info)
                          (aref *misc-table* (ucd-misc result))
                        (declare (ignore cl-both-case-p))
                        (format t "~A~%" (+ (ash i *page-size-exponent*) j))
                        (setf (ucd-misc result)
                              (hash-misc gc-index bidi-index ccc-index
                                         decimal-digit digit bidi-mirrored
-                                       nil decomposition-info))))))
+                                       nil decomposition-info)))))))))))
  
  (defun write-4-byte (quadruplet stream)
    (write-byte (ldb (byte 8 24) quadruplet) stream)
@@ -387,7 +435,7 @@
        255
        (parse-integer digit)))
  
-(defun output ()
+(defun output-ucd-data ()
    (let ((hash (make-hash-table :test #'equalp))
          (index 0))
      (loop for page across *ucd-base*
@@ -445,17 +493,19 @@
                                (1- (length *misc-table*)))
                            (byte 11 21)
                            (if entry (ucd-transform entry) 0))
-                     stream))))))
-  ;; KLUDGE: this code, to write out decomposition information, is a
-  ;; little bit very similar to the ucd entries above.  Try factoring
-  ;; out the common stuff?
+                     stream)))))))
+
+;;; KLUDGE: this code, to write out decomposition information, is a
+;;; little bit very similar to the ucd entries above.  Try factoring
+;;; out the common stuff?
+(defun output-decomposition-data ()
    (let ((hash (make-hash-table :test #'equalp))
          (index 0))
      (loop for page across *decomposition-base*
-          do (when page
-               (unless (gethash page hash)
-                 (setf (gethash page hash)
-                       (prog1 index (incf index))))))
+       do (when page
+            (unless (gethash page hash)
+              (setf (gethash page hash)
+                    (prog1 index (incf index))))))
      (let ((array (make-array index)))
        (maphash #'(lambda (key value)
                     (setf (aref array value) key))
@@ -482,7 +532,31 @@
                                :if-exists :supersede
                                :if-does-not-exist :create)
          (loop for code across (copy-seq *long-decompositions*)
-           do (write-4-byte code stream)))))
+           do (write-4-byte code stream))))))
+
+(defun output-composition-data ()
+  #+nil ; later
+  (let (firsts seconds)
+    (flet ((frob (k v)
+             (declare (ignore v))
+             (pushnew (car k) firsts)
+             (pushnew (cdr k) seconds)))
+      (maphash #'frob *comp-table*)))
+  (with-open-file (stream (make-pathname :name "comp" :type "dat"
+                                         :defaults *output-directory*)
+                          :direction :output
+                          :element-type '(unsigned-byte 8)
+                          :if-exists :supersede :if-does-not-exist :create)
+    (maphash (lambda (k v)
+               (write-4-byte (car k) stream)
+               (write-4-byte (cdr k) stream)
+               (write-4-byte v stream))
+             *comp-table*)))
+
+(defun output ()
+  (output-ucd-data)
+  (output-decomposition-data)
+  (output-composition-data)
    (with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
                                      :defaults *output-directory*)
                       :direction :output
@@ -492,8 +566,8 @@
        (write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
        (maphash (lambda (code name)
                   (when name
-                  (print code f)
-                  (prin1 name f)))
+                   (print code f)
+                   (prin1 name f)))
                 *unicode-names*))
      (setf *unicode-names* nil))
    (with-open-file (*standard-output*
@@ -527,18 +601,18 @@
      (with-standard-io-syntax
        (let ((*print-pretty* t))
          (prin1 `(:length ,(length *misc-table*)
-                 :uppercase ,(loop for (gc-index) across *misc-table*
-                                for i from 0
-                                when (= gc-index 0)
-                                collect i)
-                 :lowercase ,(loop for (gc-index) across *misc-table*
-                                for i from 0
-                                when (= gc-index 1)
-                                collect i)
-                 :titlecase ,(loop for (gc-index) across *misc-table*
-                                for i from 0
-                                when (= gc-index 2)
-                                collect i))))))
+                         :uppercase ,(loop for (gc-index) across *misc-table*
+                                        for i from 0
+                                        when (= gc-index 0)
+                                        collect i)
+                         :lowercase ,(loop for (gc-index) across *misc-table*
+                                        for i from 0
+                                        when (= gc-index 1)
+                                        collect i)
+                         :titlecase ,(loop for (gc-index) across *misc-table*
+                                        for i from 0
+                                        when (= gc-index 2)
+                                        collect i))))))
    (values))
  
  ;;; Use of the generated files
author	Christophe Rhodes <csr21@cantab.net>
	Sat, 18 May 2013 12:37:55 +0000 (13:37 +0100)
committer	Christophe Rhodes <csr21@cantab.net>
	Sat, 18 May 2013 19:04:05 +0000 (20:04 +0100)
src/code/target-char.lisp		patch \| blob \| history
tests/unicode-normalization.impure.lisp		patch \| blob \| history
tools-for-build/CompositionExclusions.txt	[new file with mode: 0644]	patch \| blob
tools-for-build/ucd.lisp		patch \| blob \| history