fix test for Blocked condition in canonical normalization

[sbcl.git] / src / code / target-char.lisp
diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp

index 6d02245..ef51c23 100644 (file)
--- a/src/code/target-char.lisp
+++ b/src/code/target-char.lisp
@@ -631,16 +631,38 @@ character exists."
                       (dpb v2 (byte 8 8) v3))))
      (if (= length 1)
          (string (code-char entry))
-        (let ((result (make-string length))
-              (e (* 4 entry)))
-          (dotimes (i length result)
-            (let ((code (dpb (aref long-decompositions (+ e 1))
-                             (byte 8 16)
-                             (dpb (aref long-decompositions (+ e 2))
-                                  (byte 8 8)
-                                  (aref long-decompositions (+ e 3))))))
-              (setf (char result i) (code-char code)))
-            (incf e 4))))))
+        (if (<= #xac00 cp #xd7a3)
+            ;; see Unicode 6.2, section 3-12
+            (let* ((sbase #xac00)
+                   (lbase #x1100)
+                   (vbase #x1161)
+                   (tbase #x11a7)
+                   (lcount 19)
+                   (vcount 21)
+                   (tcount 28)
+                   (ncount (* vcount tcount))
+                   (scount (* lcount ncount))
+                   (sindex (- cp sbase))
+                   (lindex (floor sindex ncount))
+                   (vindex (floor (mod sindex ncount) tcount))
+                   (tindex (mod sindex tcount))
+                   (result (make-string length)))
+              (declare (ignore scount))
+              (setf (char result 0) (code-char (+ lbase lindex)))
+              (setf (char result 1) (code-char (+ vbase vindex)))
+              (when (> tindex 0)
+                (setf (char result 2) (code-char (+ tbase tindex))))
+              result)
+            (let ((result (make-string length))
+                  (e (* 4 entry)))
+              (dotimes (i length result)
+                (let ((code (dpb (aref long-decompositions (+ e 1))
+                                 (byte 8 16)
+                                 (dpb (aref long-decompositions (+ e 2))
+                                      (byte 8 8)
+                                      (aref long-decompositions (+ e 3))))))
+                  (setf (char result i) (code-char code)))
+                (incf e 4)))))))
  
  (defun decompose-char (char)
    (if (= (char-decomposition-info char) 0)
@@ -687,7 +709,21 @@ character exists."
               (char= char2 #\combining_acute_accent))
      #\latin_small_letter_e_with_acute))
  
-;;; generic sequences.  *sigh*.
+;;; This implements a sequence data structure, specialized for
+;;; efficient deletion of characters at an index, along with tolerable
+;;; random access.  The purpose is to support the canonical
+;;; composition algorithm from Unicode, which involves replacing (not
+;;; necessarily consecutive) pairs of code points with a single code
+;;; point (e.g. [#\e #\combining_acute_accent] with
+;;; #\latin_small_letter_e_with_acute).  The data structure is a list
+;;; of three-element lists, each denoting a chunk of string data
+;;; starting at the first index and ending at the second.
+;;;
+;;; Actually, the implementation isn't particularly efficient, and
+;;; would probably benefit from being rewritten in terms of displaced
+;;; arrays, which would substantially reduce copying.
+;;;
+;;; (also, generic sequences.  *sigh*.)
  (defun lref (lstring index)
    (dolist (l lstring)
      (when (and (<= (first l) index)
@@ -737,7 +773,9 @@ character exists."
        (tagbody
         again
           (when (and (> (- i previous-starter-index) 2)
-                    (= (ucd-ccc (lref result i)) (ucd-ccc (lref result (1- i)))))
+                    ;; test for Blocked (Unicode 3.11 para. D115)
+                    (>= (ucd-ccc (lref result (1- i)))
+                        (ucd-ccc (lref result i))))
             (when (= (ucd-ccc (lref result i)) 0)
               (setf previous-starter-index i))
             (incf i)
@@ -763,11 +801,12 @@ character exists."
  (defun normalize-string (string &optional (form :nfd))
    (declare (type (member :nfd :nfkd :nfc :nfkc) form))
    (etypecase string
-    (simple-base-string string)
-    ((simple-array character (*))
+    #!+sb-unicode
+    (base-string string)
+    ((or (array character (*)) #!-sb-unicode base-string)
       (ecase form
         ((:nfd)
          (sort-combiners (decompose-string string)))
         ((:nfkd)
          (sort-combiners (decompose-string string :compatibility)))))
-    ((simple-array nil (*)) string)))
+    ((array nil (*)) string)))