fix test for Blocked condition in canonical normalization

[sbcl.git] / src / code / target-char.lisp
diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp

index 77ec7b2..ef51c23 100644 (file)
--- a/src/code/target-char.lisp
+++ b/src/code/target-char.lisp
@@ -709,7 +709,21 @@ character exists."
               (char= char2 #\combining_acute_accent))
      #\latin_small_letter_e_with_acute))
  
-;;; generic sequences.  *sigh*.
+;;; This implements a sequence data structure, specialized for
+;;; efficient deletion of characters at an index, along with tolerable
+;;; random access.  The purpose is to support the canonical
+;;; composition algorithm from Unicode, which involves replacing (not
+;;; necessarily consecutive) pairs of code points with a single code
+;;; point (e.g. [#\e #\combining_acute_accent] with
+;;; #\latin_small_letter_e_with_acute).  The data structure is a list
+;;; of three-element lists, each denoting a chunk of string data whose
+;;; first element is its start index and whose second is its end index.
+;;;
+;;; Actually, the implementation isn't particularly efficient, and
+;;; would probably benefit from being rewritten in terms of displaced
+;;; arrays, which would substantially reduce copying.
+;;;
+;;; (also, generic sequences.  *sigh*.)
  (defun lref (lstring index)
    (dolist (l lstring)
      (when (and (<= (first l) index)
@@ -759,7 +773,9 @@ character exists."
        (tagbody
         again
           (when (and (> (- i previous-starter-index) 2)
-                    (= (ucd-ccc (lref result i)) (ucd-ccc (lref result (1- i)))))
+                    ;; test for Blocked (Unicode Standard sec. 3.11, def. D115)
+                    (>= (ucd-ccc (lref result (1- i)))
+                        (ucd-ccc (lref result i))))
             (when (= (ucd-ccc (lref result i)) 0)
               (setf previous-starter-index i))
             (incf i)
@@ -785,11 +801,12 @@ character exists."
  (defun normalize-string (string &optional (form :nfd))
    (declare (type (member :nfd :nfkd :nfc :nfkc) form))
    (etypecase string
-    (simple-base-string string)
-    ((simple-array character (*))
+    #!+sb-unicode
+    (base-string string)
+    ((or (array character (*)) #!-sb-unicode base-string)
       (ecase form
         ((:nfd)
          (sort-combiners (decompose-string string)))
         ((:nfkd)
          (sort-combiners (decompose-string string :compatibility)))))
-    ((simple-array nil (*)) string)))
+    ((array nil (*)) string)))