(char= char2 #\combining_acute_accent))
#\latin_small_letter_e_with_acute))
-;;; generic sequences. *sigh*.
+;;; This implements a sequence data structure, specialized for
+;;; efficient deletion of characters at an index, along with tolerable
+;;; random access. The purpose is to support the canonical
+;;; composition algorithm from Unicode, which involves replacing (not
+;;; necessarily consecutive) pairs of code points with a single code
+;;; point (e.g. [#\e #\combining_acute_accent] with
+;;; #\latin_small_letter_e_with_acute). The data structure is a list
+;;; of three-element lists, each denoting a chunk of string data that
+;;; runs from its first element (start index) to its second (end index).
+;;;
+;;; Actually, the implementation isn't particularly efficient, and
+;;; would probably benefit from being rewritten in terms of displaced
+;;; arrays, which would substantially reduce copying.
+;;;
+;;; (also, generic sequences. *sigh*.)
(defun lref (lstring index)
(dolist (l lstring)
(when (and (<= (first l) index)
(tagbody
again
(when (and (> (- i previous-starter-index) 2)
- (= (ucd-ccc (lref result i)) (ucd-ccc (lref result (1- i)))))
+ ;; test for Blocked (Unicode 3.11 para. D115)
+ (>= (ucd-ccc (lref result (1- i)))
+ (ucd-ccc (lref result i))))
(when (= (ucd-ccc (lref result i)) 0)
(setf previous-starter-index i))
(incf i)
(defun normalize-string (string &optional (form :nfd))
(declare (type (member :nfd :nfkd :nfc :nfkc) form))
(etypecase string
- (simple-base-string string)
- ((simple-array character (*))
+ #!+sb-unicode
+ (base-string string)
+ ((or (array character (*)) #!-sb-unicode base-string)
(ecase form
((:nfd)
(sort-combiners (decompose-string string)))
((:nfkd)
(sort-combiners (decompose-string string :compatibility)))))
- ((simple-array nil (*)) string)))
+ ((array nil (*)) string)))