(dpb v2 (byte 8 8) v3))))
(if (= length 1)
(string (code-char entry))
- (let ((result (make-string length))
- (e (* 4 entry)))
- (dotimes (i length result)
- (let ((code (dpb (aref long-decompositions (+ e 1))
- (byte 8 16)
- (dpb (aref long-decompositions (+ e 2))
- (byte 8 8)
- (aref long-decompositions (+ e 3))))))
- (setf (char result i) (code-char code)))
- (incf e 4))))))
+ (if (<= #xac00 cp #xd7a3)
+ ;; see Unicode 6.2, section 3-12
+ (let* ((sbase #xac00)
+ (lbase #x1100)
+ (vbase #x1161)
+ (tbase #x11a7)
+ (lcount 19)
+ (vcount 21)
+ (tcount 28)
+ (ncount (* vcount tcount))
+ (scount (* lcount ncount))
+ (sindex (- cp sbase))
+ (lindex (floor sindex ncount))
+ (vindex (floor (mod sindex ncount) tcount))
+ (tindex (mod sindex tcount))
+ (result (make-string length)))
+ (declare (ignore scount))
+ (setf (char result 0) (code-char (+ lbase lindex)))
+ (setf (char result 1) (code-char (+ vbase vindex)))
+ (when (> tindex 0)
+ (setf (char result 2) (code-char (+ tbase tindex))))
+ result)
+ (let ((result (make-string length))
+ (e (* 4 entry)))
+ (dotimes (i length result)
+ (let ((code (dpb (aref long-decompositions (+ e 1))
+ (byte 8 16)
+ (dpb (aref long-decompositions (+ e 2))
+ (byte 8 8)
+ (aref long-decompositions (+ e 3))))))
+ (setf (char result i) (code-char code)))
+ (incf e 4)))))))
(defun decompose-char (char)
(if (= (char-decomposition-info char) 0)
(char= char2 #\combining_acute_accent))
#\latin_small_letter_e_with_acute))
-;;; generic sequences. *sigh*.
+;;; This implements a sequence data structure, specialized for
+;;; efficient deletion of characters at an index, along with tolerable
+;;; random access. The purpose is to support the canonical
+;;; composition algorithm from Unicode, which involves replacing (not
+;;; necessarily consecutive) pairs of code points with a single code
+;;; point (e.g. [#\e #\combining_acute_accent] with
+;;; #\latin_small_letter_e_with_acute). The data structure is a list
+;;; of three-element lists, each denoting a chunk of string data
+;;; starting at the first index and ending at the second.
+;;;
+;;; Actually, the implementation isn't particularly efficient, and
+;;; would probably benefit from being rewritten in terms of displaced
+;;; arrays, which would substantially reduce copying.
+;;;
+;;; (also, generic sequences. *sigh*.)
(defun lref (lstring index)
(dolist (l lstring)
(when (and (<= (first l) index)
(tagbody
again
(when (and (> (- i previous-starter-index) 2)
- (= (ucd-ccc (lref result i)) (ucd-ccc (lref result (1- i)))))
+ ;; test for Blocked (Unicode 3.11 para. D115)
+ (>= (ucd-ccc (lref result (1- i)))
+ (ucd-ccc (lref result i))))
(when (= (ucd-ccc (lref result i)) 0)
(setf previous-starter-index i))
(incf i)
(defun normalize-string (string &optional (form :nfd))
(declare (type (member :nfd :nfkd :nfc :nfkc) form))
(etypecase string
- (simple-base-string string)
- ((simple-array character (*))
+ #!+sb-unicode
+ (base-string string)
+ ((or (array character (*)) #!-sb-unicode base-string)
(ecase form
((:nfd)
(sort-combiners (decompose-string string)))
((:nfkd)
(sort-combiners (decompose-string string :compatibility)))))
- ((simple-array nil (*)) string)))
+ ((array nil (*)) string)))