X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=src%2Fcode%2Ftarget-char.lisp;h=ef51c23d3496e54acdb1f68924597570b03523fd;hb=93f6ccd997abd7f4fcefeec1e4383e0249f0df01;hp=0c2b5f7568ce277026bd3a6ee096936ec63dc05c;hpb=026be5a30130cdb1bc4648fa7daea8e1180a6e46;p=sbcl.git diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp index 0c2b5f7..ef51c23 100644 --- a/src/code/target-char.lisp +++ b/src/code/target-char.lisp @@ -254,6 +254,9 @@ (+ 3 (* 8 (ucd-value-0 char)))))) (when (< decimal-digit 10) decimal-digit))) +(declaim (ftype (sfunction (t) (unsigned-byte 8)) ucd-ccc)) +(defun ucd-ccc (char) + (aref **character-database** (+ 2 (* 8 (ucd-value-0 char))))) (defun char-code (char) #!+sb-doc @@ -617,7 +620,7 @@ character exists." (decompositions **character-decompositions**) (long-decompositions **character-long-decompositions**) (index (+ #x1100 - (ash (aref decompositions cp-high) 10) + (ash (aref decompositions cp-high) 10) (ash (ldb (byte 8 0) cp) 2))) (v0 (aref decompositions index)) (v1 (aref decompositions (+ index 1))) @@ -628,18 +631,182 @@ character exists." (dpb v2 (byte 8 8) v3)))) (if (= length 1) (string (code-char entry)) - (let ((result (make-string length)) - (e (* 4 entry))) - (dotimes (i length result) - (let ((code (dpb (aref long-decompositions (+ e 1)) - (byte 8 16) - (dpb (aref long-decompositions (+ e 2)) - (byte 8 8) - (aref long-decompositions (+ e 3)))))) - (setf (char result i) (code-char code))) - (incf e 4)))))) + (if (<= #xac00 cp #xd7a3) + ;; see Unicode 6.2, section 3-12 + (let* ((sbase #xac00) + (lbase #x1100) + (vbase #x1161) + (tbase #x11a7) + (lcount 19) + (vcount 21) + (tcount 28) + (ncount (* vcount tcount)) + (scount (* lcount ncount)) + (sindex (- cp sbase)) + (lindex (floor sindex ncount)) + (vindex (floor (mod sindex ncount) tcount)) + (tindex (mod sindex tcount)) + (result (make-string length))) + (declare (ignore scount)) + (setf (char result 0) (code-char (+ lbase lindex))) + (setf (char result 1) (code-char (+ vbase vindex))) + (when (> tindex 0) + (setf (char result 2) (code-char (+ tbase tindex)))) + result) + (let ((result (make-string length)) + (e (* 4 entry))) + (dotimes (i length result) + (let ((code (dpb (aref long-decompositions (+ e 1)) + (byte 8 16) + (dpb (aref long-decompositions (+ e 2)) + (byte 8 8) + (aref long-decompositions (+ e 3)))))) + (setf (char result i) (code-char code))) + (incf e 4))))))) (defun decompose-char (char) (if (= (char-decomposition-info char) 0) (string char) (char-decomposition char))) + +(defun decompose-string (string &optional (kind :canonical)) + (declare (type (member :canonical :compatibility) kind)) + (flet ((canonical (char) + (= 1 (char-decomposition-info char))) + (compat (char) + (/= 0 (char-decomposition-info char)))) + (let (result + (fun (ecase kind + (:canonical #'canonical) + (:compatibility #'compat)))) + (do* ((start 0 (1+ end)) + (end (position-if fun string :start start) + (position-if fun string :start start))) + ((null end) (push (subseq string start end) result)) + (unless (= start end) + (push (subseq string start end) result)) + (push (decompose-char (char string end)) result)) + (apply 'concatenate 'string (nreverse result))))) + +(defun sort-combiners (string) + (let (result (start 0) first-cc first-non-cc) + (tagbody + again + (setf first-cc (position 0 string :key #'ucd-ccc :test #'/= :start start)) + (when first-cc + (setf first-non-cc (position 0 string :key #'ucd-ccc :test #'= :start first-cc))) + (push (subseq string start first-cc) result) + (when first-cc + (push (stable-sort (subseq string first-cc first-non-cc) #'< :key #'ucd-ccc) result)) + (when first-non-cc + (setf start first-non-cc first-cc nil first-non-cc nil) + (go again))) + (apply 'concatenate 'string (nreverse result)))) + +#+nil +(defun primary-composition (char1 char2) + (when (and (char= char1 #\e) + (char= char2 #\combining_acute_accent)) + #\latin_small_letter_e_with_acute)) + +;;; This implements a sequence data structure, specialized for +;;; efficient deletion of characters at an index, along with tolerable +;;; random access. The purpose is to support the canonical +;;; composition algorithm from Unicode, which involves replacing (not +;;; necessarily consecutive) pairs of code points with a single code +;;; point (e.g. [#\e #\combining_acute_accent] with +;;; #\latin_small_letter_e_with_acute). The data structure is a list +;;; of three-element lists, each denoting a chunk of string data +;;; starting at the first index and ending at the second. +;;; +;;; Actually, the implementation isn't particularly efficient, and +;;; would probably benefit from being rewritten in terms of displaced +;;; arrays, which would substantially reduce copying. +;;; +;;; (also, generic sequences. *sigh*.) +(defun lref (lstring index) + (dolist (l lstring) + (when (and (<= (first l) index) + (< index (second l))) + (return (aref (third l) (- index (first l))))))) +(defun (setf lref) (newchar lstring index) + (dolist (l lstring) + (when (and (<= (first l) index) + (< index (second l))) + (return (setf (aref (third l) (- index (first l))) newchar))))) +(defun llength (lstring) + (second (first (last lstring)))) +(defun lstring (lstring) + (let ((result (make-string (llength lstring)))) + (dolist (l lstring result) + (replace result (third l) :start1 (first l) :end1 (second l))))) +(defun ldelete (lstring index) + (do* ((ls lstring (cdr ls)) + (l (car ls) (car ls)) + so-fars) + ((and (<= (first l) index) + (< index (second l))) + (append + (nreverse so-fars) + (cond + ((= (first l) index) + (list (list (first l) (1- (second l)) (subseq (third l) 1)))) + ((= index (1- (second l))) + (list (list (first l) (1- (second l)) (subseq (third l) 0 (1- (length (third l))))))) + (t + (list + (list (first l) index + (subseq (third l) 0 (- index (first l)))) + (list index (1- (second l)) + (subseq (third l) (1+ (- index (first l)))))))) + (mapcar (lambda (x) (list (1- (first x)) (1- (second x)) (third x))) + (cdr ls)))) + (push l so-fars))) + +(defun canonically-compose (string) + (labels () + (let* ((result (list (list 0 (length string) string))) + (previous-starter-index (position 0 string :key #'ucd-ccc)) + (i (1+ previous-starter-index))) + (when (= i (length string)) + (return-from canonically-compose string)) + (tagbody + again + (when (and (> (- i previous-starter-index) 2) + ;; test for Blocked (Unicode 3.11 para. D115) + (>= (ucd-ccc (lref result (1- i))) + (ucd-ccc (lref result i)))) + (when (= (ucd-ccc (lref result i)) 0) + (setf previous-starter-index i)) + (incf i) + (go next)) + + (let ((comp (primary-composition (lref result previous-starter-index) + (lref result i)))) + (cond + (comp + (setf (lref result previous-starter-index) comp) + (setf result (ldelete result i))) + (t + (when (= (ucd-ccc (lref result i)) 0) + (setf previous-starter-index i)) + (incf i)))) + next + (unless (= i (llength result)) + (go again))) + (if (= i (length string)) + string + (lstring result))))) + +(defun normalize-string (string &optional (form :nfd)) + (declare (type (member :nfd :nfkd :nfc :nfkc) form)) + (etypecase string + #!+sb-unicode + (base-string string) + ((or (array character (*)) #!-sb-unicode base-string) + (ecase form + ((:nfd) + (sort-combiners (decompose-string string))) + ((:nfkd) + (sort-combiners (decompose-string string :compatibility))))) + ((array nil (*)) string)))