X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=src%2Fcode%2Ftarget-char.lisp;h=e217f8e8dbcb04925974211b044db475512328f0;hb=88429c4045707ceaf99a10801d5c5efdca765afa;hp=77ec7b2432d76e5d261d9e8a15ad77c97f9d2d14;hpb=4aad92d800164f49530ad5f2bb07d81f61e2911b;p=sbcl.git diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp index 77ec7b2..e217f8e 100644 --- a/src/code/target-char.lisp +++ b/src/code/target-char.lisp @@ -51,6 +51,24 @@ (defglobal **character-database** ,character-database) (defglobal **character-decompositions** ,decompositions) (defglobal **character-long-decompositions** ,long-decompositions) + (defglobal **character-primary-compositions** + (let ((table (make-hash-table)) + (info ,(read-ub8-vector (file "comp" "dat")))) + (flet ((code (j) + (dpb (aref info (* 4 j)) + (byte 8 24) + (dpb (aref info (+ (* 4 j) 1)) + (byte 8 16) + (dpb (aref info (+ (* 4 j) 2)) + (byte 8 8) + (aref info (+ (* 4 j) 3))))))) + #!+sb-unicode + (dotimes (i (/ (length info) 12)) + (setf (gethash (dpb (code (* 3 i)) (byte 21 21) + (code (1+ (* 3 i)))) + table) + (code-char (code (+ (* 3 i) 2))))) + table))) (defun !character-database-cold-init () (setf **character-database** ,character-database)) ,(with-open-file (stream (file "ucd-names" "lisp-expr") @@ -167,7 +185,7 @@ ;;;; UCD accessor functions -;;; The first (* 8 395) => 3160 entries in **CHARACTER-DATABASE** +;;; The first (* 8 396) => 3168 entries in **CHARACTER-DATABASE** ;;; contain entries for the distinct character attributes: ;;; specifically, indexes into the GC kinds, Bidi kinds, CCC kinds, ;;; the decimal digit property, the digit property and the @@ -194,12 +212,12 @@ ;;; ;;; To look up information about a character, take the high 13 bits of ;;; its code point, and index the character database with that and a -;;; base of 3160 (going past the miscellaneous information[*], so +;;; base of 3168 (going past the miscellaneous information[*], so ;;; treating (a) as the start of the array). This, labelled A, gives ;;; us another index into the detailed pages[-], which we can use to ;;; look up the details for the character in question: we add the low ;;; 8 bits of the character, shifted twice (because we have four-byte -;;; table entries) to 1024 times the `page' index, with a base of 6088 +;;; table entries) to 1024 times the `page' index, with a base of 7520 ;;; to skip over everything else. This gets us to point B. If we're ;;; after a transformed code point (i.e. an upcase or downcase ;;; operation), we can simply read it off now, beginning with an @@ -224,8 +242,8 @@ (defun ucd-index (char) (let* ((cp (char-code char)) (cp-high (ash cp -8)) - (page (aref **character-database** (+ 3160 cp-high)))) - (+ 7512 (ash page 10) (ash (ldb (byte 8 0) cp) 2)))) + (page (aref **character-database** (+ 3168 cp-high)))) + (+ 7520 (ash page 10) (ash (ldb (byte 8 0) cp) 2)))) (declaim (ftype (sfunction (t) (unsigned-byte 11)) ucd-value-0)) (defun ucd-value-0 (char) @@ -495,7 +513,7 @@ is either numeric or alphabetic." (defmacro equal-char-code (character) (let ((ch (gensym))) `(let ((,ch ,character)) - (if (= (ucd-value-0 ,ch) 0) + (if (< (ucd-value-0 ,ch) 4) (ucd-value-1 ,ch) (char-code ,ch))))) @@ -685,7 +703,14 @@ character exists." ((null end) (push (subseq string start end) result)) (unless (= start end) (push (subseq string start end) result)) - (push (decompose-char (char string end)) result)) + ;; FIXME: this recursive call to DECOMPOSE-STRING is necessary + ;; for correctness given our direct encoding of the + ;; decomposition data in UnicodeData.txt. It would, however, + ;; be straightforward enough to perform the recursion in table + ;; construction, and then have this simply revert to a single + ;; lookup. (Wait for tests to be hooked in, then implement). + (push (decompose-string (decompose-char (char string end)) kind) + result)) (apply 'concatenate 'string (nreverse result))))) (defun sort-combiners (string) @@ -703,13 +728,37 @@ character exists." (go again))) (apply 'concatenate 'string (nreverse result)))) -#+nil (defun primary-composition (char1 char2) - (when (and (char= char1 #\e) - (char= char2 #\combining_acute_accent)) - #\latin_small_letter_e_with_acute)) - -;;; generic sequences. *sigh*. + (let ((c1 (char-code char1)) + (c2 (char-code char2))) + (cond + ((gethash (dpb (char-code char1) (byte 21 21) (char-code char2)) + **character-primary-compositions**)) + ((and (<= #x1100 c1) (<= c1 #x1112) + (<= #x1161 c2) (<= c2 #x1175)) + (let ((lindex (- c1 #x1100)) + (vindex (- c2 #x1161))) + (code-char (+ #xac00 (* lindex 588) (* vindex 28))))) + ((and (<= #xac00 c1) (<= c1 #.(+ #xac00 11171)) + (<= #x11a8 c2) (<= c2 #x11c2) + (= 0 (rem (- c1 #xac00) 28))) + (code-char (+ c1 (- c2 #x11a7))))))) + +;;; This implements a sequence data structure, specialized for +;;; efficient deletion of characters at an index, along with tolerable +;;; random access. The purpose is to support the canonical +;;; composition algorithm from Unicode, which involves replacing (not +;;; necessarily consecutive) pairs of code points with a single code +;;; point (e.g. [#\e #\combining_acute_accent] with +;;; #\latin_small_letter_e_with_acute). The data structure is a list +;;; of three-element lists, each denoting a chunk of string data +;;; starting at the first index and ending at the second. +;;; +;;; Actually, the implementation isn't particularly efficient, and +;;; would probably benefit from being rewritten in terms of displaced +;;; arrays, which would substantially reduce copying. +;;; +;;; (also, generic sequences. *sigh*.) (defun lref (lstring index) (dolist (l lstring) (when (and (<= (first l) index) @@ -753,13 +802,18 @@ character exists." (labels () (let* ((result (list (list 0 (length string) string))) (previous-starter-index (position 0 string :key #'ucd-ccc)) - (i (1+ previous-starter-index))) - (when (= i (length string)) + (i (and previous-starter-index (1+ previous-starter-index)))) + (when (or (not i) (= i (length string))) (return-from canonically-compose string)) (tagbody again - (when (and (> (- i previous-starter-index) 2) - (= (ucd-ccc (lref result i)) (ucd-ccc (lref result (1- i))))) + (when (and (>= (- i previous-starter-index) 2) + ;; test for Blocked (Unicode 3.11 para. D115) + ;; + ;; (assumes here that string has sorted combiners, + ;; so can look back just one step) + (>= (ucd-ccc (lref result (1- i))) + (ucd-ccc (lref result i)))) (when (= (ucd-ccc (lref result i)) 0) (setf previous-starter-index i)) (incf i) @@ -784,12 +838,24 @@ character exists." (defun normalize-string (string &optional (form :nfd)) (declare (type (member :nfd :nfkd :nfc :nfkc) form)) + #!-sb-unicode + (etypecase string + ((array nil (*)) string) + (string + (ecase form + ((:nfc :nfkc) string) + ((:nfd :nfkd) (error "Cannot normalize to ~A form in #-SB-UNICODE builds" form))))) + #!+sb-unicode (etypecase string - (simple-base-string string) - ((simple-array character (*)) + (base-string string) + ((array character (*)) (ecase form + ((:nfc) + (canonically-compose (sort-combiners (decompose-string string)))) ((:nfd) (sort-combiners (decompose-string string))) + ((:nfkc) + (canonically-compose (sort-combiners (decompose-string string :compatibility)))) ((:nfkd) (sort-combiners (decompose-string string :compatibility))))) - ((simple-array nil (*)) string))) + ((array nil (*)) string)))