(defglobal **character-database** ,character-database)
(defglobal **character-decompositions** ,decompositions)
(defglobal **character-long-decompositions** ,long-decompositions)
+ (defglobal **character-primary-compositions**
+ (let ((table (make-hash-table))
+ (info ,(read-ub8-vector (file "comp" "dat"))))
+ (flet ((code (j)
+ (dpb (aref info (* 4 j))
+ (byte 8 24)
+ (dpb (aref info (+ (* 4 j) 1))
+ (byte 8 16)
+ (dpb (aref info (+ (* 4 j) 2))
+ (byte 8 8)
+ (aref info (+ (* 4 j) 3)))))))
+ (dotimes (i (/ (length info) 12) table)
+ (setf (gethash (dpb (code (* 3 i)) (byte 21 21)
+ (code (1+ (* 3 i))))
+ table)
+ (code-char (code (+ (* 3 i) 2))))))))
(defun !character-database-cold-init ()
(setf **character-database** ,character-database))
,(with-open-file (stream (file "ucd-names" "lisp-expr")
(go again)))
(apply 'concatenate 'string (nreverse result))))
-#+nil
(defun primary-composition (char1 char2)
- (when (and (char= char1 #\e)
- (char= char2 #\combining_acute_accent))
- #\latin_small_letter_e_with_acute))
+ (let ((c1 (char-code char1))
+ (c2 (char-code char2)))
+ (cond
+ ((gethash (dpb (char-code char1) (byte 21 21) (char-code char2))
+ **character-primary-compositions**))
+ ((and (<= #x1100 c1) (<= c1 #x1112)
+ (<= #x1161 c2) (<= c2 #x1175))
+ (let ((lindex (- c1 #x1100))
+ (vindex (- c2 #x1161)))
+ (code-char (+ #xac00 (* lindex 588) (* vindex 28)))))
+ ((and (<= #xac00 c1) (<= c1 #.(+ #xac00 11171))
+ (<= #x11a8 c2) (<= c2 #x11c2)
+ (= 0 (rem (- c1 #xac00) 28)))
+ (code-char (+ c1 (- c2 #x11a7)))))))
;;; This implements a sequence data structure, specialized for
;;; efficient deletion of characters at an index, along with tolerable
(labels ()
(let* ((result (list (list 0 (length string) string)))
(previous-starter-index (position 0 string :key #'ucd-ccc))
- (i (1+ previous-starter-index)))
- (when (= i (length string))
+ (i (and previous-starter-index (1+ previous-starter-index))))
+ (when (or (not i) (= i (length string)))
(return-from canonically-compose string))
(tagbody
again
- (when (and (> (- i previous-starter-index) 2)
+ (when (and (>= (- i previous-starter-index) 2)
;; test for Blocked (Unicode 3.11 para. D115)
+ ;;
+ ;; (assumes here that string has sorted combiners,
+ ;; so can look back just one step)
(>= (ucd-ccc (lref result (1- i)))
(ucd-ccc (lref result i))))
(when (= (ucd-ccc (lref result i)) 0)
(base-string string)
((or (array character (*)) #!-sb-unicode base-string)
(ecase form
+ ((:nfc)
+ (canonically-compose (sort-combiners (decompose-string string))))
((:nfd)
(sort-combiners (decompose-string string)))
+ ((:nfkc)
+ (canonically-compose (sort-combiners (decompose-string string :compatibility))))
((:nfkd)
(sort-combiners (decompose-string string :compatibility)))))
((array nil (*)) string)))
(defun test-line (c1 c2 c3 c4 c5)
;; NFC
- #+nil
(assert-all-string= c2
(normalize-string c1 :nfc)
(normalize-string c2 :nfc)
(normalize-string c3 :nfc))
- #+nil
(assert-all-string= c4
(normalize-string c4 :nfc)
(normalize-string c5 :nfc))
(normalize-string c5 :nfd))
;; NFKC
- #+nil
(assert-all-string= c4
(normalize-string c1 :nfkc)
(normalize-string c2 :nfkc)
(defun test-no-normalization (string)
(assert-all-string= string
- #+nil
(normalize-string string :nfc)
(normalize-string string :nfd)
- #+nil
(normalize-string string :nfkc)
(normalize-string string :nfkd)))
--- /dev/null
+# CompositionExclusions-6.2.0.txt
+# Date: 2012-05-15, 22:21:00 GMT [KW, LI]
+#
+# This file lists the characters for the Composition Exclusion Table
+# defined in UAX #15, Unicode Normalization Forms.
+#
+# This file is a normative contributory data file in the
+# Unicode Character Database.
+#
+# Copyright (c) 1991-2012 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# For more information, see
+# http://www.unicode.org/unicode/reports/tr15/#Primary_Exclusion_List_Table
+#
+# For a full derivation of composition exclusions, see the derived property
+# Full_Composition_Exclusion in DerivedNormalizationProps.txt
+#
+
+# ================================================
+# (1) Script Specifics
+#
+# This list of characters cannot be derived from the UnicodeData.txt file.
+# ================================================
+
+0958 # DEVANAGARI LETTER QA
+0959 # DEVANAGARI LETTER KHHA
+095A # DEVANAGARI LETTER GHHA
+095B # DEVANAGARI LETTER ZA
+095C # DEVANAGARI LETTER DDDHA
+095D # DEVANAGARI LETTER RHA
+095E # DEVANAGARI LETTER FA
+095F # DEVANAGARI LETTER YYA
+09DC # BENGALI LETTER RRA
+09DD # BENGALI LETTER RHA
+09DF # BENGALI LETTER YYA
+0A33 # GURMUKHI LETTER LLA
+0A36 # GURMUKHI LETTER SHA
+0A59 # GURMUKHI LETTER KHHA
+0A5A # GURMUKHI LETTER GHHA
+0A5B # GURMUKHI LETTER ZA
+0A5E # GURMUKHI LETTER FA
+0B5C # ORIYA LETTER RRA
+0B5D # ORIYA LETTER RHA
+0F43 # TIBETAN LETTER GHA
+0F4D # TIBETAN LETTER DDHA
+0F52 # TIBETAN LETTER DHA
+0F57 # TIBETAN LETTER BHA
+0F5C # TIBETAN LETTER DZHA
+0F69 # TIBETAN LETTER KSSA
+0F76 # TIBETAN VOWEL SIGN VOCALIC R
+0F78 # TIBETAN VOWEL SIGN VOCALIC L
+0F93 # TIBETAN SUBJOINED LETTER GHA
+0F9D # TIBETAN SUBJOINED LETTER DDHA
+0FA2 # TIBETAN SUBJOINED LETTER DHA
+0FA7 # TIBETAN SUBJOINED LETTER BHA
+0FAC # TIBETAN SUBJOINED LETTER DZHA
+0FB9 # TIBETAN SUBJOINED LETTER KSSA
+FB1D # HEBREW LETTER YOD WITH HIRIQ
+FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
+FB2A # HEBREW LETTER SHIN WITH SHIN DOT
+FB2B # HEBREW LETTER SHIN WITH SIN DOT
+FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
+FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
+FB2E # HEBREW LETTER ALEF WITH PATAH
+FB2F # HEBREW LETTER ALEF WITH QAMATS
+FB30 # HEBREW LETTER ALEF WITH MAPIQ
+FB31 # HEBREW LETTER BET WITH DAGESH
+FB32 # HEBREW LETTER GIMEL WITH DAGESH
+FB33 # HEBREW LETTER DALET WITH DAGESH
+FB34 # HEBREW LETTER HE WITH MAPIQ
+FB35 # HEBREW LETTER VAV WITH DAGESH
+FB36 # HEBREW LETTER ZAYIN WITH DAGESH
+FB38 # HEBREW LETTER TET WITH DAGESH
+FB39 # HEBREW LETTER YOD WITH DAGESH
+FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
+FB3B # HEBREW LETTER KAF WITH DAGESH
+FB3C # HEBREW LETTER LAMED WITH DAGESH
+FB3E # HEBREW LETTER MEM WITH DAGESH
+FB40 # HEBREW LETTER NUN WITH DAGESH
+FB41 # HEBREW LETTER SAMEKH WITH DAGESH
+FB43 # HEBREW LETTER FINAL PE WITH DAGESH
+FB44 # HEBREW LETTER PE WITH DAGESH
+FB46 # HEBREW LETTER TSADI WITH DAGESH
+FB47 # HEBREW LETTER QOF WITH DAGESH
+FB48 # HEBREW LETTER RESH WITH DAGESH
+FB49 # HEBREW LETTER SHIN WITH DAGESH
+FB4A # HEBREW LETTER TAV WITH DAGESH
+FB4B # HEBREW LETTER VAV WITH HOLAM
+FB4C # HEBREW LETTER BET WITH RAFE
+FB4D # HEBREW LETTER KAF WITH RAFE
+FB4E # HEBREW LETTER PE WITH RAFE
+
+# Total code points: 67
+
+# ================================================
+# (2) Post Composition Version precomposed characters
+#
+# These characters cannot be derived solely from the UnicodeData.txt file
+# in this version of Unicode.
+#
+# Note that characters added to the standard after the
+# Composition Version and which have canonical decomposition mappings
+# are not automatically added to this list of Post Composition
+# Version precomposed characters.
+# ================================================
+
+2ADC # FORKING
+1D15E # MUSICAL SYMBOL HALF NOTE
+1D15F # MUSICAL SYMBOL QUARTER NOTE
+1D160 # MUSICAL SYMBOL EIGHTH NOTE
+1D161 # MUSICAL SYMBOL SIXTEENTH NOTE
+1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
+1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
+1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
+1D1BB # MUSICAL SYMBOL MINIMA
+1D1BC # MUSICAL SYMBOL MINIMA BLACK
+1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
+1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
+1D1BF # MUSICAL SYMBOL FUSA WHITE
+1D1C0 # MUSICAL SYMBOL FUSA BLACK
+
+# Total code points: 14
+
+# ================================================
+# (3) Singleton Decompositions
+#
+# These characters can be derived from the UnicodeData.txt file
+# by including all canonically decomposable characters whose
+# canonical decomposition consists of a single character.
+#
+# These characters are simply quoted here for reference.
+# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
+# ================================================
+
+# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
+# 0343 COMBINING GREEK KORONIS
+# 0374 GREEK NUMERAL SIGN
+# 037E GREEK QUESTION MARK
+# 0387 GREEK ANO TELEIA
+# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
+# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
+# 1F75 GREEK SMALL LETTER ETA WITH OXIA
+# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
+# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
+# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
+# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
+# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
+# 1FBE GREEK PROSGEGRAMMENI
+# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
+# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
+# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
+# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
+# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
+# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
+# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
+# 1FFD GREEK OXIA
+# 2000..2001 [2] EN QUAD..EM QUAD
+# 2126 OHM SIGN
+# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
+# 2329 LEFT-POINTING ANGLE BRACKET
+# 232A RIGHT-POINTING ANGLE BRACKET
+# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
+# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
+# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
+# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
+# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
+# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
+# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
+# FA2A..FA6D [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D
+# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
+# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
+
+# Total code points: 1035
+
+# ================================================
+# (4) Non-Starter Decompositions
+#
+# These characters can be derived from the UnicodeData.txt file
+# by including each expanding canonical decomposition
+# (i.e., those which canonically decompose to a sequence
+# of characters instead of a single character), such that:
+#
+# A. The character is not a Starter.
+#
+# OR (inclusive)
+#
+# B. The character's canonical decomposition begins
+# with a character that is not a Starter.
+#
+# Note that a "Starter" is any character with a zero combining class.
+#
+# These characters are simply quoted here for reference.
+# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
+# ================================================
+
+# 0344 COMBINING GREEK DIALYTIKA TONOS
+# 0F73 TIBETAN VOWEL SIGN II
+# 0F75 TIBETAN VOWEL SIGN UU
+# 0F81 TIBETAN VOWEL SIGN REVERSED II
+
+# Total code points: 4
+
+# EOF
(gethash (aref *misc-table* i) *misc-hash*))
i)))
+(defvar *comp-table*)
+
+(defvar *exclusions*
+ (with-open-file (s (make-pathname :name "CompositionExclusions" :type "txt"
+ :defaults *unicode-character-database*))
+ (do ((line (read-line s nil nil) (read-line s nil nil))
+ result)
+ ((null line) result)
+ (when (and (> (length line) 0)
+ (char/= (char line 0) #\#))
+ (push (parse-integer line :end (position #\Space line) :radix 16)
+ result)))))
+
(defun slurp-ucd ()
+ (setf *comp-table* (make-hash-table :test 'equal))
(setq *last-uppercase* nil)
(setq *uppercase-transition-count* 0)
(setq *different-titlecases* nil)
while line
do (slurp-ucd-line line)))
(second-pass)
+ (fixup-compositions)
(fixup-hangul-syllables)
(build-misc-table)
(length *long-decompositions*))
+(defun fixup-compositions ()
+ (flet ((fixup (k v)
+ (let* ((cp (car k))
+ (ucd (aref (aref *ucd-base* (cp-high cp)) (cp-low cp)))
+ (misc (aref *misc-table* (ucd-misc ucd)))
+ (ccc-index (third misc)))
+ ;; we can do everything in the first pass except for
+ ;; accounting for decompositions where the first
+ ;; character of the decomposition is not a starter.
+ (when (/= ccc-index 0)
+ (remhash k *comp-table*)))))
+ (maphash #'fixup *comp-table*)))
+
(defun fixup-hangul-syllables ()
;; "Hangul Syllable Composition, Unicode 5.1 section 3-12"
(let* ((sbase #xac00)
(mapcar #'(lambda (string)
(parse-integer string :radix 16))
split)))
+ (when (= decomposition-info 1)
+ ;; Primary composition excludes:
+ ;; * singleton decompositions;
+ ;; * decompositions of non-starters;
+ ;; * script-specific decompositions;
+ ;; * later-version decompositions;
+ ;; * decompositions whose first character is a
+ ;; non-starter.
+ ;; All but the last case can be handled here;
+ ;; for the fixup, see FIXUP-COMPOSITIONS
+ (when (and (> (length decomposition) 1)
+ (= ccc-index 0)
+ (not (member code-point *exclusions*)))
+ (unless (= (length decomposition) 2)
+ (error "canonical decomposition unexpectedly long"))
+ (setf (gethash (cons (first decomposition)
+ (second decomposition))
+ *comp-table*)
+ code-point)))
(if (= (length decomposition) 1)
(cons 1 (car decomposition))
(cons (length decomposition)
(- *page-size-exponent*)))
(page-end (ash code-point (- *page-size-exponent*))))
(loop for point from *block-first*
- below (ash page-start *page-size-exponent*)
- do (setf (aref (aref *ucd-base* (cp-high point))
- (cp-low point))
- result))
+ below (ash page-start *page-size-exponent*)
+ do (setf (aref (aref *ucd-base* (cp-high point))
+ (cp-low point))
+ result))
(loop for page from page-start below page-end
- do (setf (aref *ucd-base* page)
- (make-array (ash 1 *page-size-exponent*)
- :initial-element result)))
+ do (setf (aref *ucd-base* page)
+ (make-array (ash 1 *page-size-exponent*)
+ :initial-element result)))
(loop for point from (ash page-end *page-size-exponent*)
- below code-point
- do (setf (aref (aref *ucd-base* (cp-high point))
- (cp-low point))
- result))))
+ below code-point
+ do (setf (aref (aref *ucd-base* (cp-high point))
+ (cp-low point))
+ result))))
(values result (normalize-character-name name)))))))
(defun slurp-ucd-line (line)
;;; inverses, which is not true in general in Unicode even for
;;; characters which change case to single characters.
(defun second-pass ()
- (loop for i from 0 below (length *ucd-base*)
- when (aref *ucd-base* i)
- do (loop for j from 0 below (length (aref *ucd-base* i))
- for result = (aref (aref *ucd-base* i) j)
- when result
- when (let* ((transform-point (ucd-transform result))
- (transform-high (ash transform-point
- (- *page-size-exponent*)))
- (transform-low (ldb (byte *page-size-exponent* 0)
- transform-point)))
- (and (plusp transform-point)
- (/= (ucd-transform
- (aref (aref *ucd-base* transform-high)
- transform-low))
- (+ (ash i *page-size-exponent*) j))))
- do (destructuring-bind (gc-index bidi-index ccc-index
- decimal-digit digit bidi-mirrored
- cl-both-case-p decomposition-info)
+ (dotimes (i (length *ucd-base*))
+ (let ((base (aref *ucd-base* i)))
+ (dotimes (j (length base)) ; base is NIL or an array
+ (let ((result (aref base j)))
+ (when result
+ ;; fixup case mappings for CL/Unicode mismatch
+ (let* ((transform-point (ucd-transform result))
+ (transform-high (ash transform-point
+ (- *page-size-exponent*)))
+ (transform-low (ldb (byte *page-size-exponent* 0)
+ transform-point)))
+ (when (and (plusp transform-point)
+ (/= (ucd-transform
+ (aref (aref *ucd-base* transform-high)
+ transform-low))
+ (+ (ash i *page-size-exponent*) j)))
+ (destructuring-bind (gc-index bidi-index ccc-index
+ decimal-digit digit bidi-mirrored
+ cl-both-case-p decomposition-info)
(aref *misc-table* (ucd-misc result))
(declare (ignore cl-both-case-p))
(format t "~A~%" (+ (ash i *page-size-exponent*) j))
(setf (ucd-misc result)
(hash-misc gc-index bidi-index ccc-index
decimal-digit digit bidi-mirrored
- nil decomposition-info))))))
+ nil decomposition-info)))))))))))
(defun write-4-byte (quadruplet stream)
(write-byte (ldb (byte 8 24) quadruplet) stream)
255
(parse-integer digit)))
-(defun output ()
+(defun output-ucd-data ()
(let ((hash (make-hash-table :test #'equalp))
(index 0))
(loop for page across *ucd-base*
(1- (length *misc-table*)))
(byte 11 21)
(if entry (ucd-transform entry) 0))
- stream))))))
- ;; KLUDGE: this code, to write out decomposition information, is a
- ;; little bit very similar to the ucd entries above. Try factoring
- ;; out the common stuff?
+ stream)))))))
+
+;;; KLUDGE: this code, to write out decomposition information, is a
+;;; little bit very similar to the ucd entries above. Try factoring
+;;; out the common stuff?
+(defun output-decomposition-data ()
(let ((hash (make-hash-table :test #'equalp))
(index 0))
(loop for page across *decomposition-base*
- do (when page
- (unless (gethash page hash)
- (setf (gethash page hash)
- (prog1 index (incf index))))))
+ do (when page
+ (unless (gethash page hash)
+ (setf (gethash page hash)
+ (prog1 index (incf index))))))
(let ((array (make-array index)))
(maphash #'(lambda (key value)
(setf (aref array value) key))
:if-exists :supersede
:if-does-not-exist :create)
(loop for code across (copy-seq *long-decompositions*)
- do (write-4-byte code stream)))))
+ do (write-4-byte code stream))))))
+
+(defun output-composition-data ()
+ #+nil ; later
+ (let (firsts seconds)
+ (flet ((frob (k v)
+ (declare (ignore v))
+ (pushnew (car k) firsts)
+ (pushnew (cdr k) seconds)))
+ (maphash #'frob *comp-table*)))
+ (with-open-file (stream (make-pathname :name "comp" :type "dat"
+ :defaults *output-directory*)
+ :direction :output
+ :element-type '(unsigned-byte 8)
+ :if-exists :supersede :if-does-not-exist :create)
+ (maphash (lambda (k v)
+ (write-4-byte (car k) stream)
+ (write-4-byte (cdr k) stream)
+ (write-4-byte v stream))
+ *comp-table*)))
+
+(defun output ()
+ (output-ucd-data)
+ (output-decomposition-data)
+ (output-composition-data)
(with-open-file (f (make-pathname :name "ucd-names" :type "lisp-expr"
:defaults *output-directory*)
:direction :output
(write-string ";;; Do not edit by hand: generated by ucd.lisp" f)
(maphash (lambda (code name)
(when name
- (print code f)
- (prin1 name f)))
+ (print code f)
+ (prin1 name f)))
*unicode-names*))
(setf *unicode-names* nil))
(with-open-file (*standard-output*
(with-standard-io-syntax
(let ((*print-pretty* t))
(prin1 `(:length ,(length *misc-table*)
- :uppercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 0)
- collect i)
- :lowercase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 1)
- collect i)
- :titlecase ,(loop for (gc-index) across *misc-table*
- for i from 0
- when (= gc-index 2)
- collect i))))))
+ :uppercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 0)
+ collect i)
+ :lowercase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 1)
+ collect i)
+ :titlecase ,(loop for (gc-index) across *misc-table*
+ for i from 0
+ when (= gc-index 2)
+ collect i))))))
(values))
;;; Use of the generated files