From f44f6d1adbaaa7057f1948369299c0b2a08bcd6e Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Tue, 4 Jun 2013 13:00:50 +0100 Subject: [PATCH] fix CL case conversions of characters involving iota subscript Oh boy. Judging by the length of the web page explaining the issue (at ) this is a bit of a minefield. I hope that this doesn't contribute further to the trouble... Although the combined _WITH_PROSGEGRAMMENI characters are of general class "Lt" (i.e. titlecase), for CL purposes we treat them as the uppercase equivalent of the lowercase _WITH_YPOGEGRAMMENI characters (as directly specified by the case mapping data in UnicodeData.txt). This is a little awkward, and involves a bit of rearrangement in the indices of the misc table entries to make the (CL) uppercase/lowercase tests efficient, but seems to be the best of all possible worlds given that we must comply with CL's character-to-character case mappings -- the alternative of not providing an uppercase version of LOWERCASE_OMEGA_WITH_YPOGEGRAMMENI seems even weirder. The way this is done in ucd.lisp is a little bit kludgy, because we have to avoid giving the same exception to the serbian titlecase digraphs (Dz and friends) which mustn't map to anything, or else we'd break invertibility. (The lowercase dz and uppercase DZ are already (CL) case mappings of each other). Probably the thing which will confuse future readers is that some (Unicode) titlecase characters are (CL) upper-case-p. --- src/code/target-char.lisp | 12 ++++++------ tests/character.pure.lisp | 3 +-- tools-for-build/ucd.lisp | 12 ++++++++++-- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp index e217f8e..55b4bcc 100644 --- a/src/code/target-char.lisp +++ b/src/code/target-char.lisp @@ -400,20 +400,20 @@ argument is an alphabetic character, A-Z or a-z; otherwise NIL." #!+sb-doc "The argument must be a character object; UPPER-CASE-P returns T if the argument is an upper-case character, NIL otherwise." - (< (ucd-value-0 char) 4)) + (< (ucd-value-0 char) 5)) (defun lower-case-p (char) #!+sb-doc "The argument must be a character object; LOWER-CASE-P returns T if the argument is a lower-case character, NIL otherwise." - (< 3 (ucd-value-0 char) 8)) + (< 4 (ucd-value-0 char) 9)) (defun both-case-p (char) #!+sb-doc "The argument must be a character object. BOTH-CASE-P returns T if the argument is an alphabetic character and if the character exists in both upper and lower case. For ASCII, this is the same as ALPHA-CHAR-P." - (< (ucd-value-0 char) 8)) + (< (ucd-value-0 char) 9)) (defun digit-char-p (char &optional (radix 10.)) #!+sb-doc @@ -513,7 +513,7 @@ is either numeric or alphabetic." (defmacro equal-char-code (character) (let ((ch (gensym))) `(let ((,ch ,character)) - (if (< (ucd-value-0 ,ch) 4) + (if (< (ucd-value-0 ,ch) 5) (ucd-value-1 ,ch) (char-code ,ch))))) @@ -609,14 +609,14 @@ Case is ignored." #!+sb-doc "Return CHAR converted to upper-case if that is possible. Don't convert lowercase eszet (U+DF)." - (if (< 3 (ucd-value-0 char) 8) + (if (< 4 (ucd-value-0 char) 9) (code-char (ucd-value-1 char)) char)) (defun char-downcase (char) #!+sb-doc "Return CHAR converted to lower-case if that is possible." - (if (< (ucd-value-0 char) 4) + (if (< (ucd-value-0 char) 5) (code-char (ucd-value-1 char)) char)) diff --git a/tests/character.pure.lisp b/tests/character.pure.lisp index d2dd66c..7427234 100644 --- a/tests/character.pure.lisp +++ b/tests/character.pure.lisp @@ -130,8 +130,7 @@ (with-test (:name (:case-insensitive-char-comparisons :eacute)) (assert (char-equal (code-char 201) (code-char 233)))) -(with-test (:name (:case-insensitive-char-comparisons :exhaustive) - :fails-on :sb-unicode) +(with-test (:name (:case-insensitive-char-comparisons :exhaustive)) (dotimes (i char-code-limit) (let* ((char (code-char i)) (down (char-downcase char)) diff --git a/tools-for-build/ucd.lisp b/tools-for-build/ucd.lisp index 8741bcc..1a47004 100644 --- a/tools-for-build/ucd.lisp +++ b/tools-for-build/ucd.lisp @@ -50,6 +50,9 @@ (setf (gethash list *misc-hash*) (incf *misc-index*)))))) +(defun gc-index-sort-key (gc-index) + (or (cdr (assoc gc-index '((1 . 2) (2 . 1)))) gc-index)) + (defun compare-misc-entry (left right) (destructuring-bind (left-gc-index left-bidi-index left-ccc-index left-decimal-digit left-digit left-bidi-mirrored @@ -61,7 +64,8 @@ right (or (and left-cl-both-case-p (not right-cl-both-case-p)) (and (or left-cl-both-case-p (not right-cl-both-case-p)) - (or (< left-gc-index right-gc-index) + (or (< (gc-index-sort-key left-gc-index) + (gc-index-sort-key right-gc-index)) (and (= left-gc-index right-gc-index) (or (< left-decomposition-info right-decomposition-info) (and (= left-decomposition-info right-decomposition-info) @@ -270,7 +274,11 @@ (parse-integer simple-titlecase :radix 16))) (cl-both-case-p (not (null (or (and (= gc-index 0) lower-index) - (and (= gc-index 1) upper-index))))) + (and (= gc-index 1) upper-index) + ;; deal with prosgegrammeni / titlecase + (and (= gc-index 2) + (typep code-point '(integer #x1000 #x1fff)) + lower-index))))) (decomposition-info 0)) (declare (ignore digit-index)) (when (and (not cl-both-case-p) -- 1.7.10.4