From 8c1cdfc03a0070295e595e8b0ba97214ccb50a41 Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Fri, 29 Oct 2004 13:56:55 +0000 Subject: [PATCH] 0.8.16.15: Fix BUG #308 ... use ucd.dat to generate a database for characters with information about graphicness, caseness and the like; ... use the database in the ANSI character operators; ... (frob the compiler transforms to work with latin-1 characters) This patch was brought to you by character_branch --- BUGS | 17 ------ NEWS | 2 + make-host-1.sh | 3 + src/code/cold-init.lisp | 2 + src/code/target-char.lisp | 140 ++++++++++++++++++++++++++++++++++++--------- src/compiler/ir1tran.lisp | 1 + src/compiler/srctran.lisp | 52 ++++++++++++++--- tests/character.pure.lisp | 7 +++ version.lisp-expr | 2 +- 9 files changed, 171 insertions(+), 55 deletions(-) diff --git a/BUGS b/BUGS index b6981d2..5105302 100644 --- a/BUGS +++ b/BUGS @@ -1153,23 +1153,6 @@ WORKAROUND: collect `(array ,(sb-vm:saetp-specifier x))))) => NIL, T (when it should be T, T) -308: "Characters without names" - (reported by Bruno Haible sbcl-devel "character names are missing" - 2004-04-19) - (graphic-char-p (code-char 255)) - => NIL - (char-name (code-char 255)) - => NIL - - SBCL is unsure of what to do about characters with codes in the - range 128-255. Currently they are treated as non-graphic, but don't - have names, which is not compliant with the standard. Various fixes - are possible, such as - * giving them names such as NON-ASCII-128; - * reducing CHAR-CODE-LIMIT to 127 (almost certainly unpopular); - * making the characters graphic (makes a certain amount of sense); - * biting the bullet and implementing Unicode (probably quite hard). - 309: "Dubious values for implementation limits" (reported by Bruno Haible sbcl-devel "Incorrect value of multiple-values-limit" 2004-04-19) diff --git a/NEWS b/NEWS index 09d4da5..3a3e2cf 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,8 @@ changes in sbcl-0.8.17 relative to sbcl-0.8.16: * minor incompatible change: SB-C::*COMPILER-ERROR-PRINT-FOO* variables are no longer supported: use SB-EXT:*COMPILER-PRINT-VARIABLE-ALIST* instead. + * fixed bug #308: non-graphic characters now all have names, as + required. (reported by Bruno Haible) * bug fix: Cyclic structures and unprintable objects in compiler messages no longer cause errors. (reported by Bruno Haible) * bug fix: READ, READ-PRESERVING-WHITESPACE, READ-DELIMITED-LIST, diff --git a/make-host-1.sh b/make-host-1.sh index 3187596..6800367 100644 --- a/make-host-1.sh +++ b/make-host-1.sh @@ -43,6 +43,9 @@ $SBCL_XC_HOST <<-'EOF' || exit 1 (when (find :sb-test *shebang-features*) (load "tests/type.before-xc.lisp") (load "tests/info.before-xc.lisp")) + (load "tools-for-build/ucd.lisp") + (sb-cold::slurp-ucd) + (sb-cold::output) (host-cload-stem "src/compiler/generic/genesis") (sb!vm:genesis :c-header-dir-name "src/runtime/genesis") #+cmu (ext:quit) diff --git a/src/code/cold-init.lisp b/src/code/cold-init.lisp index 222256e..45d3d39 100644 --- a/src/code/cold-init.lisp +++ b/src/code/cold-init.lisp @@ -111,6 +111,8 @@ ;; this to be initialized, so we initialize it right away. (show-and-call !random-cold-init) + (show-and-call !character-database-cold-init) + (show-and-call !early-package-cold-init) (show-and-call !package-cold-init) diff --git a/src/code/target-char.lisp b/src/code/target-char.lisp index 161d893..e534058 100644 --- a/src/code/target-char.lisp +++ b/src/code/target-char.lisp @@ -32,6 +32,27 @@ (deftype char-code () `(integer 0 (,char-code-limit))) +(defvar *character-database*) + +(macrolet ((frob () + (with-open-file (stream (merge-pathnames + (make-pathname + :directory + '(:relative :up :up "output") + :name "ucd" + :type "dat") + sb!xc:*compile-file-pathname*) + :direction :input + :element-type '(unsigned-byte 8)) + (let* ((length (file-length stream)) + (array (make-array length + :element-type '(unsigned-byte 8)))) + (read-sequence array stream) + `(defun !character-database-cold-init () + (setq *character-database* ',array)))))) + (frob)) +#+sb-xc-host (!character-database-cold-init) + ;;; This is the alist of (character-name . character) for characters ;;; with long names. The first name in this list for a given character ;;; is used on typeout and is the preferred form for input. @@ -40,8 +61,10 @@ (dolist (code char-names-list) (destructuring-bind (ccode names) code (dolist (name names) - (results (cons name (code-char ccode)))))) - `(defparameter *char-name-alist* ',(results))))) + (results (cons name ccode))))) + `(defparameter *char-name-alist* + (mapcar (lambda (x) (cons (car x) (code-char (cdr x)))) + ',(results)))))) ;; Note: The *** markers here indicate character names which are ;; required by the ANSI specification of #'CHAR-NAME. For the others, ;; we prefer the ASCII standard name. @@ -78,15 +101,76 @@ (#x1E ("Rs" "^^")) (#x1F ("Us" "^_")) (#x20 ("Space" "Sp")) ; *** See Note above. - (#x7f ("Rubout" "Delete" "Del"))))) ; *** See Note above. + (#x7f ("Rubout" "Delete" "Del")) + (#x80 ("C80")) + (#x81 ("C81")) + (#x82 ("Break-Permitted")) + (#x83 ("No-Break-Permitted")) + (#x84 ("C84")) + (#x85 ("Next-Line")) + (#x86 ("Start-Selected-Area")) + (#x87 ("End-Selected-Area")) + (#x88 ("Character-Tabulation-Set")) + (#x89 ("Character-Tabulation-With-Justification")) + (#x8A ("Line-Tabulation-Set")) + (#x8B ("Partial-Line-Forward")) + (#x8C ("Partial-Line-Backward")) + (#x8D ("Reverse-Linefeed")) + (#x8E ("Single-Shift-Two")) + (#x8F ("Single-Shift-Three")) + (#x90 ("Device-Control-String")) + (#x91 ("Private-Use-One")) + (#x92 ("Private-Use-Two")) + (#x93 ("Set-Transmit-State")) + (#x94 ("Cancel-Character")) + (#x95 ("Message-Waiting")) + (#x96 ("Start-Guarded-Area")) + (#x97 ("End-Guarded-Area")) + (#x98 ("Start-String")) + (#x99 ("C99")) + (#x9A ("Single-Character-Introducer")) + (#x9B ("Control-Sequence-Introducer")) + (#x9C ("String-Terminator")) + (#x9D ("Operating-System-Command")) + (#x9E ("Privacy-Message")) + (#x9F ("Application-Program-Command"))))) ; *** See Note above. ;;;; accessor functions +;; (* 8 186) => 1488 +;; (+ 1488 (ash #x110000 -8)) => 5840 +(defun ucd-index (char) + (let* ((cp (char-code char)) + (cp-high (ash cp -8)) + (page (aref *character-database* (+ 1488 cp-high)))) + (+ 5840 (ash page 10) (ash (ldb (byte 8 0) cp) 2)))) + +(defun ucd-value-0 (char) + (aref *character-database* (ucd-index char))) + +(defun ucd-value-1 (char) + (let ((index (ucd-index char))) + (dpb (aref *character-database* (+ index 3)) + (byte 8 16) + (dpb (aref *character-database* (+ index 2)) + (byte 8 8) + (aref *character-database* (1+ index)))))) + +(defun ucd-general-category (char) + (aref *character-database* (* 8 (ucd-value-0 char)))) + +(defun ucd-decimal-digit (char) + (let ((decimal-digit (aref *character-database* + (+ 3 (* 8 (ucd-value-0 char)))))) + (when (< decimal-digit 10) + decimal-digit))) + (defun char-code (char) #!+sb-doc "Return the integer code of CHAR." + ;; FIXME: do we actually need this? (etypecase char - (base-char (char-code (truly-the base-char char))))) + (character (char-code (truly-the character char))))) (defun char-int (char) #!+sb-doc @@ -156,41 +240,34 @@ "The argument must be a character object. GRAPHIC-CHAR-P returns T if the argument is a printing character (space through ~ in ASCII), otherwise returns NIL." - (and (typep char 'base-char) - (< 31 - (char-code (the base-char char)) - 127))) + (let ((n (char-code char))) + (or (< 31 n 127) + (< 159 n)))) (defun alpha-char-p (char) #!+sb-doc "The argument must be a character object. ALPHA-CHAR-P returns T if the argument is an alphabetic character, A-Z or a-z; otherwise NIL." - (let ((m (char-code char))) - (or (< 64 m 91) (< 96 m 123)))) + (< (ucd-general-category char) 5)) (defun upper-case-p (char) #!+sb-doc "The argument must be a character object; UPPER-CASE-P returns T if the argument is an upper-case character, NIL otherwise." - (< 64 - (char-code char) - 91)) + (= (ucd-value-0 char) 0)) (defun lower-case-p (char) #!+sb-doc "The argument must be a character object; LOWER-CASE-P returns T if the argument is a lower-case character, NIL otherwise." - (< 96 - (char-code char) - 123)) + (= (ucd-value-0 char) 1)) (defun both-case-p (char) #!+sb-doc "The argument must be a character object. BOTH-CASE-P returns T if the argument is an alphabetic character and if the character exists in both upper and lower case. For ASCII, this is the same as ALPHA-CHAR-P." - (let ((m (char-code char))) - (or (< 64 m 91) (< 96 m 123)))) + (< (ucd-value-0 char) 2)) (defun digit-char-p (char &optional (radix 10.)) #!+sb-doc @@ -208,14 +285,17 @@ ;; Also check lower case a - z. ((and (>= (setq m (- m 32)) 10) (< m radix)) m) ;; Else, fail. - (t nil)))) + (t (let ((number (ucd-decimal-digit char))) + (when (and number (< number radix)) + number)))))) (defun alphanumericp (char) #!+sb-doc "Given a character-object argument, ALPHANUMERICP returns T if the argument is either numeric or alphabetic." - (let ((m (char-code char))) - (or (< 47 m 58) (< 64 m 91) (< 96 m 123)))) + (let ((gc (ucd-general-category char))) + (or (< gc 5) + (= gc 12)))) (defun char= (character &rest more-characters) #!+sb-doc @@ -279,8 +359,11 @@ ;;; which loses font, bits, and case info. (defmacro equal-char-code (character) - `(let ((ch (char-code ,character))) - (if (< 96 ch 123) (- ch 32) ch))) + (let ((ch (gensym))) + `(let ((,ch ,character)) + (if (= (ucd-value-0 ,ch) 0) + (ucd-value-1 ,ch) + (char-code ,ch))))) (defun char-equal (character &rest more-characters) #!+sb-doc @@ -354,16 +437,17 @@ (defun char-upcase (char) #!+sb-doc - "Return CHAR converted to upper-case if that is possible." - (if (lower-case-p char) - (code-char (- (char-code char) 32)) + "Return CHAR converted to upper-case if that is possible. Don't convert + lowercase eszet (U+DF)." + (if (= (ucd-value-0 char) 1) + (code-char (ucd-value-1 char)) char)) (defun char-downcase (char) #!+sb-doc "Return CHAR converted to lower-case if that is possible." - (if (upper-case-p char) - (code-char (+ (char-code char) 32)) + (if (= (ucd-value-0 char) 0) + (code-char (ucd-value-1 char)) char)) (defun digit-char (weight &optional (radix 10)) diff --git a/src/compiler/ir1tran.lisp b/src/compiler/ir1tran.lisp index 8b2bee0..340a739 100644 --- a/src/compiler/ir1tran.lisp +++ b/src/compiler/ir1tran.lisp @@ -239,6 +239,7 @@ ;; can't contain other objects (unless (typep value '(or #-sb-xc-host unboxed-array + #+sb-xc-host (simple-array (unsigned-byte 8) (*)) symbol number character diff --git a/src/compiler/srctran.lisp b/src/compiler/srctran.lisp index 8f1739f..a49c415 100644 --- a/src/compiler/srctran.lisp +++ b/src/compiler/srctran.lisp @@ -2321,7 +2321,29 @@ (specifier-type `(integer ,lo-res ,hi-res)))))) (defoptimizer (code-char derive-type) ((code)) - (specifier-type 'base-char)) + (let ((type (lvar-type code))) + ;; FIXME: unions of integral ranges? It ought to be easier to do + ;; this, given that CHARACTER-SET is basically an integral range + ;; type. -- CSR, 2004-10-04 + (when (numeric-type-p type) + (let* ((lo (numeric-type-low type)) + (hi (numeric-type-high type)) + (type (specifier-type `(character-set ((,lo . ,hi)))))) + (cond + ;; KLUDGE: when running on the host, we lose a slight amount + ;; of precision so that we don't have to "unparse" types + ;; that formally we can't, such as (CHARACTER-SET ((0 + ;; . 0))). -- CSR, 2004-10-06 + #+sb-xc-host + ((csubtypep type (specifier-type 'standard-char)) type) + #+sb-xc-host + ((csubtypep type (specifier-type 'base-char)) + (specifier-type 'base-char)) + #+sb-xc-host + ((csubtypep type (specifier-type 'extended-char)) + (specifier-type 'extended-char)) + (t #+sb-xc-host (specifier-type 'character) + #-sb-xc-host type)))))) (defoptimizer (values derive-type) ((&rest values)) (make-values-type :required (mapcar #'lvar-type values))) @@ -2917,7 +2939,9 @@ ;;;; character operations -(deftransform char-equal ((a b) (base-char base-char)) +(deftransform char-equal ((a b) + ((character-set ((0 . 255))) + (character-set ((0 . 255))))) "open code" '(let* ((ac (char-code a)) (bc (char-code b)) @@ -2925,21 +2949,31 @@ (or (zerop sum) (when (eql sum #x20) (let ((sum (+ ac bc))) - (and (> sum 161) (< sum 213))))))) + (or (and (> sum 161) (< sum 213)) + (and (> sum 415) (< sum 461)) + (and (> sum 463) (< sum 477)))))))) -(deftransform char-upcase ((x) (base-char)) +(deftransform char-upcase ((x) ((character-set ((0 . 255))))) "open code" '(let ((n-code (char-code x))) - (if (and (> n-code #o140) ; Octal 141 is #\a. - (< n-code #o173)) ; Octal 172 is #\z. + (if (or (and (> n-code #o140) ; Octal 141 is #\a. + (< n-code #o173)) ; Octal 172 is #\z. + (and (> n-code #o337) + (< n-code #o367)) + (and (> n-code #o367) + (< n-code #o377))) (code-char (logxor #x20 n-code)) x))) -(deftransform char-downcase ((x) (base-char)) +(deftransform char-downcase ((x) ((character-set ((0 . 255))))) "open code" '(let ((n-code (char-code x))) - (if (and (> n-code 64) ; 65 is #\A. - (< n-code 91)) ; 90 is #\Z. + (if (or (and (> n-code 64) ; 65 is #\A. + (< n-code 91)) ; 90 is #\Z. + (and (> n-code 191) + (< n-code 215)) + (and (> n-code 215) + (< n-code 223))) (code-char (logxor #x20 n-code)) x))) diff --git a/tests/character.pure.lisp b/tests/character.pure.lisp index 97a2590..74619b4 100644 --- a/tests/character.pure.lisp +++ b/tests/character.pure.lisp @@ -54,3 +54,10 @@ (digit-char 4 1) (digit-char 4 37))) (assert (raises-error? (apply (car form) (mapcar 'eval (cdr form))) type-error))) + +(dotimes (i 256) + (let* ((char (code-char i)) + (graphicp (graphic-char-p char)) + (name (char-name char))) + (unless graphicp + (assert name)))) diff --git a/version.lisp-expr b/version.lisp-expr index 1a2b82e..7718cb2 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"0.8.16.14" +"0.8.16.15" -- 1.7.10.4