From 7cf58295cc5e1ca50651c4b93023ae8681fed8fb Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Wed, 11 Nov 2009 13:52:19 +0000 Subject: [PATCH] 1.0.32.17: make the utf-8 external format more robust Detect all malformed sequences, including attempts to decode or encode Unicode surrogate codepoints (disallowed by the Unicode definition of UTF-8). Some error tests change behaviour, and some (unexported) condition classes are not triggered under the same circumstances any more. Also, handle null-termination on a successful conversion of an empty range of a nil array. --- NEWS | 3 + src/code/external-formats/enc-basic.lisp | 153 ++++++++++++++++-------------- src/code/octets.lisp | 9 +- tests/octets.pure.lisp | 84 +++++++++++----- version.lisp-expr | 2 +- 5 files changed, 144 insertions(+), 107 deletions(-) diff --git a/NEWS b/NEWS index db51631..2805ddf 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,9 @@ changes relative to sbcl-1.0.32: transformations. ** improvement: restarts for providing replacement input/output on coding errors for fd-stream external formats. + ** minor incompatible change: the utf-8 external format now correctly + refuses to encode Lisp characters in the surrogate range (char-codes + between #xd800 and #xdfff). ** fix a typo preventing conversion of strings into octet vectors in the latin-2 encoding. (reported by Attila Lendvai; launchpad bug #471689) diff --git a/src/code/external-formats/enc-basic.lisp b/src/code/external-formats/enc-basic.lisp index a60f050..dee5211 100644 --- a/src/code/external-formats/enc-basic.lisp +++ b/src/code/external-formats/enc-basic.lisp @@ -132,12 +132,29 @@ (type (integer 0 1) null-padding) (type array-range sstart send)) (macrolet ((ascii-bash () + ;; KLUDGE: this depends on the fact that we know that + ;; our arrays are initialized with zeros. '(let ((array (make-array (+ null-padding (- send sstart)) :element-type '(unsigned-byte 8)))) (loop for i from 0 and j from sstart below send do (setf (aref array i) (char-code (char string j)))) - array))) + array)) + (output-code (tag) + `(case (char-len-as-utf8 code) + (1 (add-byte code)) + (2 (add-byte (logior #xc0 (ldb (byte 5 6) code))) + (add-byte (logior #x80 (ldb (byte 6 0) code)))) + (3 (when (<= #xd800 code #xdfff) + (setf error-position i) + (go ,tag)) + (add-byte (logior #xe0 (ldb (byte 4 12) code))) + (add-byte (logior #x80 (ldb (byte 6 6) code))) + (add-byte (logior #x80 (ldb (byte 6 0) code)))) + (4 (add-byte (logior #xf0 (ldb (byte 3 18) code))) + (add-byte (logior #x80 (ldb (byte 6 12) code))) + (add-byte (logior #x80 (ldb (byte 6 6) code))) + (add-byte (logior #x80 (ldb (byte 6 0) code))))))) (etypecase string ((simple-array character (*)) (let ((utf8-length 0)) @@ -149,40 +166,48 @@ (ascii-bash) (let ((array (make-array (+ null-padding utf8-length) :element-type '(unsigned-byte 8))) + (new-array nil) + (error-position 0) (index 0)) (declare (type index index)) - (flet ((add-byte (b) - (setf (aref array index) b) - (incf index))) - (declare (inline add-byte)) - (loop for i of-type index from sstart below send - do (let ((code (char-code (char string i)))) - (case (char-len-as-utf8 code) - (1 - (add-byte code)) - (2 - (add-byte (logior #b11000000 (ldb (byte 5 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) - (3 - (add-byte (logior #b11100000 (ldb (byte 4 12) code))) - (add-byte (logior #b10000000 (ldb (byte 6 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) - (4 - (add-byte (logior #b11110000 (ldb (byte 3 18) code))) - (add-byte (logior #b10000000 (ldb (byte 6 12) code))) - (add-byte (logior #b10000000 (ldb (byte 6 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))))) - finally (return array))))))) + (tagbody + :no-error + (flet ((add-byte (b) + (setf (aref array index) b) + (incf index))) + (declare (inline add-byte)) + (loop for i of-type index from sstart below send + for code = (char-code (char string i)) + do (output-code :first-error) + finally (return-from string->utf8 array))) + :first-error + (setf new-array (make-array (* index 2) :adjustable t + :element-type '(unsigned-byte 8) + :fill-pointer index)) + (replace new-array array) + :error + (let ((replacement (encoding-error :utf-8 string index))) + (flet ((add-byte (b) (vector-push-extend b new-array))) + (dotimes (i (length replacement)) + (add-byte (aref replacement i))) + (loop for i of-type index from (1+ error-position) below send + for code = (char-code (char string i)) + do (output-code :error) + finally (return-from string->utf8 + (progn + (unless (zerop null-padding) + (vector-push-extend 0 new-array)) + (copy-seq new-array))))))))))) #!+sb-unicode ((simple-array base-char (*)) ;; On unicode builds BASE-STRINGs are limited to ASCII range, ;; so we can take a fast path -- and get benefit of the element ;; type information. On non-unicode build BASE-CHAR == - ;; CHARACTER. + ;; CHARACTER, handled above. (ascii-bash)) ((simple-array nil (*)) (if (= send sstart) - (make-array 0 :element-type '(unsigned-byte 8)) + (make-array null-padding :element-type '(unsigned-byte 8)) ;; Just get the error... (aref string sstart)))))) @@ -212,18 +237,12 @@ (declare (type (unsigned-byte 8) b)) (let ((ok (cond ((zerop (logand b #b10000000)) 1) - ((= (logand b #b11100000) #b11000000) - 2) - ((= (logand b #b11110000) #b11100000) - 3) - ((= (logand b #b11111000) #b11110000) - 4) - ((= (logand b #b11111100) #b11111000) - 5) - ((= (logand b #b11111110) #b11111100) - 6) - (t - nil)))) + ((and (= (logand b #b11100000) #b11000000) + (>= b #xc2)) 2) + ((= (logand b #b11110000) #b11100000) 3) + ((and (= (logand b #b11111000) #b11110000) + (<= b #xf4)) 4) + (t nil)))) (unless ok (setf reject-reason 'invalid-utf8-starter-byte)) ok)) @@ -236,6 +255,14 @@ (let* ((idx (the array-range (+ pos x))) (b (,accessor array idx)) (ok (= (logand b #b11000000) #b10000000))) + (when (and ok (= x 1)) + (setf ok + (case initial-byte + (#xe0 (>= b #xa0)) + (#xed (< b #xa0)) + (#xf0 (>= b #x90)) + (#xf4 (< b #x90)) + (t t)))) (unless ok (setf reject-reason 'invalid-utf8-continuation-byte) (setf reject-position idx)) @@ -251,14 +278,6 @@ (loop for i from 1 below (min len remaining-bytes) always (valid-secondary-p i)) (enough-bytes-left-p len))) - (overlong-chk (x y) - (let ((ok (or (/= initial-byte x) - (/= (logior (,accessor array (the array-range (+ pos 1))) - y) - y)))) - (unless ok - (setf reject-reason 'overlong-utf8-sequence)) - ok)) (character-below-char-code-limit-p () ;; This is only called on a four-byte sequence ;; (two in non-unicode builds) to ensure we @@ -288,43 +307,27 @@ (declare (inline valid-utf8-starter-byte-p enough-bytes-left-p valid-secondary-p - preliminary-ok-for-length - overlong-chk)) + preliminary-ok-for-length)) (let ((maybe-len (valid-utf8-starter-byte-p initial-byte))) (cond ((eql maybe-len 1) (values 1 nil)) ((and (preliminary-ok-for-length maybe-len 2) - (overlong-chk #b11000000 #b10111111) - (overlong-chk #b11000001 #b10111111) #!-sb-unicode (character-below-char-code-limit-p)) (values 2 nil)) ((and (preliminary-ok-for-length maybe-len 3) - (overlong-chk #b11100000 #b10011111) #!-sb-unicode (not (setf reject-reason 'character-out-of-range))) (values 3 nil)) ((and (preliminary-ok-for-length maybe-len 4) - (overlong-chk #b11110000 #b10001111) #!-sb-unicode (not (setf reject-reason 'character-out-of-range)) (character-below-char-code-limit-p)) (values 4 nil)) - ((and (preliminary-ok-for-length maybe-len 5) - (overlong-chk #b11111000 #b10000111) - (not (setf reject-reason 'character-out-of-range))) - (bug "can't happen")) - ((and (preliminary-ok-for-length maybe-len 6) - (overlong-chk #b11111100 #b10000011) - (not (setf reject-reason 'character-out-of-range))) - (bug "can't happen")) (t - (let* ((bad-end (ecase reject-reason - (invalid-utf8-starter-byte - (1+ pos)) - (end-of-input-in-character - end) - (invalid-utf8-continuation-byte - reject-position) - ((overlong-utf8-sequence character-out-of-range) - (+ pos maybe-len)))) + (let* ((bad-end + (ecase reject-reason + (invalid-utf8-starter-byte (1+ pos)) + (end-of-input-in-character end) + (invalid-utf8-continuation-byte reject-position) + (character-out-of-range (+ pos maybe-len)))) (bad-len (- bad-end pos))) (declare (type array-range bad-end bad-len)) (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position))) @@ -379,7 +382,7 @@ (coerce string 'simple-string)))))) (instantiate-octets-definition define-utf8->string) -(define-external-format/variable-width (:utf-8 :utf8) nil +(define-external-format/variable-width (:utf-8 :utf8) t (let ((bits (char-code byte))) (cond ((< bits #x80) 1) ((< bits #x800) 2) @@ -389,7 +392,9 @@ (1 (setf (sap-ref-8 sap tail) bits)) (2 (setf (sap-ref-8 sap tail) (logior #xc0 (ldb (byte 5 6) bits)) (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits)))) - (3 (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits)) + (3 (when (<= #xd800 bits #xdfff) + (external-format-encoding-error stream bits)) + (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits)) (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits)) (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits)))) (4 (setf (sap-ref-8 sap tail) (logior #xf0 (ldb (byte 3 18) bits)) @@ -410,7 +415,9 @@ (3 (let ((byte2 (sap-ref-8 sap (1+ head))) (byte3 (sap-ref-8 sap (+ 2 head)))) (unless (and (<= #x80 byte2 #xbf) - (<= #x80 byte3 #xbf)) + (<= #x80 byte3 #xbf) + (or (/= byte #xe0) (<= #xa0 byte2 #xbf)) + (or (/= byte #xed) (<= #x80 byte2 #x9f))) (return-from decode-break-reason 3)) (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3)))) (4 (let ((byte2 (sap-ref-8 sap (1+ head))) @@ -418,7 +425,9 @@ (byte4 (sap-ref-8 sap (+ 3 head)))) (unless (and (<= #x80 byte2 #xbf) (<= #x80 byte3 #xbf) - (<= #x80 byte4 #xbf)) + (<= #x80 byte4 #xbf) + (or (/= byte #xf0) (<= #x90 byte2 #xbf)) + (or (/= byte #xf4) (<= #x80 byte2 #x8f))) (return-from decode-break-reason 4)) (dpb byte (byte 3 18) (dpb byte2 (byte 6 12) diff --git a/src/code/octets.lisp b/src/code/octets.lisp index d4d5817..0d155f1 100644 --- a/src/code/octets.lisp +++ b/src/code/octets.lisp @@ -74,20 +74,13 @@ one-past-the-end" ;;; character-out-of-range ;;; invalid-utf8-starter-byte ;;; invalid-utf8-continuation-byte -;;; overlong-utf8-sequence ;;; ;;; Of these, the only one truly likely to be of interest to calling ;;; code is end-of-input-in-character (in which case it's likely to ;;; want to make a note of octet-decoding-error-start, supply "" as a ;;; replacement string, and then move that last chunk of bytes to the ;;; beginning of its buffer for the next go round) but they're all -;;; provided on the off chance they're of interest. The next most -;;; likely interesting option is overlong-utf8-sequence -- the -;;; application, if it cares to, can decode this itself (taking care -;;; to ensure that the result isn't out of range of CHAR-CODE-LIMIT) -;;; and return that result. This library doesn't provide support for -;;; that as a conforming UTF-8-using program is supposed to treat it -;;; as an error. +;;; provided on the off chance they're of interest. (define-condition octet-decoding-error (character-decoding-error) ((array :initarg :array :accessor octet-decoding-error-array) diff --git a/tests/octets.pure.lisp b/tests/octets.pure.lisp index 4bf7f18..cb779c5 100644 --- a/tests/octets.pure.lisp +++ b/tests/octets.pure.lisp @@ -51,7 +51,8 @@ (ensure-roundtrip-utf8 () (let ((string (make-string char-code-limit))) (dotimes (i char-code-limit) - (setf (char string i) (code-char i))) + (unless (<= #xd800 i #xdfff) + (setf (char string i) (code-char i)))) (let ((string2 (octets-to-string (string-to-octets string :external-format :utf8) :external-format :utf8))) @@ -131,12 +132,21 @@ (utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800 (utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff (utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000 - (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000 - (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff - (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000 - (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff - (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000 - (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?") ; #x7fffffff + #+nil ; old, 6-byte UTF-8 definition + (progn + (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000 + (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff + (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000 + (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff + (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000 + (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?")) ; #x7fffffff + (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition + (utf8-decode-tests #(#xf4 #x90) "??") ; #x110000 + (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "????") ; #x1fffff + (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?????") ; #x200000 + (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?????") ; #x3ffffff + (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "??????") ; #x4000000 + (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "??????")) ; #x7fffffff ;; Unexpected continuation bytes (utf8-decode-tests #(#x80) "?") @@ -178,19 +188,24 @@ ;; Otherwise incomplete sequences (last continuation byte missing) (utf8-decode-tests #0=#(#xc0) "?") - (utf8-decode-tests #1=#(#xe0 #x80) "?") - (utf8-decode-tests #2=#(#xf0 #x80 #x80) "?") + (utf8-decode-tests #1=#(#xe0 #xa0) "?") + (utf8-decode-tests #2=#(#xf0 #x90 #x80) "?") + #+nil (utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?") + #+nil (utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?") (utf8-decode-tests #5=#(#xdf) "?") (utf8-decode-tests #6=#(#xef #xbf) "?") + #+nil (utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?") + #+nil (utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?") + #+nil (utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?") ;; All ten previous tests concatenated - (utf8-decode-tests (concatenate 'vector #0# #1# #2# #3# #4# #5# #6# #7# #8# #9#) - "??????????") + (utf8-decode-tests (concatenate 'vector #0# #1# #2# #5# #6#) + "?????") ;; Random impossible bytes (utf8-decode-tests #(#xfe) "?") @@ -198,25 +213,25 @@ (utf8-decode-tests #(#xfe #xfe #xff #xff) "????") ;; Overlong sequences - / - (utf8-decode-tests #(#xc0 #xaf) "?") - (utf8-decode-tests #(#xe0 #x80 #xaf) "?") - (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "?") - (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?") - (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "?") + (utf8-decode-tests #(#xc0 #xaf) "??") + (utf8-decode-tests #(#xe0 #x80 #xaf) "???") + (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "????") + (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?????") + (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "??????") ;; Overlong sequences - #\Rubout - (utf8-decode-tests #(#xc1 #xbf) "?") - (utf8-decode-tests #(#xe0 #x9f #xbf) "?") - (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "?") - (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?") - (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "?") + (utf8-decode-tests #(#xc1 #xbf) "??") + (utf8-decode-tests #(#xe0 #x9f #xbf) "???") + (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "????") + (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?????") + (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "??????") ;; Overlong sequences - #\Null - (utf8-decode-tests #(#xc0 #x80) "?") - (utf8-decode-tests #(#xe0 #x80 #x80) "?") - (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "?") - (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?") - (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "?") + (utf8-decode-tests #(#xc0 #x80) "??") + (utf8-decode-tests #(#xe0 #x80 #x80) "???") + (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "????") + (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?????") + (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "??????") ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're ;; perfectly good sbcl chars even if they're not actually ISO 10646 @@ -236,6 +251,9 @@ :external-format :utf-8))) (assert (equalp #() (string-to-octets (make-array 5 :element-type nil) :start 3 :end 3 :external-format :utf-8))) +(assert (equalp #(0) (string-to-octets (make-array 5 :element-type nil) + :start 3 :end 3 :null-terminate t + :external-format :utf-8))) ;;; whoops: the iso-8859-2 format referred to an undefined symbol. #+sb-unicode @@ -251,3 +269,17 @@ (coerce #(182 123 253 238) '(vector (unsigned-byte 8))) :external-format :euc-jp))))) +#+sb-unicode +(with-test (:name (:utf-8 :surrogates :encoding-errors)) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) (use-value #\? c)))) + (assert (equalp (string-to-octets (string (code-char #xd800)) + :external-format :utf-8) + (vector (char-code #\?)))))) +#+sb-unicode +(with-test (:name (:utf-8 :surrogates :decoding-errors)) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value #\? c)))) + (assert (find #\? (octets-to-string + (coerce #(237 160 128) '(vector (unsigned-byte 8))) + :external-format :utf-8))))) diff --git a/version.lisp-expr b/version.lisp-expr index d484356..cea6f96 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.32.16" +"1.0.32.17" -- 1.7.10.4