From: Christophe Rhodes <csr21@cantab.net>
Date: Wed, 11 Nov 2009 13:52:19 +0000 (+0000)
Subject: 1.0.32.17: make the utf-8 external format more robust
X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=7cf58295cc5e1ca50651c4b93023ae8681fed8fb;p=sbcl.git

1.0.32.17: make the utf-8 external format more robust

Detect all malformed sequences, including attempts to decode or encode
Unicode surrogate codepoints (disallowed by the Unicode definition of
UTF-8).  Some error tests change behaviour, and some (unexported)
condition classes are not triggered under the same circumstances any
more.

Also, handle null-termination on a successful conversion of an empty range of
a nil array.
---

diff --git a/NEWS b/NEWS
index db51631..2805ddf 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,9 @@ changes relative to sbcl-1.0.32:
        transformations.
     ** improvement: restarts for providing replacement input/output on coding
        errors for fd-stream external formats.
+    ** minor incompatible change: the utf-8 external format now correctly
+       refuses to encode Lisp characters in the surrogate range (char-codes
+       between #xd800 and #xdfff).
     ** fix a typo preventing conversion of strings into octet vectors
        in the latin-2 encoding.  (reported by Attila Lendvai; launchpad bug
        #471689)
diff --git a/src/code/external-formats/enc-basic.lisp b/src/code/external-formats/enc-basic.lisp
index a60f050..dee5211 100644
--- a/src/code/external-formats/enc-basic.lisp
+++ b/src/code/external-formats/enc-basic.lisp
@@ -132,12 +132,29 @@
            (type (integer 0 1) null-padding)
            (type array-range sstart send))
   (macrolet ((ascii-bash ()
+               ;; KLUDGE: this depends on the fact that we know that
+               ;; our arrays are initialized with zeros.
                '(let ((array (make-array (+ null-padding (- send sstart))
                                          :element-type '(unsigned-byte 8))))
                  (loop for i from 0
                        and j from sstart below send
                        do (setf (aref array i) (char-code (char string j))))
-                 array)))
+                 array))
+             (output-code (tag)
+               `(case (char-len-as-utf8 code)
+                  (1 (add-byte code))
+                  (2 (add-byte (logior #xc0 (ldb (byte 5 6) code)))
+                     (add-byte (logior #x80 (ldb (byte 6 0) code))))
+                  (3 (when (<= #xd800 code #xdfff)
+                       (setf error-position i)
+                       (go ,tag))
+                     (add-byte (logior #xe0 (ldb (byte 4 12) code)))
+                     (add-byte (logior #x80 (ldb (byte 6 6) code)))
+                     (add-byte (logior #x80 (ldb (byte 6 0) code))))
+                  (4 (add-byte (logior #xf0 (ldb (byte 3 18) code)))
+                     (add-byte (logior #x80 (ldb (byte 6 12) code)))
+                     (add-byte (logior #x80 (ldb (byte 6 6) code)))
+                     (add-byte (logior #x80 (ldb (byte 6 0) code)))))))
     (etypecase string
       ((simple-array character (*))
        (let ((utf8-length 0))
@@ -149,40 +166,48 @@
              (ascii-bash)
              (let ((array (make-array (+ null-padding utf8-length)
                                       :element-type '(unsigned-byte 8)))
+                   (new-array nil)
+                   (error-position 0)
                    (index 0))
                (declare (type index index))
-               (flet ((add-byte (b)
-                        (setf (aref array index) b)
-                        (incf index)))
-                 (declare (inline add-byte))
-                 (loop for i of-type index from sstart below send
-                       do (let ((code (char-code (char string i))))
-                            (case (char-len-as-utf8 code)
-                              (1
-                               (add-byte code))
-                              (2
-                               (add-byte (logior #b11000000 (ldb (byte 5 6) code)))
-                               (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
-                              (3
-                               (add-byte (logior #b11100000 (ldb (byte 4 12) code)))
-                               (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
-                               (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
-                              (4
-                               (add-byte (logior #b11110000 (ldb (byte 3 18) code)))
-                               (add-byte (logior #b10000000 (ldb (byte 6 12) code)))
-                               (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
-                               (add-byte (logior #b10000000 (ldb (byte 6 0) code))))))
-                       finally (return array)))))))
+               (tagbody
+                :no-error
+                  (flet ((add-byte (b)
+                           (setf (aref array index) b)
+                           (incf index)))
+                    (declare (inline add-byte))
+                    (loop for i of-type index from sstart below send
+                          for code = (char-code (char string i))
+                          do (output-code :first-error)
+                          finally (return-from string->utf8 array)))
+                :first-error
+                  (setf new-array (make-array (* index 2) :adjustable t
+                                              :element-type '(unsigned-byte 8)
+                                              :fill-pointer index))
+                  (replace new-array array)
+                :error
+                  (let ((replacement (encoding-error :utf-8 string index)))
+                    (flet ((add-byte (b) (vector-push-extend b new-array)))
+                      (dotimes (i (length replacement))
+                        (add-byte (aref replacement i)))
+                      (loop for i of-type index from (1+ error-position) below send
+                            for code = (char-code (char string i))
+                            do (output-code :error)
+                            finally (return-from string->utf8
+                                      (progn
+                                        (unless (zerop null-padding)
+                                          (vector-push-extend 0 new-array))
+                                        (copy-seq new-array)))))))))))
       #!+sb-unicode
       ((simple-array base-char (*))
        ;; On unicode builds BASE-STRINGs are limited to ASCII range,
        ;; so we can take a fast path -- and get benefit of the element
        ;; type information. On non-unicode build BASE-CHAR ==
-       ;; CHARACTER.
+       ;; CHARACTER, handled above.
        (ascii-bash))
       ((simple-array nil (*))
        (if (= send sstart)
-           (make-array 0 :element-type '(unsigned-byte 8))
+           (make-array null-padding :element-type '(unsigned-byte 8))
            ;; Just get the error...
            (aref string sstart))))))
 
@@ -212,18 +237,12 @@
                        (declare (type (unsigned-byte 8) b))
                        (let ((ok (cond
                                    ((zerop (logand b #b10000000)) 1)
-                                   ((= (logand b #b11100000) #b11000000)
-                                    2)
-                                   ((= (logand b #b11110000) #b11100000)
-                                    3)
-                                   ((= (logand b #b11111000) #b11110000)
-                                    4)
-                                   ((= (logand b #b11111100) #b11111000)
-                                    5)
-                                   ((= (logand b #b11111110) #b11111100)
-                                    6)
-                                   (t
-                                    nil))))
+                                   ((and (= (logand b #b11100000) #b11000000)
+                                         (>= b #xc2)) 2)
+                                   ((= (logand b #b11110000) #b11100000) 3)
+                                   ((and (= (logand b #b11111000) #b11110000)
+                                         (<= b #xf4)) 4)
+                                   (t nil))))
                          (unless ok
                            (setf reject-reason 'invalid-utf8-starter-byte))
                          ok))
@@ -236,6 +255,14 @@
                        (let* ((idx (the array-range (+ pos x)))
                               (b (,accessor array idx))
                               (ok (= (logand b #b11000000) #b10000000)))
+                         (when (and ok (= x 1))
+                           (setf ok
+                                 (case initial-byte
+                                   (#xe0 (>= b #xa0))
+                                   (#xed (< b #xa0))
+                                   (#xf0 (>= b #x90))
+                                   (#xf4 (< b #x90))
+                                   (t t))))
                          (unless ok
                            (setf reject-reason 'invalid-utf8-continuation-byte)
                            (setf reject-position idx))
@@ -251,14 +278,6 @@
                             (loop for i from 1 below (min len remaining-bytes)
                                   always (valid-secondary-p i))
                             (enough-bytes-left-p len)))
-                     (overlong-chk (x y)
-                       (let ((ok (or (/= initial-byte x)
-                                     (/= (logior (,accessor array (the array-range (+ pos 1)))
-                                                 y)
-                                         y))))
-                         (unless ok
-                           (setf reject-reason 'overlong-utf8-sequence))
-                         ok))
                      (character-below-char-code-limit-p ()
                        ;; This is only called on a four-byte sequence
                        ;; (two in non-unicode builds) to ensure we
@@ -288,43 +307,27 @@
               (declare (inline valid-utf8-starter-byte-p
                                enough-bytes-left-p
                                valid-secondary-p
-                               preliminary-ok-for-length
-                               overlong-chk))
+                               preliminary-ok-for-length))
               (let ((maybe-len (valid-utf8-starter-byte-p initial-byte)))
                 (cond ((eql maybe-len 1)
                        (values 1 nil))
                       ((and (preliminary-ok-for-length maybe-len 2)
-                            (overlong-chk #b11000000 #b10111111)
-                            (overlong-chk #b11000001 #b10111111)
                             #!-sb-unicode (character-below-char-code-limit-p))
                        (values 2 nil))
                       ((and (preliminary-ok-for-length maybe-len 3)
-                            (overlong-chk #b11100000 #b10011111)
                             #!-sb-unicode (not (setf reject-reason 'character-out-of-range)))
                        (values 3 nil))
                       ((and (preliminary-ok-for-length maybe-len 4)
-                            (overlong-chk #b11110000 #b10001111)
                             #!-sb-unicode (not (setf reject-reason 'character-out-of-range))
                             (character-below-char-code-limit-p))
                        (values 4 nil))
-                      ((and (preliminary-ok-for-length maybe-len 5)
-                            (overlong-chk #b11111000 #b10000111)
-                            (not (setf reject-reason 'character-out-of-range)))
-                       (bug "can't happen"))
-                      ((and (preliminary-ok-for-length maybe-len 6)
-                            (overlong-chk #b11111100 #b10000011)
-                            (not (setf reject-reason 'character-out-of-range)))
-                       (bug "can't happen"))
                       (t
-                       (let* ((bad-end (ecase reject-reason
-                                         (invalid-utf8-starter-byte
-                                          (1+ pos))
-                                         (end-of-input-in-character
-                                          end)
-                                         (invalid-utf8-continuation-byte
-                                          reject-position)
-                                         ((overlong-utf8-sequence character-out-of-range)
-                                          (+ pos maybe-len))))
+                       (let* ((bad-end
+                               (ecase reject-reason
+                                 (invalid-utf8-starter-byte (1+ pos))
+                                 (end-of-input-in-character end)
+                                 (invalid-utf8-continuation-byte reject-position)
+                                 (character-out-of-range (+ pos maybe-len))))
                               (bad-len (- bad-end pos)))
                          (declare (type array-range bad-end bad-len))
                          (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position)))
@@ -379,7 +382,7 @@
           (coerce string 'simple-string))))))
 (instantiate-octets-definition define-utf8->string)
 
-(define-external-format/variable-width (:utf-8 :utf8) nil
+(define-external-format/variable-width (:utf-8 :utf8) t
   (let ((bits (char-code byte)))
     (cond ((< bits #x80) 1)
           ((< bits #x800) 2)
@@ -389,7 +392,9 @@
     (1 (setf (sap-ref-8 sap tail) bits))
     (2 (setf (sap-ref-8 sap tail)       (logior #xc0 (ldb (byte 5 6) bits))
              (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits))))
-    (3 (setf (sap-ref-8 sap tail)       (logior #xe0 (ldb (byte 4 12) bits))
+    (3 (when (<= #xd800 bits #xdfff)
+         (external-format-encoding-error stream bits))
+       (setf (sap-ref-8 sap tail)       (logior #xe0 (ldb (byte 4 12) bits))
              (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits))
              (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits))))
     (4 (setf (sap-ref-8 sap tail)       (logior #xf0 (ldb (byte 3 18) bits))
@@ -410,7 +415,9 @@
                (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
                         (byte3 (sap-ref-8 sap (+ 2 head))))
                     (unless (and (<= #x80 byte2 #xbf)
-                                 (<= #x80 byte3 #xbf))
+                                 (<= #x80 byte3 #xbf)
+                                 (or (/= byte #xe0) (<= #xa0 byte2 #xbf))
+                                 (or (/= byte #xed) (<= #x80 byte2 #x9f)))
                       (return-from decode-break-reason 3))
                     (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3))))
                (4 (let ((byte2 (sap-ref-8 sap (1+ head)))
@@ -418,7 +425,9 @@
                         (byte4 (sap-ref-8 sap (+ 3 head))))
                     (unless (and (<= #x80 byte2 #xbf)
                                  (<= #x80 byte3 #xbf)
-                                 (<= #x80 byte4 #xbf))
+                                 (<= #x80 byte4 #xbf)
+                                 (or (/= byte #xf0) (<= #x90 byte2 #xbf))
+                                 (or (/= byte #xf4) (<= #x80 byte2 #x8f)))
                       (return-from decode-break-reason 4))
                     (dpb byte (byte 3 18)
                          (dpb byte2 (byte 6 12)
diff --git a/src/code/octets.lisp b/src/code/octets.lisp
index d4d5817..0d155f1 100644
--- a/src/code/octets.lisp
+++ b/src/code/octets.lisp
@@ -74,20 +74,13 @@ one-past-the-end"
 ;;;   character-out-of-range
 ;;;   invalid-utf8-starter-byte
 ;;;   invalid-utf8-continuation-byte
-;;;   overlong-utf8-sequence
 ;;;
 ;;; Of these, the only one truly likely to be of interest to calling
 ;;; code is end-of-input-in-character (in which case it's likely to
 ;;; want to make a note of octet-decoding-error-start, supply "" as a
 ;;; replacement string, and then move that last chunk of bytes to the
 ;;; beginning of its buffer for the next go round) but they're all
-;;; provided on the off chance they're of interest.  The next most
-;;; likely interesting option is overlong-utf8-sequence -- the
-;;; application, if it cares to, can decode this itself (taking care
-;;; to ensure that the result isn't out of range of CHAR-CODE-LIMIT)
-;;; and return that result.  This library doesn't provide support for
-;;; that as a conforming UTF-8-using program is supposed to treat it
-;;; as an error.
+;;; provided on the off chance they're of interest.
 
 (define-condition octet-decoding-error (character-decoding-error)
   ((array :initarg :array :accessor octet-decoding-error-array)
diff --git a/tests/octets.pure.lisp b/tests/octets.pure.lisp
index 4bf7f18..cb779c5 100644
--- a/tests/octets.pure.lisp
+++ b/tests/octets.pure.lisp
@@ -51,7 +51,8 @@
          (ensure-roundtrip-utf8 ()
            (let ((string (make-string char-code-limit)))
              (dotimes (i char-code-limit)
-               (setf (char string i) (code-char i)))
+               (unless (<= #xd800 i #xdfff)
+                 (setf (char string i) (code-char i))))
              (let ((string2
                     (octets-to-string (string-to-octets string :external-format :utf8)
                                       :external-format :utf8)))
@@ -131,12 +132,21 @@
     (utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800
     (utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff
     (utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000
-  (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
-  (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
-  (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
-  (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
-  (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
-  (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?") ; #x7fffffff
+  #+nil ; old, 6-byte UTF-8 definition
+  (progn
+    (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
+    (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
+    (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
+    (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
+    (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
+    (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?")) ; #x7fffffff
+  (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition
+    (utf8-decode-tests #(#xf4 #x90) "??") ; #x110000
+    (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "????") ; #x1fffff
+    (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?????") ; #x200000
+    (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?????") ; #x3ffffff
+    (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "??????") ; #x4000000
+    (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "??????")) ; #x7fffffff
 
   ;; Unexpected continuation bytes
   (utf8-decode-tests #(#x80) "?")
@@ -178,19 +188,24 @@
 
   ;; Otherwise incomplete sequences (last continuation byte missing)
   (utf8-decode-tests #0=#(#xc0) "?")
-  (utf8-decode-tests #1=#(#xe0 #x80) "?")
-  (utf8-decode-tests #2=#(#xf0 #x80 #x80) "?")
+  (utf8-decode-tests #1=#(#xe0 #xa0) "?")
+  (utf8-decode-tests #2=#(#xf0 #x90 #x80) "?")
+  #+nil
   (utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?")
+  #+nil
   (utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?")
   (utf8-decode-tests #5=#(#xdf) "?")
   (utf8-decode-tests #6=#(#xef #xbf) "?")
+  #+nil
   (utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?")
+  #+nil
   (utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?")
+  #+nil
   (utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?")
 
   ;; All ten previous tests concatenated
-  (utf8-decode-tests (concatenate 'vector #0# #1# #2# #3# #4# #5# #6# #7# #8# #9#)
-                     "??????????")
+  (utf8-decode-tests (concatenate 'vector #0# #1# #2# #5# #6#)
+                     "?????")
 
   ;; Random impossible bytes
   (utf8-decode-tests #(#xfe) "?")
@@ -198,25 +213,25 @@
   (utf8-decode-tests #(#xfe #xfe #xff #xff) "????")
 
   ;; Overlong sequences - /
-  (utf8-decode-tests #(#xc0 #xaf) "?")
-  (utf8-decode-tests #(#xe0 #x80 #xaf) "?")
-  (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "?")
-  (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?")
-  (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "?")
+  (utf8-decode-tests #(#xc0 #xaf) "??")
+  (utf8-decode-tests #(#xe0 #x80 #xaf) "???")
+  (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "????")
+  (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?????")
+  (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "??????")
 
   ;; Overlong sequences - #\Rubout
-  (utf8-decode-tests #(#xc1 #xbf) "?")
-  (utf8-decode-tests #(#xe0 #x9f #xbf) "?")
-  (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "?")
-  (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?")
-  (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "?")
+  (utf8-decode-tests #(#xc1 #xbf) "??")
+  (utf8-decode-tests #(#xe0 #x9f #xbf) "???")
+  (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "????")
+  (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?????")
+  (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "??????")
 
   ;; Overlong sequences - #\Null
-  (utf8-decode-tests #(#xc0 #x80) "?")
-  (utf8-decode-tests #(#xe0 #x80 #x80) "?")
-  (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "?")
-  (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?")
-  (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "?")
+  (utf8-decode-tests #(#xc0 #x80) "??")
+  (utf8-decode-tests #(#xe0 #x80 #x80) "???")
+  (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "????")
+  (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?????")
+  (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "??????")
 
   ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
   ;; perfectly good sbcl chars even if they're not actually ISO 10646
@@ -236,6 +251,9 @@
                                       :external-format :utf-8)))
 (assert (equalp #() (string-to-octets (make-array 5 :element-type nil)
                                       :start 3 :end 3 :external-format :utf-8)))
+(assert (equalp #(0) (string-to-octets (make-array 5 :element-type nil)
+                                       :start 3 :end 3 :null-terminate t
+                                       :external-format :utf-8)))
 
 ;;; whoops: the iso-8859-2 format referred to an undefined symbol.
 #+sb-unicode
@@ -251,3 +269,17 @@
                       (coerce #(182 123 253 238) '(vector (unsigned-byte 8)))
                       :external-format :euc-jp)))))
 
+#+sb-unicode
+(with-test (:name (:utf-8 :surrogates :encoding-errors))
+  (handler-bind ((sb-int:character-encoding-error
+                  (lambda (c) (use-value #\? c))))
+    (assert (equalp (string-to-octets (string (code-char #xd800))
+                                      :external-format :utf-8)
+                    (vector (char-code #\?))))))
+#+sb-unicode
+(with-test (:name (:utf-8 :surrogates :decoding-errors))
+  (handler-bind ((sb-int:character-decoding-error
+                  (lambda (c) (use-value #\? c))))
+    (assert (find #\? (octets-to-string
+                       (coerce #(237 160 128) '(vector (unsigned-byte 8)))
+                       :external-format :utf-8)))))
diff --git a/version.lisp-expr b/version.lisp-expr
index d484356..cea6f96 100644
--- a/version.lisp-expr
+++ b/version.lisp-expr
@@ -17,4 +17,4 @@
 ;;; checkins which aren't released. (And occasionally for internal
 ;;; versions, especially for internal versions off the main CVS
 ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.32.16"
+"1.0.32.17"