X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=tests%2Foctets.pure.lisp;h=e78e2db29493630e035897f45f7713a9df480093;hb=d7875c296a4988e9f27e2776237884deb1984c62;hp=166d01b4bc65fc1d1f41d21b7495b51dbc75e321;hpb=d8ad375f27c0006f8896047686a065b8243c7e0f;p=sbcl.git diff --git a/tests/octets.pure.lisp b/tests/octets.pure.lisp index 166d01b..e78e2db 100644 --- a/tests/octets.pure.lisp +++ b/tests/octets.pure.lisp @@ -51,7 +51,8 @@ (ensure-roundtrip-utf8 () (let ((string (make-string char-code-limit))) (dotimes (i char-code-limit) - (setf (char string i) (code-char i))) + (unless (<= #xd800 i #xdfff) + (setf (char string i) (code-char i)))) (let ((string2 (octets-to-string (string-to-octets string :external-format :utf8) :external-format :utf8))) @@ -95,24 +96,30 @@ l9c)))) (ensure-roundtrip-utf8) - (let ((non-ascii-bytes (make-array 128 - :element-type '(unsigned-byte 8) - :initial-contents (loop for i from 128 below 256 - collect i)))) - (handler-bind ((sb-int:character-decoding-error - (lambda (c) - (use-value "??" c)))) - (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii) - (make-string 256 :initial-element #\?))))) - (let ((non-ascii-chars (make-array 128 - :element-type 'character - :initial-contents (loop for i from 128 below 256 - collect (code-char i))))) - (handler-bind ((sb-int:character-encoding-error - (lambda (c) - (use-value "??" c)))) - (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii) - (make-array 256 :initial-element (char-code #\?)))))) + (with-test (:name (:ascii :decoding-error use-value)) + (let ((non-ascii-bytes (make-array 128 + :element-type '(unsigned-byte 8) + :initial-contents (loop for i from 128 below 256 collect i))) + (error-count 0)) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) + (incf error-count) + (use-value "??" c)))) + (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii) + (make-string 256 :initial-element #\?))) + (assert (= error-count 128))))) + (with-test (:name (:ascii :encoding-error use-value)) + (let ((non-ascii-chars (make-array 128 + :element-type 'character + :initial-contents (loop for i from 128 below 256 collect (code-char i)))) + (error-count 0)) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) + (incf error-count) + (use-value "??" c)))) + (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii) + (make-array 256 :initial-element (char-code #\?)))) + (assert (= error-count 128))))) ;; From Markus Kuhn's UTF-8 test file: ;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt @@ -125,12 +132,21 @@ (utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800 (utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff (utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000 - (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000 - (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff - (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000 - (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff - (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000 - (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?") ; #x7fffffff + #+nil ; old, 6-byte UTF-8 definition + (progn + (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000 + (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff + (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000 + (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff + (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000 + (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?")) ; #x7fffffff + (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition + (utf8-decode-tests #(#xf4 #x90) "??") ; #x110000 + (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "????") ; #x1fffff + (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?????") ; #x200000 + (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?????") ; #x3ffffff + (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "??????") ; #x4000000 + (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "??????")) ; #x7fffffff ;; Unexpected continuation bytes (utf8-decode-tests #(#x80) "?") @@ -172,19 +188,24 @@ ;; Otherwise incomplete sequences (last continuation byte missing) (utf8-decode-tests #0=#(#xc0) "?") - (utf8-decode-tests #1=#(#xe0 #x80) "?") - (utf8-decode-tests #2=#(#xf0 #x80 #x80) "?") + (utf8-decode-tests #1=#(#xe0 #xa0) "?") + (utf8-decode-tests #2=#(#xf0 #x90 #x80) "?") + #+nil (utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?") + #+nil (utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?") (utf8-decode-tests #5=#(#xdf) "?") (utf8-decode-tests #6=#(#xef #xbf) "?") + #+nil (utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?") + #+nil (utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?") + #+nil (utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?") ;; All ten previous tests concatenated - (utf8-decode-tests (concatenate 'vector #0# #1# #2# #3# #4# #5# #6# #7# #8# #9#) - "??????????") + (utf8-decode-tests (concatenate 'vector #0# #1# #2# #5# #6#) + "?????") ;; Random impossible bytes (utf8-decode-tests #(#xfe) "?") @@ -192,25 +213,25 @@ (utf8-decode-tests #(#xfe #xfe #xff #xff) "????") ;; Overlong sequences - / - (utf8-decode-tests #(#xc0 #xaf) "?") - (utf8-decode-tests #(#xe0 #x80 #xaf) "?") - (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "?") - (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?") - (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "?") + (utf8-decode-tests #(#xc0 #xaf) "??") + (utf8-decode-tests #(#xe0 #x80 #xaf) "???") + (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "????") + (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?????") + (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "??????") ;; Overlong sequences - #\Rubout - (utf8-decode-tests #(#xc1 #xbf) "?") - (utf8-decode-tests #(#xe0 #x9f #xbf) "?") - (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "?") - (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?") - (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "?") + (utf8-decode-tests #(#xc1 #xbf) "??") + (utf8-decode-tests #(#xe0 #x9f #xbf) "???") + (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "????") + (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?????") + (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "??????") ;; Overlong sequences - #\Null - (utf8-decode-tests #(#xc0 #x80) "?") - (utf8-decode-tests #(#xe0 #x80 #x80) "?") - (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "?") - (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?") - (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "?") + (utf8-decode-tests #(#xc0 #x80) "??") + (utf8-decode-tests #(#xe0 #x80 #x80) "???") + (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "????") + (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?????") + (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "??????") ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're ;; perfectly good sbcl chars even if they're not actually ISO 10646 @@ -220,6 +241,158 @@ ) -;; regression test: STRING->UTF8 didn't properly handle a non-zero -;; START argument. +;;; regression test: STRING->UTF8 didn't properly handle a non-zero +;;; START argument. (assert (equalp #(50) (string-to-octets "42" :start 1 :external-format :utf-8))) + +;;; STRING->UTF8 should cope with NIL strings if a null range is required +(assert (equalp #() (string-to-octets "" :external-format :utf-8))) +(assert (equalp #() (string-to-octets (make-array 0 :element-type nil) + :external-format :utf-8))) +(assert (equalp #() (string-to-octets (make-array 5 :element-type nil) + :start 3 :end 3 :external-format :utf-8))) +(assert (equalp #(0) (string-to-octets (make-array 5 :element-type nil) + :start 3 :end 3 :null-terminate t + :external-format :utf-8))) + +;;; whoops: the iso-8859-2 format referred to an undefined symbol. +#+sb-unicode +(assert (equalp #(251) (string-to-octets (string (code-char 369)) + :external-format :latin-2))) + +(with-test (:name (:euc-jp :decoding-errors) :skipped-on '(not :sb-unicode)) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value #\? c)))) + (assert (string= "?{?" + (octets-to-string + (coerce #(182 123 253 238) '(vector (unsigned-byte 8))) + :external-format :euc-jp))))) + +(with-test (:name (:utf-8 :surrogates :encoding-errors) :skipped-on '(not :sb-unicode)) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) (use-value #\? c)))) + (assert (equalp (string-to-octets (string (code-char #xd800)) + :external-format :utf-8) + (vector (char-code #\?)))))) +(with-test (:name (:utf-8 :surrogates :decoding-errors) :skipped-on '(not :sb-unicode)) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value #\? c)))) + (assert (find #\? (octets-to-string + (coerce #(237 160 128) '(vector (unsigned-byte 8))) + :external-format :utf-8))))) + +(with-test (:name (:ucs-2 :out-of-range :encoding-errors) :skipped-on '(not :sb-unicode)) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (string-to-octets (string (code-char #x10001)) + :external-format :ucs-2le) + #(63 0 63 0 63 0)))) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (string-to-octets (string (code-char #x10001)) + :external-format :ucs-2be) + #(0 63 0 63 0 63))))) + +(with-test (:name (:ucs-4 :out-of-range :decoding-errors) :skipped-on '(not :sb-unicode)) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8))) + :external-format :ucs-4le) + "???"))) + (assert (equalp (octets-to-string (coerce '(#xff #xff #x10 #x00) '(vector (unsigned-byte 8))) + :external-format :ucs-4le) + (string (code-char #x10ffff)))) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8))) + :external-format :ucs-4be) + "???")) + (assert (equalp (octets-to-string (coerce '(#x00 #x10 #xff #xff) '(vector (unsigned-byte 8))) + :external-format :ucs-4be) + (string (code-char #x10ffff)))))) + +(with-test (:name (:utf-16le :ensure-roundtrip) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-16le)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16le))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(#x20 0 0 #x2 0 #x20 0 #xd8 0 #xdc 1 #xd8 1 #xdc #xff #xdb #xfd #xdf))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) + +(with-test (:name (:utf-16le :encoding-error) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-16le :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16le))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(63 0 63 0 63 0 63 0)))))) + +(with-test (:name (:utf-16be :ensure-roundtrip) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-16be)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16be))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(0 #x20 #x2 0 #x20 0 #xd8 0 #xdc 0 #xd8 1 #xdc 1 #xdb #xff #xdf #xfd))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) + +(with-test (:name (:utf-16be :encoding-error) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-16be :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16be))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(0 63 0 63 0 63 0 63)))))) + + +(with-test (:name (:utf-32le :ensure-roundtrip) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-32le)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32le))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(#x20 0 0 0 0 #x2 0 0 0 #x20 0 0 0 0 1 0 1 4 1 0 #xfd #xff #x10 0))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) + +(with-test (:name (:utf-32le :encoding-error) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-32le :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32le))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(63 0 0 0 63 0 0 0 63 0 0 0 63 0 0 0)))))) + + +(with-test (:name (:utf-32be :ensure-roundtrip) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-32be)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32be))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(0 0 0 #x20 0 0 #x2 0 0 0 #x20 0 0 1 0 0 0 1 4 1 0 #x10 #xff #xfd))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) + +(with-test (:name (:utf-32be :encoding-error) :skipped-on '(not :sb-unicode)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-32be :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32be))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(0 0 0 63 0 0 0 63 0 0 0 63 0 0 0 63))))))