(ensure-roundtrip-utf8 ()
(let ((string (make-string char-code-limit)))
(dotimes (i char-code-limit)
- (setf (char string i) (code-char i)))
+ (unless (<= #xd800 i #xdfff)
+ (setf (char string i) (code-char i))))
(let ((string2
(octets-to-string (string-to-octets string :external-format :utf8)
:external-format :utf8)))
l9c))))
(ensure-roundtrip-utf8)
- (let ((non-ascii-bytes (make-array 128
- :element-type '(unsigned-byte 8)
- :initial-contents (loop for i from 128 below 256
- collect i))))
- (handler-bind ((sb-int:character-decoding-error
- (lambda (c)
- (use-value "??" c))))
- (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii)
- (make-string 256 :initial-element #\?)))))
- (let ((non-ascii-chars (make-array 128
- :element-type 'character
- :initial-contents (loop for i from 128 below 256
- collect (code-char i)))))
- (handler-bind ((sb-int:character-encoding-error
- (lambda (c)
- (use-value "??" c))))
- (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii)
- (make-array 256 :initial-element (char-code #\?))))))
+ (with-test (:name (:ascii :decoding-error use-value))
+ (let ((non-ascii-bytes (make-array 128
+ :element-type '(unsigned-byte 8)
+ :initial-contents (loop for i from 128 below 256 collect i)))
+ (error-count 0))
+ (handler-bind ((sb-int:character-decoding-error
+ (lambda (c)
+ (incf error-count)
+ (use-value "??" c))))
+ (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii)
+ (make-string 256 :initial-element #\?)))
+ (assert (= error-count 128)))))
+ (with-test (:name (:ascii :encoding-error use-value))
+ (let ((non-ascii-chars (make-array 128
+ :element-type 'character
+ :initial-contents (loop for i from 128 below 256 collect (code-char i))))
+ (error-count 0))
+ (handler-bind ((sb-int:character-encoding-error
+ (lambda (c)
+ (incf error-count)
+ (use-value "??" c))))
+ (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii)
+ (make-array 256 :initial-element (char-code #\?))))
+ (assert (= error-count 128)))))
;; From Markus Kuhn's UTF-8 test file:
;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
(utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800
(utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff
(utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000
- (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
- (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
- (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
- (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
- (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
- (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?") ; #x7fffffff
+ #+nil ; old, 6-byte UTF-8 definition
+ (progn
+ (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
+ (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
+ (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
+ (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
+ (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
+ (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?")) ; #x7fffffff
+ (progn ; new, 4-byte (maximum #x10ffff) UTF-8 definition
+ (utf8-decode-tests #(#xf4 #x90) "??") ; #x110000
+ (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "????") ; #x1fffff
+ (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?????") ; #x200000
+ (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?????") ; #x3ffffff
+ (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "??????") ; #x4000000
+ (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "??????")) ; #x7fffffff
;; Unexpected continuation bytes
(utf8-decode-tests #(#x80) "?")
;; Otherwise incomplete sequences (last continuation byte missing)
(utf8-decode-tests #0=#(#xc0) "?")
- (utf8-decode-tests #1=#(#xe0 #x80) "?")
- (utf8-decode-tests #2=#(#xf0 #x80 #x80) "?")
+ (utf8-decode-tests #1=#(#xe0 #xa0) "?")
+ (utf8-decode-tests #2=#(#xf0 #x90 #x80) "?")
+ #+nil
(utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?")
+ #+nil
(utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?")
(utf8-decode-tests #5=#(#xdf) "?")
(utf8-decode-tests #6=#(#xef #xbf) "?")
+ #+nil
(utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?")
+ #+nil
(utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?")
+ #+nil
(utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?")
;; All ten previous tests concatenated
- (utf8-decode-tests (concatenate 'vector #0# #1# #2# #3# #4# #5# #6# #7# #8# #9#)
- "??????????")
+ (utf8-decode-tests (concatenate 'vector #0# #1# #2# #5# #6#)
+ "?????")
;; Random impossible bytes
(utf8-decode-tests #(#xfe) "?")
(utf8-decode-tests #(#xfe #xfe #xff #xff) "????")
;; Overlong sequences - /
- (utf8-decode-tests #(#xc0 #xaf) "?")
- (utf8-decode-tests #(#xe0 #x80 #xaf) "?")
- (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "?")
- (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?")
- (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "?")
+ (utf8-decode-tests #(#xc0 #xaf) "??")
+ (utf8-decode-tests #(#xe0 #x80 #xaf) "???")
+ (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "????")
+ (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?????")
+ (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "??????")
;; Overlong sequences - #\Rubout
- (utf8-decode-tests #(#xc1 #xbf) "?")
- (utf8-decode-tests #(#xe0 #x9f #xbf) "?")
- (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "?")
- (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?")
- (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "?")
+ (utf8-decode-tests #(#xc1 #xbf) "??")
+ (utf8-decode-tests #(#xe0 #x9f #xbf) "???")
+ (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "????")
+ (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?????")
+ (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "??????")
;; Overlong sequences - #\Null
- (utf8-decode-tests #(#xc0 #x80) "?")
- (utf8-decode-tests #(#xe0 #x80 #x80) "?")
- (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "?")
- (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?")
- (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "?")
+ (utf8-decode-tests #(#xc0 #x80) "??")
+ (utf8-decode-tests #(#xe0 #x80 #x80) "???")
+ (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "????")
+ (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?????")
+ (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "??????")
;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
;; perfectly good sbcl chars even if they're not actually ISO 10646
)
-;; regression test: STRING->UTF8 didn't properly handle a non-zero
-;; START argument.
+;;; regression test: STRING->UTF8 didn't properly handle a non-zero
+;;; START argument.
(assert (equalp #(50) (string-to-octets "42" :start 1 :external-format :utf-8)))
+
+;;; STRING->UTF8 should cope with NIL strings if a null range is required
+(assert (equalp #() (string-to-octets "" :external-format :utf-8)))
+(assert (equalp #() (string-to-octets (make-array 0 :element-type nil)
+ :external-format :utf-8)))
+(assert (equalp #() (string-to-octets (make-array 5 :element-type nil)
+ :start 3 :end 3 :external-format :utf-8)))
+(assert (equalp #(0) (string-to-octets (make-array 5 :element-type nil)
+ :start 3 :end 3 :null-terminate t
+ :external-format :utf-8)))
+
+;;; whoops: the iso-8859-2 format referred to an undefined symbol.
+#+sb-unicode
+(assert (equalp #(251) (string-to-octets (string (code-char 369))
+ :external-format :latin-2)))
+
+(with-test (:name (:euc-jp :decoding-errors) :skipped-on '(not :sb-unicode))
+ (handler-bind ((sb-int:character-decoding-error
+ (lambda (c) (use-value #\? c))))
+ (assert (string= "?{?"
+ (octets-to-string
+ (coerce #(182 123 253 238) '(vector (unsigned-byte 8)))
+ :external-format :euc-jp)))))
+
+(with-test (:name (:utf-8 :surrogates :encoding-errors) :skipped-on '(not :sb-unicode))
+ (handler-bind ((sb-int:character-encoding-error
+ (lambda (c) (use-value #\? c))))
+ (assert (equalp (string-to-octets (string (code-char #xd800))
+ :external-format :utf-8)
+ (vector (char-code #\?))))))
+(with-test (:name (:utf-8 :surrogates :decoding-errors) :skipped-on '(not :sb-unicode))
+ (handler-bind ((sb-int:character-decoding-error
+ (lambda (c) (use-value #\? c))))
+ (assert (find #\? (octets-to-string
+ (coerce #(237 160 128) '(vector (unsigned-byte 8)))
+ :external-format :utf-8)))))
+
+(with-test (:name (:ucs-2 :out-of-range :encoding-errors) :skipped-on '(not :sb-unicode))
+ (handler-bind ((sb-int:character-encoding-error
+ (lambda (c) (use-value "???" c))))
+ (assert (equalp (string-to-octets (string (code-char #x10001))
+ :external-format :ucs-2le)
+ #(63 0 63 0 63 0))))
+ (handler-bind ((sb-int:character-encoding-error
+ (lambda (c) (use-value "???" c))))
+ (assert (equalp (string-to-octets (string (code-char #x10001))
+ :external-format :ucs-2be)
+ #(0 63 0 63 0 63)))))
+
+(with-test (:name (:ucs-4 :out-of-range :decoding-errors) :skipped-on '(not :sb-unicode))
+ (handler-bind ((sb-int:character-decoding-error
+ (lambda (c) (use-value "???" c))))
+ (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
+ :external-format :ucs-4le)
+ "???")))
+ (assert (equalp (octets-to-string (coerce '(#xff #xff #x10 #x00) '(vector (unsigned-byte 8)))
+ :external-format :ucs-4le)
+ (string (code-char #x10ffff))))
+ (handler-bind ((sb-int:character-decoding-error
+ (lambda (c) (use-value "???" c))))
+ (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8)))
+ :external-format :ucs-4be)
+ "???"))
+ (assert (equalp (octets-to-string (coerce '(#x00 #x10 #xff #xff) '(vector (unsigned-byte 8)))
+ :external-format :ucs-4be)
+ (string (code-char #x10ffff))))))
+
+(with-test (:name (:utf-16le :ensure-roundtrip) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format :utf-16le))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-16le)))
+ (let ((string (map 'string 'code-char
+ '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
+ (octets #(#x20 0 0 #x2 0 #x20 0 #xd8 0 #xdc 1 #xd8 1 #xdc #xff #xdb #xfd #xdf)))
+ (assert (equalp (enc string) octets))
+ (assert (equalp (dec octets) string)))))
+
+(with-test (:name (:utf-16le :encoding-error) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format '(:utf-16le :replacement #\?)))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-16le)))
+ (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
+ (assert (equalp (enc string) #(63 0 63 0 63 0 63 0))))))
+
+(with-test (:name (:utf-16be :ensure-roundtrip) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format :utf-16be))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-16be)))
+ (let ((string (map 'string 'code-char
+ '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
+ (octets #(0 #x20 #x2 0 #x20 0 #xd8 0 #xdc 0 #xd8 1 #xdc 1 #xdb #xff #xdf #xfd)))
+ (assert (equalp (enc string) octets))
+ (assert (equalp (dec octets) string)))))
+
+(with-test (:name (:utf-16be :encoding-error) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format '(:utf-16be :replacement #\?)))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-16be)))
+ (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
+ (assert (equalp (enc string) #(0 63 0 63 0 63 0 63))))))
+
+
+(with-test (:name (:utf-32le :ensure-roundtrip) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format :utf-32le))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-32le)))
+ (let ((string (map 'string 'code-char
+ '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
+ (octets #(#x20 0 0 0 0 #x2 0 0 0 #x20 0 0 0 0 1 0 1 4 1 0 #xfd #xff #x10 0)))
+ (assert (equalp (enc string) octets))
+ (assert (equalp (dec octets) string)))))
+
+(with-test (:name (:utf-32le :encoding-error) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format '(:utf-32le :replacement #\?)))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-32le)))
+ (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
+ (assert (equalp (enc string) #(63 0 0 0 63 0 0 0 63 0 0 0 63 0 0 0))))))
+
+
+(with-test (:name (:utf-32be :ensure-roundtrip) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format :utf-32be))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-32be)))
+ (let ((string (map 'string 'code-char
+ '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd)))
+ (octets #(0 0 0 #x20 0 0 #x2 0 0 0 #x20 0 0 1 0 0 0 1 4 1 0 #x10 #xff #xfd)))
+ (assert (equalp (enc string) octets))
+ (assert (equalp (dec octets) string)))))
+
+(with-test (:name (:utf-32be :encoding-error) :skipped-on '(not :sb-unicode))
+ (flet ((enc (x)
+ (string-to-octets x :external-format '(:utf-32be :replacement #\?)))
+ (dec (x)
+ (octets-to-string (coerce x '(vector (unsigned-byte 8)))
+ :external-format :utf-32be)))
+ (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff))))
+ (assert (equalp (enc string) #(0 0 0 63 0 0 0 63 0 0 0 63 0 0 0 63))))))