tests/octets.pure.lisp

   1 ;;;; tests of octet/character machinery with no side effects
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; While most of SBCL is derived from the CMU CL system, the test
   7 ;;;; files (like this one) were written from scratch after the fork
   8 ;;;; from CMU CL.
   9 ;;;;
  10 ;;;; This software is in the public domain and is provided with
  11 ;;;; absolutely no warranty. See the COPYING and CREDITS files for
  12 ;;;; more information.
  13
  14 (cl:in-package :cl-user)
  15
  16 (locally
  17     (declare (optimize debug (speed 0)))
  18
  19 (labels ((ub8 (len-or-seq)
  20            (if (numberp len-or-seq)
  21                (make-array len-or-seq :element-type '(unsigned-byte 8) :initial-element 0)
  22                (coerce len-or-seq '(simple-array (unsigned-byte 8) (*)))))
  23
  24          (ensure-roundtrip-ascii ()
  25            (let ((octets (ub8 128)))
  26              (dotimes (i 128)
  27                (setf (aref octets i) i))
  28              (let* ((str (octets-to-string octets :external-format :ascii))
  29                     (oct2 (string-to-octets str :external-format :ascii)))
  30                (assert (= (length octets) (length oct2)))
  31                (assert (every #'= octets oct2))))
  32            t)
  33
  34          (ensure-roundtrip-latin (format)
  35            (let ((octets (ub8 256)))
  36              (dotimes (i 256)
  37                (setf (aref octets i) i))
  38              (let* ((str (octets-to-string octets :external-format format))
  39                     (oct2 (string-to-octets str :external-format format)))
  40                (assert (= (length octets) (length oct2)))
  41                (assert (every #'= octets oct2))))
  42            t)
  43
  44          (ensure-roundtrip-latin1 ()
  45            (ensure-roundtrip-latin :latin1))
  46
  47          #+sb-unicode
  48          (ensure-roundtrip-latin9 ()
  49            (ensure-roundtrip-latin :latin9))
  50
  51          (ensure-roundtrip-utf8 ()
  52            (let ((string (make-string char-code-limit)))
  53              (dotimes (i char-code-limit)
  54                (setf (char string i) (code-char i)))
  55              (let ((string2
  56                     (octets-to-string (string-to-octets string :external-format :utf8)
  57                                       :external-format :utf8)))
  58                (assert (= (length string2) (length string)))
  59                (assert (string= string string2))))
  60            t)
  61
  62          (utf8-decode-test (octets expected-results expected-errors)
  63            (let ((error-count 0))
  64              (handler-bind ((sb-int:character-decoding-error
  65                              (lambda (c)
  66                                (incf error-count)
  67                                (use-value "?" c))))
  68                (assert (string= expected-results
  69                                 (octets-to-string (ub8 octets)
  70                                                   :external-format :utf-8)))
  71                (assert (= error-count expected-errors)))))
  72
  73          (utf8-decode-tests (octets expected-results)
  74            (let ((expected-errors (count #\? expected-results)))
  75              (utf8-decode-test octets expected-results expected-errors)
  76              (utf8-decode-test (concatenate 'vector
  77                                             '(34)
  78                                             octets
  79                                             '(34))
  80                                (format nil "\"~A\"" expected-results)
  81                                expected-errors))))
  82
  83   (ensure-roundtrip-ascii)
  84   (ensure-roundtrip-latin1)
  85   #+sb-unicode
  86   (progn
  87     (ensure-roundtrip-latin9)
  88     ;; Latin-9 chars; the previous test checked roundtrip from
  89     ;; octets->char and back, now test that the latin-9 characters did
  90     ;; in fact appear during that trip.
  91     (let ((l9c (map 'string #'code-char '(8364 352 353 381 382 338 339 376))))
  92       (assert
  93        (string= (octets-to-string (string-to-octets l9c :external-format :latin9)
  94                                   :external-format :latin9)
  95                 l9c))))
  96   (ensure-roundtrip-utf8)
  97
  98   (let ((non-ascii-bytes (make-array 128
  99                                      :element-type '(unsigned-byte 8)
 100                                      :initial-contents (loop for i from 128 below 256
 101                                                              collect i))))
 102     (handler-bind ((sb-int:character-decoding-error
 103                     (lambda (c)
 104                       (use-value "??" c))))
 105       (assert (string= (octets-to-string non-ascii-bytes :external-format :ascii)
 106                        (make-string 256 :initial-element #\?)))))
 107   (let ((non-ascii-chars (make-array 128
 108                                      :element-type 'character
 109                                      :initial-contents (loop for i from 128 below 256
 110                                                              collect (code-char i)))))
 111     (handler-bind ((sb-int:character-encoding-error
 112                     (lambda (c)
 113                       (use-value "??" c))))
 114       (assert (equalp (string-to-octets non-ascii-chars :external-format :ascii)
 115                       (make-array 256 :initial-element (char-code #\?))))))
 116
 117   ;; From Markus Kuhn's UTF-8 test file:
 118   ;; http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
 119
 120   ;; Too-big characters
 121   #-sb-unicode
 122   (progn
 123     (utf8-decode-tests #(#xc4 #x80) "?") ; #x100
 124     (utf8-decode-tests #(#xdf #xbf) "?") ; #x7ff
 125     (utf8-decode-tests #(#xe0 #xa0 #x80) "?") ; #x800
 126     (utf8-decode-tests #(#xef #xbf #xbf) "?") ; #xffff
 127     (utf8-decode-tests #(#xf0 #x90 #x80 #x80) "?")) ; #x10000
 128   (utf8-decode-tests #(#xf4 #x90 #x80 #x80) "?") ; #x110000
 129   (utf8-decode-tests #(#xf7 #xbf #xbf #xbf) "?") ; #x1fffff
 130   (utf8-decode-tests #(#xf8 #x88 #x80 #x80 #x80) "?") ; #x200000
 131   (utf8-decode-tests #(#xfb #xbf #xbf #xbf #xbf) "?") ; #x3ffffff
 132   (utf8-decode-tests #(#xfc #x84 #x80 #x80 #x80 #x80) "?") ; #x4000000
 133   (utf8-decode-tests #(#xfd #xbf #xbf #xbf #xbf #xbf) "?") ; #x7fffffff
 134
 135   ;; Unexpected continuation bytes
 136   (utf8-decode-tests #(#x80) "?")
 137   (utf8-decode-tests #(#xbf) "?")
 138   (utf8-decode-tests #(#x80 #xbf) "??")
 139   (utf8-decode-tests #(#x80 #xbf #x80) "???")
 140   (utf8-decode-tests #(#x80 #xbf #x80 #xbf) "????")
 141   (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80) "?????")
 142   (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80 #xbf) "??????")
 143   (utf8-decode-tests #(#x80 #xbf #x80 #xbf #x80 #xbf #x80) "???????")
 144
 145   ;; All 64 continuation bytes in a row
 146   (apply #'utf8-decode-tests
 147          (loop for i from #x80 to #xbf
 148                collect i into bytes
 149                collect #\? into chars
 150                finally (return (list bytes
 151                                      (coerce chars 'string)))))
 152
 153   ;; Lonely start characters
 154   (flet ((lsc (first last)
 155            (apply #'utf8-decode-tests
 156                   (loop for i from first to last
 157                         nconc (list i 32) into bytes
 158                         nconc (list #\? #\Space) into chars
 159                         finally (return (list bytes
 160                                               (coerce chars 'string)))))
 161            (apply #'utf8-decode-tests
 162                   (loop for i from first to last
 163                         collect i into bytes
 164                         collect #\? into chars
 165                         finally (return (list bytes
 166                                               (coerce chars 'string)))))))
 167     (lsc #xc0 #xdf) ; 2-byte sequence start chars
 168     (lsc #xe0 #xef) ; 3-byte
 169     (lsc #xf0 #xf7) ; 4-byte
 170     (lsc #xf8 #xfb) ; 5-byte
 171     (lsc #xfc #xfd)) ; 6-byte
 172
 173   ;; Otherwise incomplete sequences (last continuation byte missing)
 174   (utf8-decode-tests #0=#(#xc0) "?")
 175   (utf8-decode-tests #1=#(#xe0 #x80) "?")
 176   (utf8-decode-tests #2=#(#xf0 #x80 #x80) "?")
 177   (utf8-decode-tests #3=#(#xf8 #x80 #x80 #x80) "?")
 178   (utf8-decode-tests #4=#(#xfc #x80 #x80 #x80 #x80) "?")
 179   (utf8-decode-tests #5=#(#xdf) "?")
 180   (utf8-decode-tests #6=#(#xef #xbf) "?")
 181   (utf8-decode-tests #7=#(#xf7 #xbf #xbf) "?")
 182   (utf8-decode-tests #8=#(#xfb #xbf #xbf #xbf) "?")
 183   (utf8-decode-tests #9=#(#xfd #xbf #xbf #xbf #xbf) "?")
 184
 185   ;; All ten previous tests concatenated
 186   (utf8-decode-tests (concatenate 'vector #0# #1# #2# #3# #4# #5# #6# #7# #8# #9#)
 187                      "??????????")
 188
 189   ;; Random impossible bytes
 190   (utf8-decode-tests #(#xfe) "?")
 191   (utf8-decode-tests #(#xff) "?")
 192   (utf8-decode-tests #(#xfe #xfe #xff #xff) "????")
 193
 194   ;; Overlong sequences - /
 195   (utf8-decode-tests #(#xc0 #xaf) "?")
 196   (utf8-decode-tests #(#xe0 #x80 #xaf) "?")
 197   (utf8-decode-tests #(#xf0 #x80 #x80 #xaf) "?")
 198   (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #xaf) "?")
 199   (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #xaf) "?")
 200
 201   ;; Overlong sequences - #\Rubout
 202   (utf8-decode-tests #(#xc1 #xbf) "?")
 203   (utf8-decode-tests #(#xe0 #x9f #xbf) "?")
 204   (utf8-decode-tests #(#xf0 #x8f #xbf #xbf) "?")
 205   (utf8-decode-tests #(#xf8 #x87 #xbf #xbf #xbf) "?")
 206   (utf8-decode-tests #(#xfc #x83 #xbf #xbf #xbf #xbf) "?")
 207
 208   ;; Overlong sequences - #\Null
 209   (utf8-decode-tests #(#xc0 #x80) "?")
 210   (utf8-decode-tests #(#xe0 #x80 #x80) "?")
 211   (utf8-decode-tests #(#xf0 #x80 #x80 #x80) "?")
 212   (utf8-decode-tests #(#xf8 #x80 #x80 #x80 #x80) "?")
 213   (utf8-decode-tests #(#xfc #x80 #x80 #x80 #x80 #x80) "?")
 214
 215   ;; Not testing surrogates & characters #xFFFE, #xFFFF; they're
 216   ;; perfectly good sbcl chars even if they're not actually ISO 10646
 217   ;; characters, and it's probably a good idea for s-to-o and o-to-s
 218   ;; to be inverses of each other as far as possible.
 219   )
 220
 221 )
 222
 223 ;;; regression test: STRING->UTF8 didn't properly handle a non-zero
 224 ;;; START argument.
 225 (assert (equalp #(50) (string-to-octets "42" :start 1 :external-format :utf-8)))
 226
 227 ;;; STRING->UTF8 should cope with NIL strings if a null range is required
 228 (assert (equalp #() (string-to-octets "" :external-format :utf-8)))
 229 (assert (equalp #() (string-to-octets (make-array 0 :element-type nil)
 230                                       :external-format :utf-8)))
 231 (assert (equalp #() (string-to-octets (make-array 5 :element-type nil)
 232                                       :start 3 :end 3 :external-format :utf-8)))