From 99a8f9e012fd2b5edc9a0927edfc537332081997 Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Wed, 16 Dec 2009 21:39:30 +0000 Subject: [PATCH] 1.0.33.14: improve UCS external formats Fix a simple bug in the CHAR->UCS-2BE error case. Fix error-handling UCS-2 cases, by actually being careful about using the return values of the encoding-error/decoding-error octet functions, and by using adjustable vectors. Include tests for this. Implement UCS-4, as a straight-through 32-bit encoding of the char-code. Move external-formats/ucs-2.lisp to external-formats/enc-ucs.lisp, and include a comment header explaining the distinction in our terms between UCS and UTF external formats. --- NEWS | 4 + build-order.lisp-expr | 2 +- src/code/external-formats/enc-ucs.lisp | 441 ++++++++++++++++++++++++++++++++ src/code/external-formats/ucs-2.lisp | 212 --------------- tests/external-format.impure.lisp | 18 ++ tests/octets.pure.lisp | 32 +++ version.lisp-expr | 2 +- 7 files changed, 497 insertions(+), 214 deletions(-) create mode 100644 src/code/external-formats/enc-ucs.lisp delete mode 100644 src/code/external-formats/ucs-2.lisp diff --git a/NEWS b/NEWS index bb01da4..778ee70 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,10 @@ changes relative to sbcl-1.0.33: builtin types. * enhancement: Errors during compile-time-too processing (i.e. EVAL-WHEN) are now caught and reported just like errors during macroexpansion. + * fixes and improvements related to Unicode and external formats: + ** bug fix: error handling and restart usage in the ucs-2 external format + has been improved. + ** there is now an implementation of the ucs-4 external format. * bug fix: #p"\\\\" can now be read without error on Win32. (reported by Willem Broekema; launchpad bug lp#489698). * bug fix: some minor code rearrangements to reenable warning-free building diff --git a/build-order.lisp-expr b/build-order.lisp-expr index ae859b2..f837252 100644 --- a/build-order.lisp-expr +++ b/build-order.lisp-expr @@ -707,7 +707,7 @@ #!+sb-unicode ("src/code/external-formats/enc-jpn" :not-host) #!+sb-unicode - ("src/code/external-formats/ucs-2" :not-host) + ("src/code/external-formats/enc-ucs" :not-host) ;; The code here can't be compiled until CONDITION and ;; DEFINE-CONDITION are defined and SB!DEBUG:*STACK-TOP-HINT* is diff --git a/src/code/external-formats/enc-ucs.lisp b/src/code/external-formats/enc-ucs.lisp new file mode 100644 index 0000000..6fecf6b --- /dev/null +++ b/src/code/external-formats/enc-ucs.lisp @@ -0,0 +1,441 @@ +;;;; Universal Character Set (UCS) encodings +;;;; +;;;; In our interpretation, these are distinct from UTF encodings: the +;;;; UCS encodings are a direct encoding of the code point, in 16- and +;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode +;;;; surrogate code points specially. + +;;;; This software is part of the SBCL system. See the README file for +;;;; more information. +;;;; +;;;; This software is derived from the CMU CL system, which was +;;;; written at Carnegie Mellon University and released into the +;;;; public domain. The software is in the public domain and is +;;;; provided with absolutely no warranty. See the COPYING and CREDITS +;;;; files for more information. + +(in-package "SB!IMPL") + +;;; TODO Macro for generating different variants: +;;; :ucs-2le (little endian) sap-ref-16le +;;; :ucs-2be (big endian) sap-ref-16be +;;; :ucs-2 (native) sap-ref-16 + +;;; Utilities + +(declaim + (inline sap-ref-16le (setf sap-ref-16le) sap-ref-16be (setf sap-ref-16be) + sap-ref-32le (setf sap-ref-32le) sap-ref-32be (setf sap-ref-32be))) + +;;; Define feature LITTLE-ENDIAN-AND-MISALIGNED-READ? +(defun sap-ref-16le (sap offset) + #!+(or x86 x86-64) + (sap-ref-16 sap offset) + #!-(or x86 x86-64) + (dpb (sap-ref-8 sap (1+ offset)) (byte 8 8) + (sap-ref-8 sap offset))) + +(defun (setf sap-ref-16le) (value sap offset) + #!+(or x86 x86-64) + (setf (sap-ref-16 sap offset) value) + #!-(or x86 x86-64) + (setf (sap-ref-8 sap offset) (logand value #xff) + (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value))) + +(defun sap-ref-16be (sap offset) + (dpb (sap-ref-8 sap offset) (byte 8 8) + (sap-ref-8 sap (1+ offset)))) + +(defun (setf sap-ref-16be) (value sap offset) + (setf (sap-ref-8 sap (1+ offset)) (logand value #xff) + (sap-ref-8 sap offset) (ldb (byte 8 8) value))) + +(defun sap-ref-32le (sap offset) + #!+(or x86 x86-64) + (sap-ref-32 sap offset) + #!-(or x86 x86-64) + (dpb (sap-ref-8 sap (+ offset 3)) (byte 8 24) + (dpb (sap-ref-8 sap (+ offset 2)) (byte 8 16) + (sap-ref-16 sap offset)))) + +(defun (setf sap-ref-32le) (value sap offset) + #!+(or x86 x86-64) + (setf (sap-ref-32 sap offset) value) + #!-(or x86 x86-64) + (setf (sap-ref-8 sap offset) (logand value #xff) + (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value) + (sap-ref-8 sap (+ offset 2)) (ldb (byte 8 16) value) + (sap-ref-8 sap (+ offset 3)) (ldb (byte 8 24) value))) + +(defun sap-ref-32be (sap offset) + (dpb (sap-ref-8 sap offset) (byte 8 24) + (dpb (sap-ref-8 sap (1+ offset)) (byte 8 16) + (dpb (sap-ref-8 sap (+ offset 2)) (byte 8 8) + (sap-ref-8 sap (+ offset 3)))))) + +(defun (setf sap-ref-32be) (value sap offset) + (setf (sap-ref-8 sap offset) (ldb (byte 8 24) value) + (sap-ref-8 sap (1+ offset)) (ldb (byte 8 16) value) + (sap-ref-8 sap (+ offset 2)) (ldb (byte 8 8) value) + (sap-ref-8 sap (+ offset 3)) (logand value #xff))) + +;;; +;;; octets +;;; + +;;; Conversion to UCS-2{LE,BE} +(declaim (inline char->ucs-2le)) +(defun char->ucs-2le (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (< code #x10000) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 0) code)) + (add-byte (ldb (byte 8 8) code))) + (let ((replacement (encoding-error :ucs-2le string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest)))))) + +(declaim (inline char->ucs-2be)) +(defun char->ucs-2be (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (< code #x10000) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 0) code))) + (let ((replacement (encoding-error :ucs-2be string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest)))))) + +(defun string->ucs-2le (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 2 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-2le (char string i) array string i)) + (dotimes (i (* 2 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +(defun string->ucs-2be (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 2 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-2be (char string i) array string i)) + (dotimes (i (* 2 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +;; Conversion from UCS-2{LE,BE} +(defmacro define-bytes-per-ucs2-character (accessor type) + (declare (ignore type)) + (let ((name-le (make-od-name 'bytes-per-ucs-2le-character accessor)) + (name-be (make-od-name 'bytes-per-ucs-2be-character accessor))) + `(progn + (defun ,name-le (array pos end) + (declare (ignore array pos end)) + (values 2 nil)) + (defun ,name-be (array pos end) + (declare (ignore array pos end)) + (values 2 nil))))) +(instantiate-octets-definition define-bytes-per-ucs2-character) + +(defmacro define-simple-get-ucs2-character (accessor type) + (let ((name-le (make-od-name 'simple-get-ucs-2le-char accessor)) + (name-be (make-od-name 'simple-get-ucs-2be-char accessor))) + `(progn + (defun ,name-le (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes) + (ignore bytes)) + ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that + ;; reads two bytes at once on some architectures. + ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(code-char (sap-ref-16le array pos)) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (code-char (dpb (cref 1) (byte 8 8) (cref 0)))))) + (defun ,name-be (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes) + (ignore bytes)) + ;; Use SAP-REF-16BE even if it is not optimized + ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(code-char (sap-ref-16be array pos)) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (code-char (dpb (cref 0) (byte 8 8) (cref 1))))))))) + +(instantiate-octets-definition define-simple-get-ucs2-character) + +(defmacro define-ucs-2->string (accessor type) + (let ((name-le (make-od-name 'ucs-2le->string accessor)) + (name-be (make-od-name 'ucs-2be->string accessor))) + `(progn + (defun ,name-le (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-2le-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (vector-push-extend + (,(make-od-name 'simple-get-ucs-2le-char accessor) + array pos bytes) + string) + (incf pos bytes))) + string)) + (defun ,name-be (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-2be-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (vector-push-extend + (,(make-od-name 'simple-get-ucs-2be-char accessor) + array pos bytes) + string) + (incf pos bytes))) + string))))) + +(instantiate-octets-definition define-ucs-2->string) + +(define-external-format/variable-width (:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) t + (code-char #xfffd) + 2 + (if (< bits #x10000) + (setf (sap-ref-16le sap tail) bits) + (external-format-encoding-error stream bits)) + 2 + (code-char (sap-ref-16le sap head)) + ucs-2le->string-aref + string->ucs-2le) + +(define-external-format/variable-width (:ucs-2be :ucs2be) t + (code-char #xfffd) + 2 + (if (< bits #x10000) + (setf (sap-ref-16be sap tail) bits) + (external-format-encoding-error stream bits)) + 2 + (code-char (sap-ref-16be sap head)) + ucs-2be->string-aref + string->ucs-2be) + +(declaim (inline char->ucs-4le)) +(defun char->ucs-4le (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest) + (ignore string pos)) + (let ((code (char-code char))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 0) code)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 16) code)) + (add-byte (ldb (byte 8 24) code))))) + +(declaim (inline char->ucs-4be)) +(defun char->ucs-4be (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest) + (ignore string pos)) + (let ((code (char-code char))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 24) code)) + (add-byte (ldb (byte 8 16) code)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 0) code))))) + +(defun string->ucs-4le (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 4 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-4le (char string i) array string i)) + (dotimes (i (* 4 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +(defun string->ucs-4be (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 4 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-4be (char string i) array string i)) + (dotimes (i (* 4 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +;; Conversion from UCS-4{LE,BE} +(defmacro define-bytes-per-ucs4-character (accessor type) + (declare (ignore type)) + (let ((name-le (make-od-name 'bytes-per-ucs-4le-character accessor)) + (name-be (make-od-name 'bytes-per-ucs-4be-character accessor))) + `(progn + (defun ,name-le (array pos end) + (declare (ignore array pos end)) + (values 4 nil)) + (defun ,name-be (array pos end) + (declare (ignore array pos end)) + (values 4 nil))))) +(instantiate-octets-definition define-bytes-per-ucs4-character) + +(defmacro define-simple-get-ucs4-character (accessor type) + (let ((name-le (make-od-name 'simple-get-ucs-4le-char accessor)) + (name-be (make-od-name 'simple-get-ucs-4be-char accessor))) + `(progn + (defun ,name-le (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that + ;; reads four bytes at once on some architectures. + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-32le array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 3) (byte 8 24) + (dpb (cref 2) (byte 8 16) + (dpb (cref 1) (byte 8 8) (cref 0)))))))) + (if (< code char-code-limit) + (code-char code) + (decoding-error array pos (+ pos bytes) :ucs-4le + 'octet-decoding-error pos)))) + (defun ,name-be (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + ;; Use SAP-REF-32BE even if it is not optimized + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-32be array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 0) (byte 8 24) + (dpb (cref 1) (byte 8 16) + (dpb (cref 2) (byte 8 8) (cref 3)))))))) + (if (< code char-code-limit) + (code-char code) + (decoding-error array pos (+ pos bytes) :ucs-4be + 'octet-decoding-error pos))))))) + +(instantiate-octets-definition define-simple-get-ucs4-character) + +(defmacro define-ucs-4->string (accessor type) + (let ((name-le (make-od-name 'ucs-4le->string accessor)) + (name-be (make-od-name 'ucs-4be->string accessor))) + `(progn + (defun ,name-le (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-4le-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (let ((thing (,(make-od-name 'simple-get-ucs-4le-char accessor) array pos bytes))) + (typecase thing + (character (vector-push-extend thing string)) + (string (dotimes (i (length thing)) + (vector-push-extend (char thing i) string))))) + (incf pos bytes))) + string)) + (defun ,name-be (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-4be-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (let ((thing (,(make-od-name 'simple-get-ucs-4be-char accessor) array pos bytes))) + (typecase thing + (character (vector-push-extend thing string)) + (string (dotimes (i (length thing)) + (vector-push-extend (char thing i) string))))) + (incf pos bytes))) + string))))) + +(instantiate-octets-definition define-ucs-4->string) + +(define-external-format/variable-width (:ucs-4le :ucs4le) nil + (code-char #xfffd) + 4 + (setf (sap-ref-32le sap tail) bits) + 4 + (let ((code (sap-ref-32le sap head))) + (if (< code char-code-limit) + (code-char code) + (return-from decode-break-reason 4))) + ucs-4le->string-aref + string->ucs-4le) + +(define-external-format/variable-width (:ucs-4be :ucs4be) t + (code-char #xfffd) + 4 + (setf (sap-ref-32be sap tail) bits) + 4 + (let ((code (sap-ref-32be sap head))) + (if (< code char-code-limit) + (code-char code) + (return-from decode-break-reason 4))) + ucs-4be->string-aref + string->ucs-4be) diff --git a/src/code/external-formats/ucs-2.lisp b/src/code/external-formats/ucs-2.lisp deleted file mode 100644 index 1d15d10..0000000 --- a/src/code/external-formats/ucs-2.lisp +++ /dev/null @@ -1,212 +0,0 @@ -(in-package #:sb!impl) - -;;; TODO Macro for generating different variants: -;;; :ucs-2le (little endian) sap-ref-16le -;;; :ucs-2be (big endian) sap-ref-16be -;;; :ucs-2 (native) sap-ref-16 - -;;; Utilities - -(declaim (inline sap-ref-16le (setf sap-ref-16le) - sap-ref-16be (setf sap-ref-16be))) - -;;; Define feature LITTLE-ENDIAN-AND-MISALIGNED-READ? -(defun sap-ref-16le (sap offset) - #!+(or x86 x86-64) - (sap-ref-16 sap offset) - #!-(or x86 x86-64) - (dpb (sap-ref-8 sap (1+ offset)) (byte 8 8) - (sap-ref-8 sap offset))) - -(defun (setf sap-ref-16le) (value sap offset) - #!+(or x86 x86-64) - (setf (sap-ref-16 sap offset) value) - #!-(or x86 x86-64) - (setf (sap-ref-8 sap offset) (logand #xFF value) - (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value))) - -(defun sap-ref-16be (sap offset) - (dpb (sap-ref-8 sap offset) (byte 8 8) - (sap-ref-8 sap (1+ offset)))) - -(defun (setf sap-ref-16be) (value sap offset) - (setf (sap-ref-8 sap (1+ offset)) (logand #xFF value) - (sap-ref-8 sap offset) (ldb (byte 8 8) value))) - -;;; -;;; octets -;;; - -;;; Conversion to UCS-2{LE,BE} -(declaim (inline char->ucs-2le)) -(defun char->ucs-2le (char dest string pos) - (declare (optimize speed (safety 0)) - (type (array (unsigned-byte 8) (*)) dest)) - (let ((code (char-code char))) - (if (< code #x10000) - (flet ((add-byte (b) - (declare (type (unsigned-byte 8) b)) - (vector-push b dest))) - (declare (inline add-byte)) - (add-byte (ldb (byte 8 0) code)) - (add-byte (ldb (byte 8 8) code))) - ; signal error - (encoding-error :ucs-2le string pos)))) - -(declaim (inline char->ucs-2be)) -(defun char->ucs-2be (char dest string pos) - (declare (optimize speed (safety 0)) - (type (array (unsigned-byte 8) (*)) dest)) - (let ((code (char-code char))) - (if (< code #x10000) - (flet ((add-byte (b) - (declare (type (unsigned-byte 8) b)) - (vector-push b dest))) - (declare (inline add-byte)) - (add-byte (ldb (byte 8 8) code)) - (add-byte (ldb (byte 8 0) code))) - ; signal error - (encoding-error :ucs-16be string pos)))) - -(defun string->ucs-2le (string sstart send additional-space) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range sstart send additional-space)) - (let ((array (make-array (* 2 (+ additional-space (- send sstart))) - :element-type '(unsigned-byte 8) - :fill-pointer 0))) - (loop for i from sstart below send - do (char->ucs-2le (char string i) array string i)) - (dotimes (i additional-space) - (vector-push 0 array) - (vector-push 0 array)) - (coerce array '(simple-array (unsigned-byte 8) (*))))) - -(defun string->ucs-2be (string sstart send additional-space) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range sstart send additional-space)) - (let ((array (make-array (* 2 (+ additional-space (- send sstart))) - :element-type '(unsigned-byte 8) - :fill-pointer 0))) - (loop for i from sstart below send - do (char->ucs-2be (char string i) array string i)) - (dotimes (i additional-space) - (vector-push 0 array) - (vector-push 0 array)) - (coerce array '(simple-array (unsigned-byte 8) (*))))) - -;; Conversion from UCS-2{LE,BE} -(defmacro define-bytes-per-ucs2-character (accessor type) - (declare (ignore type)) - (let ((name-le (make-od-name 'bytes-per-ucs-2le-character accessor)) - (name-be (make-od-name 'bytes-per-ucs-2be-character accessor))) - `(progn - (defun ,name-le (array pos end) - (declare (ignore array pos end)) - (values 2 nil)) - (defun ,name-be (array pos end) - (declare (ignore array pos end)) - (values 2 nil))))) -(instantiate-octets-definition define-bytes-per-ucs2-character) - -(defmacro define-simple-get-ucs2-character (accessor type) - (let ((name-le (make-od-name 'simple-get-ucs-2le-char accessor)) - (name-be (make-od-name 'simple-get-ucs-2be-char accessor))) - `(progn - (defun ,name-le (array pos bytes) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range pos) - (type (integer 1 4) bytes) - (ignore bytes)) - ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that - ;; reads two bytes at once on some architectures. - ,(if (and (eq accessor 'sap-ref-8) - (eq type 'system-area-pointer)) - '(code-char (sap-ref-16le array pos)) - `(flet ((cref (x) - (,accessor array (the array-range (+ pos x))))) - (declare (inline cref)) - (code-char (dpb (cref 1) (byte 8 8) - (cref 0)))))) - (defun ,name-be (array pos bytes) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range pos) - (type (integer 1 4) bytes) - (ignore bytes)) - ;; Use SAP-REF-16BE even if it is not optimized - ,(if (and (eq accessor 'sap-ref-8) - (eq type 'system-area-pointer)) - '(code-char (sap-ref-16be array pos)) - `(flet ((cref (x) - (,accessor array (the array-range (+ pos x))))) - (declare (inline cref)) - (code-char (dpb (cref 0) (byte 8 8) - (cref 1))))))))) - -(instantiate-octets-definition define-simple-get-ucs2-character) - -(defmacro define-ucs-2->string (accessor type) - (let ((name-le (make-od-name 'ucs-2le->string accessor)) - (name-be (make-od-name 'ucs-2be->string accessor))) - `(progn - (defun ,name-le (array astart aend) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range astart aend)) - (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) - (loop with pos = astart - while (< pos aend) - do (multiple-value-bind (bytes invalid) - (,(make-od-name 'bytes-per-ucs-2le-character accessor) array pos aend) - (declare (type (or null string) invalid)) - (assert (null invalid)) - (vector-push-extend - (,(make-od-name 'simple-get-ucs-2le-char accessor) - array pos bytes) - string) - (incf pos bytes))) - string)) - (defun ,name-be (array astart aend) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range astart aend)) - (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) - (loop with pos = astart - while (< pos aend) - do (multiple-value-bind (bytes invalid) - (,(make-od-name 'bytes-per-ucs-2be-character accessor) array pos aend) - (declare (type (or null string) invalid)) - (assert (null invalid)) - (vector-push-extend - (,(make-od-name 'simple-get-ucs-2be-char accessor) - array pos bytes) - string) - (incf pos bytes))) - string))))) - -(instantiate-octets-definition define-ucs-2->string) - -(define-external-format/variable-width (:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) t - (code-char #xfffd) - 2 - (if (< bits #x10000) - (setf (sap-ref-16le sap tail) bits) - (external-format-encoding-error stream bits)) - 2 - (code-char (sap-ref-16le sap head)) - ucs-2le->string-aref - string->ucs-2le) - -(define-external-format/variable-width (:ucs-2be :ucs2be) t - (code-char #xfffd) - 2 - (if (< bits #x10000) - (setf (sap-ref-16be sap tail) bits) - (external-format-encoding-error stream bits)) - 2 - (code-char (sap-ref-16be sap head)) - ucs-2be->string-aref - string->ucs-2be) diff --git a/tests/external-format.impure.lisp b/tests/external-format.impure.lisp index 0465ebc..dcc9df7 100644 --- a/tests/external-format.impure.lisp +++ b/tests/external-format.impure.lisp @@ -906,5 +906,23 @@ (assert (char= (char new 0) #\replacement_character)) (assert (char= (char new (1- size)) #\replacement_character)) (assert (string= string new :start1 1 :start2 1 :end1 (1- size) :end2 (1- size))))))) + +(with-test (:name (:multibyte :input-replacement :ucs4le)) + (let ((octets (coerce '(0 1 1 0 1 0 0 1) '(vector (unsigned-byte 8))))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede :element-type '(unsigned-byte 8)) + (write-sequence octets s)) + (with-open-file (s *test-path* :external-format '(:ucs4le :replacement #\replacement_character)) + (let ((string (read-line s))) + (assert (char= (char string 0) (code-char #x10100))) + (assert (char= (char string 1) #\replacement_character)))))) + +(with-test (:name (:multibyte :input-replacement :ucs4le)) + (let ((octets (coerce '(0 1 1 0 1 0 0 1) '(vector (unsigned-byte 8))))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede :element-type '(unsigned-byte 8)) + (write-sequence octets s)) + (with-open-file (s *test-path* :external-format '(:ucs4be :replacement #\replacement_character)) + (let ((string (read-line s))) + (assert (char= (char string 0) (code-char #x10100))) + (assert (char= (char string 1) #\replacement_character)))))) ;;;; success diff --git a/tests/octets.pure.lisp b/tests/octets.pure.lisp index cb779c5..1a276a7 100644 --- a/tests/octets.pure.lisp +++ b/tests/octets.pure.lisp @@ -283,3 +283,35 @@ (assert (find #\? (octets-to-string (coerce #(237 160 128) '(vector (unsigned-byte 8))) :external-format :utf-8))))) + +#+sb-unicode +(with-test (:name (:ucs-2 :out-of-range :encoding-errors)) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (string-to-octets (string (code-char #x10001)) + :external-format :ucs-2le) + #(63 0 63 0 63 0)))) + (handler-bind ((sb-int:character-encoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (string-to-octets (string (code-char #x10001)) + :external-format :ucs-2be) + #(0 63 0 63 0 63))))) + +#+sb-unicode +(with-test (:name (:ucs-4 :out-of-range :decoding-errors)) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8))) + :external-format :ucs-4le) + "???"))) + (assert (equalp (octets-to-string (coerce '(#xff #xff #x10 #x00) '(vector (unsigned-byte 8))) + :external-format :ucs-4le) + (string (code-char #x10ffff)))) + (handler-bind ((sb-int:character-decoding-error + (lambda (c) (use-value "???" c)))) + (assert (equalp (octets-to-string (coerce '(1 2 3 4) '(vector (unsigned-byte 8))) + :external-format :ucs-4be) + "???")) + (assert (equalp (octets-to-string (coerce '(#x00 #x10 #xff #xff) '(vector (unsigned-byte 8))) + :external-format :ucs-4be) + (string (code-char #x10ffff)))))) diff --git a/version.lisp-expr b/version.lisp-expr index 23d8ec3..2abde29 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.33.13" +"1.0.33.14" -- 1.7.10.4