From 77d7fddb855305e783c100bfe9b6b46bdb05e4b6 Mon Sep 17 00:00:00 2001 From: Christophe Rhodes Date: Wed, 16 Dec 2009 22:12:35 +0000 Subject: [PATCH] 1.0.33.16: implement UTF external formats UTF-16 and UTF-32, being strict about handling of surrogates and noncharacters according to Unicode. --- NEWS | 1 + build-order.lisp-expr | 2 + src/code/external-formats/enc-ucs.lisp | 2 +- src/code/external-formats/enc-utf.lisp | 525 ++++++++++++++++++++++++++++++++ tests/external-format.impure.lisp | 59 ++++ tests/octets.pure.lisp | 88 ++++++ version.lisp-expr | 2 +- 7 files changed, 677 insertions(+), 2 deletions(-) create mode 100644 src/code/external-formats/enc-utf.lisp diff --git a/NEWS b/NEWS index 778ee70..0b40d46 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,7 @@ changes relative to sbcl-1.0.33: ** bug fix: error handling and restart usage in the ucs-2 external format has been improved. ** there is now an implementation of the ucs-4 external format. + ** the utf-16 and utf-32 external formats are supported. * bug fix: #p"\\\\" can now be read without error on Win32. (reported by Willem Broekema; launchpad bug lp#489698). * bug fix: some minor code rearrangements to reenable warning-free building diff --git a/build-order.lisp-expr b/build-order.lisp-expr index f837252..0a52508 100644 --- a/build-order.lisp-expr +++ b/build-order.lisp-expr @@ -708,6 +708,8 @@ ("src/code/external-formats/enc-jpn" :not-host) #!+sb-unicode ("src/code/external-formats/enc-ucs" :not-host) + #!+sb-unicode + ("src/code/external-formats/enc-utf" :not-host) ;; The code here can't be compiled until CONDITION and ;; DEFINE-CONDITION are defined and SB!DEBUG:*STACK-TOP-HINT* is diff --git a/src/code/external-formats/enc-ucs.lisp b/src/code/external-formats/enc-ucs.lisp index 6fecf6b..ca87c76 100644 --- a/src/code/external-formats/enc-ucs.lisp +++ b/src/code/external-formats/enc-ucs.lisp @@ -428,7 +428,7 @@ ucs-4le->string-aref string->ucs-4le) -(define-external-format/variable-width (:ucs-4be :ucs4be) t +(define-external-format/variable-width (:ucs-4be :ucs4be) nil (code-char #xfffd) 4 (setf (sap-ref-32be sap tail) bits) diff --git a/src/code/external-formats/enc-utf.lisp b/src/code/external-formats/enc-utf.lisp new file mode 100644 index 0000000..bb965c6 --- /dev/null +++ b/src/code/external-formats/enc-utf.lisp @@ -0,0 +1,525 @@ +;;;; Unicode Transformation Format (UTF) encodings +;;;; +;;;; In our interpretation, these are distinct from UCS encodings: the +;;;; UCS encodings are a direct encoding of the code point, in 16- and +;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode +;;;; surrogate code points specially. + +;;;; This software is part of the SBCL system. See the README file for +;;;; more information. +;;;; +;;;; This software is derived from the CMU CL system, which was +;;;; written at Carnegie Mellon University and released into the +;;;; public domain. The software is in the public domain and is +;;;; provided with absolutely no warranty. See the COPYING and CREDITS +;;;; files for more information. + +(in-package "SB!IMPL") + + +(declaim (inline utf-noncharacter-code-p)) +(defun utf-noncharacter-code-p (code) + (or (<= #xd800 code #xdfff) + (<= #xfdd0 code #xfdef) + (= (logand code #xfffe) #xfffe))) + +;;; Conversion to UTF-16{LE,BE} +(declaim (inline char->utf-16le)) +(defun char->utf-16le (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (utf-noncharacter-code-p code) + (let ((replacement (encoding-error :utf-16le string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (cond + ((< code #x10000) + (add-byte (ldb (byte 8 0) code)) + (add-byte (ldb (byte 8 8) code))) + (t + (let* ((codeoid (- code #x10000)) + (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800)) + (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00))) + (add-byte (ldb (byte 8 0) high)) + (add-byte (ldb (byte 8 8) high)) + (add-byte (ldb (byte 8 0) low)) + (add-byte (ldb (byte 8 8) low))))))))) + +(declaim (inline char->utf-16be)) +(defun char->utf-16be (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (utf-noncharacter-code-p code) + (let ((replacement (encoding-error :utf-16be string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (cond + ((< code #x10000) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 0) code))) + (t + (let* ((codeoid (- code #x10000)) + (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800)) + (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00))) + (add-byte (ldb (byte 8 8) high)) + (add-byte (ldb (byte 8 0) high)) + (add-byte (ldb (byte 8 8) low)) + (add-byte (ldb (byte 8 0) low))))))))) + +(defun string->utf-16le (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 2 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->utf-16le (char string i) array string i)) + (dotimes (i (* 2 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +(defun string->utf-16be (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 2 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->utf-16be (char string i) array string i)) + (dotimes (i (* 2 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +;; Conversion from UTF-16{LE,BE} +(defmacro define-bytes-per-utf16-character (accessor type) + (declare (ignore type)) + (let ((name-le (make-od-name 'bytes-per-utf-16le-character accessor)) + (name-be (make-od-name 'bytes-per-utf-16be-character accessor))) + `(progn + (defun ,name-le (array pos end) + (let ((remaining (- end pos))) + (when (< remaining 2) + (return-from ,name-le (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)))) + (let ((low (dpb (,accessor array (+ pos 1)) (byte 8 8) (,accessor array pos)))) + (if (<= #xd800 low #xdbff) + (if (< remaining 4) + (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)) + (let ((high (dpb (,accessor array (+ pos 3)) (byte 8 8) (,accessor array (+ pos 2))))) + (if (<= #xdc00 high #xdfff) + (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high)))) + (if (= (logand code #xfffe) #xfffe) + (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos)) + (values 4 nil))) + (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))))) + (if (or (<= #xdc00 low #xdfff) + (<= #xfdd0 low #xfdef) + (= (logand low #xfffe) #xfffe)) + (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)) + (values 2 nil)))))) + (defun ,name-be (array pos end) + (let ((remaining (- end pos))) + (when (< remaining 2) + (return-from ,name-be (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)))) + (let ((low (dpb (,accessor array pos) (byte 8 8) (,accessor array (+ pos 1))))) + (if (<= #xd800 low #xdbff) + (if (< remaining 4) + (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)) + (let ((high (dpb (,accessor array (+ pos 2)) (byte 8 8) (,accessor array (+ pos 3))))) + (if (<= #xdc00 high #xdfff) + (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high)))) + (if (= (logand code #xfffe) #xfffe) + (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos)) + (values 4 nil))) + (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))))) + (if (or (<= #xdc00 low #xdfff) + (<= #xfdd0 low #xfdef) + (= (logand low #xfffe) #xfffe)) + (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)) + (values 2 nil))))))))) +(instantiate-octets-definition define-bytes-per-utf16-character) + +(defmacro define-simple-get-utf16-character (accessor type) + (let ((name-le (make-od-name 'simple-get-utf-16le-char accessor)) + (name-be (make-od-name 'simple-get-utf-16be-char accessor))) + `(progn + (defun ,name-le (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes) + (ignore bytes)) + ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that + ;; reads two bytes at once on some architectures. + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-16le array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 1) (byte 8 8) (cref 0)))))) + (if (<= #xd800 code #xdbff) + (let ((next ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-16le array (+ pos 2)) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 3) (byte 8 8) (cref 2)))))) + (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next))))) + (code-char code)))) + (defun ,name-be (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes) + (ignore bytes)) + ;; Use SAP-REF-16BE even if it is not optimized + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-16be array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 0) (byte 8 8) (cref 1)))))) + (if (<= #xd800 code #xdbff) + (let ((next ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-16be array (+ pos 2)) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 2) (byte 8 8) (cref 3)))))) + (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next))))) + (code-char code))))))) + +(instantiate-octets-definition define-simple-get-utf16-character) + +(defmacro define-utf-16->string (accessor type) + (let ((name-le (make-od-name 'utf-16le->string accessor)) + (name-be (make-od-name 'utf-16be->string accessor))) + `(progn + (defun ,name-le (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-utf-16le-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (cond + ((null invalid) + (vector-push-extend + (,(make-od-name 'simple-get-utf-16le-char accessor) + array pos bytes) + string)) + (t (dotimes (i (length invalid)) + (vector-push-extend (char invalid i) string)))) + (incf pos bytes))) + string)) + (defun ,name-be (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-utf-16be-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (cond + ((null invalid) + (vector-push-extend + (,(make-od-name 'simple-get-utf-16be-char accessor) + array pos bytes) + string)) + (t (dotimes (i (length invalid)) + (vector-push-extend (char invalid i) string)))) + (incf pos bytes))) + string))))) + +(instantiate-octets-definition define-utf-16->string) + +(define-external-format/variable-width (:utf-16le :utf16le) t + (code-char #xfffd) + (let ((bits (char-code byte))) + (if (< bits #x10000) 2 4)) + (cond + ((< bits #x10000) + (if (utf-noncharacter-code-p bits) + (external-format-encoding-error stream bits) + (setf (sap-ref-16le sap tail) bits))) + (t (if (= (logand bits #xfffe) #xfffe) + (external-format-encoding-error stream bits) + (let* ((new-bits (- bits #x10000)) + (high (ldb (byte 10 10) new-bits)) + (low (ldb (byte 10 0) new-bits))) + (setf (sap-ref-16le sap tail) (dpb high (byte 10 0) #xd800)) + (setf (sap-ref-16le sap (+ tail 2)) (dpb low (byte 10 0) #xdc00)))))) + (2 (if (<= #xd800 (sap-ref-16le sap head) #xdbff) 4 2)) + (let ((bits (sap-ref-16le sap head))) + (cond + ((or (<= #xdc00 bits #xdfff) + (<= #xfdd0 bits #xfdef) + (= (logand bits #xfffe) #xfffe)) + (return-from decode-break-reason 2)) + ((<= #xd800 bits #xdbff) + (let ((next (sap-ref-16le sap (+ head 2)))) + (unless (<= #xdc00 next #xdfff) + (return-from decode-break-reason 2)) + (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next)))) + (if (= (logand code #xfffe) #xfffe) + (return-from decode-break-reason 4) + (code-char (+ #x10000 code)))))) + (t (code-char bits)))) + utf-16le->string-aref + string->utf-16le) + +(define-external-format/variable-width (:utf-16be :utf16be) t + (code-char #xfffd) + (let ((bits (char-code byte))) + (if (< bits #x10000) 2 4)) + (cond + ((< bits #x10000) + (if (utf-noncharacter-code-p bits) + (external-format-encoding-error stream bits) + (setf (sap-ref-16be sap tail) bits))) + (t (if (= (logand bits #xfffe) #xfffe) + (external-format-encoding-error stream bits) + (let* ((new-bits (- bits #x10000)) + (high (ldb (byte 10 10) new-bits)) + (low (ldb (byte 10 0) new-bits))) + (setf (sap-ref-16be sap tail) (dpb high (byte 10 0) #xd800)) + (setf (sap-ref-16be sap (+ tail 2)) (dpb low (byte 10 0) #xdc00)))))) + (2 (if (<= #xd800 (sap-ref-16be sap head) #xdbff) 4 2)) + (let ((bits (sap-ref-16be sap head))) + (cond + ((or (<= #xdc00 bits #xdfff) + (<= #xfdd0 bits #xfdef) + (= (logand bits #xfffe) #xfffe)) + (return-from decode-break-reason 2)) + ((<= #xd800 bits #xdbff) + (let ((next (sap-ref-16be sap (+ head 2)))) + (unless (<= #xdc00 next #xdfff) + (return-from decode-break-reason 2)) + (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next)))) + (if (= (logand code #xfffe) #xfffe) + (return-from decode-break-reason 4) + (code-char (+ #x10000 code)))))) + (t (code-char bits)))) + utf-16be->string-aref + string->utf-16be) + +(declaim (inline char->utf-32le)) +(defun char->utf-32le (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (utf-noncharacter-code-p code) + (let ((replacement (encoding-error :utf-32le string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 0) code)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 16) code)) + (add-byte (ldb (byte 8 24) code)))))) + +(declaim (inline char->utf-32be)) +(defun char->utf-32be (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (utf-noncharacter-code-p code) + (let ((replacement (encoding-error :utf-32be string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 24) code)) + (add-byte (ldb (byte 8 16) code)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 0) code)))))) + +(defun string->utf-32le (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 4 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->utf-32le (char string i) array string i)) + (dotimes (i (* 4 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +(defun string->utf-32be (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 4 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->utf-32be (char string i) array string i)) + (dotimes (i (* 4 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +;; Conversion from UTF-32{LE,BE} +(defmacro define-bytes-per-utf32-character (accessor type) + (declare (ignore type)) + (let ((name-le (make-od-name 'bytes-per-utf-32le-character accessor)) + (name-be (make-od-name 'bytes-per-utf-32be-character accessor))) + `(progn + (defun ,name-le (array pos end) + (declare (ignore array pos end)) + (values 4 nil)) + (defun ,name-be (array pos end) + (declare (ignore array pos end)) + (values 4 nil))))) +(instantiate-octets-definition define-bytes-per-utf32-character) + +(defmacro define-simple-get-utf32-character (accessor type) + (let ((name-le (make-od-name 'simple-get-utf-32le-char accessor)) + (name-be (make-od-name 'simple-get-utf-32be-char accessor))) + `(progn + (defun ,name-le (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that + ;; reads four bytes at once on some architectures. + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-32le array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 3) (byte 8 24) + (dpb (cref 2) (byte 8 16) + (dpb (cref 1) (byte 8 8) (cref 0)))))))) + (if (and (< code char-code-limit) + (not (utf-noncharacter-code-p code))) + (code-char code) + (decoding-error array pos (+ pos bytes) :utf-32le + 'octet-decoding-error pos)))) + (defun ,name-be (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + ;; Use SAP-REF-32BE even if it is not optimized + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-32be array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 0) (byte 8 24) + (dpb (cref 1) (byte 8 16) + (dpb (cref 2) (byte 8 8) (cref 3)))))))) + (if (and (< code char-code-limit) + (not (utf-noncharacter-code-p code))) + (code-char code) + (decoding-error array pos (+ pos bytes) :utf-32be + 'octet-decoding-error pos))))))) + +(instantiate-octets-definition define-simple-get-utf32-character) + +(defmacro define-utf-32->string (accessor type) + (let ((name-le (make-od-name 'utf-32le->string accessor)) + (name-be (make-od-name 'utf-32be->string accessor))) + `(progn + (defun ,name-le (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-utf-32le-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (let ((thing (,(make-od-name 'simple-get-utf-32le-char accessor) array pos bytes))) + (typecase thing + (character (vector-push-extend thing string)) + (string (dotimes (i (length thing)) + (vector-push-extend (char thing i) string))))) + (incf pos bytes))) + string)) + (defun ,name-be (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-utf-32be-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (let ((thing (,(make-od-name 'simple-get-utf-32be-char accessor) array pos bytes))) + (typecase thing + (character (vector-push-extend thing string)) + (string (dotimes (i (length thing)) + (vector-push-extend (char thing i) string))))) + (incf pos bytes))) + string))))) + +(instantiate-octets-definition define-utf-32->string) + +(define-external-format/variable-width (:utf-32le :utf32le) t + (code-char #xfffd) + 4 + (if (utf-noncharacter-code-p bits) + (external-format-encoding-error stream bits) + (setf (sap-ref-32le sap tail) bits)) + 4 + (let ((code (sap-ref-32le sap head))) + (if (and (< code char-code-limit) + (not (utf-noncharacter-code-p code))) + (code-char code) + (return-from decode-break-reason 4))) + utf-32le->string-aref + string->utf-32le) + +(define-external-format/variable-width (:utf-32be :utf32be) t + (code-char #xfffd) + 4 + (if (utf-noncharacter-code-p bits) + (external-format-encoding-error stream bits) + (setf (sap-ref-32be sap tail) bits)) + 4 + (let ((code (sap-ref-32be sap head))) + (if (and (< code char-code-limit) + (not (utf-noncharacter-code-p code))) + (code-char code) + (return-from decode-break-reason 4))) + utf-32be->string-aref + string->utf-32be) diff --git a/tests/external-format.impure.lisp b/tests/external-format.impure.lisp index dcc9df7..96c8608 100644 --- a/tests/external-format.impure.lisp +++ b/tests/external-format.impure.lisp @@ -925,4 +925,63 @@ (assert (char= (char string 0) (code-char #x10100))) (assert (char= (char string 1) #\replacement_character)))))) +;;; utf tests +(with-test (:name (:utf-16le :roundtrip)) + (let ((string (map 'string 'code-char '(#x20 #x200 #x2000 #xfffd #x10fffd)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format :utf-16le) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-16le) + (assert (string= string (read-line s)))))) +(with-test (:name (:utf-16be :roundtrip)) + (let ((string (map 'string 'code-char '(#x20 #x200 #x2000 #xfffd #x10fffd)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format :utf-16be) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-16be) + (assert (string= string (read-line s)))))) +(with-test (:name (:utf-16le :encoding-error)) + (let ((string (map 'string 'code-char '(#x20 #xfffe #xdc00 #xd800 #x1fffe #x20)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format '(:utf-16le :replacement #\?)) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-16le) + (assert (string= " ???? " (read-line s)))))) +(with-test (:name (:utf-16be :encoding-error)) + (let ((string (map 'string 'code-char '(#x20 #xfffe #xdc00 #xd800 #x1fffe #x20)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format '(:utf-16be :replacement #\?)) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-16be) + (assert (string= " ???? " (read-line s)))))) + +(with-test (:name (:utf-32le :roundtrip)) + (let ((string (map 'string 'code-char '(#x20 #x200 #x2000 #xfffd #x10fffd)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format :utf-32le) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-32le) + (assert (string= string (read-line s)))))) +(with-test (:name (:utf-32be :roundtrip)) + (let ((string (map 'string 'code-char '(#x20 #x200 #x2000 #xfffd #x10fffd)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format :utf-32be) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-32be) + (assert (string= string (read-line s)))))) +(with-test (:name (:utf-32le :encoding-error)) + (let ((string (map 'string 'code-char '(#x20 #xfffe #xdc00 #xd800 #x1fffe #x20)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format '(:utf-32le :replacement #\?)) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-32le) + (assert (string= " ???? " (read-line s)))))) +(with-test (:name (:utf-32be :encoding-error)) + (let ((string (map 'string 'code-char '(#x20 #xfffe #xdc00 #xd800 #x1fffe #x20)))) + (with-open-file (s *test-path* :direction :output :if-exists :supersede + :external-format '(:utf-32be :replacement #\?)) + (write-string string s)) + (with-open-file (s *test-path* :external-format :utf-32be) + (assert (string= " ???? " (read-line s)))))) + ;;;; success diff --git a/tests/octets.pure.lisp b/tests/octets.pure.lisp index 1a276a7..d40b260 100644 --- a/tests/octets.pure.lisp +++ b/tests/octets.pure.lisp @@ -315,3 +315,91 @@ (assert (equalp (octets-to-string (coerce '(#x00 #x10 #xff #xff) '(vector (unsigned-byte 8))) :external-format :ucs-4be) (string (code-char #x10ffff)))))) + +#+sb-unicode +(with-test (:name (:utf-16le :ensure-roundtrip)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-16le)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16le))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(#x20 0 0 #x2 0 #x20 0 #xd8 0 #xdc 1 #xd8 1 #xdc #xff #xdb #xfd #xdf))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) +#+sb-unicode +(with-test (:name (:utf-16le :encoding-error)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-16le :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16le))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(63 0 63 0 63 0 63 0)))))) + +#+sb-unicode +(with-test (:name (:utf-16be :ensure-roundtrip)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-16be)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16be))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(0 #x20 #x2 0 #x20 0 #xd8 0 #xdc 0 #xd8 1 #xdc 1 #xdb #xff #xdf #xfd))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) +#+sb-unicode +(with-test (:name (:utf-16be :encoding-error)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-16be :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-16be))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(0 63 0 63 0 63 0 63)))))) + +#+sb-unicode +(with-test (:name (:utf-32le :ensure-roundtrip)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-32le)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32le))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(#x20 0 0 0 0 #x2 0 0 0 #x20 0 0 0 0 1 0 1 4 1 0 #xfd #xff #x10 0))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) +#+sb-unicode +(with-test (:name (:utf-32le :encoding-error)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-32le :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32le))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(63 0 0 0 63 0 0 0 63 0 0 0 63 0 0 0)))))) + +#+sb-unicode +(with-test (:name (:utf-32be :ensure-roundtrip)) + (flet ((enc (x) + (string-to-octets x :external-format :utf-32be)) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32be))) + (let ((string (map 'string 'code-char + '(#x20 #x200 #x2000 #x10000 #x10401 #x10fffd))) + (octets #(0 0 0 #x20 0 0 #x2 0 0 0 #x20 0 0 1 0 0 0 1 4 1 0 #x10 #xff #xfd))) + (assert (equalp (enc string) octets)) + (assert (equalp (dec octets) string))))) +#+sb-unicode +(with-test (:name (:utf-32be :encoding-error)) + (flet ((enc (x) + (string-to-octets x :external-format '(:utf-32be :replacement #\?))) + (dec (x) + (octets-to-string (coerce x '(vector (unsigned-byte 8))) + :external-format :utf-32be))) + (let ((string (map 'string 'code-char '(#xd800 #xdc00 #xfffe #x10ffff)))) + (assert (equalp (enc string) #(0 0 0 63 0 0 0 63 0 0 0 63 0 0 0 63)))))) diff --git a/version.lisp-expr b/version.lisp-expr index 868cb2d..1939214 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.33.15" +"1.0.33.16" -- 1.7.10.4