;;;; Unicode Transformation Format (UTF) encodings ;;;; ;;;; In our interpretation, these are distinct from UCS encodings: the ;;;; UCS encodings are a direct encoding of the code point, in 16- and ;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode ;;;; surrogate code points specially. ;;;; This software is part of the SBCL system. See the README file for ;;;; more information. ;;;; ;;;; This software is derived from the CMU CL system, which was ;;;; written at Carnegie Mellon University and released into the ;;;; public domain. The software is in the public domain and is ;;;; provided with absolutely no warranty. See the COPYING and CREDITS ;;;; files for more information. (in-package "SB!IMPL") (declaim (inline utf-noncharacter-code-p)) (defun utf-noncharacter-code-p (code) (or (<= #xd800 code #xdfff) (<= #xfdd0 code #xfdef) (= (logand code #xfffe) #xfffe))) ;;; Conversion to UTF-16{LE,BE} (declaim (inline char->utf-16le)) (defun char->utf-16le (char dest string pos) (declare (optimize speed (safety 0)) (type (array (unsigned-byte 8) (*)) dest)) (let ((code (char-code char))) (if (utf-noncharacter-code-p code) (let ((replacement (encoding-error :utf-16le string pos))) (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) (dotimes (i (length replacement)) (vector-push-extend (aref replacement i) dest))) (flet ((add-byte (b) (declare (type (unsigned-byte 8) b)) (vector-push-extend b dest))) (declare (inline add-byte)) (cond ((< code #x10000) (add-byte (ldb (byte 8 0) code)) (add-byte (ldb (byte 8 8) code))) (t (let* ((codeoid (- code #x10000)) (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800)) (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00))) (add-byte (ldb (byte 8 0) high)) (add-byte (ldb (byte 8 8) high)) (add-byte (ldb (byte 8 0) low)) (add-byte (ldb (byte 8 8) low))))))))) (declaim (inline char->utf-16be)) (defun char->utf-16be (char dest string pos) (declare (optimize speed (safety 0)) (type (array (unsigned-byte 8) (*)) dest)) (let ((code (char-code char))) (if (utf-noncharacter-code-p code) (let ((replacement (encoding-error :utf-16be string pos))) (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) (dotimes (i (length replacement)) (vector-push-extend (aref replacement i) dest))) (flet ((add-byte (b) (declare (type (unsigned-byte 8) b)) (vector-push-extend b dest))) (declare (inline add-byte)) (cond ((< code #x10000) (add-byte (ldb (byte 8 8) code)) (add-byte (ldb (byte 8 0) code))) (t (let* ((codeoid (- code #x10000)) (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800)) (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00))) (add-byte (ldb (byte 8 8) high)) (add-byte (ldb (byte 8 0) high)) (add-byte (ldb (byte 8 8) low)) (add-byte (ldb (byte 8 0) low))))))))) (defun string->utf-16le (string sstart send additional-space) (declare (optimize speed (safety 0)) (type simple-string string) (type array-range sstart send additional-space)) (let ((array (make-array (* 2 (+ additional-space (- send sstart))) :element-type '(unsigned-byte 8) :fill-pointer 0 :adjustable t))) (loop for i from sstart below send do (char->utf-16le (char string i) array string i)) (dotimes (i (* 2 additional-space)) (vector-push-extend 0 array)) (coerce array '(simple-array (unsigned-byte 8) (*))))) (defun string->utf-16be (string sstart send additional-space) (declare (optimize speed (safety 0)) (type simple-string string) (type array-range sstart send additional-space)) (let ((array (make-array (* 2 (+ additional-space (- send sstart))) :element-type '(unsigned-byte 8) :fill-pointer 0 :adjustable t))) (loop for i from sstart below send do (char->utf-16be (char string i) array string i)) (dotimes (i (* 2 additional-space)) (vector-push-extend 0 array)) (coerce array '(simple-array (unsigned-byte 8) (*))))) ;; Conversion from UTF-16{LE,BE} (defmacro define-bytes-per-utf16-character (accessor type) (declare (ignore type)) (let ((name-le (make-od-name 'bytes-per-utf-16le-character accessor)) (name-be (make-od-name 'bytes-per-utf-16be-character accessor))) `(progn (defun ,name-le (array pos end) (let ((remaining (- end pos))) (when (< remaining 2) (return-from ,name-le (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)))) (let ((low (dpb (,accessor array (+ pos 1)) (byte 8 8) (,accessor array pos)))) (if (<= #xd800 low #xdbff) (if (< remaining 4) (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)) (let ((high (dpb (,accessor array (+ pos 3)) (byte 8 8) (,accessor array (+ pos 2))))) (if (<= #xdc00 high #xdfff) (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high)))) (if (= (logand code #xfffe) #xfffe) (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos)) (values 4 nil))) (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))))) (if (or (<= #xdc00 low #xdfff) (<= #xfdd0 low #xfdef) (= (logand low #xfffe) #xfffe)) (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)) (values 2 nil)))))) (defun ,name-be (array pos end) (let ((remaining (- end pos))) (when (< remaining 2) (return-from ,name-be (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)))) (let ((low (dpb (,accessor array pos) (byte 8 8) (,accessor array (+ pos 1))))) (if (<= #xd800 low #xdbff) (if (< remaining 4) (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos)) (let ((high (dpb (,accessor array (+ pos 2)) (byte 8 8) (,accessor array (+ pos 3))))) (if (<= #xdc00 high #xdfff) (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high)))) (if (= (logand code #xfffe) #xfffe) (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos)) (values 4 nil))) (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))))) (if (or (<= #xdc00 low #xdfff) (<= #xfdd0 low #xfdef) (= (logand low #xfffe) #xfffe)) (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)) (values 2 nil))))))))) (instantiate-octets-definition define-bytes-per-utf16-character) (defmacro define-simple-get-utf16-character (accessor type) (let ((name-le (make-od-name 'simple-get-utf-16le-char accessor)) (name-be (make-od-name 'simple-get-utf-16be-char accessor))) `(progn (defun ,name-le (array pos bytes) (declare (optimize speed (safety 0)) (type ,type array) (type array-range pos) (type (integer 1 4) bytes) (ignore bytes)) ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that ;; reads two bytes at once on some architectures. (let ((code ,(if (and (eq accessor 'sap-ref-8) (eq type 'system-area-pointer)) '(sap-ref-16le array pos) `(flet ((cref (x) (,accessor array (the array-range (+ pos x))))) (declare (inline cref)) (dpb (cref 1) (byte 8 8) (cref 0)))))) (if (<= #xd800 code #xdbff) (let ((next ,(if (and (eq accessor 'sap-ref-8) (eq type 'system-area-pointer)) '(sap-ref-16le array (+ pos 2)) `(flet ((cref (x) (,accessor array (the array-range (+ pos x))))) (declare (inline cref)) (dpb (cref 3) (byte 8 8) (cref 2)))))) (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next))))) (code-char code)))) (defun ,name-be (array pos bytes) (declare (optimize speed (safety 0)) (type ,type array) (type array-range pos) (type (integer 1 4) bytes) (ignore bytes)) ;; Use SAP-REF-16BE even if it is not optimized (let ((code ,(if (and (eq accessor 'sap-ref-8) (eq type 'system-area-pointer)) '(sap-ref-16be array pos) `(flet ((cref (x) (,accessor array (the array-range (+ pos x))))) (declare (inline cref)) (dpb (cref 0) (byte 8 8) (cref 1)))))) (if (<= #xd800 code #xdbff) (let ((next ,(if (and (eq accessor 'sap-ref-8) (eq type 'system-area-pointer)) '(sap-ref-16be array (+ pos 2)) `(flet ((cref (x) (,accessor array (the array-range (+ pos x))))) (declare (inline cref)) (dpb (cref 2) (byte 8 8) (cref 3)))))) (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next))))) (code-char code))))))) (instantiate-octets-definition define-simple-get-utf16-character) (defmacro define-utf-16->string (accessor type) (let ((name-le (make-od-name 'utf-16le->string accessor)) (name-be (make-od-name 'utf-16be->string accessor))) `(progn (defun ,name-le (array astart aend) (declare (optimize speed (safety 0)) (type ,type array) (type array-range astart aend)) (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) (loop with pos = astart while (< pos aend) do (multiple-value-bind (bytes invalid) (,(make-od-name 'bytes-per-utf-16le-character accessor) array pos aend) (declare (type (or null string) invalid)) (cond ((null invalid) (vector-push-extend (,(make-od-name 'simple-get-utf-16le-char accessor) array pos bytes) string)) (t (dotimes (i (length invalid)) (vector-push-extend (char invalid i) string)))) (incf pos bytes))) string)) (defun ,name-be (array astart aend) (declare (optimize speed (safety 0)) (type ,type array) (type array-range astart aend)) (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) (loop with pos = astart while (< pos aend) do (multiple-value-bind (bytes invalid) (,(make-od-name 'bytes-per-utf-16be-character accessor) array pos aend) (declare (type (or null string) invalid)) (cond ((null invalid) (vector-push-extend (,(make-od-name 'simple-get-utf-16be-char accessor) array pos bytes) string)) (t (dotimes (i (length invalid)) (vector-push-extend (char invalid i) string)))) (incf pos bytes))) string))))) (instantiate-octets-definition define-utf-16->string) (define-external-format/variable-width (:utf-16le :utf16le) t (code-char #xfffd) (let ((bits (char-code byte))) (if (< bits #x10000) 2 4)) (cond ((< bits #x10000) (if (utf-noncharacter-code-p bits) (external-format-encoding-error stream bits) (setf (sap-ref-16le sap tail) bits))) (t (if (= (logand bits #xfffe) #xfffe) (external-format-encoding-error stream bits) (let* ((new-bits (- bits #x10000)) (high (ldb (byte 10 10) new-bits)) (low (ldb (byte 10 0) new-bits))) (setf (sap-ref-16le sap tail) (dpb high (byte 10 0) #xd800)) (setf (sap-ref-16le sap (+ tail 2)) (dpb low (byte 10 0) #xdc00)))))) (2 (if (<= #xd800 (sap-ref-16le sap head) #xdbff) 4 2)) (let ((bits (sap-ref-16le sap head))) (cond ((or (<= #xdc00 bits #xdfff) (<= #xfdd0 bits #xfdef) (= (logand bits #xfffe) #xfffe)) (return-from decode-break-reason 2)) ((<= #xd800 bits #xdbff) (let ((next (sap-ref-16le sap (+ head 2)))) (unless (<= #xdc00 next #xdfff) (return-from decode-break-reason 2)) (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next)))) (if (= (logand code #xfffe) #xfffe) (return-from decode-break-reason 4) (code-char (+ #x10000 code)))))) (t (code-char bits)))) utf-16le->string-aref string->utf-16le) (define-external-format/variable-width (:utf-16be :utf16be) t (code-char #xfffd) (let ((bits (char-code byte))) (if (< bits #x10000) 2 4)) (cond ((< bits #x10000) (if (utf-noncharacter-code-p bits) (external-format-encoding-error stream bits) (setf (sap-ref-16be sap tail) bits))) (t (if (= (logand bits #xfffe) #xfffe) (external-format-encoding-error stream bits) (let* ((new-bits (- bits #x10000)) (high (ldb (byte 10 10) new-bits)) (low (ldb (byte 10 0) new-bits))) (setf (sap-ref-16be sap tail) (dpb high (byte 10 0) #xd800)) (setf (sap-ref-16be sap (+ tail 2)) (dpb low (byte 10 0) #xdc00)))))) (2 (if (<= #xd800 (sap-ref-16be sap head) #xdbff) 4 2)) (let ((bits (sap-ref-16be sap head))) (cond ((or (<= #xdc00 bits #xdfff) (<= #xfdd0 bits #xfdef) (= (logand bits #xfffe) #xfffe)) (return-from decode-break-reason 2)) ((<= #xd800 bits #xdbff) (let ((next (sap-ref-16be sap (+ head 2)))) (unless (<= #xdc00 next #xdfff) (return-from decode-break-reason 2)) (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next)))) (if (= (logand code #xfffe) #xfffe) (return-from decode-break-reason 4) (code-char (+ #x10000 code)))))) (t (code-char bits)))) utf-16be->string-aref string->utf-16be) (declaim (inline char->utf-32le)) (defun char->utf-32le (char dest string pos) (declare (optimize speed (safety 0)) (type (array (unsigned-byte 8) (*)) dest)) (let ((code (char-code char))) (if (utf-noncharacter-code-p code) (let ((replacement (encoding-error :utf-32le string pos))) (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) (dotimes (i (length replacement)) (vector-push-extend (aref replacement i) dest))) (flet ((add-byte (b) (declare (type (unsigned-byte 8) b)) (vector-push-extend b dest))) (declare (inline add-byte)) (add-byte (ldb (byte 8 0) code)) (add-byte (ldb (byte 8 8) code)) (add-byte (ldb (byte 8 16) code)) (add-byte (ldb (byte 8 24) code)))))) (declaim (inline char->utf-32be)) (defun char->utf-32be (char dest string pos) (declare (optimize speed (safety 0)) (type (array (unsigned-byte 8) (*)) dest)) (let ((code (char-code char))) (if (utf-noncharacter-code-p code) (let ((replacement (encoding-error :utf-32be string pos))) (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) (dotimes (i (length replacement)) (vector-push-extend (aref replacement i) dest))) (flet ((add-byte (b) (declare (type (unsigned-byte 8) b)) (vector-push-extend b dest))) (declare (inline add-byte)) (add-byte (ldb (byte 8 24) code)) (add-byte (ldb (byte 8 16) code)) (add-byte (ldb (byte 8 8) code)) (add-byte (ldb (byte 8 0) code)))))) (defun string->utf-32le (string sstart send additional-space) (declare (optimize speed (safety 0)) (type simple-string string) (type array-range sstart send additional-space)) (let ((array (make-array (* 4 (+ additional-space (- send sstart))) :element-type '(unsigned-byte 8) :fill-pointer 0 :adjustable t))) (loop for i from sstart below send do (char->utf-32le (char string i) array string i)) (dotimes (i (* 4 additional-space)) (vector-push-extend 0 array)) (coerce array '(simple-array (unsigned-byte 8) (*))))) (defun string->utf-32be (string sstart send additional-space) (declare (optimize speed (safety 0)) (type simple-string string) (type array-range sstart send additional-space)) (let ((array (make-array (* 4 (+ additional-space (- send sstart))) :element-type '(unsigned-byte 8) :fill-pointer 0 :adjustable t))) (loop for i from sstart below send do (char->utf-32be (char string i) array string i)) (dotimes (i (* 4 additional-space)) (vector-push-extend 0 array)) (coerce array '(simple-array (unsigned-byte 8) (*))))) ;; Conversion from UTF-32{LE,BE} (defmacro define-bytes-per-utf32-character (accessor type) (declare (ignore type)) (let ((name-le (make-od-name 'bytes-per-utf-32le-character accessor)) (name-be (make-od-name 'bytes-per-utf-32be-character accessor))) `(progn (defun ,name-le (array pos end) (declare (ignore array pos end)) (values 4 nil)) (defun ,name-be (array pos end) (declare (ignore array pos end)) (values 4 nil))))) (instantiate-octets-definition define-bytes-per-utf32-character) (defmacro define-simple-get-utf32-character (accessor type) (let ((name-le (make-od-name 'simple-get-utf-32le-char accessor)) (name-be (make-od-name 'simple-get-utf-32be-char accessor))) `(progn (defun ,name-le (array pos bytes) (declare (optimize speed (safety 0)) (type ,type array) (type array-range pos) (type (integer 1 4) bytes)) ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that ;; reads four bytes at once on some architectures. (let ((code ,(if (and (eq accessor 'sap-ref-8) (eq type 'system-area-pointer)) '(sap-ref-32le array pos) `(flet ((cref (x) (,accessor array (the array-range (+ pos x))))) (declare (inline cref)) (dpb (cref 3) (byte 8 24) (dpb (cref 2) (byte 8 16) (dpb (cref 1) (byte 8 8) (cref 0)))))))) (if (and (< code sb!xc:char-code-limit) (not (utf-noncharacter-code-p code))) (code-char code) (decoding-error array pos (+ pos bytes) :utf-32le 'octet-decoding-error pos)))) (defun ,name-be (array pos bytes) (declare (optimize speed (safety 0)) (type ,type array) (type array-range pos) (type (integer 1 4) bytes)) ;; Use SAP-REF-32BE even if it is not optimized (let ((code ,(if (and (eq accessor 'sap-ref-8) (eq type 'system-area-pointer)) '(sap-ref-32be array pos) `(flet ((cref (x) (,accessor array (the array-range (+ pos x))))) (declare (inline cref)) (dpb (cref 0) (byte 8 24) (dpb (cref 1) (byte 8 16) (dpb (cref 2) (byte 8 8) (cref 3)))))))) (if (and (< code sb!xc:char-code-limit) (not (utf-noncharacter-code-p code))) (code-char code) (decoding-error array pos (+ pos bytes) :utf-32be 'octet-decoding-error pos))))))) (instantiate-octets-definition define-simple-get-utf32-character) (defmacro define-utf-32->string (accessor type) (let ((name-le (make-od-name 'utf-32le->string accessor)) (name-be (make-od-name 'utf-32be->string accessor))) `(progn (defun ,name-le (array astart aend) (declare (optimize speed (safety 0)) (type ,type array) (type array-range astart aend)) (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) (loop with pos = astart while (< pos aend) do (multiple-value-bind (bytes invalid) (,(make-od-name 'bytes-per-utf-32le-character accessor) array pos aend) (declare (type (or null string) invalid)) (aver (null invalid)) (let ((thing (,(make-od-name 'simple-get-utf-32le-char accessor) array pos bytes))) (typecase thing (character (vector-push-extend thing string)) (string (dotimes (i (length thing)) (vector-push-extend (char thing i) string))))) (incf pos bytes))) string)) (defun ,name-be (array astart aend) (declare (optimize speed (safety 0)) (type ,type array) (type array-range astart aend)) (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) (loop with pos = astart while (< pos aend) do (multiple-value-bind (bytes invalid) (,(make-od-name 'bytes-per-utf-32be-character accessor) array pos aend) (declare (type (or null string) invalid)) (aver (null invalid)) (let ((thing (,(make-od-name 'simple-get-utf-32be-char accessor) array pos bytes))) (typecase thing (character (vector-push-extend thing string)) (string (dotimes (i (length thing)) (vector-push-extend (char thing i) string))))) (incf pos bytes))) string))))) (instantiate-octets-definition define-utf-32->string) (define-external-format/variable-width (:utf-32le :utf32le) t (code-char #xfffd) 4 (if (utf-noncharacter-code-p bits) (external-format-encoding-error stream bits) (setf (sap-ref-32le sap tail) bits)) 4 (let ((code (sap-ref-32le sap head))) (if (and (< code sb!xc:char-code-limit) (not (utf-noncharacter-code-p code))) (code-char code) (return-from decode-break-reason 4))) utf-32le->string-aref string->utf-32le) (define-external-format/variable-width (:utf-32be :utf32be) t (code-char #xfffd) 4 (if (utf-noncharacter-code-p bits) (external-format-encoding-error stream bits) (setf (sap-ref-32be sap tail) bits)) 4 (let ((code (sap-ref-32be sap head))) (if (and (< code sb!xc:char-code-limit) (not (utf-noncharacter-code-p code))) (code-char code) (return-from decode-break-reason 4))) utf-32be->string-aref string->utf-32be)