X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=src%2Fcode%2Fexternal-formats%2Fenc-ucs.lisp;fp=src%2Fcode%2Fexternal-formats%2Fenc-ucs.lisp;h=6fecf6b1a38fbf9650a8d7dc4087dc60b0ca7a9d;hb=99a8f9e012fd2b5edc9a0927edfc537332081997;hp=0000000000000000000000000000000000000000;hpb=840f5a3fa4c9142fd0d63dea8e8b76bce49aaf67;p=sbcl.git diff --git a/src/code/external-formats/enc-ucs.lisp b/src/code/external-formats/enc-ucs.lisp new file mode 100644 index 0000000..6fecf6b --- /dev/null +++ b/src/code/external-formats/enc-ucs.lisp @@ -0,0 +1,441 @@ +;;;; Universal Character Set (UCS) encodings +;;;; +;;;; In our interpretation, these are distinct from UTF encodings: the +;;;; UCS encodings are a direct encoding of the code point, in 16- and +;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode +;;;; surrogate code points specially. + +;;;; This software is part of the SBCL system. See the README file for +;;;; more information. +;;;; +;;;; This software is derived from the CMU CL system, which was +;;;; written at Carnegie Mellon University and released into the +;;;; public domain. The software is in the public domain and is +;;;; provided with absolutely no warranty. See the COPYING and CREDITS +;;;; files for more information. + +(in-package "SB!IMPL") + +;;; TODO Macro for generating different variants: +;;; :ucs-2le (little endian) sap-ref-16le +;;; :ucs-2be (big endian) sap-ref-16be +;;; :ucs-2 (native) sap-ref-16 + +;;; Utilities + +(declaim + (inline sap-ref-16le (setf sap-ref-16le) sap-ref-16be (setf sap-ref-16be) + sap-ref-32le (setf sap-ref-32le) sap-ref-32be (setf sap-ref-32be))) + +;;; Define feature LITTLE-ENDIAN-AND-MISALIGNED-READ? +(defun sap-ref-16le (sap offset) + #!+(or x86 x86-64) + (sap-ref-16 sap offset) + #!-(or x86 x86-64) + (dpb (sap-ref-8 sap (1+ offset)) (byte 8 8) + (sap-ref-8 sap offset))) + +(defun (setf sap-ref-16le) (value sap offset) + #!+(or x86 x86-64) + (setf (sap-ref-16 sap offset) value) + #!-(or x86 x86-64) + (setf (sap-ref-8 sap offset) (logand value #xff) + (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value))) + +(defun sap-ref-16be (sap offset) + (dpb (sap-ref-8 sap offset) (byte 8 8) + (sap-ref-8 sap (1+ offset)))) + +(defun (setf sap-ref-16be) (value sap offset) + (setf (sap-ref-8 sap (1+ offset)) (logand value #xff) + (sap-ref-8 sap offset) (ldb (byte 8 8) value))) + +(defun sap-ref-32le (sap offset) + #!+(or x86 x86-64) + (sap-ref-32 sap offset) + #!-(or x86 x86-64) + (dpb (sap-ref-8 sap (+ offset 3)) (byte 8 24) + (dpb (sap-ref-8 sap (+ offset 2)) (byte 8 16) + (sap-ref-16 sap offset)))) + +(defun (setf sap-ref-32le) (value sap offset) + #!+(or x86 x86-64) + (setf (sap-ref-32 sap offset) value) + #!-(or x86 x86-64) + (setf (sap-ref-8 sap offset) (logand value #xff) + (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value) + (sap-ref-8 sap (+ offset 2)) (ldb (byte 8 16) value) + (sap-ref-8 sap (+ offset 3)) (ldb (byte 8 24) value))) + +(defun sap-ref-32be (sap offset) + (dpb (sap-ref-8 sap offset) (byte 8 24) + (dpb (sap-ref-8 sap (1+ offset)) (byte 8 16) + (dpb (sap-ref-8 sap (+ offset 2)) (byte 8 8) + (sap-ref-8 sap (+ offset 3)))))) + +(defun (setf sap-ref-32be) (value sap offset) + (setf (sap-ref-8 sap offset) (ldb (byte 8 24) value) + (sap-ref-8 sap (1+ offset)) (ldb (byte 8 16) value) + (sap-ref-8 sap (+ offset 2)) (ldb (byte 8 8) value) + (sap-ref-8 sap (+ offset 3)) (logand value #xff))) + +;;; +;;; octets +;;; + +;;; Conversion to UCS-2{LE,BE} +(declaim (inline char->ucs-2le)) +(defun char->ucs-2le (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (< code #x10000) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 0) code)) + (add-byte (ldb (byte 8 8) code))) + (let ((replacement (encoding-error :ucs-2le string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest)))))) + +(declaim (inline char->ucs-2be)) +(defun char->ucs-2be (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest)) + (let ((code (char-code char))) + (if (< code #x10000) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 0) code))) + (let ((replacement (encoding-error :ucs-2be string pos))) + (declare (type (simple-array (unsigned-byte 8) (*)) replacement)) + (dotimes (i (length replacement)) + (vector-push-extend (aref replacement i) dest)))))) + +(defun string->ucs-2le (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 2 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-2le (char string i) array string i)) + (dotimes (i (* 2 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +(defun string->ucs-2be (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 2 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-2be (char string i) array string i)) + (dotimes (i (* 2 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +;; Conversion from UCS-2{LE,BE} +(defmacro define-bytes-per-ucs2-character (accessor type) + (declare (ignore type)) + (let ((name-le (make-od-name 'bytes-per-ucs-2le-character accessor)) + (name-be (make-od-name 'bytes-per-ucs-2be-character accessor))) + `(progn + (defun ,name-le (array pos end) + (declare (ignore array pos end)) + (values 2 nil)) + (defun ,name-be (array pos end) + (declare (ignore array pos end)) + (values 2 nil))))) +(instantiate-octets-definition define-bytes-per-ucs2-character) + +(defmacro define-simple-get-ucs2-character (accessor type) + (let ((name-le (make-od-name 'simple-get-ucs-2le-char accessor)) + (name-be (make-od-name 'simple-get-ucs-2be-char accessor))) + `(progn + (defun ,name-le (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes) + (ignore bytes)) + ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that + ;; reads two bytes at once on some architectures. + ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(code-char (sap-ref-16le array pos)) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (code-char (dpb (cref 1) (byte 8 8) (cref 0)))))) + (defun ,name-be (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes) + (ignore bytes)) + ;; Use SAP-REF-16BE even if it is not optimized + ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(code-char (sap-ref-16be array pos)) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (code-char (dpb (cref 0) (byte 8 8) (cref 1))))))))) + +(instantiate-octets-definition define-simple-get-ucs2-character) + +(defmacro define-ucs-2->string (accessor type) + (let ((name-le (make-od-name 'ucs-2le->string accessor)) + (name-be (make-od-name 'ucs-2be->string accessor))) + `(progn + (defun ,name-le (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-2le-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (vector-push-extend + (,(make-od-name 'simple-get-ucs-2le-char accessor) + array pos bytes) + string) + (incf pos bytes))) + string)) + (defun ,name-be (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-2be-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (vector-push-extend + (,(make-od-name 'simple-get-ucs-2be-char accessor) + array pos bytes) + string) + (incf pos bytes))) + string))))) + +(instantiate-octets-definition define-ucs-2->string) + +(define-external-format/variable-width (:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) t + (code-char #xfffd) + 2 + (if (< bits #x10000) + (setf (sap-ref-16le sap tail) bits) + (external-format-encoding-error stream bits)) + 2 + (code-char (sap-ref-16le sap head)) + ucs-2le->string-aref + string->ucs-2le) + +(define-external-format/variable-width (:ucs-2be :ucs2be) t + (code-char #xfffd) + 2 + (if (< bits #x10000) + (setf (sap-ref-16be sap tail) bits) + (external-format-encoding-error stream bits)) + 2 + (code-char (sap-ref-16be sap head)) + ucs-2be->string-aref + string->ucs-2be) + +(declaim (inline char->ucs-4le)) +(defun char->ucs-4le (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest) + (ignore string pos)) + (let ((code (char-code char))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 0) code)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 16) code)) + (add-byte (ldb (byte 8 24) code))))) + +(declaim (inline char->ucs-4be)) +(defun char->ucs-4be (char dest string pos) + (declare (optimize speed (safety 0)) + (type (array (unsigned-byte 8) (*)) dest) + (ignore string pos)) + (let ((code (char-code char))) + (flet ((add-byte (b) + (declare (type (unsigned-byte 8) b)) + (vector-push-extend b dest))) + (declare (inline add-byte)) + (add-byte (ldb (byte 8 24) code)) + (add-byte (ldb (byte 8 16) code)) + (add-byte (ldb (byte 8 8) code)) + (add-byte (ldb (byte 8 0) code))))) + +(defun string->ucs-4le (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 4 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-4le (char string i) array string i)) + (dotimes (i (* 4 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +(defun string->ucs-4be (string sstart send additional-space) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send additional-space)) + (let ((array (make-array (* 4 (+ additional-space (- send sstart))) + :element-type '(unsigned-byte 8) + :fill-pointer 0 :adjustable t))) + (loop for i from sstart below send + do (char->ucs-4be (char string i) array string i)) + (dotimes (i (* 4 additional-space)) + (vector-push-extend 0 array)) + (coerce array '(simple-array (unsigned-byte 8) (*))))) + +;; Conversion from UCS-4{LE,BE} +(defmacro define-bytes-per-ucs4-character (accessor type) + (declare (ignore type)) + (let ((name-le (make-od-name 'bytes-per-ucs-4le-character accessor)) + (name-be (make-od-name 'bytes-per-ucs-4be-character accessor))) + `(progn + (defun ,name-le (array pos end) + (declare (ignore array pos end)) + (values 4 nil)) + (defun ,name-be (array pos end) + (declare (ignore array pos end)) + (values 4 nil))))) +(instantiate-octets-definition define-bytes-per-ucs4-character) + +(defmacro define-simple-get-ucs4-character (accessor type) + (let ((name-le (make-od-name 'simple-get-ucs-4le-char accessor)) + (name-be (make-od-name 'simple-get-ucs-4be-char accessor))) + `(progn + (defun ,name-le (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that + ;; reads four bytes at once on some architectures. + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-32le array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 3) (byte 8 24) + (dpb (cref 2) (byte 8 16) + (dpb (cref 1) (byte 8 8) (cref 0)))))))) + (if (< code char-code-limit) + (code-char code) + (decoding-error array pos (+ pos bytes) :ucs-4le + 'octet-decoding-error pos)))) + (defun ,name-be (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + ;; Use SAP-REF-32BE even if it is not optimized + (let ((code ,(if (and (eq accessor 'sap-ref-8) + (eq type 'system-area-pointer)) + '(sap-ref-32be array pos) + `(flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (dpb (cref 0) (byte 8 24) + (dpb (cref 1) (byte 8 16) + (dpb (cref 2) (byte 8 8) (cref 3)))))))) + (if (< code char-code-limit) + (code-char code) + (decoding-error array pos (+ pos bytes) :ucs-4be + 'octet-decoding-error pos))))))) + +(instantiate-octets-definition define-simple-get-ucs4-character) + +(defmacro define-ucs-4->string (accessor type) + (let ((name-le (make-od-name 'ucs-4le->string accessor)) + (name-be (make-od-name 'ucs-4be->string accessor))) + `(progn + (defun ,name-le (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-4le-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (let ((thing (,(make-od-name 'simple-get-ucs-4le-char accessor) array pos bytes))) + (typecase thing + (character (vector-push-extend thing string)) + (string (dotimes (i (length thing)) + (vector-push-extend (char thing i) string))))) + (incf pos bytes))) + string)) + (defun ,name-be (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-ucs-4be-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (aver (null invalid)) + (let ((thing (,(make-od-name 'simple-get-ucs-4be-char accessor) array pos bytes))) + (typecase thing + (character (vector-push-extend thing string)) + (string (dotimes (i (length thing)) + (vector-push-extend (char thing i) string))))) + (incf pos bytes))) + string))))) + +(instantiate-octets-definition define-ucs-4->string) + +(define-external-format/variable-width (:ucs-4le :ucs4le) nil + (code-char #xfffd) + 4 + (setf (sap-ref-32le sap tail) bits) + 4 + (let ((code (sap-ref-32le sap head))) + (if (< code char-code-limit) + (code-char code) + (return-from decode-break-reason 4))) + ucs-4le->string-aref + string->ucs-4le) + +(define-external-format/variable-width (:ucs-4be :ucs4be) t + (code-char #xfffd) + 4 + (setf (sap-ref-32be sap tail) bits) + 4 + (let ((code (sap-ref-32be sap head))) + (if (< code char-code-limit) + (code-char code) + (return-from decode-break-reason 4))) + ucs-4be->string-aref + string->ucs-4be)