1 ;;;; Universal Character Set (UCS) encodings
3 ;;;; In our interpretation, these are distinct from UTF encodings: the
4 ;;;; UCS encodings are a direct encoding of the code point, in 16- and
5 ;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode
6 ;;;; surrogate code points specially.
8 ;;;; This software is part of the SBCL system. See the README file for
11 ;;;; This software is derived from the CMU CL system, which was
12 ;;;; written at Carnegie Mellon University and released into the
13 ;;;; public domain. The software is in the public domain and is
14 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
15 ;;;; files for more information.
17 (in-package "SB!IMPL")
19 ;;; TODO Macro for generating different variants:
20 ;;; :ucs-2le (little endian) sap-ref-16le
21 ;;; :ucs-2be (big endian) sap-ref-16be
22 ;;; :ucs-2 (native) sap-ref-16
27 (inline sap-ref-16le (setf sap-ref-16le) sap-ref-16be (setf sap-ref-16be)
28 sap-ref-32le (setf sap-ref-32le) sap-ref-32be (setf sap-ref-32be)))
30 ;;; Define feature LITTLE-ENDIAN-AND-MISALIGNED-READ?
31 (defun sap-ref-16le (sap offset)
33 (sap-ref-16 sap offset)
35 (dpb (sap-ref-8 sap (1+ offset)) (byte 8 8)
36 (sap-ref-8 sap offset)))
38 (defun (setf sap-ref-16le) (value sap offset)
40 (setf (sap-ref-16 sap offset) value)
42 (setf (sap-ref-8 sap offset) (logand value #xff)
43 (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value)))
45 (defun sap-ref-16be (sap offset)
46 (dpb (sap-ref-8 sap offset) (byte 8 8)
47 (sap-ref-8 sap (1+ offset))))
49 (defun (setf sap-ref-16be) (value sap offset)
50 (setf (sap-ref-8 sap (1+ offset)) (logand value #xff)
51 (sap-ref-8 sap offset) (ldb (byte 8 8) value)))
53 (defun sap-ref-32le (sap offset)
55 (sap-ref-32 sap offset)
57 (dpb (sap-ref-8 sap (+ offset 3)) (byte 8 24)
58 (dpb (sap-ref-8 sap (+ offset 2)) (byte 8 16)
59 (sap-ref-16le sap offset))))
61 (defun (setf sap-ref-32le) (value sap offset)
63 (setf (sap-ref-32 sap offset) value)
65 (setf (sap-ref-8 sap offset) (logand value #xff)
66 (sap-ref-8 sap (1+ offset)) (ldb (byte 8 8) value)
67 (sap-ref-8 sap (+ offset 2)) (ldb (byte 8 16) value)
68 (sap-ref-8 sap (+ offset 3)) (ldb (byte 8 24) value)))
70 (defun sap-ref-32be (sap offset)
71 (dpb (sap-ref-8 sap offset) (byte 8 24)
72 (dpb (sap-ref-8 sap (1+ offset)) (byte 8 16)
73 (dpb (sap-ref-8 sap (+ offset 2)) (byte 8 8)
74 (sap-ref-8 sap (+ offset 3))))))
76 (defun (setf sap-ref-32be) (value sap offset)
77 (setf (sap-ref-8 sap offset) (ldb (byte 8 24) value)
78 (sap-ref-8 sap (1+ offset)) (ldb (byte 8 16) value)
79 (sap-ref-8 sap (+ offset 2)) (ldb (byte 8 8) value)
80 (sap-ref-8 sap (+ offset 3)) (logand value #xff)))
86 ;;; Conversion to UCS-2{LE,BE}
87 (declaim (inline char->ucs-2le))
88 (defun char->ucs-2le (char dest string pos)
89 (declare (optimize speed (safety 0))
90 (type (array (unsigned-byte 8) (*)) dest))
91 (let ((code (char-code char)))
94 (declare (type (unsigned-byte 8) b))
95 (vector-push-extend b dest)))
96 (declare (inline add-byte))
97 (add-byte (ldb (byte 8 0) code))
98 (add-byte (ldb (byte 8 8) code)))
99 (let ((replacement (encoding-error :ucs-2le string pos)))
100 (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
101 (dotimes (i (length replacement))
102 (vector-push-extend (aref replacement i) dest))))))
104 (declaim (inline char->ucs-2be))
105 (defun char->ucs-2be (char dest string pos)
106 (declare (optimize speed (safety 0))
107 (type (array (unsigned-byte 8) (*)) dest))
108 (let ((code (char-code char)))
111 (declare (type (unsigned-byte 8) b))
112 (vector-push-extend b dest)))
113 (declare (inline add-byte))
114 (add-byte (ldb (byte 8 8) code))
115 (add-byte (ldb (byte 8 0) code)))
116 (let ((replacement (encoding-error :ucs-2be string pos)))
117 (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
118 (dotimes (i (length replacement))
119 (vector-push-extend (aref replacement i) dest))))))
121 (defun string->ucs-2le (string sstart send additional-space)
122 (declare (optimize speed (safety 0))
123 (type simple-string string)
124 (type array-range sstart send additional-space))
125 (let ((array (make-array (* 2 (+ additional-space (- send sstart)))
126 :element-type '(unsigned-byte 8)
127 :fill-pointer 0 :adjustable t)))
128 (loop for i from sstart below send
129 do (char->ucs-2le (char string i) array string i))
130 (dotimes (i (* 2 additional-space))
131 (vector-push-extend 0 array))
132 (coerce array '(simple-array (unsigned-byte 8) (*)))))
134 (defun string->ucs-2be (string sstart send additional-space)
135 (declare (optimize speed (safety 0))
136 (type simple-string string)
137 (type array-range sstart send additional-space))
138 (let ((array (make-array (* 2 (+ additional-space (- send sstart)))
139 :element-type '(unsigned-byte 8)
140 :fill-pointer 0 :adjustable t)))
141 (loop for i from sstart below send
142 do (char->ucs-2be (char string i) array string i))
143 (dotimes (i (* 2 additional-space))
144 (vector-push-extend 0 array))
145 (coerce array '(simple-array (unsigned-byte 8) (*)))))
147 ;; Conversion from UCS-2{LE,BE}
148 (defmacro define-bytes-per-ucs2-character (accessor type)
149 (declare (ignore type))
150 (let ((name-le (make-od-name 'bytes-per-ucs-2le-character accessor))
151 (name-be (make-od-name 'bytes-per-ucs-2be-character accessor)))
153 (defun ,name-le (array pos end)
154 (declare (ignore array pos end))
156 (defun ,name-be (array pos end)
157 (declare (ignore array pos end))
159 (instantiate-octets-definition define-bytes-per-ucs2-character)
161 (defmacro define-simple-get-ucs2-character (accessor type)
162 (let ((name-le (make-od-name 'simple-get-ucs-2le-char accessor))
163 (name-be (make-od-name 'simple-get-ucs-2be-char accessor)))
165 (defun ,name-le (array pos bytes)
166 (declare (optimize speed (safety 0))
168 (type array-range pos)
169 (type (integer 1 4) bytes)
171 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that
172 ;; reads two bytes at once on some architectures.
173 ,(if (and (eq accessor 'sap-ref-8)
174 (eq type 'system-area-pointer))
175 '(code-char (sap-ref-16le array pos))
177 (,accessor array (the array-range (+ pos x)))))
178 (declare (inline cref))
179 (code-char (dpb (cref 1) (byte 8 8) (cref 0))))))
180 (defun ,name-be (array pos bytes)
181 (declare (optimize speed (safety 0))
183 (type array-range pos)
184 (type (integer 1 4) bytes)
186 ;; Use SAP-REF-16BE even if it is not optimized
187 ,(if (and (eq accessor 'sap-ref-8)
188 (eq type 'system-area-pointer))
189 '(code-char (sap-ref-16be array pos))
191 (,accessor array (the array-range (+ pos x)))))
192 (declare (inline cref))
193 (code-char (dpb (cref 0) (byte 8 8) (cref 1)))))))))
195 (instantiate-octets-definition define-simple-get-ucs2-character)
197 (defmacro define-ucs-2->string (accessor type)
198 (let ((name-le (make-od-name 'ucs-2le->string accessor))
199 (name-be (make-od-name 'ucs-2be->string accessor)))
201 (defun ,name-le (array astart aend)
202 (declare (optimize speed (safety 0))
204 (type array-range astart aend))
205 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
206 (loop with pos = astart
208 do (multiple-value-bind (bytes invalid)
209 (,(make-od-name 'bytes-per-ucs-2le-character accessor) array pos aend)
210 (declare (type (or null string) invalid))
211 (aver (null invalid))
213 (,(make-od-name 'simple-get-ucs-2le-char accessor)
218 (defun ,name-be (array astart aend)
219 (declare (optimize speed (safety 0))
221 (type array-range astart aend))
222 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
223 (loop with pos = astart
225 do (multiple-value-bind (bytes invalid)
226 (,(make-od-name 'bytes-per-ucs-2be-character accessor) array pos aend)
227 (declare (type (or null string) invalid))
228 (aver (null invalid))
230 (,(make-od-name 'simple-get-ucs-2be-char accessor)
236 (instantiate-octets-definition define-ucs-2->string)
238 (define-external-format/variable-width (:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) t
242 (setf (sap-ref-16le sap tail) bits)
243 (external-format-encoding-error stream bits))
245 (code-char (sap-ref-16le sap head))
249 (define-external-format/variable-width (:ucs-2be :ucs2be) t
253 (setf (sap-ref-16be sap tail) bits)
254 (external-format-encoding-error stream bits))
256 (code-char (sap-ref-16be sap head))
260 (declaim (inline char->ucs-4le))
261 (defun char->ucs-4le (char dest string pos)
262 (declare (optimize speed (safety 0))
263 (type (array (unsigned-byte 8) (*)) dest)
265 (let ((code (char-code char)))
267 (declare (type (unsigned-byte 8) b))
268 (vector-push-extend b dest)))
269 (declare (inline add-byte))
270 (add-byte (ldb (byte 8 0) code))
271 (add-byte (ldb (byte 8 8) code))
272 (add-byte (ldb (byte 8 16) code))
273 (add-byte (ldb (byte 8 24) code)))))
275 (declaim (inline char->ucs-4be))
276 (defun char->ucs-4be (char dest string pos)
277 (declare (optimize speed (safety 0))
278 (type (array (unsigned-byte 8) (*)) dest)
280 (let ((code (char-code char)))
282 (declare (type (unsigned-byte 8) b))
283 (vector-push-extend b dest)))
284 (declare (inline add-byte))
285 (add-byte (ldb (byte 8 24) code))
286 (add-byte (ldb (byte 8 16) code))
287 (add-byte (ldb (byte 8 8) code))
288 (add-byte (ldb (byte 8 0) code)))))
290 (defun string->ucs-4le (string sstart send additional-space)
291 (declare (optimize speed (safety 0))
292 (type simple-string string)
293 (type array-range sstart send additional-space))
294 (let ((array (make-array (* 4 (+ additional-space (- send sstart)))
295 :element-type '(unsigned-byte 8)
296 :fill-pointer 0 :adjustable t)))
297 (loop for i from sstart below send
298 do (char->ucs-4le (char string i) array string i))
299 (dotimes (i (* 4 additional-space))
300 (vector-push-extend 0 array))
301 (coerce array '(simple-array (unsigned-byte 8) (*)))))
303 (defun string->ucs-4be (string sstart send additional-space)
304 (declare (optimize speed (safety 0))
305 (type simple-string string)
306 (type array-range sstart send additional-space))
307 (let ((array (make-array (* 4 (+ additional-space (- send sstart)))
308 :element-type '(unsigned-byte 8)
309 :fill-pointer 0 :adjustable t)))
310 (loop for i from sstart below send
311 do (char->ucs-4be (char string i) array string i))
312 (dotimes (i (* 4 additional-space))
313 (vector-push-extend 0 array))
314 (coerce array '(simple-array (unsigned-byte 8) (*)))))
316 ;; Conversion from UCS-4{LE,BE}
317 (defmacro define-bytes-per-ucs4-character (accessor type)
318 (declare (ignore type))
319 (let ((name-le (make-od-name 'bytes-per-ucs-4le-character accessor))
320 (name-be (make-od-name 'bytes-per-ucs-4be-character accessor)))
322 (defun ,name-le (array pos end)
323 (declare (ignore array pos end))
325 (defun ,name-be (array pos end)
326 (declare (ignore array pos end))
328 (instantiate-octets-definition define-bytes-per-ucs4-character)
330 (defmacro define-simple-get-ucs4-character (accessor type)
331 (let ((name-le (make-od-name 'simple-get-ucs-4le-char accessor))
332 (name-be (make-od-name 'simple-get-ucs-4be-char accessor)))
334 (defun ,name-le (array pos bytes)
335 (declare (optimize speed (safety 0))
337 (type array-range pos)
338 (type (integer 1 4) bytes))
339 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that
340 ;; reads four bytes at once on some architectures.
341 (let ((code ,(if (and (eq accessor 'sap-ref-8)
342 (eq type 'system-area-pointer))
343 '(sap-ref-32le array pos)
345 (,accessor array (the array-range (+ pos x)))))
346 (declare (inline cref))
347 (dpb (cref 3) (byte 8 24)
348 (dpb (cref 2) (byte 8 16)
349 (dpb (cref 1) (byte 8 8) (cref 0))))))))
350 (if (< code char-code-limit)
352 (decoding-error array pos (+ pos bytes) :ucs-4le
353 'octet-decoding-error pos))))
354 (defun ,name-be (array pos bytes)
355 (declare (optimize speed (safety 0))
357 (type array-range pos)
358 (type (integer 1 4) bytes))
359 ;; Use SAP-REF-32BE even if it is not optimized
360 (let ((code ,(if (and (eq accessor 'sap-ref-8)
361 (eq type 'system-area-pointer))
362 '(sap-ref-32be array pos)
364 (,accessor array (the array-range (+ pos x)))))
365 (declare (inline cref))
366 (dpb (cref 0) (byte 8 24)
367 (dpb (cref 1) (byte 8 16)
368 (dpb (cref 2) (byte 8 8) (cref 3))))))))
369 (if (< code char-code-limit)
371 (decoding-error array pos (+ pos bytes) :ucs-4be
372 'octet-decoding-error pos)))))))
374 (instantiate-octets-definition define-simple-get-ucs4-character)
376 (defmacro define-ucs-4->string (accessor type)
377 (let ((name-le (make-od-name 'ucs-4le->string accessor))
378 (name-be (make-od-name 'ucs-4be->string accessor)))
380 (defun ,name-le (array astart aend)
381 (declare (optimize speed (safety 0))
383 (type array-range astart aend))
384 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
385 (loop with pos = astart
387 do (multiple-value-bind (bytes invalid)
388 (,(make-od-name 'bytes-per-ucs-4le-character accessor) array pos aend)
389 (declare (type (or null string) invalid))
390 (aver (null invalid))
391 (let ((thing (,(make-od-name 'simple-get-ucs-4le-char accessor) array pos bytes)))
393 (character (vector-push-extend thing string))
394 (string (dotimes (i (length thing))
395 (vector-push-extend (char thing i) string)))))
398 (defun ,name-be (array astart aend)
399 (declare (optimize speed (safety 0))
401 (type array-range astart aend))
402 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
403 (loop with pos = astart
405 do (multiple-value-bind (bytes invalid)
406 (,(make-od-name 'bytes-per-ucs-4be-character accessor) array pos aend)
407 (declare (type (or null string) invalid))
408 (aver (null invalid))
409 (let ((thing (,(make-od-name 'simple-get-ucs-4be-char accessor) array pos bytes)))
411 (character (vector-push-extend thing string))
412 (string (dotimes (i (length thing))
413 (vector-push-extend (char thing i) string)))))
417 (instantiate-octets-definition define-ucs-4->string)
419 (define-external-format/variable-width (:ucs-4le :ucs4le) nil
422 (setf (sap-ref-32le sap tail) bits)
424 (let ((code (sap-ref-32le sap head)))
425 (if (< code char-code-limit)
427 (return-from decode-break-reason 4)))
431 (define-external-format/variable-width (:ucs-4be :ucs4be) nil
434 (setf (sap-ref-32be sap tail) bits)
436 (let ((code (sap-ref-32be sap head)))
437 (if (< code char-code-limit)
439 (return-from decode-break-reason 4)))