1 ;;;; Unicode Transformation Format (UTF) encodings
3 ;;;; In our interpretation, these are distinct from UCS encodings: the
4 ;;;; UCS encodings are a direct encoding of the code point, in 16- and
5 ;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode
6 ;;;; surrogate code points specially.
8 ;;;; This software is part of the SBCL system. See the README file for
11 ;;;; This software is derived from the CMU CL system, which was
12 ;;;; written at Carnegie Mellon University and released into the
13 ;;;; public domain. The software is in the public domain and is
14 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
15 ;;;; files for more information.
17 (in-package "SB!IMPL")
20 (declaim (inline utf-noncharacter-code-p))
21 (defun utf-noncharacter-code-p (code)
22 (or (<= #xd800 code #xdfff)
23 (<= #xfdd0 code #xfdef)
24 (= (logand code #xfffe) #xfffe)))
26 ;;; Conversion to UTF-16{LE,BE}
27 (declaim (inline char->utf-16le))
28 (defun char->utf-16le (char dest string pos)
29 (declare (optimize speed (safety 0))
30 (type (array (unsigned-byte 8) (*)) dest))
31 (let ((code (char-code char)))
32 (if (utf-noncharacter-code-p code)
33 (let ((replacement (encoding-error :utf-16le string pos)))
34 (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
35 (dotimes (i (length replacement))
36 (vector-push-extend (aref replacement i) dest)))
38 (declare (type (unsigned-byte 8) b))
39 (vector-push-extend b dest)))
40 (declare (inline add-byte))
43 (add-byte (ldb (byte 8 0) code))
44 (add-byte (ldb (byte 8 8) code)))
46 (let* ((codeoid (- code #x10000))
47 (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800))
48 (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00)))
49 (add-byte (ldb (byte 8 0) high))
50 (add-byte (ldb (byte 8 8) high))
51 (add-byte (ldb (byte 8 0) low))
52 (add-byte (ldb (byte 8 8) low)))))))))
54 (declaim (inline char->utf-16be))
55 (defun char->utf-16be (char dest string pos)
56 (declare (optimize speed (safety 0))
57 (type (array (unsigned-byte 8) (*)) dest))
58 (let ((code (char-code char)))
59 (if (utf-noncharacter-code-p code)
60 (let ((replacement (encoding-error :utf-16be string pos)))
61 (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
62 (dotimes (i (length replacement))
63 (vector-push-extend (aref replacement i) dest)))
65 (declare (type (unsigned-byte 8) b))
66 (vector-push-extend b dest)))
67 (declare (inline add-byte))
70 (add-byte (ldb (byte 8 8) code))
71 (add-byte (ldb (byte 8 0) code)))
73 (let* ((codeoid (- code #x10000))
74 (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800))
75 (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00)))
76 (add-byte (ldb (byte 8 8) high))
77 (add-byte (ldb (byte 8 0) high))
78 (add-byte (ldb (byte 8 8) low))
79 (add-byte (ldb (byte 8 0) low)))))))))
81 (defun string->utf-16le (string sstart send additional-space)
82 (declare (optimize speed (safety 0))
83 (type simple-string string)
84 (type array-range sstart send additional-space))
85 (let ((array (make-array (* 2 (+ additional-space (- send sstart)))
86 :element-type '(unsigned-byte 8)
87 :fill-pointer 0 :adjustable t)))
88 (loop for i from sstart below send
89 do (char->utf-16le (char string i) array string i))
90 (dotimes (i (* 2 additional-space))
91 (vector-push-extend 0 array))
92 (coerce array '(simple-array (unsigned-byte 8) (*)))))
94 (defun string->utf-16be (string sstart send additional-space)
95 (declare (optimize speed (safety 0))
96 (type simple-string string)
97 (type array-range sstart send additional-space))
98 (let ((array (make-array (* 2 (+ additional-space (- send sstart)))
99 :element-type '(unsigned-byte 8)
100 :fill-pointer 0 :adjustable t)))
101 (loop for i from sstart below send
102 do (char->utf-16be (char string i) array string i))
103 (dotimes (i (* 2 additional-space))
104 (vector-push-extend 0 array))
105 (coerce array '(simple-array (unsigned-byte 8) (*)))))
107 ;; Conversion from UTF-16{LE,BE}
108 (defmacro define-bytes-per-utf16-character (accessor type)
109 (declare (ignore type))
110 (let ((name-le (make-od-name 'bytes-per-utf-16le-character accessor))
111 (name-be (make-od-name 'bytes-per-utf-16be-character accessor)))
113 (defun ,name-le (array pos end)
114 (let ((remaining (- end pos)))
115 (when (< remaining 2)
116 (return-from ,name-le (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))))
117 (let ((low (dpb (,accessor array (+ pos 1)) (byte 8 8) (,accessor array pos))))
118 (if (<= #xd800 low #xdbff)
120 (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))
121 (let ((high (dpb (,accessor array (+ pos 3)) (byte 8 8) (,accessor array (+ pos 2)))))
122 (if (<= #xdc00 high #xdfff)
123 (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high))))
124 (if (= (logand code #xfffe) #xfffe)
125 (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos))
127 (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)))))
128 (if (or (<= #xdc00 low #xdfff)
129 (<= #xfdd0 low #xfdef)
130 (= (logand low #xfffe) #xfffe))
131 (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))
133 (defun ,name-be (array pos end)
134 (let ((remaining (- end pos)))
135 (when (< remaining 2)
136 (return-from ,name-be (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))))
137 (let ((low (dpb (,accessor array pos) (byte 8 8) (,accessor array (+ pos 1)))))
138 (if (<= #xd800 low #xdbff)
140 (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))
141 (let ((high (dpb (,accessor array (+ pos 2)) (byte 8 8) (,accessor array (+ pos 3)))))
142 (if (<= #xdc00 high #xdfff)
143 (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high))))
144 (if (= (logand code #xfffe) #xfffe)
145 (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos))
147 (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)))))
148 (if (or (<= #xdc00 low #xdfff)
149 (<= #xfdd0 low #xfdef)
150 (= (logand low #xfffe) #xfffe))
151 (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))
152 (values 2 nil)))))))))
153 (instantiate-octets-definition define-bytes-per-utf16-character)
155 (defmacro define-simple-get-utf16-character (accessor type)
156 (let ((name-le (make-od-name 'simple-get-utf-16le-char accessor))
157 (name-be (make-od-name 'simple-get-utf-16be-char accessor)))
159 (defun ,name-le (array pos bytes)
160 (declare (optimize speed (safety 0))
162 (type array-range pos)
163 (type (integer 1 4) bytes)
165 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that
166 ;; reads two bytes at once on some architectures.
167 (let ((code ,(if (and (eq accessor 'sap-ref-8)
168 (eq type 'system-area-pointer))
169 '(sap-ref-16le array pos)
171 (,accessor array (the array-range (+ pos x)))))
172 (declare (inline cref))
173 (dpb (cref 1) (byte 8 8) (cref 0))))))
174 (if (<= #xd800 code #xdbff)
175 (let ((next ,(if (and (eq accessor 'sap-ref-8)
176 (eq type 'system-area-pointer))
177 '(sap-ref-16le array (+ pos 2))
179 (,accessor array (the array-range (+ pos x)))))
180 (declare (inline cref))
181 (dpb (cref 3) (byte 8 8) (cref 2))))))
182 (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next)))))
184 (defun ,name-be (array pos bytes)
185 (declare (optimize speed (safety 0))
187 (type array-range pos)
188 (type (integer 1 4) bytes)
190 ;; Use SAP-REF-16BE even if it is not optimized
191 (let ((code ,(if (and (eq accessor 'sap-ref-8)
192 (eq type 'system-area-pointer))
193 '(sap-ref-16be array pos)
195 (,accessor array (the array-range (+ pos x)))))
196 (declare (inline cref))
197 (dpb (cref 0) (byte 8 8) (cref 1))))))
198 (if (<= #xd800 code #xdbff)
199 (let ((next ,(if (and (eq accessor 'sap-ref-8)
200 (eq type 'system-area-pointer))
201 '(sap-ref-16be array (+ pos 2))
203 (,accessor array (the array-range (+ pos x)))))
204 (declare (inline cref))
205 (dpb (cref 2) (byte 8 8) (cref 3))))))
206 (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next)))))
207 (code-char code)))))))
209 (instantiate-octets-definition define-simple-get-utf16-character)
211 (defmacro define-utf-16->string (accessor type)
212 (let ((name-le (make-od-name 'utf-16le->string accessor))
213 (name-be (make-od-name 'utf-16be->string accessor)))
215 (defun ,name-le (array astart aend)
216 (declare (optimize speed (safety 0))
218 (type array-range astart aend))
219 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
220 (loop with pos = astart
222 do (multiple-value-bind (bytes invalid)
223 (,(make-od-name 'bytes-per-utf-16le-character accessor) array pos aend)
224 (declare (type (or null string) invalid))
228 (,(make-od-name 'simple-get-utf-16le-char accessor)
231 (t (dotimes (i (length invalid))
232 (vector-push-extend (char invalid i) string))))
235 (defun ,name-be (array astart aend)
236 (declare (optimize speed (safety 0))
238 (type array-range astart aend))
239 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
240 (loop with pos = astart
242 do (multiple-value-bind (bytes invalid)
243 (,(make-od-name 'bytes-per-utf-16be-character accessor) array pos aend)
244 (declare (type (or null string) invalid))
248 (,(make-od-name 'simple-get-utf-16be-char accessor)
251 (t (dotimes (i (length invalid))
252 (vector-push-extend (char invalid i) string))))
256 (instantiate-octets-definition define-utf-16->string)
258 (define-external-format/variable-width (:utf-16le :utf16le) t
260 (let ((bits (char-code byte)))
261 (if (< bits #x10000) 2 4))
264 (if (utf-noncharacter-code-p bits)
265 (external-format-encoding-error stream bits)
266 (setf (sap-ref-16le sap tail) bits)))
267 (t (if (= (logand bits #xfffe) #xfffe)
268 (external-format-encoding-error stream bits)
269 (let* ((new-bits (- bits #x10000))
270 (high (ldb (byte 10 10) new-bits))
271 (low (ldb (byte 10 0) new-bits)))
272 (setf (sap-ref-16le sap tail) (dpb high (byte 10 0) #xd800))
273 (setf (sap-ref-16le sap (+ tail 2)) (dpb low (byte 10 0) #xdc00))))))
274 (2 (if (<= #xd800 (sap-ref-16le sap head) #xdbff) 4 2))
275 (let ((bits (sap-ref-16le sap head)))
277 ((or (<= #xdc00 bits #xdfff)
278 (<= #xfdd0 bits #xfdef)
279 (= (logand bits #xfffe) #xfffe))
280 (return-from decode-break-reason 2))
281 ((<= #xd800 bits #xdbff)
282 (let ((next (sap-ref-16le sap (+ head 2))))
283 (unless (<= #xdc00 next #xdfff)
284 (return-from decode-break-reason 2))
285 (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next))))
286 (if (= (logand code #xfffe) #xfffe)
287 (return-from decode-break-reason 4)
288 (code-char (+ #x10000 code))))))
289 (t (code-char bits))))
290 utf-16le->string-aref
293 (define-external-format/variable-width (:utf-16be :utf16be) t
295 (let ((bits (char-code byte)))
296 (if (< bits #x10000) 2 4))
299 (if (utf-noncharacter-code-p bits)
300 (external-format-encoding-error stream bits)
301 (setf (sap-ref-16be sap tail) bits)))
302 (t (if (= (logand bits #xfffe) #xfffe)
303 (external-format-encoding-error stream bits)
304 (let* ((new-bits (- bits #x10000))
305 (high (ldb (byte 10 10) new-bits))
306 (low (ldb (byte 10 0) new-bits)))
307 (setf (sap-ref-16be sap tail) (dpb high (byte 10 0) #xd800))
308 (setf (sap-ref-16be sap (+ tail 2)) (dpb low (byte 10 0) #xdc00))))))
309 (2 (if (<= #xd800 (sap-ref-16be sap head) #xdbff) 4 2))
310 (let ((bits (sap-ref-16be sap head)))
312 ((or (<= #xdc00 bits #xdfff)
313 (<= #xfdd0 bits #xfdef)
314 (= (logand bits #xfffe) #xfffe))
315 (return-from decode-break-reason 2))
316 ((<= #xd800 bits #xdbff)
317 (let ((next (sap-ref-16be sap (+ head 2))))
318 (unless (<= #xdc00 next #xdfff)
319 (return-from decode-break-reason 2))
320 (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next))))
321 (if (= (logand code #xfffe) #xfffe)
322 (return-from decode-break-reason 4)
323 (code-char (+ #x10000 code))))))
324 (t (code-char bits))))
325 utf-16be->string-aref
328 (declaim (inline char->utf-32le))
329 (defun char->utf-32le (char dest string pos)
330 (declare (optimize speed (safety 0))
331 (type (array (unsigned-byte 8) (*)) dest))
332 (let ((code (char-code char)))
333 (if (utf-noncharacter-code-p code)
334 (let ((replacement (encoding-error :utf-32le string pos)))
335 (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
336 (dotimes (i (length replacement))
337 (vector-push-extend (aref replacement i) dest)))
339 (declare (type (unsigned-byte 8) b))
340 (vector-push-extend b dest)))
341 (declare (inline add-byte))
342 (add-byte (ldb (byte 8 0) code))
343 (add-byte (ldb (byte 8 8) code))
344 (add-byte (ldb (byte 8 16) code))
345 (add-byte (ldb (byte 8 24) code))))))
347 (declaim (inline char->utf-32be))
348 (defun char->utf-32be (char dest string pos)
349 (declare (optimize speed (safety 0))
350 (type (array (unsigned-byte 8) (*)) dest))
351 (let ((code (char-code char)))
352 (if (utf-noncharacter-code-p code)
353 (let ((replacement (encoding-error :utf-32be string pos)))
354 (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
355 (dotimes (i (length replacement))
356 (vector-push-extend (aref replacement i) dest)))
358 (declare (type (unsigned-byte 8) b))
359 (vector-push-extend b dest)))
360 (declare (inline add-byte))
361 (add-byte (ldb (byte 8 24) code))
362 (add-byte (ldb (byte 8 16) code))
363 (add-byte (ldb (byte 8 8) code))
364 (add-byte (ldb (byte 8 0) code))))))
366 (defun string->utf-32le (string sstart send additional-space)
367 (declare (optimize speed (safety 0))
368 (type simple-string string)
369 (type array-range sstart send additional-space))
370 (let ((array (make-array (* 4 (+ additional-space (- send sstart)))
371 :element-type '(unsigned-byte 8)
372 :fill-pointer 0 :adjustable t)))
373 (loop for i from sstart below send
374 do (char->utf-32le (char string i) array string i))
375 (dotimes (i (* 4 additional-space))
376 (vector-push-extend 0 array))
377 (coerce array '(simple-array (unsigned-byte 8) (*)))))
379 (defun string->utf-32be (string sstart send additional-space)
380 (declare (optimize speed (safety 0))
381 (type simple-string string)
382 (type array-range sstart send additional-space))
383 (let ((array (make-array (* 4 (+ additional-space (- send sstart)))
384 :element-type '(unsigned-byte 8)
385 :fill-pointer 0 :adjustable t)))
386 (loop for i from sstart below send
387 do (char->utf-32be (char string i) array string i))
388 (dotimes (i (* 4 additional-space))
389 (vector-push-extend 0 array))
390 (coerce array '(simple-array (unsigned-byte 8) (*)))))
392 ;; Conversion from UTF-32{LE,BE}
393 (defmacro define-bytes-per-utf32-character (accessor type)
394 (declare (ignore type))
395 (let ((name-le (make-od-name 'bytes-per-utf-32le-character accessor))
396 (name-be (make-od-name 'bytes-per-utf-32be-character accessor)))
398 (defun ,name-le (array pos end)
399 (declare (ignore array pos end))
401 (defun ,name-be (array pos end)
402 (declare (ignore array pos end))
404 (instantiate-octets-definition define-bytes-per-utf32-character)
406 (defmacro define-simple-get-utf32-character (accessor type)
407 (let ((name-le (make-od-name 'simple-get-utf-32le-char accessor))
408 (name-be (make-od-name 'simple-get-utf-32be-char accessor)))
410 (defun ,name-le (array pos bytes)
411 (declare (optimize speed (safety 0))
413 (type array-range pos)
414 (type (integer 1 4) bytes))
415 ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that
416 ;; reads four bytes at once on some architectures.
417 (let ((code ,(if (and (eq accessor 'sap-ref-8)
418 (eq type 'system-area-pointer))
419 '(sap-ref-32le array pos)
421 (,accessor array (the array-range (+ pos x)))))
422 (declare (inline cref))
423 (dpb (cref 3) (byte 8 24)
424 (dpb (cref 2) (byte 8 16)
425 (dpb (cref 1) (byte 8 8) (cref 0))))))))
426 (if (and (< code char-code-limit)
427 (not (utf-noncharacter-code-p code)))
429 (decoding-error array pos (+ pos bytes) :utf-32le
430 'octet-decoding-error pos))))
431 (defun ,name-be (array pos bytes)
432 (declare (optimize speed (safety 0))
434 (type array-range pos)
435 (type (integer 1 4) bytes))
436 ;; Use SAP-REF-32BE even if it is not optimized
437 (let ((code ,(if (and (eq accessor 'sap-ref-8)
438 (eq type 'system-area-pointer))
439 '(sap-ref-32be array pos)
441 (,accessor array (the array-range (+ pos x)))))
442 (declare (inline cref))
443 (dpb (cref 0) (byte 8 24)
444 (dpb (cref 1) (byte 8 16)
445 (dpb (cref 2) (byte 8 8) (cref 3))))))))
446 (if (and (< code char-code-limit)
447 (not (utf-noncharacter-code-p code)))
449 (decoding-error array pos (+ pos bytes) :utf-32be
450 'octet-decoding-error pos)))))))
452 (instantiate-octets-definition define-simple-get-utf32-character)
454 (defmacro define-utf-32->string (accessor type)
455 (let ((name-le (make-od-name 'utf-32le->string accessor))
456 (name-be (make-od-name 'utf-32be->string accessor)))
458 (defun ,name-le (array astart aend)
459 (declare (optimize speed (safety 0))
461 (type array-range astart aend))
462 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
463 (loop with pos = astart
465 do (multiple-value-bind (bytes invalid)
466 (,(make-od-name 'bytes-per-utf-32le-character accessor) array pos aend)
467 (declare (type (or null string) invalid))
468 (aver (null invalid))
469 (let ((thing (,(make-od-name 'simple-get-utf-32le-char accessor) array pos bytes)))
471 (character (vector-push-extend thing string))
472 (string (dotimes (i (length thing))
473 (vector-push-extend (char thing i) string)))))
476 (defun ,name-be (array astart aend)
477 (declare (optimize speed (safety 0))
479 (type array-range astart aend))
480 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
481 (loop with pos = astart
483 do (multiple-value-bind (bytes invalid)
484 (,(make-od-name 'bytes-per-utf-32be-character accessor) array pos aend)
485 (declare (type (or null string) invalid))
486 (aver (null invalid))
487 (let ((thing (,(make-od-name 'simple-get-utf-32be-char accessor) array pos bytes)))
489 (character (vector-push-extend thing string))
490 (string (dotimes (i (length thing))
491 (vector-push-extend (char thing i) string)))))
495 (instantiate-octets-definition define-utf-32->string)
497 (define-external-format/variable-width (:utf-32le :utf32le) t
500 (if (utf-noncharacter-code-p bits)
501 (external-format-encoding-error stream bits)
502 (setf (sap-ref-32le sap tail) bits))
504 (let ((code (sap-ref-32le sap head)))
505 (if (and (< code char-code-limit)
506 (not (utf-noncharacter-code-p code)))
508 (return-from decode-break-reason 4)))
509 utf-32le->string-aref
512 (define-external-format/variable-width (:utf-32be :utf32be) t
515 (if (utf-noncharacter-code-p bits)
516 (external-format-encoding-error stream bits)
517 (setf (sap-ref-32be sap tail) bits))
519 (let ((code (sap-ref-32be sap head)))
520 (if (and (< code char-code-limit)
521 (not (utf-noncharacter-code-p code)))
523 (return-from decode-break-reason 4)))
524 utf-32be->string-aref