1 ;;;; encodings available regardless of build-time unicode settings
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!IMPL")
17 (declaim (inline code->ascii-mapper))
18 (defun code->ascii-mapper (code)
19 (declare (optimize speed (safety 0))
20 (type char-code code))
25 (declaim (inline get-ascii-bytes))
26 (defun get-ascii-bytes (string pos)
27 (declare (optimize speed (safety 0))
28 (type simple-string string)
29 (type array-range pos))
30 (get-latin-bytes #'code->ascii-mapper :ascii string pos))
32 (defun string->ascii (string sstart send null-padding)
33 (declare (optimize speed (safety 0))
34 (type simple-string string)
35 (type array-range sstart send))
36 (values (string->latin% string sstart send #'get-ascii-bytes null-padding)))
38 (defmacro define-ascii->string (accessor type)
39 (let ((name (make-od-name 'ascii->string accessor)))
41 (defun ,name (array astart aend)
42 (declare (optimize speed)
44 (type array-range astart aend))
45 ;; Since there is such a thing as a malformed ascii byte, a
46 ;; simple "make the string, fill it in" won't do.
47 (let ((string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t)))
48 (loop for apos from astart below aend
49 do (let* ((code (,accessor array apos))
53 (decoding-error array apos (1+ apos) :ascii
54 'malformed-ascii apos))))
55 (if (characterp string-content)
56 (vector-push-extend string-content string)
57 (loop for c across string-content
58 do (vector-push-extend c string))))
59 finally (return (coerce string 'simple-string))))))))
60 (instantiate-octets-definition define-ascii->string)
62 (define-external-format (:ascii :us-ascii :ansi_x3.4-1968
63 :iso-646 :iso-646-us :|646|)
66 (external-format-encoding-error stream bits)
67 (setf (sap-ref-8 sap tail) bits))
74 (declaim (inline get-latin1-bytes))
75 (defun get-latin1-bytes (string pos)
76 (declare (optimize speed (safety 0))
77 (type simple-string string)
78 (type array-range pos))
79 (get-latin-bytes #'identity :latin-1 string pos))
81 (defun string->latin1 (string sstart send null-padding)
82 (declare (optimize speed (safety 0))
83 (type simple-string string)
84 (type array-range sstart send))
85 (values (string->latin% string sstart send #'get-latin1-bytes null-padding)))
87 (defmacro define-latin1->string* (accessor type)
88 (declare (ignore type))
89 (let ((name (make-od-name 'latin1->string* accessor)))
91 (defun ,name (string sstart send array astart aend)
92 (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity)))))
93 (instantiate-octets-definition define-latin1->string*)
95 (defmacro define-latin1->string (accessor type)
96 (declare (ignore type))
97 `(defun ,(make-od-name 'latin1->string accessor) (array astart aend)
98 (,(make-od-name 'latin->string accessor) array astart aend #'identity)))
99 (instantiate-octets-definition define-latin1->string)
101 ;;; Multiple names for the :ISO{,-}8859-* families are needed because on
102 ;;; FreeBSD (and maybe other BSD systems), nl_langinfo("LATIN-1") will
103 ;;; return "ISO8859-1" instead of "ISO-8859-1".
104 (define-external-format (:latin-1 :latin1 :iso-8859-1 :iso8859-1)
107 (external-format-encoding-error stream bits)
108 (setf (sap-ref-8 sap tail) bits))
118 (declaim (inline char-len-as-utf8))
119 (defun char-len-as-utf8 (code)
120 (declare (optimize speed (safety 0))
121 (type (integer 0 (#.sb!xc:char-code-limit)) code))
122 (cond ((< code 0) (bug "can't happen"))
126 ((< code #x110000) 4)
127 (t (bug "can't happen"))))
129 (defun string->utf8 (string sstart send null-padding)
130 (declare (optimize (speed 3) (safety 0))
131 (type simple-string string)
132 (type (integer 0 1) null-padding)
133 (type array-range sstart send))
134 (macrolet ((ascii-bash ()
135 '(let ((array (make-array (+ null-padding (- send sstart))
136 :element-type '(unsigned-byte 8))))
138 and j from sstart below send
139 do (setf (aref array i) (char-code (char string j))))
142 ((simple-array character (*))
143 (let ((utf8-length 0))
144 ;; Since it has to fit in a vector, it must be a fixnum!
145 (declare (type (and unsigned-byte fixnum) utf8-length))
146 (loop for i of-type index from sstart below send
147 do (incf utf8-length (char-len-as-utf8 (char-code (char string i)))))
148 (if (= utf8-length (- send sstart))
150 (let ((array (make-array (+ null-padding utf8-length)
151 :element-type '(unsigned-byte 8)))
153 (declare (type index index))
155 (setf (aref array index) b)
157 (declare (inline add-byte))
158 (loop for i of-type index from sstart below send
159 do (let ((code (char-code (char string i))))
160 (case (char-len-as-utf8 code)
164 (add-byte (logior #b11000000 (ldb (byte 5 6) code)))
165 (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
167 (add-byte (logior #b11100000 (ldb (byte 4 12) code)))
168 (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
169 (add-byte (logior #b10000000 (ldb (byte 6 0) code))))
171 (add-byte (logior #b11110000 (ldb (byte 3 18) code)))
172 (add-byte (logior #b10000000 (ldb (byte 6 12) code)))
173 (add-byte (logior #b10000000 (ldb (byte 6 6) code)))
174 (add-byte (logior #b10000000 (ldb (byte 6 0) code))))))
175 finally (return array)))))))
177 ((simple-array base-char (*))
178 ;; On unicode builds BASE-STRINGs are limited to ASCII range,
179 ;; so we can take a fast path -- and get benefit of the element
180 ;; type information. On non-unicode build BASE-CHAR ==
183 ((simple-array nil (*))
185 (make-array 0 :element-type '(unsigned-byte 8))
186 ;; Just get the error...
187 (aref string sstart))))))
191 (defmacro define-bytes-per-utf8-character (accessor type)
192 (let ((name (make-od-name 'bytes-per-utf8-character accessor)))
194 ;;(declaim (inline ,name))
196 (string->utf8 (string (code-char ,(1- sb!xc:char-code-limit)))
198 (declare (type (simple-array (unsigned-byte 8) (#!+sb-unicode 4 #!-sb-unicode 2)) lexically-max))
199 (defun ,name (array pos end)
200 (declare (optimize speed (safety 0))
202 (type array-range pos end))
203 ;; returns the number of bytes consumed and nil if it's a
204 ;; valid character or the number of bytes consumed and a
205 ;; replacement string if it's not.
206 (let ((initial-byte (,accessor array pos))
208 (reject-position pos)
209 (remaining-bytes (- end pos)))
210 (declare (type array-range reject-position remaining-bytes))
211 (labels ((valid-utf8-starter-byte-p (b)
212 (declare (type (unsigned-byte 8) b))
214 ((zerop (logand b #b10000000)) 1)
215 ((= (logand b #b11100000) #b11000000)
217 ((= (logand b #b11110000) #b11100000)
219 ((= (logand b #b11111000) #b11110000)
221 ((= (logand b #b11111100) #b11111000)
223 ((= (logand b #b11111110) #b11111100)
228 (setf reject-reason 'invalid-utf8-starter-byte))
230 (enough-bytes-left-p (x)
231 (let ((ok (> end (+ pos (1- x)))))
233 (setf reject-reason 'end-of-input-in-character))
235 (valid-secondary-p (x)
236 (let* ((idx (the array-range (+ pos x)))
237 (b (,accessor array idx))
238 (ok (= (logand b #b11000000) #b10000000)))
240 (setf reject-reason 'invalid-utf8-continuation-byte)
241 (setf reject-position idx))
243 (preliminary-ok-for-length (maybe-len len)
244 (and (eql maybe-len len)
245 ;; Has to be done in this order so that
246 ;; certain broken sequences (e.g., the
247 ;; two-byte sequence `"initial (length 3)"
248 ;; "non-continuation"' -- `#xef #x32')
249 ;; signal only part of that sequence as
251 (loop for i from 1 below (min len remaining-bytes)
252 always (valid-secondary-p i))
253 (enough-bytes-left-p len)))
255 (let ((ok (or (/= initial-byte x)
256 (/= (logior (,accessor array (the array-range (+ pos 1)))
260 (setf reject-reason 'overlong-utf8-sequence))
262 (character-below-char-code-limit-p ()
263 ;; This is only called on a four-byte sequence
264 ;; (two in non-unicode builds) to ensure we
265 ;; don't go over SBCL's character limts.
266 (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos))
268 ((> (aref lexically-max 0) (,accessor array pos))
270 ((< (aref lexically-max 1) (,accessor array (+ pos 1)))
273 ((> (aref lexically-max 1) (,accessor array (+ pos 1)))
276 ((< (aref lexically-max 2) (,accessor array (+ pos 2)))
279 ((> (aref lexically-max 2) (,accessor array (+ pos 2)))
282 ((< (aref lexically-max 3) (,accessor array (+ pos 3)))
286 (setf reject-reason 'character-out-of-range))
288 (declare (inline valid-utf8-starter-byte-p
291 preliminary-ok-for-length
293 (let ((maybe-len (valid-utf8-starter-byte-p initial-byte)))
294 (cond ((eql maybe-len 1)
296 ((and (preliminary-ok-for-length maybe-len 2)
297 (overlong-chk #b11000000 #b10111111)
298 (overlong-chk #b11000001 #b10111111)
299 #!-sb-unicode (character-below-char-code-limit-p))
301 ((and (preliminary-ok-for-length maybe-len 3)
302 (overlong-chk #b11100000 #b10011111)
303 #!-sb-unicode (not (setf reject-reason 'character-out-of-range)))
305 ((and (preliminary-ok-for-length maybe-len 4)
306 (overlong-chk #b11110000 #b10001111)
307 #!-sb-unicode (not (setf reject-reason 'character-out-of-range))
308 (character-below-char-code-limit-p))
310 ((and (preliminary-ok-for-length maybe-len 5)
311 (overlong-chk #b11111000 #b10000111)
312 (not (setf reject-reason 'character-out-of-range)))
313 (bug "can't happen"))
314 ((and (preliminary-ok-for-length maybe-len 6)
315 (overlong-chk #b11111100 #b10000011)
316 (not (setf reject-reason 'character-out-of-range)))
317 (bug "can't happen"))
319 (let* ((bad-end (ecase reject-reason
320 (invalid-utf8-starter-byte
322 (end-of-input-in-character
324 (invalid-utf8-continuation-byte
326 ((overlong-utf8-sequence character-out-of-range)
328 (bad-len (- bad-end pos)))
329 (declare (type array-range bad-end bad-len))
330 (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position)))
331 (values bad-len replacement)))))))))))))
332 (instantiate-octets-definition define-bytes-per-utf8-character)
334 (defmacro define-simple-get-utf8-char (accessor type)
335 (let ((name (make-od-name 'simple-get-utf8-char accessor)))
337 (declaim (inline ,name))
338 (defun ,name (array pos bytes)
339 (declare (optimize speed (safety 0))
341 (type array-range pos)
342 (type (integer 1 4) bytes))
344 (,accessor array (the array-range (+ pos x)))))
345 (declare (inline cref))
346 (code-char (ecase bytes
348 (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6)
349 (ldb (byte 6 0) (cref 1))))
350 (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12)
351 (ash (ldb (byte 6 0) (cref 1)) 6)
352 (ldb (byte 6 0) (cref 2))))
353 (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18)
354 (ash (ldb (byte 6 0) (cref 1)) 12)
355 (ash (ldb (byte 6 0) (cref 2)) 6)
356 (ldb (byte 6 0) (cref 3)))))))))))
357 (instantiate-octets-definition define-simple-get-utf8-char)
359 (defmacro define-utf8->string (accessor type)
360 (let ((name (make-od-name 'utf8->string accessor)))
362 (defun ,name (array astart aend)
363 (declare (optimize speed (safety 0))
365 (type array-range astart aend))
366 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
367 (loop with pos = astart
369 do (multiple-value-bind (bytes invalid)
370 (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend)
371 (declare (type (or null string) invalid))
374 (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string))
376 (dotimes (i (length invalid))
377 (vector-push-extend (char invalid i) string))))
379 (coerce string 'simple-string))))))
380 (instantiate-octets-definition define-utf8->string)
382 (define-external-format/variable-width (:utf-8 :utf8) nil
383 (let ((bits (char-code byte)))
384 (cond ((< bits #x80) 1)
389 (1 (setf (sap-ref-8 sap tail) bits))
390 (2 (setf (sap-ref-8 sap tail) (logior #xc0 (ldb (byte 5 6) bits))
391 (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits))))
392 (3 (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits))
393 (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits))
394 (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits))))
395 (4 (setf (sap-ref-8 sap tail) (logior #xf0 (ldb (byte 3 18) bits))
396 (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 12) bits))
397 (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 6) bits))
398 (sap-ref-8 sap (+ 3 tail)) (logior #x80 (ldb (byte 6 0) bits)))))
399 (cond ((< byte #x80) 1)
400 ((< byte #xc2) (return-from decode-break-reason 1))
404 (code-char (ecase size
406 (2 (let ((byte2 (sap-ref-8 sap (1+ head))))
407 (unless (<= #x80 byte2 #xbf)
408 (return-from decode-break-reason 2))
409 (dpb byte (byte 5 6) byte2)))
410 (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
411 (byte3 (sap-ref-8 sap (+ 2 head))))
412 (unless (and (<= #x80 byte2 #xbf)
413 (<= #x80 byte3 #xbf))
414 (return-from decode-break-reason 3))
415 (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3))))
416 (4 (let ((byte2 (sap-ref-8 sap (1+ head)))
417 (byte3 (sap-ref-8 sap (+ 2 head)))
418 (byte4 (sap-ref-8 sap (+ 3 head))))
419 (unless (and (<= #x80 byte2 #xbf)
421 (<= #x80 byte4 #xbf))
422 (return-from decode-break-reason 4))
423 (dpb byte (byte 3 18)
424 (dpb byte2 (byte 6 12)
425 (dpb byte3 (byte 6 6) byte4)))))))