1 ;;;; encodings available regardless of build-time unicode settings
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!IMPL")
17 (declaim (inline code->ascii-mapper))
18 (defun code->ascii-mapper (code)
19 (declare (optimize speed (safety 0))
20 (type char-code code))
25 (declaim (inline get-ascii-bytes))
26 (defun get-ascii-bytes (string pos)
27 (declare (optimize speed (safety 0))
28 (type simple-string string)
29 (type array-range pos))
30 (get-latin-bytes #'code->ascii-mapper :ascii string pos))
32 (defun string->ascii (string sstart send null-padding)
33 (declare (optimize speed (safety 0))
34 (type simple-string string)
35 (type array-range sstart send))
36 (values (string->latin% string sstart send #'get-ascii-bytes null-padding)))
38 (defmacro define-ascii->string (accessor type)
39 (let ((name (make-od-name 'ascii->string accessor)))
41 (defun ,name (array astart aend)
42 (declare (optimize speed)
44 (type array-range astart aend))
45 ;; Since there is such a thing as a malformed ascii byte, a
46 ;; simple "make the string, fill it in" won't do.
47 (let ((string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t)))
48 (loop for apos from astart below aend
49 do (let* ((code (,accessor array apos))
53 (decoding-error array apos (1+ apos) :ascii
54 'malformed-ascii apos))))
55 (if (characterp string-content)
56 (vector-push-extend string-content string)
57 (loop for c across string-content
58 do (vector-push-extend c string))))
59 finally (return (coerce string 'simple-string))))))))
60 (instantiate-octets-definition define-ascii->string)
62 (define-unibyte-external-format :ascii
63 (:us-ascii :ansi_x3.4-1968 :iso-646 :iso-646-us :|646|)
65 (external-format-encoding-error stream bits)
66 (setf (sap-ref-8 sap tail) bits))
68 (return-from decode-break-reason 1)
75 (declaim (inline get-latin1-bytes))
76 (defun get-latin1-bytes (string pos)
77 (declare (optimize speed (safety 0))
78 (type simple-string string)
79 (type array-range pos))
80 (get-latin-bytes #'identity :latin-1 string pos))
82 (defun string->latin1 (string sstart send null-padding)
83 (declare (optimize speed (safety 0))
84 (type simple-string string)
85 (type array-range sstart send))
86 (values (string->latin% string sstart send #'get-latin1-bytes null-padding)))
88 (defmacro define-latin1->string* (accessor type)
89 (declare (ignore type))
90 (let ((name (make-od-name 'latin1->string* accessor)))
92 (defun ,name (string sstart send array astart aend)
93 (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity)))))
94 (instantiate-octets-definition define-latin1->string*)
96 (defmacro define-latin1->string (accessor type)
97 (declare (ignore type))
98 `(defun ,(make-od-name 'latin1->string accessor) (array astart aend)
99 (,(make-od-name 'latin->string accessor) array astart aend #'identity)))
100 (instantiate-octets-definition define-latin1->string)
102 ;;; Multiple names for the :ISO{,-}8859-* families are needed because on
103 ;;; FreeBSD (and maybe other BSD systems), nl_langinfo("LATIN-1") will
104 ;;; return "ISO8859-1" instead of "ISO-8859-1".
105 (define-unibyte-external-format :latin-1 (:latin1 :iso-8859-1 :iso8859-1)
107 (external-format-encoding-error stream bits)
108 (setf (sap-ref-8 sap tail) bits))
118 (declaim (inline char-len-as-utf8))
119 (defun char-len-as-utf8 (code)
120 (declare (optimize speed (safety 0))
121 (type (integer 0 (#.sb!xc:char-code-limit)) code))
122 (cond ((< code 0) (bug "can't happen"))
126 ((< code #x110000) 4)
127 (t (bug "can't happen"))))
129 (defun string->utf8 (string sstart send null-padding)
130 (declare (optimize (speed 3) (safety 0))
131 (type simple-string string)
132 (type (integer 0 1) null-padding)
133 (type array-range sstart send))
134 (macrolet ((ascii-bash ()
135 ;; KLUDGE: this depends on the fact that we know that
136 ;; our arrays are initialized with zeros.
137 '(let ((array (make-array (+ null-padding (- send sstart))
138 :element-type '(unsigned-byte 8))))
140 and j from sstart below send
141 do (setf (aref array i) (char-code (char string j))))
144 `(case (char-len-as-utf8 code)
146 (2 (add-byte (logior #xc0 (ldb (byte 5 6) code)))
147 (add-byte (logior #x80 (ldb (byte 6 0) code))))
148 (3 (when (<= #xd800 code #xdfff)
149 (setf error-position i)
151 (add-byte (logior #xe0 (ldb (byte 4 12) code)))
152 (add-byte (logior #x80 (ldb (byte 6 6) code)))
153 (add-byte (logior #x80 (ldb (byte 6 0) code))))
154 (4 (add-byte (logior #xf0 (ldb (byte 3 18) code)))
155 (add-byte (logior #x80 (ldb (byte 6 12) code)))
156 (add-byte (logior #x80 (ldb (byte 6 6) code)))
157 (add-byte (logior #x80 (ldb (byte 6 0) code)))))))
159 ((simple-array character (*))
160 (let ((utf8-length 0))
161 ;; Since it has to fit in a vector, it must be a fixnum!
162 (declare (type (and unsigned-byte fixnum) utf8-length))
163 (loop for i of-type index from sstart below send
164 do (incf utf8-length (char-len-as-utf8 (char-code (char string i)))))
165 (if (= utf8-length (- send sstart))
167 (let ((array (make-array (+ null-padding utf8-length)
168 :element-type '(unsigned-byte 8)))
172 (declare (type index index))
176 (setf (aref array index) b)
178 (declare (inline add-byte))
179 (loop for i of-type index from sstart below send
180 for code = (char-code (char string i))
181 do (output-code :first-error)
182 finally (return-from string->utf8 array)))
184 (setf new-array (make-array (* index 2) :adjustable t
185 :element-type '(unsigned-byte 8)
186 :fill-pointer index))
187 (replace new-array array)
189 (let ((replacement (encoding-error :utf-8 string index)))
190 (flet ((add-byte (b) (vector-push-extend b new-array)))
191 (dotimes (i (length replacement))
192 (add-byte (aref replacement i)))
193 (loop for i of-type index from (1+ error-position) below send
194 for code = (char-code (char string i))
195 do (output-code :error)
196 finally (return-from string->utf8
198 (unless (zerop null-padding)
199 (vector-push-extend 0 new-array))
200 (copy-seq new-array)))))))))))
202 ((simple-array base-char (*))
203 ;; On unicode builds BASE-STRINGs are limited to ASCII range,
204 ;; so we can take a fast path -- and get benefit of the element
205 ;; type information. On non-unicode build BASE-CHAR ==
206 ;; CHARACTER, handled above.
208 ((simple-array nil (*))
210 (make-array null-padding :element-type '(unsigned-byte 8))
211 ;; Just get the error...
212 (aref string sstart))))))
216 (defmacro define-bytes-per-utf8-character (accessor type)
217 (let ((name (make-od-name 'bytes-per-utf8-character accessor)))
219 ;;(declaim (inline ,name))
221 (string->utf8 (string (code-char ,(1- sb!xc:char-code-limit)))
223 (declare (type (simple-array (unsigned-byte 8) (#!+sb-unicode 4 #!-sb-unicode 2)) lexically-max))
224 (defun ,name (array pos end)
225 (declare (optimize speed (safety 0))
227 (type array-range pos end))
228 ;; returns the number of bytes consumed and nil if it's a
229 ;; valid character or the number of bytes consumed and a
230 ;; replacement string if it's not.
231 (let ((initial-byte (,accessor array pos))
233 (reject-position pos)
234 (remaining-bytes (- end pos)))
235 (declare (type array-range reject-position remaining-bytes))
236 (labels ((valid-utf8-starter-byte-p (b)
237 (declare (type (unsigned-byte 8) b))
239 ((zerop (logand b #b10000000)) 1)
240 ((and (= (logand b #b11100000) #b11000000)
242 ((= (logand b #b11110000) #b11100000) 3)
243 ((and (= (logand b #b11111000) #b11110000)
247 (setf reject-reason 'invalid-utf8-starter-byte))
249 (enough-bytes-left-p (x)
250 (let ((ok (> end (+ pos (1- x)))))
252 (setf reject-reason 'end-of-input-in-character))
254 (valid-secondary-p (x)
255 (let* ((idx (the array-range (+ pos x)))
256 (b (,accessor array idx))
257 (ok (= (logand b #b11000000) #b10000000)))
258 (when (and ok (= x 1))
267 (setf reject-reason 'invalid-utf8-continuation-byte)
268 (setf reject-position idx))
270 (preliminary-ok-for-length (maybe-len len)
271 (and (eql maybe-len len)
272 ;; Has to be done in this order so that
273 ;; certain broken sequences (e.g., the
274 ;; two-byte sequence `"initial (length 3)"
275 ;; "non-continuation"' -- `#xef #x32')
276 ;; signal only part of that sequence as
278 (loop for i from 1 below (min len remaining-bytes)
279 always (valid-secondary-p i))
280 (enough-bytes-left-p len)))
281 (character-below-char-code-limit-p ()
282 ;; This is only called on a four-byte sequence
283 ;; (two in non-unicode builds) to ensure we
284 ;; don't go over SBCL's character limts.
285 (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos))
287 ((> (aref lexically-max 0) (,accessor array pos))
289 ((< (aref lexically-max 1) (,accessor array (+ pos 1)))
292 ((> (aref lexically-max 1) (,accessor array (+ pos 1)))
295 ((< (aref lexically-max 2) (,accessor array (+ pos 2)))
298 ((> (aref lexically-max 2) (,accessor array (+ pos 2)))
301 ((< (aref lexically-max 3) (,accessor array (+ pos 3)))
305 (setf reject-reason 'character-out-of-range))
307 (declare (inline valid-utf8-starter-byte-p
310 preliminary-ok-for-length))
311 (let ((maybe-len (valid-utf8-starter-byte-p initial-byte)))
312 (cond ((eql maybe-len 1)
314 ((and (preliminary-ok-for-length maybe-len 2)
315 #!-sb-unicode (character-below-char-code-limit-p))
317 ((and (preliminary-ok-for-length maybe-len 3)
318 #!-sb-unicode (not (setf reject-reason 'character-out-of-range)))
320 ((and (preliminary-ok-for-length maybe-len 4)
321 #!-sb-unicode (not (setf reject-reason 'character-out-of-range))
322 (character-below-char-code-limit-p))
327 (invalid-utf8-starter-byte (1+ pos))
328 (end-of-input-in-character end)
329 (invalid-utf8-continuation-byte reject-position)
330 (character-out-of-range (+ pos maybe-len))))
331 (bad-len (- bad-end pos)))
332 (declare (type array-range bad-end bad-len))
333 (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position)))
334 (values bad-len replacement)))))))))))))
335 (instantiate-octets-definition define-bytes-per-utf8-character)
337 (defmacro define-simple-get-utf8-char (accessor type)
338 (let ((name (make-od-name 'simple-get-utf8-char accessor)))
340 (declaim (inline ,name))
341 (defun ,name (array pos bytes)
342 (declare (optimize speed (safety 0))
344 (type array-range pos)
345 (type (integer 1 4) bytes))
347 (,accessor array (the array-range (+ pos x)))))
348 (declare (inline cref))
349 (code-char (ecase bytes
351 (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6)
352 (ldb (byte 6 0) (cref 1))))
353 (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12)
354 (ash (ldb (byte 6 0) (cref 1)) 6)
355 (ldb (byte 6 0) (cref 2))))
356 (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18)
357 (ash (ldb (byte 6 0) (cref 1)) 12)
358 (ash (ldb (byte 6 0) (cref 2)) 6)
359 (ldb (byte 6 0) (cref 3)))))))))))
360 (instantiate-octets-definition define-simple-get-utf8-char)
362 (defmacro define-utf8->string (accessor type)
363 (let ((name (make-od-name 'utf8->string accessor)))
365 (defun ,name (array astart aend)
366 (declare (optimize speed (safety 0))
368 (type array-range astart aend))
369 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
370 (loop with pos = astart
372 do (multiple-value-bind (bytes invalid)
373 (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend)
374 (declare (type (or null string) invalid))
377 (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string))
379 (dotimes (i (length invalid))
380 (vector-push-extend (char invalid i) string))))
382 (coerce string 'simple-string))))))
383 (instantiate-octets-definition define-utf8->string)
385 (define-external-format/variable-width (:utf-8 :utf8) t
386 #!+sb-unicode (code-char #xfffd) #!-sb-unicode #\?
387 (let ((bits (char-code byte)))
388 (cond ((< bits #x80) 1)
393 (1 (setf (sap-ref-8 sap tail) bits))
394 (2 (setf (sap-ref-8 sap tail) (logior #xc0 (ldb (byte 5 6) bits))
395 (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits))))
396 (3 (when (<= #xd800 bits #xdfff)
397 (external-format-encoding-error stream bits))
398 (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits))
399 (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits))
400 (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits))))
401 (4 (setf (sap-ref-8 sap tail) (logior #xf0 (ldb (byte 3 18) bits))
402 (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 12) bits))
403 (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 6) bits))
404 (sap-ref-8 sap (+ 3 tail)) (logior #x80 (ldb (byte 6 0) bits)))))
405 (1 (cond ((< byte #x80) 1)
406 ((< byte #xc2) (return-from decode-break-reason 1))
410 (code-char (ecase size
412 (2 (let ((byte2 (sap-ref-8 sap (1+ head))))
413 (unless (<= #x80 byte2 #xbf)
414 (return-from decode-break-reason 2))
415 (dpb byte (byte 5 6) byte2)))
416 (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
417 (byte3 (sap-ref-8 sap (+ 2 head))))
418 (unless (and (<= #x80 byte2 #xbf)
420 (or (/= byte #xe0) (<= #xa0 byte2 #xbf))
421 (or (/= byte #xed) (<= #x80 byte2 #x9f)))
422 (return-from decode-break-reason 3))
423 (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3))))
424 (4 (let ((byte2 (sap-ref-8 sap (1+ head)))
425 (byte3 (sap-ref-8 sap (+ 2 head)))
426 (byte4 (sap-ref-8 sap (+ 3 head))))
427 (unless (and (<= #x80 byte2 #xbf)
430 (or (/= byte #xf0) (<= #x90 byte2 #xbf))
431 (or (/= byte #xf4) (<= #x80 byte2 #x8f)))
432 (return-from decode-break-reason 4))
433 (dpb byte (byte 3 18)
434 (dpb byte2 (byte 6 12)
435 (dpb byte3 (byte 6 6) byte4)))))))