1 ;;;; code for string to octet conversion
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 ;;; FIXME: The latin9 stuff is currently #!+sb-unicode, because I
13 ;;; don't like the idea of trying to do CODE-CHAR #x<big>. Is that a
14 ;;; justified fear? Can we arrange that it's caught and converted to
15 ;;; a decoding error error? Or should we just give up on non-Unicode
18 (in-package "SB!IMPL")
20 ;;; FIXME: don't we have this somewhere else?
21 (deftype array-range ()
22 "A number that can represent an index into a vector, including
24 '(integer 0 #.sb!xc:array-dimension-limit))
28 ;;; encoding condition
30 (define-condition octets-encoding-error (character-encoding-error)
31 ((string :initarg :string :reader octets-encoding-error-string)
32 (position :initarg :position :reader octets-encoding-error-position)
33 (external-format :initarg :external-format
34 :reader octets-encoding-error-external-format))
35 (:report (lambda (c s)
36 (format s "Unable to encode character ~A as ~S."
37 (char-code (char (octets-encoding-error-string c)
38 (octets-encoding-error-position c)))
39 (octets-encoding-error-external-format c)))))
41 (defun read-replacement-character ()
43 "Replacement byte, bytes, character, or string (evaluated): ")
44 (finish-output *query-io*)
45 (list (eval (read *query-io*))))
47 (defun encoding-error (external-format string pos)
49 (error 'octets-encoding-error
50 :external-format external-format
53 (use-value (replacement)
54 :report "Supply a set of bytes to use in place of the invalid one."
55 :interactive read-replacement-character
58 (make-array 1 :element-type '(unsigned-byte 8) :initial-element replacement))
60 (string-to-octets (string replacement)
61 :external-format external-format))
63 (string-to-octets replacement
64 :external-format external-format))
66 (coerce replacement '(simple-array (unsigned-byte 8) (*))))))))
68 ;;; decoding condition
70 ;;; for UTF8, the specific condition signalled will be a generalized
71 ;;; instance of one of the following:
73 ;;; end-of-input-in-character
74 ;;; character-out-of-range
75 ;;; invalid-utf8-starter-byte
76 ;;; invalid-utf8-continuation-byte
77 ;;; overlong-utf8-sequence
79 ;;; Of these, the only one truly likely to be of interest to calling
80 ;;; code is end-of-input-in-character (in which case it's likely to
81 ;;; want to make a note of octet-decoding-error-start, supply "" as a
82 ;;; replacement string, and then move that last chunk of bytes to the
83 ;;; beginning of its buffer for the next go round) but they're all
84 ;;; provided on the off chance they're of interest. The next most
85 ;;; likely interesting option is overlong-utf8-sequence -- the
86 ;;; application, if it cares to, can decode this itself (taking care
87 ;;; to ensure that the result isn't out of range of CHAR-CODE-LIMIT)
88 ;;; and return that result. This library doesn't provide support for
89 ;;; that as a conforming UTF-8-using program is supposed to treat it
92 (define-condition octet-decoding-error (character-decoding-error)
93 ((array :initarg :array :accessor octet-decoding-error-array)
94 (start :initarg :start :accessor octet-decoding-error-start)
95 (end :initarg :end :accessor octet-decoding-error-end)
96 (position :initarg :pos :accessor octet-decoding-bad-byte-position)
97 (external-format :initarg :external-format
98 :accessor octet-decoding-error-external-format))
100 (lambda (condition stream)
101 (format stream "Illegal ~S character starting at byte position ~D."
102 (octet-decoding-error-external-format condition)
103 (octet-decoding-error-start condition)))))
105 (define-condition end-of-input-in-character (octet-decoding-error) ())
106 (define-condition character-out-of-range (octet-decoding-error) ())
107 (define-condition invalid-utf8-starter-byte (octet-decoding-error) ())
108 (define-condition invalid-utf8-continuation-byte (octet-decoding-error) ())
109 (define-condition overlong-utf8-sequence (octet-decoding-error) ())
111 (define-condition malformed-ascii (octet-decoding-error) ())
113 (defun read-replacement-string ()
114 (format *query-io* "Enter a replacement string designator (evaluated): ")
115 (finish-output *query-io*)
116 (list (eval (read *query-io*))))
118 (defun decoding-error (array start end external-format reason pos)
121 :external-format external-format
127 :report "Supply a replacement string designator."
128 :interactive read-replacement-string
131 ;;; overflow on caller-supplied-replacement condition
133 (define-condition octet-buffer-overflow (condition)
134 ((replacement :initarg :replacement :accessor octet-buffer-overflow-replacement)))
137 (with-simple-restart (continue "Keep processing, not invoking outer handlers.")
138 (signal 'octet-buffer-overflow :replacement s)))
140 ;;; Utilities used in both to-string and to-octet conversions
142 (defmacro instantiate-octets-definition (definer)
144 (,definer aref (simple-array (unsigned-byte 8) (*)))
145 (,definer sap-ref-8 system-area-pointer)))
147 ;;; maps into TO-SEQ from elements of FROM-SEQ via MAPPER. MAPPER
148 ;;; returns two values: the number of elments stored in TO-SEQ, and
149 ;;; the number used up from FROM-SEQ. MAPPER is responsible for
150 ;;; getting out if either sequence runs out of room.
151 (declaim (inline varimap))
152 (defun varimap (to-seq to-start to-end from-start from-end mapper)
153 (declare (optimize speed (safety 0))
154 (type array-range to-start to-end from-start from-end)
155 (type function mapper))
156 (loop with from-size of-type array-range = 0
157 and to-size of-type array-range = 0
158 for to-pos of-type array-range = to-start then (+ to-pos to-size)
159 for from-pos of-type array-range = from-start then (+ from-pos from-size)
160 while (and (< to-pos to-end)
161 (< from-pos from-end))
162 do (multiple-value-bind (ts fs) (funcall mapper to-pos from-pos)
165 finally (return (values to-seq to-pos from-pos))))
167 ;;; FIXME: find out why the comment about SYMBOLICATE below is true
168 ;;; and fix it, or else replace with SYMBOLICATE.
170 ;;; FIXME: this is cute, but is going to prevent greps for def.*<name>
171 ;;; from working for (defun ,(make-od-name ...) ...)
172 (eval-when (:compile-toplevel :load-toplevel :execute)
173 (defun make-od-name (sym1 sym2)
174 ;; "MAKE-NAME" is too generic, but this doesn't do quite what
175 ;; SYMBOLICATE does; MAKE-OD-NAME ("octets definition") it is
177 (intern (concatenate 'string (symbol-name sym1) "-" (symbol-name sym2))
178 (symbol-package sym1))))
180 (defmacro define-replace-into-notseq (accessor type)
181 (declare (ignore type))
182 (let ((name (make-od-name 'replace-into-notseq accessor)))
184 (declaim (inline ,name))
185 (defun ,name (dest src dest-start)
186 (declare (optimize speed (safety 0)))
187 ;; Known: all of SRC (which is a SEQ) fits into DEST
189 (loop for srcobj in src
190 for idx of-type array-range from dest-start
191 do (setf (,accessor dest idx) srcobj))
192 (loop for srcidx of-type array-range below (length src)
193 for destidx of-type array-range from dest-start
194 do (setf (,accessor dest destidx) (aref src srcidx))))
196 (instantiate-octets-definition define-replace-into-notseq)
198 (defmacro define-vari-transcode (accessor type)
199 (declare (ignore type))
200 (let ((name (make-od-name 'vari-transcode accessor)))
202 (declaim (inline ,name))
204 (to to-start to-end from from-start from-end replacements getter elementp)
205 (declare (optimize speed (safety 0))
206 (type array-range to-start to-end from-start from-end)
207 (type function getter elementp))
208 ;; convert from FROM to TO via the mapping function GETTER
209 ;; which can return either a single element for TO or a
210 ;; sequence of elments; in the latter case the sequence is
211 ;; taken from the head of the (boxed) list REPLACEMENTS.
212 ;; ELEMENTP tests to see which of the two return types was
214 (let ((replacements-box (if replacements (cons nil replacements) nil)))
215 (declare (dynamic-extent replacements-box))
216 (varimap to to-start to-end
218 (lambda (to-pos from-pos)
219 (multiple-value-bind (element-or-vector used-from)
220 (funcall getter from from-pos from-end replacements-box)
222 ((funcall elementp element-or-vector)
223 (setf (,accessor to to-pos) element-or-vector)
224 (values 1 used-from))
225 ((> (+ to-pos (length element-or-vector)) to-end)
226 (overflow element-or-vector)
227 (return-from ,name (values to to-pos from-pos)))
229 (,(make-od-name 'replace-into-notseq accessor) to element-or-vector to-pos)
230 (values (length element-or-vector) used-from)))))))))))
232 (instantiate-octets-definition define-vari-transcode)
234 ;;;; to-octets conversions
236 ;;; to latin (including ascii)
237 (defmacro define-string->latin*% (accessor type)
238 (let ((name (make-od-name 'string->latin*% accessor)))
240 (declaim (inline ,name))
241 (defun ,name (array astart aend string sstart send replacements get-bytes)
242 (declare (optimize speed (safety 0))
243 (type simple-string string)
245 (,(make-od-name 'vari-transcode accessor)
251 (instantiate-octets-definition define-string->latin*%)
253 (declaim (inline get-ascii-bytes))
254 (defun get-ascii-bytes (string pos end replacements-box)
255 (declare (ignore end))
256 (let ((code (char-code (char string pos))))
259 ((null replacements-box)
260 (encoding-error :ascii string pos))
261 (t (pop (cdr replacements-box))))
264 (declaim (inline get-latin-bytes))
265 (defun get-latin-bytes (mapper external-format string pos end replacements-box)
266 (declare (ignore end))
267 (let ((code (funcall mapper (char-code (char string pos)))))
270 ((null replacements-box)
271 (encoding-error external-format string pos))
273 (pop (cdr replacements-box))))
278 (declaim (inline code->latin9-mapper))
279 (defun code->latin9-mapper (code)
280 (declare (optimize speed (safety 0))
281 (type char-code code))
293 (defmacro define-string->ascii* (accessor type)
294 (let ((name (make-od-name 'string->ascii* accessor)))
295 `(defun ,name (array astart aend string sstart send)
296 (declare (optimize speed (safety 0))
298 (type simple-string string)
299 (type array-range astart aend sstart send))
300 (,(make-od-name 'string->latin*% accessor)
304 #'get-ascii-bytes))))
305 (instantiate-octets-definition define-string->ascii*)
307 (declaim (inline get-latin1-bytes))
308 (defun get-latin1-bytes (string pos end replacements)
309 (declare (optimize speed (safety 0))
310 (type simple-string string)
311 (type array-range pos end))
312 (get-latin-bytes #'identity :latin-1 string pos end replacements))
314 (defmacro define-string->latin1* (accessor type)
315 (let ((name (make-od-name 'string->latin1* accessor)))
316 `(defun ,name (array astart aend string sstart send)
317 (declare (optimize speed (safety 0))
319 (type simple-string string)
320 (type array-range astart aend sstart send))
321 (,(make-od-name 'string->latin*% accessor)
325 #'get-latin1-bytes))))
326 (instantiate-octets-definition define-string->latin1*)
330 (declaim (inline get-latin9-bytes))
331 (defun get-latin9-bytes (string pos end replacements)
332 (declare (optimize speed (safety 0))
333 (type simple-string string)
334 (type array-range pos end))
335 (get-latin-bytes #'code->latin9-mapper :latin-9 string pos end replacements))
337 (defmacro define-string->latin9* (accessor type)
338 (let ((name (make-od-name 'string->latin9* accessor)))
339 `(defun ,name (array astart aend string sstart send)
340 (declare (optimize speed (safety 0))
342 (type simple-string string)
343 (type array-range astart aend sstart send))
344 (,(make-od-name 'string->latin*% accessor)
348 #'get-latin9-bytes))))
349 (instantiate-octets-definition define-string->latin9*))
351 (defun get-latin-length (string start end get-bytes)
352 ;; Returns the length and a list of replacements for bad characters
353 (declare (optimize speed (safety 0))
354 (type simple-string string)
355 (type array-range start end)
356 (type function get-bytes))
358 (replacements-start (cons nil nil))
359 (replacements-end replacements-start))
360 (declare (dynamic-extent replacements-start)
361 (type array-range length))
362 (flet ((collect (replacement)
363 (setf (cdr replacements-end) (cons replacement nil)
364 replacements-end (cdr replacements-end))
366 (loop for src of-type fixnum from start below end
367 do (let ((byte-or-bytes (funcall get-bytes string src end nil)))
368 (declare (type (or (unsigned-byte 8) (simple-array (unsigned-byte 8) (*))) byte-or-bytes))
370 ((numberp byte-or-bytes)
373 (let* ((replacement-len (length byte-or-bytes))
374 (total-length (+ length replacement-len)))
375 (unless (< total-length #.sb!xc:array-dimension-limit)
376 (error "Replacement string too long"))
377 (setf length total-length)
378 (collect byte-or-bytes)))))))
379 (values length (cdr replacements-start))))
381 (declaim (inline string->latin%))
382 (defun string->latin% (string sstart send get-bytes null-padding)
383 (declare (optimize speed); (safety 0))
384 (type simple-string string)
385 (type array-range sstart)
386 (type array-range send)
387 (type function get-bytes))
388 (let ((octets (make-array 0 :adjustable t :fill-pointer 0 :element-type '(unsigned-byte 8))))
389 (loop for pos from sstart below send
390 do (let ((byte-or-bytes (funcall get-bytes string pos send nil)))
391 (declare (type (or (unsigned-byte 8) (simple-array (unsigned-byte 8) (*))) byte-or-bytes))
393 ((numberp byte-or-bytes)
394 (vector-push-extend byte-or-bytes octets))
396 (dotimes (i (length byte-or-bytes))
397 (vector-push-extend (aref byte-or-bytes i) octets))))))
398 (dotimes (i null-padding)
399 (vector-push-extend 0 octets))
400 (coerce octets '(simple-array (unsigned-byte 8) (*)))))
402 (defun string->ascii (string sstart send null-padding)
403 (declare (optimize speed (safety 0))
404 (type simple-string string)
405 (type array-range sstart send))
406 (values (string->latin% string sstart send #'get-ascii-bytes null-padding)))
408 (defun string->latin1 (string sstart send null-padding)
409 (declare (optimize speed (safety 0))
410 (type simple-string string)
411 (type array-range sstart send))
412 (values (string->latin% string sstart send #'get-latin1-bytes null-padding)))
415 (defun string->latin9 (string sstart send null-padding)
416 (declare (optimize speed (safety 0))
417 (type simple-string string)
418 (type array-range sstart send))
419 (values (string->latin% string sstart send #'get-latin9-bytes null-padding)))
423 (declaim (inline char-len-as-utf8))
424 (defun char-len-as-utf8 (c)
425 (declare (optimize speed (safety 0))
427 (let ((code (char-code c)))
428 (cond ((< code 0) (bug "can't happen"))
432 ((< code #x110000) 4)
433 (t (bug "can't happen")))))
435 (defmacro define-char->utf8 (accessor type)
436 (let ((name (make-od-name 'char->utf8 accessor)))
438 ;;(declaim (inline ,name))
439 (defun ,name (char dest destpos maxdest)
440 (declare (optimize speed (safety 0))
442 (type array-range destpos maxdest))
443 ;; stores the character in the array DEST if there's room between
444 ;; DESTPOS and MAXDEST. Returns the number of bytes used on
445 ;; success, or NIL on failure.
446 (let ((code (char-code char)))
447 (flet (((setf cref) (c pos)
448 (setf (,accessor dest (+ pos destpos)) c)))
449 (declare (inline (setf cref)))
450 (ecase (char-len-as-utf8 char)
452 (cond ((>= destpos maxdest)
458 (cond ((>= (+ destpos 1) maxdest)
461 (setf (cref 0) (logior #b11000000 (ldb (byte 5 6) code))
462 (cref 1) (logior #b10000000 (ldb (byte 6 0) code)))
465 (cond ((>= (+ destpos 2) maxdest)
468 (setf (cref 0) (logior #b11100000 (ldb (byte 4 12) code))
469 (cref 1) (logior #b10000000 (ldb (byte 6 6) code))
470 (cref 2) (logior #b10000000 (ldb (byte 6 0) code)))
473 (cond ((>= (+ destpos 3) maxdest)
476 (setf (cref 0) (logior #b11110000 (ldb (byte 3 18) code))
477 (cref 1) (logior #b10000000 (ldb (byte 6 12) code))
478 (cref 2) (logior #b10000000 (ldb (byte 6 6) code))
479 (cref 3) (logior #b10000000 (ldb (byte 6 0) code)))
481 (instantiate-octets-definition define-char->utf8)
483 (defmacro define-string->utf8* (accessor type)
484 (let ((name (make-od-name 'string->utf8* accessor)))
486 (defun ,name (array astart aend string sstart send)
487 (declare (optimize speed (safety 0))
488 (type simple-string string)
490 (type array-range astart aend sstart send))
491 (flet ((convert (spos apos)
492 (let ((char-len (,(make-od-name 'char->utf8 accessor) (char string spos) array apos aend)))
494 (return-from ,name (values array apos spos)))
496 (varimap array astart aend
499 (values (convert spos apos) 1))))))))
500 (instantiate-octets-definition define-string->utf8*)
502 (defun string->utf8 (string sstart send additional-space)
503 (declare (optimize speed (safety 0))
504 (type simple-string string)
505 (type array-range sstart send additional-space))
506 (let ((alen (+ (the (integer 0 #.(* 4 sb!xc:array-dimension-limit))
507 (loop with result of-type array-range = 0
508 for i of-type array-range from sstart below send
509 do (incf result (char-len-as-utf8 (char string i)))
510 finally (return result)))
512 (when (>= alen #.sb!xc:array-dimension-limit)
513 (error "string too long as utf8"))
514 (let ((array (make-array alen :element-type '(unsigned-byte 8))))
515 (when (plusp additional-space)
516 (fill array 0 :start (- alen additional-space)))
517 (values (string->utf8*-aref array 0 alen string sstart send)))))
519 ;;;; to-string conversions
521 ;;; from latin (including ascii)
523 (defmacro define-ascii->string* (accessor type)
524 (let ((name (make-od-name 'ascii->string* accessor)))
526 (declaim (inline ,name))
527 (defun ,name (string sstart send array astart aend)
528 (declare (optimize speed (safety 0))
529 (type simple-string string)
531 (type array-range sstart send astart aend))
532 (varimap string sstart send
535 (setf (char string spos)
536 (let ((code (,accessor array apos)))
539 (decoding-error array astart aend :ascii
540 'malformed-ascii apos))))
542 (instantiate-octets-definition define-ascii->string*)
544 (defmacro define-latin->string* (accessor type)
545 (let ((name (make-od-name 'latin->string* accessor)))
547 (declaim (inline ,name))
548 (defun ,name (string sstart send array astart aend mapper)
549 (declare (optimize speed (safety 0))
550 (type simple-string string)
552 (type array-range sstart send astart aend)
554 (varimap string sstart send
557 (setf (char string spos) (code-char (funcall mapper (,accessor array apos))))
559 (instantiate-octets-definition define-latin->string*)
563 (declaim (inline latin9->code-mapper))
564 (defun latin9->code-mapper (byte)
565 (declare (optimize speed (safety 0))
566 (type (unsigned-byte 8) byte))
578 (defmacro define-latin1->string* (accessor type)
579 (declare (ignore type))
580 (let ((name (make-od-name 'latin1->string* accessor)))
582 (defun ,name (string sstart send array astart aend)
583 (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity)))))
584 (instantiate-octets-definition define-latin1->string*)
588 (defmacro define-latin9->string* (accessor type)
589 (declare (ignore type))
590 (let ((name (make-od-name 'latin9->string* accessor)))
592 (defun ,name (string sstart send array astart aend)
593 (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'latin9->code-mapper)))))
594 (instantiate-octets-definition define-latin9->string*))
596 (declaim (inline ascii->string))
597 (defun ascii->string (array astart aend)
598 (declare (optimize speed (safety 0))
599 (type (simple-array (unsigned-byte 8) (*)) array)
600 (type array-range astart aend))
601 (let ((length (the array-range (- aend astart))))
602 (values (ascii->string*-aref (make-string length) 0 length
603 array astart aend))))
605 (declaim (inline latin->string))
606 (defun latin->string (array astart aend mapper)
607 (declare (optimize speed (safety 0))
608 (type (simple-array (unsigned-byte 8) (*)) array)
609 (type array-range astart aend)
610 (type function mapper))
611 (let ((length (the array-range (- aend astart))))
612 (values (latin->string*-aref (make-string length) 0 length
616 (defun latin1->string (array astart aend)
617 (latin->string array astart aend #'identity))
620 (defun latin9->string (array astart aend)
621 (latin->string array astart aend #'latin9->code-mapper))
625 (defmacro define-bytes-per-utf8-character (accessor type)
626 (let ((name (make-od-name 'bytes-per-utf8-character accessor)))
628 ;;(declaim (inline ,name))
630 (string->utf8 (string (code-char (1- #.sb!xc:char-code-limit)))
632 (defun ,name (array pos end replacements-box)
633 (declare (optimize speed (safety 0))
635 (type array-range pos end))
636 ;; returns the number of bytes consumed and nil if it's a
637 ;; valid character or the number of bytes consumed and a
638 ;; replacement string if it's not. If REPLACEMENTS is NIL,
639 ;; signal a condition to get one, otherwise pop it off the
640 ;; cdr of REPLACEMENTS.
641 (let ((initial-byte (,accessor array pos))
642 (reject-reason 'no-error)
643 (reject-position pos))
644 (declare (type array-range reject-position))
645 (labels ((valid-utf8-starter-byte-p (b)
646 (declare (type (unsigned-byte 8) b))
648 ((zerop (logand b #b10000000)) 1)
649 ((= (logand b #b11100000) #b11000000)
651 ((= (logand b #b11110000) #b11100000)
653 ((= (logand b #b11111000) #b11110000)
655 ((= (logand b #b11111100) #b11111000)
657 ((= (logand b #b11111110) #b11111100)
662 (setf reject-reason 'invalid-utf8-starter-byte))
664 (enough-bytes-left-p (x)
665 (let ((ok (> end (+ pos (1- x)))))
667 (setf reject-reason 'end-of-input-in-character))
669 (valid-secondary-byte-p (b)
670 (declare (type (unsigned-byte 8) b))
671 (= (logand b #b11000000) #b10000000))
672 (valid-secondary-p (x)
673 (let* ((b (,accessor array (the array-range (+ pos x))))
674 (ok (valid-secondary-byte-p b)))
676 (setf reject-reason 'invalid-utf8-continuation-byte)
677 (setf reject-position (+ pos x)))
679 (preliminary-ok-for-length (maybe-len len)
680 (and (eql maybe-len len)
681 (enough-bytes-left-p len)
682 (loop for i from 1 below len
683 always (valid-secondary-p i))))
685 (let ((ok (or (/= initial-byte x)
686 (/= (logior (,accessor array (the array-range (+ pos 1)))
690 (setf reject-reason 'overlong-utf8-sequence))
692 (character-below-char-code-limit-p ()
693 ;; This is only called on a four-byte sequence to
694 ;; ensure we don't go over SBCL's character limts.
695 (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos))
697 ((> (aref lexically-max 0) (,accessor array pos))
699 ((< (aref lexically-max 1) (,accessor array (+ pos 1)))
701 ((> (aref lexically-max 1) (,accessor array (+ pos 1)))
703 ((< (aref lexically-max 2) (,accessor array (+ pos 2)))
705 ((> (aref lexically-max 2) (,accessor array (+ pos 2)))
707 ((< (aref lexically-max 3) (,accessor array (+ pos 3)))
711 (setf reject-reason 'character-out-of-range))
713 (declare (inline valid-utf8-starter-byte-p
715 valid-secondary-byte-p
717 preliminary-ok-for-length
719 (let ((maybe-len (valid-utf8-starter-byte-p initial-byte)))
720 (cond ((eql maybe-len 1)
722 ((and (preliminary-ok-for-length maybe-len 2)
723 (overlong-chk #b11000000 #b10111111)
724 (overlong-chk #b11000001 #b10111111))
726 ((and (preliminary-ok-for-length maybe-len 3)
727 (overlong-chk #b11100000 #b10011111))
729 ((and (preliminary-ok-for-length maybe-len 4)
730 (overlong-chk #b11110000 #b10001111)
731 (character-below-char-code-limit-p))
733 ((and (preliminary-ok-for-length maybe-len 5)
734 (overlong-chk #b11111000 #b10000111)
735 (not (setf reject-reason 'character-out-of-range)))
736 (bug "can't happen"))
737 ((and (preliminary-ok-for-length maybe-len 6)
738 (overlong-chk #b11111100 #b10000011)
739 (not (setf reject-reason 'character-out-of-range)))
740 (bug "can't happen"))
742 (let* ((bad-end (ecase reject-reason
743 (invalid-utf8-starter-byte
745 (end-of-input-in-character
747 (invalid-utf8-continuation-byte
749 ((overlong-utf8-sequence character-out-of-range)
751 (bad-len (- bad-end pos)))
752 (declare (type array-range bad-end bad-len))
754 (values bad-len (pop (cdr replacements-box)))
755 (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position)))
756 (values bad-len replacement))))))))))))))
757 (instantiate-octets-definition define-bytes-per-utf8-character)
759 (defmacro define-simple-get-utf8-char (accessor type)
760 (let ((name (make-od-name 'simple-get-utf8-char accessor)))
762 (declaim (inline ,name))
763 (defun ,name (array pos bytes)
764 (declare (optimize speed (safety 0))
766 (type array-range pos)
767 (type (integer 1 4) bytes))
769 (,accessor array (the array-range (+ pos x)))))
770 (declare (inline cref))
771 (code-char (ecase bytes
773 (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6)
774 (ldb (byte 6 0) (cref 1))))
775 (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12)
776 (ash (ldb (byte 6 0) (cref 1)) 6)
777 (ldb (byte 6 0) (cref 2))))
778 (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18)
779 (ash (ldb (byte 6 0) (cref 1)) 12)
780 (ash (ldb (byte 6 0) (cref 2)) 6)
781 (ldb (byte 6 0) (cref 3)))))))))))
782 (instantiate-octets-definition define-simple-get-utf8-char)
784 (defmacro define-get-utf8-character (accessor type)
785 (let ((name (make-od-name 'get-utf8-character accessor)))
787 (declaim (inline ,name))
788 (defun ,name (array pos end replacements)
789 ;; Returns the character (or nil) and the number of bytes consumed
790 (declare (optimize speed (safety 0))
792 (type array-range pos end))
793 (multiple-value-bind (bytes invalid) (,(make-od-name 'bytes-per-utf8-character accessor) array pos end replacements)
795 (values (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes)
797 (values invalid bytes)))))))
798 (instantiate-octets-definition define-get-utf8-character)
800 (defmacro define-utf8->string% (accessor type)
801 (let ((name (make-od-name 'utf8->string% accessor)))
803 (defun ,name (string sstart send array astart aend replacements)
804 (declare (optimize speed (safety 0))
805 (type simple-string string)
807 (type array-range sstart send astart aend))
808 (vari-transcode-aref ; dest is always a string
812 #',(make-od-name 'get-utf8-character accessor)
814 (instantiate-octets-definition define-utf8->string%)
816 (defmacro define-utf8->string* (accessor type)
817 (let ((name (make-od-name 'utf8->string* accessor)))
819 (defun ,name (string sstart send array astart aend)
820 (declare (optimize speed (safety 0))
821 (type simple-string string)
823 (type array-range sstart send astart aend))
824 (,(make-od-name 'utf8->string% accessor) string sstart send array astart aend nil)))))
825 (instantiate-octets-definition define-utf8->string*)
827 (defmacro define-utf8-string-length (accessor type)
828 (let ((name (make-od-name 'utf8-string-length accessor)))
829 `(defun ,name (array start end)
830 ;; Returns the length and a list of replacements for bad characters
831 (declare (optimize speed (safety 0))
833 (type array-range start end))
836 (replacements-start (cons nil nil))
837 (replacements-end replacements-start))
838 (declare (dynamic-extent replacements-start)
839 (type array-range bytes length))
840 (flet ((collect (replacement)
841 (setf (cdr replacements-end) (cons replacement nil)
842 replacements-end (cdr replacements-end))
844 (loop for src = start then (+ src bytes)
846 do (multiple-value-bind (bytes-this-char invalid) (,(make-od-name 'bytes-per-utf8-character accessor) array src end nil)
847 (declare (type (or null string) invalid))
848 (setf bytes bytes-this-char)
849 (let ((new-length (+ length (if invalid
850 (length (collect invalid))
852 (unless (< new-length #.sb!xc:array-dimension-limit)
853 (error "Replacement string too long"))
854 (setf length new-length)))))
855 (values length (cdr replacements-start))))))
856 (instantiate-octets-definition define-utf8-string-length)
858 (defmacro define-utf8->string (accessor type)
859 (let ((name (make-od-name 'utf8->string accessor)))
861 (defun ,name (array astart aend)
862 (declare (optimize speed (safety 0))
864 (type array-range astart aend))
865 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
866 (loop with pos = astart
867 do (multiple-value-bind (bytes invalid) (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend nil)
868 (declare (type (or null string) invalid))
871 (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string))
873 (dotimes (i (length invalid))
874 (vector-push-extend (char invalid i) string))))
877 (coerce string 'simple-string))))))
878 (instantiate-octets-definition define-utf8->string)
880 ;;;; external formats
882 (defun default-external-format ()
883 (intern (or (sb!alien:alien-funcall
884 (extern-alien "nl_langinfo"
885 (function c-string int))
890 (defparameter *external-format-functions*
891 '(((:ascii :us-ascii :ansi_x3.4-1968)
892 ascii->string ascii->string*-aref string->ascii string->ascii*-aref)
893 ((:latin1 :latin-1 :iso-8859-1)
894 latin1->string latin1->string*-aref string->latin1 string->latin1*-aref)
896 ((:latin9 :latin-9 :iso-8859-15)
897 latin9->string latin9->string*-aref string->latin9 string->latin9*-aref)
899 utf8->string-aref utf8->string*-aref string->utf8 string->utf8*-aref)))
901 (defun external-formats-funs (external-format)
902 (when (eql external-format :default)
903 (setf external-format (default-external-format)))
904 (or (cdr (find external-format (the list *external-format-functions*)
907 (error "Unknown external-format ~S" external-format)))
909 ;;;; public interface
911 (defun octets-to-string (vector &key (external-format :default) (start 0) end)
912 (declare (type (vector (unsigned-byte 8)) vector))
913 (with-array-data ((vector vector)
915 (end (%check-vector-sequence-bounds vector start end)))
916 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
917 (funcall (symbol-function (first (external-formats-funs external-format)))
920 (defun octets-to-string* (string vector &key (external-format :default)
921 (start1 0) end1 (start2 0) end2)
922 (declare (type string string)
923 (type (vector (unsigned-byte 8)) vector))
927 (end1 (%check-vector-sequence-bounds string start1 end1)))
928 (declare (type simple-string string))
932 (end2 (%check-vector-sequence-bounds vector start2 end2)))
933 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
934 (funcall (symbol-function (second (external-formats-funs external-format)))
935 string start1 end1 vector start2 end2))))
937 (defun string-to-octets (string &key (external-format :default)
938 (start 0) end null-terminate)
939 (declare (type string string))
940 (with-array-data ((string string)
942 (end (%check-vector-sequence-bounds string start end)))
943 (declare (type simple-string string))
944 (funcall (symbol-function (third (external-formats-funs external-format)))
945 string start end (if null-terminate 1 0))))
947 (defun string-to-octets* (vector string &key (external-format :default)
948 (start1 0) end1 (start2 0) end2)
949 (declare (type (vector (unsigned-byte 8)) vector)
950 (type string string))
954 (end1 (%check-vector-sequence-bounds vector start1 end1)))
955 (declare (type (simple-array (unsigned-byte 8) (*)) vector))
959 (end2 (%check-vector-sequence-bounds string start2 end2)))
960 (declare (type simple-string string))
961 (funcall (symbol-function (fourth (external-formats-funs external-format)))
962 vector start1 end1 string start2 end2))))
965 (defvar +unicode-replacement-character+ (string (code-char #xfffd)))
967 (defun use-unicode-replacement-char (condition)
968 (use-value +unicode-replacement-character+ condition))
970 ;;; Utilities that maybe should be exported
973 (defmacro with-standard-replacement-character (&body body)
974 `(handler-bind ((octet-encoding-error #'use-unicode-replacement-char))
977 (defmacro with-default-decoding-replacement ((c) &body body)
978 (let ((cname (gensym)))
981 ((octet-decoding-error (lambda (c)
982 (use-value ,cname c))))
987 (defmacro show-overflow (&body body)
988 `(handler-bind ((octet-buffer-overflow
990 (format t "Overflowed with ~S~%" (octet-buffer-overflow-replacement c))
994 (defun ub8 (len-or-seq)
995 (if (numberp len-or-seq)
996 (make-array len-or-seq :element-type '(unsigned-byte 8) :initial-element 0)
997 (coerce len-or-seq '(simple-array (unsigned-byte 8) (*)))))
999 (defun ensure-roundtrip-utf8 ()
1000 (let ((string (make-string char-code-limit))
1001 (octets (make-array (* 4 char-code-limit) :element-type '(unsigned-byte 8)))
1002 (string2 (make-string char-code-limit)))
1003 (dotimes (i char-code-limit)
1004 (setf (char string i) (code-char i)))
1005 (multiple-value-bind (_ octets-length used-chars)
1006 (string-to-octets* octets string :external-format :utf8)
1007 (declare (ignore _))
1008 (assert (= used-chars (length string)))
1009 (multiple-value-bind (_ string-length used-octets)
1010 (octets-to-string* string2 octets :external-format :utf8 :end2 octets-length)
1011 (declare (ignore _))
1012 (assert (= used-octets octets-length))
1013 (assert (= string-length (length string)))
1014 (assert (string= string string2)))))
1017 (defun ensure-roundtrip-utf8-2 ()
1018 (let ((string (make-string char-code-limit)))
1019 (dotimes (i char-code-limit)
1020 (setf (char string i) (code-char i)))
1022 (octets-to-string (string-to-octets string :external-format :utf8)
1023 :external-format :utf8)))
1024 (assert (= (length string2) (length string)))
1025 (assert (string= string string2))))
1028 (defun ensure-roundtrip-latin (format)
1029 (let ((octets (ub8 256))
1030 (string (make-string 256))
1031 (octets2 (ub8 256)))
1033 (setf (aref octets i) i))
1034 (multiple-value-bind (_ string-length octets-used)
1035 (octets-to-string* string octets :external-format format)
1036 (declare (ignore _))
1037 (assert (= string-length 256))
1038 (assert (= octets-used 256)))
1039 (multiple-value-bind (_ octet-length chars-used)
1040 (string-to-octets* octets2 string :external-format format)
1041 (declare (ignore _))
1042 (assert (= octet-length 256))
1043 (assert (= chars-used 256)))
1044 (assert (every #'= octets octets2)))
1047 (defun ensure-roundtrip-latin-2 (format)
1048 (let ((octets (ub8 256)))
1050 (setf (aref octets i) i))
1051 (let* ((str (octets-to-string octets :external-format format))
1052 (oct2 (string-to-octets str :external-format format)))
1053 (assert (= (length octets) (length oct2)))
1054 (assert (every #'= octets oct2))))
1057 (defun ensure-roundtrip-latin1 ()
1058 (ensure-roundtrip-latin :latin1))
1060 (defun ensure-roundtrip-latin9 ()
1061 (ensure-roundtrip-latin :latin9))
1063 (defun ensure-roundtrip-latin1-2 ()
1064 (ensure-roundtrip-latin-2 :latin1))
1066 (defun ensure-roundtrip-latin9-2 ()
1067 (ensure-roundtrip-latin-2 :latin9))
1069 (defmacro i&c (form)
1070 `(handler-case ,form
1072 (format *trace-output* "~S: ~A~%" ',form c))))
1074 (defun test-octets ()
1075 (i&c (ensure-roundtrip-utf8))
1076 (i&c (ensure-roundtrip-utf8-2))
1077 (i&c (ensure-roundtrip-latin1))
1078 (i&c (ensure-roundtrip-latin1-2))
1079 (i&c (ensure-roundtrip-latin9))
1080 (i&c (ensure-roundtrip-latin9-2)))