3 (defmacro define-multibyte-mapper (name list)
4 (let ((list (sort (copy-list list) #'< :key #'car))
5 (hi (loop for x in list maximize (max (car x) (cadr x)))))
7 (make-array '(,(length list) 2)
8 :element-type '(integer 0 ,hi)
9 :initial-contents ',list))))
11 (defun get-multibyte-mapper (table code)
12 (declare (optimize speed (safety 0))
13 (type (array * (* 2)) table)
15 (labels ((recur (start end)
16 (declare (type fixnum start end))
17 (let* ((m (ash (+ start end) -1))
19 (declare (type fixnum m x))
22 ((and (< x code) (< m end))
24 ((and (> x code) (> m start))
25 (recur start (1- m)))))))
26 (recur 0 (1- (array-dimension table 0)))))
28 (eval-when (:compile-toplevel :load-toplevel :execute)
29 ;; FIXME: better to change make-od-name() to accept multiple
30 ;; arguments in octets.lisp?
31 (defun make-od-name-list (&rest syms)
32 (reduce #'make-od-name syms))
34 (defun define-bytes-per-mb-character-1 (accessor type format
35 mb-len mb-continuation-byte-p)
36 (let ((name (make-od-name-list 'bytes-per format 'character accessor))
37 (invalid-mb-starter-byte
38 (make-od-name-list 'invalid format 'starter-byte))
39 (invalid-mb-continuation-byte
40 (make-od-name-list 'invalid format 'continuation-byte)))
42 ;;(declaim (inline ,name))
43 (defun ,name (array pos end)
44 (declare (optimize speed (safety 0))
46 (type array-range pos end))
47 ;; returns the number of bytes consumed and nil if it's a
48 ;; valid character or the number of bytes consumed and a
49 ;; replacement string if it's not.
50 (let ((initial-byte (,accessor array pos))
53 (remaining-bytes (- end pos)))
54 (declare (type array-range reject-position remaining-bytes))
55 (labels ((valid-starter-byte-p (b)
56 (declare (type (unsigned-byte 8) b))
57 (let ((ok (,mb-len b)))
59 (setf reject-reason ',invalid-mb-starter-byte))
61 (enough-bytes-left-p (x)
62 (let ((ok (> end (+ pos (1- x)))))
64 (setf reject-reason 'end-of-input-in-character))
66 (valid-secondary-p (x)
67 (let* ((idx (the array-range (+ pos x)))
68 (b (,accessor array idx))
69 (ok (,mb-continuation-byte-p b)))
71 (setf reject-reason ',invalid-mb-continuation-byte)
72 (setf reject-position idx))
74 (preliminary-ok-for-length (maybe-len len)
75 (and (eql maybe-len len)
76 ;; Has to be done in this order so that
77 ;; certain broken sequences (e.g., the
78 ;; two-byte sequence `"initial (length 3)"
79 ;; "non-continuation"' -- `#xef #x32')
80 ;; signal only part of that sequence as
82 (loop for i from 1 below (min len remaining-bytes)
83 always (valid-secondary-p i))
84 (enough-bytes-left-p len))))
85 (declare (inline valid-starter-byte-p
88 preliminary-ok-for-length))
89 (let ((maybe-len (valid-starter-byte-p initial-byte)))
90 (cond ((eql maybe-len 1)
92 ((preliminary-ok-for-length maybe-len 2)
94 ((preliminary-ok-for-length maybe-len 3)
97 (let* ((bad-end (ecase reject-reason
98 (,invalid-mb-starter-byte
100 (end-of-input-in-character
102 (,invalid-mb-continuation-byte
104 (bad-len (- bad-end pos)))
105 (declare (type array-range bad-end bad-len))
106 (let ((replacement (decoding-error array pos bad-end ,format reject-reason reject-position)))
107 (values bad-len replacement))))))))))))
109 (defun define-simple-get-mb-char-1 (accessor type format mb-to-ucs)
110 (let ((name (make-od-name-list 'simple-get format 'char accessor))
111 (malformed (make-od-name 'malformed format)))
113 (declaim (inline ,name))
114 (defun ,name (array pos bytes)
115 (declare (optimize speed (safety 0))
117 (type array-range pos)
118 (type (integer 1 3) bytes))
120 (,accessor array (the array-range (+ pos x)))))
121 (declare (inline cref))
122 (let ((code (,mb-to-ucs (ecase bytes
124 (2 (logior (ash (cref 0) 8) (cref 1)))
125 (3 (logior (ash (cref 0) 16)
130 (decoding-error array pos (+ pos bytes) ,format
131 ',malformed pos))))))))
133 (defun define-mb->string-1 (accessor type format)
135 (make-od-name-list format '>string accessor))
136 (bytes-per-mb-character
137 (make-od-name-list 'bytes-per format 'character accessor))
139 (make-od-name-list 'simple-get format 'char accessor)))
141 (defun ,name (array astart aend)
142 (declare (optimize speed (safety 0))
144 (type array-range astart aend))
145 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
146 (loop with pos = astart
148 do (multiple-value-bind (bytes invalid)
149 (,bytes-per-mb-character array pos aend)
150 (declare (type (or null string) invalid))
153 (let ((thing (,simple-get-mb-char array pos bytes)))
155 (character (vector-push-extend thing string))
157 (dotimes (i (length thing))
158 (vector-push-extend (char thing i) string))))))
160 (dotimes (i (length invalid))
161 (vector-push-extend (char invalid i) string))))
163 (coerce string 'simple-string))))))
165 (declaim (inline mb-char-len))
166 (defun mb-char-len (code)
167 (declare (optimize speed (safety 0))
169 (cond ((< code 0) (bug "can't happen"))
172 ((< code #x1000000) 3)
173 (t (bug "can't happen"))))
176 (defmacro define-multibyte-encoding (format aliases
178 mb-len mb-continuation-byte-p)
179 (let ((char->mb (make-od-name 'char-> format))
180 (string->mb (make-od-name 'string-> format))
181 (define-bytes-per-mb-character
182 (make-od-name-list 'define-bytes-per format 'character))
183 (define-simple-get-mb-char
184 (make-od-name-list 'define-simple-get format 'char))
186 (make-od-name-list 'define format '>string)))
189 (define-condition ,(make-od-name 'malformed format)
190 (octet-decoding-error) ())
191 (define-condition ,(make-od-name-list 'invalid format 'starter-byte)
192 (octet-decoding-error) ())
193 (define-condition ,(make-od-name-list 'invalid format 'continuation-byte)
194 (octet-decoding-error) ())
196 (declaim (inline ,char->mb))
197 (defun ,char->mb (char dest string pos)
198 (declare (optimize speed (safety 0))
199 (type (array (unsigned-byte 8) (*)) dest))
200 (let ((code (,ucs-to-mb (char-code char))))
203 (declare (type (unsigned-byte 8) b))
204 (vector-push-extend b dest)))
205 (declare (inline add-byte))
206 (setf code (the fixnum code))
207 (ecase (mb-char-len code)
211 (add-byte (ldb (byte 8 8) code))
212 (add-byte (ldb (byte 8 0) code)))
214 (add-byte (ldb (byte 8 16) code))
215 (add-byte (ldb (byte 8 8) code))
216 (add-byte (ldb (byte 8 0) code)))))
217 (encoding-error ,format string pos))))
219 (defun ,string->mb (string sstart send additional-space)
220 (declare (optimize speed (safety 0))
221 (type simple-string string)
222 (type array-range sstart send additional-space))
223 (let ((array (make-array (+ additional-space (- send sstart))
224 :element-type '(unsigned-byte 8)
227 (loop for i from sstart below send
228 do (,char->mb (char string i) array string i))
229 (dotimes (i additional-space)
230 (vector-push-extend 0 array))
231 (coerce array '(simple-array (unsigned-byte 8) (*)))))
233 (defmacro ,define-bytes-per-mb-character (accessor type)
234 (define-bytes-per-mb-character-1 accessor type ',format
235 ',mb-len ',mb-continuation-byte-p))
237 (instantiate-octets-definition ,define-bytes-per-mb-character)
239 (defmacro ,define-simple-get-mb-char (accessor type)
240 (define-simple-get-mb-char-1 accessor type ',format ',mb-to-ucs))
242 (instantiate-octets-definition ,define-simple-get-mb-char)
244 (defmacro ,define-mb->string (accessor type)
245 (define-mb->string-1 accessor type ',format))
247 (instantiate-octets-definition ,define-mb->string)
249 ;; for fd-stream.lisp
250 (define-external-format/variable-width ,aliases t
251 ;; KLUDGE: it so happens that at present (2009-10-22) none of
252 ;; the external formats defined with
253 ;; define-multibyte-encoding can encode the unicode
254 ;; replacement character, so we hardcode the preferred
258 (mb-char-len (or (,ucs-to-mb (char-code byte))
259 (return-from size 0))))
260 (let ((mb (,ucs-to-mb bits)))
262 (external-format-encoding-error stream byte)
264 (1 (setf (sap-ref-8 sap tail) mb))
265 (2 (setf (sap-ref-8 sap tail) (ldb (byte 8 8) mb)
266 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 0) mb)))
267 (3 (setf (sap-ref-8 sap tail) (ldb (byte 8 16) mb)
268 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 8) mb)
269 (sap-ref-8 sap (+ 2 tail)) (ldb (byte 8 0) mb))))))
271 (let* ((mb (ecase size
273 (2 (let ((byte2 (sap-ref-8 sap (1+ head))))
274 (unless (,mb-continuation-byte-p byte2)
275 (return-from decode-break-reason 2))
276 (dpb byte (byte 8 8) byte2)))
277 (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
278 (byte3 (sap-ref-8 sap (+ 2 head))))
279 (unless (,mb-continuation-byte-p byte2)
280 (return-from decode-break-reason 2))
281 (unless (,mb-continuation-byte-p byte3)
282 (return-from decode-break-reason 3))
283 (dpb byte (byte 8 16) (dpb byte2 (byte 8 8) byte3))))))
284 (ucs (,mb-to-ucs mb)))
286 (return-from decode-break-reason 1)
288 ,(make-od-name format '>string-aref)