3 (defun make-multibyte-mapper (list)
4 (let ((list (sort (copy-list list) #'< :key #'car))
5 (hi (loop for x in list maximize (max (car x) (cadr x)))))
6 (make-array (list (length list) 2)
7 :element-type (list 'integer 0 hi)
8 :initial-contents list)))
10 (defmacro define-multibyte-mapper (name list)
12 (make-multibyte-mapper ,list)))
14 (defun get-multibyte-mapper (table code)
15 (declare (optimize speed (safety 0))
16 (type (array * (* 2)) table)
18 (labels ((recur (start end)
19 (declare (type fixnum start end))
20 (let* ((m (ash (+ start end) -1))
22 (declare (type fixnum m x))
25 ((and (< x code) (< m end))
27 ((and (> x code) (> m start))
28 (recur start (1- m)))))))
29 (recur 0 (1- (array-dimension table 0)))))
31 (eval-when (:compile-toplevel :load-toplevel :execute)
32 ;; FIXME: better to change make-od-name() to accept multiple
33 ;; arguments in octets.lisp?
34 (defun make-od-name-list (&rest syms)
35 (reduce #'make-od-name syms))
37 (defun define-bytes-per-mb-character-1 (accessor type format
38 mb-len mb-continuation-byte-p)
39 (let ((name (make-od-name-list 'bytes-per format 'character accessor))
40 (invalid-mb-starter-byte
41 (make-od-name-list 'invalid format 'starter-byte))
42 (invalid-mb-continuation-byte
43 (make-od-name-list 'invalid format 'continuation-byte)))
45 ;;(declaim (inline ,name))
46 (defun ,name (array pos end)
47 (declare (optimize speed (safety 0))
49 (type array-range pos end))
50 ;; returns the number of bytes consumed and nil if it's a
51 ;; valid character or the number of bytes consumed and a
52 ;; replacement string if it's not.
53 (let ((initial-byte (,accessor array pos))
56 (remaining-bytes (- end pos)))
57 (declare (type array-range reject-position remaining-bytes))
58 (labels ((valid-starter-byte-p (b)
59 (declare (type (unsigned-byte 8) b))
60 (let ((ok (,mb-len b)))
62 (setf reject-reason ',invalid-mb-starter-byte))
64 (enough-bytes-left-p (x)
65 (let ((ok (> end (+ pos (1- x)))))
67 (setf reject-reason 'end-of-input-in-character))
69 (valid-secondary-p (x)
70 (let* ((idx (the array-range (+ pos x)))
71 (b (,accessor array idx))
72 (ok (,mb-continuation-byte-p b)))
74 (setf reject-reason ',invalid-mb-continuation-byte)
75 (setf reject-position idx))
77 (preliminary-ok-for-length (maybe-len len)
78 (and (eql maybe-len len)
79 ;; Has to be done in this order so that
80 ;; certain broken sequences (e.g., the
81 ;; two-byte sequence `"initial (length 3)"
82 ;; "non-continuation"' -- `#xef #x32')
83 ;; signal only part of that sequence as
85 (loop for i from 1 below (min len remaining-bytes)
86 always (valid-secondary-p i))
87 (enough-bytes-left-p len))))
88 (declare (inline valid-starter-byte-p
91 preliminary-ok-for-length))
92 (let ((maybe-len (valid-starter-byte-p initial-byte)))
93 (cond ((eql maybe-len 1)
95 ((preliminary-ok-for-length maybe-len 2)
97 ((preliminary-ok-for-length maybe-len 3)
100 (let* ((bad-end (ecase reject-reason
101 (,invalid-mb-starter-byte
103 (end-of-input-in-character
105 (,invalid-mb-continuation-byte
107 (bad-len (- bad-end pos)))
108 (declare (type array-range bad-end bad-len))
109 (let ((replacement (decoding-error array pos bad-end ,format reject-reason reject-position)))
110 (values bad-len replacement))))))))))))
112 (defun define-simple-get-mb-char-1 (accessor type format mb-to-ucs)
113 (let ((name (make-od-name-list 'simple-get format 'char accessor))
114 (malformed (make-od-name 'malformed format)))
116 (declaim (inline ,name))
117 (defun ,name (array pos bytes)
118 (declare (optimize speed (safety 0))
120 (type array-range pos)
121 (type (integer 1 3) bytes))
123 (,accessor array (the array-range (+ pos x)))))
124 (declare (inline cref))
125 (let ((code (,mb-to-ucs (ecase bytes
127 (2 (logior (ash (cref 0) 8) (cref 1)))
128 (3 (logior (ash (cref 0) 16)
133 (decoding-error array pos (+ pos bytes) ,format
134 ',malformed pos))))))))
136 (defun define-mb->string-1 (accessor type format)
138 (make-od-name-list format '>string accessor))
139 (bytes-per-mb-character
140 (make-od-name-list 'bytes-per format 'character accessor))
142 (make-od-name-list 'simple-get format 'char accessor)))
144 (defun ,name (array astart aend)
145 (declare (optimize speed (safety 0))
147 (type array-range astart aend))
148 (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
149 (loop with pos = astart
151 do (multiple-value-bind (bytes invalid)
152 (,bytes-per-mb-character array pos aend)
153 (declare (type (or null string) invalid))
156 (vector-push-extend (,simple-get-mb-char array pos bytes) string))
158 (dotimes (i (length invalid))
159 (vector-push-extend (char invalid i) string))))
161 (coerce string 'simple-string))))))
163 (declaim (inline mb-char-len))
164 (defun mb-char-len (code)
165 (declare (optimize speed (safety 0))
167 (cond ((< code 0) (bug "can't happen"))
170 ((< code #x1000000) 3)
171 (t (bug "can't happen"))))
174 (defmacro define-multibyte-encoding (format aliases
176 mb-len mb-continuation-byte-p)
177 (let ((char->mb (make-od-name 'char-> format))
178 (string->mb (make-od-name 'string-> format))
179 (define-bytes-per-mb-character
180 (make-od-name-list 'define-bytes-per format 'character))
181 (define-simple-get-mb-char
182 (make-od-name-list 'define-simple-get format 'char))
184 (make-od-name-list 'define format '>string)))
186 ;; for fd-stream.lisp
187 (define-external-format/variable-width ,aliases t
188 (mb-char-len (or (,ucs-to-mb (char-code byte)) -1))
189 (let ((mb (,ucs-to-mb bits)))
191 (external-format-encoding-error stream byte)
193 (1 (setf (sap-ref-8 sap tail) mb))
194 (2 (setf (sap-ref-8 sap tail) (ldb (byte 8 8) mb)
195 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 0) mb)))
196 (3 (setf (sap-ref-8 sap tail) (ldb (byte 8 16) mb)
197 (sap-ref-8 sap (1+ tail)) (ldb (byte 8 8) mb)
198 (sap-ref-8 sap (+ 2 tail)) (ldb (byte 8 0) mb))))))
200 (let* ((mb (ecase size
202 (2 (let ((byte2 (sap-ref-8 sap (1+ head))))
203 (unless (,mb-continuation-byte-p byte2)
204 (return-from decode-break-reason 2))
205 (dpb byte (byte 8 8) byte2)))
206 (3 (let ((byte2 (sap-ref-8 sap (1+ head)))
207 (byte3 (sap-ref-8 sap (+ 2 head))))
208 (unless (,mb-continuation-byte-p byte2)
209 (return-from decode-break-reason 2))
210 (unless (,mb-continuation-byte-p byte3)
211 (return-from decode-break-reason 3))
212 (dpb byte (byte 8 16) (dpb byte2 (byte 8 8) byte3))))))
213 (ucs (,mb-to-ucs mb)))
215 (return-from decode-break-reason 1)
219 (define-condition ,(make-od-name 'malformed format)
220 (octet-decoding-error) ())
221 (define-condition ,(make-od-name-list 'invalid format 'starter-byte)
222 (octet-decoding-error) ())
223 (define-condition ,(make-od-name-list 'invalid format 'continuation-byte)
224 (octet-decoding-error) ())
226 (declaim (inline ,char->mb))
227 (defun ,char->mb (char dest string pos)
228 (declare (optimize speed (safety 0))
229 (type (array (unsigned-byte 8) (*)) dest))
230 (let ((code (,ucs-to-mb (char-code char))))
233 (declare (type (unsigned-byte 8) b))
234 (vector-push-extend b dest)))
235 (declare (inline add-byte))
236 (setf code (the fixnum code))
237 (ecase (mb-char-len code)
241 (add-byte (ldb (byte 8 8) code))
242 (add-byte (ldb (byte 8 0) code)))
244 (add-byte (ldb (byte 8 16) code))
245 (add-byte (ldb (byte 8 8) code))
246 (add-byte (ldb (byte 8 0) code)))))
247 (encoding-error ,format string pos))))
249 (defun ,string->mb (string sstart send additional-space)
250 (declare (optimize speed (safety 0))
251 (type simple-string string)
252 (type array-range sstart send additional-space))
253 (let ((array (make-array (+ additional-space (- send sstart))
254 :element-type '(unsigned-byte 8)
257 (loop for i from sstart below send
258 do (,char->mb (char string i) array string i))
259 (dotimes (i additional-space)
260 (vector-push-extend 0 array))
261 (coerce array '(simple-array (unsigned-byte 8) (*)))))
263 (defmacro ,define-bytes-per-mb-character (accessor type)
264 (define-bytes-per-mb-character-1 accessor type ',format
265 ',mb-len ',mb-continuation-byte-p))
267 (instantiate-octets-definition ,define-bytes-per-mb-character)
269 (defmacro ,define-simple-get-mb-char (accessor type)
270 (define-simple-get-mb-char-1 accessor type ',format ',mb-to-ucs))
272 (instantiate-octets-definition ,define-simple-get-mb-char)
274 (defmacro ,define-mb->string (accessor type)
275 (define-mb->string-1 accessor type ',format))
277 (instantiate-octets-definition ,define-mb->string)
280 ,(make-od-name format '>string-aref) ,string->mb)
281 *external-format-functions*)