X-Git-Url: http://repo.macrolet.net/gitweb/?a=blobdiff_plain;f=src%2Fcode%2Fexternal-formats%2Fenc-basic.lisp;h=0deb08a3f55cbc71ac7b6753e9762c3b85ebb284;hb=8ea7b1a452fc87f91273c96bead8aa862bbc8b98;hp=a60f050a629e69a61d3e384d5f426f6f5ba192e7;hpb=f2db6743b1fadeea9e72cb583d857851c87efcd4;p=sbcl.git diff --git a/src/code/external-formats/enc-basic.lisp b/src/code/external-formats/enc-basic.lisp index a60f050..0deb08a 100644 --- a/src/code/external-formats/enc-basic.lisp +++ b/src/code/external-formats/enc-basic.lisp @@ -59,13 +59,14 @@ finally (return (coerce string 'simple-string)))))))) (instantiate-octets-definition define-ascii->string) -(define-external-format (:ascii :us-ascii :ansi_x3.4-1968 - :iso-646 :iso-646-us :|646|) - 1 t +(define-unibyte-external-format :ascii + (:us-ascii :ansi_x3.4-1968 :iso-646 :iso-646-us :|646|) (if (>= bits 128) (external-format-encoding-error stream bits) (setf (sap-ref-8 sap tail) bits)) - (code-char byte) + (if (>= byte 128) + (return-from decode-break-reason 1) + (code-char byte)) ascii->string-aref string->ascii) @@ -101,8 +102,7 @@ ;;; Multiple names for the :ISO{,-}8859-* families are needed because on ;;; FreeBSD (and maybe other BSD systems), nl_langinfo("LATIN-1") will ;;; return "ISO8859-1" instead of "ISO-8859-1". -(define-external-format (:latin-1 :latin1 :iso-8859-1 :iso8859-1) - 1 t +(define-unibyte-external-format :latin-1 (:latin1 :iso-8859-1 :iso8859-1) (if (>= bits 256) (external-format-encoding-error stream bits) (setf (sap-ref-8 sap tail) bits)) @@ -132,12 +132,29 @@ (type (integer 0 1) null-padding) (type array-range sstart send)) (macrolet ((ascii-bash () + ;; KLUDGE: this depends on the fact that we know that + ;; our arrays are initialized with zeros. '(let ((array (make-array (+ null-padding (- send sstart)) :element-type '(unsigned-byte 8)))) (loop for i from 0 and j from sstart below send do (setf (aref array i) (char-code (char string j)))) - array))) + array)) + (output-code (tag) + `(case (char-len-as-utf8 code) + (1 (add-byte code)) + (2 (add-byte (logior #xc0 (ldb (byte 5 6) code))) + (add-byte (logior #x80 (ldb (byte 6 0) code)))) + (3 (when (<= #xd800 code #xdfff) + (setf error-position i) + (go ,tag)) + (add-byte (logior #xe0 (ldb (byte 4 12) code))) + (add-byte (logior #x80 (ldb (byte 6 6) code))) + (add-byte (logior #x80 (ldb (byte 6 0) code)))) + (4 (add-byte (logior #xf0 (ldb (byte 3 18) code))) + (add-byte (logior #x80 (ldb (byte 6 12) code))) + (add-byte (logior #x80 (ldb (byte 6 6) code))) + (add-byte (logior #x80 (ldb (byte 6 0) code))))))) (etypecase string ((simple-array character (*)) (let ((utf8-length 0)) @@ -149,40 +166,48 @@ (ascii-bash) (let ((array (make-array (+ null-padding utf8-length) :element-type '(unsigned-byte 8))) + (new-array nil) + (error-position 0) (index 0)) (declare (type index index)) - (flet ((add-byte (b) - (setf (aref array index) b) - (incf index))) - (declare (inline add-byte)) - (loop for i of-type index from sstart below send - do (let ((code (char-code (char string i)))) - (case (char-len-as-utf8 code) - (1 - (add-byte code)) - (2 - (add-byte (logior #b11000000 (ldb (byte 5 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) - (3 - (add-byte (logior #b11100000 (ldb (byte 4 12) code))) - (add-byte (logior #b10000000 (ldb (byte 6 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) - (4 - (add-byte (logior #b11110000 (ldb (byte 3 18) code))) - (add-byte (logior #b10000000 (ldb (byte 6 12) code))) - (add-byte (logior #b10000000 (ldb (byte 6 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))))) - finally (return array))))))) + (tagbody + :no-error + (flet ((add-byte (b) + (setf (aref array index) b) + (incf index))) + (declare (inline add-byte)) + (loop for i of-type index from sstart below send + for code = (char-code (char string i)) + do (output-code :first-error) + finally (return-from string->utf8 array))) + :first-error + (setf new-array (make-array (* index 2) :adjustable t + :element-type '(unsigned-byte 8) + :fill-pointer index)) + (replace new-array array) + :error + (let ((replacement (encoding-error :utf-8 string index))) + (flet ((add-byte (b) (vector-push-extend b new-array))) + (dotimes (i (length replacement)) + (add-byte (aref replacement i))) + (loop for i of-type index from (1+ error-position) below send + for code = (char-code (char string i)) + do (output-code :error) + finally (return-from string->utf8 + (progn + (unless (zerop null-padding) + (vector-push-extend 0 new-array)) + (copy-seq new-array))))))))))) #!+sb-unicode ((simple-array base-char (*)) ;; On unicode builds BASE-STRINGs are limited to ASCII range, ;; so we can take a fast path -- and get benefit of the element ;; type information. On non-unicode build BASE-CHAR == - ;; CHARACTER. + ;; CHARACTER, handled above. (ascii-bash)) ((simple-array nil (*)) (if (= send sstart) - (make-array 0 :element-type '(unsigned-byte 8)) + (make-array null-padding :element-type '(unsigned-byte 8)) ;; Just get the error... (aref string sstart)))))) @@ -212,18 +237,12 @@ (declare (type (unsigned-byte 8) b)) (let ((ok (cond ((zerop (logand b #b10000000)) 1) - ((= (logand b #b11100000) #b11000000) - 2) - ((= (logand b #b11110000) #b11100000) - 3) - ((= (logand b #b11111000) #b11110000) - 4) - ((= (logand b #b11111100) #b11111000) - 5) - ((= (logand b #b11111110) #b11111100) - 6) - (t - nil)))) + ((and (= (logand b #b11100000) #b11000000) + (>= b #xc2)) 2) + ((= (logand b #b11110000) #b11100000) 3) + ((and (= (logand b #b11111000) #b11110000) + (<= b #xf4)) 4) + (t nil)))) (unless ok (setf reject-reason 'invalid-utf8-starter-byte)) ok)) @@ -236,6 +255,14 @@ (let* ((idx (the array-range (+ pos x))) (b (,accessor array idx)) (ok (= (logand b #b11000000) #b10000000))) + (when (and ok (= x 1)) + (setf ok + (case initial-byte + (#xe0 (>= b #xa0)) + (#xed (< b #xa0)) + (#xf0 (>= b #x90)) + (#xf4 (< b #x90)) + (t t)))) (unless ok (setf reject-reason 'invalid-utf8-continuation-byte) (setf reject-position idx)) @@ -251,14 +278,6 @@ (loop for i from 1 below (min len remaining-bytes) always (valid-secondary-p i)) (enough-bytes-left-p len))) - (overlong-chk (x y) - (let ((ok (or (/= initial-byte x) - (/= (logior (,accessor array (the array-range (+ pos 1))) - y) - y)))) - (unless ok - (setf reject-reason 'overlong-utf8-sequence)) - ok)) (character-below-char-code-limit-p () ;; This is only called on a four-byte sequence ;; (two in non-unicode builds) to ensure we @@ -288,43 +307,27 @@ (declare (inline valid-utf8-starter-byte-p enough-bytes-left-p valid-secondary-p - preliminary-ok-for-length - overlong-chk)) + preliminary-ok-for-length)) (let ((maybe-len (valid-utf8-starter-byte-p initial-byte))) (cond ((eql maybe-len 1) (values 1 nil)) ((and (preliminary-ok-for-length maybe-len 2) - (overlong-chk #b11000000 #b10111111) - (overlong-chk #b11000001 #b10111111) #!-sb-unicode (character-below-char-code-limit-p)) (values 2 nil)) ((and (preliminary-ok-for-length maybe-len 3) - (overlong-chk #b11100000 #b10011111) #!-sb-unicode (not (setf reject-reason 'character-out-of-range))) (values 3 nil)) ((and (preliminary-ok-for-length maybe-len 4) - (overlong-chk #b11110000 #b10001111) #!-sb-unicode (not (setf reject-reason 'character-out-of-range)) (character-below-char-code-limit-p)) (values 4 nil)) - ((and (preliminary-ok-for-length maybe-len 5) - (overlong-chk #b11111000 #b10000111) - (not (setf reject-reason 'character-out-of-range))) - (bug "can't happen")) - ((and (preliminary-ok-for-length maybe-len 6) - (overlong-chk #b11111100 #b10000011) - (not (setf reject-reason 'character-out-of-range))) - (bug "can't happen")) (t - (let* ((bad-end (ecase reject-reason - (invalid-utf8-starter-byte - (1+ pos)) - (end-of-input-in-character - end) - (invalid-utf8-continuation-byte - reject-position) - ((overlong-utf8-sequence character-out-of-range) - (+ pos maybe-len)))) + (let* ((bad-end + (ecase reject-reason + (invalid-utf8-starter-byte (1+ pos)) + (end-of-input-in-character end) + (invalid-utf8-continuation-byte reject-position) + (character-out-of-range (+ pos maybe-len)))) (bad-len (- bad-end pos))) (declare (type array-range bad-end bad-len)) (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position))) @@ -379,7 +382,8 @@ (coerce string 'simple-string)))))) (instantiate-octets-definition define-utf8->string) -(define-external-format/variable-width (:utf-8 :utf8) nil +(define-external-format/variable-width (:utf-8 :utf8) t + #!+sb-unicode (code-char #xfffd) #!-sb-unicode #\? (let ((bits (char-code byte))) (cond ((< bits #x80) 1) ((< bits #x800) 2) @@ -389,18 +393,20 @@ (1 (setf (sap-ref-8 sap tail) bits)) (2 (setf (sap-ref-8 sap tail) (logior #xc0 (ldb (byte 5 6) bits)) (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits)))) - (3 (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits)) + (3 (when (<= #xd800 bits #xdfff) + (external-format-encoding-error stream bits)) + (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits)) (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits)) (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits)))) (4 (setf (sap-ref-8 sap tail) (logior #xf0 (ldb (byte 3 18) bits)) (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 12) bits)) (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 6) bits)) (sap-ref-8 sap (+ 3 tail)) (logior #x80 (ldb (byte 6 0) bits))))) - (cond ((< byte #x80) 1) - ((< byte #xc2) (return-from decode-break-reason 1)) - ((< byte #xe0) 2) - ((< byte #xf0) 3) - (t 4)) + (1 (cond ((< byte #x80) 1) + ((< byte #xc2) (return-from decode-break-reason 1)) + ((< byte #xe0) 2) + ((< byte #xf0) 3) + (t 4))) (code-char (ecase size (1 byte) (2 (let ((byte2 (sap-ref-8 sap (1+ head)))) @@ -410,7 +416,9 @@ (3 (let ((byte2 (sap-ref-8 sap (1+ head))) (byte3 (sap-ref-8 sap (+ 2 head)))) (unless (and (<= #x80 byte2 #xbf) - (<= #x80 byte3 #xbf)) + (<= #x80 byte3 #xbf) + (or (/= byte #xe0) (<= #xa0 byte2 #xbf)) + (or (/= byte #xed) (<= #x80 byte2 #x9f))) (return-from decode-break-reason 3)) (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3)))) (4 (let ((byte2 (sap-ref-8 sap (1+ head))) @@ -418,7 +426,9 @@ (byte4 (sap-ref-8 sap (+ 3 head)))) (unless (and (<= #x80 byte2 #xbf) (<= #x80 byte3 #xbf) - (<= #x80 byte4 #xbf)) + (<= #x80 byte4 #xbf) + (or (/= byte #xf0) (<= #x90 byte2 #xbf)) + (or (/= byte #xf4) (<= #x80 byte2 #x8f))) (return-from decode-break-reason 4)) (dpb byte (byte 3 18) (dpb byte2 (byte 6 12)