From f2db6743b1fadeea9e72cb583d857851c87efcd4 Mon Sep 17 00:00:00 2001 From: Nathan Froyd Date: Tue, 29 Sep 2009 01:02:20 +0000 Subject: [PATCH] 1.0.31.23: OAOOize external-format support fd-streams and octets support independently kept records of external-format->function maps, suitable for the purposes of each. This revision stores all the relevant information for an external format in a single place--a new EXTERNAL-FORMAT structure--and has both clients reference things in that single place. Doing so offers opportunities for other cleanups and speedups. fd-streams external-format storage was an alist of lists, which was bad for client code, since everything referred to fields with NTH or SECOND/FOURTH/FIFTH. A proper DEFSTRUCT helps here and should be slightly more space-efficient, as we're replacing a list with (effectively) a vector. Also, since clients had to scan through an alist to find an external-format, this design was hurting performance in streams code, most notably OPEN. Replacing the alist with a hash table (which the octets code was already using) should make things a lot snappier. --- build-order.lisp-expr | 1 + src/code/external-formats/enc-basic.lisp | 427 ++++++++++++++++++++++++++ src/code/external-formats/enc-cyr.lisp | 21 +- src/code/external-formats/enc-dos.lisp | 98 +++--- src/code/external-formats/enc-ebcdic.lisp | 7 +- src/code/external-formats/enc-iso.lisp | 137 ++++++--- src/code/external-formats/enc-win.lisp | 66 ++-- src/code/external-formats/mb-util.lisp | 69 ++--- src/code/external-formats/ucs-2.lisp | 42 ++- src/code/fd-stream.lisp | 239 ++++++--------- src/code/host-c-call.lisp | 4 +- src/code/octets.lisp | 467 ++--------------------------- src/code/target-c-call.lisp | 12 +- tests/external-format.impure.lisp | 6 +- version.lisp-expr | 2 +- 15 files changed, 784 insertions(+), 814 deletions(-) create mode 100644 src/code/external-formats/enc-basic.lisp diff --git a/build-order.lisp-expr b/build-order.lisp-expr index cf68857..ae859b2 100644 --- a/build-order.lisp-expr +++ b/build-order.lisp-expr @@ -686,6 +686,7 @@ ("src/code/debug" :not-host) ("src/code/octets" :not-host) + ("src/code/external-formats/enc-basic" :not-host) ("src/code/external-formats/enc-ebcdic" :not-host) #!+sb-unicode ("src/code/external-formats/enc-cyr" :not-host) diff --git a/src/code/external-formats/enc-basic.lisp b/src/code/external-formats/enc-basic.lisp new file mode 100644 index 0000000..a60f050 --- /dev/null +++ b/src/code/external-formats/enc-basic.lisp @@ -0,0 +1,427 @@ +;;;; encodings available regardless of build-time unicode settings + +;;;; This software is part of the SBCL system. See the README file for +;;;; more information. +;;;; +;;;; This software is derived from the CMU CL system, which was +;;;; written at Carnegie Mellon University and released into the +;;;; public domain. The software is in the public domain and is +;;;; provided with absolutely no warranty. See the COPYING and CREDITS +;;;; files for more information. + +(in-package "SB!IMPL") + + +;;; ASCII + +(declaim (inline code->ascii-mapper)) +(defun code->ascii-mapper (code) + (declare (optimize speed (safety 0)) + (type char-code code)) + (if (> code 127) + nil + code)) + +(declaim (inline get-ascii-bytes)) +(defun get-ascii-bytes (string pos) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range pos)) + (get-latin-bytes #'code->ascii-mapper :ascii string pos)) + +(defun string->ascii (string sstart send null-padding) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send)) + (values (string->latin% string sstart send #'get-ascii-bytes null-padding))) + +(defmacro define-ascii->string (accessor type) + (let ((name (make-od-name 'ascii->string accessor))) + `(progn + (defun ,name (array astart aend) + (declare (optimize speed) + (type ,type array) + (type array-range astart aend)) + ;; Since there is such a thing as a malformed ascii byte, a + ;; simple "make the string, fill it in" won't do. + (let ((string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t))) + (loop for apos from astart below aend + do (let* ((code (,accessor array apos)) + (string-content + (if (< code 128) + (code-char code) + (decoding-error array apos (1+ apos) :ascii + 'malformed-ascii apos)))) + (if (characterp string-content) + (vector-push-extend string-content string) + (loop for c across string-content + do (vector-push-extend c string)))) + finally (return (coerce string 'simple-string)))))))) +(instantiate-octets-definition define-ascii->string) + +(define-external-format (:ascii :us-ascii :ansi_x3.4-1968 + :iso-646 :iso-646-us :|646|) + 1 t + (if (>= bits 128) + (external-format-encoding-error stream bits) + (setf (sap-ref-8 sap tail) bits)) + (code-char byte) + ascii->string-aref + string->ascii) + +;;; Latin-1 + +(declaim (inline get-latin1-bytes)) +(defun get-latin1-bytes (string pos) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range pos)) + (get-latin-bytes #'identity :latin-1 string pos)) + +(defun string->latin1 (string sstart send null-padding) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send)) + (values (string->latin% string sstart send #'get-latin1-bytes null-padding))) + +(defmacro define-latin1->string* (accessor type) + (declare (ignore type)) + (let ((name (make-od-name 'latin1->string* accessor))) + `(progn + (defun ,name (string sstart send array astart aend) + (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity))))) +(instantiate-octets-definition define-latin1->string*) + +(defmacro define-latin1->string (accessor type) + (declare (ignore type)) + `(defun ,(make-od-name 'latin1->string accessor) (array astart aend) + (,(make-od-name 'latin->string accessor) array astart aend #'identity))) +(instantiate-octets-definition define-latin1->string) + +;;; Multiple names for the :ISO{,-}8859-* families are needed because on +;;; FreeBSD (and maybe other BSD systems), nl_langinfo("LATIN-1") will +;;; return "ISO8859-1" instead of "ISO-8859-1". +(define-external-format (:latin-1 :latin1 :iso-8859-1 :iso8859-1) + 1 t + (if (>= bits 256) + (external-format-encoding-error stream bits) + (setf (sap-ref-8 sap tail) bits)) + (code-char byte) + latin1->string-aref + string->latin1) + + +;;; UTF-8 + +;;; to UTF-8 + +(declaim (inline char-len-as-utf8)) +(defun char-len-as-utf8 (code) + (declare (optimize speed (safety 0)) + (type (integer 0 (#.sb!xc:char-code-limit)) code)) + (cond ((< code 0) (bug "can't happen")) + ((< code #x80) 1) + ((< code #x800) 2) + ((< code #x10000) 3) + ((< code #x110000) 4) + (t (bug "can't happen")))) + +(defun string->utf8 (string sstart send null-padding) + (declare (optimize (speed 3) (safety 0)) + (type simple-string string) + (type (integer 0 1) null-padding) + (type array-range sstart send)) + (macrolet ((ascii-bash () + '(let ((array (make-array (+ null-padding (- send sstart)) + :element-type '(unsigned-byte 8)))) + (loop for i from 0 + and j from sstart below send + do (setf (aref array i) (char-code (char string j)))) + array))) + (etypecase string + ((simple-array character (*)) + (let ((utf8-length 0)) + ;; Since it has to fit in a vector, it must be a fixnum! + (declare (type (and unsigned-byte fixnum) utf8-length)) + (loop for i of-type index from sstart below send + do (incf utf8-length (char-len-as-utf8 (char-code (char string i))))) + (if (= utf8-length (- send sstart)) + (ascii-bash) + (let ((array (make-array (+ null-padding utf8-length) + :element-type '(unsigned-byte 8))) + (index 0)) + (declare (type index index)) + (flet ((add-byte (b) + (setf (aref array index) b) + (incf index))) + (declare (inline add-byte)) + (loop for i of-type index from sstart below send + do (let ((code (char-code (char string i)))) + (case (char-len-as-utf8 code) + (1 + (add-byte code)) + (2 + (add-byte (logior #b11000000 (ldb (byte 5 6) code))) + (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) + (3 + (add-byte (logior #b11100000 (ldb (byte 4 12) code))) + (add-byte (logior #b10000000 (ldb (byte 6 6) code))) + (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) + (4 + (add-byte (logior #b11110000 (ldb (byte 3 18) code))) + (add-byte (logior #b10000000 (ldb (byte 6 12) code))) + (add-byte (logior #b10000000 (ldb (byte 6 6) code))) + (add-byte (logior #b10000000 (ldb (byte 6 0) code)))))) + finally (return array))))))) + #!+sb-unicode + ((simple-array base-char (*)) + ;; On unicode builds BASE-STRINGs are limited to ASCII range, + ;; so we can take a fast path -- and get benefit of the element + ;; type information. On non-unicode build BASE-CHAR == + ;; CHARACTER. + (ascii-bash)) + ((simple-array nil (*)) + (if (= send sstart) + (make-array 0 :element-type '(unsigned-byte 8)) + ;; Just get the error... + (aref string sstart)))))) + +;;; from UTF-8 + +(defmacro define-bytes-per-utf8-character (accessor type) + (let ((name (make-od-name 'bytes-per-utf8-character accessor))) + `(progn + ;;(declaim (inline ,name)) + (let ((lexically-max + (string->utf8 (string (code-char ,(1- sb!xc:char-code-limit))) + 0 1 0))) + (declare (type (simple-array (unsigned-byte 8) (#!+sb-unicode 4 #!-sb-unicode 2)) lexically-max)) + (defun ,name (array pos end) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos end)) + ;; returns the number of bytes consumed and nil if it's a + ;; valid character or the number of bytes consumed and a + ;; replacement string if it's not. + (let ((initial-byte (,accessor array pos)) + (reject-reason nil) + (reject-position pos) + (remaining-bytes (- end pos))) + (declare (type array-range reject-position remaining-bytes)) + (labels ((valid-utf8-starter-byte-p (b) + (declare (type (unsigned-byte 8) b)) + (let ((ok (cond + ((zerop (logand b #b10000000)) 1) + ((= (logand b #b11100000) #b11000000) + 2) + ((= (logand b #b11110000) #b11100000) + 3) + ((= (logand b #b11111000) #b11110000) + 4) + ((= (logand b #b11111100) #b11111000) + 5) + ((= (logand b #b11111110) #b11111100) + 6) + (t + nil)))) + (unless ok + (setf reject-reason 'invalid-utf8-starter-byte)) + ok)) + (enough-bytes-left-p (x) + (let ((ok (> end (+ pos (1- x))))) + (unless ok + (setf reject-reason 'end-of-input-in-character)) + ok)) + (valid-secondary-p (x) + (let* ((idx (the array-range (+ pos x))) + (b (,accessor array idx)) + (ok (= (logand b #b11000000) #b10000000))) + (unless ok + (setf reject-reason 'invalid-utf8-continuation-byte) + (setf reject-position idx)) + ok)) + (preliminary-ok-for-length (maybe-len len) + (and (eql maybe-len len) + ;; Has to be done in this order so that + ;; certain broken sequences (e.g., the + ;; two-byte sequence `"initial (length 3)" + ;; "non-continuation"' -- `#xef #x32') + ;; signal only part of that sequence as + ;; erroneous. + (loop for i from 1 below (min len remaining-bytes) + always (valid-secondary-p i)) + (enough-bytes-left-p len))) + (overlong-chk (x y) + (let ((ok (or (/= initial-byte x) + (/= (logior (,accessor array (the array-range (+ pos 1))) + y) + y)))) + (unless ok + (setf reject-reason 'overlong-utf8-sequence)) + ok)) + (character-below-char-code-limit-p () + ;; This is only called on a four-byte sequence + ;; (two in non-unicode builds) to ensure we + ;; don't go over SBCL's character limts. + (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos)) + nil) + ((> (aref lexically-max 0) (,accessor array pos)) + t) + ((< (aref lexically-max 1) (,accessor array (+ pos 1))) + nil) + #!+sb-unicode + ((> (aref lexically-max 1) (,accessor array (+ pos 1))) + t) + #!+sb-unicode + ((< (aref lexically-max 2) (,accessor array (+ pos 2))) + nil) + #!+sb-unicode + ((> (aref lexically-max 2) (,accessor array (+ pos 2))) + t) + #!+sb-unicode + ((< (aref lexically-max 3) (,accessor array (+ pos 3))) + nil) + (t t)))) + (unless ok + (setf reject-reason 'character-out-of-range)) + ok))) + (declare (inline valid-utf8-starter-byte-p + enough-bytes-left-p + valid-secondary-p + preliminary-ok-for-length + overlong-chk)) + (let ((maybe-len (valid-utf8-starter-byte-p initial-byte))) + (cond ((eql maybe-len 1) + (values 1 nil)) + ((and (preliminary-ok-for-length maybe-len 2) + (overlong-chk #b11000000 #b10111111) + (overlong-chk #b11000001 #b10111111) + #!-sb-unicode (character-below-char-code-limit-p)) + (values 2 nil)) + ((and (preliminary-ok-for-length maybe-len 3) + (overlong-chk #b11100000 #b10011111) + #!-sb-unicode (not (setf reject-reason 'character-out-of-range))) + (values 3 nil)) + ((and (preliminary-ok-for-length maybe-len 4) + (overlong-chk #b11110000 #b10001111) + #!-sb-unicode (not (setf reject-reason 'character-out-of-range)) + (character-below-char-code-limit-p)) + (values 4 nil)) + ((and (preliminary-ok-for-length maybe-len 5) + (overlong-chk #b11111000 #b10000111) + (not (setf reject-reason 'character-out-of-range))) + (bug "can't happen")) + ((and (preliminary-ok-for-length maybe-len 6) + (overlong-chk #b11111100 #b10000011) + (not (setf reject-reason 'character-out-of-range))) + (bug "can't happen")) + (t + (let* ((bad-end (ecase reject-reason + (invalid-utf8-starter-byte + (1+ pos)) + (end-of-input-in-character + end) + (invalid-utf8-continuation-byte + reject-position) + ((overlong-utf8-sequence character-out-of-range) + (+ pos maybe-len)))) + (bad-len (- bad-end pos))) + (declare (type array-range bad-end bad-len)) + (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position))) + (values bad-len replacement))))))))))))) +(instantiate-octets-definition define-bytes-per-utf8-character) + +(defmacro define-simple-get-utf8-char (accessor type) + (let ((name (make-od-name 'simple-get-utf8-char accessor))) + `(progn + (declaim (inline ,name)) + (defun ,name (array pos bytes) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range pos) + (type (integer 1 4) bytes)) + (flet ((cref (x) + (,accessor array (the array-range (+ pos x))))) + (declare (inline cref)) + (code-char (ecase bytes + (1 (cref 0)) + (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6) + (ldb (byte 6 0) (cref 1)))) + (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12) + (ash (ldb (byte 6 0) (cref 1)) 6) + (ldb (byte 6 0) (cref 2)))) + (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18) + (ash (ldb (byte 6 0) (cref 1)) 12) + (ash (ldb (byte 6 0) (cref 2)) 6) + (ldb (byte 6 0) (cref 3))))))))))) +(instantiate-octets-definition define-simple-get-utf8-char) + +(defmacro define-utf8->string (accessor type) + (let ((name (make-od-name 'utf8->string accessor))) + `(progn + (defun ,name (array astart aend) + (declare (optimize speed (safety 0)) + (type ,type array) + (type array-range astart aend)) + (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) + (loop with pos = astart + while (< pos aend) + do (multiple-value-bind (bytes invalid) + (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend) + (declare (type (or null string) invalid)) + (cond + ((null invalid) + (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string)) + (t + (dotimes (i (length invalid)) + (vector-push-extend (char invalid i) string)))) + (incf pos bytes))) + (coerce string 'simple-string)))))) +(instantiate-octets-definition define-utf8->string) + +(define-external-format/variable-width (:utf-8 :utf8) nil + (let ((bits (char-code byte))) + (cond ((< bits #x80) 1) + ((< bits #x800) 2) + ((< bits #x10000) 3) + (t 4))) + (ecase size + (1 (setf (sap-ref-8 sap tail) bits)) + (2 (setf (sap-ref-8 sap tail) (logior #xc0 (ldb (byte 5 6) bits)) + (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits)))) + (3 (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits)) + (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits)) + (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits)))) + (4 (setf (sap-ref-8 sap tail) (logior #xf0 (ldb (byte 3 18) bits)) + (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 12) bits)) + (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 6) bits)) + (sap-ref-8 sap (+ 3 tail)) (logior #x80 (ldb (byte 6 0) bits))))) + (cond ((< byte #x80) 1) + ((< byte #xc2) (return-from decode-break-reason 1)) + ((< byte #xe0) 2) + ((< byte #xf0) 3) + (t 4)) + (code-char (ecase size + (1 byte) + (2 (let ((byte2 (sap-ref-8 sap (1+ head)))) + (unless (<= #x80 byte2 #xbf) + (return-from decode-break-reason 2)) + (dpb byte (byte 5 6) byte2))) + (3 (let ((byte2 (sap-ref-8 sap (1+ head))) + (byte3 (sap-ref-8 sap (+ 2 head)))) + (unless (and (<= #x80 byte2 #xbf) + (<= #x80 byte3 #xbf)) + (return-from decode-break-reason 3)) + (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3)))) + (4 (let ((byte2 (sap-ref-8 sap (1+ head))) + (byte3 (sap-ref-8 sap (+ 2 head))) + (byte4 (sap-ref-8 sap (+ 3 head)))) + (unless (and (<= #x80 byte2 #xbf) + (<= #x80 byte3 #xbf) + (<= #x80 byte4 #xbf)) + (return-from decode-break-reason 4)) + (dpb byte (byte 3 18) + (dpb byte2 (byte 6 12) + (dpb byte3 (byte 6 6) byte4))))))) + utf8->string-aref + string->utf8) diff --git a/src/code/external-formats/enc-cyr.lisp b/src/code/external-formats/enc-cyr.lisp index a0e033e..7883f06 100644 --- a/src/code/external-formats/enc-cyr.lisp +++ b/src/code/external-formats/enc-cyr.lisp @@ -160,9 +160,6 @@ (instantiate-octets-definition define-koi8-r->string) -(add-external-format-funs '(:koi8-r :|koi8-r|) - '(koi8-r->string-aref string->koi8-r)) - (define-external-format (:koi8-r :|koi8-r|) 1 t (let ((koi8-r-byte (code->koi8-r-mapper bits))) @@ -172,7 +169,9 @@ (let ((code (koi8-r->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + koi8-r->string-aref + string->koi8-r) ;; TODO -- error check (define-unibyte-mapper koi8-u->code-mapper code->koi8-u-mapper (#x80 #x2500) ; BOX DRAWINGS LIGHT HORIZONTAL @@ -334,9 +333,6 @@ (instantiate-octets-definition define-koi8-u->string) -(add-external-format-funs '(:koi8-u :|koi8-u|) - '(koi8-u->string-aref string->koi8-u)) - (define-external-format (:koi8-u :|koi8-u|) 1 t (let ((koi8-u-byte (code->koi8-u-mapper bits))) @@ -346,7 +342,9 @@ (let ((code (koi8-u->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + koi8-u->string-aref + string->koi8-u) ;; TODO -- error check (define-unibyte-mapper x-mac-cyrillic->code-mapper code->x-mac-cyrillic-mapper (#x80 #x0410) ; CYRILLIC CAPITAL LETTER A @@ -503,9 +501,6 @@ (instantiate-octets-definition define-x-mac-cyrillic->string) -(add-external-format-funs '(:x-mac-cyrillic :|x-mac-cyrillic|) - '(x-mac-cyrillic->string-aref string->x-mac-cyrillic)) - (define-external-format (:x-mac-cyrillic :|x-mac-cyrillic|) 1 t (let ((x-mac-cyrillic-byte (code->x-mac-cyrillic-mapper bits))) @@ -515,4 +510,6 @@ (let ((code (x-mac-cyrillic->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + x-mac-cyrillic->string-aref + string->x-mac-cyrillic) ;; TODO -- error check diff --git a/src/code/external-formats/enc-dos.lisp b/src/code/external-formats/enc-dos.lisp index 83441ce..0b8ea99 100644 --- a/src/code/external-formats/enc-dos.lisp +++ b/src/code/external-formats/enc-dos.lisp @@ -160,9 +160,6 @@ (instantiate-octets-definition define-cp437->string) -(add-external-format-funs '(:cp437 :|cp437|) - '(cp437->string-aref string->cp437)) - (define-external-format (:cp437 :|cp437|) 1 t (let ((cp437-byte (code->cp437-mapper bits))) @@ -172,7 +169,9 @@ (let ((code (cp437->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp437->string-aref + string->cp437) ;; TODO -- error check (define-unibyte-mapper cp850->code-mapper code->cp850-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -334,9 +333,6 @@ (instantiate-octets-definition define-cp850->string) -(add-external-format-funs '(:cp850 :|cp850|) - '(cp850->string-aref string->cp850)) - (define-external-format (:cp850 :|cp850|) 1 t (let ((cp850-byte (code->cp850-mapper bits))) @@ -346,7 +342,9 @@ (let ((code (cp850->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp850->string-aref + string->cp850) ;; TODO -- error check (define-unibyte-mapper cp852->code-mapper code->cp852-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -508,9 +506,6 @@ (instantiate-octets-definition define-cp852->string) -(add-external-format-funs '(:cp852 :|cp852|) - '(cp852->string-aref string->cp852)) - (define-external-format (:cp852 :|cp852|) 1 t (let ((cp852-byte (code->cp852-mapper bits))) @@ -520,7 +515,9 @@ (let ((code (cp852->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp852->string-aref + string->cp852) ;; TODO -- error check (define-unibyte-mapper cp855->code-mapper code->cp855-mapper (#x80 #x0452) ; CYRILLIC SMALL LETTER DJE @@ -682,9 +679,6 @@ (instantiate-octets-definition define-cp855->string) -(add-external-format-funs '(:cp855 :|cp855|) - '(cp855->string-aref string->cp855)) - (define-external-format (:cp855 :|cp855|) 1 t (let ((cp855-byte (code->cp855-mapper bits))) @@ -694,7 +688,9 @@ (let ((code (cp855->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp855->string-aref + string->cp855) ;; TODO -- error check (define-unibyte-mapper cp857->code-mapper code->cp857-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -855,9 +851,6 @@ (instantiate-octets-definition define-cp857->string) -(add-external-format-funs '(:cp857 :|cp857|) - '(cp857->string-aref string->cp857)) - (define-external-format (:cp857 :|cp857|) 1 t (let ((cp857-byte (code->cp857-mapper bits))) @@ -867,7 +860,9 @@ (let ((code (cp857->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp857->string-aref + string->cp857) ;; TODO -- error check (define-unibyte-mapper cp860->code-mapper code->cp860-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -1029,9 +1024,6 @@ (instantiate-octets-definition define-cp860->string) -(add-external-format-funs '(:cp860 :|cp860|) - '(cp860->string-aref string->cp860)) - (define-external-format (:cp860 :|cp860|) 1 t (let ((cp860-byte (code->cp860-mapper bits))) @@ -1041,7 +1033,9 @@ (let ((code (cp860->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp860->string-aref + string->cp860) ;; TODO -- error check (define-unibyte-mapper cp861->code-mapper code->cp861-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -1203,9 +1197,6 @@ (instantiate-octets-definition define-cp861->string) -(add-external-format-funs '(:cp861 :|cp861|) - '(cp861->string-aref string->cp861)) - (define-external-format (:cp861 :|cp861|) 1 t (let ((cp861-byte (code->cp861-mapper bits))) @@ -1215,7 +1206,9 @@ (let ((code (cp861->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp861->string-aref + string->cp861) ;; TODO -- error check (define-unibyte-mapper cp862->code-mapper code->cp862-mapper (#x80 #x05D0) ; HEBREW LETTER ALEF @@ -1377,9 +1370,6 @@ (instantiate-octets-definition define-cp862->string) -(add-external-format-funs '(:cp862 :|cp862|) - '(cp862->string-aref string->cp862)) - (define-external-format (:cp862 :|cp862|) 1 t (let ((cp862-byte (code->cp862-mapper bits))) @@ -1389,7 +1379,9 @@ (let ((code (cp862->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp862->string-aref + string->cp862) ;; TODO -- error check (define-unibyte-mapper cp863->code-mapper code->cp863-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -1551,9 +1543,6 @@ (instantiate-octets-definition define-cp863->string) -(add-external-format-funs '(:cp863 :|cp863|) - '(cp863->string-aref string->cp863)) - (define-external-format (:cp863 :|cp863|) 1 t (let ((cp863-byte (code->cp863-mapper bits))) @@ -1563,7 +1552,9 @@ (let ((code (cp863->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp863->string-aref + string->cp863) ;; TODO -- error check (define-unibyte-mapper cp864->code-mapper code->cp864-mapper (#x80 #x00B0) ; DEGREE SIGN @@ -1722,9 +1713,6 @@ (instantiate-octets-definition define-cp864->string) -(add-external-format-funs '(:cp864 :|cp864|) - '(cp864->string-aref string->cp864)) - (define-external-format (:cp864 :|cp864|) 1 t (let ((cp864-byte (code->cp864-mapper bits))) @@ -1734,7 +1722,9 @@ (let ((code (cp864->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp864->string-aref + string->cp864) ;; TODO -- error check (define-unibyte-mapper cp865->code-mapper code->cp865-mapper (#x80 #x00C7) ; LATIN CAPITAL LETTER C WITH CEDILLA @@ -1896,9 +1886,6 @@ (instantiate-octets-definition define-cp865->string) -(add-external-format-funs '(:cp865 :|cp865|) - '(cp865->string-aref string->cp865)) - (define-external-format (:cp865 :|cp865|) 1 t (let ((cp865-byte (code->cp865-mapper bits))) @@ -1908,7 +1895,9 @@ (let ((code (cp865->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp865->string-aref + string->cp865) ;; TODO -- error check (define-unibyte-mapper cp866->code-mapper code->cp866-mapper (#x80 #x0410) ; CYRILLIC CAPITAL LETTER A @@ -2070,9 +2059,6 @@ (instantiate-octets-definition define-cp866->string) -(add-external-format-funs '(:cp866 :|cp866|) - '(cp866->string-aref string->cp866)) - (define-external-format (:cp866 :|cp866|) 1 t (let ((cp866-byte (code->cp866-mapper bits))) @@ -2082,7 +2068,9 @@ (let ((code (cp866->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp866->string-aref + string->cp866) ;; TODO -- error check (define-unibyte-mapper cp869->code-mapper code->cp869-mapper (#x80 nil) @@ -2244,9 +2232,6 @@ (instantiate-octets-definition define-cp869->string) -(add-external-format-funs '(:cp869 :|cp869|) - '(cp869->string-aref string->cp869)) - (define-external-format (:cp869 :|cp869|) 1 t (let ((cp869-byte (code->cp869-mapper bits))) @@ -2256,7 +2241,9 @@ (let ((code (cp869->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp869->string-aref + string->cp869) ;; TODO -- error check (define-unibyte-mapper cp874->code-mapper code->cp874-mapper (#x80 #x20AC) ; EURO SIGN @@ -2417,9 +2404,6 @@ (instantiate-octets-definition define-cp874->string) -(add-external-format-funs '(:cp874 :|cp874|) - '(cp874->string-aref string->cp874)) - (define-external-format (:cp874 :|cp874|) 1 t (let ((cp874-byte (code->cp874-mapper bits))) @@ -2429,4 +2413,6 @@ (let ((code (cp874->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp874->string-aref + string->cp874) ;; TODO -- error check diff --git a/src/code/external-formats/enc-ebcdic.lisp b/src/code/external-formats/enc-ebcdic.lisp index f3d1144..b54e630 100644 --- a/src/code/external-formats/enc-ebcdic.lisp +++ b/src/code/external-formats/enc-ebcdic.lisp @@ -61,13 +61,12 @@ (,(make-od-name 'latin->string accessor) array astart aend #'ebcdic-us->code-mapper))) (instantiate-octets-definition define-ebcdic-us->string) -(add-external-format-funs '(:ebcdic-us :cp037 :|cp037| :ibm-037 :ibm037) - '(ebcdic-us->string-aref string->ebcdic-us)) - (define-external-format (:ebcdic-us :cp037 :|cp037| :ibm-037 :ibm037) 1 t (let ((ebcdic-us-byte (code->ebcdic-us-mapper bits))) (if ebcdic-us-byte (setf (sap-ref-8 sap tail) ebcdic-us-byte) (external-format-encoding-error stream bits))) - (code-char (ebcdic-us->code-mapper byte))) + (code-char (ebcdic-us->code-mapper byte)) + ebcdic-us->string-aref + string->ebcdic-us) diff --git a/src/code/external-formats/enc-iso.lisp b/src/code/external-formats/enc-iso.lisp index a6f446c..db20263 100644 --- a/src/code/external-formats/enc-iso.lisp +++ b/src/code/external-formats/enc-iso.lisp @@ -89,9 +89,6 @@ (instantiate-octets-definition define-iso-8859-2->string) -(add-external-format-funs '(:iso-8859-2 :|iso-8859-2| :latin-2 :|latin-2|) - '(iso-8859-2->string-aref string->iso-8859-2)) - (define-external-format (:iso-8859-2 :|iso-8859-2| :latin-2 :|latin-2|) 1 t (let ((iso-8859-2-byte (code->iso-8859-2-mapper bits))) @@ -101,7 +98,9 @@ (let ((code (iso-8859-2->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-2->string-aref + string->iso8859-2) ;; TODO -- error check (define-unibyte-mapper iso-8859-3->code-mapper code->iso-8859-3-mapper (#xA1 #x0126) ; LATIN CAPITAL LETTER H WITH STROKE @@ -170,9 +169,6 @@ (instantiate-octets-definition define-iso-8859-3->string) -(add-external-format-funs '(:iso-8859-3 :|iso-8859-3| :latin-3 :|latin-3|) - '(iso-8859-3->string-aref string->iso-8859-3)) - (define-external-format (:iso-8859-3 :|iso-8859-3| :latin-3 :|latin-3|) 1 t (let ((iso-8859-3-byte (code->iso-8859-3-mapper bits))) @@ -182,7 +178,9 @@ (let ((code (iso-8859-3->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-3->string-aref + string->iso-8859-3) ;; TODO -- error check (define-unibyte-mapper iso-8859-4->code-mapper code->iso-8859-4-mapper (#xA1 #x0104) ; LATIN CAPITAL LETTER A WITH OGONEK @@ -266,9 +264,6 @@ (instantiate-octets-definition define-iso-8859-4->string) -(add-external-format-funs '(:iso-8859-4 :|iso-8859-4| :latin-4 :|latin-4|) - '(iso-8859-4->string-aref string->iso-8859-4)) - (define-external-format (:iso-8859-4 :|iso-8859-4| :latin-4 :|latin-4|) 1 t (let ((iso-8859-4-byte (code->iso-8859-4-mapper bits))) @@ -278,7 +273,9 @@ (let ((code (iso-8859-4->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-4->string-aref + string->iso-8859-4) ;; TODO -- error check (define-unibyte-mapper iso-8859-5->code-mapper code->iso-8859-5-mapper (#xA1 #x0401) ; CYRILLIC CAPITAL LETTER IO @@ -406,9 +403,6 @@ (instantiate-octets-definition define-iso-8859-5->string) -(add-external-format-funs '(:iso-8859-5 :|iso-8859-5|) - '(iso-8859-5->string-aref string->iso-8859-5)) - (define-external-format (:iso-8859-5 :|iso-8859-5|) 1 t (let ((iso-8859-5-byte (code->iso-8859-5-mapper bits))) @@ -418,7 +412,9 @@ (let ((code (iso-8859-5->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-5->string-aref + string->iso-8859-5) ;; TODO -- error check (define-unibyte-mapper iso-8859-6->code-mapper code->iso-8859-6-mapper (#xA1 nil) @@ -545,9 +541,6 @@ (instantiate-octets-definition define-iso-8859-6->string) -(add-external-format-funs '(:iso-8859-6 :|iso-8859-6|) - '(iso-8859-6->string-aref string->iso-8859-6)) - (define-external-format (:iso-8859-6 :|iso-8859-6|) 1 t (let ((iso-8859-6-byte (code->iso-8859-6-mapper bits))) @@ -557,7 +550,9 @@ (let ((code (iso-8859-6->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-6->string-aref + string->iso-8859-6) ;; TODO -- error check (define-unibyte-mapper iso-8859-7->code-mapper code->iso-8859-7-mapper (#xA1 #x02BD) ; MODIFIER LETTER REVERSED COMMA @@ -671,9 +666,6 @@ (instantiate-octets-definition define-iso-8859-7->string) -(add-external-format-funs '(:iso-8859-7 :|iso-8859-7|) - '(iso-8859-7->string-aref string->iso-8859-7)) - (define-external-format (:iso-8859-7 :|iso-8859-7|) 1 t (let ((iso-8859-7-byte (code->iso-8859-7-mapper bits))) @@ -683,7 +675,9 @@ (let ((code (iso-8859-7->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-7->string-aref + string->iso-8859-7) ;; TODO -- error check (define-unibyte-mapper iso-8859-8->code-mapper code->iso-8859-8-mapper (#xA1 nil) @@ -786,9 +780,6 @@ (instantiate-octets-definition define-iso-8859-8->string) -(add-external-format-funs '(:iso-8859-8 :|iso-8859-8|) - '(iso-8859-8->string-aref string->iso-8859-8)) - (define-external-format (:iso-8859-8 :|iso-8859-8|) 1 t (let ((iso-8859-8-byte (code->iso-8859-8-mapper bits))) @@ -798,7 +789,9 @@ (let ((code (iso-8859-8->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-8->string-aref + string->iso-8859-8) ;; TODO -- error check (define-unibyte-mapper iso-8859-9->code-mapper code->iso-8859-9-mapper (#xD0 #x011E) ; LATIN CAPITAL LETTER G WITH BREVE @@ -838,9 +831,6 @@ (instantiate-octets-definition define-iso-8859-9->string) -(add-external-format-funs '(:iso-8859-9 :|iso-8859-9| :latin-5 :|latin-5|) - '(iso-8859-9->string-aref string->iso-8859-9)) - (define-external-format (:iso-8859-9 :|iso-8859-9| :latin-5 :|latin-5|) 1 t (let ((iso-8859-9-byte (code->iso-8859-9-mapper bits))) @@ -850,7 +840,9 @@ (let ((code (iso-8859-9->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-9->string-aref + string->iso-8859-9) ;; TODO -- error check (define-unibyte-mapper iso-8859-10->code-mapper code->iso-8859-10-mapper (#xA1 #x0104) ; LATIN CAPITAL LETTER A WITH OGONEK @@ -930,9 +922,6 @@ (instantiate-octets-definition define-iso-8859-10->string) -(add-external-format-funs '(:iso-8859-10 :|iso-8859-10| :latin-6 :|latin-6|) - '(iso-8859-10->string-aref string->iso-8859-10)) - (define-external-format (:iso-8859-10 :|iso-8859-10| :latin-6 :|latin-6|) 1 t (let ((iso-8859-10-byte (code->iso-8859-10-mapper bits))) @@ -942,7 +931,9 @@ (let ((code (iso-8859-10->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-10->string-aref + string->iso-8859-10) ;; TODO -- error check (define-unibyte-mapper iso-8859-11->code-mapper code->iso-8859-11-mapper (#xA1 #x0E01) ; THAI CHARACTER KO KAI @@ -1071,9 +1062,6 @@ (instantiate-octets-definition define-iso-8859-11->string) -(add-external-format-funs '(:iso-8859-11 :|iso-8859-11|) - '(iso-8859-11->string-aref string->iso-8859-11)) - (define-external-format (:iso-8859-11 :|iso-8859-11|) 1 t (let ((iso-8859-11-byte (code->iso-8859-11-mapper bits))) @@ -1083,7 +1071,9 @@ (let ((code (iso-8859-11->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-11->string-aref + string->iso-8859-11) ;; TODO -- error check (define-unibyte-mapper iso-8859-13->code-mapper code->iso-8859-13-mapper (#xA1 #x201D) ; RIGHT DOUBLE QUOTATION MARK @@ -1173,9 +1163,6 @@ (instantiate-octets-definition define-iso-8859-13->string) -(add-external-format-funs '(:iso-8859-13 :|iso-8859-13| :latin-7 :|latin-7|) - '(iso-8859-13->string-aref string->iso-8859-13)) - (define-external-format (:iso-8859-13 :|iso-8859-13| :latin-7 :|latin-7|) 1 t (let ((iso-8859-13-byte (code->iso-8859-13-mapper bits))) @@ -1185,7 +1172,9 @@ (let ((code (iso-8859-13->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-13->string-aref + string->iso-8859-13) ;; TODO -- error check (define-unibyte-mapper iso-8859-14->code-mapper code->iso-8859-14-mapper (#xA1 #x1E02) ; LATIN CAPITAL LETTER B WITH DOT ABOVE @@ -1250,9 +1239,6 @@ (instantiate-octets-definition define-iso-8859-14->string) -(add-external-format-funs '(:iso-8859-14 :|iso-8859-14| :latin-8 :|latin-8|) - '(iso-8859-14->string-aref string->iso-8859-14)) - (define-external-format (:iso-8859-14 :|iso-8859-14| :latin-8 :|latin-8|) 1 t (let ((iso-8859-14-byte (code->iso-8859-14-mapper bits))) @@ -1262,4 +1248,59 @@ (let ((code (iso-8859-14->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + iso-8859-14->string-aref + string->iso-8859-14) ;; TODO -- error check + +(define-unibyte-mapper + latin9->code-mapper + code->latin9-mapper + (#xA4 #x20AC) + (#xA6 #x0160) + (#xA8 #x0161) + (#xB4 #x017D) + (#xB8 #x017E) + (#xBC #x0152) + (#xBD #x0153) + (#xBE #x0178)) + +(declaim (inline get-latin9-bytes)) +(defun get-latin9-bytes (string pos) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range pos)) + (get-latin-bytes #'code->latin9-mapper :latin-9 string pos)) + +(defun string->latin9 (string sstart send null-padding) + (declare (optimize speed (safety 0)) + (type simple-string string) + (type array-range sstart send)) + (values (string->latin% string sstart send #'get-latin9-bytes null-padding))) + +(defmacro define-latin9->string* (accessor type) + (declare (ignore type)) + (let ((name (make-od-name 'latin9->string* accessor))) + `(progn + (defun ,name (string sstart send array astart aend) + (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'latin9->code-mapper))))) +(instantiate-octets-definition define-latin9->string*) + +(defmacro define-latin9->string (accessor type) + (declare (ignore type)) + `(defun ,(make-od-name 'latin9->string accessor) (array astart aend) + (,(make-od-name 'latin->string accessor) array astart aend #'latin9->code-mapper))) + (instantiate-octets-definition define-latin9->string) + +;;; The names for latin9 are different due to a historical accident. +(define-external-format (:latin-9 :latin9 :iso-8859-15 :iso8859-15) + 1 t + (let ((latin-9-byte (code->latin9-mapper bits))) + (if latin-9-byte + (setf (sap-ref-8 sap tail) latin-9-byte) + (external-format-encoding-error stream bits))) + (let ((code (latin9->code-mapper byte))) + (if code + (code-char code) + (external-format-decoding-error stream byte))) + latin9->string-aref + string->latin9) diff --git a/src/code/external-formats/enc-win.lisp b/src/code/external-formats/enc-win.lisp index 6fca638..82e293a 100644 --- a/src/code/external-formats/enc-win.lisp +++ b/src/code/external-formats/enc-win.lisp @@ -111,9 +111,6 @@ (instantiate-octets-definition define-cp1250->string) -(add-external-format-funs '(:cp1250 :|cp1250| :windows-1250 :|windows-1250|) - '(cp1250->string-aref string->cp1250)) - (define-external-format (:cp1250 :|cp1250| :windows-1250 :|windows-1250|) 1 t (let ((cp1250-byte (code->cp1250-mapper bits))) @@ -123,7 +120,9 @@ (let ((code (cp1250->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1250->string-aref + string->cp1250) ;; TODO -- error check (define-unibyte-mapper cp1251->code-mapper code->cp1251-mapper (#x80 #x0402) ; CYRILLIC CAPITAL LETTER DJE @@ -270,9 +269,6 @@ (instantiate-octets-definition define-cp1251->string) -(add-external-format-funs '(:cp1251 :|cp1251| :windows-1251 :|windows-1251|) - '(cp1251->string-aref string->cp1251)) - (define-external-format (:cp1251 :|cp1251| :windows-1251 :|windows-1251|) 1 t (let ((cp1251-byte (code->cp1251-mapper bits))) @@ -282,7 +278,9 @@ (let ((code (cp1251->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1251->string-aref + string->cp1251) ;; TODO -- error check (define-unibyte-mapper cp1252->code-mapper code->cp1252-mapper (#x80 #x20AC) ; EURO SIGN @@ -348,9 +346,6 @@ (instantiate-octets-definition define-cp1252->string) -(add-external-format-funs '(:cp1252 :|cp1252| :windows-1252 :|windows-1252|) - '(cp1252->string-aref string->cp1252)) - (define-external-format (:cp1252 :|cp1252| :windows-1252 :|windows-1252|) 1 t (let ((cp1252-byte (code->cp1252-mapper bits))) @@ -360,7 +355,9 @@ (let ((code (cp1252->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1252->string-aref + string->cp1252) ;; TODO -- error check (define-unibyte-mapper cp1253->code-mapper code->cp1253-mapper (#x80 #x20AC) ; EURO SIGN @@ -501,9 +498,6 @@ (instantiate-octets-definition define-cp1253->string) -(add-external-format-funs '(:cp1253 :|cp1253| :windows-1253 :|windows-1253|) - '(cp1253->string-aref string->cp1253)) - (define-external-format (:cp1253 :|cp1253| :windows-1253 :|windows-1253|) 1 t (let ((cp1253-byte (code->cp1253-mapper bits))) @@ -513,7 +507,9 @@ (let ((code (cp1253->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1253->string-aref + string->cp1253) ;; TODO -- error check (define-unibyte-mapper cp1254->code-mapper code->cp1254-mapper (#x80 #x20AC) ; EURO SIGN @@ -585,9 +581,6 @@ (instantiate-octets-definition define-cp1254->string) -(add-external-format-funs '(:cp1254 :|cp1254| :windows-1254 :|windows-1254|) - '(cp1254->string-aref string->cp1254)) - (define-external-format (:cp1254 :|cp1254|) 1 t (let ((cp1254-byte (code->cp1254-mapper bits))) @@ -597,7 +590,9 @@ (let ((code (cp1254->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1254->string-aref + string->cp1254) ;; TODO -- error check (define-unibyte-mapper cp1255->code-mapper code->cp1255-mapper (#x80 #x20AC) ; EURO SIGN @@ -730,9 +725,6 @@ (instantiate-octets-definition define-cp1255->string) -(add-external-format-funs '(:cp1255 :|cp1255| :windows-1255 :|windows-1255|) - '(cp1255->string-aref string->cp1255)) - (define-external-format (:cp1255 :|cp1255| :windows-1255 :|windows-1255|) 1 t (let ((cp1255-byte (code->cp1255-mapper bits))) @@ -742,7 +734,9 @@ (let ((code (cp1255->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1255->string-aref + string->cp1255) ;; TODO -- error check (define-unibyte-mapper cp1256->code-mapper code->cp1256-mapper (#x80 #x20AC) ; EURO SIGN @@ -861,10 +855,7 @@ (instantiate-octets-definition define-cp1256->string) -(add-external-format-funs '(:cp1256 :|cp1256| :windows-1256 :|windows-1256|) - '(cp1256->string-aref string->cp1256)) - -(define-external-format (:cp1256 :|cp1256|) +(define-external-format (:cp1256 :|cp1256| :windows-1256 :|windows-1256|) 1 t (let ((cp1256-byte (code->cp1256-mapper bits))) (if cp1256-byte @@ -873,7 +864,9 @@ (let ((code (cp1256->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1256->string-aref + string->cp1256) ;; TODO -- error check (define-unibyte-mapper cp1257->code-mapper code->cp1257-mapper (#x80 #x20AC) ; EURO SIGN @@ -994,9 +987,6 @@ (instantiate-octets-definition define-cp1257->string) -(add-external-format-funs '(:cp1257 :|cp1257| :windows-1257 :|windows-1257|) - '(cp1257->string-aref string->cp1257)) - (define-external-format (:cp1257 :|cp1257| :windows-1257 :|windows-1257|) 1 t (let ((cp1257-byte (code->cp1257-mapper bits))) @@ -1006,7 +996,9 @@ (let ((code (cp1257->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1257->string-aref + string->cp1257) ;; TODO -- error check (define-unibyte-mapper cp1258->code-mapper code->cp1258-mapper (#x80 #x20AC) ; EURO SIGN @@ -1086,9 +1078,6 @@ (instantiate-octets-definition define-cp1258->string) -(add-external-format-funs '(:cp1258 :|cp1258| :windows-1258 :|windows-1258|) - '(cp1258->string-aref string->cp1258)) - (define-external-format (:cp1258 :|cp1258| :windows-1258 :|windows-1258|) 1 t (let ((cp1258-byte (code->cp1258-mapper bits))) @@ -1098,4 +1087,7 @@ (let ((code (cp1258->code-mapper byte))) (if code (code-char code) - (external-format-decoding-error stream byte)))) ;; TODO -- error check + (external-format-decoding-error stream byte))) + cp1258->string-aref + string->cp1258) ;; TODO -- error check + diff --git a/src/code/external-formats/mb-util.lisp b/src/code/external-formats/mb-util.lisp index e60dfed..e883099 100644 --- a/src/code/external-formats/mb-util.lisp +++ b/src/code/external-formats/mb-util.lisp @@ -183,38 +183,6 @@ (define-mb->string (make-od-name-list 'define format '>string))) `(progn - ;; for fd-stream.lisp - (define-external-format/variable-width ,aliases t - (mb-char-len (or (,ucs-to-mb (char-code byte)) -1)) - (let ((mb (,ucs-to-mb bits))) - (if (null mb) - (external-format-encoding-error stream byte) - (ecase size - (1 (setf (sap-ref-8 sap tail) mb)) - (2 (setf (sap-ref-8 sap tail) (ldb (byte 8 8) mb) - (sap-ref-8 sap (1+ tail)) (ldb (byte 8 0) mb))) - (3 (setf (sap-ref-8 sap tail) (ldb (byte 8 16) mb) - (sap-ref-8 sap (1+ tail)) (ldb (byte 8 8) mb) - (sap-ref-8 sap (+ 2 tail)) (ldb (byte 8 0) mb)))))) - (,mb-len byte) - (let* ((mb (ecase size - (1 byte) - (2 (let ((byte2 (sap-ref-8 sap (1+ head)))) - (unless (,mb-continuation-byte-p byte2) - (return-from decode-break-reason 2)) - (dpb byte (byte 8 8) byte2))) - (3 (let ((byte2 (sap-ref-8 sap (1+ head))) - (byte3 (sap-ref-8 sap (+ 2 head)))) - (unless (,mb-continuation-byte-p byte2) - (return-from decode-break-reason 2)) - (unless (,mb-continuation-byte-p byte3) - (return-from decode-break-reason 3)) - (dpb byte (byte 8 16) (dpb byte2 (byte 8 8) byte3)))))) - (ucs (,mb-to-ucs mb))) - (if (null ucs) - (return-from decode-break-reason 1) - (code-char ucs)))) - ;; for octets.lisp (define-condition ,(make-od-name 'malformed format) (octet-decoding-error) ()) @@ -276,7 +244,36 @@ (instantiate-octets-definition ,define-mb->string) - (add-external-format-funs ',aliases - '(,(make-od-name format '>string-aref) - ,string->mb)) - ))) + ;; for fd-stream.lisp + (define-external-format/variable-width ,aliases t + (mb-char-len (or (,ucs-to-mb (char-code byte)) -1)) + (let ((mb (,ucs-to-mb bits))) + (if (null mb) + (external-format-encoding-error stream byte) + (ecase size + (1 (setf (sap-ref-8 sap tail) mb)) + (2 (setf (sap-ref-8 sap tail) (ldb (byte 8 8) mb) + (sap-ref-8 sap (1+ tail)) (ldb (byte 8 0) mb))) + (3 (setf (sap-ref-8 sap tail) (ldb (byte 8 16) mb) + (sap-ref-8 sap (1+ tail)) (ldb (byte 8 8) mb) + (sap-ref-8 sap (+ 2 tail)) (ldb (byte 8 0) mb)))))) + (,mb-len byte) + (let* ((mb (ecase size + (1 byte) + (2 (let ((byte2 (sap-ref-8 sap (1+ head)))) + (unless (,mb-continuation-byte-p byte2) + (return-from decode-break-reason 2)) + (dpb byte (byte 8 8) byte2))) + (3 (let ((byte2 (sap-ref-8 sap (1+ head))) + (byte3 (sap-ref-8 sap (+ 2 head)))) + (unless (,mb-continuation-byte-p byte2) + (return-from decode-break-reason 2)) + (unless (,mb-continuation-byte-p byte3) + (return-from decode-break-reason 3)) + (dpb byte (byte 8 16) (dpb byte2 (byte 8 8) byte3)))))) + (ucs (,mb-to-ucs mb))) + (if (null ucs) + (return-from decode-break-reason 1) + (code-char ucs))) + ,(make-od-name format '>string-aref) + ,string->mb)))) diff --git a/src/code/external-formats/ucs-2.lisp b/src/code/external-formats/ucs-2.lisp index fc1d4bd..8eefcef 100644 --- a/src/code/external-formats/ucs-2.lisp +++ b/src/code/external-formats/ucs-2.lisp @@ -34,26 +34,6 @@ (sap-ref-8 sap offset) (ldb (byte 8 8) value))) ;;; -;;; Define external format: fd-stream -;;; -(define-external-format/variable-width (:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) nil - 2 - (if (< bits #x10000) - (setf (sap-ref-16le sap tail) bits) - (external-format-encoding-error stream bits)) - 2 - (code-char (sap-ref-16le sap head))) - -(define-external-format/variable-width (:ucs-2be :ucs2be) nil - 2 - (if (< bits #x10000) - (setf (sap-ref-16be sap tail) bits) - (external-format-encoding-error stream bits)) - 2 - (code-char (sap-ref-16be sap head))) - - -;;; ;;; octets ;;; @@ -209,8 +189,22 @@ (instantiate-octets-definition define-ucs-2->string) -(add-external-format-funs '(:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) - '(ucs-2le->string-aref string->ucs-2le)) +(define-external-format/variable-width (:ucs-2le :ucs2le #!+win32 :ucs2 #!+win32 :ucs-2) nil + 2 + (if (< bits #x10000) + (setf (sap-ref-16le sap tail) bits) + (external-format-encoding-error stream bits)) + 2 + (code-char (sap-ref-16le sap head)) + ucs-2le->string-aref + string->ucs-2le) -(add-external-format-funs '(:ucs-2be :ucs2be) - '(ucs-2be->string-aref string->ucs-2be)) +(define-external-format/variable-width (:ucs-2be :ucs2be) nil + 2 + (if (< bits #x10000) + (setf (sap-ref-16be sap tail) bits) + (external-format-encoding-error stream bits)) + 2 + (code-char (sap-ref-16be sap head)) + ucs-2be->string-aref + string->ucs-2be) diff --git a/src/code/fd-stream.lisp b/src/code/fd-stream.lisp index 4c978bc..cc026d4 100644 --- a/src/code/fd-stream.lisp +++ b/src/code/fd-stream.lisp @@ -723,20 +723,44 @@ (setf (fd-stream-char-pos stream) (- end last-newline 1)) (incf (fd-stream-char-pos stream) (- end start)))))) -(defvar *external-formats* () +(defstruct (external-format + (:constructor %make-external-format) + (:conc-name ef-) + (:predicate external-format-p) + (:copier nil)) + ;; All the names that can refer to this external format. The first + ;; one is the canonical name. + (names (missing-arg) :type list :read-only t) + (read-n-chars-fun (missing-arg) :type function :read-only t) + (read-char-fun (missing-arg) :type function :read-only t) + (write-n-bytes-fun (missing-arg) :type function :read-only t) + (write-char-none-buffered-fun (missing-arg) :type function :read-only t) + (write-char-line-buffered-fun (missing-arg) :type function :read-only t) + (write-char-full-buffered-fun (missing-arg) :type function :read-only t) + ;; Can be nil for fixed-width formats. + (resync-fun nil :type (or function null) :read-only t) + (bytes-for-char-fun (missing-arg) :type function :read-only t) + (read-c-string-fun (missing-arg) :type function :read-only t) + (write-c-string-fun (missing-arg) :type function :read-only t) + ;; We make these symbols so that a developer working on the octets + ;; code can easily redefine things and use the new function definition + ;; without redefining the external format as well. The slots above + ;; are functions because a developer working with those slots would be + ;; redefining the external format anyway. + (octets-to-string-sym (missing-arg) :type symbol :read-only t) + (string-to-octets-sym (missing-arg) :type symbol :read-only t)) + +(defvar *external-formats* (make-hash-table) #!+sb-doc - "List of all available external formats. Each element is a list of the - element-type, string input function name, character input function name, - and string output function name.") + "Hashtable of all available external formats. The table maps from + external-format names to EXTERNAL-FORMAT structures.") (defun get-external-format (external-format) - (dolist (entry *external-formats*) - (when (member external-format (first entry)) - (return entry)))) + (gethash external-format *external-formats*)) -(defun get-external-format-function (external-format index) - (let ((entry (get-external-format external-format))) - (when entry (nth index entry)))) +(defun get-external-format-or-lose (external-format) + (or (get-external-format external-format) + (error "Undefined external-format ~A" external-format))) ;;; Find an output routine to use given the type and buffering. Return ;;; as multiple values the routine, the real type transfered, and the @@ -746,15 +770,14 @@ (let ((entry (get-external-format external-format))) (when entry (return-from pick-output-routine - (values (symbol-function (nth (ecase buffering - (:none 4) - (:line 5) - (:full 6)) - entry)) + (values (ecase buffering + (:none (ef-write-char-none-buffered-fun entry)) + (:line (ef-write-char-line-buffered-fun entry)) + (:full (ef-write-char-full-buffered-fun entry))) 'character 1 - (symbol-function (fourth entry)) - (first (first entry))))))) + (ef-write-n-bytes-fun entry) + (first (ef-names entry))))))) (dolist (entry *output-routines*) (when (and (subtypep type (first entry)) (eq buffering (second entry)) @@ -1092,14 +1115,14 @@ ;;; bytes per element (and for character types string input routine). (defun pick-input-routine (type &optional external-format) (when (subtypep type 'character) - (dolist (entry *external-formats*) - (when (member external-format (first entry)) + (let ((entry (get-external-format external-format))) + (when entry (return-from pick-input-routine - (values (symbol-function (third entry)) + (values (ef-read-char-fun entry) 'character 1 - (symbol-function (second entry)) - (first (first entry))))))) + (ef-read-n-chars-fun entry) + (first (ef-names entry))))))) (dolist (entry *input-routines*) (when (and (subtypep type (first entry)) (or (not (fourth entry)) @@ -1202,15 +1225,14 @@ )))) (defun fd-stream-resync (stream) - (dolist (entry *external-formats*) - (when (member (fd-stream-external-format stream) (first entry)) - (return-from fd-stream-resync - (funcall (symbol-function (eighth entry)) stream))))) + (let ((entry (get-external-format (fd-stream-external-format stream)))) + (when entry + (funcall (ef-resync-fun entry) stream)))) (defun get-fd-stream-character-sizer (stream) - (dolist (entry *external-formats*) - (when (member (fd-stream-external-format stream) (first entry)) - (return-from get-fd-stream-character-sizer (ninth entry))))) + (let ((entry (get-external-format (fd-stream-external-format stream)))) + (when entry + (ef-bytes-for-char-fun entry)))) (defun fd-stream-character-size (stream char) (let ((sizer (get-fd-stream-character-sizer stream))) @@ -1223,17 +1245,18 @@ (defun find-external-format (external-format) (when external-format - (find external-format *external-formats* :test #'member :key #'car))) + (get-external-format external-format))) (defun variable-width-external-format-p (ef-entry) - (when (eighth ef-entry) t)) + (and ef-entry (not (null (ef-resync-fun ef-entry))))) (defun bytes-for-char-fun (ef-entry) - (if ef-entry (symbol-function (ninth ef-entry)) (constantly 1))) + (if ef-entry (ef-bytes-for-char-fun ef-entry) (constantly 1))) -;;; FIXME: OAOOM here vrt. *EXTERNAL-FORMAT-FUNCTIONS* in fd-stream.lisp (defmacro define-external-format (external-format size output-restart - out-expr in-expr) + out-expr in-expr + octets-to-string-sym + string-to-octets-sym) (let* ((name (first external-format)) (out-function (symbolicate "OUTPUT-BYTES/" name)) (format (format nil "OUTPUT-CHAR-~A-~~A-BUFFERED" (string name))) @@ -1398,18 +1421,28 @@ (declare (ignorable bits byte)) ,out-expr))) ,n-buffer))) - (setf *external-formats* - (cons '(,external-format ,in-function ,in-char-function ,out-function - ,@(mapcar #'(lambda (buffering) - (intern (format nil format (string buffering)))) - '(:none :line :full)) - nil ; no resync-function - ,size-function ,read-c-string-function ,output-c-string-function) - *external-formats*))))) + (let ((entry (%make-external-format + :names ',external-format + :read-n-chars-fun #',in-function + :read-char-fun #',in-char-function + :write-n-bytes-fun #',out-function + ,@(mapcan #'(lambda (buffering) + (list (intern (format nil "WRITE-CHAR-~A-BUFFERED-FUN" buffering) :keyword) + `#',(intern (format nil format (string buffering))))) + '(:none :line :full)) + :resync-fun nil + :bytes-for-char-fun #',size-function + :read-c-string-fun #',read-c-string-function + :write-c-string-fun #',output-c-string-function + :octets-to-string-sym ',octets-to-string-sym + :string-to-octets-sym ',string-to-octets-sym))) + (dolist (ef ',external-format) + (setf (gethash ef *external-formats*) entry)))))) (defmacro define-external-format/variable-width (external-format output-restart out-size-expr - out-expr in-size-expr in-expr) + out-expr in-size-expr in-expr + octets-to-string-sym string-to-octets-sym) (let* ((name (first external-format)) (out-function (symbolicate "OUTPUT-BYTES/" name)) (format (format nil "OUTPUT-CHAR-~A-~~A-BUFFERED" (string name))) @@ -1651,109 +1684,23 @@ ,out-expr))) ,n-buffer))) - (setf *external-formats* - (cons '(,external-format ,in-function ,in-char-function ,out-function - ,@(mapcar #'(lambda (buffering) - (intern (format nil format (string buffering)))) - '(:none :line :full)) - ,resync-function - ,size-function ,read-c-string-function ,output-c-string-function) - *external-formats*))))) - -;;; Multiple names for the :ISO{,-}8859-* families are needed because on -;;; FreeBSD (and maybe other BSD systems), nl_langinfo("LATIN-1") will -;;; return "ISO8859-1" instead of "ISO-8859-1". -(define-external-format (:latin-1 :latin1 :iso-8859-1 :iso8859-1) - 1 t - (if (>= bits 256) - (external-format-encoding-error stream bits) - (setf (sap-ref-8 sap tail) bits)) - (code-char byte)) - -(define-external-format (:ascii :us-ascii :ansi_x3.4-1968 - :iso-646 :iso-646-us :|646|) - 1 t - (if (>= bits 128) - (external-format-encoding-error stream bits) - (setf (sap-ref-8 sap tail) bits)) - (code-char byte)) - -#!+sb-unicode -(let ((latin-9-table (let ((table (make-string 256))) - (do ((i 0 (1+ i))) - ((= i 256)) - (setf (aref table i) (code-char i))) - (setf (aref table #xa4) (code-char #x20ac)) - (setf (aref table #xa6) (code-char #x0160)) - (setf (aref table #xa8) (code-char #x0161)) - (setf (aref table #xb4) (code-char #x017d)) - (setf (aref table #xb8) (code-char #x017e)) - (setf (aref table #xbc) (code-char #x0152)) - (setf (aref table #xbd) (code-char #x0153)) - (setf (aref table #xbe) (code-char #x0178)) - table)) - (latin-9-reverse-1 (make-array 16 - :element-type '(unsigned-byte 21) - :initial-contents '(#x0160 #x0161 #x0152 #x0153 0 0 0 0 #x0178 0 0 0 #x20ac #x017d #x017e 0))) - (latin-9-reverse-2 (make-array 16 - :element-type '(unsigned-byte 8) - :initial-contents '(#xa6 #xa8 #xbc #xbd 0 0 0 0 #xbe 0 0 0 #xa4 #xb4 #xb8 0)))) - (define-external-format (:latin-9 :latin9 :iso-8859-15 :iso8859-15) - 1 t - (setf (sap-ref-8 sap tail) - (if (< bits 256) - (if (= bits (char-code (aref latin-9-table bits))) - bits - (external-format-encoding-error stream byte)) - (if (= (aref latin-9-reverse-1 (logand bits 15)) bits) - (aref latin-9-reverse-2 (logand bits 15)) - (external-format-encoding-error stream byte)))) - (aref latin-9-table byte))) - -(define-external-format/variable-width (:utf-8 :utf8) nil - (let ((bits (char-code byte))) - (cond ((< bits #x80) 1) - ((< bits #x800) 2) - ((< bits #x10000) 3) - (t 4))) - (ecase size - (1 (setf (sap-ref-8 sap tail) bits)) - (2 (setf (sap-ref-8 sap tail) (logior #xc0 (ldb (byte 5 6) bits)) - (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 0) bits)))) - (3 (setf (sap-ref-8 sap tail) (logior #xe0 (ldb (byte 4 12) bits)) - (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 6) bits)) - (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 0) bits)))) - (4 (setf (sap-ref-8 sap tail) (logior #xf0 (ldb (byte 3 18) bits)) - (sap-ref-8 sap (+ 1 tail)) (logior #x80 (ldb (byte 6 12) bits)) - (sap-ref-8 sap (+ 2 tail)) (logior #x80 (ldb (byte 6 6) bits)) - (sap-ref-8 sap (+ 3 tail)) (logior #x80 (ldb (byte 6 0) bits))))) - (cond ((< byte #x80) 1) - ((< byte #xc2) (return-from decode-break-reason 1)) - ((< byte #xe0) 2) - ((< byte #xf0) 3) - (t 4)) - (code-char (ecase size - (1 byte) - (2 (let ((byte2 (sap-ref-8 sap (1+ head)))) - (unless (<= #x80 byte2 #xbf) - (return-from decode-break-reason 2)) - (dpb byte (byte 5 6) byte2))) - (3 (let ((byte2 (sap-ref-8 sap (1+ head))) - (byte3 (sap-ref-8 sap (+ 2 head)))) - (unless (and (<= #x80 byte2 #xbf) - (<= #x80 byte3 #xbf)) - (return-from decode-break-reason 3)) - (dpb byte (byte 4 12) (dpb byte2 (byte 6 6) byte3)))) - (4 (let ((byte2 (sap-ref-8 sap (1+ head))) - (byte3 (sap-ref-8 sap (+ 2 head))) - (byte4 (sap-ref-8 sap (+ 3 head)))) - (unless (and (<= #x80 byte2 #xbf) - (<= #x80 byte3 #xbf) - (<= #x80 byte4 #xbf)) - (return-from decode-break-reason 4)) - (dpb byte (byte 3 18) - (dpb byte2 (byte 6 12) - (dpb byte3 (byte 6 6) byte4)))))))) + (let ((entry (%make-external-format + :names ',external-format + :read-n-chars-fun #',in-function + :read-char-fun #',in-char-function + :write-n-bytes-fun #',out-function + ,@(mapcan #'(lambda (buffering) + (list (intern (format nil "WRITE-CHAR-~A-BUFFERED-FUN" buffering) :keyword) + `#',(intern (format nil format (string buffering))))) + '(:none :line :full)) + :resync-fun #',resync-function + :bytes-for-char-fun #',size-function + :read-c-string-fun #',read-c-string-function + :write-c-string-fun #',output-c-string-function + :octets-to-string-sym ',octets-to-string-sym + :string-to-octets-sym ',string-to-octets-sym))) + (dolist (ef ',external-format) + (setf (gethash ef *external-formats*) entry)))))) ;;;; utility functions (misc routines, etc) diff --git a/src/code/host-c-call.lisp b/src/code/host-c-call.lisp index 5639751..d688879 100644 --- a/src/code/host-c-call.lisp +++ b/src/code/host-c-call.lisp @@ -55,11 +55,11 @@ ;; changes. (alien-c-string-type-external-format type)))) (not (and external-format - (or (eq (caar external-format) :ascii) + (or (eq (first (sb!impl::ef-names external-format)) :ascii) ;; On non-SB-UNICODE all latin-1 codepoints will fit ;; into a base-char, on SB-UNICODE they won't. #!-sb-unicode - (eq (caar external-format) :latin-1)))))) + (eq (first (sb!impl::ef-names external-format)) :latin-1)))))) (define-alien-type-method (c-string :naturalize-gen) (type alien) `(if (zerop (sap-int ,alien)) diff --git a/src/code/octets.lisp b/src/code/octets.lisp index b860d0f..ce032f6 100644 --- a/src/code/octets.lisp +++ b/src/code/octets.lisp @@ -235,19 +235,6 @@ one-past-the-end" (aref code-to-byte-table (1+ low)) nil))))))))) -#!+sb-unicode -(define-unibyte-mapper - latin9->code-mapper - code->latin9-mapper - (#xA4 #x20AC) - (#xA6 #x0160) - (#xA8 #x0161) - (#xB4 #x017D) - (#xB8 #x017E) - (#xBC #x0152) - (#xBD #x0153) - (#xBE #x0178)) - (declaim (inline get-latin-bytes)) (defun get-latin-bytes (mapper external-format string pos) (let ((code (funcall mapper (char-code (char string pos))))) @@ -258,37 +245,6 @@ one-past-the-end" (encoding-error external-format string pos))) 1))) -(declaim (inline code->ascii-mapper)) -(defun code->ascii-mapper (code) - (declare (optimize speed (safety 0)) - (type char-code code)) - (if (> code 127) - nil - code)) - -(declaim (inline get-ascii-bytes)) -(defun get-ascii-bytes (string pos) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range pos)) - (get-latin-bytes #'code->ascii-mapper :ascii string pos)) - -(declaim (inline get-latin1-bytes)) -(defun get-latin1-bytes (string pos) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range pos)) - (get-latin-bytes #'identity :latin-1 string pos)) - -#!+sb-unicode -(progn - (declaim (inline get-latin9-bytes)) - (defun get-latin9-bytes (string pos) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range pos)) - (get-latin-bytes #'code->latin9-mapper :latin-9 string pos))) - (declaim (inline string->latin%)) (defun string->latin% (string sstart send get-bytes null-padding) (declare (optimize speed) @@ -341,127 +297,11 @@ one-past-the-end" (unless (zerop null-padding) (vector-push-extend 0 new-octets)) (copy-seq new-octets)))))))) - -(defun string->ascii (string sstart send null-padding) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range sstart send)) - (values (string->latin% string sstart send #'get-ascii-bytes null-padding))) - -(defun string->latin1 (string sstart send null-padding) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range sstart send)) - (values (string->latin% string sstart send #'get-latin1-bytes null-padding))) - -#!+sb-unicode -(defun string->latin9 (string sstart send null-padding) - (declare (optimize speed (safety 0)) - (type simple-string string) - (type array-range sstart send)) - (values (string->latin% string sstart send #'get-latin9-bytes null-padding))) - -;;; to utf8 - -(declaim (inline char-len-as-utf8)) -(defun char-len-as-utf8 (code) - (declare (optimize speed (safety 0)) - (type (integer 0 (#.sb!xc:char-code-limit)) code)) - (cond ((< code 0) (bug "can't happen")) - ((< code #x80) 1) - ((< code #x800) 2) - ((< code #x10000) 3) - ((< code #x110000) 4) - (t (bug "can't happen")))) - -(defun string->utf8 (string sstart send null-padding) - (declare (optimize (speed 3) (safety 0)) - (type simple-string string) - (type (integer 0 1) null-padding) - (type array-range sstart send)) - (macrolet ((ascii-bash () - '(let ((array (make-array (+ null-padding (- send sstart)) - :element-type '(unsigned-byte 8)))) - (loop for i from 0 - and j from sstart below send - do (setf (aref array i) (char-code (char string j)))) - array))) - (etypecase string - ((simple-array character (*)) - (let ((utf8-length 0)) - ;; Since it has to fit in a vector, it must be a fixnum! - (declare (type (and unsigned-byte fixnum) utf8-length)) - (loop for i of-type index from sstart below send - do (incf utf8-length (char-len-as-utf8 (char-code (char string i))))) - (if (= utf8-length (- send sstart)) - (ascii-bash) - (let ((array (make-array (+ null-padding utf8-length) - :element-type '(unsigned-byte 8))) - (index 0)) - (declare (type index index)) - (flet ((add-byte (b) - (setf (aref array index) b) - (incf index))) - (declare (inline add-byte)) - (loop for i of-type index from sstart below send - do (let ((code (char-code (char string i)))) - (case (char-len-as-utf8 code) - (1 - (add-byte code)) - (2 - (add-byte (logior #b11000000 (ldb (byte 5 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) - (3 - (add-byte (logior #b11100000 (ldb (byte 4 12) code))) - (add-byte (logior #b10000000 (ldb (byte 6 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))) - (4 - (add-byte (logior #b11110000 (ldb (byte 3 18) code))) - (add-byte (logior #b10000000 (ldb (byte 6 12) code))) - (add-byte (logior #b10000000 (ldb (byte 6 6) code))) - (add-byte (logior #b10000000 (ldb (byte 6 0) code)))))) - finally (return array))))))) - #!+sb-unicode - ((simple-array base-char (*)) - ;; On unicode builds BASE-STRINGs are limited to ASCII range, - ;; so we can take a fast path -- and get benefit of the element - ;; type information. On non-unicode build BASE-CHAR == - ;; CHARACTER. - (ascii-bash)) - ((simple-array nil (*)) - (if (= send sstart) - (make-array 0 :element-type '(unsigned-byte 8)) - ;; Just get the error... - (aref string sstart)))))) ;;;; to-string conversions ;;; from latin (including ascii) -(defmacro define-ascii->string (accessor type) - (let ((name (make-od-name 'ascii->string accessor))) - `(progn - (defun ,name (array astart aend) - (declare (optimize speed) - (type ,type array) - (type array-range astart aend)) - ;; Since there is such a thing as a malformed ascii byte, a - ;; simple "make the string, fill it in" won't do. - (let ((string (make-array 0 :element-type 'character :fill-pointer 0 :adjustable t))) - (loop for apos from astart below aend - do (let* ((code (,accessor array apos)) - (string-content - (if (< code 128) - (code-char code) - (decoding-error array apos (1+ apos) :ascii - 'malformed-ascii apos)))) - (if (characterp string-content) - (vector-push-extend string-content string) - (loop for c across string-content - do (vector-push-extend c string)))) - finally (return (coerce string 'simple-string)))))))) -(instantiate-octets-definition define-ascii->string) - (defmacro define-latin->string* (accessor type) (let ((name (make-od-name 'latin->string* accessor))) `(progn @@ -479,24 +319,6 @@ one-past-the-end" finally (return (values string spos apos))))))) (instantiate-octets-definition define-latin->string*) -(defmacro define-latin1->string* (accessor type) - (declare (ignore type)) - (let ((name (make-od-name 'latin1->string* accessor))) - `(progn - (defun ,name (string sstart send array astart aend) - (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'identity))))) -(instantiate-octets-definition define-latin1->string*) - -#!+sb-unicode -(progn - (defmacro define-latin9->string* (accessor type) - (declare (ignore type)) - (let ((name (make-od-name 'latin9->string* accessor))) - `(progn - (defun ,name (string sstart send array astart aend) - (,(make-od-name 'latin->string* accessor) string sstart send array astart aend #'latin9->code-mapper))))) - (instantiate-octets-definition define-latin9->string*)) - (defmacro define-latin->string (accessor type) (let ((name (make-od-name 'latin->string accessor))) `(progn @@ -511,213 +333,6 @@ one-past-the-end" array astart aend mapper))))))) (instantiate-octets-definition define-latin->string) - -(defmacro define-latin1->string (accessor type) - (declare (ignore type)) - `(defun ,(make-od-name 'latin1->string accessor) (array astart aend) - (,(make-od-name 'latin->string accessor) array astart aend #'identity))) -(instantiate-octets-definition define-latin1->string) - -#!+sb-unicode -(progn - (defmacro define-latin9->string (accessor type) - (declare (ignore type)) - `(defun ,(make-od-name 'latin9->string accessor) (array astart aend) - (,(make-od-name 'latin->string accessor) array astart aend #'latin9->code-mapper))) - (instantiate-octets-definition define-latin9->string)) - -;;; from utf8 - -(defmacro define-bytes-per-utf8-character (accessor type) - (let ((name (make-od-name 'bytes-per-utf8-character accessor))) - `(progn - ;;(declaim (inline ,name)) - (let ((lexically-max - (string->utf8 (string (code-char ,(1- sb!xc:char-code-limit))) - 0 1 0))) - (declare (type (simple-array (unsigned-byte 8) (#!+sb-unicode 4 #!-sb-unicode 2)) lexically-max)) - (defun ,name (array pos end) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range pos end)) - ;; returns the number of bytes consumed and nil if it's a - ;; valid character or the number of bytes consumed and a - ;; replacement string if it's not. - (let ((initial-byte (,accessor array pos)) - (reject-reason nil) - (reject-position pos) - (remaining-bytes (- end pos))) - (declare (type array-range reject-position remaining-bytes)) - (labels ((valid-utf8-starter-byte-p (b) - (declare (type (unsigned-byte 8) b)) - (let ((ok (cond - ((zerop (logand b #b10000000)) 1) - ((= (logand b #b11100000) #b11000000) - 2) - ((= (logand b #b11110000) #b11100000) - 3) - ((= (logand b #b11111000) #b11110000) - 4) - ((= (logand b #b11111100) #b11111000) - 5) - ((= (logand b #b11111110) #b11111100) - 6) - (t - nil)))) - (unless ok - (setf reject-reason 'invalid-utf8-starter-byte)) - ok)) - (enough-bytes-left-p (x) - (let ((ok (> end (+ pos (1- x))))) - (unless ok - (setf reject-reason 'end-of-input-in-character)) - ok)) - (valid-secondary-p (x) - (let* ((idx (the array-range (+ pos x))) - (b (,accessor array idx)) - (ok (= (logand b #b11000000) #b10000000))) - (unless ok - (setf reject-reason 'invalid-utf8-continuation-byte) - (setf reject-position idx)) - ok)) - (preliminary-ok-for-length (maybe-len len) - (and (eql maybe-len len) - ;; Has to be done in this order so that - ;; certain broken sequences (e.g., the - ;; two-byte sequence `"initial (length 3)" - ;; "non-continuation"' -- `#xef #x32') - ;; signal only part of that sequence as - ;; erroneous. - (loop for i from 1 below (min len remaining-bytes) - always (valid-secondary-p i)) - (enough-bytes-left-p len))) - (overlong-chk (x y) - (let ((ok (or (/= initial-byte x) - (/= (logior (,accessor array (the array-range (+ pos 1))) - y) - y)))) - (unless ok - (setf reject-reason 'overlong-utf8-sequence)) - ok)) - (character-below-char-code-limit-p () - ;; This is only called on a four-byte sequence - ;; (two in non-unicode builds) to ensure we - ;; don't go over SBCL's character limts. - (let ((ok (cond ((< (aref lexically-max 0) (,accessor array pos)) - nil) - ((> (aref lexically-max 0) (,accessor array pos)) - t) - ((< (aref lexically-max 1) (,accessor array (+ pos 1))) - nil) - #!+sb-unicode - ((> (aref lexically-max 1) (,accessor array (+ pos 1))) - t) - #!+sb-unicode - ((< (aref lexically-max 2) (,accessor array (+ pos 2))) - nil) - #!+sb-unicode - ((> (aref lexically-max 2) (,accessor array (+ pos 2))) - t) - #!+sb-unicode - ((< (aref lexically-max 3) (,accessor array (+ pos 3))) - nil) - (t t)))) - (unless ok - (setf reject-reason 'character-out-of-range)) - ok))) - (declare (inline valid-utf8-starter-byte-p - enough-bytes-left-p - valid-secondary-p - preliminary-ok-for-length - overlong-chk)) - (let ((maybe-len (valid-utf8-starter-byte-p initial-byte))) - (cond ((eql maybe-len 1) - (values 1 nil)) - ((and (preliminary-ok-for-length maybe-len 2) - (overlong-chk #b11000000 #b10111111) - (overlong-chk #b11000001 #b10111111) - #!-sb-unicode (character-below-char-code-limit-p)) - (values 2 nil)) - ((and (preliminary-ok-for-length maybe-len 3) - (overlong-chk #b11100000 #b10011111) - #!-sb-unicode (not (setf reject-reason 'character-out-of-range))) - (values 3 nil)) - ((and (preliminary-ok-for-length maybe-len 4) - (overlong-chk #b11110000 #b10001111) - #!-sb-unicode (not (setf reject-reason 'character-out-of-range)) - (character-below-char-code-limit-p)) - (values 4 nil)) - ((and (preliminary-ok-for-length maybe-len 5) - (overlong-chk #b11111000 #b10000111) - (not (setf reject-reason 'character-out-of-range))) - (bug "can't happen")) - ((and (preliminary-ok-for-length maybe-len 6) - (overlong-chk #b11111100 #b10000011) - (not (setf reject-reason 'character-out-of-range))) - (bug "can't happen")) - (t - (let* ((bad-end (ecase reject-reason - (invalid-utf8-starter-byte - (1+ pos)) - (end-of-input-in-character - end) - (invalid-utf8-continuation-byte - reject-position) - ((overlong-utf8-sequence character-out-of-range) - (+ pos maybe-len)))) - (bad-len (- bad-end pos))) - (declare (type array-range bad-end bad-len)) - (let ((replacement (decoding-error array pos bad-end :utf-8 reject-reason reject-position))) - (values bad-len replacement))))))))))))) -(instantiate-octets-definition define-bytes-per-utf8-character) - -(defmacro define-simple-get-utf8-char (accessor type) - (let ((name (make-od-name 'simple-get-utf8-char accessor))) - `(progn - (declaim (inline ,name)) - (defun ,name (array pos bytes) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range pos) - (type (integer 1 4) bytes)) - (flet ((cref (x) - (,accessor array (the array-range (+ pos x))))) - (declare (inline cref)) - (code-char (ecase bytes - (1 (cref 0)) - (2 (logior (ash (ldb (byte 5 0) (cref 0)) 6) - (ldb (byte 6 0) (cref 1)))) - (3 (logior (ash (ldb (byte 4 0) (cref 0)) 12) - (ash (ldb (byte 6 0) (cref 1)) 6) - (ldb (byte 6 0) (cref 2)))) - (4 (logior (ash (ldb (byte 3 0) (cref 0)) 18) - (ash (ldb (byte 6 0) (cref 1)) 12) - (ash (ldb (byte 6 0) (cref 2)) 6) - (ldb (byte 6 0) (cref 3))))))))))) -(instantiate-octets-definition define-simple-get-utf8-char) - -(defmacro define-utf8->string (accessor type) - (let ((name (make-od-name 'utf8->string accessor))) - `(progn - (defun ,name (array astart aend) - (declare (optimize speed (safety 0)) - (type ,type array) - (type array-range astart aend)) - (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character))) - (loop with pos = astart - while (< pos aend) - do (multiple-value-bind (bytes invalid) - (,(make-od-name 'bytes-per-utf8-character accessor) array pos aend) - (declare (type (or null string) invalid)) - (cond - ((null invalid) - (vector-push-extend (,(make-od-name 'simple-get-utf8-char accessor) array pos bytes) string)) - (t - (dotimes (i (length invalid)) - (vector-push-extend (char invalid i) string)))) - (incf pos bytes))) - (coerce string 'simple-string)))))) -(instantiate-octets-definition define-utf8->string) ;;;; external formats @@ -744,59 +359,35 @@ one-past-the-end" #!+sb-show (cold-print external-format) (/show0 "matching to known aliases") - (dolist (entry *external-formats* - (progn - ;;; FIXME! This WARN would try to do printing - ;;; before the streams have been initialized, - ;;; causing an infinite erroring loop. We should - ;;; either print it by calling to C, or delay the - ;;; warning until later. Since we're in freeze - ;;; right now, and the warning isn't really - ;;; essential, I'm doing what's least likely to - ;;; cause damage, and commenting it out. This - ;;; should be revisited after 0.9.17. -- JES, - ;;; 2006-09-21 - #+nil - (warn "Invalid external-format ~A; using LATIN-1" - external-format) - (setf external-format :latin-1))) - (/show0 "cold printing known aliases:") - #!+sb-show - (dolist (alias (first entry)) (cold-print alias)) - (/show0 "done cold-printing known aliases") - (when (member external-format (first entry)) - (/show0 "matched") - (return))) + (let ((entry (sb!impl::get-external-format external-format))) + (cond + (entry + (/show0 "matched")) + (t + ;; FIXME! This WARN would try to do printing + ;; before the streams have been initialized, + ;; causing an infinite erroring loop. We should + ;; either print it by calling to C, or delay the + ;; warning until later. Since we're in freeze + ;; right now, and the warning isn't really + ;; essential, I'm doing what's least likely to + ;; cause damage, and commenting it out. This + ;; should be revisited after 0.9.17. -- JES, + ;; 2006-09-21 + #+nil + (warn "Invalid external-format ~A; using LATIN-1" + external-format) + (setf external-format :latin-1)))) (/show0 "/default external format ok") (setf *default-external-format* external-format)))) - -;;; FIXME: OAOOM here vrt. DEFINE-EXTERNAL-FORMAT in fd-stream.lisp -(defparameter *external-format-functions* (make-hash-table)) - -(defun add-external-format-funs (format-names funs) - (dolist (name format-names (values)) - (setf (gethash name *external-format-functions*) funs))) - -(add-external-format-funs - '(:ascii :us-ascii :ansi_x3.4-1968 :iso-646 :iso-646-us :|646|) - '(ascii->string-aref string->ascii)) -(add-external-format-funs - '(:latin1 :latin-1 :iso-8859-1 :iso8859-1) - '(latin1->string-aref string->latin1)) -#!+sb-unicode -(add-external-format-funs - '(:latin9 :latin-9 :iso-8859-15 :iso8859-15) - '(latin9->string-aref string->latin9)) -(add-external-format-funs '(:utf8 :utf-8) '(utf8->string-aref string->utf8)) - -(defun external-formats-funs (external-format) - (when (eql external-format :default) - (setf external-format (default-external-format))) - (or (gethash external-format *external-format-functions*) - (error "Unknown external-format ~S" external-format))) ;;;; public interface +(defun maybe-defaulted-external-format (external-format) + (sb!impl::get-external-format-or-lose (if (eq external-format :default) + (default-external-format) + external-format))) + (defun octets-to-string (vector &key (external-format :default) (start 0) end) (declare (type (vector (unsigned-byte 8)) vector)) (with-array-data ((vector vector) @@ -804,8 +395,9 @@ one-past-the-end" (end end) :check-fill-pointer t) (declare (type (simple-array (unsigned-byte 8) (*)) vector)) - (funcall (symbol-function (first (external-formats-funs external-format))) - vector start end))) + (let ((ef (maybe-defaulted-external-format external-format))) + (funcall (symbol-function (sb!impl::ef-octets-to-string-sym ef)) + vector start end)))) (defun string-to-octets (string &key (external-format :default) (start 0) end null-terminate) @@ -815,8 +407,9 @@ one-past-the-end" (end end) :check-fill-pointer t) (declare (type simple-string string)) - (funcall (symbol-function (second (external-formats-funs external-format))) - string start end (if null-terminate 1 0)))) + (let ((ef (maybe-defaulted-external-format external-format))) + (funcall (symbol-function (sb!impl::ef-string-to-octets-sym ef)) + string start end (if null-terminate 1 0))))) #!+sb-unicode (defvar +unicode-replacement-character+ (string (code-char #xfffd))) diff --git a/src/code/target-c-call.lisp b/src/code/target-c-call.lisp index a3ff56a..09d9258 100644 --- a/src/code/target-c-call.lisp +++ b/src/code/target-c-call.lisp @@ -72,17 +72,13 @@ (declare (type simple-string string)) (locally (declare (optimize (speed 3) (safety 0))) - (let ((func (sb!impl::get-external-format-function external-format 10))) - (unless func - (error "Undefined external-format ~A.~%" external-format)) - (funcall (symbol-function func) string)))) + (let ((external-format (sb!impl::get-external-format-or-lose external-format))) + (funcall (sb!impl::ef-write-c-string-fun external-format) string)))) (defun c-string-to-string (sap external-format element-type) (declare (type system-area-pointer sap)) (locally (declare (optimize (speed 3) (safety 0))) - (let ((func (sb!impl::get-external-format-function external-format 9))) - (unless func - (error "Undefined external-format ~A.~%" external-format)) - (funcall (symbol-function func) sap element-type)))) + (let ((external-format (sb!impl::get-external-format-or-lose external-format))) + (funcall (sb!impl::ef-read-c-string-fun external-format) sap element-type)))) diff --git a/tests/external-format.impure.lisp b/tests/external-format.impure.lisp index 3a96fb7..78285f7 100644 --- a/tests/external-format.impure.lisp +++ b/tests/external-format.impure.lisp @@ -17,9 +17,9 @@ (defmacro do-external-formats ((xf &optional result) &body body) (let ((nxf (gensym))) - `(dolist (,nxf sb-impl::*external-formats* ,result) - (let ((,xf (first (first ,nxf)))) - ,@body)))) + `(loop for ,nxf being the hash-values of sb-impl::*external-formats* + do (let ((,xf (first (sb-impl::ef-names ,nxf)))) + ,@body)))) (defvar *test-path* "external-format-test.tmp") diff --git a/version.lisp-expr b/version.lisp-expr index a2b2aa3..4a273fc 100644 --- a/version.lisp-expr +++ b/version.lisp-expr @@ -17,4 +17,4 @@ ;;; checkins which aren't released. (And occasionally for internal ;;; versions, especially for internal versions off the main CVS ;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".) -"1.0.31.22" +"1.0.31.23" -- 1.7.10.4