From: Moskvitin Andrey Date: Tue, 13 Apr 2010 07:47:59 +0000 (+0400) Subject: Merge branch 'master' of git://git.b9.com/puri X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=e67bd6b13ba38797fba3533d0ec3becf786c5456;hp=d714cd08ba167679c2dbd42f9db11eba5602a5bb;p=puri-unicode.git Merge branch 'master' of git://git.b9.com/puri --- diff --git a/README b/README index 754cc97..de81773 100644 --- a/README +++ b/README @@ -1,21 +1,22 @@ -PURI - Portable URI Library -=========================== +PURI-UNICODE - Portable URI Library with UTF-8 encoding support +=============================================================== AUTHORS ------- Franz, Inc Kevin Rosenberg +Andrey Moskvitin DOWNLOAD -------- -Puri home: http://files.b9.com/puri/ +puri-unicode home: http://github.com/archimag/puri-unicode Portable tester home: http://files.b9.com/tester/ SUPPORTED PLATFORMS ------------------- - AllegroCL, CLISP, CMUCL, Lispworks, OpenMCL, SBCL + AllegroCL, CLISP, CMUCL, Lispworks, SBCL, ClosureCL OVERVIEW @@ -33,6 +34,10 @@ implementations. Puri completes 126/126 regression tests successfully. Franz's unmodified documentation file is included in the file uri.html. +DIFFERENCES BETWEEN PURI-UNICODE and PURI +----------------------------------------- + +* puri-unicode uses the babbel for support utf-8 encoding DIFFERENCES BETWEEN PURI and NET.URI ------------------------------------ @@ -44,3 +49,4 @@ DIFFERENCES BETWEEN PURI and NET.URI divergence occurs because Franz's parse-error condition uses :format-control and :format-arguments slots which are not in the ANSI specification for the parse-error condition. + diff --git a/puri.asd b/puri.asd index 0e4ea9c..5c12ad6 100644 --- a/puri.asd +++ b/puri.asd @@ -12,6 +12,7 @@ :maintainer "Kevin M. Rosenberg " :licence "GNU Lesser General Public License" :description "Portable Universal Resource Indentifier Library" + :depends-on (#:babel) :components ((:file "src"))) diff --git a/src.lisp b/src.lisp index 44ec5ea..564229f 100644 --- a/src.lisp +++ b/src.lisp @@ -790,37 +790,93 @@ URI ~s contains illegal character ~s at position ~d." (defun decode-escaped-encoding (string escape &optional (reserved-chars *reserved-characters*)) - ;; Return a string with the real characters. + ;;Return a string with the real characters. (when (null escape) (return-from decode-escaped-encoding string)) - (do* ((i 0 (1+ i)) - (max (length string)) - (new-string (copy-seq string)) - (new-i 0 (1+ new-i)) - ch ch2 chc chc2) - ((= i max) - (shrink-vector new-string new-i)) - (if* (char= #\% (setq ch (char string i))) - then (when (> (+ i 3) max) - (.parse-error - "Unsyntactic escaped encoding in ~s." string)) - (setq ch (char string (incf i))) - (setq ch2 (char string (incf i))) - (when (not (and (setq chc (digit-char-p ch 16)) - (setq chc2 (digit-char-p ch2 16)))) - (.parse-error - "Non-hexidecimal digits after %: %c%c." ch ch2)) - (let ((ci (+ (* 16 chc) chc2))) - (if* (or (null reserved-chars) - (> ci 127) ; bug11527 - (= 0 (sbit reserved-chars ci))) - then ;; ok as is - (setf (char new-string new-i) - (code-char ci)) - else (setf (char new-string new-i) #\%) - (setf (char new-string (incf new-i)) ch) - (setf (char new-string (incf new-i)) ch2))) - else (setf (char new-string new-i) ch)))) + (let ((curpos 0) + (maxpos (length string)) + (strs nil)) + (flet ((next-ansii-substring () + (let ((pos (or (position #\% string :start curpos) + maxpos))) + (when (and (= curpos 0) + (= pos maxpos)) + (return-from decode-escaped-encoding string)) + (when (< curpos pos) + (push (subseq string + curpos + pos) + strs) + (setf curpos pos)))) + (next-encoded-substring () + (let ((pos (or (loop for i from curpos below maxpos by 3 + unless (char= (aref string i) + #\%) + return i) + maxpos))) + (when (< curpos pos) + (let ((octets (handler-case (make-array (/ (- pos curpos) + 3) + :element-type '(unsigned-byte 8) + :fill-pointer 0) + (error () + (.parse-error "Unsyntactic escaped encoding in ~s." string))))) + (loop for i from curpos below pos by 3 + do (vector-push (handler-case + (parse-integer string + :start (1+ i) + :end (+ i 3) + :radix 16) + (error () + (.parse-error "Non-hexidecimal digits after %: ~c~c." + (aref string (1+ i)) + (aref string (+ 2 i))))) + octets)) + + (let* ((decoded-string (babel:octets-to-string octets + :encoding :utf-8)) + (rpos (if reserved-chars + (position-if #'(lambda (ch) + (not (or (> (char-code ch) 127) + (= (sbit reserved-chars (char-code ch)) 0)))) + decoded-string)))) + (push (if rpos + (with-output-to-string (out) + (loop for ch across decoded-string + with i = curpos + do (let ((code (char-code ch))) + (cond + ((or (null reserved-chars) + (> code 127) + (= (sbit reserved-chars code) 0)) + (write-char ch out) + (incf i + (* (cond + ((< code #x80) 1) + ((< code #x800) 2) + ((< code #x10000) 3) + ((< code #x200000) 4) + ((< code #x4000000) 5) + (t 6)) + 3))) + (t (write-string (subseq string i (+ i 3)) out) + (incf i 3)))))) + decoded-string) + strs)))) + (setf curpos pos)))) + (loop + while (< curpos maxpos) + do (next-ansii-substring) + while (< curpos maxpos) + do (next-encoded-substring))) + (if (cdr strs) + (apply #'concatenate + 'string + (nreverse strs)) + (or (car strs) + "")))) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; Printing @@ -900,34 +956,17 @@ URI ~s contains illegal character ~s at position ~d." then (format stream "~a" (uri-string urn)) else (uri-string urn))) -(defparameter *escaped-encoding* - (vector #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\a #\b #\c #\d #\e #\f)) - (defun encode-escaped-encoding (string reserved-chars escape) (when (null escape) (return-from encode-escaped-encoding string)) - ;; Make a string as big as it possibly needs to be (3 times the original - ;; size), and truncate it at the end. - (do* ((max (length string)) - (new-max (* 3 max)) ;; worst case new size - (new-string (make-string new-max)) - (i 0 (1+ i)) - (new-i -1) - c ci) - ((= i max) - (shrink-vector new-string (incf new-i))) - (setq ci (char-int (setq c (char string i)))) - (if* (or (null reserved-chars) - (> ci 127) - (= 0 (sbit reserved-chars ci))) - then ;; ok as is - (incf new-i) - (setf (char new-string new-i) c) - else ;; need to escape it - (multiple-value-bind (q r) (truncate ci 16) - (setf (char new-string (incf new-i)) #\%) - (setf (char new-string (incf new-i)) (elt *escaped-encoding* q)) - (setf (char new-string (incf new-i)) - (elt *escaped-encoding* r)))))) + (with-output-to-string (out) + (loop for ch across string + do (let ((code (char-code ch))) + (if (and (< code 128) + (or (null reserved-chars) + (= 0 (sbit reserved-chars code)))) + (write-char ch out) + (loop for octet across (babel:string-to-octets (string ch) :encoding :utf-8) + do (format out "%~(~2,'0x~)" octet))))))) (defmethod print-object ((uri uri) stream) (if* *print-escape*