Merge branch 'master' of git://git.b9.com/puri
authorMoskvitin Andrey <archimag@gmail.com>
Tue, 11 May 2010 06:49:01 +0000 (10:49 +0400)
committerMoskvitin Andrey <archimag@gmail.com>
Tue, 11 May 2010 06:49:01 +0000 (10:49 +0400)
README
puri.asd
src.lisp

diff --git a/README b/README
index 754cc97..de81773 100644 (file)
--- a/README
+++ b/README
@@ -1,21 +1,22 @@
-PURI - Portable URI Library
-===========================
+PURI-UNICODE - Portable URI Library with UTF-8 encoding support
+===============================================================
 
 AUTHORS
 -------
 Franz, Inc <http://www.franz.com>
 Kevin Rosenberg <kevin@rosenberg.net>
+Andrey Moskvitin <archimag@gmail.com>
 
 
 DOWNLOAD
 --------
-Puri home: http://files.b9.com/puri/
+puri-unicode home: http://github.com/archimag/puri-unicode
 Portable tester home: http://files.b9.com/tester/
 
 
 SUPPORTED PLATFORMS
 -------------------
-   AllegroCL, CLISP, CMUCL, Lispworks, OpenMCL, SBCL
+   AllegroCL, CLISP, CMUCL, Lispworks, SBCL, ClosureCL
 
 
 OVERVIEW
@@ -33,6 +34,10 @@ implementations. Puri completes 126/126 regression tests successfully.
 Franz's unmodified documentation file is included in the file
 uri.html. 
 
+DIFFERENCES BETWEEN PURI-UNICODE and PURI
+-----------------------------------------
+
+* puri-unicode uses the babbel for support utf-8 encoding
 
 DIFFERENCES BETWEEN PURI and NET.URI
 ------------------------------------
@@ -44,3 +49,4 @@ DIFFERENCES BETWEEN PURI and NET.URI
   divergence occurs because Franz's parse-error condition uses
   :format-control and :format-arguments slots which are not in the ANSI
   specification for the parse-error condition.
+
index 0e4ea9c..5c12ad6 100644 (file)
--- a/puri.asd
+++ b/puri.asd
@@ -12,6 +12,7 @@
   :maintainer "Kevin M. Rosenberg <kmr@debian.org>"
   :licence "GNU Lesser General Public License"
   :description "Portable Universal Resource Indentifier Library"
+  :depends-on (#:babel)
   :components
   ((:file "src")))
 
index da6d9fd..532d836 100644 (file)
--- a/src.lisp
+++ b/src.lisp
@@ -790,37 +790,93 @@ URI ~s contains illegal character ~s at position ~d."
 (defun decode-escaped-encoding (string escape
                                 &optional (reserved-chars
                                            *reserved-characters*))
-  ;; Return a string with the real characters.
+  ;;Return a string with the real characters.
   (when (null escape) (return-from decode-escaped-encoding string))
-  (do* ((i 0 (1+ i))
-        (max (length string))
-        (new-string (copy-seq string))
-        (new-i 0 (1+ new-i))
-        ch ch2 chc chc2)
-      ((= i max)
-       (shrink-vector new-string new-i))
-    (if* (char= #\% (setq ch (char string i)))
-       then (when (> (+ i 3) max)
-              (.parse-error
-               "Unsyntactic escaped encoding in ~s." string))
-            (setq ch (char string (incf i)))
-            (setq ch2 (char string (incf i)))
-            (when (not (and (setq chc (digit-char-p ch 16))
-                            (setq chc2 (digit-char-p ch2 16))))
-              (.parse-error
-               "Non-hexidecimal digits after %: %c%c." ch ch2))
-            (let ((ci (+ (* 16 chc) chc2)))
-              (if* (or (null reserved-chars)
-                       (> ci 127)       ; bug11527
-                       (= 0 (sbit reserved-chars ci)))
-                 then ;; ok as is
-                      (setf (char new-string new-i)
-                        (code-char ci))
-                 else (setf (char new-string new-i) #\%)
-                      (setf (char new-string (incf new-i)) ch)
-                      (setf (char new-string (incf new-i)) ch2)))
-       else (setf (char new-string new-i) ch))))
 
+  (let ((curpos 0)
+        (maxpos (length string))
+        (strs nil))
+    (flet ((next-ansii-substring ()
+             (let ((pos (or (position #\% string :start curpos)
+                            maxpos)))
+               (when (and (= curpos 0)
+                          (= pos maxpos))
+                 (return-from decode-escaped-encoding string))
+               (when (< curpos pos)
+                 (push (subseq string
+                               curpos
+                               pos)
+                       strs)
+                 (setf curpos pos))))
+           (next-encoded-substring ()
+             (let ((pos (or (loop for i from curpos below maxpos by 3
+                               unless (char= (aref string i)
+                                             #\%)
+                               return i)
+                            maxpos)))
+               (when (< curpos pos)
+                 (let ((octets (handler-case (make-array (/ (- pos curpos)
+                                                            3)
+                                                         :element-type '(unsigned-byte 8)
+                                                         :fill-pointer 0)
+                                 (error () 
+                                   (.parse-error "Unsyntactic escaped encoding in ~s." string)))))
+                   (loop for i from curpos below pos by 3
+                      do (vector-push (handler-case
+                                          (parse-integer string
+                                                         :start (1+ i)
+                                                         :end (+ i 3)
+                                                         :radix 16)
+                                        (error ()
+                                          (.parse-error "Non-hexidecimal digits after %: ~c~c." 
+                                                        (aref string  (1+ i))
+                                                        (aref string (+ 2 i)))))
+                                      octets))
+                   
+                   (let* ((decoded-string (babel:octets-to-string octets
+                                                                  :encoding :utf-8))
+                          (rpos (if reserved-chars
+                                    (position-if #'(lambda (ch)
+                                                     (not (or (> (char-code ch) 127)
+                                                              (= (sbit reserved-chars (char-code ch)) 0))))
+                                                 decoded-string))))
+                     (push (if rpos
+                               (with-output-to-string (out)
+                                 (loop for ch across decoded-string
+                                    with i = curpos
+                                    do (let ((code (char-code ch)))
+                                         (cond
+                                           ((or (null reserved-chars)
+                                                (> code 127)
+                                                (= (sbit reserved-chars code) 0))
+                                            (write-char ch out)
+                                            (incf i
+                                                  (* (cond
+                                                       ((< code #x80) 1)
+                                                       ((< code #x800) 2)
+                                                       ((< code #x10000) 3)
+                                                       ((< code #x200000) 4)
+                                                       ((< code #x4000000) 5)
+                                                       (t 6))
+                                                     3)))
+                                           (t (write-string (subseq string i (+ i 3)) out)
+                                              (incf i 3))))))
+                               decoded-string)
+                           strs))))
+                   (setf curpos pos))))
+      (loop
+         while (< curpos maxpos)
+         do (next-ansii-substring)
+         while (< curpos maxpos)
+         do (next-encoded-substring)))
+    (if (cdr strs)
+        (apply #'concatenate
+                   'string
+                   (nreverse strs))
+        (or (car strs)
+            ""))))
+
+                         
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;; Printing
 
@@ -900,34 +956,17 @@ URI ~s contains illegal character ~s at position ~d."
      then (format stream "~a" (uri-string urn))
      else (uri-string urn)))
 
-(defparameter *escaped-encoding*
-    (vector #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\a #\b #\c #\d #\e #\f))
-
 (defun encode-escaped-encoding (string reserved-chars escape)
   (when (null escape) (return-from encode-escaped-encoding string))
-  ;; Make a string as big as it possibly needs to be (3 times the original
-  ;; size), and truncate it at the end.
-  (do* ((max (length string))
-        (new-max (* 3 max)) ;; worst case new size
-        (new-string (make-string new-max))
-        (i 0 (1+ i))
-        (new-i -1)
-        c ci)
-      ((= i max)
-       (shrink-vector new-string (incf new-i)))
-    (setq ci (char-int (setq c (char string i))))
-    (if* (or (null reserved-chars)
-             (> ci 127)
-             (= 0 (sbit reserved-chars ci)))
-       then ;; ok as is
-            (incf new-i)
-            (setf (char new-string new-i) c)
-       else ;; need to escape it
-            (multiple-value-bind (q r) (truncate ci 16)
-              (setf (char new-string (incf new-i)) #\%)
-              (setf (char new-string (incf new-i)) (elt *escaped-encoding* q))
-              (setf (char new-string (incf new-i))
-                (elt *escaped-encoding* r))))))
+  (with-output-to-string (out)
+    (loop for ch across string
+       do (let ((code (char-code ch)))  
+            (if (and (< code 128)
+                     (or (null reserved-chars)
+                         (= 0 (sbit reserved-chars code))))
+                (write-char ch out)
+                (loop for octet across (babel:string-to-octets (string ch) :encoding :utf-8)
+                   do (format out "%~(~2,'0x~)" octet)))))))
 
 (defmethod print-object ((uri uri) stream)
   (if* *print-escape*