Merge branch 'master' of git://git.b9.com/puri
[puri-unicode.git] / src.lisp
index 8aee812..564229f 100644 (file)
--- a/src.lisp
+++ b/src.lisp
 ;; Parsing
 
 (defparameter *excluded-characters*
-    '(;; `delims' (except #\%, because it's handled specially):
+    (append
+     ;; exclude control characters
+     (loop for i from 0 to #x1f
+          collect (code-char i))
+     '(;; `delims' (except #\%, because it's handled specially):
       #\< #\> #\" #\space #\#
+      #\Rubout ;; (code-char #x7f)
       ;; `unwise':
       #\{ #\} #\| #\\ #\^ #\[ #\] #\`))
+  "Excluded charcters from RFC2369 (http://www.ietf.org/rfc/rfc2396.txt 2.4.3)")
 
 (defun reserved-char-vector (chars &key except)
-  (do* ((a (make-array 127 :element-type 'bit :initial-element 0))
+  (do* ((a (make-array 128 :element-type 'bit :initial-element 0))
         (chars chars (cdr chars))
         (c (car chars) (car chars)))
       ((null chars) a)
@@ -837,13 +843,24 @@ URI ~s contains illegal character ~s at position ~d."
                      (push (if rpos
                                (with-output-to-string (out)
                                  (loop for ch across decoded-string
-                                    for i from curpos by 3
-                                    do (let ((octet (char-code ch)))
-                                         (if (or (null reserved-chars)
-                                                 (> octet 127)
-                                                 (= (sbit reserved-chars octet) 0))
-                                             (write-char ch out)
-                                             (write-string (subseq string i (+ i 3)) out)))))
+                                    with i = curpos
+                                    do (let ((code (char-code ch)))
+                                         (cond
+                                           ((or (null reserved-chars)
+                                                (> code 127)
+                                                (= (sbit reserved-chars code) 0))
+                                            (write-char ch out)
+                                            (incf i
+                                                  (* (cond
+                                                       ((< code #x80) 1)
+                                                       ((< code #x800) 2)
+                                                       ((< code #x10000) 3)
+                                                       ((< code #x200000) 4)
+                                                       ((< code #x4000000) 5)
+                                                       (t 6))
+                                                     3)))
+                                           (t (write-string (subseq string i (+ i 3)) out)
+                                              (incf i 3))))))
                                decoded-string)
                            strs))))
                    (setf curpos pos))))