SBCL-style Unicode support: one character = one Unicode codepoint.

author Ken Harris <kengruven@gmail.com>

Sun, 5 May 2013 06:32:00 +0000 (23:32 -0700)

committer Ken Harris <kengruven@gmail.com>

Sun, 5 May 2013 06:32:00 +0000 (23:32 -0700)
author Ken Harris <kengruven@gmail.com>
Sun, 5 May 2013 06:32:00 +0000 (23:32 -0700)
committer Ken Harris <kengruven@gmail.com>
Sun, 5 May 2013 06:32:00 +0000 (23:32 -0700)
diff --git a/src/compiler.lisp b/src/compiler.lisp

index 8421b16..64cd871 100644 (file)
--- a/src/compiler.lisp
+++ b/src/compiler.lisp
@@ -1457,17 +1457,17 @@
  
  (define-builtin char-code (x)
    (type-check (("x" "string" x))
-    "x.charCodeAt(0)"))
+    "char_to_codepoint(x)"))
  
  (define-builtin code-char (x)
    (type-check (("x" "number" x))
-    "String.fromCharCode(x)"))
+    "char_from_codepoint(x)"))
  
  (define-builtin characterp (x)
    (js!bool
     (js!selfcall
       "var x = " x ";" *newline*
-     "return (typeof(" x ") == \"string\") && x.length == 1;")))
+     "return (typeof(" x ") == \"string\") && (x.length == 1 || x.length == 2);")))
  
  (define-builtin char-to-string (x)
    (js!selfcall
@@ -1476,10 +1476,10 @@
      "return r"))
  
  (define-builtin char-upcase (x)
-  (code x ".toUpperCase()"))
+  (code "safe_char_upcase(" x ")"))
  
  (define-builtin char-downcase (x)
-  (code x ".toLowerCase()"))
+  (code "safe_char_downcase(" x ")"))
  
  (define-builtin stringp (x)
    (js!bool
diff --git a/src/prelude.js b/src/prelude.js

index 6f5feea..162058e 100644 (file)
--- a/src/prelude.js
+++ b/src/prelude.js
@@ -51,14 +51,57 @@ function QIList(){
      }
  }
  
+// Return a new Array of strings, each either length-1, or length-2 (a UTF-16 surrogate pair).
+function codepoints(string) {
+    return string.split(/(?![\udc00-\udfff])/);
+}
  
  // Create and return a lisp string for the Javascript string STRING.
  function make_lisp_string (string){
-    var array = string.split("");
+    var array = codepoints(string);
      array.type = 'character'
      return array;
  }
  
+function char_to_codepoint(ch) {
+    if (ch.length == 1) {
+        return ch.charCodeAt(0);
+    } else {
+        var xh = ch.charCodeAt(0) - 0xD800;
+        var xl = ch.charCodeAt(1) - 0xDC00;
+        return 0x10000 + (xh << 10) + (xl);
+    }
+}
+
+function char_from_codepoint(x) {
+    if (x <= 0xFFFF) {
+        return String.fromCharCode(x);
+    } else {
+        x -= 0x10000;
+        var xh = x >> 10;
+        var xl = x & 0x3FF;
+        return String.fromCharCode(0xD800 + xh) + String.fromCharCode(0xDC00 + xl);
+    }
+}
+
+// if a char (JS string) has the same number of codepoints after .toUpperCase(), return that, else the original.
+function safe_char_upcase(x) {
+    var xu = x.toUpperCase();
+    if (codepoints(xu).length == 1) {
+        return xu;
+    } else {
+        return x;
+    }
+}
+function safe_char_downcase(x) {
+    var xl = x.toLowerCase();
+    if (codepoints(xl).length == 1) {
+        return xl;
+    } else {
+        return x;
+    }
+}
+
  function xstring(x){ return x.join(''); }
  
  
diff --git a/tests/characters.lisp b/tests/characters.lisp

new file mode 100644 (file)

index 0000000..97374a7
--- /dev/null
+++ b/tests/characters.lisp
@@ -0,0 +1,31 @@
+;; CHAR=
+(test (char= (code-char 127744) (code-char 127744)))
+
+;; CHARACTERP
+(test (characterp #\a))
+(test (characterp (code-char 65)))
+(test (char= #\A (code-char 65)))
+(test (not (characterp 10)))
+(test (not (characterp "a")))
+(test (not (characterp "ab")))
+(test (characterp (code-char 127744)))
+
+;; CODE-CHAR, CHAR-CODE
+(test (char= #\A (code-char 65)))
+(test (= 65 (char-code #\A)))
+(test (= 127744 (char-code (code-char 127744))))
+
+;; CHAR-TO-STRING
+(test (= 1 (string-length (char-to-string (code-char 127744)))))
+
+;; CHAR-UPCASE
+(test (char= #\A (char-upcase #\a)))
+(test (char= #\A (char-upcase #\A)))
+(test (char= (code-char 223) (char-upcase (code-char 223))))  ;; changes length, so you get the original back
+(test (char= (code-char 127744) (char-upcase (code-char 127744))))  ;; no upper case
+
+;; CHAR-DOWNCASE
+(test (char= #\a (char-downcase #\a)))
+(test (char= #\a (char-downcase #\A)))
+(test (char= (code-char 223) (char-downcase (code-char 223))))  ;; already lower case
+(test (char= (code-char 127744) (char-downcase (code-char 127744))))  ;; no lower case
author	Ken Harris <kengruven@gmail.com>
	Sun, 5 May 2013 06:32:00 +0000 (23:32 -0700)
committer	Ken Harris <kengruven@gmail.com>
	Sun, 5 May 2013 06:32:00 +0000 (23:32 -0700)
src/compiler.lisp		patch \| blob \| history
src/prelude.js		patch \| blob \| history
tests/characters.lisp	[new file with mode: 0644]	patch \| blob