From: Ken Harris Date: Sun, 5 May 2013 06:32:00 +0000 (-0700) Subject: SBCL-style Unicode support: one character = one Unicode codepoint. X-Git-Url: http://repo.macrolet.net/gitweb/?a=commitdiff_plain;h=6d622a8f09811bc4af366f28bd8f25b1022d4ffd;p=jscl.git SBCL-style Unicode support: one character = one Unicode codepoint. Allow one JSCL "character" object to be a surrogate pair (length-2 JS string). --- diff --git a/src/compiler.lisp b/src/compiler.lisp index 8421b16..64cd871 100644 --- a/src/compiler.lisp +++ b/src/compiler.lisp @@ -1457,17 +1457,17 @@ (define-builtin char-code (x) (type-check (("x" "string" x)) - "x.charCodeAt(0)")) + "char_to_codepoint(x)")) (define-builtin code-char (x) (type-check (("x" "number" x)) - "String.fromCharCode(x)")) + "char_from_codepoint(x)")) (define-builtin characterp (x) (js!bool (js!selfcall "var x = " x ";" *newline* - "return (typeof(" x ") == \"string\") && x.length == 1;"))) + "return (typeof(" x ") == \"string\") && (x.length == 1 || x.length == 2);"))) (define-builtin char-to-string (x) (js!selfcall @@ -1476,10 +1476,10 @@ "return r")) (define-builtin char-upcase (x) - (code x ".toUpperCase()")) + (code "safe_char_upcase(" x ")")) (define-builtin char-downcase (x) - (code x ".toLowerCase()")) + (code "safe_char_downcase(" x ")")) (define-builtin stringp (x) (js!bool diff --git a/src/prelude.js b/src/prelude.js index 6f5feea..162058e 100644 --- a/src/prelude.js +++ b/src/prelude.js @@ -51,14 +51,57 @@ function QIList(){ } } +// Return a new Array of strings, each either length-1, or length-2 (a UTF-16 surrogate pair). +function codepoints(string) { + return string.split(/(?![\udc00-\udfff])/); +} // Create and return a lisp string for the Javascript string STRING. function make_lisp_string (string){ - var array = string.split(""); + var array = codepoints(string); array.type = 'character' return array; } +function char_to_codepoint(ch) { + if (ch.length == 1) { + return ch.charCodeAt(0); + } else { + var xh = ch.charCodeAt(0) - 0xD800; + var xl = ch.charCodeAt(1) - 0xDC00; + return 0x10000 + (xh << 10) + (xl); + } +} + +function char_from_codepoint(x) { + if (x <= 0xFFFF) { + return String.fromCharCode(x); + } else { + x -= 0x10000; + var xh = x >> 10; + var xl = x & 0x3FF; + return String.fromCharCode(0xD800 + xh) + String.fromCharCode(0xDC00 + xl); + } +} + +// if a char (JS string) has the same number of codepoints after .toUpperCase(), return that, else the original. +function safe_char_upcase(x) { + var xu = x.toUpperCase(); + if (codepoints(xu).length == 1) { + return xu; + } else { + return x; + } +} +function safe_char_downcase(x) { + var xl = x.toLowerCase(); + if (codepoints(xl).length == 1) { + return xl; + } else { + return x; + } +} + function xstring(x){ return x.join(''); } diff --git a/tests/characters.lisp b/tests/characters.lisp new file mode 100644 index 0000000..97374a7 --- /dev/null +++ b/tests/characters.lisp @@ -0,0 +1,31 @@ +;; CHAR= +(test (char= (code-char 127744) (code-char 127744))) + +;; CHARACTERP +(test (characterp #\a)) +(test (characterp (code-char 65))) +(test (char= #\A (code-char 65))) +(test (not (characterp 10))) +(test (not (characterp "a"))) +(test (not (characterp "ab"))) +(test (characterp (code-char 127744))) + +;; CODE-CHAR, CHAR-CODE +(test (char= #\A (code-char 65))) +(test (= 65 (char-code #\A))) +(test (= 127744 (char-code (code-char 127744)))) + +;; CHAR-TO-STRING +(test (= 1 (string-length (char-to-string (code-char 127744))))) + +;; CHAR-UPCASE +(test (char= #\A (char-upcase #\a))) +(test (char= #\A (char-upcase #\A))) +(test (char= (code-char 223) (char-upcase (code-char 223)))) ;; changes length, so you get the original back +(test (char= (code-char 127744) (char-upcase (code-char 127744)))) ;; no upper case + +;; CHAR-DOWNCASE +(test (char= #\a (char-downcase #\a))) +(test (char= #\a (char-downcase #\A))) +(test (char= (code-char 223) (char-downcase (code-char 223)))) ;; already lower case +(test (char= (code-char 127744) (char-downcase (code-char 127744)))) ;; no lower case