aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhryx <codroid@gmail.com>2013-12-31 04:37:04 -0800
committerhryx <codroid@gmail.com>2013-12-31 04:37:04 -0800
commit52d06037d13afdc68384f1031766ac087cb5e70e (patch)
treefc2edc150b6ab3624ce4f407926f64981bfaaf51
parentb63895a2c719003ea2d4f77772074f40f3bddc00 (diff)
downloadQuickie-52d06037d13afdc68384f1031766ac087cb5e70e.tar.gz
Quickie-52d06037d13afdc68384f1031766ac087cb5e70e.tar.bz2
Quickie-52d06037d13afdc68384f1031766ac087cb5e70e.tar.xz
Quickie-52d06037d13afdc68384f1031766ac087cb5e70e.zip
Added utf8.encode
Here's a function that takes a Unicode code point and returns the corresponding UTF-8 encoded character bytes. Example: utf8.encode(0x265c) -- returns '♜' Please feel free to edit or revert if it's not your style. It might not be relevant to Quickie, though it's a handy UTF-8 utility.
-rw-r--r--utf8.lua35
1 files changed, 35 insertions, 0 deletions
diff --git a/utf8.lua b/utf8.lua
index fcb5a6c..90a4ea0 100644
--- a/utf8.lua
+++ b/utf8.lua
@@ -123,6 +123,40 @@ local function reverse(s)
return table.concat(t)
end
+-- Convert a Unicode code point to a UTF-8 byte sequence
+-- Logic stolen from this page:
+-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
+--
+-- Arguments:
+-- Number representing the Unicode code point (e.g. 0x265c).
+--
+-- Returns:
+-- UTF-8 encoded string of the given character.
+-- Code points above 0x10ffff produce a blank string.
+--
+-- Raises:
+-- An error if the code point is negative.
+local function encode(code)
+	-- `/` is float division, so every quotient is floored before it is
+	-- handed to string.char, which requires integral byte values.
+	local floor = math.floor
+	if code < 0 then
+		error('Code point must not be negative.')
+	elseif code <= 0x7f then
+		-- 1 byte: 0xxxxxxx
+		return string.char(code)
+	elseif code <= 0x7ff then
+		-- 2 bytes: 110xxxxx 10xxxxxx
+		local c1 = floor(code / 64) + 192
+		local c2 = code % 64 + 128
+		return string.char(c1, c2)
+	elseif code <= 0xffff then
+		-- 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+		local c1 = floor(code / 4096) + 224
+		local c2 = floor(code % 4096 / 64) + 128
+		local c3 = code % 64 + 128
+		return string.char(c1, c2, c3)
+	elseif code <= 0x10ffff then
+		-- 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		local c1 = floor(code / 262144) + 240
+		local c2 = floor(code % 262144 / 4096) + 128
+		local c3 = floor(code % 4096 / 64) + 128
+		local c4 = code % 64 + 128
+		return string.char(c1, c2, c3, c4)
+	end
+	return ''
+end
+
return {
iter = iter,
chars = chars,
@@ -130,4 +164,5 @@ return {
sub = sub,
split = split,
reverse = reverse,
+ encode = encode
}