aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- utf8.lua 35
1 files changed, 35 insertions, 0 deletions
diff --git a/utf8.lua b/utf8.lua
index fcb5a6c..90a4ea0 100644
--- a/utf8.lua
+++ b/utf8.lua
@@ -123,6 +123,40 @@ local function reverse(s)
return table.concat(t)
end
+-- Convert a Unicode code point to a UTF-8 byte sequence.
+-- Logic stolen from this page:
+-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
+--
+-- Arguments:
+-- Number representing the Unicode code point (e.g. 0x265c).
+--
+-- Returns:
+-- UTF-8 encoded string of the given character.
+-- Code points above 0x10ffff produce a blank string; negative ones raise an error.
+local function encode(code)
+  -- Quotients are floored: string.char raises on fractional values in Lua 5.3+.
+  if code < 0 then
+    error('Code point must not be negative.')
+  elseif code <= 0x7f then
+    return string.char(code)
+  elseif code <= 0x7ff then
+    -- Two bytes: 110xxxxx 10xxxxxx
+    return string.char(math.floor(code / 64) + 192, code % 64 + 128)
+  elseif code <= 0xffff then
+    -- Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    local c1 = math.floor(code / 4096) + 224
+    local c2 = math.floor(code % 4096 / 64) + 128
+    return string.char(c1, c2, code % 64 + 128)
+  elseif code <= 0x10ffff then
+    -- Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    local c1 = math.floor(code / 262144) + 240
+    local c2 = math.floor(code % 262144 / 4096) + 128
+    local c3 = math.floor(code % 4096 / 64) + 128
+    return string.char(c1, c2, c3, code % 64 + 128)
+  end
+  return ''
+end
+
return {
iter = iter,
chars = chars,
@@ -130,4 +164,5 @@ return {
sub = sub,
split = split,
reverse = reverse,
+ encode = encode
}