diff options
-rw-r--r-- | utf8.lua | 35 |
1 files changed, 35 insertions, 0 deletions
-- Diff hunk: @@ -123,6 +123,40 @@ local function reverse(s)
-- Context only (tail of `reverse`; the rest of that function is not shown):
--     return table.concat(t)
-- end

-- Convert a Unicode code point to a UTF-8 byte sequence
-- Logic stolen from this page:
-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
--
-- Arguments:
--   Number representing the Unicode code point (e.g. 0x265c).
--
-- Returns:
--   UTF-8 encoded string of the given character (1 to 4 bytes).
--   Numbers above 0x10ffff produce a blank string.
--
-- Raises:
--   An error if the code point is negative.
--
-- Notes:
--   Values in 0xd800-0xdfff are not rejected; they are encoded as three
--   bytes like every other value in the 0x800-0xffff range.
local function encode(code)
    -- Every quotient below is floored explicitly. `/` is float division on
    -- Lua 5.3+, and string.char refuses numbers with a fractional part
    -- there, so relying on implicit truncation only works on 5.1/LuaJIT.
    local floor = math.floor

    if code < 0 then
        error('Code point must not be negative.')
    elseif code <= 0x7f then
        -- 0xxxxxxx
        return string.char(code)
    elseif code <= 0x7ff then
        -- 110xxxxx 10xxxxxx
        local c1 = floor(code / 64) + 192
        local c2 = code % 64 + 128
        return string.char(c1, c2)
    elseif code <= 0xffff then
        -- 1110xxxx 10xxxxxx 10xxxxxx
        local c1 = floor(code / 4096) + 224
        local c2 = floor(code % 4096 / 64) + 128
        local c3 = code % 64 + 128
        return string.char(c1, c2, c3)
    elseif code <= 0x10ffff then
        -- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        local c1 = floor(code / 262144) + 240
        local c2 = floor(code % 262144 / 4096) + 128
        local c3 = floor(code % 4096 / 64) + 128
        local c4 = code % 64 + 128
        return string.char(c1, c2, c3, c4)
    end

    -- Beyond the last Unicode code point: nothing to encode.
    return ''
end

-- Context only: the module export table, shown in part by the diff. The
-- entries between the two hunks are elided in the diff view.
-- return {
--     iter = iter,
--     chars = chars,
--     -- Diff hunk: @@ -130,4 +164,5 @@ return {
--     sub = sub,
--     split = split,
--     reverse = reverse,
--     encode = encode    -- line added by this change
-- }