-- Convert a Unicode code point to a UTF-8 byte sequence.
-- Bit layout taken from:
-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
--
-- Arguments:
--   code  Number representing the Unicode code point (e.g. 0x265c).
--         Expected to be a whole number.
--
-- Returns:
--   UTF-8 encoded string of the given character (1 to 4 bytes).
--   Code points above 0x10ffff produce an empty string.
--
-- Raises:
--   An error if the code point is negative.
--
-- Note: surrogate code points (0xd800-0xdfff) are not rejected; they fall
-- into the three-byte branch like any other value in that range.
local function encode(code)
	-- `/` is floating-point division, so every quotient must be floored
	-- before it reaches string.char: Lua 5.1/LuaJIT would silently truncate
	-- the fraction, but Lua 5.3+ raises "number has no integer
	-- representation" for a non-integral float.
	local floor = math.floor

	if code < 0 then
		error('Code point must not be negative.')
	elseif code <= 0x7f then
		-- 0xxxxxxx
		return string.char(code)
	elseif code <= 0x7ff then
		-- 110xxxxx 10xxxxxx
		local c1 = floor(code / 64) + 192
		local c2 = code % 64 + 128
		return string.char(c1, c2)
	elseif code <= 0xffff then
		-- 1110xxxx 10xxxxxx 10xxxxxx
		local c1 = floor(code / 4096) + 224
		local c2 = floor(code % 4096 / 64) + 128
		local c3 = code % 64 + 128
		return string.char(c1, c2, c3)
	elseif code <= 0x10ffff then
		-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		local c1 = floor(code / 262144) + 240
		local c2 = floor(code % 262144 / 4096) + 128
		local c3 = floor(code % 4096 / 64) + 128
		local c4 = code % 64 + 128
		return string.char(c1, c2, c3, c4)
	end

	-- Beyond the Unicode range: nothing to encode.
	return ''
end