aboutsummaryrefslogtreecommitdiffstats
path: root/utf8.lua
blob: 90a4ea050d2dcb557666893deba563dd508ff7b5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
-- utf8.lua - Basic (and unsafe) utf8 string support in plain Lua - public domain
--
-- Written in 2013 by Matthias Richter (vrld@vrld.org)
--
-- This software is in the public domain. Where that dedication is not
-- recognized, you are granted a perpetual, irrevokable license to copy and
-- modify this file as you see fit. This software is distributed without any
-- warranty.

-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
--  ALL FUNCTIONS ARE UNSAFE: THEY ASSUME VALID UTF8 INPUT
-- !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

-- Generic for iterator.
--
-- Arguments:
--     s ... The utf8 string.
--     i ... Last byte of the previous codepoint.
--
-- Returns:
--     k ... Number of the *last* byte of the codepoint.
--     c ... The utf8 codepoint (character).
--     n ... Width/number of bytes of the codepoint.
local function iter(s, i)
	if i >= #s then return end
	local b, nbytes = s:byte(i+1,i+1), 1

	-- determine width of the codepoint by counting the number of set bits in the first byte
	-- warning: there is no validation of the following bytes!
	if     b >= 0xc0 and b <= 0xdf then nbytes = 2 -- 1100 0000 to 1101 1111
	elseif b >= 0xe0 and b <= 0xef then nbytes = 3 -- 1110 0000 to 1110 1111
	elseif b >= 0xf0 and b <= 0xf7 then nbytes = 4 -- 1111 0000 to 1111 0111
	elseif b >= 0xf8 and b <= 0xfb then nbytes = 5 -- 1111 1000 to 1111 1011
	elseif b >= 0xfc and b <= 0xfd then nbytes = 6 -- 1111 1100 to 1111 1101
	elseif b <  0x00 or  b >  0x7f then error(("Invalid codepoint: 0x%02x"):format(b))
	end
	return i+nbytes, s:sub(i+1,i+nbytes), nbytes
end

-- Shortcut to the generic for iterator.
--
-- Usage:
--    for k, c, n in chars(s) do
--        ...
--    end
--
--    Meaning of k, c, and n is the same as in iter(s, i).
local function chars(s)
	return iter, s, 0
end

-- Get length in characters of an utf8 string.
--
-- Arguments:
--     s ... The utf8 string.
--
-- Returns:
--     n ... Number of utf8 characters in s.
local function len(s)
	-- assumes sane utf8 string: count the number of bytes that is *not* 10xxxxxx
	local _, c = s:gsub('[^\128-\191]', '')
	return c
end

-- Get substring, same semantics as string.sub(s,i,j).
--
-- Arguments:
--     s ... The utf8 string.
--     i ... Starting position, may be negative.
--     j ... (optional) Ending position, may be negative.
--
-- Returns:
--     t ... The substring.
local function sub(s, i, j)
	local l = len(s)
	j = j or l
	if i < 0 then i = l + i + 1 end
	if j < 0 then j = l + j + 1 end
	if j < i then return '' end

	local k, t = 1, {}
	for _, c in chars(s) do
		if k >= i then t[#t+1] = c end
		if k >= j then break end
		k = k + 1
	end
	return table.concat(t)
end

-- Split utf8 string in two substrings
--
-- Arguments:
--     s ... The utf8 string.
--     i ... The position to split, may be negative.
--
-- Returns:
--     left  ... Substring before i.
--     right ... Substring after i.
local function split(s, i)
	local l = len(s)
	if i < 0 then i = l + i + 1 end

	local k, pos = 1, 0
	for byte in chars(s) do
		if k > i then break end
		pos, k = byte, k + 1
	end
	return s:sub(1, pos), s:sub(pos+1, -1)
end

-- Reverses order of characters in an utf8 string.
--
-- Arguments:
--     s ... The utf8 string.
--
-- Returns:
--     t ... The revered string.
local function reverse(s)
	local t = {}
	for _, c in chars(s) do
		table.insert(t, 1, c)
	end
	return table.concat(t)
end

-- Convert a Unicode code point to a UTF-8 byte sequence
-- Logic stolen from this page:
-- http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-appendixa
--
-- Arguments:
--     Number representing the Unicode code point (e.g. 0x265c).
--
-- Returns:
--     UTF-8 encoded string of the given character.
--     Numbers out of range produce a blank string.
local function encode(code)
	if code < 0 then
		error('Code point must not be negative.')
	elseif code <= 0x7f then
		return string.char(code)
	elseif code <= 0x7ff then
		local c1 = code / 64 + 192
		local c2 = code % 64 + 128
		return string.char(c1, c2)
	elseif code <= 0xffff then
		local c1 = code / 4096 + 224
		local c2 = code % 4096 / 64 + 128
		local c3 = code % 64 + 128
		return string.char(c1, c2, c3)
	elseif code <= 0x10ffff then
		local c1 = code / 262144 + 240
		local c2 = code % 262144 / 4096 + 128
		local c3 = code % 4096 / 64 + 128
		local c4 = code % 64 + 128
		return string.char(c1, c2, c3, c4)
	end
	return ''
end

return {
	iter    = iter,
	chars   = chars,
	len     = len,
	sub     = sub,
	split   = split,
	reverse = reverse,
	encode  = encode
}